Skip to content

Commit

Permalink
tools: Add basic VariantNormalizerExtensions. #191
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Sep 23, 2020
1 parent eb92ba0 commit 5fa58d7
Show file tree
Hide file tree
Showing 3 changed files with 258 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
package org.opencb.biodata.tools.variant.normalizer.extensions;

import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantFileMetadata;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.SampleEntry;
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine;
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderSimpleLine;
import org.opencb.commons.run.Task;

import java.util.LinkedList;
import java.util.List;
import java.util.Map;

/**
 * Base class for pluggable normalization steps that run over variants as a {@link Task}.
 *
 * <p>Subclasses override any of the {@code normalize*} hooks, all of which are no-ops by default.
 * {@link #init(VariantFileMetadata)} stores the file metadata that {@link #pre()} later passes
 * to {@link #normalizeHeader(VariantFileMetadata)}.
 */
public abstract class VariantNormalizerExtension implements Task<Variant, Variant> {

    /** Metadata handed to {@link #normalizeHeader(VariantFileMetadata)}; null until {@link #init} is called. */
    private VariantFileMetadata metadata;

    /**
     * Stores the metadata of the file being normalized.
     *
     * @param metadata file metadata later passed to {@link #normalizeHeader(VariantFileMetadata)}
     * @return this instance, to allow call chaining
     */
    public final VariantNormalizerExtension init(VariantFileMetadata metadata) {
        this.metadata = metadata;
        return this;
    }

    /**
     * Runs the header normalization hook with the metadata given to {@link #init(VariantFileMetadata)}.
     */
    @Override
    public void pre() throws Exception {
        normalizeHeader(metadata);
    }

    /**
     * Applies the normalization hooks to every variant of the batch, in place.
     *
     * <p>For each variant: {@link #normalizeVariant(Variant)} always runs. If the variant has at least
     * one study with at least one file, the first study and its first file are used:
     * the file hook runs on that file, then the sample hook runs for every sample position of the study.
     * Every file is passed to {@link #normalizeFile} at most once.
     *
     * @param list batch of variants to normalize
     * @return the same list instance that was received
     */
    @Override
    public final List<Variant> apply(List<Variant> list) throws Exception {
        for (Variant variant : list) {
            normalizeVariant(variant);
            if (variant.getStudies() == null || variant.getStudies().isEmpty()) {
                continue;
            }
            // Only one study expected
            StudyEntry study = variant.getStudies().get(0);
            if (study.getFiles() == null || study.getFiles().isEmpty()) {
                continue;
            }
            // Only one file expected
            FileEntry fileEntry = study.getFiles().get(0);
            normalizeFile(variant, study, fileEntry);
            if (study.getSamples() != null) {
                for (Map.Entry<String, Integer> entry : study.getSamplesPosition().entrySet()) {
                    normalizeSample(variant, study, fileEntry, entry.getKey(), study.getSample(entry.getValue()));
                }
                // The first file was already normalized above. Start at index 1 so that
                // normalizeFile is never invoked twice on the same file (a second call
                // would break any override that is not idempotent).
                List<FileEntry> files = study.getFiles();
                for (int i = 1; i < files.size(); i++) {
                    normalizeFile(variant, study, files.get(i));
                }
            }
        }
        return list;
    }

    /**
     * Finds the complex header line with the given key and id.
     *
     * @param fileMetadata metadata whose header is searched
     * @param key          header line key to match, e.g. {@code "INFO"} or {@code "FORMAT"}
     * @param id           header line id to match
     * @return the first matching line, or {@code null} if there is none
     */
    protected final VariantFileHeaderComplexLine getFileHeaderLine(VariantFileMetadata fileMetadata, String key, String id) {
        for (VariantFileHeaderComplexLine line : fileMetadata.getHeader().getComplexLines()) {
            if (line.getKey().equals(key) && line.getId().equals(id)) {
                return line;
            }
        }
        return null;
    }

    /**
     * Collects every simple header line with the given key.
     *
     * @param fileMetadata metadata whose header is searched
     * @param key          header line key to match
     * @return a new list with the matching lines, empty if there are none
     */
    protected final List<VariantFileHeaderSimpleLine> getFileHeaderLine(VariantFileMetadata fileMetadata, String key) {
        List<VariantFileHeaderSimpleLine> lines = new LinkedList<>();
        for (VariantFileHeaderSimpleLine line : fileMetadata.getHeader().getSimpleLines()) {
            if (line.getKey().equals(key)) {
                lines.add(line);
            }
        }
        return lines;
    }

    /**
     * Tells whether this extension is applicable to a file with the given metadata.
     *
     * @param fileMetadata metadata of the candidate file
     * @return {@code true} if the extension can be used with that file
     */
    protected abstract boolean canUseExtension(VariantFileMetadata fileMetadata);

    /** Hook invoked once from {@link #pre()}. Does nothing by default. */
    protected void normalizeHeader(VariantFileMetadata fileMetadata) {}

    /** Hook invoked for every variant of a batch. Does nothing by default. */
    protected void normalizeVariant(Variant variant) {}

    /** Hook invoked for every sample position of the first study of a variant. Does nothing by default. */
    protected void normalizeSample(Variant variant, StudyEntry study, FileEntry file, String sampleId, SampleEntry sample) {}

    /** Hook invoked for the files of the first study of a variant. Does nothing by default. */
    protected void normalizeFile(Variant variant, StudyEntry study, FileEntry file) {}

}


Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package org.opencb.biodata.tools.variant.normalizer.extensions;

import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantFileMetadata;
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine;
import org.opencb.commons.run.Task;

import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

/**
 * Builds the chain of {@link VariantNormalizerExtension} tasks that apply to a given file.
 *
 * <p>Extensions are selected by name. Only names handled by {@link #buildExtensions(VariantFileMetadata)}
 * are valid; any other name makes that method throw {@link IllegalArgumentException}.
 */
public class VariantNormalizerExtensionFactory {

    /**
     * Names of every extension this factory knows how to build.
     * Must stay in sync with the {@code switch} in {@link #buildExtensions(VariantFileMetadata)}:
     * a name listed here without a matching case makes the default constructor unusable.
     */
    public static final Set<String> ALL_EXTENSIONS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
            "FILE_DP_TO_SAMPLE",
            "FILE_AD_TO_SAMPLE_DP"
    )));
    private final Set<String> enabledExtensions;

    /** Creates a factory with every known extension enabled. */
    public VariantNormalizerExtensionFactory() {
        this(ALL_EXTENSIONS);
    }

    /**
     * Creates a factory restricted to the given extensions.
     *
     * @param enabledExtensions names of the extensions to build; see {@link #ALL_EXTENSIONS}
     */
    public VariantNormalizerExtensionFactory(Set<String> enabledExtensions) {
        this.enabledExtensions = enabledExtensions;
    }


    /**
     * Instantiates each enabled extension, keeps those whose {@code canUseExtension} accepts the
     * file metadata, initializes them with that metadata and chains them with {@code Task.then}.
     *
     * @param fileMetadata metadata of the file to be normalized
     * @return the chained task, or {@code null} when no enabled extension is usable with this file
     * @throws IllegalArgumentException if an enabled extension name is not known
     */
    public Task<Variant, Variant> buildExtensions(VariantFileMetadata fileMetadata) {
        Task<Variant, Variant> extensions = null;
        for (String normalizerExtension : enabledExtensions) {
            VariantNormalizerExtension extension;
            switch (normalizerExtension) {
                case "FILE_DP_TO_SAMPLE":
                    // Copy the file-level DP value into the sample data under the same key.
                    extension = new VariantNormalizerExtensionFileToSample("DP");
                    break;
                case "FILE_AD_TO_SAMPLE_DP":
                    // Derive the sample DP as the sum of the comma-separated file-level AD values.
                    extension = new VariantNormalizerExtensionFileToSample("AD", "DP",
                            new VariantFileHeaderComplexLine("FORMAT", "DP", "", "1", "Integer", Collections.emptyMap()),
                            ad -> {
                                String[] split = ad.split(",");
                                int dp = 0;
                                for (String s : split) {
                                    dp += Integer.parseInt(s);
                                }
                                return String.valueOf(dp);
                            });
                    break;
                default:
                    throw new IllegalArgumentException("Unknown normalizer extension " + normalizerExtension);
            }
            if (extension.canUseExtension(fileMetadata)) {
                extension.init(fileMetadata);
                if (extensions == null) {
                    extensions = extension;
                } else {
                    extensions = extensions.then(extension);
                }
            }
        }
        return extensions;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package org.opencb.biodata.tools.variant.normalizer.extensions;

import org.apache.commons.lang3.StringUtils;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantFileMetadata;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.avro.SampleEntry;
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine;

import java.util.Objects;
import java.util.function.Function;

/**
 * Extension that copies a value from the file data of a variant into its sample data.
 *
 * <p>The value stored under {@code fileDataKey} in the file entry is passed through an optional
 * mapping function and written under {@code sampleDataKey} in the sample entry, but only when the
 * sample has no value for that key yet. It is only usable with files that have exactly one sample
 * and that declare {@code fileDataKey} as an INFO header line.
 */
public class VariantNormalizerExtensionFileToSample extends VariantNormalizerExtension {

    private final String fileDataKey;
    private final String sampleDataKey;
    /** FORMAT header line to register for the sample key; derived from the INFO line when null. */
    private VariantFileHeaderComplexLine newSampleMetadataLine;
    private final Function<String, String> fieldMapper;


    /**
     * Copies {@code fileDataKey} unchanged into the sample data under the same key.
     *
     * @param fileDataKey key to read from the file data and to write in the sample data
     */
    public VariantNormalizerExtensionFileToSample(String fileDataKey) {
        this(fileDataKey, null);
    }

    /**
     * Copies {@code fileDataKey} into the sample data under the same key, transforming the value.
     *
     * @param fileDataKey key to read from the file data and to write in the sample data
     * @param fieldMapper transformation applied to the file value; identity when {@code null}
     */
    public VariantNormalizerExtensionFileToSample(String fileDataKey, Function<String, String> fieldMapper) {
        this.fileDataKey = fileDataKey;
        this.sampleDataKey = fileDataKey;
        newSampleMetadataLine = null;
        this.fieldMapper = fieldMapper == null ? Function.identity() : fieldMapper;
    }

    /**
     * Copies {@code fileDataKey} unchanged into the sample data under a different key.
     *
     * @param fileDataKey           key to read from the file data
     * @param sampleDataKey         key to write in the sample data
     * @param newSampleMetadataLine header line to add for {@code sampleDataKey}; must not be null
     */
    public VariantNormalizerExtensionFileToSample(String fileDataKey, String sampleDataKey,
                                                  VariantFileHeaderComplexLine newSampleMetadataLine) {
        this(fileDataKey, sampleDataKey, newSampleMetadataLine, null);
    }

    /**
     * Copies {@code fileDataKey} into the sample data under a different key, transforming the value.
     *
     * @param fileDataKey           key to read from the file data
     * @param sampleDataKey         key to write in the sample data
     * @param newSampleMetadataLine header line to add for {@code sampleDataKey}; must not be null
     * @param fieldMapper           transformation applied to the file value; identity when {@code null}
     * @throws NullPointerException if {@code newSampleMetadataLine} is null
     */
    public VariantNormalizerExtensionFileToSample(String fileDataKey, String sampleDataKey,
                                                  VariantFileHeaderComplexLine newSampleMetadataLine,
                                                  Function<String, String> fieldMapper) {
        this.fileDataKey = fileDataKey;
        this.sampleDataKey = sampleDataKey;
        this.newSampleMetadataLine = Objects.requireNonNull(newSampleMetadataLine);
        this.fieldMapper = fieldMapper == null ? Function.identity() : fieldMapper;
    }

    /**
     * Accepts only files with exactly one sample that declare {@code fileDataKey} in an INFO header line.
     */
    @Override
    protected boolean canUseExtension(VariantFileMetadata fileMetadata) {
        if (fileMetadata.getSampleIds().size() != 1) {
            // Fields from FILE_DATA can only be moved to SAMPLE_DATA if there is only one sample
            return false;
        }

        // Need to have the field in the INFO
        return getFileHeaderLine(fileMetadata, "INFO", fileDataKey) != null;
    }

    /**
     * Adds a FORMAT header line for {@code sampleDataKey} unless the header already has one.
     * When no line was supplied at construction, it is cloned from the INFO line of {@code fileDataKey}.
     */
    @Override
    protected void normalizeHeader(VariantFileMetadata fileMetadata) {
        if (getFileHeaderLine(fileMetadata, "FORMAT", sampleDataKey) == null) {
            if (newSampleMetadataLine == null) {
                VariantFileHeaderComplexLine info = getFileHeaderLine(fileMetadata, "INFO", fileDataKey);
                newSampleMetadataLine = new VariantFileHeaderComplexLine(
                        "FORMAT",
                        info.getId(),
                        info.getDescription(),
                        info.getNumber(),
                        info.getType(),
                        info.getGenericFields());
            }
            fileMetadata.getHeader().getComplexLines().add(newSampleMetadataLine);
        }
    }

    /**
     * Writes the mapped file value into the sample data when the sample has no value for the key.
     * Does nothing when the file value is null, empty or {@code "."}.
     */
    @Override
    protected void normalizeSample(Variant variant, StudyEntry study, FileEntry file, String sampleId, SampleEntry sample) {
        String fileValue = file.getData().get(fileDataKey);
        if (fileValue == null || fileValue.isEmpty() || fileValue.equals(".")) {
            // Nothing to do
            return;
        }

        // Register the key only when the study does not know it yet, so the position lookup
        // below always resolves.
        if (!study.getSampleDataKeySet().contains(sampleDataKey)) {
            study.addSampleDataKey(sampleDataKey);
        }

        int sampleDataKeyPosition = study.getSampleDataKeyPosition(sampleDataKey);
        // Pad up to and including the target position before reading it: the sample may have
        // fewer values than the study has keys, e.g. right after the key was added.
        while (sample.getData().size() <= sampleDataKeyPosition) {
            sample.getData().add("");
        }

        String sampleValue = sample.getData().get(sampleDataKeyPosition);
        if (StringUtils.isEmpty(sampleValue)) {
            // Never overwrite a value that the sample already carries.
            sample.getData().set(sampleDataKeyPosition, fieldMapper.apply(fileValue));
        }
    }
}

0 comments on commit 5fa58d7

Please sign in to comment.