-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
tools: Add basic VariantNormalizerExtensions. #191
- Loading branch information
Showing
3 changed files
with
258 additions
and
0 deletions.
There are no files selected for viewing
90 changes: 90 additions & 0 deletions
90
...va/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtension.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
package org.opencb.biodata.tools.variant.normalizer.extensions; | ||
|
||
import org.opencb.biodata.models.variant.StudyEntry; | ||
import org.opencb.biodata.models.variant.Variant; | ||
import org.opencb.biodata.models.variant.VariantFileMetadata; | ||
import org.opencb.biodata.models.variant.avro.FileEntry; | ||
import org.opencb.biodata.models.variant.avro.SampleEntry; | ||
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine; | ||
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderSimpleLine; | ||
import org.opencb.commons.run.Task; | ||
|
||
import java.util.LinkedList; | ||
import java.util.List; | ||
import java.util.Map; | ||
|
||
public abstract class VariantNormalizerExtension implements Task<Variant, Variant> { | ||
|
||
private VariantFileMetadata metadata; | ||
|
||
public final VariantNormalizerExtension init(VariantFileMetadata metadata) { | ||
this.metadata = metadata; | ||
return this; | ||
} | ||
|
||
@Override | ||
public void pre() throws Exception { | ||
normalizeHeader(metadata); | ||
} | ||
|
||
@Override | ||
public final List<Variant> apply(List<Variant> list) throws Exception { | ||
for (Variant variant : list) { | ||
normalizeVariant(variant); | ||
if (variant.getStudies() == null || variant.getStudies().isEmpty()) { | ||
continue; | ||
} | ||
// Only one study expected | ||
StudyEntry study = variant.getStudies().get(0); | ||
FileEntry fileEntry; | ||
if (study.getFiles() != null && !study.getFiles().isEmpty()) { | ||
// Only one file expected | ||
fileEntry = study.getFiles().get(0); | ||
normalizeFile(variant, study, fileEntry); | ||
if (study.getSamples() != null) { | ||
for (Map.Entry<String, Integer> entry : study.getSamplesPosition().entrySet()) { | ||
normalizeSample(variant, study, fileEntry, entry.getKey(), study.getSample(entry.getValue())); | ||
} | ||
for (FileEntry file : study.getFiles()) { | ||
normalizeFile(variant, study, file); | ||
} | ||
} | ||
} | ||
} | ||
return list; | ||
} | ||
|
||
protected final VariantFileHeaderComplexLine getFileHeaderLine(VariantFileMetadata fileMetadata, String key, String id) { | ||
for (VariantFileHeaderComplexLine line : fileMetadata.getHeader().getComplexLines()) { | ||
if (line.getKey().equals(key)) { | ||
if (line.getId().equals(id)) { | ||
return line; | ||
} | ||
} | ||
} | ||
return null; | ||
} | ||
|
||
protected final List<VariantFileHeaderSimpleLine> getFileHeaderLine(VariantFileMetadata fileMetadata, String key) { | ||
List<VariantFileHeaderSimpleLine> lines = new LinkedList<>(); | ||
for (VariantFileHeaderSimpleLine line : fileMetadata.getHeader().getSimpleLines()) { | ||
if (line.getKey().equals(key)) { | ||
lines.add(line); | ||
} | ||
} | ||
return lines; | ||
} | ||
|
||
protected abstract boolean canUseExtension(VariantFileMetadata fileMetadata); | ||
|
||
protected void normalizeHeader(VariantFileMetadata fileMetadata) {} | ||
|
||
protected void normalizeVariant(Variant variant) {} | ||
|
||
protected void normalizeSample(Variant variant, StudyEntry study, FileEntry file, String sampleId, SampleEntry sample) {} | ||
|
||
protected void normalizeFile(Variant variant, StudyEntry study, FileEntry file) {} | ||
|
||
} | ||
|
||
|
66 changes: 66 additions & 0 deletions
66
...opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFactory.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package org.opencb.biodata.tools.variant.normalizer.extensions; | ||
|
||
import org.apache.commons.lang3.StringUtils; | ||
import org.opencb.biodata.models.variant.Variant; | ||
import org.opencb.biodata.models.variant.VariantFileMetadata; | ||
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine; | ||
import org.opencb.commons.run.Task; | ||
|
||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.HashSet; | ||
import java.util.Set; | ||
|
||
public class VariantNormalizerExtensionFactory { | ||
|
||
public static final Set<String> ALL_EXTENSIONS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList( | ||
"FILE_DP_TO_SAMPLE", | ||
"SAMPLE_DP_TO_FORMAT" | ||
))); | ||
private final Set<String> enabledExtensions; | ||
|
||
public VariantNormalizerExtensionFactory() { | ||
this(ALL_EXTENSIONS); | ||
} | ||
|
||
public VariantNormalizerExtensionFactory(Set<String> enabledExtensions) { | ||
this.enabledExtensions = enabledExtensions; | ||
} | ||
|
||
|
||
public Task<Variant, Variant> buildExtensions(VariantFileMetadata fileMetadata) { | ||
Task<Variant, Variant> extensions = null; | ||
for (String normalizerExtension : enabledExtensions) { | ||
VariantNormalizerExtension extension; | ||
switch (normalizerExtension) { | ||
case "FILE_DP_TO_SAMPLE": | ||
extension = new VariantNormalizerExtensionFileToSample("DP"); | ||
break; | ||
case "FILE_AD_TO_SAMPLE_DP": | ||
extension = new VariantNormalizerExtensionFileToSample("AD", "DP", | ||
new VariantFileHeaderComplexLine("FORMAT", "DP", "", "1", "Integer", Collections.emptyMap()), | ||
ad -> { | ||
String[] split = ad.split(","); | ||
int dp = 0; | ||
for (String s : split) { | ||
dp += Integer.parseInt(s); | ||
} | ||
return String.valueOf(dp); | ||
}); | ||
break; | ||
default: | ||
throw new IllegalArgumentException("Unknown normalizer extension " + normalizerExtension); | ||
} | ||
if (extension.canUseExtension(fileMetadata)) { | ||
extension.init(fileMetadata); | ||
if (extensions == null) { | ||
extensions = extension; | ||
} else { | ||
extensions = extensions.then(extension); | ||
} | ||
} | ||
} | ||
return extensions; | ||
} | ||
|
||
} |
102 changes: 102 additions & 0 deletions
102
...b/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFileToSample.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
package org.opencb.biodata.tools.variant.normalizer.extensions; | ||
|
||
import org.apache.commons.lang3.StringUtils; | ||
import org.opencb.biodata.models.variant.StudyEntry; | ||
import org.opencb.biodata.models.variant.Variant; | ||
import org.opencb.biodata.models.variant.VariantFileMetadata; | ||
import org.opencb.biodata.models.variant.avro.FileEntry; | ||
import org.opencb.biodata.models.variant.avro.SampleEntry; | ||
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine; | ||
|
||
import java.util.Objects; | ||
import java.util.function.Function; | ||
|
||
public class VariantNormalizerExtensionFileToSample extends VariantNormalizerExtension { | ||
|
||
private final String fileDataKey; | ||
private final String sampleDataKey; | ||
private VariantFileHeaderComplexLine newSampleMetadataLine; | ||
private final Function<String, String> fieldMapper; | ||
|
||
|
||
public VariantNormalizerExtensionFileToSample(String fileDataKey) { | ||
this(fileDataKey, null); | ||
} | ||
|
||
public VariantNormalizerExtensionFileToSample(String fileDataKey, Function<String, String> fieldMapper) { | ||
this.fileDataKey = fileDataKey; | ||
this.sampleDataKey = fileDataKey; | ||
newSampleMetadataLine = null; | ||
this.fieldMapper = fieldMapper == null ? Function.identity() : fieldMapper; | ||
} | ||
|
||
public VariantNormalizerExtensionFileToSample(String fileDataKey, String sampleDataKey, | ||
VariantFileHeaderComplexLine newSampleMetadataLine) { | ||
this(fileDataKey, sampleDataKey, newSampleMetadataLine, null); | ||
} | ||
|
||
public VariantNormalizerExtensionFileToSample(String fileDataKey, String sampleDataKey, | ||
VariantFileHeaderComplexLine newSampleMetadataLine, | ||
Function<String, String> fieldMapper) { | ||
this.fileDataKey = fileDataKey; | ||
this.sampleDataKey = sampleDataKey; | ||
this.newSampleMetadataLine = Objects.requireNonNull(newSampleMetadataLine); | ||
this.fieldMapper = fieldMapper == null ? Function.identity() : fieldMapper; | ||
} | ||
|
||
@Override | ||
protected boolean canUseExtension(VariantFileMetadata fileMetadata) { | ||
if (fileMetadata.getSampleIds().size() != 1) { | ||
// Fields from FILE_DATA can only be moved to SAMPLE_DATA if there is only one sample | ||
return false; | ||
} | ||
|
||
VariantFileHeaderComplexLine headerLine = getFileHeaderLine(fileMetadata, "INFO", fileDataKey); | ||
if (headerLine == null) { | ||
// Need to have the field in the INFO | ||
return false; | ||
} | ||
|
||
return true; | ||
} | ||
|
||
@Override | ||
protected void normalizeHeader(VariantFileMetadata fileMetadata) { | ||
if (getFileHeaderLine(fileMetadata, "FORMAT", sampleDataKey) == null) { | ||
if (newSampleMetadataLine == null) { | ||
VariantFileHeaderComplexLine info = getFileHeaderLine(fileMetadata, "INFO", fileDataKey); | ||
newSampleMetadataLine = new VariantFileHeaderComplexLine( | ||
"FORMAT", | ||
info.getId(), | ||
info.getDescription(), | ||
info.getNumber(), | ||
info.getType(), | ||
info.getGenericFields()); | ||
} | ||
fileMetadata.getHeader().getComplexLines().add(newSampleMetadataLine); | ||
} | ||
} | ||
|
||
@Override | ||
protected void normalizeSample(Variant variant, StudyEntry study, FileEntry file, String sampleId, SampleEntry sample) { | ||
String fileValue = file.getData().get(fileDataKey); | ||
if (fileValue == null || fileValue.isEmpty() || fileValue.equals(".")) { | ||
// Nothing to do | ||
return; | ||
} | ||
|
||
if (study.getSampleDataKeySet().contains(sampleDataKey)) { | ||
study.addSampleDataKey(sampleDataKey); | ||
} | ||
|
||
Integer sampleDataKeyPosition = study.getSampleDataKeyPosition(sampleDataKey); | ||
String sampleValue = sample.getData().get(sampleDataKeyPosition); | ||
if (StringUtils.isEmpty(sampleValue)) { | ||
String newSampleValue = fieldMapper.apply(fileValue); | ||
while (sample.getData().size() < sampleDataKeyPosition) { | ||
sample.getData().add(""); | ||
} | ||
sample.getData().set(sampleDataKeyPosition, newSampleValue); | ||
} | ||
} | ||
} |