From 5fa58d7dc60e8998f8455e3f2f5b3d1b93b6b193 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?=
Date: Wed, 23 Sep 2020 19:07:50 +0100
Subject: [PATCH] tools: Add basic VariantNormalizerExtensions. #191

---
 .../VariantNormalizerExtension.java           | 125 +++++++++++++++++
 .../VariantNormalizerExtensionFactory.java    |  97 +++++++++++++
 ...ariantNormalizerExtensionFileToSample.java | 127 ++++++++++++++++++
 3 files changed, 349 insertions(+)
 create mode 100644 biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtension.java
 create mode 100644 biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFactory.java
 create mode 100644 biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFileToSample.java

diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtension.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtension.java
new file mode 100644
index 000000000..83ac8321d
--- /dev/null
+++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtension.java
@@ -0,0 +1,125 @@
+package org.opencb.biodata.tools.variant.normalizer.extensions;
+
+import org.opencb.biodata.models.variant.StudyEntry;
+import org.opencb.biodata.models.variant.Variant;
+import org.opencb.biodata.models.variant.VariantFileMetadata;
+import org.opencb.biodata.models.variant.avro.FileEntry;
+import org.opencb.biodata.models.variant.avro.SampleEntry;
+import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine;
+import org.opencb.biodata.models.variant.metadata.VariantFileHeaderSimpleLine;
+import org.opencb.commons.run.Task;
+
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Base class for optional normalization steps applied to batches of variants.
+ *
+ * <p>Subclasses override the {@code normalize*} hooks they need; every hook is a no-op by default.
+ * {@link #init(VariantFileMetadata)} must be called before running the task, because
+ * {@link #pre()} hands the stored metadata to {@link #normalizeHeader(VariantFileMetadata)}.
+ */
+public abstract class VariantNormalizerExtension implements Task<Variant, Variant> {
+
+    private VariantFileMetadata metadata;
+
+    /**
+     * Stores the file metadata that {@link #pre()} passes to the header hook.
+     *
+     * @param metadata metadata of the file being normalized
+     * @return this extension, to allow chaining
+     */
+    public final VariantNormalizerExtension init(VariantFileMetadata metadata) {
+        this.metadata = metadata;
+        return this;
+    }
+
+    /** Runs the header hook with the metadata given to {@link #init(VariantFileMetadata)}. */
+    @Override
+    public void pre() throws Exception {
+        normalizeHeader(metadata);
+    }
+
+    /**
+     * Runs the variant, file and sample hooks, in that order, over every variant of the batch.
+     * Only the first study and the first file of each variant are processed.
+     *
+     * @param list batch of variants, modified in place by the hooks
+     * @return the same list instance
+     */
+    @Override
+    public final List<Variant> apply(List<Variant> list) throws Exception {
+        for (Variant variant : list) {
+            normalizeVariant(variant);
+            if (variant.getStudies() == null || variant.getStudies().isEmpty()) {
+                continue;
+            }
+            // Only one study expected
+            StudyEntry study = variant.getStudies().get(0);
+            if (study.getFiles() == null || study.getFiles().isEmpty()) {
+                continue;
+            }
+            // Only one file expected. It is normalized exactly once, so that hooks do not
+            // need to be idempotent.
+            FileEntry fileEntry = study.getFiles().get(0);
+            normalizeFile(variant, study, fileEntry);
+            if (study.getSamples() != null) {
+                for (Map.Entry<String, Integer> entry : study.getSamplesPosition().entrySet()) {
+                    normalizeSample(variant, study, fileEntry, entry.getKey(), study.getSample(entry.getValue()));
+                }
+            }
+        }
+        return list;
+    }
+
+    /**
+     * Finds the complex header line with the given key (e.g. "INFO", "FORMAT") and id.
+     *
+     * @param fileMetadata metadata holding the header to search
+     * @param key header line key
+     * @param id header line id
+     * @return the matching line, or {@code null} if there is none
+     */
+    protected final VariantFileHeaderComplexLine getFileHeaderLine(VariantFileMetadata fileMetadata, String key, String id) {
+        for (VariantFileHeaderComplexLine line : fileMetadata.getHeader().getComplexLines()) {
+            if (line.getKey().equals(key) && line.getId().equals(id)) {
+                return line;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Collects every simple header line with the given key.
+     *
+     * @param fileMetadata metadata holding the header to search
+     * @param key header line key
+     * @return the matching lines, possibly empty
+     */
+    protected final List<VariantFileHeaderSimpleLine> getFileHeaderLine(VariantFileMetadata fileMetadata, String key) {
+        List<VariantFileHeaderSimpleLine> lines = new LinkedList<>();
+        for (VariantFileHeaderSimpleLine line : fileMetadata.getHeader().getSimpleLines()) {
+            if (line.getKey().equals(key)) {
+                lines.add(line);
+            }
+        }
+        return lines;
+    }
+
+    /** Tells whether this extension is applicable to the given file. */
+    protected abstract boolean canUseExtension(VariantFileMetadata fileMetadata);
+
+    /** Hook invoked from {@link #pre()}; may modify the file header. No-op by default. */
+    protected void normalizeHeader(VariantFileMetadata fileMetadata) {}
+
+    /** Hook invoked once per variant, before the file and sample hooks. No-op by default. */
+    protected void normalizeVariant(Variant variant) {}
+
+    /** Hook invoked once per sample of the first study, after the file hook. No-op by default. */
+    protected void normalizeSample(Variant variant, StudyEntry study, FileEntry file, String sampleId, SampleEntry sample) {}
+
+    /** Hook invoked once per variant for the first file of the first study. No-op by default. */
+    protected void normalizeFile(Variant variant, StudyEntry study, FileEntry file) {}
+
+}
diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFactory.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFactory.java
new file mode 100644
index 000000000..d218de4ec
--- /dev/null
+++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFactory.java
@@ -0,0 +1,97 @@
+package org.opencb.biodata.tools.variant.normalizer.extensions;
+
+import org.apache.commons.lang3.StringUtils;
+import org.opencb.biodata.models.variant.Variant;
+import org.opencb.biodata.models.variant.VariantFileMetadata;
+import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine;
+import org.opencb.commons.run.Task;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+/**
+ * Builds the chain of {@link VariantNormalizerExtension} tasks that apply to a given file.
+ */
+public class VariantNormalizerExtensionFactory {
+
+    /** Names of every extension that {@link #buildExtensions(VariantFileMetadata)} can build. */
+    public static final Set<String> ALL_EXTENSIONS = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
+            "FILE_DP_TO_SAMPLE",
+            "FILE_AD_TO_SAMPLE_DP"
+    )));
+    private final Set<String> enabledExtensions;
+
+    /** Creates a factory with all the known extensions enabled. */
+    public VariantNormalizerExtensionFactory() {
+        this(ALL_EXTENSIONS);
+    }
+
+    /**
+     * Creates a factory restricted to the given extensions.
+     *
+     * @param enabledExtensions names of the extensions to build, see {@link #ALL_EXTENSIONS}
+     */
+    public VariantNormalizerExtensionFactory(Set<String> enabledExtensions) {
+        this.enabledExtensions = enabledExtensions;
+    }
+
+    /**
+     * Instantiates the enabled extensions that can be used with the given file and chains them
+     * into a single task.
+     *
+     * @param fileMetadata metadata of the file to normalize
+     * @return the chained task, or {@code null} if no enabled extension applies to the file
+     * @throws IllegalArgumentException if an enabled extension name is unknown
+     */
+    public Task<Variant, Variant> buildExtensions(VariantFileMetadata fileMetadata) {
+        Task<Variant, Variant> extensions = null;
+        for (String normalizerExtension : enabledExtensions) {
+            VariantNormalizerExtension extension;
+            switch (normalizerExtension) {
+                case "FILE_DP_TO_SAMPLE":
+                    extension = new VariantNormalizerExtensionFileToSample("DP");
+                    break;
+                case "FILE_AD_TO_SAMPLE_DP":
+                    extension = new VariantNormalizerExtensionFileToSample("AD", "DP",
+                            new VariantFileHeaderComplexLine("FORMAT", "DP", "", "1", "Integer", Collections.emptyMap()),
+                            VariantNormalizerExtensionFactory::sumAlleleDepths);
+                    break;
+                default:
+                    throw new IllegalArgumentException("Unknown normalizer extension " + normalizerExtension);
+            }
+            if (extension.canUseExtension(fileMetadata)) {
+                extension.init(fileMetadata);
+                if (extensions == null) {
+                    extensions = extension;
+                } else {
+                    extensions = extensions.then(extension);
+                }
+            }
+        }
+        return extensions;
+    }
+
+    /**
+     * Adds up the comma separated depths of an AD value to obtain the total depth.
+     * Missing entries (empty or ".") are skipped instead of failing the whole batch.
+     *
+     * @param ad comma separated allele depths, e.g. "10,5"
+     * @return the sum as a string, or "." if every entry is missing
+     */
+    private static String sumAlleleDepths(String ad) {
+        int dp = 0;
+        boolean anyValue = false;
+        for (String s : ad.split(",")) {
+            String depth = s.trim();
+            if (depth.isEmpty() || depth.equals(".")) {
+                continue;
+            }
+            dp += Integer.parseInt(depth);
+            anyValue = true;
+        }
+        return anyValue ? String.valueOf(dp) : ".";
+    }
+
+}
diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFileToSample.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFileToSample.java
new file mode 100644
index 000000000..ef9452fe8
--- /dev/null
+++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/normalizer/extensions/VariantNormalizerExtensionFileToSample.java
@@ -0,0 +1,127 @@
+package org.opencb.biodata.tools.variant.normalizer.extensions;
+
+import org.apache.commons.lang3.StringUtils;
+import org.opencb.biodata.models.variant.StudyEntry;
+import org.opencb.biodata.models.variant.Variant;
+import org.opencb.biodata.models.variant.VariantFileMetadata;
+import org.opencb.biodata.models.variant.avro.FileEntry;
+import org.opencb.biodata.models.variant.avro.SampleEntry;
+import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine;
+
+import java.util.Objects;
+import java.util.function.Function;
+
+/**
+ * Copies a value from the file data (INFO) into the sample data (FORMAT) of single-sample files,
+ * whenever the sample does not already have a value for that field.
+ */
+public class VariantNormalizerExtensionFileToSample extends VariantNormalizerExtension {
+
+    private final String fileDataKey;
+    private final String sampleDataKey;
+    private VariantFileHeaderComplexLine newSampleMetadataLine;
+    private final Function<String, String> fieldMapper;
+
+    /**
+     * Copies {@code fileDataKey} unchanged into the sample field of the same name.
+     *
+     * @param fileDataKey key of the file (INFO) field, also used as sample (FORMAT) key
+     */
+    public VariantNormalizerExtensionFileToSample(String fileDataKey) {
+        this(fileDataKey, null);
+    }
+
+    /**
+     * Copies {@code fileDataKey} into the sample field of the same name, transforming the value.
+     *
+     * @param fileDataKey key of the file (INFO) field, also used as sample (FORMAT) key
+     * @param fieldMapper converts the file value into the sample value; {@code null} means identity
+     */
+    public VariantNormalizerExtensionFileToSample(String fileDataKey, Function<String, String> fieldMapper) {
+        this.fileDataKey = fileDataKey;
+        this.sampleDataKey = fileDataKey;
+        this.newSampleMetadataLine = null;
+        this.fieldMapper = fieldMapper == null ? Function.identity() : fieldMapper;
+    }
+
+    /**
+     * Copies {@code fileDataKey} unchanged into the sample field {@code sampleDataKey}.
+     *
+     * @param fileDataKey key of the file (INFO) field to read
+     * @param sampleDataKey key of the sample (FORMAT) field to fill
+     * @param newSampleMetadataLine header line to add if the FORMAT field is not declared; not null
+     */
+    public VariantNormalizerExtensionFileToSample(String fileDataKey, String sampleDataKey,
+            VariantFileHeaderComplexLine newSampleMetadataLine) {
+        this(fileDataKey, sampleDataKey, newSampleMetadataLine, null);
+    }
+
+    /**
+     * Copies {@code fileDataKey} into the sample field {@code sampleDataKey}, transforming the value.
+     *
+     * @param fileDataKey key of the file (INFO) field to read
+     * @param sampleDataKey key of the sample (FORMAT) field to fill
+     * @param newSampleMetadataLine header line to add if the FORMAT field is not declared; not null
+     * @param fieldMapper converts the file value into the sample value; {@code null} means identity
+     */
+    public VariantNormalizerExtensionFileToSample(String fileDataKey, String sampleDataKey,
+            VariantFileHeaderComplexLine newSampleMetadataLine,
+            Function<String, String> fieldMapper) {
+        this.fileDataKey = fileDataKey;
+        this.sampleDataKey = sampleDataKey;
+        this.newSampleMetadataLine = Objects.requireNonNull(newSampleMetadataLine);
+        this.fieldMapper = fieldMapper == null ? Function.identity() : fieldMapper;
+    }
+
+    @Override
+    protected boolean canUseExtension(VariantFileMetadata fileMetadata) {
+        if (fileMetadata.getSampleIds().size() != 1) {
+            // Fields from FILE_DATA can only be moved to SAMPLE_DATA if there is only one sample
+            return false;
+        }
+
+        // Need to have the field in the INFO
+        return getFileHeaderLine(fileMetadata, "INFO", fileDataKey) != null;
+    }
+
+    @Override
+    protected void normalizeHeader(VariantFileMetadata fileMetadata) {
+        if (getFileHeaderLine(fileMetadata, "FORMAT", sampleDataKey) == null) {
+            if (newSampleMetadataLine == null) {
+                // No explicit line given: declare the FORMAT field as a copy of the INFO one
+                VariantFileHeaderComplexLine info = getFileHeaderLine(fileMetadata, "INFO", fileDataKey);
+                newSampleMetadataLine = new VariantFileHeaderComplexLine(
+                        "FORMAT",
+                        info.getId(),
+                        info.getDescription(),
+                        info.getNumber(),
+                        info.getType(),
+                        info.getGenericFields());
+            }
+            fileMetadata.getHeader().getComplexLines().add(newSampleMetadataLine);
+        }
+    }
+
+    @Override
+    protected void normalizeSample(Variant variant, StudyEntry study, FileEntry file, String sampleId, SampleEntry sample) {
+        String fileValue = file.getData().get(fileDataKey);
+        if (fileValue == null || fileValue.isEmpty() || fileValue.equals(".")) {
+            // Nothing to do
+            return;
+        }
+
+        // Register the key only when the study does not know it yet
+        if (!study.getSampleDataKeySet().contains(sampleDataKey)) {
+            study.addSampleDataKey(sampleDataKey);
+        }
+
+        int sampleDataKeyPosition = study.getSampleDataKeyPosition(sampleDataKey);
+        // Pad up to and including the target position, so that get/set below are in range
+        while (sample.getData().size() <= sampleDataKeyPosition) {
+            sample.getData().add("");
+        }
+        if (StringUtils.isEmpty(sample.getData().get(sampleDataKeyPosition))) {
+            sample.getData().set(sampleDataKeyPosition, fieldMapper.apply(fileValue));
+        }
+    }
+}