From 978618886c60ed61ffed14082896901bdc291b32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 31 Aug 2018 17:27:06 +0100 Subject: [PATCH] tools: Normalize allele into <*>. #162 --- .../tools/variant/VariantNormalizer.java | 49 +++++++++++++------ .../tools/variant/VariantNormalizerTest.java | 44 ++++++++++++++++- 2 files changed, 77 insertions(+), 16 deletions(-) diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java index 1743ba35a..b80623c05 100644 --- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java +++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantNormalizer.java @@ -20,6 +20,7 @@ package org.opencb.biodata.tools.variant; import htsjdk.samtools.SAMException; +import htsjdk.variant.variantcontext.Allele; import htsjdk.variant.vcf.*; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; @@ -286,7 +287,11 @@ public List normalize(List batch, boolean reuse) throws NonSta List normalizedVariants = new ArrayList<>(batch.size()); for (Variant variant : batch) { - if (!isNormalizable(variant)) { + if (variant.getType().equals(VariantType.NO_VARIATION)) { + variant.setAlternate(normalizeNoVariationAlternate(variant.getAlternate())); + normalizedVariants.add(variant); + continue; + } else if (!isNormalizable(variant)) { normalizedVariants.add(variant); continue; } @@ -322,9 +327,14 @@ public List normalize(List batch, boolean reuse) throws NonSta } } else { for (StudyEntry entry : variant.getStudies()) { + List originalAlternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size()); List alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size()); alternates.add(alternate); - alternates.addAll(entry.getSecondaryAlternatesAlleles()); + originalAlternates.add(alternate); + for (String secondaryAlternatesAllele : entry.getSecondaryAlternatesAlleles()) { + alternates.add(normalizeNoVariationAlternate(secondaryAlternatesAllele)); + originalAlternates.add(secondaryAlternatesAllele); + } // FIXME: assumes there wont be multinucleotide positions with CNVs and short variants mixed List keyFieldsList; @@ -346,11 +356,15 @@ public List normalize(List batch, boolean reuse) throws NonSta // Remove allele index callPrefix = call.substring(0, call.lastIndexOf(':') + 1); } else { - callPrefix = start + ":" + reference + ":" + String.join(",", alternates) + ":"; + callPrefix = start + ":" + reference + ":" + String.join(",", originalAlternates) + ":"; } // Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order! for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) { + // Skip symbolic NO_VARIATION + if (keyFields.alternate.equals(VariantBuilder.REF_ONLY_ALT)) { + continue; + } String call = callPrefix + keyFields.getNumAllele(); final Variant normalizedVariant; @@ -398,7 +412,7 @@ public List normalize(List batch, boolean reuse) throws NonSta //Set normalized secondary alternates List reorderedKeyFields = reorderVariantKeyFields(chromosome, keyFields, keyFieldsList); - normalizedEntry.setSecondaryAlternates(getSecondaryAlternates(keyFields, reorderedKeyFields)); + normalizedEntry.setSecondaryAlternates(getSecondaryAlternates(chromosome, keyFields, reorderedKeyFields)); VariantAlternateRearranger rearranger = null; if (originalKeyFieldsList.size() > 1 && !reorderedKeyFields.isEmpty()) { @@ -442,6 +456,14 @@ public List normalize(List batch, boolean reuse) throws NonSta return normalizedVariants; } + private String normalizeNoVariationAlternate(String alternate) { + if (alternate.equals(VariantBuilder.NON_REF_ALT)) { + return VariantBuilder.REF_ONLY_ALT; + } else { + return alternate; + } + } + private List normalizeFilesInfo(List files, VariantAlternateRearranger rearranger) { if (rearranger == null) { return files; @@ -662,7 +684,10 @@ public List normalize(String chromosome, int position, String VariantKeyFields keyFields; final boolean requireLeftAlignment; // left and right trimming - if (referenceLen == 0) { + if (Allele.wouldBeSymbolicAllele(currentAlternate.getBytes())) { + keyFields = new VariantKeyFields(position, position + referenceLen - 1, numAllelesIdx, reference, currentAlternate, false); + requireLeftAlignment = false; + } else if (referenceLen == 0) { requireLeftAlignment = this.config.isLeftAlignEnabled(); keyFields = createVariantsFromInsertionEmptyRef(position, currentAlternate); } else if (alternateLen == 0) { @@ -932,7 +957,6 @@ private SequencePair getPairwiseAlignment(Strin /** * Non normalizable variants - * TODO: Add {@link VariantType#SYMBOLIC} variants? */ private boolean isNormalizable(Variant variant) { return !variant.getType().equals(VariantType.NO_VARIATION) && !variant.getType().equals(VariantType.SYMBOLIC); @@ -1323,18 +1347,15 @@ public List reorderVariantKeyFields(String chromosome, Variant return secondaryAlternates; } - public List getSecondaryAlternates(VariantKeyFields alternate, List reorderedKeyFields) { + public List getSecondaryAlternates(String chromosome, VariantKeyFields alternate, List reorderedKeyFields) { List secondaryAlternates = new ArrayList<>(reorderedKeyFields.size()); for (VariantKeyFields keyFields : reorderedKeyFields) { if (!keyFields.equals(alternate)) { secondaryAlternates.add(new AlternateCoordinate( - // Chromosome is always the same, do not set - null, - //Set position only if is different from the original one - alternate.getStart() == keyFields.getStart() ? null : keyFields.getStart(), - alternate.getEnd() == keyFields.getEnd() ? null : keyFields.getEnd(), - //Set reference only if is different from the original one - alternate.getReference().equals(keyFields.getReference()) ? null : keyFields.getReference(), + chromosome, + keyFields.getStart(), + keyFields.getEnd(), + keyFields.getReference(), keyFields.getAlternate(), VariantBuilder.inferType(keyFields.getReference(), keyFields.getAlternate()) )); diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java index cdf915f37..b88e3aaed 100644 --- a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java +++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantNormalizerTest.java @@ -196,12 +196,52 @@ public void testNormalizeSamplesDataMNV2() throws NonStandardCompliantSampleFiel @Test public void testNormalizeNoVariation() throws NonStandardCompliantSampleField { - Variant variant = new Variant("2", 10, 1000, "A", ""); - variant.setType(VariantType.NO_VARIATION); + Variant variant = new Variant("2", 10, 1000, "A", "."); + assertEquals(VariantType.NO_VARIATION, variant.getType()); Variant normalizedVariant = normalizer.normalize(Collections.singletonList(variant), false).get(0); assertEquals(variant, normalizedVariant); + } + + @Test + public void testNormalizeNoVariationSymbolic() throws NonStandardCompliantSampleField { + Variant variant = new Variant("2", 10, 1000, "A", ""); + + Variant normalizedVariant = normalizer.normalize(Collections.singletonList(variant), false).get(0); + assertEquals("<*>", normalizedVariant.getAlternate()); + variant.setAlternate("<*>"); + assertEquals(variant, normalizedVariant); + } + @Test + public void testNormalizeMultiallelicNoVariationSymbolic() throws NonStandardCompliantSampleField { + Variant variant = Variant.newBuilder("2", 10, 10, "A", "C,<*>").setStudyId("s").setFileId("f") + .setFormat(Collections.emptyList()) + .setSamplesData(Collections.emptyList()).build(); + + List variants = normalizer.normalize(Collections.singletonList(variant), false); + assertEquals(1, variants.size()); + Variant normalizedVariant = variants.get(0); + String call = normalizedVariant.getStudies().get(0).getFiles().get(0).getCall(); + assertEquals("10:A:C,<*>:0", call); + variant.getStudies().get(0).getFiles().get(0).setCall("10:A:C,<*>:0"); + assertEquals(variant.toJson(), normalizedVariant.toJson()); + } + + @Test + public void testNormalizeMultiallelicNoVariationSymbolicNonRef() throws NonStandardCompliantSampleField { + Variant variant = Variant.newBuilder("2", 10, 10, "A", "C,").setStudyId("s").setFileId("f") + .setFormat(Collections.emptyList()) + .setSamplesData(Collections.emptyList()).build(); + + List variants = normalizer.normalize(Collections.singletonList(variant), false); + assertEquals(1, variants.size()); + Variant normalizedVariant = variants.get(0); + String call = normalizedVariant.getStudies().get(0).getFiles().get(0).getCall(); + assertEquals("10:A:C,:0", call); + variant.getStudies().get(0).getFiles().get(0).setCall("10:A:C,:0"); + variant.getStudies().get(0).getSecondaryAlternates().get(0).setAlternate("<*>"); + assertEquals(variant.toJson(), normalizedVariant.toJson()); } @Test