Skip to content

Commit

Permalink
tools: Normalize allele <NON_REF> into <*>. #162
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Aug 31, 2018
1 parent c3c9a9e commit 9786188
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
package org.opencb.biodata.tools.variant;

import htsjdk.samtools.SAMException;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.vcf.*;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -286,7 +287,11 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
List<Variant> normalizedVariants = new ArrayList<>(batch.size());

for (Variant variant : batch) {
if (!isNormalizable(variant)) {
if (variant.getType().equals(VariantType.NO_VARIATION)) {
variant.setAlternate(normalizeNoVariationAlternate(variant.getAlternate()));
normalizedVariants.add(variant);
continue;
} else if (!isNormalizable(variant)) {
normalizedVariants.add(variant);
continue;
}
Expand Down Expand Up @@ -322,9 +327,14 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
}
} else {
for (StudyEntry entry : variant.getStudies()) {
List<String> originalAlternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size());
List<String> alternates = new ArrayList<>(1 + entry.getSecondaryAlternates().size());
alternates.add(alternate);
alternates.addAll(entry.getSecondaryAlternatesAlleles());
originalAlternates.add(alternate);
for (String secondaryAlternatesAllele : entry.getSecondaryAlternatesAlleles()) {
alternates.add(normalizeNoVariationAlternate(secondaryAlternatesAllele));
originalAlternates.add(secondaryAlternatesAllele);
}

// FIXME: assumes there won't be multinucleotide positions with CNVs and short variants mixed
List<VariantKeyFields> keyFieldsList;
Expand All @@ -346,11 +356,15 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
// Remove allele index
callPrefix = call.substring(0, call.lastIndexOf(':') + 1);
} else {
callPrefix = start + ":" + reference + ":" + String.join(",", alternates) + ":";
callPrefix = start + ":" + reference + ":" + String.join(",", originalAlternates) + ":";
}

// Iterate keyFields sorting by position, so the generated variants are ordered. Do not modify original order!
for (VariantKeyFields keyFields : sortByPosition(keyFieldsList)) {
// Skip symbolic NO_VARIATION
if (keyFields.alternate.equals(VariantBuilder.REF_ONLY_ALT)) {
continue;
}
String call = callPrefix + keyFields.getNumAllele();

final Variant normalizedVariant;
Expand Down Expand Up @@ -398,7 +412,7 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta

//Set normalized secondary alternates
List<VariantKeyFields> reorderedKeyFields = reorderVariantKeyFields(chromosome, keyFields, keyFieldsList);
normalizedEntry.setSecondaryAlternates(getSecondaryAlternates(keyFields, reorderedKeyFields));
normalizedEntry.setSecondaryAlternates(getSecondaryAlternates(chromosome, keyFields, reorderedKeyFields));

VariantAlternateRearranger rearranger = null;
if (originalKeyFieldsList.size() > 1 && !reorderedKeyFields.isEmpty()) {
Expand Down Expand Up @@ -442,6 +456,14 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
return normalizedVariants;
}

/**
 * Replaces the {@code VariantBuilder.NON_REF_ALT} allele with
 * {@code VariantBuilder.REF_ONLY_ALT}; every other alternate is returned untouched.
 * NOTE(review): the constants are presumably {@code <NON_REF>} and {@code <*>} — confirm
 * against VariantBuilder.
 *
 * @param alternate alternate allele to inspect; a {@code null} value raises a
 *                  {@link NullPointerException} because {@code equals} is invoked on it
 * @return {@code VariantBuilder.REF_ONLY_ALT} when the input equals
 *         {@code VariantBuilder.NON_REF_ALT}, otherwise the input itself
 */
private String normalizeNoVariationAlternate(String alternate) {
    final boolean isNonRefAllele = alternate.equals(VariantBuilder.NON_REF_ALT);
    return isNonRefAllele ? VariantBuilder.REF_ONLY_ALT : alternate;
}

private List<FileEntry> normalizeFilesInfo(List<FileEntry> files, VariantAlternateRearranger rearranger) {
if (rearranger == null) {
return files;
Expand Down Expand Up @@ -662,7 +684,10 @@ public List<VariantKeyFields> normalize(String chromosome, int position, String
VariantKeyFields keyFields;
final boolean requireLeftAlignment;
// left and right trimming
if (referenceLen == 0) {
if (Allele.wouldBeSymbolicAllele(currentAlternate.getBytes())) {
keyFields = new VariantKeyFields(position, position + referenceLen - 1, numAllelesIdx, reference, currentAlternate, false);
requireLeftAlignment = false;
} else if (referenceLen == 0) {
requireLeftAlignment = this.config.isLeftAlignEnabled();
keyFields = createVariantsFromInsertionEmptyRef(position, currentAlternate);
} else if (alternateLen == 0) {
Expand Down Expand Up @@ -932,7 +957,6 @@ private SequencePair<DNASequence, NucleotideCompound> getPairwiseAlignment(Strin

/**
* Non normalizable variants
* TODO: Add {@link VariantType#SYMBOLIC} variants?
*/
private boolean isNormalizable(Variant variant) {
return !variant.getType().equals(VariantType.NO_VARIATION) && !variant.getType().equals(VariantType.SYMBOLIC);
Expand Down Expand Up @@ -1323,18 +1347,15 @@ public List<VariantKeyFields> reorderVariantKeyFields(String chromosome, Variant
return secondaryAlternates;
}

public List<AlternateCoordinate> getSecondaryAlternates(VariantKeyFields alternate, List<VariantKeyFields> reorderedKeyFields) {
public List<AlternateCoordinate> getSecondaryAlternates(String chromosome, VariantKeyFields alternate, List<VariantKeyFields> reorderedKeyFields) {
List<AlternateCoordinate> secondaryAlternates = new ArrayList<>(reorderedKeyFields.size());
for (VariantKeyFields keyFields : reorderedKeyFields) {
if (!keyFields.equals(alternate)) {
secondaryAlternates.add(new AlternateCoordinate(
// Chromosome is always the same, do not set
null,
//Set position only if is different from the original one
alternate.getStart() == keyFields.getStart() ? null : keyFields.getStart(),
alternate.getEnd() == keyFields.getEnd() ? null : keyFields.getEnd(),
//Set reference only if is different from the original one
alternate.getReference().equals(keyFields.getReference()) ? null : keyFields.getReference(),
chromosome,
keyFields.getStart(),
keyFields.getEnd(),
keyFields.getReference(),
keyFields.getAlternate(),
VariantBuilder.inferType(keyFields.getReference(), keyFields.getAlternate())
));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,12 +196,52 @@ public void testNormalizeSamplesDataMNV2() throws NonStandardCompliantSampleFiel

@Test
public void testNormalizeNoVariation() throws NonStandardCompliantSampleField {
Variant variant = new Variant("2", 10, 1000, "A", "");
variant.setType(VariantType.NO_VARIATION);
Variant variant = new Variant("2", 10, 1000, "A", ".");

assertEquals(VariantType.NO_VARIATION, variant.getType());
Variant normalizedVariant = normalizer.normalize(Collections.singletonList(variant), false).get(0);
assertEquals(variant, normalizedVariant);
}

/**
 * A variant built with the {@code <NON_REF>} alternate must come out of the normalizer
 * carrying {@code <*>} instead, and must equal the input once the input is given the
 * same alternate.
 */
@Test
public void testNormalizeNoVariationSymbolic() throws NonStandardCompliantSampleField {
    final String expectedAlternate = "<*>";
    Variant input = new Variant("2", 10, 1000, "A", "<NON_REF>");

    List<Variant> normalizedBatch = normalizer.normalize(Collections.singletonList(input), false);
    Variant normalized = normalizedBatch.get(0);

    // The symbolic allele is rewritten by the normalizer.
    assertEquals(expectedAlternate, normalized.getAlternate());

    // Align the input with the expected allele before comparing the whole variant.
    input.setAlternate(expectedAlternate);
    assertEquals(input, normalized);
}

/**
 * A multiallelic variant with alternates {@code C,<*>} must normalize into exactly one
 * variant whose file call is {@code 10:A:C,<*>:0}; apart from that call, the JSON form
 * must match the input.
 */
@Test
public void testNormalizeMultiallelicNoVariationSymbolic() throws NonStandardCompliantSampleField {
    final String expectedCall = "10:A:C,<*>:0";
    Variant input = Variant.newBuilder("2", 10, 10, "A", "C,<*>")
            .setStudyId("s")
            .setFileId("f")
            .setFormat(Collections.emptyList())
            .setSamplesData(Collections.emptyList())
            .build();

    List<Variant> normalizedBatch = normalizer.normalize(Collections.singletonList(input), false);

    // Only one variant is expected back for the two alternates.
    assertEquals(1, normalizedBatch.size());
    Variant normalized = normalizedBatch.get(0);
    assertEquals(expectedCall, normalized.getStudies().get(0).getFiles().get(0).getCall());

    // Put the expected call on the input so the JSON comparison covers everything else.
    input.getStudies().get(0).getFiles().get(0).setCall(expectedCall);
    assertEquals(input.toJson(), normalized.toJson());
}

/**
 * A multiallelic variant with alternates {@code C,<NON_REF>} must normalize into exactly
 * one variant. The file call keeps the original {@code <NON_REF>} spelling
 * ({@code 10:A:C,<NON_REF>:0}), while the first secondary alternate is expected to read
 * {@code <*>}.
 */
@Test
public void testNormalizeMultiallelicNoVariationSymbolicNonRef() throws NonStandardCompliantSampleField {
    final String expectedCall = "10:A:C,<NON_REF>:0";
    Variant input = Variant.newBuilder("2", 10, 10, "A", "C,<NON_REF>")
            .setStudyId("s")
            .setFileId("f")
            .setFormat(Collections.emptyList())
            .setSamplesData(Collections.emptyList())
            .build();

    List<Variant> normalizedBatch = normalizer.normalize(Collections.singletonList(input), false);

    // Only one variant is expected back for the two alternates.
    assertEquals(1, normalizedBatch.size());
    Variant normalized = normalizedBatch.get(0);
    assertEquals(expectedCall, normalized.getStudies().get(0).getFiles().get(0).getCall());

    // Bring the input in line with the expected output: same call, secondary alternate as <*>.
    input.getStudies().get(0).getFiles().get(0).setCall(expectedCall);
    input.getStudies().get(0).getSecondaryAlternates().get(0).setAlternate("<*>");
    assertEquals(input.toJson(), normalized.toJson());
}

@Test
Expand Down

0 comments on commit 9786188

Please sign in to comment.