Skip to content

Commit

Permalink
tools: Rearrange file attributes at VariantNormalizer when splitting …
Browse files Browse the repository at this point in the history
…multi-allelic variants
  • Loading branch information
j-coll committed Aug 28, 2018
1 parent f64286f commit 1c6eb89
Show file tree
Hide file tree
Showing 9 changed files with 181 additions and 58 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
Expand Down Expand Up @@ -409,6 +408,11 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
//Set normalized samples data
try {
List<String> format = entry.getFormat();
if (!normalizedEntry.getFiles().isEmpty()) {
List<FileEntry> files = normalizeFilesInfo(normalizedEntry.getFiles(), rearranger);
normalizedEntry.setFiles(files);
}

if (keyFields.getPhaseSet() != null) {
if (!normalizedEntry.getFormatPositions().containsKey("PS")) {
normalizedEntry.addFormat("PS");
Expand Down Expand Up @@ -438,6 +442,21 @@ public List<Variant> normalize(List<Variant> batch, boolean reuse) throws NonSta
return normalizedVariants;
}

/**
 * Rewrites, in place, every attribute value of the given file entries using the rearranger.
 *
 * Each attribute is passed to {@code rearranger.rearrange(key, value)} and the returned string
 * replaces the previous value in the same attributes map.
 *
 * @param files      file entries whose attributes are updated
 * @param rearranger rearranger applied to each attribute; when {@code null} nothing is modified
 * @return the same {@code files} list instance that was received
 */
private List<FileEntry> normalizeFilesInfo(List<FileEntry> files, VariantAlternateRearranger rearranger) {
    if (rearranger != null) {
        // Values are overwritten through the entry view, so no new map or list is created.
        files.forEach(fileEntry ->
                fileEntry.getAttributes().entrySet().forEach(attribute ->
                        attribute.setValue(rearranger.rearrange(attribute.getKey(), attribute.getValue()))));
    }
    return files;
}

private Collection<VariantKeyFields> sortByPosition(List<VariantKeyFields> keyFieldsList) {
List<VariantKeyFields> sortedKeyFields = new ArrayList<>(keyFieldsList);
sortedKeyFields.sort(Comparator.comparingInt(VariantKeyFields::getStart));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -350,20 +350,24 @@ public String rearrange(String key, String data, @Nullable Integer ploidy) {
configuration.otherFieldsMap.getOrDefault(key, Pair.of(VCFHeaderLineType.String, VCFHeaderLineCount.UNBOUNDED));
VCFHeaderLineType type = pair.getLeft();
String missingValue = type.equals(VCFHeaderLineType.Float) || type.equals(VCFHeaderLineType.Integer) ? "0" : ".";
switch (pair.getRight()) {
case A:
data = rearrangeNumberA(data, missingValue);
break;
case R:
data = rearrangeNumberR(data, missingValue);
break;
case G:
data = rearrangeNumberG(data, missingValue, ploidy);
break;
case INTEGER:
case UNBOUNDED:
default:
// Do not rearrange other fields
try {
switch (pair.getRight()) {
case A:
data = rearrangeNumberA(data, missingValue);
break;
case R:
data = rearrangeNumberR(data, missingValue);
break;
case G:
data = rearrangeNumberG(data, missingValue, ploidy);
break;
case INTEGER:
case UNBOUNDED:
default:
// Do not rearrange other fields
}
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException("Error rearranging key " + key + " = " + data, e);
}
return data;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.stats.VariantStats;

import java.util.*;
Expand Down Expand Up @@ -60,7 +61,15 @@ public VariantAggregatedEVSStatsCalculator(Properties tagMap) {
}

@Override
protected void parseStats(Variant variant, StudyEntry file, int numAllele, String reference, String[] alternateAlleles, Map<String, String> info) {
protected void parseStats(Variant variant, StudyEntry study, int numAllele, String reference, String[] alternateAlleles, Map<String, String> info) {
FileEntry fileEntry = study.getFiles().get(0);
// EVS params are not rearranged when normalizing. Use original call
if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) {
String[] ori = fileEntry.getCall().split(":");
numAllele = Integer.parseInt(ori[3]);
alternateAlleles = ori[2].split(",");
reference = ori[1];
}
VariantStats stats = new VariantStats();
if (info.containsKey("MAF")) {
String splitsMAF[] = info.get("MAF").split(",");
Expand All @@ -72,19 +81,27 @@ protected void parseStats(Variant variant, StudyEntry file, int numAllele, Strin

if (info.containsKey("GTS") && info.containsKey("GTC")) {
String splitsGTC[] = info.get("GTC").split(",");
addGenotypeWithGTS(file.getAttributes(), splitsGTC, reference, alternateAlleles, numAllele, stats);
addGenotypeWithGTS(study.getAttributes(), splitsGTC, reference, alternateAlleles, numAllele, stats);
}
file.setStats(StudyEntry.DEFAULT_COHORT, stats);
study.setStats(StudyEntry.DEFAULT_COHORT, stats);
}

@Override
protected void parseMappedStats(Variant variant, StudyEntry studyEntry,
int numAllele, String reference, String[] alternateAlleles, Map<String, String> info) {
FileEntry fileEntry = studyEntry.getFiles().get(0);
if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) {
String[] ori = fileEntry.getCall().split(":");
numAllele = Integer.parseInt(ori[3]);
alternateAlleles = ori[2].split(",");
reference = ori[1];
}

if (tagMap != null) {
for (String key : info.keySet()) {
String opencgaTag = reverseTagMap.get(key);
String[] values = info.get(key).split(COMMA);
if (opencgaTag != null) {
String[] values = info.get(key).split(COMMA);
String[] opencgaTagSplit = opencgaTag.split(DOT); // a literal point
if (opencgaTagSplit.length == 2) {
String cohort = opencgaTagSplit[0];
Expand Down Expand Up @@ -113,6 +130,7 @@ protected void parseMappedStats(Variant variant, StudyEntry studyEntry,
}
}
} else if (key.equals("MAF")) {
String[] values = info.get(key).split(COMMA);
String groups_order = tagMap.getProperty(VariantAggregatedEVSStatsCalculator.GROUPS_ORDER);
if (groups_order != null) {
String[] populations = groups_order.split(COMMA);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.formats.variant.vcf4.VariantVcfFactory;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.stats.VariantStats;

import java.util.LinkedHashMap;
Expand Down Expand Up @@ -65,13 +66,26 @@ public VariantAggregatedExacStatsCalculator(Properties tagMap) {
}

@Override
protected void parseStats(Variant variant, StudyEntry fileMetadata, int numAllele, String reference, String[] alternateAlleles, Map<String, String> info) {
StudyEntry studyentry = variant.getStudy(fileMetadata.getStudyId());
VariantStats stats = new VariantStats();
protected void parseStats(Variant variant, StudyEntry studyEntry, int numAllele, String reference, String[] alternateAlleles, Map<String, String> info) {
VariantStats stats = new VariantStats();

if (info.containsKey(AC_HET)) { // heterozygous genotype count
// Het count is a non-standard field that cannot be rearranged when decomposing multi-allelic variants.
// Get the original variant call to parse this field.
FileEntry fileEntry = studyEntry.getFiles().get(0);
int numAlleleOri;
String[] alternateAllelesOri;
if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) {
String[] ori = fileEntry.getCall().split(":");
numAlleleOri = Integer.parseInt(ori[3]);
alternateAllelesOri = ori[2].split(",");
} else {
numAlleleOri = numAllele;
alternateAllelesOri = alternateAlleles;
}

String[] hetCounts = info.get(AC_HET).split(COMMA);
addHeterozygousGenotypes(variant, numAllele, alternateAlleles, stats, hetCounts);
addHeterozygousGenotypes(variant, numAlleleOri, alternateAllelesOri, stats, hetCounts);
}

if (info.containsKey(AC_HOM)) { // homozygous genotype count
Expand Down Expand Up @@ -101,7 +115,7 @@ protected void parseStats(Variant variant, StudyEntry fileMetadata, int numAllel
setMaf(an, acCounts, variant.getReference(), alternateAlleles, stats);
}

studyentry.setStats(StudyEntry.DEFAULT_COHORT, stats);
studyEntry.setStats(StudyEntry.DEFAULT_COHORT, stats);
}

@Override
Expand Down Expand Up @@ -132,7 +146,20 @@ protected void parseMappedStats(Variant variant, StudyEntry studyEntry, int numA
ans.put(cohortName, Integer.parseInt(values[0]));
break;
case "HET":
addHeterozygousGenotypes(variant, numAllele, alternateAlleles, cohortStats, values);
// Het count is a non-standard field that cannot be rearranged when decomposing multi-allelic variants.
// Get the original variant call to parse this field.
FileEntry fileEntry = studyEntry.getFiles().get(0);
int numAlleleOri;
String[] alternateAllelesOri;
if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) {
String[] ori = fileEntry.getCall().split(":");
numAlleleOri = Integer.parseInt(ori[3]);
alternateAllelesOri = ori[2].split(",");
} else {
numAlleleOri = numAllele;
alternateAllelesOri = alternateAlleles;
}
addHeterozygousGenotypes(variant, numAlleleOri, alternateAllelesOri, cohortStats, values);
break;
case "HOM":
addHomozygousGenotype(variant, numAllele, alternateAlleles, cohortStats, values);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@
import org.opencb.biodata.formats.variant.vcf4.VariantAggregatedVcfFactory;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.formats.variant.vcf4.VariantVcfFactory;
import org.opencb.biodata.models.variant.avro.AlternateCoordinate;
import org.opencb.biodata.models.variant.avro.FileEntry;
import org.opencb.biodata.models.variant.stats.VariantStats;

import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
* Created by jmmut on 2015-08-25.
Expand Down Expand Up @@ -101,16 +103,15 @@ public void calculate(Variant variant, StudyEntry study) {
FileEntry fileEntry = study.getFiles().get(0);
Map<String, String> infoMap = fileEntry.getAttributes();
int numAllele = 0;
String reference;
String reference = variant.getReference();
String[] alternateAlleles;
if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) {
String[] ori = fileEntry.getCall().split(":");
numAllele = Integer.parseInt(ori[3]);
alternateAlleles = ori[2].split(",");
reference = ori[1];
} else {
reference = variant.getReference();

if (study.getSecondaryAlternates().isEmpty()) {
alternateAlleles = new String[]{variant.getAlternate()};
} else {
List<String> secondaryAlternates = study.getSecondaryAlternates().stream().map(AlternateCoordinate::getAlternate).collect(Collectors.toList());
secondaryAlternates.add(0, variant.getAlternate());
alternateAlleles = secondaryAlternates.toArray(new String[secondaryAlternates.size()]);
}
if (tagMap != null) {
parseMappedStats(variant, study, numAllele, reference, alternateAlleles, infoMap);
Expand Down Expand Up @@ -287,6 +288,20 @@ protected void calculate(Variant variant, StudyEntry studyEntry, int numAllele,
if (attributes.containsKey("GTS")) { // GTS contains the format like: GTS=GG,GT,TT or GTS=A1A1,A1R,RR
addGenotypeWithGTS(attributes, gtcs, reference, alternateAlleles, numAllele, variantStats);
} else {
// Het count is a non-standard field that cannot be rearranged when decomposing multi-allelic variants.
// Get the original variant call to parse this field.
FileEntry fileEntry = studyEntry.getFiles().get(0);
int numAlleleOri;
String[] alternateAllelesOri;
if (fileEntry.getCall() != null && !fileEntry.getCall().isEmpty()) {
String[] ori = fileEntry.getCall().split(":");
numAlleleOri = Integer.parseInt(ori[3]);
alternateAllelesOri = ori[2].split(",");
} else {
numAlleleOri = numAllele;
alternateAllelesOri = alternateAlleles;
}

for (int i = 0; i < gtcs.length; i++) {
String[] gtcSplit = gtcs[i].split(":");
Integer alleles[] = new Integer[2];
Expand All @@ -296,14 +311,14 @@ protected void calculate(Variant variant, StudyEntry studyEntry, int numAllele,
if (gtcSplit.length == 1) { // GTC=0,5,8
getGenotype(i, alleles);
gtc = Integer.parseInt(gtcs[i]);
gt = VariantVcfFactory.mapToMultiallelicIndex(alleles[0], numAllele) + "/" + VariantVcfFactory.mapToMultiallelicIndex(alleles[1], numAllele);
gt = VariantVcfFactory.mapToMultiallelicIndex(alleles[0], numAlleleOri) + "/" + VariantVcfFactory.mapToMultiallelicIndex(alleles[1], numAlleleOri);
} else { // GTC=0/0:0,0/1:5,1/1:8
Matcher matcher = numNum.matcher(gtcSplit[0]);
if (matcher.matches()) { // number/number:number
alleles[0] = Integer.parseInt(matcher.group(1));
alleles[1] = Integer.parseInt(matcher.group(2));
gtc = Integer.parseInt(gtcSplit[1]);
gt = VariantVcfFactory.mapToMultiallelicIndex(alleles[0], numAllele) + "/" + VariantVcfFactory.mapToMultiallelicIndex(alleles[1], numAllele);
gt = VariantVcfFactory.mapToMultiallelicIndex(alleles[0], numAlleleOri) + "/" + VariantVcfFactory.mapToMultiallelicIndex(alleles[1], numAlleleOri);
} else {
if (gtcSplit[0].equals("./.")) { // ./.:number
alleles[0] = -1;
Expand All @@ -316,7 +331,7 @@ protected void calculate(Variant variant, StudyEntry studyEntry, int numAllele,
}
}
if (parseable) {
Genotype genotype = new Genotype(gt, variant.getReference(), alternateAlleles[numAllele]);
Genotype genotype = new Genotype(gt, variant.getReference(), alternateAlleles[numAlleleOri]);
variantStats.addGenotype(genotype, gtc);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -555,25 +555,25 @@ public void testParseInfo() {
List<Variant> result = createAndNormalize(line);
assertEquals(2, result.size());

Variant getVar0 = result.get(0);
StudyEntry getFile0 = getVar0.getStudy(metadata.getId());
assertEquals(4, Integer.parseInt(getFile0.getAttribute("NS")));
// assertEquals(2, Integer.parseInt(getFile0.getAttribute("AN")));
assertEquals(1, Integer.parseInt(getFile0.getAttribute("AC").split(",")[0]));
assertEquals(0.125, Double.parseDouble(getFile0.getAttribute("AF").split(",")[0]), 1e-8);
assertEquals(63, Integer.parseInt(getFile0.getAttribute("DP")));
assertEquals(10685, Integer.parseInt(getFile0.getAttribute("MQ")));
// assertEquals(1, Integer.parseInt(getFile0.getAttribute("MQ0")));

Variant getVar1 = result.get(1);
StudyEntry getFile1 = getVar1.getStudy(metadata.getId());
assertEquals(4, Integer.parseInt(getFile1.getAttribute("NS")));
// assertEquals(2, Integer.parseInt(getFile1.getAttribute("AN")));
assertEquals(2, Integer.parseInt(getFile1.getAttribute("AC").split(",")[1]));
assertEquals(0.25, Double.parseDouble(getFile1.getAttribute("AF").split(",")[1]), 1e-8);
assertEquals(63, Integer.parseInt(getFile1.getAttribute("DP")));
assertEquals(10685, Integer.parseInt(getFile1.getAttribute("MQ")));
// assertEquals(1, Integer.parseInt(getFile1.getAttribute("MQ0")));
Variant variant0 = result.get(0);
StudyEntry study0 = variant0.getStudy(metadata.getId());
assertEquals(4, Integer.parseInt(study0.getAttribute("NS")));
// assertEquals(2, Integer.parseInt(study0.getAttribute("AN")));
assertEquals(1, Integer.parseInt(study0.getAttribute("AC").split(",")[0]));
assertEquals(0.125, Double.parseDouble(study0.getAttribute("AF").split(",")[0]), 1e-8);
assertEquals(63, Integer.parseInt(study0.getAttribute("DP")));
assertEquals(10685, Integer.parseInt(study0.getAttribute("MQ")));
// assertEquals(1, Integer.parseInt(study0.getAttribute("MQ0")));

Variant variant1 = result.get(1);
StudyEntry study1 = variant1.getStudy(metadata.getId());
assertEquals(4, Integer.parseInt(study1.getAttribute("NS")));
// assertEquals(2, Integer.parseInt(study1.getAttribute("AN")));
assertEquals(2, Integer.parseInt(study1.getAttribute("AC").split(",")[0]));
assertEquals(0.25, Double.parseDouble(study1.getAttribute("AF").split(",")[0]), 1e-8);
assertEquals(63, Integer.parseInt(study1.getAttribute("DP")));
assertEquals(10685, Integer.parseInt(study1.getAttribute("MQ")));
// assertEquals(1, Integer.parseInt(study1.getAttribute("MQ0")));
}

private List<Variant> createAndNormalize(String line) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ public void testRearrangeGenotypePloidy2_missingAlleles() {
VariantAlternateRearranger r = new VariantAlternateRearranger(Arrays.asList("A"), Arrays.asList("C", "A", "B"));
assertEquals(".,.,.,.,.,.,.,.,.,.", r.rearrangeNumberG(".", ".", 2));
assertEquals("00,.,.,01,.,11,.,.,.,.", r.rearrangeNumberG("00,01,11", ".", 2));

r = new VariantAlternateRearranger(Arrays.asList("A", "C"), Arrays.asList("C", "A", "B"));
assertEquals("00,02,22,01,12,11,.,.,.,.", r.rearrangeNumberG("00,01,11,02,12,22", ".", 2));

r = new VariantAlternateRearranger(Arrays.asList("A", "C"), Arrays.asList("C", "B", "A"));
assertEquals("00,02,22,.,.,.,01,12,.,11", r.rearrangeNumberG("00,01,11,02,12,22", ".", 2));
}


Expand Down
Loading

0 comments on commit 1c6eb89

Please sign in to comment.