-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
tools: Generate missing reference blocks with missing genotypes. #170
- Loading branch information
Showing
4 changed files
with
241 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
162 changes: 162 additions & 0 deletions
162
...ools/src/main/java/org/opencb/biodata/tools/variant/VariantReferenceBlockCreatorTask.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
package org.opencb.biodata.tools.variant; | ||
|
||
import htsjdk.samtools.SAMSequenceRecord; | ||
import htsjdk.variant.vcf.VCFConstants; | ||
import htsjdk.variant.vcf.VCFContigHeaderLine; | ||
import htsjdk.variant.vcf.VCFHeader; | ||
import org.apache.commons.lang.StringUtils; | ||
import org.opencb.biodata.models.core.Region; | ||
import org.opencb.biodata.models.variant.StudyEntry; | ||
import org.opencb.biodata.models.variant.Variant; | ||
import org.opencb.biodata.models.variant.VariantBuilder; | ||
import org.opencb.biodata.models.variant.metadata.VariantFileHeader; | ||
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine; | ||
import org.opencb.commons.run.Task; | ||
|
||
import java.util.*; | ||
|
||
public class VariantReferenceBlockCreatorTask implements Task<Variant, Variant> { | ||
|
||
|
||
private String chromosome= null; | ||
private int position; | ||
private int end; | ||
private String studyId; | ||
private String fileId; | ||
private LinkedHashMap<String, Integer> samplesPosition; | ||
private List<List<String>> missingGtSamplesData; | ||
private Map<String, Integer> contigs; | ||
|
||
public VariantReferenceBlockCreatorTask() { | ||
} | ||
|
||
public VariantReferenceBlockCreatorTask(Map<String, Integer> contigs) { | ||
this.contigs = new HashMap<>(contigs); | ||
for (Map.Entry<String, Integer> entry : contigs.entrySet()) { | ||
this.contigs.put(Region.normalizeChromosome(entry.getKey()), entry.getValue()); | ||
} | ||
} | ||
|
||
public VariantReferenceBlockCreatorTask(VariantFileHeader fileHeader) { | ||
this.contigs = new HashMap<>(); | ||
for (VariantFileHeaderComplexLine line : fileHeader.getComplexLines()) { | ||
if (line.getKey().equals(VCFConstants.CONTIG_HEADER_KEY)) { | ||
String contig = line.getId(); | ||
String length = line.getGenericFields().get("length"); | ||
if (StringUtils.isNumeric(length)) { | ||
contigs.put(contig, Integer.valueOf(length)); | ||
contigs.put(Region.normalizeChromosome(contig), Integer.valueOf(length)); | ||
} | ||
} | ||
} | ||
} | ||
|
||
public VariantReferenceBlockCreatorTask(VCFHeader fileHeader) { | ||
this.contigs = new HashMap<>(); | ||
for (VCFContigHeaderLine line : fileHeader.getContigLines()) { | ||
SAMSequenceRecord record = line.getSAMSequenceRecord(); | ||
String contig = record.getSequenceName(); | ||
int length = record.getSequenceLength(); | ||
if (length > 0) { | ||
contigs.put(contig, length); | ||
contigs.put(Region.normalizeChromosome(contig), length); | ||
} | ||
} | ||
} | ||
|
||
@Override | ||
public void pre() throws Exception { | ||
} | ||
|
||
@Override | ||
public List<Variant> apply(List<Variant> list) throws Exception { | ||
List<Variant> fixedList = new ArrayList<>(((int) (list.size() * 1.2))); | ||
for (Variant variant : list) { | ||
if (chromosome == null) { | ||
init(variant); | ||
|
||
// Create first telomere ref block (if needed) | ||
fixedList.addAll(createContigFirstBlock()); | ||
} else { | ||
if (!variant.getChromosome().equals(chromosome)) { | ||
// Change chromosome | ||
// Create first and last telomere ref block (if needed) | ||
fixedList.addAll(createContigFirstBlock()); | ||
init(variant); | ||
fixedList.addAll(createContigLastBlock()); | ||
} else { | ||
if (variant.getStart() != position) { | ||
// Check if need to create a block | ||
|
||
if ((end + 1) != variant.getStart()) { | ||
// Create ref block | ||
fixedList.add(createRefBlock(chromosome, end + 1, variant.getStart() - 1)); | ||
} | ||
|
||
position = variant.getStart(); | ||
end = variant.getEnd(); | ||
} else { | ||
// Update end | ||
end = Math.max(variant.getEnd(), end); | ||
} | ||
} | ||
} | ||
fixedList.add(variant); | ||
} | ||
return fixedList; | ||
} | ||
|
||
@Override | ||
public List<Variant> drain() throws Exception { | ||
return createContigLastBlock(); | ||
} | ||
|
||
protected void init(Variant variant) { | ||
chromosome = variant.getChromosome(); | ||
position = variant.getStart(); | ||
end = variant.getEnd(); | ||
if (!variant.getStudies().isEmpty()) { | ||
StudyEntry studyEntry = variant.getStudies().get(0); | ||
studyId = studyEntry.getStudyId(); | ||
fileId = studyEntry.getFiles().get(0).getFileId(); | ||
samplesPosition = studyEntry.getSamplesPosition(); | ||
missingGtSamplesData = new ArrayList<>(samplesPosition.size()); | ||
for (int i = 0; i < samplesPosition.size(); i++) { | ||
missingGtSamplesData.add(Collections.singletonList("./.")); | ||
} | ||
} | ||
} | ||
|
||
protected List<Variant> createContigFirstBlock() { | ||
if (position <= 1) { | ||
return Collections.emptyList(); | ||
} else { | ||
return Collections.singletonList(createRefBlock(chromosome, 1, position - 1)); | ||
} | ||
} | ||
|
||
protected List<Variant> createContigLastBlock() { | ||
if (!contigs.containsKey(chromosome)) { | ||
return Collections.emptyList(); | ||
} else { | ||
Integer length = contigs.get(chromosome); | ||
if (end >= length) { | ||
return Collections.emptyList(); | ||
} | ||
return Collections.singletonList(createRefBlock(chromosome, end + 1, length)); | ||
} | ||
} | ||
|
||
protected Variant createRefBlock(String chromosome, int start, int end) { | ||
VariantBuilder builder = new VariantBuilder(chromosome, start, end, "N", "."); | ||
if (studyId != null) { | ||
builder.setStudyId(studyId) | ||
.setFileId(fileId) | ||
.setSamplesPosition(samplesPosition) | ||
.setFormat("GT") | ||
.setSamplesData(missingGtSamplesData); | ||
} | ||
return builder.build(); | ||
} | ||
|
||
} |
8 changes: 8 additions & 0 deletions
8
biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantSorterTask.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
package org.opencb.biodata.tools.variant; | ||
|
||
public class VariantSorterTask extends VariantDeduplicationTask { | ||
|
||
public VariantSorterTask(int bufferSize) { | ||
super(variants -> variants, bufferSize); | ||
} | ||
} |
66 changes: 66 additions & 0 deletions
66
.../src/test/java/org/opencb/biodata/tools/variant/VariantReferenceBlockCreatorTaskTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package org.opencb.biodata.tools.variant; | ||
|
||
import org.junit.Test; | ||
import org.opencb.biodata.models.variant.Variant; | ||
import org.opencb.biodata.models.variant.VariantBuilder; | ||
import org.opencb.biodata.models.variant.avro.VariantType; | ||
|
||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.List; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
|
||
import static java.util.Arrays.*; | ||
import static org.junit.Assert.*; | ||
|
||
public class VariantReferenceBlockCreatorTaskTest { | ||
|
||
@Test | ||
public void testFillBlocks() throws Exception { | ||
|
||
assertEquals( | ||
asList("1:1-99:N:.", "1:100:A:C", "1:101-119:N:.", "1:120:A:C", "1:121-10000:N:."), | ||
apply( "1:100:A:C", "1:120:A:C")); | ||
assertEquals( | ||
asList("1:1-99:N:.", "1:100:A:CTG", "1:100:A:C", "1:101-119:N:.", "1:120:A:C", "1:121-10000:N:."), | ||
apply( "1:100:A:CTG", "1:100:A:C", "1:120:A:C")); | ||
assertEquals( | ||
asList("1:1-99:N:.", "1:100:ATG:C", "1:100:A:C", "1:103-119:N:.", "1:120:A:C", "1:121-10000:N:."), | ||
apply( "1:100:ATG:C", "1:100:A:C", "1:120:A:C")); | ||
assertEquals( | ||
asList("1:1-99:N:.", "1:100:A:C", "1:100:ATG:C", "1:103-119:N:.", "1:120:A:C", "1:121-10000:N:."), | ||
apply( "1:100:A:C", "1:100:ATG:C", "1:120:A:C")); | ||
|
||
} | ||
|
||
@Test | ||
public void testCreateBlock() throws Exception { | ||
VariantReferenceBlockCreatorTask task = new VariantReferenceBlockCreatorTask(); | ||
|
||
task.init(new VariantBuilder("1:1:A:C") | ||
.setStudyId("myStudy") | ||
.setFileId("myFile") | ||
.setFormat("GT", "DP") | ||
.addSample("s1", "1/0", "10") | ||
.addSample("s2", "0/0", "30") | ||
.addSample("s3", "0/1", "20") | ||
.build()); | ||
Variant variant = task.createRefBlock("1", 100, 200); | ||
System.out.println("variant = " + variant.toJson()); | ||
|
||
assertEquals(VariantType.NO_VARIATION, variant.getType()); | ||
assertEquals("myStudy", variant.getStudies().get(0).getStudyId()); | ||
assertEquals("myFile", variant.getStudies().get(0).getFileId()); | ||
assertEquals(Arrays.asList("s1", "s2", "s3"), variant.getStudies().get(0).getOrderedSamplesName()); | ||
assertEquals(Arrays.asList(Collections.singletonList("./."), Collections.singletonList("./."), Collections.singletonList("./.")), variant.getStudies().get(0).getSamplesData()); | ||
|
||
} | ||
|
||
private List<String> apply(String ...variants) throws Exception { | ||
VariantReferenceBlockCreatorTask task = new VariantReferenceBlockCreatorTask(Collections.singletonMap("chr1", 10000)); | ||
List<Variant> list = task.apply(Stream.of(variants).map(Variant::new).collect(Collectors.toList())); | ||
list.addAll(task.drain()); | ||
return list.stream().map(Variant::toString).collect(Collectors.toList()); | ||
} | ||
} |