Skip to content

Commit

Permalink
tools: Generate missing reference blocks with missing genotypes. #170
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Sep 30, 2019
1 parent 7de8246 commit 1fc44bc
Show file tree
Hide file tree
Showing 4 changed files with 241 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,12 @@ public VariantDeduplicationTask() {
}

/**
 * Creates a deduplication task that uses the given resolver and a buffer of 100 elements.
 * Delegates to the two-argument constructor.
 *
 * @param duplicatedVariantsResolver resolver stored by the task
 */
public VariantDeduplicationTask(DuplicatedVariantsResolver duplicatedVariantsResolver) {
this(duplicatedVariantsResolver, 100);
}

/**
 * Creates a deduplication task with a caller-defined buffer size.
 *
 * @param duplicatedVariantsResolver resolver stored by the task
 * @param bufferSize                 capacity passed to the internal sorted queue
 */
public VariantDeduplicationTask(DuplicatedVariantsResolver duplicatedVariantsResolver, int bufferSize) {
    resolver = duplicatedVariantsResolver;
    // Single allocation sized by the caller; the former fixed 100-slot queue was overwritten right away.
    queue = new CircularSortedArrayQueue<>(bufferSize, VARIANT_COMPARATOR);
}

@FunctionalInterface
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
package org.opencb.biodata.tools.variant;

import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.variant.vcf.VCFConstants;
import htsjdk.variant.vcf.VCFContigHeaderLine;
import htsjdk.variant.vcf.VCFHeader;
import org.apache.commons.lang.StringUtils;
import org.opencb.biodata.models.core.Region;
import org.opencb.biodata.models.variant.StudyEntry;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantBuilder;
import org.opencb.biodata.models.variant.metadata.VariantFileHeader;
import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine;
import org.opencb.commons.run.Task;

import java.util.*;

/**
 * Task that inserts reference blocks (reference "N", alternate ".") into the regions
 * not covered by the incoming variants.
 *
 * <p>For each contig a block is generated before the first variant (starting at position 1),
 * between consecutive variants that leave a gap, and, when the contig length is known,
 * after the last variant up to the contig end. When the variants carry study information,
 * the generated blocks reuse the study id, file id and samples of the first variant seen on
 * the contig, with a single "GT" format field set to "./." for every sample.
 *
 * <p>NOTE(review): the gap detection assumes the input is sorted by chromosome and start.
 * The task keeps mutable state between calls and is not designed for concurrent use.
 */
public class VariantReferenceBlockCreatorTask implements Task<Variant, Variant> {

    /** Genotype assigned to every sample of a generated reference block. */
    private static final String MISSING_GENOTYPE = "./.";

    /** Contig currently being processed; {@code null} until the first variant is seen. */
    private String chromosome = null;
    /** Start of the last processed variant. */
    private int position;
    /** Right-most end covered so far in the current contig. */
    private int end;
    private String studyId;
    private String fileId;
    private LinkedHashMap<String, Integer> samplesPosition;
    private List<List<String>> missingGtSamplesData;
    /** Contig lengths, indexed by both the original and the normalized contig name. Never null. */
    private final Map<String, Integer> contigs;

    /**
     * Creates a task without contig lengths. No trailing block is generated at the end
     * of the contigs.
     */
    public VariantReferenceBlockCreatorTask() {
        // An empty map (instead of null) keeps drain() and createContigLastBlock() safe.
        this.contigs = new HashMap<>();
    }

    /**
     * Creates a task from a map of contig lengths.
     *
     * @param contigs contig name to contig length; each entry is also registered under its
     *                normalized chromosome name
     */
    public VariantReferenceBlockCreatorTask(Map<String, Integer> contigs) {
        this.contigs = new HashMap<>(contigs);
        for (Map.Entry<String, Integer> entry : contigs.entrySet()) {
            this.contigs.put(Region.normalizeChromosome(entry.getKey()), entry.getValue());
        }
    }

    /**
     * Creates a task reading the contig lengths from the "contig" complex lines of a
     * variant file header. Lines without a numeric "length" field are ignored.
     *
     * @param fileHeader variant file header
     */
    public VariantReferenceBlockCreatorTask(VariantFileHeader fileHeader) {
        this.contigs = new HashMap<>();
        for (VariantFileHeaderComplexLine line : fileHeader.getComplexLines()) {
            if (line.getKey().equals(VCFConstants.CONTIG_HEADER_KEY)) {
                String contig = line.getId();
                String length = line.getGenericFields().get("length");
                // commons-lang 2.x isNumeric("") returns true, so the empty string must be
                // rejected explicitly to avoid a NumberFormatException.
                if (StringUtils.isNotEmpty(length) && StringUtils.isNumeric(length)) {
                    Integer contigLength = Integer.valueOf(length);
                    contigs.put(contig, contigLength);
                    contigs.put(Region.normalizeChromosome(contig), contigLength);
                }
            }
        }
    }

    /**
     * Creates a task reading the contig lengths from the contig lines of an htsjdk VCF header.
     * Contigs without a positive length are ignored.
     *
     * @param fileHeader VCF header
     */
    public VariantReferenceBlockCreatorTask(VCFHeader fileHeader) {
        this.contigs = new HashMap<>();
        for (VCFContigHeaderLine line : fileHeader.getContigLines()) {
            SAMSequenceRecord record = line.getSAMSequenceRecord();
            String contig = record.getSequenceName();
            int length = record.getSequenceLength();
            if (length > 0) {
                contigs.put(contig, length);
                contigs.put(Region.normalizeChromosome(contig), length);
            }
        }
    }

    @Override
    public void pre() throws Exception {
    }

    /**
     * Returns the input variants interleaved with the reference blocks needed to cover
     * the gaps between them.
     *
     * @param list batch of variants
     * @return new list containing every input variant plus the generated reference blocks
     */
    @Override
    public List<Variant> apply(List<Variant> list) throws Exception {
        List<Variant> fixedList = new ArrayList<>((int) (list.size() * 1.2));
        for (Variant variant : list) {
            if (chromosome == null) {
                init(variant);

                // Create first telomere ref block (if needed)
                fixedList.addAll(createContigFirstBlock());
            } else if (!variant.getChromosome().equals(chromosome)) {
                // Change chromosome.
                // Close the previous contig BEFORE switching state, then open the new one.
                fixedList.addAll(createContigLastBlock());
                init(variant);
                fixedList.addAll(createContigFirstBlock());
            } else if (variant.getStart() != position) {
                // Only a real gap needs a block. A variant starting inside the region already
                // covered would otherwise yield a block with start > end.
                if (variant.getStart() > end + 1) {
                    fixedList.add(createRefBlock(chromosome, end + 1, variant.getStart() - 1));
                }

                position = variant.getStart();
                // Never shrink the covered region: a previous variant may span further.
                end = Math.max(end, variant.getEnd());
            } else {
                // Same start as the previous variant: just extend the covered region.
                end = Math.max(variant.getEnd(), end);
            }
            fixedList.add(variant);
        }
        return fixedList;
    }

    /**
     * Returns the trailing reference block of the last processed contig, if any.
     *
     * @return list with the last block, or an empty list
     */
    @Override
    public List<Variant> drain() throws Exception {
        return createContigLastBlock();
    }

    /**
     * Resets the contig state from the given variant. If the variant has study information,
     * the study id, file id and samples used for the generated blocks are refreshed too.
     *
     * @param variant first variant of a contig
     */
    protected void init(Variant variant) {
        chromosome = variant.getChromosome();
        position = variant.getStart();
        end = variant.getEnd();
        if (!variant.getStudies().isEmpty()) {
            StudyEntry studyEntry = variant.getStudies().get(0);
            studyId = studyEntry.getStudyId();
            fileId = studyEntry.getFiles().get(0).getFileId();
            samplesPosition = studyEntry.getSamplesPosition();
            missingGtSamplesData = new ArrayList<>(samplesPosition.size());
            for (int i = 0; i < samplesPosition.size(); i++) {
                missingGtSamplesData.add(Collections.singletonList(MISSING_GENOTYPE));
            }
        }
    }

    /**
     * Creates the block going from position 1 to the position before the current variant.
     *
     * @return list with the block, or an empty list if the current variant starts at 1 or before
     */
    protected List<Variant> createContigFirstBlock() {
        if (position <= 1) {
            return Collections.emptyList();
        }
        return Collections.singletonList(createRefBlock(chromosome, 1, position - 1));
    }

    /**
     * Creates the block going from the end of the covered region to the end of the current contig.
     *
     * @return list with the block, or an empty list if no variant has been processed, the contig
     *         length is unknown, or the contig is already fully covered
     */
    protected List<Variant> createContigLastBlock() {
        if (chromosome == null) {
            // Nothing processed yet
            return Collections.emptyList();
        }
        Integer length = contigs.get(chromosome);
        if (length == null || end >= length) {
            return Collections.emptyList();
        }
        return Collections.singletonList(createRefBlock(chromosome, end + 1, length));
    }

    /**
     * Builds a reference block with reference "N" and alternate ".". If study information is
     * available, the block has a single "GT" format field with every sample set to "./.".
     *
     * @param chromosome contig of the block
     * @param start      first position of the block
     * @param end        last position of the block
     * @return the new reference block
     */
    protected Variant createRefBlock(String chromosome, int start, int end) {
        VariantBuilder builder = new VariantBuilder(chromosome, start, end, "N", ".");
        if (studyId != null) {
            builder.setStudyId(studyId)
                    .setFileId(fileId)
                    .setSamplesPosition(samplesPosition)
                    .setFormat("GT")
                    .setSamplesData(missingGtSamplesData);
        }
        return builder.build();
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package org.opencb.biodata.tools.variant;

/**
 * {@link VariantDeduplicationTask} configured with a resolver that returns the variants it
 * receives unchanged, so no variant is merged or discarded by the resolver.
 */
public class VariantSorterTask extends VariantDeduplicationTask {

    /**
     * @param bufferSize buffer size forwarded to the parent task
     */
    public VariantSorterTask(int bufferSize) {
        super(duplicated -> duplicated, bufferSize);
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package org.opencb.biodata.tools.variant;

import org.junit.Test;
import org.opencb.biodata.models.variant.Variant;
import org.opencb.biodata.models.variant.VariantBuilder;
import org.opencb.biodata.models.variant.avro.VariantType;

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static java.util.Arrays.*;
import static org.junit.Assert.*;

/**
 * Tests for {@link VariantReferenceBlockCreatorTask}.
 */
public class VariantReferenceBlockCreatorTaskTest {

    /**
     * Checks the generated blocks for isolated variants and for variants sharing the same start.
     */
    @Test
    public void testFillBlocks() throws Exception {
        // Two isolated variants
        List<String> expected = Arrays.asList(
                "1:1-99:N:.", "1:100:A:C", "1:101-119:N:.", "1:120:A:C", "1:121-10000:N:.");
        assertEquals(expected, apply("1:100:A:C", "1:120:A:C"));

        // Two variants at the same position, both ending at 100
        expected = Arrays.asList(
                "1:1-99:N:.", "1:100:A:CTG", "1:100:A:C", "1:101-119:N:.", "1:120:A:C", "1:121-10000:N:.");
        assertEquals(expected, apply("1:100:A:CTG", "1:100:A:C", "1:120:A:C"));

        // Same position, the longest variant comes first
        expected = Arrays.asList(
                "1:1-99:N:.", "1:100:ATG:C", "1:100:A:C", "1:103-119:N:.", "1:120:A:C", "1:121-10000:N:.");
        assertEquals(expected, apply("1:100:ATG:C", "1:100:A:C", "1:120:A:C"));

        // Same position, the longest variant comes last
        expected = Arrays.asList(
                "1:1-99:N:.", "1:100:A:C", "1:100:ATG:C", "1:103-119:N:.", "1:120:A:C", "1:121-10000:N:.");
        assertEquals(expected, apply("1:100:A:C", "1:100:ATG:C", "1:120:A:C"));
    }

    /**
     * Checks that a generated block inherits study, file and samples, with missing genotypes.
     */
    @Test
    public void testCreateBlock() throws Exception {
        Variant seed = new VariantBuilder("1:1:A:C")
                .setStudyId("myStudy")
                .setFileId("myFile")
                .setFormat("GT", "DP")
                .addSample("s1", "1/0", "10")
                .addSample("s2", "0/0", "30")
                .addSample("s3", "0/1", "20")
                .build();

        VariantReferenceBlockCreatorTask task = new VariantReferenceBlockCreatorTask();
        task.init(seed);

        Variant refBlock = task.createRefBlock("1", 100, 200);
        System.out.println("variant = " + refBlock.toJson());

        assertEquals(VariantType.NO_VARIATION, refBlock.getType());
        assertEquals("myStudy", refBlock.getStudies().get(0).getStudyId());
        assertEquals("myFile", refBlock.getStudies().get(0).getFileId());
        assertEquals(Arrays.asList("s1", "s2", "s3"), refBlock.getStudies().get(0).getOrderedSamplesName());
        // One single-element ("./.") list per sample
        assertEquals(Collections.nCopies(3, Collections.singletonList("./.")),
                refBlock.getStudies().get(0).getSamplesData());
    }

    /**
     * Runs the task (contig "chr1" of length 10000) over the given variants, including the
     * drained blocks, and returns the string form of every resulting variant.
     */
    private List<String> apply(String ...variants) throws Exception {
        VariantReferenceBlockCreatorTask task =
                new VariantReferenceBlockCreatorTask(Collections.singletonMap("chr1", 10000));

        List<Variant> input = Arrays.stream(variants).map(Variant::new).collect(Collectors.toList());
        List<Variant> output = task.apply(input);
        output.addAll(task.drain());

        return output.stream().map(Variant::toString).collect(Collectors.toList());
    }
}

0 comments on commit 1fc44bc

Please sign in to comment.