diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantDeduplicationTask.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantDeduplicationTask.java
index f97735c95..42db128d0 100644
--- a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantDeduplicationTask.java
+++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantDeduplicationTask.java
@@ -40,8 +40,12 @@ public VariantDeduplicationTask() {
     }
 
     public VariantDeduplicationTask(DuplicatedVariantsResolver duplicatedVariantsResolver) {
+        this(duplicatedVariantsResolver, 100);
+    }
+
+    public VariantDeduplicationTask(DuplicatedVariantsResolver duplicatedVariantsResolver, int bufferSize) {
         resolver = duplicatedVariantsResolver;
-        queue = new CircularSortedArrayQueue<>(100, VARIANT_COMPARATOR);
+        queue = new CircularSortedArrayQueue<>(bufferSize, VARIANT_COMPARATOR);
     }
 
     @FunctionalInterface
diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantReferenceBlockCreatorTask.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantReferenceBlockCreatorTask.java
new file mode 100644
index 000000000..3bbd57f41
--- /dev/null
+++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantReferenceBlockCreatorTask.java
@@ -0,0 +1,205 @@
+package org.opencb.biodata.tools.variant;
+
+import htsjdk.samtools.SAMSequenceRecord;
+import htsjdk.variant.vcf.VCFConstants;
+import htsjdk.variant.vcf.VCFContigHeaderLine;
+import htsjdk.variant.vcf.VCFHeader;
+import org.apache.commons.lang.StringUtils;
+import org.opencb.biodata.models.core.Region;
+import org.opencb.biodata.models.variant.StudyEntry;
+import org.opencb.biodata.models.variant.Variant;
+import org.opencb.biodata.models.variant.VariantBuilder;
+import org.opencb.biodata.models.variant.metadata.VariantFileHeader;
+import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine;
+import org.opencb.commons.run.Task;
+
+import java.util.*;
+
+/**
+ * Fills the gaps of a variant stream with reference blocks (reference "N", alternate ".").
+ *
+ * The task is stateful: it remembers the chromosome and the last covered position between
+ * calls to {@link #apply(List)}, so one instance must see all the batches of a file, and
+ * {@link #drain()} emits the block that closes the last contig. The gap detection assumes
+ * that variants arrive sorted by chromosome and start.
+ */
+public class VariantReferenceBlockCreatorTask implements Task<Variant, Variant> {
+
+    // Chromosome being processed. Null until the first variant is seen.
+    private String chromosome = null;
+    // Start of the last variant processed in the current chromosome.
+    private int position;
+    // Highest end seen so far in the current chromosome.
+    private int end;
+    // Study, file and samples taken from the first study of the variant given to init().
+    private String studyId;
+    private String fileId;
+    private LinkedHashMap<String, Integer> samplesPosition;
+    // One "./." genotype per sample, reused by every generated block.
+    private List<List<String>> missingGtSamplesData;
+    // Contig lengths, stored under both the original and the normalized contig name.
+    private Map<String, Integer> contigs;
+
+    /**
+     * Creates a task without contig lengths: no block is created after the last variant of a contig.
+     */
+    public VariantReferenceBlockCreatorTask() {
+        this.contigs = new HashMap<>();
+    }
+
+    /**
+     * @param contigs Length of each contig, indexed by contig name.
+     */
+    public VariantReferenceBlockCreatorTask(Map<String, Integer> contigs) {
+        this.contigs = new HashMap<>(contigs);
+        for (Map.Entry<String, Integer> entry : contigs.entrySet()) {
+            this.contigs.put(Region.normalizeChromosome(entry.getKey()), entry.getValue());
+        }
+    }
+
+    /**
+     * @param fileHeader Header whose "contig" lines provide the contig lengths.
+     */
+    public VariantReferenceBlockCreatorTask(VariantFileHeader fileHeader) {
+        this.contigs = new HashMap<>();
+        for (VariantFileHeaderComplexLine line : fileHeader.getComplexLines()) {
+            if (line.getKey().equals(VCFConstants.CONTIG_HEADER_KEY)) {
+                String contig = line.getId();
+                String length = line.getGenericFields().get("length");
+                // commons-lang 2.x isNumeric("") returns true: skip empty values explicitly
+                if (StringUtils.isNotEmpty(length) && StringUtils.isNumeric(length)) {
+                    Integer contigLength = Integer.valueOf(length);
+                    contigs.put(contig, contigLength);
+                    contigs.put(Region.normalizeChromosome(contig), contigLength);
+                }
+            }
+        }
+    }
+
+    /**
+     * @param fileHeader VCF header whose contig lines provide the contig lengths.
+     */
+    public VariantReferenceBlockCreatorTask(VCFHeader fileHeader) {
+        this.contigs = new HashMap<>();
+        for (VCFContigHeaderLine line : fileHeader.getContigLines()) {
+            SAMSequenceRecord record = line.getSAMSequenceRecord();
+            String contig = record.getSequenceName();
+            int length = record.getSequenceLength();
+            if (length > 0) {
+                contigs.put(contig, length);
+                contigs.put(Region.normalizeChromosome(contig), length);
+            }
+        }
+    }
+
+    @Override
+    public void pre() throws Exception {
+    }
+
+    /**
+     * Adds the missing reference blocks to a batch of variants.
+     *
+     * @param list Batch of variants.
+     * @return The same variants, in the same order, interleaved with the generated blocks.
+     */
+    @Override
+    public List<Variant> apply(List<Variant> list) throws Exception {
+        List<Variant> fixedList = new ArrayList<>(((int) (list.size() * 1.2)));
+        for (Variant variant : list) {
+            if (chromosome == null) {
+                init(variant);
+
+                // Create first telomere ref block (if needed)
+                fixedList.addAll(createContigFirstBlock());
+            } else if (!variant.getChromosome().equals(chromosome)) {
+                // Change chromosome.
+                // Close the previous contig while the state still describes it...
+                fixedList.addAll(createContigLastBlock());
+                init(variant);
+                // ...then open the new contig from the state of its first variant.
+                fixedList.addAll(createContigFirstBlock());
+            } else {
+                int start = variant.getStart();
+                // Only a real gap gets a block. Variants overlapping the covered region
+                // (start <= end) would otherwise produce a block with start > end.
+                if (start > end + 1) {
+                    fixedList.add(createRefBlock(chromosome, end + 1, start - 1));
+                }
+                position = start;
+                // Never move backwards: a previous variant may span beyond this one.
+                end = Math.max(end, variant.getEnd());
+            }
+            fixedList.add(variant);
+        }
+        return fixedList;
+    }
+
+    /**
+     * @return The block between the last covered position and the end of the current contig, if any.
+     */
+    @Override
+    public List<Variant> drain() throws Exception {
+        return createContigLastBlock();
+    }
+
+    /**
+     * Resets the state to the given variant. If the variant has any study, the first one
+     * provides the study, file and samples used for the blocks created afterwards.
+     *
+     * @param variant First variant of a contig.
+     */
+    protected void init(Variant variant) {
+        chromosome = variant.getChromosome();
+        position = variant.getStart();
+        end = variant.getEnd();
+        if (!variant.getStudies().isEmpty()) {
+            StudyEntry studyEntry = variant.getStudies().get(0);
+            studyId = studyEntry.getStudyId();
+            fileId = studyEntry.getFiles().get(0).getFileId();
+            samplesPosition = studyEntry.getSamplesPosition();
+            missingGtSamplesData = new ArrayList<>(samplesPosition.size());
+            for (int i = 0; i < samplesPosition.size(); i++) {
+                missingGtSamplesData.add(Collections.singletonList("./."));
+            }
+        }
+    }
+
+    /**
+     * @return Block from position 1 to the position before the current one, or empty if not needed.
+     */
+    protected List<Variant> createContigFirstBlock() {
+        if (position <= 1) {
+            return Collections.emptyList();
+        } else {
+            return Collections.singletonList(createRefBlock(chromosome, 1, position - 1));
+        }
+    }
+
+    /**
+     * @return Block from the last covered position to the end of the current contig, or empty if
+     *         no variant has been seen, the contig length is unknown or the contig is fully covered.
+     */
+    protected List<Variant> createContigLastBlock() {
+        Integer length = chromosome == null ? null : contigs.get(chromosome);
+        if (length == null || end >= length) {
+            return Collections.emptyList();
+        }
+        return Collections.singletonList(createRefBlock(chromosome, end + 1, length));
+    }
+
+    /**
+     * Builds a reference block. Study, file and samples are only set if init() found a study.
+     */
+    protected Variant createRefBlock(String chromosome, int start, int end) {
+        VariantBuilder builder = new VariantBuilder(chromosome, start, end, "N", ".");
+        if (studyId != null) {
+            builder.setStudyId(studyId)
+                    .setFileId(fileId)
+                    .setSamplesPosition(samplesPosition)
+                    .setFormat("GT")
+                    .setSamplesData(missingGtSamplesData);
+        }
+        return builder.build();
+    }
+
+}
diff --git a/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantSorterTask.java b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantSorterTask.java
new file mode 100644
index 000000000..e29f3aa4d
--- /dev/null
+++ b/biodata-tools/src/main/java/org/opencb/biodata/tools/variant/VariantSorterTask.java
@@ -0,0 +1,13 @@
+package org.opencb.biodata.tools.variant;
+
+/**
+ * {@link VariantDeduplicationTask} whose resolver returns the variants it receives unchanged.
+ *
+ * NOTE(review): presumably relies on the parent's buffer only to sort the variants - confirm.
+ */
+public class VariantSorterTask extends VariantDeduplicationTask {
+
+    public VariantSorterTask(int bufferSize) {
+        super(variants -> variants, bufferSize);
+    }
+}
diff --git a/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantReferenceBlockCreatorTaskTest.java b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantReferenceBlockCreatorTaskTest.java
new file mode 100644
index 000000000..632d9d5c4
--- /dev/null
+++ b/biodata-tools/src/test/java/org/opencb/biodata/tools/variant/VariantReferenceBlockCreatorTaskTest.java
@@ -0,0 +1,87 @@
+package org.opencb.biodata.tools.variant;
+
+import org.junit.Test;
+import org.opencb.biodata.models.variant.Variant;
+import org.opencb.biodata.models.variant.VariantBuilder;
+import org.opencb.biodata.models.variant.avro.VariantType;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static java.util.Arrays.*;
+import static org.junit.Assert.*;
+
+public class VariantReferenceBlockCreatorTaskTest {
+
+    @Test
+    public void testFillBlocks() throws Exception {
+
+        assertEquals(
+                asList("1:1-99:N:.", "1:100:A:C", "1:101-119:N:.", "1:120:A:C", "1:121-10000:N:."),
+                apply( "1:100:A:C", "1:120:A:C"));
+        assertEquals(
+                asList("1:1-99:N:.", "1:100:A:CTG", "1:100:A:C", "1:101-119:N:.", "1:120:A:C", "1:121-10000:N:."),
+                apply( "1:100:A:CTG", "1:100:A:C", "1:120:A:C"));
+        assertEquals(
+                asList("1:1-99:N:.", "1:100:ATG:C", "1:100:A:C", "1:103-119:N:.", "1:120:A:C", "1:121-10000:N:."),
+                apply( "1:100:ATG:C", "1:100:A:C", "1:120:A:C"));
+        assertEquals(
+                asList("1:1-99:N:.", "1:100:A:C", "1:100:ATG:C", "1:103-119:N:.", "1:120:A:C", "1:121-10000:N:."),
+                apply( "1:100:A:C", "1:100:ATG:C", "1:120:A:C"));
+
+    }
+
+    @Test
+    public void testChangeChromosome() throws Exception {
+        // The previous contig is closed before the next one is opened
+        assertEquals(
+                asList("1:1-99:N:.", "1:100:A:C", "1:101-10000:N:.", "2:1-49:N:.", "2:50:A:C", "2:51-20000:N:."),
+                apply("1:100:A:C", "2:50:A:C"));
+    }
+
+    @Test
+    public void testOverlappingVariants() throws Exception {
+        // A variant inside the region covered by a previous deletion must not create an inverted block
+        assertEquals(
+                asList("1:1-99:N:.", "1:100:ATG:C", "1:101:T:C", "1:103-119:N:.", "1:120:A:C", "1:121-10000:N:."),
+                apply("1:100:ATG:C", "1:101:T:C", "1:120:A:C"));
+    }
+
+    @Test
+    public void testCreateBlock() throws Exception {
+        VariantReferenceBlockCreatorTask task = new VariantReferenceBlockCreatorTask();
+
+        task.init(new VariantBuilder("1:1:A:C")
+                .setStudyId("myStudy")
+                .setFileId("myFile")
+                .setFormat("GT", "DP")
+                .addSample("s1", "1/0", "10")
+                .addSample("s2", "0/0", "30")
+                .addSample("s3", "0/1", "20")
+                .build());
+        Variant variant = task.createRefBlock("1", 100, 200);
+        System.out.println("variant = " + variant.toJson());
+
+        assertEquals(VariantType.NO_VARIATION, variant.getType());
+        assertEquals("myStudy", variant.getStudies().get(0).getStudyId());
+        assertEquals("myFile", variant.getStudies().get(0).getFileId());
+        assertEquals(Arrays.asList("s1", "s2", "s3"), variant.getStudies().get(0).getOrderedSamplesName());
+        assertEquals(Arrays.asList(Collections.singletonList("./."), Collections.singletonList("./."), Collections.singletonList("./.")), variant.getStudies().get(0).getSamplesData());
+
+    }
+
+    private List<String> apply(String ...variants) throws Exception {
+        Map<String, Integer> contigs = new HashMap<>();
+        contigs.put("chr1", 10000);
+        contigs.put("chr2", 20000);
+        VariantReferenceBlockCreatorTask task = new VariantReferenceBlockCreatorTask(contigs);
+        List<Variant> list = task.apply(Stream.of(variants).map(Variant::new).collect(Collectors.toList()));
+        list.addAll(task.drain());
+        return list.stream().map(Variant::toString).collect(Collectors.toList());
+    }
+}
\ No newline at end of file