diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/wrappers/exomiser/ExomiserWrapperAnalysisExecutor.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/wrappers/exomiser/ExomiserWrapperAnalysisExecutor.java index 1b19e410b5b..099d3c6c7db 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/wrappers/exomiser/ExomiserWrapperAnalysisExecutor.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/wrappers/exomiser/ExomiserWrapperAnalysisExecutor.java @@ -7,7 +7,6 @@ import org.opencb.biodata.models.clinical.Phenotype; import org.opencb.biodata.models.clinical.pedigree.Member; import org.opencb.biodata.models.clinical.pedigree.Pedigree; -import org.opencb.biodata.models.core.SexOntologyTermAnnotation; import org.opencb.biodata.models.pedigree.IndividualProperty; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.exec.Command; @@ -21,12 +20,9 @@ import org.opencb.opencga.core.exceptions.ToolExecutorException; import org.opencb.opencga.core.models.family.Family; import org.opencb.opencga.core.models.individual.Individual; -import org.opencb.opencga.core.models.sample.Sample; import org.opencb.opencga.core.tools.annotations.ToolExecutor; import org.opencb.opencga.storage.core.exceptions.StorageEngineException; -import org.opencb.opencga.storage.core.variant.adaptors.VariantField; import org.opencb.opencga.storage.core.variant.adaptors.VariantQuery; -import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,7 +32,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; -import java.util.stream.Collectors; @ToolExecutor(id = ExomiserWrapperAnalysisExecutor.ID, tool = ExomiserWrapperAnalysis.ID, @@ -133,7 +128,8 @@ public void run() throws ToolException { .sample(sampleId) .includeSample(samples) .includeSampleData("GT") - .unknownGenotype("./."); + .unknownGenotype("./.") + .append("includeAllFromSampleIndex", true); QueryOptions queryOptions = new QueryOptions(QueryOptions.INCLUDE, "id,studies.samples"); diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/config/storage/SampleIndexConfiguration.java b/opencga-core/src/main/java/org/opencb/opencga/core/config/storage/SampleIndexConfiguration.java index 0dcdf5b08fc..31b1bd45a45 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/config/storage/SampleIndexConfiguration.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/config/storage/SampleIndexConfiguration.java @@ -169,7 +169,7 @@ public static SampleIndexConfiguration defaultConfiguration(boolean cellbaseV4) .addFileIndexField(new IndexFieldConfiguration( IndexFieldConfiguration.Source.FILE, StudyEntry.FILTER, - IndexFieldConfiguration.Type.CATEGORICAL, + IndexFieldConfiguration.Type.CATEGORICAL_MULTI_VALUE, VCFConstants.PASSES_FILTERS_v4)) .addFileIndexField(new IndexFieldConfiguration( IndexFieldConfiguration.Source.FILE, StudyEntry.QUAL, QUAL_THRESHOLDS).setNullable(false)) diff --git a/opencga-storage/opencga-storage-core/src/test/resources/variant-multiallelic.vcf b/opencga-storage/opencga-storage-core/src/test/resources/variant-multiallelic.vcf new file mode 100644 index 00000000000..dc1cfc5d784 --- /dev/null +++ b/opencga-storage/opencga-storage-core/src/test/resources/variant-multiallelic.vcf @@ -0,0 +1,20 @@ +##fileformat=VCFv4.1 +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##command=seq 1000000 500 3000000 | while read i ; do echo -e "chr1\t$i\t.\tA\tC\t$RANDOM\tPASS\t.\tGT\t0/1\t1/1\t1|0\t0|1" ; done +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19600 NA19660 NA19661 NA19685 +chr1 1000000 . A C,T 5 noPass,noPass2 . GT 1/2 1/1 0|0 0|1 +chr1 1000010 . A AC,CA 20 PASS . GT 1/2 1/1 0|0 0|1 +chr1 1000020 . AT T,A 60 . . GT 1/2 1/1 0|0 0|1 +chr1 1000030 . C G 60 . PASS GT 1/0 1/1 0|0 0|1 +chr1 1000040 . C G 60 . PASS GT 1/0 1/1 0|0 0|1 +chr1 1000050 . C G 60 . PASS GT 1/0 1/1 0|0 0|1 +chr1 1000060 . C G 60 . PASS GT 1/0 1/1 0|0 0|1 +chr1 1000070 . C G 60 . PASS GT 1/0 1/1 0|0 0|1 +chr1 1000080 . C G 60 . PASS GT 1/0 1/1 0|0 0|1 +chr1 1000090 . C G 60 . PASS GT 1/0 1/1 0|0 0|1 \ No newline at end of file diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/SampleIndexOnlyVariantQueryExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/SampleIndexOnlyVariantQueryExecutor.java index 6c0668aa1e2..8b4f5d1c347 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/SampleIndexOnlyVariantQueryExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/SampleIndexOnlyVariantQueryExecutor.java @@ -2,7 +2,6 @@ import com.google.common.collect.Iterators; import htsjdk.variant.vcf.VCFConstants; -import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.concurrent.BasicThreadFactory; import org.apache.commons.lang3.time.StopWatch; import org.opencb.biodata.models.variant.StudyEntry; @@ -177,8 +176,9 @@ private VariantDBIterator getVariantDBIterator(SampleIndexQuery sampleIndexQuery } catch (IOException e) { throw VariantQueryException.internalException(e).setQuery(inputQuery); } - SampleVariantIndexEntryToVariantConverter converter = - new SampleVariantIndexEntryToVariantConverter(parsedQuery, sampleIndexQuery, dbAdaptor.getMetadataManager()); + boolean includeAll = inputQuery.getBoolean("includeAllFromSampleIndex", false); + SampleVariantIndexEntryToVariantConverter converter = new SampleVariantIndexEntryToVariantConverter( + parsedQuery, sampleIndexQuery, dbAdaptor.getMetadataManager(), includeAll); variantIterator = VariantDBIterator.wrapper(Iterators.transform(rawIterator, converter::convert)); AddMissingDataTask task = new AddMissingDataTask( parsedQuery, sampleIndexQuery, dbAdaptor.getMetadataManager()); @@ -289,8 +289,6 @@ private boolean isIncludeCovered(SampleIndexQuery sampleIndexQuery, Query inputQ private static class SampleVariantIndexEntryToVariantConverter implements Converter { - - enum FamilyRole { MOTHER, FATHER, @@ -304,19 +302,17 @@ enum FamilyRole { private String motherName; private String fatherName; private LinkedHashMap samplesPosition; - private final List files; + private List sampleFiles; private IndexField filterField; private IndexField qualField; private final SampleIndexSchema schema; + private final boolean includeAll; SampleVariantIndexEntryToVariantConverter(ParsedVariantQuery parseQuery, SampleIndexQuery sampleIndexQuery, - VariantStorageMetadataManager metadataManager) { + VariantStorageMetadataManager metadataManager, boolean includeAll) { schema = sampleIndexQuery.getSchema(); - filterField = schema.getFileIndex() - .getCustomField(IndexFieldConfiguration.Source.FILE, StudyEntry.FILTER); - qualField = schema.getFileIndex() - .getCustomField(IndexFieldConfiguration.Source.FILE, StudyEntry.QUAL); + this.includeAll = includeAll; VariantQueryProjection projection = parseQuery.getProjection(); includeStudy = !projection.getStudyIds().isEmpty(); @@ -367,13 +363,22 @@ enum FamilyRole { this.fatherName = null; } - List fileIds = metadataManager.getFileIdsFromSampleId(studyId, sampleId, true); - files = new ArrayList<>(fileIds.size()); - for (Integer fileId : fileIds) { - files.add(metadataManager.getFileName(studyId, fileId)); + if (includeAll) { + if (sampleMetadata == null) { + sampleMetadata = metadataManager.getSampleMetadata(studyId, sampleId); + } + if (sampleMetadata.isMultiFileSample()) { + List sampleFileIds = sampleMetadata.getFiles(); + sampleFiles = new ArrayList<>(sampleFileIds.size()); + for (Integer fileId : sampleFileIds) { + sampleFiles.add(metadataManager.getFileName(studyId, fileId)); + } + } + filterField = schema.getFileIndex() + .getCustomField(IndexFieldConfiguration.Source.FILE, StudyEntry.FILTER); + qualField = schema.getFileIndex() + .getCustomField(IndexFieldConfiguration.Source.FILE, StudyEntry.QUAL); } - } else { - files = null; } } @@ -386,6 +391,7 @@ public Variant convert(SampleVariantIndexEntry entry) { studyEntry.setStudyId(studyName); studyEntry.setSampleDataKeys(Collections.singletonList("GT")); studyEntry.setSamples(new ArrayList<>(familyRoleOrder.size())); + SampleEntry sampleEntry = null; for (FamilyRole role : familyRoleOrder) { switch (role) { case MOTHER: @@ -397,29 +403,37 @@ public Variant convert(SampleVariantIndexEntry entry) { Arrays.asList(GenotypeCodec.decodeFather(entry.getParentsCode())))); break; case SAMPLE: - studyEntry.getSamples().add(new SampleEntry(sampleName, null, - Arrays.asList(entry.getGenotype()))); + sampleEntry = new SampleEntry(sampleName, null, + Arrays.asList(entry.getGenotype())); + studyEntry.getSamples().add(sampleEntry); break; default: throw new IllegalStateException("Unexpected value: " + role); } } - HashMap fileAttributes = new HashMap<>(); - // TODO: What if multi-files? - BitBuffer fileIndexBitBuffer = entry.getFileIndex(); - String filter = filterField.readAndDecode(fileIndexBitBuffer); - if (filter == null) { - filter = "NA"; - } - fileAttributes.put(StudyEntry.FILTER, filter); - String qual = qualField.readAndDecode(fileIndexBitBuffer); - if (qual == null) { - qual = "NA"; + if (includeAll) { + HashMap fileAttributes = new HashMap<>(); + for (BitBuffer fileIndexBitBuffer : entry.getFilesIndex()) { + String filter = filterField.readAndDecode(fileIndexBitBuffer); + if (filter == null) { + filter = "NA"; + } + fileAttributes.put(StudyEntry.FILTER, filter); + String qual = qualField.readAndDecode(fileIndexBitBuffer); + if (qual == null) { + qual = "NA"; + } + fileAttributes.put(StudyEntry.QUAL, qual); + + Integer idx = schema.getFileIndex().getFilePositionIndex().readAndDecode(fileIndexBitBuffer); + String fileName = sampleFiles.get(idx); + studyEntry.setFiles(new ArrayList<>()); + studyEntry.getFiles().add(new FileEntry(fileName, null, fileAttributes)); + if (sampleEntry != null) { + sampleEntry.setFileIndex(0); + } + } } - fileAttributes.put(StudyEntry.QUAL, qual); - String fileName = files.get(0); - studyEntry.setFiles(new ArrayList<>()); - studyEntry.getFiles().add(new FileEntry(fileName, null, fileAttributes)); studyEntry.setSortedSamplesPosition(samplesPosition); v.setStudies(Collections.singletonList(studyEntry)); } @@ -430,9 +444,10 @@ public Variant convert(SampleVariantIndexEntry entry) { private class AddMissingDataTask implements Task { private final ParsedVariantQuery parsedQuery; private final String studyName; - private final List samples; - private final List files; - private final List allFiles; + private final String sampleName; + private final List filesFromSample; + private final List includeSamples; + private final List allFiles; // from all includedSamples private final int gtIdx; AddMissingDataTask(ParsedVariantQuery parsedQuery, SampleIndexQuery sampleIndexQuery, @@ -449,9 +464,9 @@ private class AddMissingDataTask implements Task { throw new IllegalStateException("Unexpected number of samples. Expected one, found " + sampleIndexQuery.getSamplesMap().keySet()); } - samples = new ArrayList<>(projectionStudy.getSamples().size()); + includeSamples = new ArrayList<>(projectionStudy.getSamples().size()); for (Integer sample : projectionStudy.getSamples()) { - samples.add(metadataManager.getSampleName(studyId, sample)); + includeSamples.add(metadataManager.getSampleName(studyId, sample)); } Set allFileIds = metadataManager.getFileIdsFromSampleIds(studyId, projectionStudy.getSamples(), true); allFiles = new ArrayList<>(allFileIds.size()); @@ -459,12 +474,12 @@ private class AddMissingDataTask implements Task { allFiles.add(metadataManager.getFileName(studyId, fileId)); } - String sampleName = sampleIndexQuery.getSamplesMap().keySet().iterator().next(); + sampleName = sampleIndexQuery.getSamplesMap().keySet().iterator().next(); Integer sampleId = metadataManager.getSampleId(studyId, sampleName); List fileIds = metadataManager.getFileIdsFromSampleId(studyId, sampleId, true); - files = new ArrayList<>(fileIds.size()); + filesFromSample = new ArrayList<>(fileIds.size()); for (Integer fileId : fileIds) { - files.add(metadataManager.getFileName(studyId, fileId)); + filesFromSample.add(metadataManager.getFileName(studyId, fileId)); } List includeSampleData = VariantQueryUtils.getIncludeSampleData(parsedQuery.getInputQuery()); gtIdx = includeSampleData.indexOf("GT"); @@ -543,7 +558,7 @@ private void addSecondaryAlternates(List toReadFull) { Map variantsExtra = dbAdaptor.get(new VariantQuery() .id(toReadFull) .study(studyName) - .includeSample(samples) + .includeSample(includeSamples) .includeSampleData("GT") // read only GT .includeFile(allFiles), options) @@ -560,23 +575,17 @@ private void addSecondaryAlternates(List toReadFull) { StudyEntry studyExtra = variantExtra.getStudies().get(0); StudyEntry study = variant.getStudies().get(0); + study.setSecondaryAlternates(studyExtra.getSecondaryAlternates()); + mergeFileEntries(study, studyExtra.getFiles(), (fe, newFe) -> { fe.setCall(newFe.getCall()); }); // merge sampleEntries - for (int i = 0; i < samples.size(); i++) { -// String sampleName = samples.get(i); + for (int i = 0; i < includeSamples.size(); i++) { SampleEntry sample = study.getSample(i); SampleEntry sampleExtra = studyExtra.getSample(i); sample.getData().set(gtIdx, sampleExtra.getData().get(0)); -// if (sampleExtra.getFileIndex() != null) { -// String fileIdFromFull = fullStudy.getFiles().get(sampleExtra.getFileIndex()).getFileId(); -// if (sample.getFileIndex() == null) { -// actualStudy.getFiles(). -// String fileIdFrmoFull = fullStudy.getFiles().get(sampleExtra.getFileIndex()).getFileId(); -// } -// } } } // logger.info(" # Fetch {} SEC_ALTS in {}", toReadFull.size(), TimeUtils.durationToString(stopWatch)); @@ -588,7 +597,7 @@ private void addOriginalCall(List variants, String study) { for (Variant variant : dbAdaptor.iterable( new Query() .append(VariantQueryParam.ID.key(), variants) - .append(VariantQueryParam.INCLUDE_FILE.key(), files) + .append(VariantQueryParam.INCLUDE_FILE.key(), filesFromSample) .append(VariantQueryParam.INCLUDE_SAMPLE.key(), NONE) .append(VariantQueryParam.INCLUDE_STUDY.key(), study), new QueryOptions() @@ -620,21 +629,23 @@ private void addOriginalCall(List variants, String study) { private void mergeFileEntries(StudyEntry studyEntry, List newFileEntries, BiConsumer merge) { - if (CollectionUtils.isEmpty(studyEntry.getFiles())) { - studyEntry.setFiles(newFileEntries); - } else { - for (FileEntry newFileEntry : newFileEntries) { - FileEntry fileEntry = studyEntry.getFile(newFileEntry.getFileId()); - if (fileEntry == null) { - studyEntry.getFiles().add(newFileEntry); - } else { - merge.accept(fileEntry, newFileEntry); + if (studyEntry.getFiles() == null) { + studyEntry.setFiles(new ArrayList<>(newFileEntries.size())); + } + for (FileEntry newFileEntry : newFileEntries) { + FileEntry fileEntry = studyEntry.getFile(newFileEntry.getFileId()); + if (fileEntry == null) { + fileEntry = new FileEntry(newFileEntry.getFileId(), null, new HashMap<>()); + studyEntry.getFiles().add(fileEntry); + if (filesFromSample.contains(fileEntry.getFileId())) { + SampleEntry sampleEntry = studyEntry.getSample(sampleName); + if (sampleEntry.getFileIndex() == null) { + sampleEntry.setFileIndex(studyEntry.getFiles().size() - 1); + } } } + merge.accept(fileEntry, newFileEntry); } } - - } - } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/FileIndexSchema.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/FileIndexSchema.java index 4ea023e75e5..a383109139d 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/FileIndexSchema.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/FileIndexSchema.java @@ -213,7 +213,32 @@ private static IndexField buildCustomIndexField(IndexFieldConfiguration } else { return Arrays.asList(s.split(VCFConstants.FILTER_CODE_SEPARATOR)); } - }, v -> v == null ? null : String.join(VCFConstants.FILTER_CODE_SEPARATOR, v)); + }, values -> { + if (values == null || values.isEmpty()) { + return null; + } + if (values.size() == 1) { + String value = values.get(0); + if (value == null) { + return null; + } else { + return value; + } + } else { + StringBuilder sb = new StringBuilder(); + for (String v : values) { + if (sb.length() != 0) { + sb.append(VCFConstants.FILTER_CODE_SEPARATOR); + } + if (v == null) { + sb.append("NA"); + } else { + sb.append(v); + } + } + return sb.toString(); + } + }); } else { return new CategoricalMultiValuedIndexField<>(conf, bitOffset, conf.getValues()) .from(s -> { diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/core/CategoricalIndexFieldTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/core/CategoricalIndexFieldTest.java index a062617da1f..0abaa66893a 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/core/CategoricalIndexFieldTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/core/CategoricalIndexFieldTest.java @@ -1,7 +1,9 @@ package org.opencb.opencga.storage.hadoop.variant.index.core; +import org.apache.commons.lang3.tuple.Pair; import org.junit.Test; import org.junit.experimental.categories.Category; +import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.opencga.core.config.storage.IndexFieldConfiguration; import org.opencb.opencga.core.config.storage.SampleIndexConfiguration; import org.opencb.opencga.core.testclassification.duration.ShortTests; @@ -15,6 +17,7 @@ import java.util.Set; import static org.junit.Assert.assertEquals; +import static org.opencb.opencga.core.config.storage.IndexFieldConfiguration.Source.FILE; import static org.opencb.opencga.core.config.storage.IndexFieldConfiguration.Source.SAMPLE; @Category(ShortTests.class) @@ -39,10 +42,55 @@ public void testLength() { assertEquals(3, CategoricalIndexField.create(new IndexFieldConfiguration(SAMPLE, "K", IndexFieldConfiguration.Type.CATEGORICAL, "1", "2", "3", "4", "5", "6").setNullable(nullable), 0).getBitLength()); } + @Test + public void testEncodeDecodeQual() { + SampleIndexSchema indexSchema = SampleIndexSchema.defaultSampleIndexSchema(); + IndexField qualfield = indexSchema.getFileIndex().getCustomField(FILE, StudyEntry.QUAL); + + List> pairs = Arrays.asList( + Pair.of("45", "30.0"), + Pair.of("25", "20.0"), + Pair.of("30", "30.0"), + Pair.of("10", "10.0"), + Pair.of("0", Double.toString(Double.MIN_VALUE)) + ); + for (Pair pair : pairs) { + String qual = pair.getKey(); + String expectedQual = pair.getValue(); + int encode = qualfield.encode(qual); + String actualQual = qualfield.decode(encode); + assertEquals(expectedQual, actualQual); + } + } + + @Test + public void testEncodeDecodeFilter() { + SampleIndexConfiguration indexConfiguration = SampleIndexConfiguration.defaultConfiguration(); + indexConfiguration.getFileIndexConfiguration().getCustomField(FILE, StudyEntry.FILTER).setValues("PASS", "noPass"); + SampleIndexSchema indexSchema = new SampleIndexSchema(indexConfiguration, 0); + IndexField field = indexSchema.getFileIndex().getCustomField(FILE, StudyEntry.FILTER); + + List> pairs = Arrays.asList( + Pair.of("PASS", "PASS"), + Pair.of("asdfasdf", null), + Pair.of("noPass", "noPass"), + Pair.of("PASS;noPass", "PASS;noPass"), + Pair.of("PASS;noPass;other;another", "PASS;noPass;NA"), + Pair.of(".", null) + ); + for (Pair pair : pairs) { + String filter = pair.getKey(); + String expectedFilter = pair.getValue(); + int encode = field.encode(filter); + String actualFilter = field.decode(encode); + assertEquals(expectedFilter, actualFilter); + } + } + @Test public void testEncodeDecode() { SampleIndexSchema indexSchema = SampleIndexSchema.defaultSampleIndexSchema(); - CategoricalMultiValuedIndexField field = (CategoricalMultiValuedIndexField) indexSchema.getCtIndex().getField(); + CategoricalMultiValuedIndexField field = indexSchema.getCtIndex().getField(); List expected = Arrays.asList("synonymous_variant", "missense_variant"); int encode = field.encode(expected); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java index 7693fc1a53e..3cea4d62452 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java @@ -14,8 +14,10 @@ import org.junit.experimental.categories.Category; import org.junit.rules.ExternalResource; import org.opencb.biodata.models.core.Region; +import org.opencb.biodata.models.variant.StudyEntry; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.ConsequenceType; +import org.opencb.biodata.models.variant.avro.FileEntry; import org.opencb.biodata.models.variant.avro.VariantType; import org.opencb.biodata.models.variant.metadata.SampleVariantStats; import org.opencb.commons.datastore.core.*; @@ -59,6 +61,7 @@ import java.util.*; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; import java.util.stream.Collectors; import static java.util.stream.Collectors.*; @@ -84,8 +87,15 @@ public class SampleIndexTest extends VariantStorageBaseTest implements HadoopVar private static boolean loaded = false; public static final String STUDY_NAME_3 = "study_3"; public static final String STUDY_NAME_4 = "study_4"; - public static final String STUDY_NAME_5 = "study_5"; - private static final List studies = Arrays.asList(STUDY_NAME, STUDY_NAME_2, STUDY_NAME_3, STUDY_NAME_4, STUDY_NAME_5); + public static final String STUDY_NAME_5 = "study_5"; // large SV + public static final String STUDY_NAME_6 = "study_6"; // multiallelic + private static final List studies = Arrays.asList( + STUDY_NAME, + STUDY_NAME_2, + STUDY_NAME_3, + STUDY_NAME_4, + STUDY_NAME_5, + STUDY_NAME_6); private static final Map> sampleNames = new HashMap>() {{ put(STUDY_NAME, Arrays.asList("NA19600", "NA19660", "NA19661", "NA19685")); put(STUDY_NAME_2, Arrays.asList("NA19600", "NA19660", "NA19661", "NA19685")); @@ -192,6 +202,20 @@ public void load() throws Exception { runETL(engine, getResourceUri("variant-large-sv.vcf"), outputUri, params, true, true, true); engine.familyIndex(STUDY_NAME_5, trios, new ObjectMap()); + // Study 6, multiallelic + SampleIndexConfiguration sampleIndexConfiguration = SampleIndexConfiguration.defaultConfiguration(); + System.out.println("sampleIndexConfiguration.getFileIndexConfiguration() = " + sampleIndexConfiguration.getFileIndexConfiguration()); + sampleIndexConfiguration.getFileIndexConfiguration().getCustomField(IndexFieldConfiguration.Source.FILE, "FILTER") + .setValues("PASS", "noPass", "noPass2"); + engine.getMetadataManager().addSampleIndexConfiguration(STUDY_NAME_6, sampleIndexConfiguration, true); + + params = new ObjectMap() + .append(VariantStorageOptions.STUDY.key(), STUDY_NAME_6) + .append(VariantStorageOptions.ANNOTATE.key(), false) + .append(VariantStorageOptions.STATS_CALCULATE.key(), false); + runETL(engine, getResourceUri("variant-multiallelic.vcf"), outputUri, params, true, true, true); + engine.familyIndex(STUDY_NAME_6, trios, new ObjectMap()); + // ---------------- Annotate // variantStorageEngine.getConfiguration().getCellbase().setUrl(ParamConstants.CELLBASE_URL); @@ -1122,9 +1146,41 @@ public void testSampleIndexOnlyVariantQueryExecutor() { new QueryOptions(QueryOptions.INCLUDE, Arrays.asList(VariantField.ID, VariantField.STUDIES_SAMPLES)), SampleIndexOnlyVariantQueryExecutor.class); + testSampleIndexOnlyVariantQueryExecutor( + new VariantQuery() + .study(STUDY_NAME_6) + .sample("NA19600") + .includeGenotype(true), + new QueryOptions(QueryOptions.INCLUDE, Arrays.asList(VariantField.ID, VariantField.STUDIES_SAMPLES)), + SampleIndexOnlyVariantQueryExecutor.class); + + testSampleIndexOnlyVariantQueryExecutor( + new VariantQuery() + .study(STUDY_NAME_6) + .sample("NA19600") + .includeGenotype(true) + .append("includeAllFromSampleIndex", true), + new QueryOptions(QueryOptions.INCLUDE, Arrays.asList(VariantField.ID, VariantField.STUDIES_SAMPLES)), + SampleIndexOnlyVariantQueryExecutor.class, + v -> { + for (FileEntry fe : v.getStudies().get(0).getFiles()) { + assertNotNull(fe.getData().get(StudyEntry.FILTER)); + fe.setData(Collections.emptyMap()); + } + v.getStudies().get(0).getFiles().removeIf(fe -> fe.getCall() == null); + if (v.getStudies().get(0).getFiles().isEmpty()) { + v.getStudies().get(0).getSamples().forEach(s -> s.setFileIndex(null)); + } + return v; + }); } private void testSampleIndexOnlyVariantQueryExecutor(VariantQuery query, QueryOptions options, Class expected) { + testSampleIndexOnlyVariantQueryExecutor(query, options, expected, v -> v); + } + + private void testSampleIndexOnlyVariantQueryExecutor(VariantQuery query, QueryOptions options, Class expected, + Function mapper) { VariantQueryExecutor variantQueryExecutor = variantStorageEngine.getVariantQueryExecutor( query, options); @@ -1158,6 +1214,7 @@ private void testSampleIndexOnlyVariantQueryExecutor(VariantQuery query, QueryOp for (int i = 0; i < actualVariants.size(); i++) { Variant av = actualVariants.get(i); Variant ev = expectedVariants.get(i); + mapper.apply(av); if (!ev.getStudies().isEmpty()) { if (av.getLengthAlternate() == 0 || av.getLengthReference() == 0) { // System.out.println("-------" + av + "----------");