Skip to content

Commit

Permalink
storage: Do not return extra files information. #TASK-4794
Browse files Browse the repository at this point in the history
  • Loading branch information
j-coll committed Jul 25, 2023
1 parent c21e416 commit 4de465a
Show file tree
Hide file tree
Showing 7 changed files with 233 additions and 76 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import org.opencb.biodata.models.clinical.Phenotype;
import org.opencb.biodata.models.clinical.pedigree.Member;
import org.opencb.biodata.models.clinical.pedigree.Pedigree;
import org.opencb.biodata.models.core.SexOntologyTermAnnotation;
import org.opencb.biodata.models.pedigree.IndividualProperty;
import org.opencb.commons.datastore.core.QueryOptions;
import org.opencb.commons.exec.Command;
Expand All @@ -21,12 +20,9 @@
import org.opencb.opencga.core.exceptions.ToolExecutorException;
import org.opencb.opencga.core.models.family.Family;
import org.opencb.opencga.core.models.individual.Individual;
import org.opencb.opencga.core.models.sample.Sample;
import org.opencb.opencga.core.tools.annotations.ToolExecutor;
import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
import org.opencb.opencga.storage.core.variant.adaptors.VariantField;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQuery;
import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam;
import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -36,7 +32,6 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.stream.Collectors;

@ToolExecutor(id = ExomiserWrapperAnalysisExecutor.ID,
tool = ExomiserWrapperAnalysis.ID,
Expand Down Expand Up @@ -133,7 +128,8 @@ public void run() throws ToolException {
.sample(sampleId)
.includeSample(samples)
.includeSampleData("GT")
.unknownGenotype("./.");
.unknownGenotype("./.")
.append("includeAllFromSampleIndex", true);

QueryOptions queryOptions = new QueryOptions(QueryOptions.INCLUDE, "id,studies.samples");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ public static SampleIndexConfiguration defaultConfiguration(boolean cellbaseV4)
.addFileIndexField(new IndexFieldConfiguration(
IndexFieldConfiguration.Source.FILE,
StudyEntry.FILTER,
IndexFieldConfiguration.Type.CATEGORICAL,
IndexFieldConfiguration.Type.CATEGORICAL_MULTI_VALUE,
VCFConstants.PASSES_FILTERS_v4))
.addFileIndexField(new IndexFieldConfiguration(
IndexFieldConfiguration.Source.FILE, StudyEntry.QUAL, QUAL_THRESHOLDS).setNullable(false))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
##fileformat=VCFv4.1
##FILTER=<ID=PASS,Description="All filters passed">
##FILTER=<ID=noPass,Description="No pass">
##FILTER=<ID=noPass2,Description="No pass other">
##FILTER=<ID=.,Description="unknown filter state">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype likelihoods">
##FORMAT=<ID=DS,Number=1,Type=Float,Description="">
##command=seq 1000000 500 3000000 | while read i ; do echo -e "chr1\t$i\t.\tA\tC\t$RANDOM\tPASS\t.\tGT\t0/1\t1/1\t1|0\t0|1" ; done
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA19600 NA19660 NA19661 NA19685
chr1 1000000 . A C,T 5 noPass,noPass2 . GT 1/2 1/1 0|0 0|1
chr1 1000010 . A AC,CA 20 PASS . GT 1/2 1/1 0|0 0|1
chr1 1000020 . AT T,A 60 . . GT 1/2 1/1 0|0 0|1
chr1 1000030 . C G 60 . PASS GT 1/0 1/1 0|0 0|1
chr1 1000040 . C G 60 . PASS GT 1/0 1/1 0|0 0|1
chr1 1000050 . C G 60 . PASS GT 1/0 1/1 0|0 0|1
chr1 1000060 . C G 60 . PASS GT 1/0 1/1 0|0 0|1
chr1 1000070 . C G 60 . PASS GT 1/0 1/1 0|0 0|1
chr1 1000080 . C G 60 . PASS GT 1/0 1/1 0|0 0|1
chr1 1000090 . C G 60 . PASS GT 1/0 1/1 0|0 0|1
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

import com.google.common.collect.Iterators;
import htsjdk.variant.vcf.VCFConstants;
import org.apache.commons.collections4.CollectionUtils;
import org.apache.commons.lang3.concurrent.BasicThreadFactory;
import org.apache.commons.lang3.time.StopWatch;
import org.opencb.biodata.models.variant.StudyEntry;
Expand Down Expand Up @@ -177,8 +176,9 @@ private VariantDBIterator getVariantDBIterator(SampleIndexQuery sampleIndexQuery
} catch (IOException e) {
throw VariantQueryException.internalException(e).setQuery(inputQuery);
}
SampleVariantIndexEntryToVariantConverter converter =
new SampleVariantIndexEntryToVariantConverter(parsedQuery, sampleIndexQuery, dbAdaptor.getMetadataManager());
boolean includeAll = inputQuery.getBoolean("includeAllFromSampleIndex", false);
SampleVariantIndexEntryToVariantConverter converter = new SampleVariantIndexEntryToVariantConverter(
parsedQuery, sampleIndexQuery, dbAdaptor.getMetadataManager(), includeAll);
variantIterator = VariantDBIterator.wrapper(Iterators.transform(rawIterator, converter::convert));
AddMissingDataTask task = new AddMissingDataTask(
parsedQuery, sampleIndexQuery, dbAdaptor.getMetadataManager());
Expand Down Expand Up @@ -289,8 +289,6 @@ private boolean isIncludeCovered(SampleIndexQuery sampleIndexQuery, Query inputQ

private static class SampleVariantIndexEntryToVariantConverter implements Converter<SampleVariantIndexEntry, Variant> {



enum FamilyRole {
MOTHER,
FATHER,
Expand All @@ -304,19 +302,17 @@ enum FamilyRole {
private String motherName;
private String fatherName;
private LinkedHashMap<String, Integer> samplesPosition;
private final List<String> files;
private List<String> sampleFiles;
private IndexField<String> filterField;
private IndexField<String> qualField;
private final SampleIndexSchema schema;
private final boolean includeAll;


SampleVariantIndexEntryToVariantConverter(ParsedVariantQuery parseQuery, SampleIndexQuery sampleIndexQuery,
VariantStorageMetadataManager metadataManager) {
VariantStorageMetadataManager metadataManager, boolean includeAll) {
schema = sampleIndexQuery.getSchema();
filterField = schema.getFileIndex()
.getCustomField(IndexFieldConfiguration.Source.FILE, StudyEntry.FILTER);
qualField = schema.getFileIndex()
.getCustomField(IndexFieldConfiguration.Source.FILE, StudyEntry.QUAL);
this.includeAll = includeAll;

VariantQueryProjection projection = parseQuery.getProjection();
includeStudy = !projection.getStudyIds().isEmpty();
Expand Down Expand Up @@ -367,13 +363,22 @@ enum FamilyRole {
this.fatherName = null;
}

List<Integer> fileIds = metadataManager.getFileIdsFromSampleId(studyId, sampleId, true);
files = new ArrayList<>(fileIds.size());
for (Integer fileId : fileIds) {
files.add(metadataManager.getFileName(studyId, fileId));
if (includeAll) {
if (sampleMetadata == null) {
sampleMetadata = metadataManager.getSampleMetadata(studyId, sampleId);
}
if (sampleMetadata.isMultiFileSample()) {
List<Integer> sampleFileIds = sampleMetadata.getFiles();
sampleFiles = new ArrayList<>(sampleFileIds.size());
for (Integer fileId : sampleFileIds) {
sampleFiles.add(metadataManager.getFileName(studyId, fileId));
}
}
filterField = schema.getFileIndex()
.getCustomField(IndexFieldConfiguration.Source.FILE, StudyEntry.FILTER);
qualField = schema.getFileIndex()
.getCustomField(IndexFieldConfiguration.Source.FILE, StudyEntry.QUAL);
}
} else {
files = null;
}
}

Expand All @@ -386,6 +391,7 @@ public Variant convert(SampleVariantIndexEntry entry) {
studyEntry.setStudyId(studyName);
studyEntry.setSampleDataKeys(Collections.singletonList("GT"));
studyEntry.setSamples(new ArrayList<>(familyRoleOrder.size()));
SampleEntry sampleEntry = null;
for (FamilyRole role : familyRoleOrder) {
switch (role) {
case MOTHER:
Expand All @@ -397,29 +403,37 @@ public Variant convert(SampleVariantIndexEntry entry) {
Arrays.asList(GenotypeCodec.decodeFather(entry.getParentsCode()))));
break;
case SAMPLE:
studyEntry.getSamples().add(new SampleEntry(sampleName, null,
Arrays.asList(entry.getGenotype())));
sampleEntry = new SampleEntry(sampleName, null,
Arrays.asList(entry.getGenotype()));
studyEntry.getSamples().add(sampleEntry);
break;
default:
throw new IllegalStateException("Unexpected value: " + role);
}
}
HashMap<String, String> fileAttributes = new HashMap<>();
// TODO: What if multi-files?
BitBuffer fileIndexBitBuffer = entry.getFileIndex();
String filter = filterField.readAndDecode(fileIndexBitBuffer);
if (filter == null) {
filter = "NA";
}
fileAttributes.put(StudyEntry.FILTER, filter);
String qual = qualField.readAndDecode(fileIndexBitBuffer);
if (qual == null) {
qual = "NA";
if (includeAll) {
HashMap<String, String> fileAttributes = new HashMap<>();
for (BitBuffer fileIndexBitBuffer : entry.getFilesIndex()) {
String filter = filterField.readAndDecode(fileIndexBitBuffer);
if (filter == null) {
filter = "NA";
}
fileAttributes.put(StudyEntry.FILTER, filter);
String qual = qualField.readAndDecode(fileIndexBitBuffer);
if (qual == null) {
qual = "NA";
}
fileAttributes.put(StudyEntry.QUAL, qual);

Integer idx = schema.getFileIndex().getFilePositionIndex().readAndDecode(fileIndexBitBuffer);
String fileName = sampleFiles.get(idx);
studyEntry.setFiles(new ArrayList<>());
studyEntry.getFiles().add(new FileEntry(fileName, null, fileAttributes));
if (sampleEntry != null) {
sampleEntry.setFileIndex(0);
}
}
}
fileAttributes.put(StudyEntry.QUAL, qual);
String fileName = files.get(0);
studyEntry.setFiles(new ArrayList<>());
studyEntry.getFiles().add(new FileEntry(fileName, null, fileAttributes));
studyEntry.setSortedSamplesPosition(samplesPosition);
v.setStudies(Collections.singletonList(studyEntry));
}
Expand All @@ -430,9 +444,10 @@ public Variant convert(SampleVariantIndexEntry entry) {
private class AddMissingDataTask implements Task<Variant, Variant> {
private final ParsedVariantQuery parsedQuery;
private final String studyName;
private final List<String> samples;
private final List<String> files;
private final List<String> allFiles;
private final String sampleName;
private final List<String> filesFromSample;
private final List<String> includeSamples;
private final List<String> allFiles; // from all includedSamples
private final int gtIdx;

AddMissingDataTask(ParsedVariantQuery parsedQuery, SampleIndexQuery sampleIndexQuery,
Expand All @@ -449,22 +464,22 @@ private class AddMissingDataTask implements Task<Variant, Variant> {
throw new IllegalStateException("Unexpected number of samples. Expected one, found "
+ sampleIndexQuery.getSamplesMap().keySet());
}
samples = new ArrayList<>(projectionStudy.getSamples().size());
includeSamples = new ArrayList<>(projectionStudy.getSamples().size());
for (Integer sample : projectionStudy.getSamples()) {
samples.add(metadataManager.getSampleName(studyId, sample));
includeSamples.add(metadataManager.getSampleName(studyId, sample));
}
Set<Integer> allFileIds = metadataManager.getFileIdsFromSampleIds(studyId, projectionStudy.getSamples(), true);
allFiles = new ArrayList<>(allFileIds.size());
for (Integer fileId : allFileIds) {
allFiles.add(metadataManager.getFileName(studyId, fileId));
}

String sampleName = sampleIndexQuery.getSamplesMap().keySet().iterator().next();
sampleName = sampleIndexQuery.getSamplesMap().keySet().iterator().next();
Integer sampleId = metadataManager.getSampleId(studyId, sampleName);
List<Integer> fileIds = metadataManager.getFileIdsFromSampleId(studyId, sampleId, true);
files = new ArrayList<>(fileIds.size());
filesFromSample = new ArrayList<>(fileIds.size());
for (Integer fileId : fileIds) {
files.add(metadataManager.getFileName(studyId, fileId));
filesFromSample.add(metadataManager.getFileName(studyId, fileId));
}
List<String> includeSampleData = VariantQueryUtils.getIncludeSampleData(parsedQuery.getInputQuery());
gtIdx = includeSampleData.indexOf("GT");
Expand Down Expand Up @@ -543,7 +558,7 @@ private void addSecondaryAlternates(List<Variant> toReadFull) {
Map<String, Variant> variantsExtra = dbAdaptor.get(new VariantQuery()
.id(toReadFull)
.study(studyName)
.includeSample(samples)
.includeSample(includeSamples)
.includeSampleData("GT") // read only GT
.includeFile(allFiles),
options)
Expand All @@ -560,23 +575,17 @@ private void addSecondaryAlternates(List<Variant> toReadFull) {
StudyEntry studyExtra = variantExtra.getStudies().get(0);
StudyEntry study = variant.getStudies().get(0);

study.setSecondaryAlternates(studyExtra.getSecondaryAlternates());

mergeFileEntries(study, studyExtra.getFiles(), (fe, newFe) -> {
fe.setCall(newFe.getCall());
});
// merge sampleEntries
for (int i = 0; i < samples.size(); i++) {
// String sampleName = samples.get(i);
for (int i = 0; i < includeSamples.size(); i++) {
SampleEntry sample = study.getSample(i);
SampleEntry sampleExtra = studyExtra.getSample(i);

sample.getData().set(gtIdx, sampleExtra.getData().get(0));
// if (sampleExtra.getFileIndex() != null) {
// String fileIdFromFull = fullStudy.getFiles().get(sampleExtra.getFileIndex()).getFileId();
// if (sample.getFileIndex() == null) {
// actualStudy.getFiles().
// String fileIdFrmoFull = fullStudy.getFiles().get(sampleExtra.getFileIndex()).getFileId();
// }
// }
}
}
// logger.info(" # Fetch {} SEC_ALTS in {}", toReadFull.size(), TimeUtils.durationToString(stopWatch));
Expand All @@ -588,7 +597,7 @@ private void addOriginalCall(List<Variant> variants, String study) {
for (Variant variant : dbAdaptor.iterable(
new Query()
.append(VariantQueryParam.ID.key(), variants)
.append(VariantQueryParam.INCLUDE_FILE.key(), files)
.append(VariantQueryParam.INCLUDE_FILE.key(), filesFromSample)
.append(VariantQueryParam.INCLUDE_SAMPLE.key(), NONE)
.append(VariantQueryParam.INCLUDE_STUDY.key(), study),
new QueryOptions()
Expand Down Expand Up @@ -620,21 +629,23 @@ private void addOriginalCall(List<Variant> variants, String study) {

private void mergeFileEntries(StudyEntry studyEntry, List<FileEntry> newFileEntries,
BiConsumer<FileEntry, FileEntry> merge) {
if (CollectionUtils.isEmpty(studyEntry.getFiles())) {
studyEntry.setFiles(newFileEntries);
} else {
for (FileEntry newFileEntry : newFileEntries) {
FileEntry fileEntry = studyEntry.getFile(newFileEntry.getFileId());
if (fileEntry == null) {
studyEntry.getFiles().add(newFileEntry);
} else {
merge.accept(fileEntry, newFileEntry);
if (studyEntry.getFiles() == null) {
studyEntry.setFiles(new ArrayList<>(newFileEntries.size()));
}
for (FileEntry newFileEntry : newFileEntries) {
FileEntry fileEntry = studyEntry.getFile(newFileEntry.getFileId());
if (fileEntry == null) {
fileEntry = new FileEntry(newFileEntry.getFileId(), null, new HashMap<>());
studyEntry.getFiles().add(fileEntry);
if (filesFromSample.contains(fileEntry.getFileId())) {
SampleEntry sampleEntry = studyEntry.getSample(sampleName);
if (sampleEntry.getFileIndex() == null) {
sampleEntry.setFileIndex(studyEntry.getFiles().size() - 1);
}
}
}
merge.accept(fileEntry, newFileEntry);
}
}


}

}
Loading

0 comments on commit 4de465a

Please sign in to comment.