Merge pull request #2450 from opencb/TASK-6240
TASK-6240 - Symbolic inversions <INV> not indexed during data load
j-coll authored May 17, 2024
2 parents ff2b0c8 + 9c31131 commit 37b1618
Showing 5 changed files with 11 additions and 17 deletions.
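
For orientation before the per-file hunks: the heart of the fix is in HadoopVariantStorageEngine, which previously enumerated every variant type to index (an allow-list that did not include VariantType.INVERSION, so symbolic <INV> records were not indexed during data load) and now enumerates only the types it does not index, deriving the target set as the complement. The sketch below illustrates the idea with plain java.util.EnumSet and a hypothetical, heavily trimmed stand-in for the biodata VariantType enum; it is not the production class.

import java.util.EnumSet;

public class TargetTypeSketch {

    // Hypothetical stand-in for the biodata VariantType enum (most real values omitted).
    enum VariantType { SNV, INDEL, INSERTION, DELETION, INVERSION, BREAKEND, NO_VARIATION, MIXED }

    // Deny-list: the only types the loader skips.
    static final EnumSet<VariantType> UNSUPPORTED_VARIANT_TYPE_SET =
            EnumSet.of(VariantType.NO_VARIATION, VariantType.MIXED);

    // Every other type is a target, so INVERSION is included without being listed explicitly.
    static final EnumSet<VariantType> TARGET_VARIANT_TYPE_SET =
            EnumSet.complementOf(UNSUPPORTED_VARIANT_TYPE_SET);

    public static void main(String[] args) {
        System.out.println(TARGET_VARIANT_TYPE_SET.contains(VariantType.INVERSION));    // true
        System.out.println(TARGET_VARIANT_TYPE_SET.contains(VariantType.NO_VARIATION)); // false
    }
}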
File 1 of 5 (VCF):

@@ -46,7 +46,7 @@
1 700000 . C <DEL:ME:ALU> . PASS SVTYPE=DEL;END=700297;SVLEN=-297;CIPOS=-22,18;CIEND=-12,32 GT 0/1 0/1
1 800000 . A <INS> . PASS SVTYPE=INS;END=800000;SVLEN=6027;CIPOS=-16,22;RIGHT_SVINSSEQ=ACCACACCCACACAACACACA;LEFT_SVINSSEQ=TGTGGTGTGTGTGGTGTG GT 0/1 0/1
1 850000 . A <INS> . PASS SVTYPE=INS;END=850000;SVINSSEQ=ACCACACCCACACAACACACAACCACACCCACACAACACACAACCACACCCACACAACACACAACCACACCCACACAACACACAACCACACCCACACAACACACAACCACACCCACACAACACACAACCACACCCACACAACACACAACCACACCCACACAACACACAACCACACCCACACAACACACAACCACACCCACACAACACACAACCACACCCACACAACACACA GT 0/1 0/1
-1 860000 . A <INS> . PASS SVTYPE=INS;END=860000;SVLEN=1000 GT 0/1 0/1
+1 860000 . A <INV> . PASS SVTYPE=INVERSION;END=870000 GT 0/1 0/1
1 900000 . G <INS> . PASS SVTYPE=INS;END=900000;SVLEN=6027;CIPOS=-16,22 GT 0/1 0/1
1 1000000 . A <DUP> . PASS SVTYPE=DUP;END=1021100;SVLEN=21100;CIPOS=-500,500;CIEND=-500,500 GT 0/1 0/1
1 1100000 . T <DUP:TANDEM> . PASS SVTYPE=DUP;END=1100076;SVLEN=76;CIPOS=-10,10;CIEND=-10,10 GT 0/1 0/1
File 2 of 5 (VCF):

@@ -46,7 +46,7 @@
1 700010 . T C . PASS . GT 0/1 0/1
1 800000 . A <INS> . PASS SVTYPE=INS;END=800000;SVLEN=6027;CIPOS=-16,22;RIGHT_SVINSSEQ=TGTGGTGTGTGTGGTGTG;LEFT_SVINSSEQ=ACCACACCCACACAACACACA GT 0/1 0/1
1 850000 . A <INS> . PASS SVTYPE=INS;END=850000;SVINSSEQ=TGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTGTGTGGTGTG GT 0/1 0/1
-1 860000 . A <INS> . PASS SVTYPE=INS;END=860000;SVLEN=1000 GT 0/1 0/1
+1 860000 . A <INV> . PASS SVTYPE=INVERSION;END=870000 GT 0/1 0/1
1 900000 . G <INS> . PASS SVTYPE=INS;END=900000;SVLEN=6027;CIPOS=-16,22 GT 0/1 0/1
1 1000000 . A <DUP> . PASS SVTYPE=DUP;END=1021100;SVLEN=21100;CIPOS=-500,500;CIEND=-500,500 GT 0/1 0/1
1 1100000 . T <DUP:TANDEM> . PASS SVTYPE=DUP;END=1100076;SVLEN=76;CIPOS=-10,10;CIEND=-10,10 GT 0/1 0/1
File 3 of 5 (Java):

@@ -70,6 +70,7 @@
import static org.opencb.opencga.storage.core.metadata.models.TaskMetadata.Type;
import static org.opencb.opencga.storage.core.variant.VariantStorageOptions.*;
import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.TARGET_VARIANT_TYPE_SET;
+import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.UNSUPPORTED_VARIANT_TYPE_SET;
import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageOptions.*;

/**
@@ -390,13 +391,13 @@ protected void loadFromAvroWithArchive(URI input, URI outdir, ArchiveTableHelper
throw new StorageEngineException("Error loading file " + input, e);
}

-logLoadResults(variantReader.getVariantFileMetadata(), resolver, hadoopDBWriter);
if (sampleIndexDBLoader != null) {
// Update list of loaded genotypes
this.loadedGenotypes = sampleIndexDBLoader.getLoadedGenotypes();
this.sampleIndexVersion = sampleIndexDBLoader.getSampleIndexVersion();
this.largestVariantLength = largestVariantTask.getMaxLength();
}
+logLoadResults(variantReader.getVariantFileMetadata(), resolver, hadoopDBWriter);
}

protected void loadFromAvroWithoutArchive(URI input, URI outdir, ArchiveTableHelper helper, ProgressLogger progressLogger)
@@ -438,13 +439,13 @@ protected void loadFromAvroWithoutArchive(URI input, URI outdir, ArchiveTableHel
throw new StorageEngineException("Error loading file " + input, e);
}

-logLoadResults(variantReader.getVariantFileMetadata(), resolver, hadoopDBWriter);
if (sampleIndexDBLoader != null) {
// Update list of loaded genotypes
this.loadedGenotypes = sampleIndexDBLoader.getLoadedGenotypes();
this.sampleIndexVersion = sampleIndexDBLoader.getSampleIndexVersion();
this.largestVariantLength = largestVariantTask.getMaxLength();
}
+logLoadResults(variantReader.getVariantFileMetadata(), resolver, hadoopDBWriter);
}

private void logLoadResults(VariantFileMetadata variantFileMetadata,
@@ -490,7 +491,7 @@ private void logLoadResults(VariantFileMetadata variantFileMetadata, int duplica
if (skipped > 0) {
logger.info("There were " + skipped + " skipped variants");
for (VariantType type : VariantType.values()) {
-if (!TARGET_VARIANT_TYPE_SET.contains(type)) {
+if (UNSUPPORTED_VARIANT_TYPE_SET.contains(type)) {
Long countByType = variantFileMetadata.getStats().getTypeCount().get(type.toString());
if (countByType != null && countByType > 0) {
logger.info(" * Of which " + countByType + " are " + type.toString() + " variants.");
File 4 of 5 (Java):

@@ -121,17 +121,10 @@
public class HadoopVariantStorageEngine extends VariantStorageEngine implements Configurable {
public static final String STORAGE_ENGINE_ID = "hadoop";

-public static final EnumSet<VariantType> TARGET_VARIANT_TYPE_SET = EnumSet.of(
-VariantType.SNV, VariantType.SNP,
-VariantType.INDEL,
-VariantType.MNV, VariantType.MNP,
-VariantType.INSERTION, VariantType.DELETION,
-VariantType.CNV,
-VariantType.COPY_NUMBER, VariantType.COPY_NUMBER_LOSS, VariantType.COPY_NUMBER_GAIN,
-VariantType.DUPLICATION, VariantType.TANDEM_DUPLICATION, VariantType.TRANSLOCATION,
-VariantType.BREAKEND,
-VariantType.SV, VariantType.SYMBOLIC
+public static final EnumSet<VariantType> UNSUPPORTED_VARIANT_TYPE_SET = EnumSet.of(
+VariantType.NO_VARIATION, VariantType.MIXED
);
+public static final EnumSet<VariantType> TARGET_VARIANT_TYPE_SET = EnumSet.complementOf(UNSUPPORTED_VARIANT_TYPE_SET);

public static final String FILE_ID = "fileId";
public static final String STUDY_ID = "studyId";
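
A note on the design choice, continuing the hypothetical TargetTypeSketch stand-in from the sketch above (illustrative only, not part of this commit): with TARGET_VARIANT_TYPE_SET defined as the complement of UNSUPPORTED_VARIANT_TYPE_SET, the checks !TARGET_VARIANT_TYPE_SET.contains(type) and UNSUPPORTED_VARIANT_TYPE_SET.contains(type) are interchangeable, which is why the other two Java files can simply flip them, whereas the old allow-list also excluded INVERSION, which is what this commit fixes. Any value added to VariantType later is indexed by default unless it is explicitly added to the deny-list.

    // Extends the TargetTypeSketch stand-in above; illustrative only.
    static void checkComplementInvariant() {
        for (VariantType type : VariantType.values()) {
            boolean skippedViaTarget = !TARGET_VARIANT_TYPE_SET.contains(type);
            boolean skippedViaDenyList = UNSUPPORTED_VARIANT_TYPE_SET.contains(type);
            if (skippedViaTarget != skippedViaDenyList) {
                throw new AssertionError("Complement invariant broken for " + type);
            }
        }
    }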
File 5 of 5 (Java):

@@ -32,7 +32,7 @@
import java.util.*;
import java.util.concurrent.TimeUnit;

-import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.TARGET_VARIANT_TYPE_SET;
+import static org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine.UNSUPPORTED_VARIANT_TYPE_SET;

/**
* Created on 31/10/17.
@@ -348,7 +348,7 @@ protected static Scan buildScan(String regionStr, int fileId, Configuration conf
protected static boolean isVariantAlreadyLoaded(VcfSliceProtos.VcfSlice slice, VcfSliceProtos.VcfRecord vcfRecord) {
VariantType variantType = VcfRecordProtoToVariantConverter.getVariantType(vcfRecord.getType());
// The variant is not loaded if is a NO_VARIATION (fast check first)
-if (!TARGET_VARIANT_TYPE_SET.contains(variantType)) {
+if (UNSUPPORTED_VARIANT_TYPE_SET.contains(variantType)) {
return false;
}

