Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TASK-6515 - Port Patch 1.10.6(.1) -> 2.2.1 #702

Merged
merged 15 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,14 @@ download:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz
version: "2023-12-01"
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-05.xml.gz
version: 2024-05
clinvarVariation:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/VCV_xml_old_format/ClinVarVariationRelease_2024-05.xml.gz
version: 2024-05
clinvarSummary:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
version: "2023-12-01"
Expand All @@ -158,10 +159,10 @@ download:
genomicSuperDups:
host: http://hgdownload.cse.ucsc.edu/goldenPath
gwasCatalog:
# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
# version: "1.0.2 associations_e106_r2022-05-17"
host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv
version: "23-12-21"
#host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
host: "https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/05/20/gwas-catalog-associations_ontology-annotated.tsv"
#version: "1.0.2 associations_e106_r2022-05-17"
version: "2024-05-20"
hpo:
## Download manually from here now: https://hpo.jax.org/app/data/annotations
host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ public class EtlCommons {
public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json";

public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
public static final String CLINVAR_VERSION = "2022.11";
public static final String CLINVAR_DATE = "2022-11";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz";
public static final String CLINVAR_VERSION = "2024-05";
public static final String CLINVAR_DATE = "2024-05";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz";
public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv";
public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz";
public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ private void printSummary() {
}

private boolean updateRocksDB(SequenceLocation sequenceLocation, String variationId, String[] lineFields,
String mateVariantString, Map<String, EFO> traitsToEfoTermsMap)
String mateVariantString, Map<String, EFO> traitsToEfoTermsMap)
throws RocksDBException, IOException {
// More than one variant being returned from the normalisation process would mean it's an MNV which has been
// decomposed
Expand Down Expand Up @@ -266,13 +266,34 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy
}

// parse RCVs
String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();
String clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion()
.getClinicalSignificance()
.getDescription();
String reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance()
.getReviewStatus().name();
List<ObservationSet> getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn();
String accession = null;
try {
accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();
} catch (Exception e) {
logger.warn("Error getting accession. Ignore error and leave it as null.", e);
}
String clinicalSignficanceDescription = null;
try {
clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion()
.getClinicalSignificance()
.getDescription();
} catch (Exception e) {
logger.warn("Error getting clinical significance description. Ignore error and leave it as null.", e);
}
String reviewStatusName = null;
try {
reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance()
.getReviewStatus().name();
} catch (Exception e) {
logger.warn("Error getting review status name. Ignore error and leave it as null.", e);
}
List<ObservationSet> getObservedIn = null;
try {
getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn();
} catch (Exception e) {
logger.warn("Error getting observed in. Ignore error and leave it as null.", e);
}

addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString,
clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription,
reviewStatusName, getObservedIn);
Expand Down Expand Up @@ -388,7 +409,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu
Map<String, EFO> traitsToEfoTermsMap, String accession,
String clinicalSignficanceDescription, String reviewStatusName,
List<ObservationSet> getObservedIn)
throws JsonProcessingException {
throws JsonProcessingException {

List<Property> additionalProperties = new ArrayList<>(3);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE);
Expand Down Expand Up @@ -544,7 +565,7 @@ private ModeOfInheritance getModeOfInheritance(String modeOfInheritance) {
private List<GenomicFeature> getGenomicFeature(PublicSetType publicSet, String alleleId) {
if (publicSet.getReferenceClinVarAssertion().getMeasureSet() != null) {
return getGenomicFeature(publicSet.getReferenceClinVarAssertion().getMeasureSet());
// No measureSet means there must be genotypeSet
// No measureSet means there must be genotypeSet
} else if (publicSet.getReferenceClinVarAssertion().getGenotypeSet() != null) {
for (MeasureSetType measureSet : publicSet.getReferenceClinVarAssertion().getGenotypeSet().getMeasureSet()) {
if (measureSet.getMeasure() != null) {
Expand Down Expand Up @@ -596,7 +617,7 @@ private List<HeritableTrait> getHeritableTrait(PublicSetType publicSet, Map<Stri
// root of the ReferenceClinvarAssertion rather than for each trait
ModeOfInheritance modeOfInheritance
= getInheritanceModel(publicSet.getReferenceClinVarAssertion().getAttributeSet(),
sourceInheritableTraitMap);
sourceInheritableTraitMap);

for (TraitType trait : publicSet.getReferenceClinVarAssertion().getTraitSet().getTrait()) {
String traitName = getTraitName(trait, publicSet);
Expand Down Expand Up @@ -649,14 +670,14 @@ private String getTraitName(TraitType trait, PublicSetType publicSet) {
// Found preferred name
if (i < trait.getName().size()) {
return trait.getName().get(i).getElementValue().getValue();
// No preferred name indicated (e.g. RCV000013735 version Jan 2020); arbitrarily return first one
// No preferred name indicated (e.g. RCV000013735 version Jan 2020); arbitrarily return first one
} else if (trait.getName().size() > 0) {
logger.warn("ClinVar record found "
+ publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc()
+ " with no preferred trait provided. Arbitrarily selecting first one: {}", trait.getName()
.get(0).getElementValue().getValue());
return trait.getName().get(0).getElementValue().getValue();
// No trait name provided at all
// No trait name provided at all
} else {
throw new IllegalArgumentException("ClinVar record found "
+ publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,12 @@ public abstract class ClinicalIndexer {
protected VariantNormalizer normalizer;

public ClinicalIndexer(Path genomeSequenceFilePath) throws IOException {
// Forcing decomposition here in all cases - assuming the way CellBase stores clinical variants from here
// onwards will be decomposed and Adaptors will deal with phased/no-phased queries
// Use the same OpenCGA normalization parameters
VariantNormalizer.VariantNormalizerConfig variantNormalizerConfig
= (new VariantNormalizer.VariantNormalizerConfig())
.setReuseVariants(true)
.setNormalizeAlleles(false)
.setDecomposeMNVs(true);
.setNormalizeAlleles(true)
.setDecomposeMNVs(false);

if (genomeSequenceFilePath != null) {
logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath.toString());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public class CosmicIndexer extends ClinicalIndexer {
private Pattern mutationGRCh37GenomePositionPattern;
private Pattern snvPattern;

private static final String COSMIC_VERSION = "v95";
private static final String COSMIC_VERSION = "v99";

private static final int GENE_NAMES_COLUMN = 0;
private static final int HGNC_COLUMN = 3;
Expand Down
Loading
Loading