diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile index 6e1657d1bf..17d5accff4 100644 --- a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile +++ b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile @@ -28,4 +28,4 @@ RUN cd /opt/ensembl && \ git clone https://github.com/Ensembl/ensembl-compara.git && \ git clone https://github.com/Ensembl/ensembl-io.git -ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts +ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm index 70865465e9..aa22cf10b1 100755 --- a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm +++ b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm @@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157"; our $ENSEMBL_GENOMES_USER = "anonymous"; ## Vertebrates -our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38"; -our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38"; -our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38"; -our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38"; +our $HOMO_SAPIENS_CORE = "homo_sapiens_core_104_38"; +our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_104_38"; +our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_104_38"; +our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_104_38"; #our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38"; #our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38"; #our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38"; diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java index 507e85a75f..9a097fd202 100644 --- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java +++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java @@ -26,7 +26,6 @@ public class DownloadProperties { private EnsemblProperties ensembl; private EnsemblProperties ensemblGenomes; private URLProperties hgnc; - private URLProperties cancerHotspot; private URLProperties refSeq; private URLProperties refSeqFasta; private URLProperties refSeqProteinFasta; @@ -72,7 +71,6 @@ public class DownloadProperties { private URLProperties hpoObo; private URLProperties goObo; private URLProperties doidObo; - private URLProperties mondoObo; private URLProperties goAnnotation; private URLProperties revel; private URLProperties pubmed; @@ -529,24 +527,6 @@ public DownloadProperties setHgnc(URLProperties hgnc) { return this; } - public URLProperties getCancerHotspot() { - return cancerHotspot; - } - - public DownloadProperties setCancerHotspot(URLProperties cancerHotspot) { - this.cancerHotspot = cancerHotspot; - return this; - } - - public URLProperties getMondoObo() { - return mondoObo; - } - - public DownloadProperties setMondoObo(URLProperties mondoObo) { - this.mondoObo = mondoObo; - return this; - } - public static class EnsemblProperties { private DatabaseCredentials database; diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml index 409c66ba1e..6e651f00d3 100644 --- a/cellbase-core/src/main/resources/configuration.yml +++ b/cellbase-core/src/main/resources/configuration.yml @@ -62,11 +62,7 @@ download: url: host: ftp://ftp.ensemblgenomes.org/pub hgnc: - host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt - version: 2023-11-01 - cancerHotspot: - host: https://www.cancerhotspots.org/files/hotspots_v2.xls - version: "v2" + host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2022-01-01.txt refSeq: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz refSeqFasta: @@ -77,15 +73,12 @@ download: host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz maneSelect: # host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_0.93/MANE.GRCh38.v0.93.summary.txt.gz -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz - host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz - version: "1.1" + host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz + version: 0.93 lrg: host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt - version: "2021-03-30" geneUniprotXref: host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ - version: "2023-11-08" geneExpressionAtlas: host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz mirbase: @@ -95,49 +88,45 @@ download: targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: - host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx - version: "9.0" - - ## Protein Data + host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/8.0/hsa_MTI.xlsx uniprot: - host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - version: "2023-11-08" + host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz uniprotRelNotes: - host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt - version: "2023-11-08" + host: ftp://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt + intact: + host: ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt interpro: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz - version: "2023-11-08" + host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/protein2ipr.dat.gz interproRelNotes: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt - intact: - host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt - version: "2023-10-07" - - ## Conservation Scores + host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/release_notes.txt conservation: host: https://hgdownload.cse.ucsc.edu/goldenPath/ - version: "2022-08-30" gerp: - host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw - version: "2023-05-17" + host: http://ftp.ensembl.org/pub/release-104/compara/conservation_scores/90_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw clinvar: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz +<<<<<<< HEAD + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz + clinvarVariation: +# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz +# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz +======= # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz - version: "2023-12-01" + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-05.xml.gz + version: 2024-05 clinvarVariation: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/VCV_xml_old_format/ClinVarVariationRelease_2024-05.xml.gz + version: 2024-05 +>>>>>>> release-6.2.x clinvarSummary: host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - version: "2023-12-01" clinvarVariationAllele: host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz - version: "2023-12-01" clinvarEfoTerms: host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv dbSNP: @@ -158,12 +147,16 @@ download: genomicSuperDups: host: http://hgdownload.cse.ucsc.edu/goldenPath gwasCatalog: -# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv -# version: "1.0.2 associations_e106_r2022-05-17" - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv - version: "23-12-21" +<<<<<<< HEAD + host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv + version: "1.0.2 associations_e106_r2022-05-17" +======= + #host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv + host: "https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/05/20/gwas-catalog-associations_ontology-annotated.tsv" + #version: "1.0.2 associations_e106_r2022-05-17" + version: "2024-05-20" +>>>>>>> release-6.2.x hpo: - ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt disgenet: host: https://www.disgenet.org/static/disgenet_ap1/files/downloads @@ -171,30 +164,20 @@ download: - all_gene_disease_associations.tsv.gz - readme.txt dgidb: - host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv - version: "2022-02-01" + host: https://dgidb.org/data/monthly_tsvs/2021-Jan/interactions.tsv cadd: - ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! -# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz - host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz - version: "1.7-pre" + host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz reactome: host: http://www.reactome.org/download/current/biopax.zip gnomadConstraints: host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz - version: "2.1.1" + version: 2.1.1 hpoObo: host: http://purl.obolibrary.org/obo/hp.obo - version: "2023-12-01" goObo: host: http://purl.obolibrary.org/obo/go/go-basic.obo - version: "2023-12-01" doidObo: host: http://purl.obolibrary.org/obo/doid.obo - version: "2023-12-01" - mondoObo: - host: http://purl.obolibrary.org/obo/mondo.obo - version: "2023-12-01" goAnnotation: host: http://geneontology.org/gene-associations/goa_human.gaf.gz revel: @@ -221,7 +204,7 @@ species: - id: hsapiens scientificName: Homo sapiens assemblies: - - ensemblVersion: '110_38' + - ensemblVersion: '104_38' name: GRCh38 - ensemblVersion: '82_37' name: GRCh37 diff --git a/cellbase-lib/pom.xml b/cellbase-lib/pom.xml index 0c7fbf836f..514d844894 100644 --- a/cellbase-lib/pom.xml +++ b/cellbase-lib/pom.xml @@ -137,10 +137,10 @@ com.github.samtools htsjdk - + io.jsonwebtoken jjwt-api diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 6330cb71a3..d09291bc3e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -56,9 +56,9 @@ public class EtlCommons { public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json"; public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant"; - public static final String CLINVAR_VERSION = "2022.11"; - public static final String CLINVAR_DATE = "2022-11"; - public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz"; + public static final String CLINVAR_VERSION = "2024-05"; + public static final String CLINVAR_DATE = "2024-05"; + public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz"; public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv"; public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz"; public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz"; @@ -77,7 +77,6 @@ public class EtlCommons { public static final String HPO_FILE = "hp.obo"; public static final String GO_FILE = "go-basic.obo"; public static final String DOID_FILE = "doid.obo"; - public static final String MONDO_FILE = "mondo.obo"; public static final String PFM_DATA = "regulatory_pfm"; // Build specific data options diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index cd0863a259..563f76dea7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -90,8 +90,8 @@ public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, Species boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException { this(null, geneDirectoryPath.resolve("description.txt"), geneDirectoryPath.resolve("xrefs.txt"), - geneDirectoryPath.resolve("hgnc_complete_set_2023-11-01.txt"), - geneDirectoryPath.resolve("MANE.GRCh38.v1.1.summary.txt.gz"), + geneDirectoryPath.resolve("hgnc_complete_set_2022-01-01.txt"), + geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"), geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"), geneDirectoryPath.resolve("idmapping_selected.tab.gz"), geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"), diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index 1eabf8975a..8873dd7f93 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -32,14 +32,12 @@ public class OntologyBuilder extends CellBaseBuilder { private Path hpoFile; private Path goFile; private Path doidFile; - private Path mondoFile; public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) { super(serializer); hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); - mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); } @Override @@ -66,13 +64,6 @@ public void parse() throws Exception { serializer.serialize(term); } - bufferedReader = FileUtils.newBufferedReader(mondoFile); - terms = parser.parseOBO(bufferedReader, "Mondo Ontology"); - for (OntologyTerm term : terms) { - term.setSource("MONDO"); - serializer.serialize(term); - } - serializer.close(); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java index a31bd8d5e6..8b88f821f6 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java @@ -210,7 +210,7 @@ private void printSummary() { } private boolean updateRocksDB(SequenceLocation sequenceLocation, String variationId, String[] lineFields, - String mateVariantString, Map traitsToEfoTermsMap) + String mateVariantString, Map traitsToEfoTermsMap) throws RocksDBException, IOException { // More than one variant being returned from the normalisation process would mean it's and MNV which has been // decomposed @@ -266,13 +266,34 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy } // parse RCVs - String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); - String clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion() - .getClinicalSignificance() - .getDescription(); - String reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance() - .getReviewStatus().name(); - List getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn(); + String accession = null; + try { + accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); + } catch (Exception e) { + logger.warn("Error getting accession. Ignore error and leave it as null.", e); + } + String clinicalSignficanceDescription = null; + try { + clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion() + .getClinicalSignificance() + .getDescription(); + } catch (Exception e) { + logger.warn("Error getting clinical significance description. Ignore error and leave it as null.", e); + } + String reviewStatusName = null; + try { + reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance() + .getReviewStatus().name(); + } catch (Exception e) { + logger.warn("Error getting review status name. Ignore error and leave it as null.", e); + } + List getObservedIn = null; + try { + getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn(); + } catch (Exception e) { + logger.warn("Error getting observed in. Ignore error and leave it as null.", e); + } + addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString, clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription, reviewStatusName, getObservedIn); @@ -388,7 +409,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu Map traitsToEfoTermsMap, String accession, String clinicalSignficanceDescription, String reviewStatusName, List getObservedIn) - throws JsonProcessingException { + throws JsonProcessingException { List additionalProperties = new ArrayList<>(3); EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE); @@ -544,7 +565,7 @@ private ModeOfInheritance getModeOfInheritance(String modeOfInheritance) { private List getGenomicFeature(PublicSetType publicSet, String alleleId) { if (publicSet.getReferenceClinVarAssertion().getMeasureSet() != null) { return getGenomicFeature(publicSet.getReferenceClinVarAssertion().getMeasureSet()); - // No measureSet means there must be genotypeSet + // No measureSet means there must be genotypeSet } else if (publicSet.getReferenceClinVarAssertion().getGenotypeSet() != null) { for (MeasureSetType measureSet : publicSet.getReferenceClinVarAssertion().getGenotypeSet().getMeasureSet()) { if (measureSet.getMeasure() != null) { @@ -596,7 +617,7 @@ private List getHeritableTrait(PublicSetType publicSet, Map 0) { logger.warn("ClinVar record found " + publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc() + " with no preferred trait provided. Arbitrarily selecting first one: {}", trait.getName() .get(0).getElementValue().getValue()); return trait.getName().get(0).getElementValue().getValue(); - // No trait name provided at all + // No trait name provided at all } else { throw new IllegalArgumentException("ClinVar record found " + publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc() diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java index bbe33017fd..2b34f86a50 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java @@ -74,13 +74,12 @@ public abstract class ClinicalIndexer { protected VariantNormalizer normalizer; public ClinicalIndexer(Path genomeSequenceFilePath) throws IOException { - // Forcing decomposition here in all cases - assuming the way CellBase stores clinical variants from here - // onwards will be decomposed and Adaptors will deal with phased/no-phased queries + // Use the same OpenCGA normalization parameters VariantNormalizer.VariantNormalizerConfig variantNormalizerConfig = (new VariantNormalizer.VariantNormalizerConfig()) .setReuseVariants(true) - .setNormalizeAlleles(false) - .setDecomposeMNVs(true); + .setNormalizeAlleles(true) + .setDecomposeMNVs(false); if (genomeSequenceFilePath != null) { logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath.toString()); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java index f8d2f16d15..a26d18c60c 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java @@ -41,7 +41,7 @@ public class CosmicIndexer extends ClinicalIndexer { private Pattern mutationGRCh37GenomePositionPattern; private Pattern snvPattern; - private static final String COSMIC_VERSION = "v95"; + private static final String COSMIC_VERSION = "v99"; private static final int GENE_NAMES_COLUMN = 0; private static final int HGNC_COLUMN = 3; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java index 2b4f2e4d8b..0fe3b0f115 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java @@ -31,6 +31,7 @@ import java.nio.file.Path; import java.text.NumberFormat; import java.util.*; +import java.util.stream.Collectors; public class GwasIndexer extends ClinicalIndexer { @@ -46,6 +47,8 @@ public class GwasIndexer extends ClinicalIndexer { private int gwasLinesNotFoundInDbsnp; private int invalidVariantRecords; + private int lineCounter = 0; + public GwasIndexer(Path gwasFile, Path dbSnpTabixFile, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); @@ -56,36 +59,31 @@ public GwasIndexer(Path gwasFile, Path dbSnpTabixFile, Path genomeSequenceFilePa } public void index() throws RocksDBException, IOException { - logger.info("Parsing GWAS catalog file ..."); - - BufferedReader inputReader = null; - TabixReader dbsnpTabixReader = null; - - try { - logger.info("Opening GWAS catalog file " + gwasFile + " ..."); - inputReader = new BufferedReader(new FileReader(gwasFile.toFile())); + try (BufferedReader inputReader = new BufferedReader(new FileReader(gwasFile.toFile())); + TabixReader dbsnpTabixReader = new TabixReader(dbSnpTabixFile.toString())) { logger.info("Ignoring GWAS catalog file header line ..."); - String line = inputReader.readLine(); + inputReader.readLine(); + ++lineCounter; + Map chromosomeMap = buildChromosomeMap(dbsnpTabixReader); Map gwasMap = new HashMap<>(); - logger.info("Opening dbSNP tabix file " + dbSnpTabixFile + " ..."); - dbsnpTabixReader = new TabixReader(dbSnpTabixFile.toString()); long processedGwasLines = 0; - logger.info("Parsing GWAS catalog file ..."); + logger.info("Parsing GWAS catalog file {} ...", gwasFile); + String line; while ((line = inputReader.readLine()) != null) { + ++lineCounter; if (!line.isEmpty()) { processedGwasLines++; if (processedGwasLines % 10000 == 0) { logger.info("{} lines parsed", processedGwasLines); } - processGwasCatalogLine(line.split("\t"), dbsnpTabixReader, gwasMap); + processGwasCatalogLine(line.split("\t"), dbsnpTabixReader, gwasMap, chromosomeMap); } } - dbsnpTabixReader.close(); logger.info("Updating clinical variant annotation..."); long counter = 0; @@ -118,16 +116,9 @@ public void index() throws RocksDBException, IOException { rdb.put(entry.getKey().getBytes(), jsonObjectWriter.writeValueAsBytes(variantAnnotation)); } this.printSummary(processedGwasLines, gwasMap); - } catch (RocksDBException | IOException e) { + } catch (RocksDBException | IOException e) { logger.error("Error reading/writing from/to the RocksDB index while indexing GWAS catalog file"); throw e; - } finally { - if (inputReader != null) { - inputReader.close(); - } - if (dbsnpTabixReader != null) { - dbsnpTabixReader.close(); - } } } @@ -184,13 +175,14 @@ significant digit (for example, a published p-value of 4.8 x 10-7 is rounded to 37 GENOTYPING_TECHNOLOGY* +: Genotyping technology/ies used in this study, with additional array information (ex. Immunochip or Exome array) in brackets. */ - private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReader, Map gwasMap) { + private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReader, Map gwasMap, + Map chromosomeMap) throws IOException { Integer start = parseStart(values); if (start != null) { String chromosome = parseChromosome(values[11]); if (StringUtils.isNotEmpty(chromosome)) { String snpId = "rs" + values[23].trim(); - String[] refAndAlt = getRefAndAltFromDbsnp(chromosome, start, snpId, dbsnpTabixReader); + String[] refAndAlt = getRefAndAltFromDbsnp(chromosome, start, snpId, dbsnpTabixReader, chromosomeMap); if (refAndAlt != null) { // Create variant Variant variant; @@ -270,21 +262,27 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade // Scores management GwasAssociationStudyTraitScores scores = new GwasAssociationStudyTraitScores(); - try { - scores.setPValue(Double.parseDouble(values[27])); - } catch (NumberFormatException e) { -// logger.warn(e.getMessage() + ". Parsing pValue: " + values[27]); + if (StringUtils.isNotEmpty(values[27])) { + try { + scores.setPValue(Double.parseDouble(values[27])); + } catch (NumberFormatException e) { + logger.warn(e.getMessage() + ". Parsing pValue: " + values[27]); + } } - try { - scores.setPValueMlog(Double.parseDouble(values[28])); - } catch (NumberFormatException e) { -// logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]); + if (StringUtils.isNotEmpty(values[28])) { + try { + scores.setPValueMlog(Double.parseDouble(values[28])); + } catch (NumberFormatException e) { + logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]); + } } scores.setPValueText(values[29]); - try { - scores.setOrBeta(Double.parseDouble(values[30])); - } catch (NumberFormatException e) { -// logger.warn(e.getMessage() + ". Parsing Odd or beta: " + values[30]); + if (StringUtils.isNotEmpty(values[30])) { + try { + scores.setOrBeta(Double.parseDouble(values[30])); + } catch (NumberFormatException e) { + logger.warn(e.getMessage() + ". Parsing Odd or beta: " + values[30]); + } } scores.setPercentCI(values[31]); @@ -301,15 +299,15 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade gwasMap.put(key, gwas); } } else { -// logger.warn("Variant not found in dbSNP " + snpId + ". Line: " + StringUtils.join(values, "\t\t\t")); + logger.warn("dbSNP {} not found. Line: {}", snpId, lineCounter); gwasLinesNotFoundInDbsnp++; } } else { -// logger.warn("Invalid chromosome " + chromosome + ". Line: " + StringUtils.join(values, "\t\t\t")); + logger.warn("Invalid chromosome {}. Line: {}", chromosome, lineCounter); invalidChromosome++; } } else { -// logger.warn("Invalid position " + start + ". Line: " + StringUtils.join(values, "\t\t\t")); + logger.warn("Invalid position {}. Line: {}", start, lineCounter); invalidStartRecords++; } } @@ -342,6 +340,39 @@ private String parseChromosome(String chromosome) { return transformedChromosome; } + private Map buildChromosomeMap(TabixReader dbsnpTabixReader) { + List chroms = dbsnpTabixReader.getChromosomes().stream().filter(name -> name.startsWith("NC_")) + .collect(Collectors.toList()); + + Map chromMap = new HashMap<>(); + for (int i = 1; i < 22; i++) { + chromMap.put(Integer.toString(i), Integer.toString(i)); + } + chromMap.put("X", "X"); + chromMap.put("Y", "Y"); + chromMap.put("MT", "MT"); + + for (String chrom : chroms) { + String[] split = chrom.split("[_\\.]"); + int value = Integer.parseInt(split[1]); + switch (value) { + case 23: + chromMap.put("X", chrom); + break; + case 24: + chromMap.put("Y", chrom); + break; + case 12920: + chromMap.put("MT", chrom); + break; + default: + chromMap.put(Integer.toString(value), chrom); + break; + } + } + return chromMap; + } + private Float parseFloat(String value) { Float riskAlleleFrequency = null; if (NumberUtils.isNumber(value)) { @@ -350,29 +381,33 @@ private Float parseFloat(String value) { return riskAlleleFrequency; } - private String[] getRefAndAltFromDbsnp(String chromosome, Integer start, String snpId, TabixReader dbsnpTabixReader) { + private String[] getRefAndAltFromDbsnp(String chromosome, Integer start, String snpId, TabixReader dbsnpTabixReader, + Map chromosomeMap) throws IOException { + boolean found = false; + Set foundSnpIds = new HashSet<>(); String[] refAndAlt = null; - TabixReader.Iterator dbsnpIterator = dbsnpTabixReader.query(chromosome + ":" + start + "-" + start); - try { - String dbSnpRecord = dbsnpIterator.next(); - boolean found = false; - while (dbSnpRecord != null && !found) { - String[] dbsnpFields = dbSnpRecord.split("\t"); - - if (snpId.equalsIgnoreCase(dbsnpFields[2])) { - refAndAlt = new String[2]; - refAndAlt[REF] = dbsnpFields[3]; - refAndAlt[ALT] = dbsnpFields[4]; - found = true; - } - - dbSnpRecord = dbsnpIterator.next(); + String query = chromosomeMap.get(chromosome) + ":" + start + "-" + start; + TabixReader.Iterator dbsnpIterator = dbsnpTabixReader.query(query); + String dbSnpRecord = null; + dbSnpRecord = dbsnpIterator.next(); + while (dbSnpRecord != null && !found) { + String[] dbsnpFields = dbSnpRecord.split("\t"); + + if (snpId.equalsIgnoreCase(dbsnpFields[2])) { + refAndAlt = new String[2]; + refAndAlt[REF] = dbsnpFields[3]; + refAndAlt[ALT] = dbsnpFields[4]; + found = true; + } else { + foundSnpIds.add(dbsnpFields[2]); } - } catch (IOException e) { - logger.warn("Error reading position '" + chromosome + ":" + start + "' in dbSNP: " + e.getMessage()); - } + dbSnpRecord = dbsnpIterator.next(); + } + if (!found) { + logger.warn("dbSNP {} not found from query {}. Found: {}", snpId, query, foundSnpIds); + } return refAndAlt; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index eb1f28db2d..bb9e0c36e4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -81,8 +81,10 @@ public List downloadClinical() throws IOException, InterruptedExce url = configuration.getDownload().getClinvarVariationAllele().getHost(); downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString())); clinvarUrls.add(url); - saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, getClinVarVersion(), getTimeStamp(), clinvarUrls, - clinicalFolder.resolve("clinvarVersion.json")); + saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, configuration.getDownload().getClinvar() + .getVersion(), getTimeStamp(), clinvarUrls, clinicalFolder.resolve("clinvarVersion.json")); + + logger.info("\t\tDone"); // Gwas catalog logger.info("\t\tDownloading GWAS catalog file ..."); @@ -91,6 +93,7 @@ public List downloadClinical() throws IOException, InterruptedExce downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.GWAS_FILE).toString())); saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, GWAS_NAME, gwasCatalog.getVersion(), getTimeStamp(), Collections.singletonList(url), clinicalFolder.resolve("gwasVersion.json")); + logger.info("\t\tDone"); // List hgvsList = getDocmHgvsList(); // if (!hgvsList.isEmpty()) { @@ -236,10 +239,4 @@ private List getDocmHgvsList() throws IOException { return hgvsList; } - - private String getClinVarVersion() { - // ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2015-12.xml.gz - return configuration.getDownload().getClinvar().getHost().split("_")[1].split("\\.")[0]; - } - } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 260ff75427..9d2685eadf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -82,8 +82,6 @@ public List download() throws IOException, InterruptedException { downloadFiles.addAll(downloadRefSeq(refseqFolder)); downloadFiles.add(downloadMane(geneFolder)); downloadFiles.add(downloadLrg(geneFolder)); - downloadFiles.add(downloadHgnc(geneFolder)); - downloadFiles.add(downloadCancerHotspot(geneFolder)); downloadFiles.add(downloadDrugData(geneFolder)); downloadFiles.addAll(downloadGeneUniprotXref(geneFolder)); downloadFiles.add(downloadGeneExpressionAtlas(geneFolder)); @@ -210,30 +208,6 @@ private DownloadFile downloadLrg(Path geneFolder) throws IOException, Interrupte return null; } - private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading HGNC ..."); - String url = configuration.getDownload().getHgnc().getHost(); - saveVersionData(EtlCommons.GENE_DATA, "HGNC_GENE", configuration.getDownload().getHgnc().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("hgncVersion.json")); - String[] array = url.split("/"); - return downloadFile(url, geneFolder.resolve(array[array.length - 1]).toString()); - } - return null; - } - - private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading Cancer Hotspot ..."); - String url = configuration.getDownload().getCancerHotspot().getHost(); - saveVersionData(EtlCommons.GENE_DATA, "CANCER_HOTSPOT", configuration.getDownload().getHgnc().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("cancerHotspotVersion.json")); - String[] array = url.split("/"); - return downloadFile(url, geneFolder.resolve(array[array.length - 1]).toString()); - } - return null; - } - private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException { if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { logger.info("Downloading go annotation..."); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 0ba9f39db4..5a0609867f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -47,11 +47,11 @@ public GenomeDownloadManager(String species, String assembly, Path targetDirecto public List download() throws IOException, InterruptedException { List downloadFiles = new ArrayList<>(); downloadFiles.addAll(downloadReferenceGenome()); - downloadFiles.addAll(downloadConservation()); - downloadFiles.addAll(downloadRepeats()); +// downloadFiles.addAll(downloadConservation()); +// downloadFiles.addAll(downloadRepeats()); // cytobands -// runGenomeInfo(); + runGenomeInfo(); return downloadFiles; } @@ -115,16 +115,16 @@ public List downloadConservation() throws IOException, Interrupted List phastconsUrls = new ArrayList<>(chromosomes.length); List phyloPUrls = new ArrayList<>(chromosomes.length); for (String chromosome : chromosomes) { - String phastConsUrl = url + "/phastCons470way/hg38.470way.phastCons/chr" + chromosome - + ".phastCons470way.wigFix.gz"; + String phastConsUrl = url + "/phastCons100way/hg38.100way.phastCons/chr" + chromosome + + ".phastCons100way.wigFix.gz"; downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve("phastCons") - .resolve("chr" + chromosome + ".phastCons470way.wigFix.gz").toString())); + .resolve("chr" + chromosome + ".phastCons100way.wigFix.gz").toString())); phastconsUrls.add(phastConsUrl); - String phyloPUrl = url + "/phyloP470way/hg38.470way.phyloP/chr" + chromosome - + ".phyloP470way.wigFix.gz"; + String phyloPUrl = url + "/phyloP100way/hg38.100way.phyloP100way/chr" + chromosome + + ".phyloP100way.wigFix.gz"; downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve("phylop") - .resolve("chr" + chromosome + ".phyloP470way.wigFix.gz").toString())); + .resolve("chr" + chromosome + ".phyloP100way.wigFix.gz").toString())); phyloPUrls.add(phyloPUrl); } String gerpUrl = configuration.getDownload().getGerp().getHost(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 522be7b27d..0776354e80 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -36,7 +36,7 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec public List download() throws IOException, InterruptedException { - logger.info("Downloading OBO files ..."); + logger.info("Downloading obo files ..."); List downloadFiles = new ArrayList<>(); Path oboFolder = downloadFolder.resolve("ontology"); @@ -44,22 +44,20 @@ public List download() throws IOException, InterruptedException { String url = configuration.getDownload().getHpoObo().getHost(); downloadFiles.add(downloadFile(url, oboFolder.resolve("hp.obo").toString())); + saveVersionData(EtlCommons.OBO_DATA, "HPO", getTimeStamp(), getTimeStamp(), Collections.singletonList(url), buildFolder.resolve(EtlCommons.HPO_VERSION_FILE)); url = configuration.getDownload().getGoObo().getHost(); downloadFiles.add(downloadFile(url, oboFolder.resolve("go-basic.obo").toString())); + saveVersionData(EtlCommons.OBO_DATA, "GO", getTimeStamp(), getTimeStamp(), Collections.singletonList(url), buildFolder.resolve(EtlCommons.GO_VERSION_FILE)); url = configuration.getDownload().getDoidObo().getHost(); downloadFiles.add(downloadFile(url, oboFolder.resolve("doid.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE)); - url = configuration.getDownload().getMondoObo().getHost(); - downloadFiles.add(downloadFile(url, oboFolder.resolve("mondo.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "MONDO", getTimeStamp(), getTimeStamp(), + saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(), Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE)); return downloadFiles; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 5a722ed448..08f28cfdad 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -22,6 +22,7 @@ import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; +import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.nio.file.Files; @@ -33,8 +34,6 @@ public class ProteinDownloadManager extends AbstractDownloadManager { private static final String UNIPROT_NAME = "UniProt"; - private static final String INTERPRO_NAME = "InterPro"; - private static final String INTACT_NAME = "IntAct"; public ProteinDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { @@ -57,7 +56,6 @@ public List download() throws IOException, InterruptedException { Files.createDirectories(proteinFolder); List downloadFiles = new ArrayList<>(); - // Uniprot String url = configuration.getDownload().getUniprot().getHost(); downloadFiles.add(downloadFile(url, proteinFolder.resolve("uniprot_sprot.xml.gz").toString())); Files.createDirectories(proteinFolder.resolve("uniprot_chunks")); @@ -65,25 +63,23 @@ public List download() throws IOException, InterruptedException { String relNotesUrl = configuration.getDownload().getUniprotRelNotes().getHost(); downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("uniprotRelnotes.txt").toString())); + saveVersionData(EtlCommons.PROTEIN_DATA, UNIPROT_NAME, getLine(proteinFolder.resolve("uniprotRelnotes.txt"), 1), getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("uniprotVersion.json")); - // Interpro - String interproUrl = configuration.getDownload().getInterpro().getHost(); - downloadFiles.add(downloadFile(interproUrl, proteinFolder.resolve("protein2ipr.dat.gz").toString())); - - relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost(); - downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString())); - saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5), - getTimeStamp(), Collections.singletonList(interproUrl), proteinFolder.resolve("interproVersion.json")); - - // Intact - String intactUrl = configuration.getDownload().getIntact().getHost(); - downloadFiles.add(downloadFile(intactUrl, proteinFolder.resolve("intact.txt").toString())); - saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, configuration.getDownload().getIntact().getVersion(), - getTimeStamp(), Collections.singletonList(intactUrl), proteinFolder.resolve("intactVersion.json")); - return downloadFiles; + +// url = configuration.getDownload().getIntact().getHost(); +// downloadFile(url, proteinFolder.resolve("intact.txt").toString()); +// saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, null, getTimeStamp(), Collections.singletonList(url), +// proteinFolder.resolve("intactVersion.json")); +// +// url = configuration.getDownload().getInterpro().getHost(); +// downloadFile(url, proteinFolder.resolve("protein2ipr.dat.gz").toString()); +// relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost(); +// downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString()); +// saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5), +// getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("interproVersion.json")); } private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException { @@ -100,7 +96,7 @@ private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOE inEntry = true; beforeEntry = false; if (count % 10000 == 0) { - pw = new PrintWriter(Files.newOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile().toPath())); + pw = new PrintWriter(new FileOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile())); pw.println(header.toString().trim()); } count++; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 51152e478d..1abb352fbe 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -64,8 +64,8 @@ public List download() throws IOException, InterruptedException, N List downloadFiles = new ArrayList<>(); downloadFiles.addAll(downloadRegulatoryaAndMotifFeatures()); - downloadFiles.add(downloadMiRTarBase()); downloadFiles.add(downloadMirna()); + downloadFiles.add(downloadMiRTarBase()); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java index 5d7dbc65d0..e5cd4d38cc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java @@ -16,8 +16,6 @@ package org.opencb.cellbase.lib.impl.core; - -import com.fasterxml.jackson.databind.ObjectMapper; import com.mongodb.ReadPreference; import com.mongodb.WriteConcern; import com.mongodb.client.model.Filters; @@ -25,6 +23,7 @@ import org.bson.BsonDocument; import org.bson.Document; import org.bson.conversions.Bson; +import org.codehaus.jackson.map.ObjectMapper; import org.opencb.cellbase.core.api.key.ApiKeyStats; import org.opencb.cellbase.core.api.query.AbstractQuery; import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; diff --git a/pom.xml b/pom.xml index e6599cd67c..8f7ee392ad 100644 --- a/pom.xml +++ b/pom.xml @@ -27,17 +27,15 @@ 4.0.0-SNAPSHOT 0.1.0 - 9.4.51.v20230217 - - 2.14.3 - 3.14.0 - 1.7.36 - + 2.11.4 + 1.9.13 2.30.1 + 1.7.32 2.17.2 1.5.2 5.5.2 0.8.8 + 9.4.17.v20190418 0.11.5 1.6.5 3.1.0 @@ -53,6 +51,7 @@ 1.48.0 2.4 2.4 + 3.12.0 2.1.6 4.4 1.69 @@ -414,11 +413,11 @@ swagger-annotations ${swagger-annotations.version} - + io.jsonwebtoken jjwt-jackson