diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
index 6e1657d1bf..17d5accff4 100644
--- a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
+++ b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
@@ -28,4 +28,4 @@ RUN cd /opt/ensembl && \
git clone https://github.com/Ensembl/ensembl-compara.git && \
git clone https://github.com/Ensembl/ensembl-io.git
-ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts
+ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase
diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
index 70865465e9..aa22cf10b1 100755
--- a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
+++ b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
@@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157";
our $ENSEMBL_GENOMES_USER = "anonymous";
## Vertebrates
-our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38";
-our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38";
-our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38";
-our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38";
+our $HOMO_SAPIENS_CORE = "homo_sapiens_core_104_38";
+our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_104_38";
+our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_104_38";
+our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_104_38";
#our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38";
#our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38";
#our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38";
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
index 507e85a75f..9a097fd202 100644
--- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
+++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
@@ -26,7 +26,6 @@ public class DownloadProperties {
private EnsemblProperties ensembl;
private EnsemblProperties ensemblGenomes;
private URLProperties hgnc;
- private URLProperties cancerHotspot;
private URLProperties refSeq;
private URLProperties refSeqFasta;
private URLProperties refSeqProteinFasta;
@@ -72,7 +71,6 @@ public class DownloadProperties {
private URLProperties hpoObo;
private URLProperties goObo;
private URLProperties doidObo;
- private URLProperties mondoObo;
private URLProperties goAnnotation;
private URLProperties revel;
private URLProperties pubmed;
@@ -529,24 +527,6 @@ public DownloadProperties setHgnc(URLProperties hgnc) {
return this;
}
- public URLProperties getCancerHotspot() {
- return cancerHotspot;
- }
-
- public DownloadProperties setCancerHotspot(URLProperties cancerHotspot) {
- this.cancerHotspot = cancerHotspot;
- return this;
- }
-
- public URLProperties getMondoObo() {
- return mondoObo;
- }
-
- public DownloadProperties setMondoObo(URLProperties mondoObo) {
- this.mondoObo = mondoObo;
- return this;
- }
-
public static class EnsemblProperties {
private DatabaseCredentials database;
diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml
index 409c66ba1e..6e651f00d3 100644
--- a/cellbase-core/src/main/resources/configuration.yml
+++ b/cellbase-core/src/main/resources/configuration.yml
@@ -62,11 +62,7 @@ download:
url:
host: ftp://ftp.ensemblgenomes.org/pub
hgnc:
- host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt
- version: 2023-11-01
- cancerHotspot:
- host: https://www.cancerhotspots.org/files/hotspots_v2.xls
- version: "v2"
+ host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2022-01-01.txt
refSeq:
host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz
refSeqFasta:
@@ -77,15 +73,12 @@ download:
host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz
maneSelect:
# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_0.93/MANE.GRCh38.v0.93.summary.txt.gz
-# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz
- host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz
- version: "1.1"
+ host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz
+ version: "1.0"
lrg:
host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt
- version: "2021-03-30"
geneUniprotXref:
host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/
- version: "2023-11-08"
geneExpressionAtlas:
host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz
mirbase:
@@ -95,49 +88,45 @@ download:
targetScan:
host: http://hgdownload.cse.ucsc.edu/goldenPath/
miRTarBase:
- host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx
- version: "9.0"
-
- ## Protein Data
+ host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/8.0/hsa_MTI.xlsx
uniprot:
- host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
- version: "2023-11-08"
+ host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
uniprotRelNotes:
- host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt
- version: "2023-11-08"
+ host: ftp://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt
+ intact:
+ host: ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt
interpro:
- host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz
- version: "2023-11-08"
+ host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/protein2ipr.dat.gz
interproRelNotes:
- host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt
- intact:
- host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt
- version: "2023-10-07"
-
- ## Conservation Scores
+ host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/release_notes.txt
conservation:
host: https://hgdownload.cse.ucsc.edu/goldenPath/
- version: "2022-08-30"
gerp:
- host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw
- version: "2023-05-17"
+ host: http://ftp.ensembl.org/pub/release-104/compara/conservation_scores/90_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw
clinvar:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
- host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz
- version: "2023-12-01"
+ host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-05.xml.gz
+ version: 2024-05
clinvarVariation:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
- host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz
+ host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/VCV_xml_old_format/ClinVarVariationRelease_2024-05.xml.gz
+ version: 2024-05
clinvarSummary:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
- version: "2023-12-01"
clinvarVariationAllele:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz
- version: "2023-12-01"
clinvarEfoTerms:
host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv
dbSNP:
@@ -158,12 +147,16 @@ download:
genomicSuperDups:
host: http://hgdownload.cse.ucsc.edu/goldenPath
gwasCatalog:
-# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
-# version: "1.0.2 associations_e106_r2022-05-17"
- host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv
- version: "23-12-21"
+ #host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
+ host: "https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/05/20/gwas-catalog-associations_ontology-annotated.tsv"
+ #version: "1.0.2 associations_e106_r2022-05-17"
+ version: "2024-05-20"
hpo:
- ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations
host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
disgenet:
host: https://www.disgenet.org/static/disgenet_ap1/files/downloads
@@ -171,30 +164,20 @@ download:
- all_gene_disease_associations.tsv.gz
- readme.txt
dgidb:
- host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv
- version: "2022-02-01"
+ host: https://dgidb.org/data/monthly_tsvs/2021-Jan/interactions.tsv
cadd:
- ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP!
-# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz
- host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz
- version: "1.7-pre"
+ host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz
reactome:
host: http://www.reactome.org/download/current/biopax.zip
gnomadConstraints:
host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz
- version: "2.1.1"
+ version: 2.1.1
hpoObo:
host: http://purl.obolibrary.org/obo/hp.obo
- version: "2023-12-01"
goObo:
host: http://purl.obolibrary.org/obo/go/go-basic.obo
- version: "2023-12-01"
doidObo:
host: http://purl.obolibrary.org/obo/doid.obo
- version: "2023-12-01"
- mondoObo:
- host: http://purl.obolibrary.org/obo/mondo.obo
- version: "2023-12-01"
goAnnotation:
host: http://geneontology.org/gene-associations/goa_human.gaf.gz
revel:
@@ -221,7 +204,7 @@ species:
- id: hsapiens
scientificName: Homo sapiens
assemblies:
- - ensemblVersion: '110_38'
+ - ensemblVersion: '104_38'
name: GRCh38
- ensemblVersion: '82_37'
name: GRCh37
diff --git a/cellbase-lib/pom.xml b/cellbase-lib/pom.xml
index 0c7fbf836f..514d844894 100644
--- a/cellbase-lib/pom.xml
+++ b/cellbase-lib/pom.xml
@@ -137,10 +137,10 @@
com.github.samtools
htsjdk
-
+
io.jsonwebtoken
jjwt-api
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java
index 6330cb71a3..d09291bc3e 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java
@@ -56,9 +56,9 @@ public class EtlCommons {
public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json";
public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
- public static final String CLINVAR_VERSION = "2022.11";
- public static final String CLINVAR_DATE = "2022-11";
- public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz";
+ public static final String CLINVAR_VERSION = "2024-05";
+ public static final String CLINVAR_DATE = "2024-05";
+ public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz";
public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv";
public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz";
public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz";
@@ -77,7 +77,6 @@ public class EtlCommons {
public static final String HPO_FILE = "hp.obo";
public static final String GO_FILE = "go-basic.obo";
public static final String DOID_FILE = "doid.obo";
- public static final String MONDO_FILE = "mondo.obo";
public static final String PFM_DATA = "regulatory_pfm";
// Build specific data options
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java
index cd0863a259..563f76dea7 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java
@@ -90,8 +90,8 @@ public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, Species
boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException {
this(null, geneDirectoryPath.resolve("description.txt"),
geneDirectoryPath.resolve("xrefs.txt"),
- geneDirectoryPath.resolve("hgnc_complete_set_2023-11-01.txt"),
- geneDirectoryPath.resolve("MANE.GRCh38.v1.1.summary.txt.gz"),
+ geneDirectoryPath.resolve("hgnc_complete_set_2022-01-01.txt"),
+ geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"),
geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"),
geneDirectoryPath.resolve("idmapping_selected.tab.gz"),
geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"),
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java
index 1eabf8975a..8873dd7f93 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java
@@ -32,14 +32,12 @@ public class OntologyBuilder extends CellBaseBuilder {
private Path hpoFile;
private Path goFile;
private Path doidFile;
- private Path mondoFile;
public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) {
super(serializer);
hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE);
goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE);
doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE);
- mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE);
}
@Override
@@ -66,13 +64,6 @@ public void parse() throws Exception {
serializer.serialize(term);
}
- bufferedReader = FileUtils.newBufferedReader(mondoFile);
- terms = parser.parseOBO(bufferedReader, "Mondo Ontology");
- for (OntologyTerm term : terms) {
- term.setSource("MONDO");
- serializer.serialize(term);
- }
-
serializer.close();
}
}
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java
index a31bd8d5e6..8b88f821f6 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java
@@ -210,7 +210,7 @@ private void printSummary() {
}
private boolean updateRocksDB(SequenceLocation sequenceLocation, String variationId, String[] lineFields,
- String mateVariantString, Map traitsToEfoTermsMap)
+ String mateVariantString, Map traitsToEfoTermsMap)
throws RocksDBException, IOException {
// More than one variant being returned from the normalisation process would mean it's and MNV which has been
// decomposed
@@ -266,13 +266,34 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy
}
// parse RCVs
- String accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();
- String clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion()
- .getClinicalSignificance()
- .getDescription();
- String reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance()
- .getReviewStatus().name();
- List getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn();
+ String accession = null;
+ try {
+ accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc();
+ } catch (Exception e) {
+ logger.warn("Error getting accession. Ignore error and leave it as null.", e);
+ }
+ String clinicalSignficanceDescription = null;
+ try {
+ clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion()
+ .getClinicalSignificance()
+ .getDescription();
+ } catch (Exception e) {
+ logger.warn("Error getting clinical significance description. Ignore error and leave it as null.", e);
+ }
+ String reviewStatusName = null;
+ try {
+ reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance()
+ .getReviewStatus().name();
+ } catch (Exception e) {
+ logger.warn("Error getting review status name. Ignore error and leave it as null.", e);
+ }
+ List getObservedIn = null;
+ try {
+ getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn();
+ } catch (Exception e) {
+ logger.warn("Error getting observed in. Ignore error and leave it as null.", e);
+ }
+
addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString,
clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription,
reviewStatusName, getObservedIn);
@@ -388,7 +409,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu
Map traitsToEfoTermsMap, String accession,
String clinicalSignficanceDescription, String reviewStatusName,
List getObservedIn)
- throws JsonProcessingException {
+ throws JsonProcessingException {
List additionalProperties = new ArrayList<>(3);
EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE);
@@ -544,7 +565,7 @@ private ModeOfInheritance getModeOfInheritance(String modeOfInheritance) {
private List getGenomicFeature(PublicSetType publicSet, String alleleId) {
if (publicSet.getReferenceClinVarAssertion().getMeasureSet() != null) {
return getGenomicFeature(publicSet.getReferenceClinVarAssertion().getMeasureSet());
- // No measureSet means there must be genotypeSet
+ // No measureSet means there must be genotypeSet
} else if (publicSet.getReferenceClinVarAssertion().getGenotypeSet() != null) {
for (MeasureSetType measureSet : publicSet.getReferenceClinVarAssertion().getGenotypeSet().getMeasureSet()) {
if (measureSet.getMeasure() != null) {
@@ -596,7 +617,7 @@ private List getHeritableTrait(PublicSetType publicSet, Map 0) {
logger.warn("ClinVar record found "
+ publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc()
+ " with no preferred trait provided. Arbitrarily selecting first one: {}", trait.getName()
.get(0).getElementValue().getValue());
return trait.getName().get(0).getElementValue().getValue();
- // No trait name provided at all
+ // No trait name provided at all
} else {
throw new IllegalArgumentException("ClinVar record found "
+ publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc()
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java
index bbe33017fd..2b34f86a50 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinicalIndexer.java
@@ -74,13 +74,12 @@ public abstract class ClinicalIndexer {
protected VariantNormalizer normalizer;
public ClinicalIndexer(Path genomeSequenceFilePath) throws IOException {
- // Forcing decomposition here in all cases - assuming the way CellBase stores clinical variants from here
- // onwards will be decomposed and Adaptors will deal with phased/no-phased queries
+ // Use the same OpenCGA normalization parameters
VariantNormalizer.VariantNormalizerConfig variantNormalizerConfig
= (new VariantNormalizer.VariantNormalizerConfig())
.setReuseVariants(true)
- .setNormalizeAlleles(false)
- .setDecomposeMNVs(true);
+ .setNormalizeAlleles(true)
+ .setDecomposeMNVs(false);
if (genomeSequenceFilePath != null) {
logger.info("Enabling left aligning by using sequence at {}", genomeSequenceFilePath.toString());
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java
index f8d2f16d15..a26d18c60c 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java
@@ -41,7 +41,7 @@ public class CosmicIndexer extends ClinicalIndexer {
private Pattern mutationGRCh37GenomePositionPattern;
private Pattern snvPattern;
- private static final String COSMIC_VERSION = "v95";
+ private static final String COSMIC_VERSION = "v99";
private static final int GENE_NAMES_COLUMN = 0;
private static final int HGNC_COLUMN = 3;
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java
index 2b4f2e4d8b..0fe3b0f115 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java
@@ -31,6 +31,7 @@
import java.nio.file.Path;
import java.text.NumberFormat;
import java.util.*;
+import java.util.stream.Collectors;
public class GwasIndexer extends ClinicalIndexer {
@@ -46,6 +47,8 @@ public class GwasIndexer extends ClinicalIndexer {
private int gwasLinesNotFoundInDbsnp;
private int invalidVariantRecords;
+ private int lineCounter = 0;
+
public GwasIndexer(Path gwasFile, Path dbSnpTabixFile, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException {
super(genomeSequenceFilePath);
@@ -56,36 +59,31 @@ public GwasIndexer(Path gwasFile, Path dbSnpTabixFile, Path genomeSequenceFilePa
}
public void index() throws RocksDBException, IOException {
- logger.info("Parsing GWAS catalog file ...");
-
- BufferedReader inputReader = null;
- TabixReader dbsnpTabixReader = null;
-
- try {
- logger.info("Opening GWAS catalog file " + gwasFile + " ...");
- inputReader = new BufferedReader(new FileReader(gwasFile.toFile()));
+ try (BufferedReader inputReader = new BufferedReader(new FileReader(gwasFile.toFile()));
+ TabixReader dbsnpTabixReader = new TabixReader(dbSnpTabixFile.toString())) {
logger.info("Ignoring GWAS catalog file header line ...");
- String line = inputReader.readLine();
+ inputReader.readLine();
+ ++lineCounter;
+ Map chromosomeMap = buildChromosomeMap(dbsnpTabixReader);
Map gwasMap = new HashMap<>();
- logger.info("Opening dbSNP tabix file " + dbSnpTabixFile + " ...");
- dbsnpTabixReader = new TabixReader(dbSnpTabixFile.toString());
long processedGwasLines = 0;
- logger.info("Parsing GWAS catalog file ...");
+ logger.info("Parsing GWAS catalog file {} ...", gwasFile);
+ String line;
while ((line = inputReader.readLine()) != null) {
+ ++lineCounter;
if (!line.isEmpty()) {
processedGwasLines++;
if (processedGwasLines % 10000 == 0) {
logger.info("{} lines parsed", processedGwasLines);
}
- processGwasCatalogLine(line.split("\t"), dbsnpTabixReader, gwasMap);
+ processGwasCatalogLine(line.split("\t"), dbsnpTabixReader, gwasMap, chromosomeMap);
}
}
- dbsnpTabixReader.close();
logger.info("Updating clinical variant annotation...");
long counter = 0;
@@ -118,16 +116,9 @@ public void index() throws RocksDBException, IOException {
rdb.put(entry.getKey().getBytes(), jsonObjectWriter.writeValueAsBytes(variantAnnotation));
}
this.printSummary(processedGwasLines, gwasMap);
- } catch (RocksDBException | IOException e) {
+ } catch (RocksDBException | IOException e) {
logger.error("Error reading/writing from/to the RocksDB index while indexing GWAS catalog file");
throw e;
- } finally {
- if (inputReader != null) {
- inputReader.close();
- }
- if (dbsnpTabixReader != null) {
- dbsnpTabixReader.close();
- }
}
}
@@ -184,13 +175,14 @@ significant digit (for example, a published p-value of 4.8 x 10-7 is rounded to
37 GENOTYPING_TECHNOLOGY* +: Genotyping technology/ies used in this study, with additional array information (ex. Immunochip or Exome
array) in brackets.
*/
- private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReader, Map gwasMap) {
+ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReader, Map gwasMap,
+ Map chromosomeMap) throws IOException {
Integer start = parseStart(values);
if (start != null) {
String chromosome = parseChromosome(values[11]);
if (StringUtils.isNotEmpty(chromosome)) {
String snpId = "rs" + values[23].trim();
- String[] refAndAlt = getRefAndAltFromDbsnp(chromosome, start, snpId, dbsnpTabixReader);
+ String[] refAndAlt = getRefAndAltFromDbsnp(chromosome, start, snpId, dbsnpTabixReader, chromosomeMap);
if (refAndAlt != null) {
// Create variant
Variant variant;
@@ -270,21 +262,27 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade
// Scores management
GwasAssociationStudyTraitScores scores = new GwasAssociationStudyTraitScores();
- try {
- scores.setPValue(Double.parseDouble(values[27]));
- } catch (NumberFormatException e) {
-// logger.warn(e.getMessage() + ". Parsing pValue: " + values[27]);
+ if (StringUtils.isNotEmpty(values[27])) {
+ try {
+ scores.setPValue(Double.parseDouble(values[27]));
+ } catch (NumberFormatException e) {
+ logger.warn(e.getMessage() + ". Parsing pValue: " + values[27]);
+ }
}
- try {
- scores.setPValueMlog(Double.parseDouble(values[28]));
- } catch (NumberFormatException e) {
-// logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]);
+ if (StringUtils.isNotEmpty(values[28])) {
+ try {
+ scores.setPValueMlog(Double.parseDouble(values[28]));
+ } catch (NumberFormatException e) {
+ logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]);
+ }
}
scores.setPValueText(values[29]);
- try {
- scores.setOrBeta(Double.parseDouble(values[30]));
- } catch (NumberFormatException e) {
-// logger.warn(e.getMessage() + ". Parsing Odd or beta: " + values[30]);
+ if (StringUtils.isNotEmpty(values[30])) {
+ try {
+ scores.setOrBeta(Double.parseDouble(values[30]));
+ } catch (NumberFormatException e) {
+ logger.warn(e.getMessage() + ". Parsing Odd or beta: " + values[30]);
+ }
}
scores.setPercentCI(values[31]);
@@ -301,15 +299,15 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade
gwasMap.put(key, gwas);
}
} else {
-// logger.warn("Variant not found in dbSNP " + snpId + ". Line: " + StringUtils.join(values, "\t\t\t"));
+ logger.warn("dbSNP {} not found. Line: {}", snpId, lineCounter);
gwasLinesNotFoundInDbsnp++;
}
} else {
-// logger.warn("Invalid chromosome " + chromosome + ". Line: " + StringUtils.join(values, "\t\t\t"));
+ logger.warn("Invalid chromosome {}. Line: {}", chromosome, lineCounter);
invalidChromosome++;
}
} else {
-// logger.warn("Invalid position " + start + ". Line: " + StringUtils.join(values, "\t\t\t"));
+ logger.warn("Invalid position {}. Line: {}", start, lineCounter);
invalidStartRecords++;
}
}
@@ -342,6 +340,39 @@ private String parseChromosome(String chromosome) {
return transformedChromosome;
}
+ /**
+  * Builds the lookup table that translates a chromosome name, as parsed from a GWAS catalog line, into the
+  * sequence name to use when querying the dbSNP tabix file.
+  * <p>
+  * Every chromosome (1-22, X, Y and MT) is first mapped to itself, so a chromosome without a matching "NC_"
+  * sequence in the index still resolves to a usable name. Each sequence name starting with "NC_" reported by
+  * the tabix reader then overrides the entry selected by the number that follows the prefix: 23 is stored
+  * under X, 24 under Y, 12920 under MT and any other number under its own decimal value.
+  *
+  * @param dbsnpTabixReader reader of the dbSNP tabix file whose sequence names are inspected
+  * @return map from chromosome name to the sequence name used to query dbSNP
+  */
+ private Map<String, String> buildChromosomeMap(TabixReader dbsnpTabixReader) {
+     List<String> chroms = dbsnpTabixReader.getChromosomes().stream().filter(name -> name.startsWith("NC_"))
+             .collect(Collectors.toList());
+
+     Map<String, String> chromMap = new HashMap<>();
+     // Identity defaults; the upper bound is inclusive so that chromosome 22 is not left unmapped
+     for (int i = 1; i <= 22; i++) {
+         chromMap.put(Integer.toString(i), Integer.toString(i));
+     }
+     chromMap.put("X", "X");
+     chromMap.put("Y", "Y");
+     chromMap.put("MT", "MT");
+
+     for (String chrom : chroms) {
+         // Split on '_' and '.'; the second token is the numeric part of the name (leading zeros are dropped)
+         String[] split = chrom.split("[_\\.]");
+         int value = Integer.parseInt(split[1]);
+         switch (value) {
+             case 23:
+                 chromMap.put("X", chrom);
+                 break;
+             case 24:
+                 chromMap.put("Y", chrom);
+                 break;
+             case 12920:
+                 chromMap.put("MT", chrom);
+                 break;
+             default:
+                 chromMap.put(Integer.toString(value), chrom);
+                 break;
+         }
+     }
+     return chromMap;
+ }
+
+
private Float parseFloat(String value) {
Float riskAlleleFrequency = null;
if (NumberUtils.isNumber(value)) {
@@ -350,29 +381,33 @@ private Float parseFloat(String value) {
return riskAlleleFrequency;
}
- private String[] getRefAndAltFromDbsnp(String chromosome, Integer start, String snpId, TabixReader dbsnpTabixReader) {
+ private String[] getRefAndAltFromDbsnp(String chromosome, Integer start, String snpId, TabixReader dbsnpTabixReader,
+ Map chromosomeMap) throws IOException {
+ boolean found = false;
+ Set foundSnpIds = new HashSet<>();
String[] refAndAlt = null;
- TabixReader.Iterator dbsnpIterator = dbsnpTabixReader.query(chromosome + ":" + start + "-" + start);
- try {
- String dbSnpRecord = dbsnpIterator.next();
- boolean found = false;
- while (dbSnpRecord != null && !found) {
- String[] dbsnpFields = dbSnpRecord.split("\t");
-
- if (snpId.equalsIgnoreCase(dbsnpFields[2])) {
- refAndAlt = new String[2];
- refAndAlt[REF] = dbsnpFields[3];
- refAndAlt[ALT] = dbsnpFields[4];
- found = true;
- }
-
- dbSnpRecord = dbsnpIterator.next();
+ String query = chromosomeMap.get(chromosome) + ":" + start + "-" + start;
+ TabixReader.Iterator dbsnpIterator = dbsnpTabixReader.query(query);
+ String dbSnpRecord = null;
+ dbSnpRecord = dbsnpIterator.next();
+ while (dbSnpRecord != null && !found) {
+ String[] dbsnpFields = dbSnpRecord.split("\t");
+
+ if (snpId.equalsIgnoreCase(dbsnpFields[2])) {
+ refAndAlt = new String[2];
+ refAndAlt[REF] = dbsnpFields[3];
+ refAndAlt[ALT] = dbsnpFields[4];
+ found = true;
+ } else {
+ foundSnpIds.add(dbsnpFields[2]);
}
- } catch (IOException e) {
- logger.warn("Error reading position '" + chromosome + ":" + start + "' in dbSNP: " + e.getMessage());
- }
+ dbSnpRecord = dbsnpIterator.next();
+ }
+ if (!found) {
+ logger.warn("dbSNP {} not found from query {}. Found: {}", snpId, query, foundSnpIds);
+ }
return refAndAlt;
}
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java
index eb1f28db2d..bb9e0c36e4 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java
@@ -81,8 +81,10 @@ public List downloadClinical() throws IOException, InterruptedExce
url = configuration.getDownload().getClinvarVariationAllele().getHost();
downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString()));
clinvarUrls.add(url);
- saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, getClinVarVersion(), getTimeStamp(), clinvarUrls,
- clinicalFolder.resolve("clinvarVersion.json"));
+ saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, configuration.getDownload().getClinvar()
+ .getVersion(), getTimeStamp(), clinvarUrls, clinicalFolder.resolve("clinvarVersion.json"));
+
+ logger.info("\t\tDone");
// Gwas catalog
logger.info("\t\tDownloading GWAS catalog file ...");
@@ -91,6 +93,7 @@ public List downloadClinical() throws IOException, InterruptedExce
downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.GWAS_FILE).toString()));
saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, GWAS_NAME, gwasCatalog.getVersion(), getTimeStamp(),
Collections.singletonList(url), clinicalFolder.resolve("gwasVersion.json"));
+ logger.info("\t\tDone");
// List hgvsList = getDocmHgvsList();
// if (!hgvsList.isEmpty()) {
@@ -236,10 +239,4 @@ private List getDocmHgvsList() throws IOException {
return hgvsList;
}
-
- private String getClinVarVersion() {
- // ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2015-12.xml.gz
- return configuration.getDownload().getClinvar().getHost().split("_")[1].split("\\.")[0];
- }
-
}
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java
index 260ff75427..9d2685eadf 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java
@@ -82,8 +82,6 @@ public List download() throws IOException, InterruptedException {
downloadFiles.addAll(downloadRefSeq(refseqFolder));
downloadFiles.add(downloadMane(geneFolder));
downloadFiles.add(downloadLrg(geneFolder));
- downloadFiles.add(downloadHgnc(geneFolder));
- downloadFiles.add(downloadCancerHotspot(geneFolder));
downloadFiles.add(downloadDrugData(geneFolder));
downloadFiles.addAll(downloadGeneUniprotXref(geneFolder));
downloadFiles.add(downloadGeneExpressionAtlas(geneFolder));
@@ -210,30 +208,6 @@ private DownloadFile downloadLrg(Path geneFolder) throws IOException, Interrupte
return null;
}
- private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException {
- if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
- logger.info("Downloading HGNC ...");
- String url = configuration.getDownload().getHgnc().getHost();
- saveVersionData(EtlCommons.GENE_DATA, "HGNC_GENE", configuration.getDownload().getHgnc().getVersion(),
- getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("hgncVersion.json"));
- String[] array = url.split("/");
- return downloadFile(url, geneFolder.resolve(array[array.length - 1]).toString());
- }
- return null;
- }
-
- private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException {
- if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
- logger.info("Downloading Cancer Hotspot ...");
- String url = configuration.getDownload().getCancerHotspot().getHost();
- saveVersionData(EtlCommons.GENE_DATA, "CANCER_HOTSPOT", configuration.getDownload().getHgnc().getVersion(),
- getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("cancerHotspotVersion.json"));
- String[] array = url.split("/");
- return downloadFile(url, geneFolder.resolve(array[array.length - 1]).toString());
- }
- return null;
- }
-
private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException {
if (speciesConfiguration.getScientificName().equals("Homo sapiens")) {
logger.info("Downloading go annotation...");
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java
index 0ba9f39db4..5a0609867f 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java
@@ -47,11 +47,11 @@ public GenomeDownloadManager(String species, String assembly, Path targetDirecto
public List download() throws IOException, InterruptedException {
List downloadFiles = new ArrayList<>();
downloadFiles.addAll(downloadReferenceGenome());
- downloadFiles.addAll(downloadConservation());
- downloadFiles.addAll(downloadRepeats());
+// downloadFiles.addAll(downloadConservation());
+// downloadFiles.addAll(downloadRepeats());
// cytobands
-// runGenomeInfo();
+ runGenomeInfo();
return downloadFiles;
}
@@ -115,16 +115,16 @@ public List downloadConservation() throws IOException, Interrupted
List phastconsUrls = new ArrayList<>(chromosomes.length);
List phyloPUrls = new ArrayList<>(chromosomes.length);
for (String chromosome : chromosomes) {
- String phastConsUrl = url + "/phastCons470way/hg38.470way.phastCons/chr" + chromosome
- + ".phastCons470way.wigFix.gz";
+ String phastConsUrl = url + "/phastCons100way/hg38.100way.phastCons/chr" + chromosome
+ + ".phastCons100way.wigFix.gz";
downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve("phastCons")
- .resolve("chr" + chromosome + ".phastCons470way.wigFix.gz").toString()));
+ .resolve("chr" + chromosome + ".phastCons100way.wigFix.gz").toString()));
phastconsUrls.add(phastConsUrl);
- String phyloPUrl = url + "/phyloP470way/hg38.470way.phyloP/chr" + chromosome
- + ".phyloP470way.wigFix.gz";
+ String phyloPUrl = url + "/phyloP100way/hg38.100way.phyloP100way/chr" + chromosome
+ + ".phyloP100way.wigFix.gz";
downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve("phylop")
- .resolve("chr" + chromosome + ".phyloP470way.wigFix.gz").toString()));
+ .resolve("chr" + chromosome + ".phyloP100way.wigFix.gz").toString()));
phyloPUrls.add(phyloPUrl);
}
String gerpUrl = configuration.getDownload().getGerp().getHost();
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java
index 522be7b27d..0776354e80 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java
@@ -36,7 +36,7 @@ public OntologyDownloadManager(String species, String assembly, Path targetDirec
public List download() throws IOException, InterruptedException {
- logger.info("Downloading OBO files ...");
+ logger.info("Downloading obo files ...");
List downloadFiles = new ArrayList<>();
Path oboFolder = downloadFolder.resolve("ontology");
@@ -44,22 +44,20 @@ public List download() throws IOException, InterruptedException {
String url = configuration.getDownload().getHpoObo().getHost();
downloadFiles.add(downloadFile(url, oboFolder.resolve("hp.obo").toString()));
+
saveVersionData(EtlCommons.OBO_DATA, "HPO", getTimeStamp(), getTimeStamp(),
Collections.singletonList(url), buildFolder.resolve(EtlCommons.HPO_VERSION_FILE));
url = configuration.getDownload().getGoObo().getHost();
downloadFiles.add(downloadFile(url, oboFolder.resolve("go-basic.obo").toString()));
+
saveVersionData(EtlCommons.OBO_DATA, "GO", getTimeStamp(), getTimeStamp(),
Collections.singletonList(url), buildFolder.resolve(EtlCommons.GO_VERSION_FILE));
url = configuration.getDownload().getDoidObo().getHost();
downloadFiles.add(downloadFile(url, oboFolder.resolve("doid.obo").toString()));
- saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(),
- Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE));
- url = configuration.getDownload().getMondoObo().getHost();
- downloadFiles.add(downloadFile(url, oboFolder.resolve("mondo.obo").toString()));
- saveVersionData(EtlCommons.OBO_DATA, "MONDO", getTimeStamp(), getTimeStamp(),
+ saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(),
Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE));
return downloadFiles;
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java
index 5a722ed448..08f28cfdad 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java
@@ -22,6 +22,7 @@
import org.opencb.commons.utils.FileUtils;
import java.io.BufferedReader;
+import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
@@ -33,8 +34,6 @@
public class ProteinDownloadManager extends AbstractDownloadManager {
private static final String UNIPROT_NAME = "UniProt";
- private static final String INTERPRO_NAME = "InterPro";
- private static final String INTACT_NAME = "IntAct";
public ProteinDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
throws IOException, CellBaseException {
@@ -57,7 +56,6 @@ public List download() throws IOException, InterruptedException {
Files.createDirectories(proteinFolder);
List downloadFiles = new ArrayList<>();
- // Uniprot
String url = configuration.getDownload().getUniprot().getHost();
downloadFiles.add(downloadFile(url, proteinFolder.resolve("uniprot_sprot.xml.gz").toString()));
Files.createDirectories(proteinFolder.resolve("uniprot_chunks"));
@@ -65,25 +63,23 @@ public List download() throws IOException, InterruptedException {
String relNotesUrl = configuration.getDownload().getUniprotRelNotes().getHost();
downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("uniprotRelnotes.txt").toString()));
+
saveVersionData(EtlCommons.PROTEIN_DATA, UNIPROT_NAME, getLine(proteinFolder.resolve("uniprotRelnotes.txt"), 1),
getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("uniprotVersion.json"));
- // Interpro
- String interproUrl = configuration.getDownload().getInterpro().getHost();
- downloadFiles.add(downloadFile(interproUrl, proteinFolder.resolve("protein2ipr.dat.gz").toString()));
-
- relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost();
- downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString()));
- saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5),
- getTimeStamp(), Collections.singletonList(interproUrl), proteinFolder.resolve("interproVersion.json"));
-
- // Intact
- String intactUrl = configuration.getDownload().getIntact().getHost();
- downloadFiles.add(downloadFile(intactUrl, proteinFolder.resolve("intact.txt").toString()));
- saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, configuration.getDownload().getIntact().getVersion(),
- getTimeStamp(), Collections.singletonList(intactUrl), proteinFolder.resolve("intactVersion.json"));
-
return downloadFiles;
+
+// url = configuration.getDownload().getIntact().getHost();
+// downloadFile(url, proteinFolder.resolve("intact.txt").toString());
+// saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, null, getTimeStamp(), Collections.singletonList(url),
+// proteinFolder.resolve("intactVersion.json"));
+//
+// url = configuration.getDownload().getInterpro().getHost();
+// downloadFile(url, proteinFolder.resolve("protein2ipr.dat.gz").toString());
+// relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost();
+// downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString());
+// saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5),
+// getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("interproVersion.json"));
}
private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException {
@@ -100,7 +96,7 @@ private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOE
inEntry = true;
beforeEntry = false;
if (count % 10000 == 0) {
- pw = new PrintWriter(Files.newOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile().toPath()));
+ pw = new PrintWriter(new FileOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile()));
pw.println(header.toString().trim());
}
count++;
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java
index 51152e478d..1abb352fbe 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java
@@ -64,8 +64,8 @@ public List download() throws IOException, InterruptedException, N
List downloadFiles = new ArrayList<>();
downloadFiles.addAll(downloadRegulatoryaAndMotifFeatures());
- downloadFiles.add(downloadMiRTarBase());
downloadFiles.add(downloadMirna());
+ downloadFiles.add(downloadMiRTarBase());
return downloadFiles;
}
diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java
index 5d7dbc65d0..e5cd4d38cc 100644
--- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java
+++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java
@@ -16,8 +16,6 @@
package org.opencb.cellbase.lib.impl.core;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
import com.mongodb.ReadPreference;
import com.mongodb.WriteConcern;
import com.mongodb.client.model.Filters;
@@ -25,6 +23,7 @@
import org.bson.BsonDocument;
import org.bson.Document;
import org.bson.conversions.Bson;
+import org.codehaus.jackson.map.ObjectMapper;
import org.opencb.cellbase.core.api.key.ApiKeyStats;
import org.opencb.cellbase.core.api.query.AbstractQuery;
import org.opencb.cellbase.core.api.query.ProjectionQueryOptions;
diff --git a/pom.xml b/pom.xml
index e6599cd67c..8f7ee392ad 100644
--- a/pom.xml
+++ b/pom.xml
@@ -27,17 +27,15 @@
4.0.0-SNAPSHOT
0.1.0
- 9.4.51.v20230217
-
- 2.14.3
- 3.14.0
- 1.7.36
-
+ 2.11.4
+ 1.9.13
2.30.1
+ 1.7.32
2.17.2
1.5.2
5.5.2
0.8.8
+ 9.4.17.v20190418
0.11.5
1.6.5
3.1.0
@@ -53,6 +51,7 @@
1.48.0
2.4
2.4
+ 3.12.0
2.1.6
4.4
1.69
@@ -414,11 +413,11 @@
swagger-annotations
${swagger-annotations.version}
-
+
io.jsonwebtoken
jjwt-jackson