Skip to content

Commit

Permalink
Merge pull request #711 from opencb/TASK-6780
Browse files Browse the repository at this point in the history
TASK-6780 - Port Patch 6.2.1 -> 7.0.0 - Xetabase 2.2.1 -> 3.0.0
  • Loading branch information
juanfeSanahuja authored Oct 14, 2024
2 parents b663e0f + 454c24a commit 6aeead1
Show file tree
Hide file tree
Showing 20 changed files with 221 additions and 250 deletions.
2 changes: 1 addition & 1 deletion cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,4 @@ RUN cd /opt/ensembl && \
git clone https://github.com/Ensembl/ensembl-compara.git && \
git clone https://github.com/Ensembl/ensembl-io.git

ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts
ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase
8 changes: 4 additions & 4 deletions cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157";
our $ENSEMBL_GENOMES_USER = "anonymous";

## Vertebrates
our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38";
our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38";
our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38";
our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38";
our $HOMO_SAPIENS_CORE = "homo_sapiens_core_104_38";
our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_104_38";
our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_104_38";
our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_104_38";
#our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38";
#our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38";
#our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ public class DownloadProperties {
private EnsemblProperties ensembl;
private EnsemblProperties ensemblGenomes;
private URLProperties hgnc;
private URLProperties cancerHotspot;
private URLProperties refSeq;
private URLProperties refSeqFasta;
private URLProperties refSeqProteinFasta;
Expand Down Expand Up @@ -72,7 +71,6 @@ public class DownloadProperties {
private URLProperties hpoObo;
private URLProperties goObo;
private URLProperties doidObo;
private URLProperties mondoObo;
private URLProperties goAnnotation;
private URLProperties revel;
private URLProperties pubmed;
Expand Down Expand Up @@ -529,24 +527,6 @@ public DownloadProperties setHgnc(URLProperties hgnc) {
return this;
}

public URLProperties getCancerHotspot() {
return cancerHotspot;
}

public DownloadProperties setCancerHotspot(URLProperties cancerHotspot) {
this.cancerHotspot = cancerHotspot;
return this;
}

public URLProperties getMondoObo() {
return mondoObo;
}

public DownloadProperties setMondoObo(URLProperties mondoObo) {
this.mondoObo = mondoObo;
return this;
}

public static class EnsemblProperties {

private DatabaseCredentials database;
Expand Down
89 changes: 36 additions & 53 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,11 +62,7 @@ download:
url:
host: ftp://ftp.ensemblgenomes.org/pub
hgnc:
host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt
version: 2023-11-01
cancerHotspot:
host: https://www.cancerhotspots.org/files/hotspots_v2.xls
version: "v2"
host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2022-01-01.txt
refSeq:
host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz
refSeqFasta:
Expand All @@ -77,15 +73,12 @@ download:
host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz
maneSelect:
# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_0.93/MANE.GRCh38.v0.93.summary.txt.gz
# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz
host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz
version: "1.1"
host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz
version: 0.93
lrg:
host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt
version: "2021-03-30"
geneUniprotXref:
host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/
version: "2023-11-08"
geneExpressionAtlas:
host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz
mirbase:
Expand All @@ -95,49 +88,45 @@ download:
targetScan:
host: http://hgdownload.cse.ucsc.edu/goldenPath/
miRTarBase:
host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx
version: "9.0"

## Protein Data
host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/8.0/hsa_MTI.xlsx
uniprot:
host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
version: "2023-11-08"
host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
uniprotRelNotes:
host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt
version: "2023-11-08"
host: ftp://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt
intact:
host: ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt
interpro:
host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz
version: "2023-11-08"
host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/protein2ipr.dat.gz
interproRelNotes:
host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt
intact:
host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt
version: "2023-10-07"

## Conservation Scores
host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/release_notes.txt
conservation:
host: https://hgdownload.cse.ucsc.edu/goldenPath/
version: "2022-08-30"
gerp:
host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw
version: "2023-05-17"
host: http://ftp.ensembl.org/pub/release-104/compara/conservation_scores/90_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw
clinvar:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz
<<<<<<< HEAD
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
clinvarVariation:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
=======
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz
version: "2023-12-01"
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-05.xml.gz
version: 2024-05
clinvarVariation:
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz
# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz
host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/VCV_xml_old_format/ClinVarVariationRelease_2024-05.xml.gz
version: 2024-05
>>>>>>> release-6.2.x
clinvarSummary:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
version: "2023-12-01"
clinvarVariationAllele:
host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz
version: "2023-12-01"
clinvarEfoTerms:
host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv
dbSNP:
Expand All @@ -158,43 +147,37 @@ download:
genomicSuperDups:
host: http://hgdownload.cse.ucsc.edu/goldenPath
gwasCatalog:
# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
# version: "1.0.2 associations_e106_r2022-05-17"
host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv
version: "23-12-21"
<<<<<<< HEAD
host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
version: "1.0.2 associations_e106_r2022-05-17"
=======
#host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv
host: "https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/05/20/gwas-catalog-associations_ontology-annotated.tsv"
#version: "1.0.2 associations_e106_r2022-05-17"
version: "2024-05-20"
>>>>>>> release-6.2.x
hpo:
## Download manually from here now: https://hpo.jax.org/app/data/annotations
host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt
disgenet:
host: https://www.disgenet.org/static/disgenet_ap1/files/downloads
files:
- all_gene_disease_associations.tsv.gz
- readme.txt
dgidb:
host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv
version: "2022-02-01"
host: https://dgidb.org/data/monthly_tsvs/2021-Jan/interactions.tsv
cadd:
## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP!
# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz
host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz
version: "1.7-pre"
host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz
reactome:
host: http://www.reactome.org/download/current/biopax.zip
gnomadConstraints:
host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz
version: "2.1.1"
version: 2.1.1
hpoObo:
host: http://purl.obolibrary.org/obo/hp.obo
version: "2023-12-01"
goObo:
host: http://purl.obolibrary.org/obo/go/go-basic.obo
version: "2023-12-01"
doidObo:
host: http://purl.obolibrary.org/obo/doid.obo
version: "2023-12-01"
mondoObo:
host: http://purl.obolibrary.org/obo/mondo.obo
version: "2023-12-01"
goAnnotation:
host: http://geneontology.org/gene-associations/goa_human.gaf.gz
revel:
Expand All @@ -221,7 +204,7 @@ species:
- id: hsapiens
scientificName: Homo sapiens
assemblies:
- ensemblVersion: '110_38'
- ensemblVersion: '104_38'
name: GRCh38
- ensemblVersion: '82_37'
name: GRCh37
Expand Down
4 changes: 2 additions & 2 deletions cellbase-lib/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -137,10 +137,10 @@
<groupId>com.github.samtools</groupId>
<artifactId>htsjdk</artifactId>
</dependency>
<!-- <dependency>
<dependency>
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
</dependency>-->
</dependency>
<dependency>
<groupId>io.jsonwebtoken</groupId>
<artifactId>jjwt-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ public class EtlCommons {
public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json";

public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
public static final String CLINVAR_VERSION = "2022.11";
public static final String CLINVAR_DATE = "2022-11";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz";
public static final String CLINVAR_VERSION = "2024-05";
public static final String CLINVAR_DATE = "2024-05";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz";
public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv";
public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz";
public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz";
Expand All @@ -77,7 +77,6 @@ public class EtlCommons {
public static final String HPO_FILE = "hp.obo";
public static final String GO_FILE = "go-basic.obo";
public static final String DOID_FILE = "doid.obo";
public static final String MONDO_FILE = "mondo.obo";
public static final String PFM_DATA = "regulatory_pfm";

// Build specific data options
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,8 @@ public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, Species
boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException {
this(null, geneDirectoryPath.resolve("description.txt"),
geneDirectoryPath.resolve("xrefs.txt"),
geneDirectoryPath.resolve("hgnc_complete_set_2023-11-01.txt"),
geneDirectoryPath.resolve("MANE.GRCh38.v1.1.summary.txt.gz"),
geneDirectoryPath.resolve("hgnc_complete_set_2022-01-01.txt"),
geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"),
geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"),
geneDirectoryPath.resolve("idmapping_selected.tab.gz"),
geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,12 @@ public class OntologyBuilder extends CellBaseBuilder {
private Path hpoFile;
private Path goFile;
private Path doidFile;
private Path mondoFile;

public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) {
super(serializer);
hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE);
goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE);
doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE);
mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE);
}

@Override
Expand All @@ -66,13 +64,6 @@ public void parse() throws Exception {
serializer.serialize(term);
}

bufferedReader = FileUtils.newBufferedReader(mondoFile);
terms = parser.parseOBO(bufferedReader, "Mondo Ontology");
for (OntologyTerm term : terms) {
term.setSource("MONDO");
serializer.serialize(term);
}

serializer.close();
}
}
Loading

0 comments on commit 6aeead1

Please sign in to comment.