diff --git a/.github/workflows/pull-request-approved.yml b/.github/workflows/pull-request-approved.yml
index eb410c9cba..ba378d3abb 100644
--- a/.github/workflows/pull-request-approved.yml
+++ b/.github/workflows/pull-request-approved.yml
@@ -1,15 +1,45 @@
 name: Pull request approve workflow
+run-name: 'Pull request approve workflow ${{ github.event.pull_request.head.ref }} -> ${{ github.event.pull_request.base.ref }} by @${{ github.actor }}'
 
 on:
   pull_request_review:
     types: [ submitted ]
 
 jobs:
-  build:
-    uses: opencb/java-common-libs/.github/workflows/build-java-app-workflow.yml@develop
+  # Maps the pull request target branch to the matching Xetabase branch and exposes it as a job output
+  calculate-xetabase-branch:
+    name: Calculate Xetabase branch
+    runs-on: ubuntu-22.04
+    outputs:
+      xetabase_branch: ${{ steps.get_xetabase_branch.outputs.xetabase_branch }}
+    steps:
+      - name: Clone java-common-libs
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: '10'
+          ## This is important to avoid the error in the next step: "fatal: repository 'https://github.com/zetta-genomics/opencga-enterprise.git/' not found"
+          persist-credentials: false
+      - id: get_xetabase_branch
+        name: "Get current branch for Xetabase from target branch"
+        # The branch names are passed through env rather than interpolated into the script, so a
+        # crafted branch name cannot inject shell commands.
+        run: |
+          chmod +x ./.github/workflows/scripts/get-xetabase-branch.sh
+          echo "github.event.pull_request.base.ref: ${BASE_REF}"
+          echo "github.event.pull_request.head.ref: ${HEAD_REF}"
+          xetabase_branch=$(./.github/workflows/scripts/get-xetabase-branch.sh "${BASE_REF}")
+          echo "__Xetabase ref:__ \"${xetabase_branch}\"" | tee -a "${GITHUB_STEP_SUMMARY}"
+          echo "xetabase_branch=${xetabase_branch}" >> "${GITHUB_OUTPUT}"
+        env:
+          ZETTA_REPO_ACCESS_TOKEN: ${{ secrets.ZETTA_REPO_ACCESS_TOKEN }}
+          BASE_REF: ${{ github.event.pull_request.base.ref }}
+          HEAD_REF: ${{ github.event.pull_request.head.ref }}
 
   test:
-    name: "Test analysis"
-    uses: ./.github/workflows/test-analysis.yml
-    needs: build
+    name: "Run all tests before merging"
+    needs: calculate-xetabase-branch
+    uses: opencb/java-common-libs/.github/workflows/test-xetabase-workflow.yml@develop
+    with:
+      branch: ${{ needs.calculate-xetabase-branch.outputs.xetabase_branch }}
+      task: ${{ github.event.pull_request.head.ref }}
     secrets: inherit
diff --git a/.github/workflows/scripts/get-xetabase-branch.sh b/.github/workflows/scripts/get-xetabase-branch.sh
new file mode 100644
index 0000000000..fd9626a79a
--- /dev/null
+++ b/.github/workflows/scripts/get-xetabase-branch.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+# Function to calculate the corresponding branch of Xetabase project.
+#   $1: CellBase branch name (the pull request target branch)
+#   ZETTA_REPO_ACCESS_TOKEN (env): token used to look up TASK branches in opencga-enterprise
+# Prints the Xetabase branch on stdout and returns 0. On failure returns 1 and reports on stderr,
+# so callers capturing stdout with $(...) never receive an error text as the branch name.
+get_xetabase_branch() {
+    # Input parameter (branch name)
+    local input_branch="$1"
+    local major xetabase_major
+
+    # If the branch begins with 'TASK' and exists in the opencga-enterprise repository, return it
+    if [[ $input_branch == TASK* ]]; then
+        if [ "$(git ls-remote "https://$ZETTA_REPO_ACCESS_TOKEN@github.com/zetta-genomics/opencga-enterprise.git" "$input_branch")" ]; then
+            echo "$input_branch"
+            return 0
+        fi
+    fi
+
+    # Check if the branch name is "develop" in that case return the same branch name
+    if [[ "$input_branch" == "develop" ]]; then
+        echo "develop"
+        return 0
+    fi
+
+    # Check if the branch name starts with "release-" and follows the patterns "release-a.x.x" or "release-a.b.x"
+    if [[ "$input_branch" =~ ^release-([0-9]+)\.x\.x$ ]] || [[ "$input_branch" =~ ^release-([0-9]+)\.([0-9]+)\.x$ ]]; then
+        # Extract the MAJOR part of the branch name
+        major=${BASH_REMATCH[1]}
+        # Xetabase major = CellBase major - 4 (10# forces base 10, so a zero-padded major is not read as octal)
+        xetabase_major=$((10#$major - 4))
+        # Check if the Xetabase major is negative
+        if (( xetabase_major < 0 )); then
+            echo "Error: 'MAJOR' digit after subtraction results in a negative number." >&2
+            return 1
+        fi
+        # Construct and echo the new branch name
+        echo "release-$xetabase_major.${input_branch#release-$major.}"
+        return 0
+    fi
+
+    # If the branch name does not match any of the expected patterns
+    echo "Error: The branch name is not correct." >&2
+    return 1
+}
+
+# Check if the script receives exactly one argument
+if [ "$#" -ne 1 ]; then
+    echo "Usage: $0 <branch-name>" >&2
+    exit 1
+fi
+
+# Call the function with the input branch name
+get_xetabase_branch "$1"
diff --git a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
index 6e1657d1bf..17d5accff4 100644
--- a/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
+++ b/cellbase-app/app/cloud/docker/cellbase-builder/Dockerfile
@@ -28,4 +28,4 @@ RUN cd /opt/ensembl && \
     git clone https://github.com/Ensembl/ensembl-compara.git && \
     git clone https://github.com/Ensembl/ensembl-io.git
 
-ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase/scripts/ensembl-scripts
+ENV PERL5LIB=$PERL5LIB:/opt/ensembl/bioperl-live:/opt/ensembl/ensembl/modules:/opt/ensembl/ensembl-variation/modules:/opt/ensembl/ensembl-funcgen/modules:/opt/ensembl/ensembl-compara/modules:/opt/ensembl/lib/perl/5.18.2:/opt/cellbase
diff --git a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
index 70865465e9..aa22cf10b1 100755
--- a/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
+++ b/cellbase-app/app/scripts/ensembl-scripts/DB_CONFIG.pm
@@ -134,10 +134,10 @@ our $ENSEMBL_GENOMES_PORT = "4157";
 our $ENSEMBL_GENOMES_USER = "anonymous";
 
 ## Vertebrates
-our $HOMO_SAPIENS_CORE = "homo_sapiens_core_110_38";
-our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_110_38";
-our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_110_38";
-our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_110_38";
+our $HOMO_SAPIENS_CORE = "homo_sapiens_core_104_38";
+our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_104_38";
+our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_104_38";
+our $HOMO_SAPIENS_COMPARA = "homo_sapiens_compara_104_38";
 #our $HOMO_SAPIENS_CORE = "homo_sapiens_core_78_38";
 #our $HOMO_SAPIENS_VARIATION = "homo_sapiens_variation_78_38";
 #our $HOMO_SAPIENS_FUNCTIONAL = "homo_sapiens_funcgen_78_38";
diff --git a/cellbase-app/pom.xml b/cellbase-app/pom.xml
index 441dc47bff..aed90e9897 100644
--- a/cellbase-app/pom.xml
+++ b/cellbase-app/pom.xml
@@ -6,7 +6,7 @@
     <parent>
         <groupId>org.opencb.cellbase</groupId>
         <artifactId>cellbase</artifactId>
-        <version>6.2.1-SNAPSHOT</version>
+        <version>6.3.0-SNAPSHOT</version>
         <relativePath>../pom.xml</relativePath>
     </parent>
 
diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java
index 6049ef9b4b..4a5f2c085f 100644
--- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java
+++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/AdminCliOptionsParser.java
@@ -19,6 +19,7 @@
 import com.beust.jcommander.*;
 import org.opencb.cellbase.app.cli.CliOptionsParser;
 import org.opencb.cellbase.core.api.key.ApiKeyQuota;
+import org.opencb.cellbase.lib.EtlCommons;
 
 import java.util.HashMap;
 import java.util.List;
@@ -74,6 +75,7 @@ public AdminCliOptionsParser() {
         jCommander.addCommand("validate", validationCommandOptions);
     }
 
+    @Override
     public void parse(String[] args) throws ParameterException {
         jCommander.parse(args);
     }
@@ -87,9 +89,13 @@ public class DownloadCommandOptions {
         @ParametersDelegate
         public SpeciesAndAssemblyCommandOptions speciesAndAssemblyOptions = speciesAndAssemblyCommandOptions;
 
-        @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: genome, gene, "
-                + "variation, variation_functional_score, regulation, protein, conservation, "
-                + "clinical_variants, repeats, svs, pubmed and 'all' to download everything", required = true, arity = 1)
+        @Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to download: "
+                + EtlCommons.GENOME_DATA + ", " + EtlCommons.GENE_DATA + ", " + EtlCommons.VARIATION_DATA + ", "
+                + EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA + ", " + EtlCommons.MISSENSE_VARIATION_SCORE_DATA + ", "
+                + EtlCommons.REGULATION_DATA + ", " + EtlCommons.PROTEIN_DATA + ", " + EtlCommons.CONSERVATION_DATA + ", "
+                + EtlCommons.CLINICAL_VARIANTS_DATA + ", " + EtlCommons.REPEATS_DATA + ", " + EtlCommons.OBO_DATA + ", "
+                + EtlCommons.PUBMED_DATA + ", " + EtlCommons.PHARMACOGENOMICS_DATA + "; and 'all' to download everything",
+                required = true, arity = 1)
         public String data;
 
         @Parameter(names = {"-o", "--outdir"}, description = "Downloaded files will be saved in this directory.", required = true, arity = 1)
diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java
index 8c0d477023..16db1f82bc 100644
--- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java
+++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/BuildCommandExecutor.java
@@ -38,7 +38,7 @@
 import java.util.Collections;
 import java.util.List;
 
-import static org.opencb.cellbase.lib.EtlCommons.PHARMGKB_DATA;
+import static org.opencb.cellbase.lib.EtlCommons.*;
 
 /**
  * Created by imedina on 03/02/15.
@@ -132,6 +132,9 @@ public void execute() {
                         case EtlCommons.REFSEQ_DATA:
                             parser = buildRefSeq();
                             break;
+                        case EtlCommons.VARIATION_DATA:
+                            parser = buildVariation();
+                            break;
                         case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA:
                             parser = buildCadd();
                             break;
@@ -275,6 +278,27 @@ private CellBaseBuilder buildRefSeq() {
         return new RefSeqGeneBuilder(refseqFolderPath, speciesConfiguration, serializer);
     }
 
+    /**
+     * Creates the builder for the variation data (currently only dbSNP). The build folder is created if needed and
+     * the dbSNP version file is copied from the download folder into it.
+     *
+     * @return Variation builder reading from the download folder and serializing into the build folder
+     * @throws IOException if the build folder cannot be created or the version file cannot be copied
+     */
+    private CellBaseBuilder buildVariation() throws IOException {
+        Path downloadVariationPath = downloadFolder.resolve(VARIATION_DATA);
+        Path buildVariationPath = buildFolder.resolve(VARIATION_DATA);
+        // No-op when the folder already exists; throws instead of silently failing like File.mkdirs()
+        Files.createDirectories(buildVariationPath);
+
+        CellBaseFileSerializer variationSerializer = new CellBaseJsonFileSerializer(buildVariationPath);
+
+        // Currently, only dbSNP data
+        Files.copy(downloadVariationPath.resolve(DBSNP_VERSION_FILENAME), buildVariationPath.resolve(DBSNP_VERSION_FILENAME),
+                StandardCopyOption.REPLACE_EXISTING);
+        return new VariationBuilder(downloadVariationPath, variationSerializer, configuration);
+    }
+
     private CellBaseBuilder buildCadd() {
         Path variationFunctionalScorePath = downloadFolder.resolve("variation_functional_score");
         copyVersionFiles(Arrays.asList(variationFunctionalScorePath.resolve("caddVersion.json")));
diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java
index f8197e6558..abb0629374 100644
--- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java
+++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/DownloadCommandExecutor.java
@@ -69,9 +69,9 @@ public void execute() {
                     case EtlCommons.GENE_DATA:
                         downloadFiles.addAll(downloader.downloadGene());
                         break;
-//                    case EtlCommons.VARIATION_DATA:
-//                        downloadManager.downloadVariation();
-//                        break;
+                    case EtlCommons.VARIATION_DATA:
+                        downloadFiles.addAll(downloader.downloadVariation());
+                        break;
                     case EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA:
                         downloadFiles.addAll(downloader.downloadCaddScores());
                         break;
diff --git a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java
index 5a8fd9417b..97460d5a71 100644
--- a/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java
+++ b/cellbase-app/src/main/java/org/opencb/cellbase/app/cli/admin/executors/LoadCommandExecutor.java
@@ -44,6 +46,8 @@
 import java.util.List;
 import java.util.concurrent.ExecutionException;
 
+import static org.opencb.cellbase.lib.EtlCommons.*;
+
 /**
  * Created by imedina on 03/02/15.
  */
@@ -372,30 +374,64 @@ private void checkParameters() throws CellBaseException {
+    /**
+     * Loads the variation data located in the VARIATION_DATA sub-folder of the input folder.
+     * If no field is set, the 'variation_chr*' files are loaded into the "variation" collection, which is indexed and
+     * registered in the data release only when at least one file was loaded; otherwise a custom update is executed.
+     * Finally, the dbSNP file is loaded into its own collection when both that file and its version file exist.
+     */
     private void loadVariationData() throws NoSuchMethodException, InterruptedException, ExecutionException,
             InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException,
             IOException, LoaderException, CellBaseException {
+        Path variationPath = input.resolve(VARIATION_DATA);
         // First load data
-        // Common loading process from CellBase variation data models
         if (field == null) {
-            DirectoryStream<Path> stream = Files.newDirectoryStream(input,
-                    entry -> entry.getFileName().toString().startsWith("variation_chr"));
-
-            for (Path entry : stream) {
-                logger.info("Loading file '{}'", entry);
-                loadRunner.load(input.resolve(entry.getFileName()), "variation", dataRelease);
-            }
+            // Common loading process from CellBase variation data models
+            int numLoadings = 0;
+            // A DirectoryStream keeps the directory open until closed, hence the try-with-resources
+            try (DirectoryStream<Path> stream = Files.newDirectoryStream(variationPath,
+                    entry -> entry.getFileName().toString().startsWith("variation_chr"))) {
+                for (Path entry : stream) {
+                    logger.info("Loading file '{}'", entry);
+                    loadRunner.load(variationPath.resolve(entry.getFileName()), "variation", dataRelease);
+                    numLoadings++;
+                }
+            }
 
-            // Create index
-            createIndex("variation");
-
-            // Update release (collection and sources)
-            List<Path> sources = new ArrayList<>(Arrays.asList(
-                    input.resolve("ensemblVariationVersion.json")
-            ));
-            dataReleaseManager.update(dataRelease, "variation", EtlCommons.VARIATION_DATA, sources);
+            if (numLoadings > 0) {
+                // Create index
+                createIndex("variation");
 
+                // Update release (collection and sources)
+                List<Path> sources = new ArrayList<>(Arrays.asList(
+                        variationPath.resolve("ensemblVariationVersion.json")
+                ));
+                dataReleaseManager.update(dataRelease, "variation", EtlCommons.VARIATION_DATA, sources);
+            } else {
+                logger.info("No variation file 'variation_chr...' found within folder '{}'", variationPath);
+            }
+        } else {
             // Custom update required e.g. population freqs loading
+            logger.info("Loading file '{}'", variationPath);
+            loadRunner.load(variationPath, "variation", dataRelease, field, innerFields);
+        }
+
+        // Load dbSNP
+        Path dbSnpFilePath = variationPath.resolve(DBSNP_NAME + ".json.gz");
+        if (dbSnpFilePath.toFile().exists()) {
+            if (variationPath.resolve(DBSNP_VERSION_FILENAME).toFile().exists()) {
+                logger.info("Loading dbSNP file '{}'", dbSnpFilePath);
+                loadRunner.load(dbSnpFilePath, SNP_COLLECTION_NAME, dataRelease);
+
+                // Create index
+                createIndex(SNP_COLLECTION_NAME);
+
+                // Update release (collection and sources)
+                List<Path> sources = Collections.singletonList(variationPath.resolve(DBSNP_VERSION_FILENAME));
+                dataReleaseManager.update(dataRelease, SNP_COLLECTION_NAME, EtlCommons.VARIATION_DATA, sources);
+            } else {
+                logger.warn("In order to load the dbSNP file you need the version file {} within the folder '{}'", DBSNP_VERSION_FILENAME,
+                        variationPath);
+            }
         } else {
-            logger.info("Loading file '{}'", input);
-            loadRunner.load(input, "variation", dataRelease, field, innerFields);
+            logger.warn("No dbSNP file found within the folder '{}'", variationPath);
         }
     }
 
diff --git a/cellbase-client/pom.xml b/cellbase-client/pom.xml
index a843ef0685..7424c21bbb 100644
--- a/cellbase-client/pom.xml
+++ b/cellbase-client/pom.xml
@@ -6,7 +6,7 @@
     <parent>
         <groupId>org.opencb.cellbase</groupId>
         <artifactId>cellbase</artifactId>
-        <version>6.2.1-SNAPSHOT</version>
+        <version>6.3.0-SNAPSHOT</version>
         <relativePath>../pom.xml</relativePath>
     </parent>
 
diff --git a/cellbase-client/src/main/java/org/opencb/cellbase/client/rest/VariantClient.java b/cellbase-client/src/main/java/org/opencb/cellbase/client/rest/VariantClient.java
index e9479ad18c..3e308e08d0 100644
--- a/cellbase-client/src/main/java/org/opencb/cellbase/client/rest/VariantClient.java
+++ b/cellbase-client/src/main/java/org/opencb/cellbase/client/rest/VariantClient.java
@@ -17,6 +17,7 @@
 package org.opencb.cellbase.client.rest;
 
 import org.apache.commons.lang3.StringUtils;
+import org.opencb.biodata.models.core.Snp;
 import org.opencb.biodata.models.variant.Variant;
 import org.opencb.biodata.models.variant.avro.*;
 import org.opencb.cellbase.client.config.ClientConfiguration;
@@ -236,6 +237,30 @@ public CellBaseDataResponse<String> getAllConsequenceTypes(Query query) throws I
         return execute("consequenceTypes", query, new QueryOptions(), String.class);
     }
 
+    /**
+     * Queries the "snp/search" endpoint.
+     *
+     * @param query   Query parameters sent to the endpoint
+     * @param options Query options sent to the endpoint
+     * @return Response with the resulting {@link Snp} objects
+     * @throws IOException propagated from the underlying request
+     */
+    public CellBaseDataResponse<Snp> searchSnp(Query query, QueryOptions options) throws IOException {
+        return execute("snp/search", query, options, Snp.class);
+    }
+
+    /**
+     * Queries the "snp/startsWith" endpoint.
+     *
+     * @param query   Query parameters sent to the endpoint
+     * @param options Query options sent to the endpoint
+     * @return Response with the resulting {@link Snp} objects
+     * @throws IOException propagated from the underlying request
+     */
+    public CellBaseDataResponse<Snp> startsWithSnp(Query query, QueryOptions options) throws IOException {
+        return execute("snp/startsWith", query, options, Snp.class);
+    }
+
 //    public CellBaseDataResponse<String> getConsequenceTypeById(String id, QueryOptions options) throws IOException {
 //        return execute(id, "consequence_type", options, String.class);
 //    }
diff --git a/cellbase-client/src/test/java/org/opencb/cellbase/client/rest/VariantClientTest.java b/cellbase-client/src/test/java/org/opencb/cellbase/client/rest/VariantClientTest.java
index 082c056d40..9b39239538 100644
--- a/cellbase-client/src/test/java/org/opencb/cellbase/client/rest/VariantClientTest.java
+++ b/cellbase-client/src/test/java/org/opencb/cellbase/client/rest/VariantClientTest.java
@@ -18,16 +18,26 @@
 
 import org.apache.avro.specific.SpecificRecordBase;
 import org.apache.commons.collections4.CollectionUtils;
+import org.apache.commons.lang3.StringUtils;
+import org.junit.jupiter.api.Assumptions;
 import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.TestInstance;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ArgumentsSource;
+import org.opencb.biodata.models.core.Snp;
 import org.opencb.biodata.models.variant.Variant;
 import org.opencb.biodata.models.variant.avro.ConsequenceType;
 import org.opencb.biodata.models.variant.avro.VariantAnnotation;
+import org.opencb.cellbase.client.config.ClientConfiguration;
+import org.opencb.cellbase.client.config.RestConfig;
+import org.opencb.cellbase.core.common.GitRepositoryState;
+import org.opencb.cellbase.core.models.DataRelease;
 import org.opencb.cellbase.core.result.CellBaseDataResponse;
+import org.opencb.commons.datastore.core.ObjectMap;
 import org.opencb.commons.datastore.core.Query;
 import org.opencb.commons.datastore.core.QueryOptions;
+import org.opencb.commons.utils.VersionUtils;
 
 import java.util.*;
 import java.util.stream.Collectors;
@@ -139,6 +149,84 @@ public void getAllConsequenceTypes(CellBaseClient cellBaseClient) throws Excepti
         assertNotNull(response.firstResult(), "List of all the consequence types present should be returned");
     }
 
+    /**
+     * Builds a client for the public CellBase server used by the SNP tests. The calling test is aborted through
+     * JUnit assumptions when the server does not pass the minimum version check or lacks the data release.
+     *
+     * @param dataRelease Data release that the server must provide
+     * @return Client configured for 'hsapiens'
+     * @throws Exception propagated from the meta client calls
+     */
+    private CellBaseClient createSnpClient(int dataRelease) throws Exception {
+        ClientConfiguration clientConfiguration = new ClientConfiguration()
+                .setDefaultSpecies("hsapiens")
+                .setVersion("v5.8.3-SNAPSHOT")
+                .setRest(new RestConfig(Collections.singletonList("https://ws.zettagenomics.com/cellbase"), 2000));
+
+        CellBaseClient client = new CellBaseClient(clientConfiguration);
+
+        // Assumptions before running the test
+        ObjectMap result = client.getMetaClient().about().firstResult();
+        Assumptions.assumeTrue(VersionUtils.isMinVersion("5.8.3-SNAPSHOT", result.getString("Version")));
+        CellBaseDataResponse<DataRelease> dataReleaseResponse = client.getMetaClient().dataReleases();
+        Assumptions.assumeTrue(dataReleaseResponse.getResponses().get(0).getResults().stream()
+                .map(DataRelease::getRelease).collect(Collectors.toList()).contains(dataRelease));
+        return client;
+    }
+
+    @Test
+    public void testSearchSnpBydbSnpId() throws Exception {
+        int dataRelease = 7;
+        CellBaseClient client = createSnpClient(dataRelease);
+
+        Query query = new Query();
+        query.put("id", "rs1570391602,rs41278952");
+        query.put("dataRelease", dataRelease);
+
+        CellBaseDataResponse<Snp> response = client.getVariantClient().searchSnp(query, new QueryOptions());
+        assertEquals(2, response.getResponses().get(0).getNumResults());
+        assertEquals("rs1570391602", response.getResponses().get(0).getResults().get(0).getId());
+        assertEquals("rs41278952", response.getResponses().get(0).getResults().get(1).getId());
+    }
+
+    @Test
+    public void testSearchSnpByPosition() throws Exception {
+        int dataRelease = 7;
+        CellBaseClient client = createSnpClient(dataRelease);
+
+        Query query = new Query();
+        query.put("chromosome", "1");
+        query.put("position", "56948509");
+        query.put("reference", "T");
+        query.put("dataRelease", dataRelease);
+
+        CellBaseDataResponse<Snp> response = client.getVariantClient().searchSnp(query, new QueryOptions());
+        assertEquals(1, response.getResponses().get(0).getNumResults());
+        assertEquals("rs1570391602", response.getResponses().get(0).getResults().get(0).getId());
+        assertEquals(query.getInt("position"), response.getResponses().get(0).getResults().get(0).getPosition());
+        assertEquals(query.get("reference"), response.getResponses().get(0).getResults().get(0).getReference());
+        assertEquals(1, response.getResponses().get(0).getResults().get(0).getAlternates().size());
+        assertEquals("G", response.getResponses().get(0).getResults().get(0).getAlternates().get(0));
+    }
+
+    @Test
+    public void testStartsWithSnp() throws Exception {
+        int dataRelease = 7;
+        CellBaseClient client = createSnpClient(dataRelease);
+
+        Query query = new Query();
+        query.put("id", "rs157039161");
+        query.put("dataRelease", dataRelease);
+
+        CellBaseDataResponse<Snp> response = client.getVariantClient().startsWithSnp(query, new QueryOptions());
+        assertEquals(9, response.getResponses().get(0).getNumResults());
+        for (Snp snp : response.getResponses().get(0).getResults()) {
+            if (!snp.getId().startsWith(query.getString("id"))) {
+                fail("SNP '" + snp.getId() + "' does not start with '" + query.getString("id") + "'");
+            }
+        }
+    }
+
 //    @Test
 //    public void getConsequenceTypeById() throws Exception {
 //        CellBaseDataResponse<String> stringCellBaseDataResponse = cellBaseClient.getVariantClient().getConsequenceTypeById("22:35490160:G:A", null);
diff --git a/cellbase-core/pom.xml b/cellbase-core/pom.xml
index db167df9bb..7c74e13d92 100644
--- a/cellbase-core/pom.xml
+++
b/cellbase-core/pom.xml
@@ -6,7 +6,7 @@
     <parent>
         <groupId>org.opencb.cellbase</groupId>
         <artifactId>cellbase</artifactId>
-        <version>6.2.1-SNAPSHOT</version>
+        <version>6.3.0-SNAPSHOT</version>
         <relativePath>../pom.xml</relativePath>
     </parent>
 
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/ParamConstants.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/ParamConstants.java
index 3c2ca5791d..b056103910 100644
--- a/cellbase-core/src/main/java/org/opencb/cellbase/core/ParamConstants.java
+++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/ParamConstants.java
@@ -113,8 +113,8 @@ public Type type() {
     public static final String VERSION_DESCRIPTION = "API version, e.g.: " + DEFAULT_VERSION;
 
     public static final String DATA_RELEASE_PARAM = "dataRelease";
-    public static final String DATA_RELEASE_DESCRIPTION = "Data release. To use the default data release, set this to 0. To get the list"
-            + " of available data release, please call the endpoint 'meta/dataReleases'";
+    public static final String DATA_RELEASE_DESCRIPTION = "Data release. To get the list of available data releases, please call the"
+            + " endpoint 'meta/dataReleases'";
 
     public static final String API_KEY_PARAM = "apiKey";
     public static final String API_KEY_DESCRIPTION = "API key to allow access to licensed/restricted data sources such as"
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/api/SnpQuery.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/SnpQuery.java
new file mode 100644
index 0000000000..ade217f387
--- /dev/null
+++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/api/SnpQuery.java
@@ -0,0 +1,98 @@
+/*
+ * Copyright 2015-2020 OpenCB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.opencb.cellbase.core.api;
+
+import org.opencb.cellbase.core.api.query.AbstractQuery;
+import org.opencb.cellbase.core.api.query.QueryException;
+import org.opencb.cellbase.core.api.query.QueryParameter;
+
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Query for SNPs holding the 'id', 'chromosome', 'position' and 'reference' query parameters.
+ */
+public class SnpQuery extends AbstractQuery {
+
+    @QueryParameter(id = "id")
+    private List<String> ids;
+    @QueryParameter(id = "chromosome")
+    private String chromosome;
+    @QueryParameter(id = "position")
+    private Integer position;
+    @QueryParameter(id = "reference")
+    private String reference;
+
+    public SnpQuery() {
+    }
+
+    public SnpQuery(Map<String, String> params) throws QueryException {
+        super(params);
+    }
+
+    @Override
+    protected void validateQuery() {
+        // nothing to validate
+    }
+
+    @Override
+    public String toString() {
+        final StringBuilder sb = new StringBuilder("SnpQuery{");
+        sb.append("ids=").append(ids);
+        sb.append(", chromosome='").append(chromosome).append('\'');
+        sb.append(", position='").append(position).append('\'');
+        sb.append(", reference='").append(reference).append('\'');
+        sb.append('}');
+        return sb.toString();
+    }
+
+    public List<String> getIds() {
+        return ids;
+    }
+
+    public SnpQuery setIds(List<String> ids) {
+        this.ids = ids;
+        return this;
+    }
+
+    public String getChromosome() {
+        return chromosome;
+    }
+
+    public SnpQuery setChromosome(String chromosome) {
+        this.chromosome = chromosome;
+        return this;
+    }
+
+    public Integer getPosition() {
+        return position;
+    }
+
+    public SnpQuery setPosition(Integer position) {
+        this.position = position;
+        return this;
+    }
+
+    public String getReference() {
+        return reference;
+    }
+
+    public SnpQuery setReference(String reference) {
+        this.reference = reference;
+        return this;
+    }
+}
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
index 19f1606c91..9a097fd202 100644
--- a/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
+++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/config/DownloadProperties.java
@@ -26,7 +26,6 @@ public class DownloadProperties {
     private EnsemblProperties ensembl;
     private EnsemblProperties ensemblGenomes;
     private URLProperties hgnc;
-    private URLProperties cancerHotspot;
     private URLProperties refSeq;
     private URLProperties refSeqFasta;
     private URLProperties refSeqProteinFasta;
@@ -52,6 +51,7 @@ public class DownloadProperties {
     private URLProperties clinvarSummary;
     private URLProperties clinvarVariationAllele;
     private URLProperties clinvarEfoTerms;
+    private URLProperties dbSNP;
     private URLProperties iarctp53;
     private URLProperties docm;
     private URLProperties docmVersion;
@@ -71,7 +71,6 @@ public class DownloadProperties {
     private URLProperties hpoObo;
     private URLProperties goObo;
     private URLProperties doidObo;
-    private URLProperties mondoObo;
     private URLProperties goAnnotation;
     private URLProperties revel;
     private URLProperties pubmed;
@@ -265,6 +264,15 @@ public DownloadProperties setClinvarEfoTerms(URLProperties clinvarEfoTerms) {
         return this;
     }
 
+    public URLProperties getDbSNP() {
+        return dbSNP;
+    }
+
+    public DownloadProperties setDbSNP(URLProperties dbSNP) {
+        this.dbSNP = dbSNP;
+        return this;
+    }
+
     public URLProperties getIarctp53() {
         return iarctp53;
     }
@@ -519,24 +527,6 @@ public DownloadProperties setHgnc(URLProperties hgnc) {
         return this;
     }
 
-    public URLProperties getCancerHotspot() {
-        return cancerHotspot;
-    }
-
-    public DownloadProperties setCancerHotspot(URLProperties cancerHotspot) {
-        this.cancerHotspot = cancerHotspot;
-        return this;
-    }
-
-    public URLProperties getMondoObo() {
-        return mondoObo;
-    }
-
-    public DownloadProperties setMondoObo(URLProperties mondoObo) {
-        this.mondoObo = mondoObo;
-        return this;
-    }
-
     public static class EnsemblProperties {
 
         private DatabaseCredentials database;
diff --git a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java
index d1e0fc326b..407278edca 100644
--- a/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java
+++ b/cellbase-core/src/main/java/org/opencb/cellbase/core/exception/CellBaseException.java
@@ -22,7 +22,7 @@ public CellBaseException(String msg) {
         super(msg);
     }
 
-    public CellBaseException(String msg, Throwable cause) {
+    public CellBaseException(String msg, Throwable cause) {
         super(msg, cause);
     }
 
diff --git a/cellbase-core/src/main/resources/configuration.yml b/cellbase-core/src/main/resources/configuration.yml
index f24827532c..6e651f00d3 100644
--- a/cellbase-core/src/main/resources/configuration.yml
+++ b/cellbase-core/src/main/resources/configuration.yml
@@ -62,11 +62,7 @@ download:
     url:
       host: ftp://ftp.ensemblgenomes.org/pub
   hgnc:
-    host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2023-11-01.txt
-    version: 2023-11-01
-  cancerHotspot:
-    host: https://www.cancerhotspots.org/files/hotspots_v2.xls
-    version: "v2"
+    host: https://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/archive/monthly/tsv/hgnc_complete_set_2022-01-01.txt
   refSeq:
     host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_genomic.gtf.gz
   refSeqFasta:
@@ -77,15 +73,12 @@ download:
     host: https://ftp.ncbi.nih.gov/refseq/H_sapiens/annotation/GRCh38_latest/refseq_identifiers/GRCh38_latest_rna.fna.gz
   maneSelect:
 #    host:
https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_0.93/MANE.GRCh38.v0.93.summary.txt.gz -# host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz - host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.1/MANE.GRCh38.v1.1.summary.txt.gz - version: "1.1" + host: https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.0/MANE.GRCh38.v1.0.summary.txt.gz + version: 0.93 lrg: host: http://ftp.ebi.ac.uk/pub/databases/lrgex/list_LRGs_transcripts_xrefs.txt - version: "2021-03-30" geneUniprotXref: host: http://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/ - version: "2023-11-08" geneExpressionAtlas: host: ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/gxa/allgenes_updown_in_organism_part_2.0.14.tab.gz mirbase: @@ -95,51 +88,50 @@ download: targetScan: host: http://hgdownload.cse.ucsc.edu/goldenPath/ miRTarBase: - host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx - version: "9.0" - - ## Protein Data + host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/8.0/hsa_MTI.xlsx uniprot: - host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz - version: "2023-11-08" + host: ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz uniprotRelNotes: - host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt - version: "2023-11-08" + host: ftp://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt + intact: + host: ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt interpro: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz - version: "2023-11-08" + host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/protein2ipr.dat.gz interproRelNotes: - host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt - 
intact: - host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt - version: "2023-10-07" - - ## Conservation Scores + host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/release_notes.txt conservation: host: https://hgdownload.cse.ucsc.edu/goldenPath/ - version: "2022-08-30" gerp: - host: http://ftp.ensembl.org/pub/release-110/compara/conservation_scores/91_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw - version: "2023-05-17" + host: http://ftp.ensembl.org/pub/release-104/compara/conservation_scores/90_mammals.gerp_conservation_score/gerp_conservation_scores.homo_sapiens.GRCh38.bw clinvar: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2021-07.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-02.xml.gz +<<<<<<< HEAD + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz + clinvarVariation: +# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz +# host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz +======= # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2022-11.xml.gz - host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2023-12.xml.gz - version: "2023-12-01" + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/RCV_xml_old_format/ClinVarFullRelease_2024-05.xml.gz + version: 2024-05 clinvarVariation: # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2021-07.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-02.xml.gz # host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2022-11.xml.gz - host: 
https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/clinvar_variation/ClinVarVariationRelease_2023-12.xml.gz + host: https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/VCV_xml_old_format/ClinVarVariationRelease_2024-05.xml.gz + version: 2024-05 +>>>>>>> release-6.2.x clinvarSummary: host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz - version: "2023-12-01" clinvarVariationAllele: host: http://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz - version: "2023-12-01" clinvarEfoTerms: host: ftp://ftp.ebi.ac.uk/pub/databases/eva/ClinVar/2015/ClinVar_Traits_EFO_Names_260615.csv + dbSNP: + host: https://ftp.ncbi.nih.gov/snp/latest_release/VCF/GCF_000001405.40.gz + version: "156" iarctp53: host: http://p53.iarc.fr/ajax/Zipper.ashx docm: @@ -155,12 +147,16 @@ download: genomicSuperDups: host: http://hgdownload.cse.ucsc.edu/goldenPath gwasCatalog: -# host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv -# version: "1.0.2 associations_e106_r2022-05-17" - host: ftp://ftp.ebi.ac.uk/pub/databases/gwas/releases/2023/12/21/gwas-catalog-associations.tsv - version: "23-12-21" +<<<<<<< HEAD + host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv + version: "1.0.2 associations_e106_r2022-05-17" +======= + #host: http://resources.opencb.org/opencb/cellbase/data/gwas/gwas_catalog_v1.0.2-associations_e106_r2022-05-17.tsv + host: "https://ftp.ebi.ac.uk/pub/databases/gwas/releases/2024/05/20/gwas-catalog-associations_ontology-annotated.tsv" + #version: "1.0.2 associations_e106_r2022-05-17" + version: "2024-05-20" +>>>>>>> release-6.2.x hpo: - ## Downlaod manually from here now: https://hpo.jax.org/app/data/annotations host: https://ci.monarchinitiative.org/view/hpo/job/hpo.annotations/lastSuccessfulBuild/artifact/rare-diseases/util/annotation/phenotype_to_genes.txt disgenet: host: 
https://www.disgenet.org/static/disgenet_ap1/files/downloads @@ -168,30 +164,20 @@ download: - all_gene_disease_associations.tsv.gz - readme.txt dgidb: - host: https://old.dgidb.org/data/monthly_tsvs/2022-Feb/interactions.tsv - version: "2022-02-01" + host: https://dgidb.org/data/monthly_tsvs/2021-Jan/interactions.tsv cadd: - ## Nacho: Move to https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz ASAP! -# host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz - host: https://krishna.gs.washington.edu/download/CADD/v1.7-pre/GRCh38/whole_genome_SNVs.tsv.gz - version: "1.7-pre" + host: https://krishna.gs.washington.edu/download/CADD/v1.6/GRCh38/whole_genome_SNVs.tsv.gz reactome: host: http://www.reactome.org/download/current/biopax.zip gnomadConstraints: host: https://storage.googleapis.com/gcp-public-data--gnomad/release/2.1.1/constraint/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz - version: "2.1.1" + version: 2.1.1 hpoObo: host: http://purl.obolibrary.org/obo/hp.obo - version: "2023-12-01" goObo: host: http://purl.obolibrary.org/obo/go/go-basic.obo - version: "2023-12-01" doidObo: host: http://purl.obolibrary.org/obo/doid.obo - version: "2023-12-01" - mondoObo: - host: http://purl.obolibrary.org/obo/mondo.obo - version: "2023-12-01" goAnnotation: host: http://geneontology.org/gene-associations/goa_human.gaf.gz revel: @@ -218,7 +204,7 @@ species: - id: hsapiens scientificName: Homo sapiens assemblies: - - ensemblVersion: '110_38' + - ensemblVersion: '104_38' name: GRCh38 - ensemblVersion: '82_37' name: GRCh37 @@ -233,6 +219,7 @@ species: - refseq - regulation - repeats + - variation - variation_functional_score - splice_score shards: diff --git a/cellbase-lib/pom.xml b/cellbase-lib/pom.xml index 28ccca9267..a181ccf4a9 100644 --- a/cellbase-lib/pom.xml +++ b/cellbase-lib/pom.xml @@ -6,7 +6,7 @@ org.opencb.cellbase cellbase - 6.2.1-SNAPSHOT + 6.3.0-SNAPSHOT ../pom.xml @@ -137,10 +137,10 @@ 
com.github.samtools htsjdk - + io.jsonwebtoken jjwt-api diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java index 124ac6e6fc..d09291bc3e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/EtlCommons.java @@ -35,6 +35,8 @@ */ public class EtlCommons { + public static final String HOMO_SAPIENS_NAME ="Homo sapiens"; + public static final String GENOME_DATA = "genome"; public static final String GENE_DATA = "gene"; public static final String REFSEQ_DATA = "refseq"; @@ -54,16 +56,20 @@ public class EtlCommons { public static final String PHARMGKB_VERSION_FILENAME = "pharmgkbVersion.json"; public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant"; - public static final String CLINVAR_VERSION = "2022.11"; - public static final String CLINVAR_DATE = "2022-11"; - public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2022-11.xml.gz"; + public static final String CLINVAR_VERSION = "2024-05"; + public static final String CLINVAR_DATE = "2024-05"; + public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2024-05.xml.gz"; public static final String CLINVAR_EFO_FILE = "ClinVar_Traits_EFO_Names.csv"; public static final String CLINVAR_SUMMARY_FILE = "variant_summary.txt.gz"; public static final String CLINVAR_VARIATION_ALLELE_FILE = "variation_allele.txt.gz"; public static final String IARCTP53_FILE = "IARC-TP53.zip"; public static final String GWAS_FILE = "gwas_catalog.tsv"; public static final String COSMIC_FILE = "CosmicMutantExport.tsv.gz"; - public static final String DBSNP_FILE = "All.vcf.gz"; + @Deprecated + public static final String DBSNP_FILE = "GCF_000001405.40.gz"; + public static final String DBSNP_NAME = "dbSNP"; + public static final String DBSNP_VERSION_FILENAME = DBSNP_NAME + "Version.json"; + public static final String SNP_COLLECTION_NAME = "snp"; 
public static final String STRUCTURAL_VARIANTS_DATA = "svs"; public static final String REPEATS_DATA = "repeats"; @@ -71,7 +77,6 @@ public class EtlCommons { public static final String HPO_FILE = "hp.obo"; public static final String GO_FILE = "go-basic.obo"; public static final String DOID_FILE = "doid.obo"; - public static final String MONDO_FILE = "mondo.obo"; public static final String PFM_DATA = "regulatory_pfm"; // Build specific data options diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java index b593f44901..f4c6c861fd 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/CaddScoreBuilder.java @@ -42,7 +42,7 @@ public CaddScoreBuilder(Path caddFilePath, CellBaseSerializer serializer) { super(serializer); this.caddFilePath = caddFilePath; - logger = LoggerFactory.getLogger(ConservationBuilder.class); + logger = LoggerFactory.getLogger(CaddScoreBuilder.class); } /* Example: diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/DbSnpBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/DbSnpBuilder.java new file mode 100644 index 0000000000..4f128562e6 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/DbSnpBuilder.java @@ -0,0 +1,219 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.builders; + +import org.opencb.biodata.models.core.Snp; +import org.opencb.biodata.models.core.SnpAnnotation; +import org.opencb.biodata.models.variant.avro.PopulationFrequency; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.opencb.cellbase.core.serializer.CellBaseSerializer; +import org.opencb.commons.utils.FileUtils; +import org.slf4j.LoggerFactory; + +import java.io.BufferedReader; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.*; + +import static org.opencb.cellbase.lib.EtlCommons.DBSNP_NAME; + +/** + * Created by imedina on 06/11/15. + */ +public class DbSnpBuilder extends CellBaseBuilder { + + private final Path sourceVariationPath; + private final DownloadProperties.URLProperties dbSnpUrlProperties; + private static final Map CHROMOSOME_MAPPING; + + static { + CHROMOSOME_MAPPING = new HashMap<>(); + CHROMOSOME_MAPPING.put("NC_000001", "1"); + CHROMOSOME_MAPPING.put("NC_000002", "2"); + CHROMOSOME_MAPPING.put("NC_000003", "3"); + CHROMOSOME_MAPPING.put("NC_000004", "4"); + CHROMOSOME_MAPPING.put("NC_000005", "5"); + CHROMOSOME_MAPPING.put("NC_000006", "6"); + CHROMOSOME_MAPPING.put("NC_000007", "7"); + CHROMOSOME_MAPPING.put("NC_000008", "8"); + CHROMOSOME_MAPPING.put("NC_000009", "9"); + CHROMOSOME_MAPPING.put("NC_000010", "10"); + CHROMOSOME_MAPPING.put("NC_000011", "11"); + CHROMOSOME_MAPPING.put("NC_000012", "12"); + CHROMOSOME_MAPPING.put("NC_000013", "13"); + CHROMOSOME_MAPPING.put("NC_000014", "14"); + CHROMOSOME_MAPPING.put("NC_000015", "15"); + CHROMOSOME_MAPPING.put("NC_000016", "16"); + CHROMOSOME_MAPPING.put("NC_000017", "17"); + CHROMOSOME_MAPPING.put("NC_000018", "18"); + CHROMOSOME_MAPPING.put("NC_000019", "19"); + CHROMOSOME_MAPPING.put("NC_000020", "20"); + 
CHROMOSOME_MAPPING.put("NC_000021", "21"); + CHROMOSOME_MAPPING.put("NC_000022", "22"); + CHROMOSOME_MAPPING.put("NC_000023", "X"); + CHROMOSOME_MAPPING.put("NC_000024", "Y"); + } + + public DbSnpBuilder(Path sourceVariationPath, DownloadProperties.URLProperties dbSnpUrlProperties, CellBaseSerializer serializer) { + super(serializer); + this.sourceVariationPath = sourceVariationPath; + this.dbSnpUrlProperties = dbSnpUrlProperties; + + logger = LoggerFactory.getLogger(DbSnpBuilder.class); + } + + /* Example: + ## dbSNP 156 + #CHROM POS ID REF ALT QUAL FILTER INFO + NC_000001.11 926003 rs1329301928 C A,T . . RS=1329301928;dbSNPBuildID=151;SSR=0; + GENEINFO=SAMD11:148398|LOC107985728:107985728;VC=SNV;NSM;R5;GNO; + FREQ=Estonian:0.9998,0.0002232,.|TOMMO:0.9999,.,0.0001062|dbGaP_PopFreq:0.9999,5.4e-05,0; + CLNVI=.,.,;CLNORIGIN=.,.,1;CLNSIG=.,.,0;CLNDISDB=.,.,MedGen:CN517202;CLNDN=.,.,not_provided;CLNREVSTAT=.,.,single; + CLNACC=.,.,RCV001929748.1;CLNHGVS=NC_000001.11:g.926003=,NC_000001.11:g.926003C>A,NC_000001.11:g.926003C>T + NC_000001.11 925952 rs1640863258 G A . . RS=1640863258;SSR=0; + GENEINFO=SAMD11:148398|LOC107985728:107985728;VC=SNV;NSM;R5;CLNVI=.,;CLNORIGIN=.,1;CLNSIG=.,0;CLNDISDB=.,MedGen:CN517202; + CLNDN=.,not_provided;CLNREVSTAT=.,single;CLNACC=.,RCV001318826.4;CLNHGVS=NC_000001.11:g.925952=,NC_000001.11:g.925952G>A + NC_000001.11 925953 rs1349221494 G A,T . . RS=1349221494;dbSNPBuildID=151;SSR=0; + GENEINFO=SAMD11:148398|LOC107985728:107985728;VC=SNV;SYN;R5;GNO; + FREQ=GnomAD:1,1.426e-05,.|GnomAD_exomes:1,.,4.008e-06|TOPMED:1,3.778e-06,.|dbGaP_PopFreq:1,0,3.124e-05 + NC_000001.11 925956 rs1342334044 C T . . 
RS=1342334044;dbSNPBuildID=155;SSR=0; + GENEINFO=SAMD11:148398|LOC107985728:107985728;VC=SNV;SYN;R5;GNO; + FREQ=TOPMED:1,1.133e-05|dbGaP_PopFreq:1,0; + CLNVI=.,;CLNORIGIN=.,1;CLNSIG=.,3;CLNDISDB=.,MedGen:CN517202;CLNDN=.,not_provided;CLNREVSTAT=.,single;CLNACC=.,RCV002170030.3; + CLNHGVS=NC_000001.11:g.925956=,NC_000001.11:g.925956C>T + */ + @Override + public void parse() throws Exception { + Path dbSnpFilePath = sourceVariationPath.resolve(Paths.get(dbSnpUrlProperties.getHost()).getFileName()); + FileUtils.checkPath(dbSnpFilePath); + + CellBaseFileSerializer fileSerializer = (CellBaseFileSerializer) serializer; + + String line; + String[] fields; + + String chromosome; + int position; + String id; + String ref; + String[] alt; + String type; + String version; + String info; + List flags; + Map additionalAttributes; + + SnpAnnotation snpAnnotation; + + try (BufferedReader bufferedReader = FileUtils.newBufferedReader(dbSnpFilePath)) { + while ((line = bufferedReader.readLine()) != null) { + if (!line.startsWith("#")) { + fields = line.split("\t"); + + chromosome = fields[0].split("\\.")[0]; + chromosome = CHROMOSOME_MAPPING.get(chromosome); + position = Integer.parseInt(fields[1]); + id = fields[2]; + ref = fields[3]; + alt = fields[4].split(","); + version = dbSnpUrlProperties.getVersion(); + info = fields[7]; + + // Calculate SNP type + type = "SNV"; + if (ref.length() > 1) { + type = "INDEL"; + } else { + for (String altAllele : alt) { + if (altAllele.length() > 1) { + type = "INDEL"; + break; + } + } + } + + snpAnnotation = new SnpAnnotation(); + flags = new ArrayList<>(); + additionalAttributes = new HashMap<>(); + + String[] infoFields = info.split(";"); + for (String infoField : infoFields) { + String[] infoKeyValue = infoField.split("="); + switch (infoKeyValue[0]) { + case "dbSNPBuildID": + version = infoKeyValue[1]; + break; + case "GENEINFO": { + snpAnnotation.setGene(infoKeyValue[1].split(":")[0]); + break; + } + case "FREQ": { + String[] studies = 
infoKeyValue[1].split("\\|"); + List populationFrequencies = new ArrayList<>(); + for (String study : studies) { + // After splitting 'GnomAD:1,1.426e-05,.' we get: + // freqFields: [GnomAD, 1, 1.426e-05, .] + String[] freqFields = study.split("[:,]"); + if (freqFields.length == alt.length + 2) { + for (int i = 0; i < alt.length; i++) { + if (".".equals(freqFields[1])) { + logger.warn("Skipping pop. frequency for ref. allele ({}) of study {}: it is '.')", + ref, freqFields[0]); + } else if (".".equals(freqFields[i + 2])) { + logger.warn("Skipping pop. frequency for alt. allele ({}) of study {}: it is '.')", + alt[i], freqFields[0]); + } else { + PopulationFrequency populationFrequency = new PopulationFrequency(); + populationFrequency.setStudy(freqFields[0]); + populationFrequency.setRefAllele(ref); + populationFrequency.setRefAlleleFreq(Float.parseFloat(freqFields[1])); + populationFrequency.setAltAllele(alt[i]); + populationFrequency.setAltAlleleFreq(Float.parseFloat(freqFields[i + 2])); + + populationFrequencies.add(populationFrequency); + } + } + } else { + logger.warn("Skipping pop. frequencies for study {}: the number of prop. 
frequencies ({}) does not" + + " match the number of alleles ({})", freqFields[0], freqFields.length - 1, + alt.length + 1); + } + } + snpAnnotation.setPopulationFrequencies(populationFrequencies); + break; + } + default: { + if (infoKeyValue.length == 1) { + flags.add(infoKeyValue[0]); + } else { + additionalAttributes.put(infoKeyValue[0], infoKeyValue[1]); + } + } + } + } + snpAnnotation.setFlags(flags); + snpAnnotation.setAdditionalAttributes(additionalAttributes); + + Snp snp = new Snp(id, chromosome, position, ref, Arrays.asList(alt), type, DBSNP_NAME, version, snpAnnotation); + fileSerializer.serialize(snp, DBSNP_NAME); + } + } + } + logger.info("Parsing finished."); + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java index cd0863a259..563f76dea7 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/GeneBuilder.java @@ -90,8 +90,8 @@ public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, Species boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException { this(null, geneDirectoryPath.resolve("description.txt"), geneDirectoryPath.resolve("xrefs.txt"), - geneDirectoryPath.resolve("hgnc_complete_set_2023-11-01.txt"), - geneDirectoryPath.resolve("MANE.GRCh38.v1.1.summary.txt.gz"), + geneDirectoryPath.resolve("hgnc_complete_set_2022-01-01.txt"), + geneDirectoryPath.resolve("MANE.GRCh38.v1.0.summary.txt.gz"), geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"), geneDirectoryPath.resolve("idmapping_selected.tab.gz"), geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"), diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java index 1eabf8975a..8873dd7f93 
100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/OntologyBuilder.java @@ -32,14 +32,12 @@ public class OntologyBuilder extends CellBaseBuilder { private Path hpoFile; private Path goFile; private Path doidFile; - private Path mondoFile; public OntologyBuilder(Path oboDirectoryPath, CellBaseSerializer serializer) { super(serializer); hpoFile = oboDirectoryPath.resolve(EtlCommons.HPO_FILE); goFile = oboDirectoryPath.resolve(EtlCommons.GO_FILE); doidFile = oboDirectoryPath.resolve(EtlCommons.DOID_FILE); - mondoFile = oboDirectoryPath.resolve(EtlCommons.MONDO_FILE); } @Override @@ -66,13 +64,6 @@ public void parse() throws Exception { serializer.serialize(term); } - bufferedReader = FileUtils.newBufferedReader(mondoFile); - terms = parser.parseOBO(bufferedReader, "Mondo Ontology"); - for (OntologyTerm term : terms) { - term.setSource("MONDO"); - serializer.serialize(term); - } - serializer.close(); } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java new file mode 100644 index 0000000000..087a4aed36 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/VariationBuilder.java @@ -0,0 +1,48 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.builders; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.serializer.CellBaseFileSerializer; +import org.slf4j.LoggerFactory; + +import java.nio.file.Path; + +/** + * Created by imedina on 06/11/15. + */ +public class VariationBuilder extends CellBaseBuilder { + + private DbSnpBuilder dbSnpBuilder; + + public VariationBuilder(Path downloadVariationPath, CellBaseFileSerializer fileSerializer, CellBaseConfiguration configuration) { + super(fileSerializer); + + // dbSNP + DownloadProperties.URLProperties dbSnpUrlProperties = configuration.getDownload().getDbSNP(); + dbSnpBuilder = new DbSnpBuilder(downloadVariationPath, dbSnpUrlProperties, fileSerializer); + + logger = LoggerFactory.getLogger(VariationBuilder.class); + } + + @Override + public void parse() throws Exception { + // Parsing dbSNP data + dbSnpBuilder.parse(); + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java index 1817359e98..39cddb4fcf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/ClinVarIndexer.java @@ -211,7 +211,7 @@ private void printSummary() { } private boolean updateRocksDB(SequenceLocation sequenceLocation, String variationId, String[] lineFields, - String mateVariantString, Map traitsToEfoTermsMap) + String mateVariantString, Map traitsToEfoTermsMap) throws RocksDBException, IOException { // More than one variant being returned from the normalisation process would mean it's and MNV which has been // decomposed @@ -267,13 +267,34 @@ private boolean updateRocksDB(AlleleLocationData alleleLocationData, PublicSetTy } // parse RCVs - String 
accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); - String clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion() - .getClinicalSignificance() - .getDescription(); - String reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance() - .getReviewStatus().name(); - List getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn(); + String accession = null; + try { + accession = publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc(); + } catch (Exception e) { + logger.warn("Error getting accession. Ignore error and leave it as null.", e); + } + String clinicalSignficanceDescription = null; + try { + clinicalSignficanceDescription = publicSet.getReferenceClinVarAssertion() + .getClinicalSignificance() + .getDescription(); + } catch (Exception e) { + logger.warn("Error getting clinical significance description. Ignore error and leave it as null.", e); + } + String reviewStatusName = null; + try { + reviewStatusName = publicSet.getReferenceClinVarAssertion().getClinicalSignificance() + .getReviewStatus().name(); + } catch (Exception e) { + logger.warn("Error getting review status name. Ignore error and leave it as null.", e); + } + List getObservedIn = null; + try { + getObservedIn = publicSet.getReferenceClinVarAssertion().getObservedIn(); + } catch (Exception e) { + logger.warn("Error getting observed in. 
Ignore error and leave it as null.", e); + } + addNewEntries(variantAnnotation, publicSet, alleleLocationData.getAlleleId(), mateVariantString, clinicalHaplotypeString, traitsToEfoTermsMap, accession, clinicalSignficanceDescription, reviewStatusName, getObservedIn); @@ -389,7 +410,7 @@ private void addNewEntries(VariantAnnotation variantAnnotation, PublicSetType pu Map traitsToEfoTermsMap, String accession, String clinicalSignficanceDescription, String reviewStatusName, List getObservedIn) - throws JsonProcessingException { + throws JsonProcessingException { List additionalProperties = new ArrayList<>(3); EvidenceSource evidenceSource = new EvidenceSource(EtlCommons.CLINVAR_DATA, CLINVAR_VERSION, CLINVAR_DATE); @@ -545,7 +566,7 @@ private ModeOfInheritance getModeOfInheritance(String modeOfInheritance) { private List getGenomicFeature(PublicSetType publicSet, String alleleId) { if (publicSet.getReferenceClinVarAssertion().getMeasureSet() != null) { return getGenomicFeature(publicSet.getReferenceClinVarAssertion().getMeasureSet()); - // No measureSet means there must be genotypeSet + // No measureSet means there must be genotypeSet } else if (publicSet.getReferenceClinVarAssertion().getGenotypeSet() != null) { for (MeasureSetType measureSet : publicSet.getReferenceClinVarAssertion().getGenotypeSet().getMeasureSet()) { if (measureSet.getMeasure() != null) { @@ -597,7 +618,7 @@ private List getHeritableTrait(PublicSetType publicSet, Map 0) { logger.warn("ClinVar record found " + publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc() + " with no preferred trait provided. 
Arbitrarily selecting first one: {}", trait.getName() .get(0).getElementValue().getValue()); return trait.getName().get(0).getElementValue().getValue(); - // No trait name provided at all + // No trait name provided at all } else { throw new IllegalArgumentException("ClinVar record found " + publicSet.getReferenceClinVarAssertion().getClinVarAccession().getAcc() diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java index 4a1662cfb1..4e7bbecad5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/CosmicIndexer.java @@ -32,7 +32,7 @@ public class CosmicIndexer extends ClinicalIndexer { private final Path cosmicFile; private final String assembly; - private static final String COSMIC_VERSION = "v95"; + private static final String COSMIC_VERSION = "v99"; public CosmicIndexer(Path cosmicFile, boolean normalize, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java index 2b4f2e4d8b..0fe3b0f115 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/builders/clinical/variant/GwasIndexer.java @@ -31,6 +31,7 @@ import java.nio.file.Path; import java.text.NumberFormat; import java.util.*; +import java.util.stream.Collectors; public class GwasIndexer extends ClinicalIndexer { @@ -46,6 +47,8 @@ public class GwasIndexer extends ClinicalIndexer { private int gwasLinesNotFoundInDbsnp; private int invalidVariantRecords; + private int 
lineCounter = 0; + public GwasIndexer(Path gwasFile, Path dbSnpTabixFile, Path genomeSequenceFilePath, String assembly, RocksDB rdb) throws IOException { super(genomeSequenceFilePath); @@ -56,36 +59,31 @@ public GwasIndexer(Path gwasFile, Path dbSnpTabixFile, Path genomeSequenceFilePa } public void index() throws RocksDBException, IOException { - logger.info("Parsing GWAS catalog file ..."); - - BufferedReader inputReader = null; - TabixReader dbsnpTabixReader = null; - - try { - logger.info("Opening GWAS catalog file " + gwasFile + " ..."); - inputReader = new BufferedReader(new FileReader(gwasFile.toFile())); + try (BufferedReader inputReader = new BufferedReader(new FileReader(gwasFile.toFile())); + TabixReader dbsnpTabixReader = new TabixReader(dbSnpTabixFile.toString())) { logger.info("Ignoring GWAS catalog file header line ..."); - String line = inputReader.readLine(); + inputReader.readLine(); + ++lineCounter; + Map chromosomeMap = buildChromosomeMap(dbsnpTabixReader); Map gwasMap = new HashMap<>(); - logger.info("Opening dbSNP tabix file " + dbSnpTabixFile + " ..."); - dbsnpTabixReader = new TabixReader(dbSnpTabixFile.toString()); long processedGwasLines = 0; - logger.info("Parsing GWAS catalog file ..."); + logger.info("Parsing GWAS catalog file {} ...", gwasFile); + String line; while ((line = inputReader.readLine()) != null) { + ++lineCounter; if (!line.isEmpty()) { processedGwasLines++; if (processedGwasLines % 10000 == 0) { logger.info("{} lines parsed", processedGwasLines); } - processGwasCatalogLine(line.split("\t"), dbsnpTabixReader, gwasMap); + processGwasCatalogLine(line.split("\t"), dbsnpTabixReader, gwasMap, chromosomeMap); } } - dbsnpTabixReader.close(); logger.info("Updating clinical variant annotation..."); long counter = 0; @@ -118,16 +116,9 @@ public void index() throws RocksDBException, IOException { rdb.put(entry.getKey().getBytes(), jsonObjectWriter.writeValueAsBytes(variantAnnotation)); } this.printSummary(processedGwasLines, gwasMap); 
- } catch (RocksDBException | IOException e) { + } catch (RocksDBException | IOException e) { logger.error("Error reading/writing from/to the RocksDB index while indexing GWAS catalog file"); throw e; - } finally { - if (inputReader != null) { - inputReader.close(); - } - if (dbsnpTabixReader != null) { - dbsnpTabixReader.close(); - } } } @@ -184,13 +175,14 @@ significant digit (for example, a published p-value of 4.8 x 10-7 is rounded to 37 GENOTYPING_TECHNOLOGY* +: Genotyping technology/ies used in this study, with additional array information (ex. Immunochip or Exome array) in brackets. */ - private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReader, Map gwasMap) { + private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReader, Map gwasMap, + Map chromosomeMap) throws IOException { Integer start = parseStart(values); if (start != null) { String chromosome = parseChromosome(values[11]); if (StringUtils.isNotEmpty(chromosome)) { String snpId = "rs" + values[23].trim(); - String[] refAndAlt = getRefAndAltFromDbsnp(chromosome, start, snpId, dbsnpTabixReader); + String[] refAndAlt = getRefAndAltFromDbsnp(chromosome, start, snpId, dbsnpTabixReader, chromosomeMap); if (refAndAlt != null) { // Create variant Variant variant; @@ -270,21 +262,27 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade // Scores management GwasAssociationStudyTraitScores scores = new GwasAssociationStudyTraitScores(); - try { - scores.setPValue(Double.parseDouble(values[27])); - } catch (NumberFormatException e) { -// logger.warn(e.getMessage() + ". Parsing pValue: " + values[27]); + if (StringUtils.isNotEmpty(values[27])) { + try { + scores.setPValue(Double.parseDouble(values[27])); + } catch (NumberFormatException e) { + logger.warn(e.getMessage() + ". 
Parsing pValue: " + values[27]); + } } - try { - scores.setPValueMlog(Double.parseDouble(values[28])); - } catch (NumberFormatException e) { -// logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]); + if (StringUtils.isNotEmpty(values[28])) { + try { + scores.setPValueMlog(Double.parseDouble(values[28])); + } catch (NumberFormatException e) { + logger.warn(e.getMessage() + ". Parsing pValue mlog: " + values[28]); + } } scores.setPValueText(values[29]); - try { - scores.setOrBeta(Double.parseDouble(values[30])); - } catch (NumberFormatException e) { -// logger.warn(e.getMessage() + ". Parsing Odd or beta: " + values[30]); + if (StringUtils.isNotEmpty(values[30])) { + try { + scores.setOrBeta(Double.parseDouble(values[30])); + } catch (NumberFormatException e) { + logger.warn(e.getMessage() + ". Parsing Odd or beta: " + values[30]); + } } scores.setPercentCI(values[31]); @@ -301,15 +299,15 @@ private void processGwasCatalogLine(String[] values, TabixReader dbsnpTabixReade gwasMap.put(key, gwas); } } else { -// logger.warn("Variant not found in dbSNP " + snpId + ". Line: " + StringUtils.join(values, "\t\t\t")); + logger.warn("dbSNP {} not found. Line: {}", snpId, lineCounter); gwasLinesNotFoundInDbsnp++; } } else { -// logger.warn("Invalid chromosome " + chromosome + ". Line: " + StringUtils.join(values, "\t\t\t")); + logger.warn("Invalid chromosome {}. Line: {}", chromosome, lineCounter); invalidChromosome++; } } else { -// logger.warn("Invalid position " + start + ". Line: " + StringUtils.join(values, "\t\t\t")); + logger.warn("Invalid position {}. 
Line: {}", start, lineCounter); invalidStartRecords++; } } @@ -342,6 +340,39 @@ private String parseChromosome(String chromosome) { return transformedChromosome; } + private Map buildChromosomeMap(TabixReader dbsnpTabixReader) { + List chroms = dbsnpTabixReader.getChromosomes().stream().filter(name -> name.startsWith("NC_")) + .collect(Collectors.toList()); + + Map chromMap = new HashMap<>(); + for (int i = 1; i < 22; i++) { + chromMap.put(Integer.toString(i), Integer.toString(i)); + } + chromMap.put("X", "X"); + chromMap.put("Y", "Y"); + chromMap.put("MT", "MT"); + + for (String chrom : chroms) { + String[] split = chrom.split("[_\\.]"); + int value = Integer.parseInt(split[1]); + switch (value) { + case 23: + chromMap.put("X", chrom); + break; + case 24: + chromMap.put("Y", chrom); + break; + case 12920: + chromMap.put("MT", chrom); + break; + default: + chromMap.put(Integer.toString(value), chrom); + break; + } + } + return chromMap; + } + private Float parseFloat(String value) { Float riskAlleleFrequency = null; if (NumberUtils.isNumber(value)) { @@ -350,29 +381,33 @@ private Float parseFloat(String value) { return riskAlleleFrequency; } - private String[] getRefAndAltFromDbsnp(String chromosome, Integer start, String snpId, TabixReader dbsnpTabixReader) { + private String[] getRefAndAltFromDbsnp(String chromosome, Integer start, String snpId, TabixReader dbsnpTabixReader, + Map chromosomeMap) throws IOException { + boolean found = false; + Set foundSnpIds = new HashSet<>(); String[] refAndAlt = null; - TabixReader.Iterator dbsnpIterator = dbsnpTabixReader.query(chromosome + ":" + start + "-" + start); - try { - String dbSnpRecord = dbsnpIterator.next(); - boolean found = false; - while (dbSnpRecord != null && !found) { - String[] dbsnpFields = dbSnpRecord.split("\t"); - - if (snpId.equalsIgnoreCase(dbsnpFields[2])) { - refAndAlt = new String[2]; - refAndAlt[REF] = dbsnpFields[3]; - refAndAlt[ALT] = dbsnpFields[4]; - found = true; - } - - dbSnpRecord = 
dbsnpIterator.next(); + String query = chromosomeMap.get(chromosome) + ":" + start + "-" + start; + TabixReader.Iterator dbsnpIterator = dbsnpTabixReader.query(query); + String dbSnpRecord = null; + dbSnpRecord = dbsnpIterator.next(); + while (dbSnpRecord != null && !found) { + String[] dbsnpFields = dbSnpRecord.split("\t"); + + if (snpId.equalsIgnoreCase(dbsnpFields[2])) { + refAndAlt = new String[2]; + refAndAlt[REF] = dbsnpFields[3]; + refAndAlt[ALT] = dbsnpFields[4]; + found = true; + } else { + foundSnpIds.add(dbsnpFields[2]); } - } catch (IOException e) { - logger.warn("Error reading position '" + chromosome + ":" + start + "' in dbSNP: " + e.getMessage()); - } + dbSnpRecord = dbsnpIterator.next(); + } + if (!found) { + logger.warn("dbSNP {} not found from query {}. Found: {}", snpId, query, foundSnpIds); + } return refAndAlt; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java index 580a855a19..bb9e0c36e4 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ClinicalDownloadManager.java @@ -55,12 +55,7 @@ public List download() throws IOException, InterruptedException { public List downloadClinical() throws IOException, InterruptedException { if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { -// if (assemblyConfiguration.getName() == null) { -// throw new ParameterException("Assembly must be provided for downloading clinical variants data." 
-// + " Please, specify either --assembly GRCh37 or --assembly GRCh38"); -// } - - logger.info("Downloading clinical information ..."); + logger.info("Downloading clinical variant information ..."); String url; List downloadFiles = new ArrayList<>(); @@ -86,8 +81,10 @@ public List downloadClinical() throws IOException, InterruptedExce url = configuration.getDownload().getClinvarVariationAllele().getHost(); downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.CLINVAR_VARIATION_ALLELE_FILE).toString())); clinvarUrls.add(url); - saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, getClinVarVersion(), getTimeStamp(), clinvarUrls, - clinicalFolder.resolve("clinvarVersion.json")); + saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, CLINVAR_NAME, configuration.getDownload().getClinvar() + .getVersion(), getTimeStamp(), clinvarUrls, clinicalFolder.resolve("clinvarVersion.json")); + + logger.info("\t\tDone"); // Gwas catalog logger.info("\t\tDownloading GWAS catalog file ..."); @@ -96,6 +93,7 @@ public List downloadClinical() throws IOException, InterruptedExce downloadFiles.add(downloadFile(url, clinicalFolder.resolve(EtlCommons.GWAS_FILE).toString())); saveVersionData(EtlCommons.CLINICAL_VARIANTS_DATA, GWAS_NAME, gwasCatalog.getVersion(), getTimeStamp(), Collections.singletonList(url), clinicalFolder.resolve("gwasVersion.json")); + logger.info("\t\tDone"); // List hgvsList = getDocmHgvsList(); // if (!hgvsList.isEmpty()) { @@ -241,10 +239,4 @@ private List getDocmHgvsList() throws IOException { return hgvsList; } - - private String getClinVarVersion() { - // ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_2015-12.xml.gz - return configuration.getDownload().getClinvar().getHost().split("_")[1].split("\\.")[0]; - } - } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java index 17022cae4b..0deb62386b 100644 --- 
a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/Downloader.java @@ -64,6 +64,11 @@ public List downloadConservation() throws IOException, CellBaseExc return manager.downloadConservation(); } + public List downloadVariation() throws IOException, CellBaseException, InterruptedException { + VariationDownloadManager manager = new VariationDownloadManager(species, assembly, outputDirectory, configuration); + return manager.download(); + } + public List downloadClinicalVariants() throws IOException, CellBaseException, InterruptedException { ClinicalDownloadManager manager = new ClinicalDownloadManager(species, assembly, outputDirectory, configuration); return manager.download(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java index 260ff75427..9d2685eadf 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GeneDownloadManager.java @@ -82,8 +82,6 @@ public List download() throws IOException, InterruptedException { downloadFiles.addAll(downloadRefSeq(refseqFolder)); downloadFiles.add(downloadMane(geneFolder)); downloadFiles.add(downloadLrg(geneFolder)); - downloadFiles.add(downloadHgnc(geneFolder)); - downloadFiles.add(downloadCancerHotspot(geneFolder)); downloadFiles.add(downloadDrugData(geneFolder)); downloadFiles.addAll(downloadGeneUniprotXref(geneFolder)); downloadFiles.add(downloadGeneExpressionAtlas(geneFolder)); @@ -210,30 +208,6 @@ private DownloadFile downloadLrg(Path geneFolder) throws IOException, Interrupte return null; } - private DownloadFile downloadHgnc(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading HGNC 
..."); - String url = configuration.getDownload().getHgnc().getHost(); - saveVersionData(EtlCommons.GENE_DATA, "HGNC_GENE", configuration.getDownload().getHgnc().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("hgncVersion.json")); - String[] array = url.split("/"); - return downloadFile(url, geneFolder.resolve(array[array.length - 1]).toString()); - } - return null; - } - - private DownloadFile downloadCancerHotspot(Path geneFolder) throws IOException, InterruptedException { - if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { - logger.info("Downloading Cancer Hotspot ..."); - String url = configuration.getDownload().getCancerHotspot().getHost(); - saveVersionData(EtlCommons.GENE_DATA, "CANCER_HOTSPOT", configuration.getDownload().getHgnc().getVersion(), - getTimeStamp(), Collections.singletonList(url), geneFolder.resolve("cancerHotspotVersion.json")); - String[] array = url.split("/"); - return downloadFile(url, geneFolder.resolve(array[array.length - 1]).toString()); - } - return null; - } - private DownloadFile downloadGO(Path geneFolder) throws IOException, InterruptedException { if (speciesConfiguration.getScientificName().equals("Homo sapiens")) { logger.info("Downloading go annotation..."); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java index 0ba9f39db4..5a0609867f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/GenomeDownloadManager.java @@ -47,11 +47,11 @@ public GenomeDownloadManager(String species, String assembly, Path targetDirecto public List download() throws IOException, InterruptedException { List downloadFiles = new ArrayList<>(); downloadFiles.addAll(downloadReferenceGenome()); - downloadFiles.addAll(downloadConservation()); - 
downloadFiles.addAll(downloadRepeats()); +// downloadFiles.addAll(downloadConservation()); +// downloadFiles.addAll(downloadRepeats()); // cytobands -// runGenomeInfo(); + runGenomeInfo(); return downloadFiles; } @@ -115,16 +115,16 @@ public List downloadConservation() throws IOException, Interrupted List phastconsUrls = new ArrayList<>(chromosomes.length); List phyloPUrls = new ArrayList<>(chromosomes.length); for (String chromosome : chromosomes) { - String phastConsUrl = url + "/phastCons470way/hg38.470way.phastCons/chr" + chromosome - + ".phastCons470way.wigFix.gz"; + String phastConsUrl = url + "/phastCons100way/hg38.100way.phastCons/chr" + chromosome + + ".phastCons100way.wigFix.gz"; downloadFiles.add(downloadFile(phastConsUrl, conservationFolder.resolve("phastCons") - .resolve("chr" + chromosome + ".phastCons470way.wigFix.gz").toString())); + .resolve("chr" + chromosome + ".phastCons100way.wigFix.gz").toString())); phastconsUrls.add(phastConsUrl); - String phyloPUrl = url + "/phyloP470way/hg38.470way.phyloP/chr" + chromosome - + ".phyloP470way.wigFix.gz"; + String phyloPUrl = url + "/phyloP100way/hg38.100way.phyloP100way/chr" + chromosome + + ".phyloP100way.wigFix.gz"; downloadFiles.add(downloadFile(phyloPUrl, conservationFolder.resolve("phylop") - .resolve("chr" + chromosome + ".phyloP470way.wigFix.gz").toString())); + .resolve("chr" + chromosome + ".phyloP100way.wigFix.gz").toString())); phyloPUrls.add(phyloPUrl); } String gerpUrl = configuration.getDownload().getGerp().getHost(); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java index 522be7b27d..0776354e80 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/OntologyDownloadManager.java @@ -36,7 +36,7 @@ public OntologyDownloadManager(String species, 
String assembly, Path targetDirec public List download() throws IOException, InterruptedException { - logger.info("Downloading OBO files ..."); + logger.info("Downloading obo files ..."); List downloadFiles = new ArrayList<>(); Path oboFolder = downloadFolder.resolve("ontology"); @@ -44,22 +44,20 @@ public List download() throws IOException, InterruptedException { String url = configuration.getDownload().getHpoObo().getHost(); downloadFiles.add(downloadFile(url, oboFolder.resolve("hp.obo").toString())); + saveVersionData(EtlCommons.OBO_DATA, "HPO", getTimeStamp(), getTimeStamp(), Collections.singletonList(url), buildFolder.resolve(EtlCommons.HPO_VERSION_FILE)); url = configuration.getDownload().getGoObo().getHost(); downloadFiles.add(downloadFile(url, oboFolder.resolve("go-basic.obo").toString())); + saveVersionData(EtlCommons.OBO_DATA, "GO", getTimeStamp(), getTimeStamp(), Collections.singletonList(url), buildFolder.resolve(EtlCommons.GO_VERSION_FILE)); url = configuration.getDownload().getDoidObo().getHost(); downloadFiles.add(downloadFile(url, oboFolder.resolve("doid.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(), - Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE)); - url = configuration.getDownload().getMondoObo().getHost(); - downloadFiles.add(downloadFile(url, oboFolder.resolve("mondo.obo").toString())); - saveVersionData(EtlCommons.OBO_DATA, "MONDO", getTimeStamp(), getTimeStamp(), + saveVersionData(EtlCommons.OBO_DATA, "DO", getTimeStamp(), getTimeStamp(), Collections.singletonList(url), buildFolder.resolve(EtlCommons.DO_VERSION_FILE)); return downloadFiles; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java index 5a722ed448..08f28cfdad 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/ProteinDownloadManager.java @@ -22,6 +22,7 @@ import org.opencb.commons.utils.FileUtils; import java.io.BufferedReader; +import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; import java.nio.file.Files; @@ -33,8 +34,6 @@ public class ProteinDownloadManager extends AbstractDownloadManager { private static final String UNIPROT_NAME = "UniProt"; - private static final String INTERPRO_NAME = "InterPro"; - private static final String INTACT_NAME = "IntAct"; public ProteinDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) throws IOException, CellBaseException { @@ -57,7 +56,6 @@ public List download() throws IOException, InterruptedException { Files.createDirectories(proteinFolder); List downloadFiles = new ArrayList<>(); - // Uniprot String url = configuration.getDownload().getUniprot().getHost(); downloadFiles.add(downloadFile(url, proteinFolder.resolve("uniprot_sprot.xml.gz").toString())); Files.createDirectories(proteinFolder.resolve("uniprot_chunks")); @@ -65,25 +63,23 @@ public List download() throws IOException, InterruptedException { String relNotesUrl = configuration.getDownload().getUniprotRelNotes().getHost(); downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("uniprotRelnotes.txt").toString())); + saveVersionData(EtlCommons.PROTEIN_DATA, UNIPROT_NAME, getLine(proteinFolder.resolve("uniprotRelnotes.txt"), 1), getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("uniprotVersion.json")); - // Interpro - String interproUrl = configuration.getDownload().getInterpro().getHost(); - downloadFiles.add(downloadFile(interproUrl, proteinFolder.resolve("protein2ipr.dat.gz").toString())); - - relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost(); - downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString())); - 
saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5), - getTimeStamp(), Collections.singletonList(interproUrl), proteinFolder.resolve("interproVersion.json")); - - // Intact - String intactUrl = configuration.getDownload().getIntact().getHost(); - downloadFiles.add(downloadFile(intactUrl, proteinFolder.resolve("intact.txt").toString())); - saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, configuration.getDownload().getIntact().getVersion(), - getTimeStamp(), Collections.singletonList(intactUrl), proteinFolder.resolve("intactVersion.json")); - return downloadFiles; + +// url = configuration.getDownload().getIntact().getHost(); +// downloadFile(url, proteinFolder.resolve("intact.txt").toString()); +// saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, null, getTimeStamp(), Collections.singletonList(url), +// proteinFolder.resolve("intactVersion.json")); +// +// url = configuration.getDownload().getInterpro().getHost(); +// downloadFile(url, proteinFolder.resolve("protein2ipr.dat.gz").toString()); +// relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost(); +// downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString()); +// saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5), +// getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("interproVersion.json")); } private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException { @@ -100,7 +96,7 @@ private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOE inEntry = true; beforeEntry = false; if (count % 10000 == 0) { - pw = new PrintWriter(Files.newOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile().toPath())); + pw = new PrintWriter(new FileOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile())); pw.println(header.toString().trim()); } count++; diff 
--git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java index 51152e478d..1abb352fbe 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/RegulationDownloadManager.java @@ -64,8 +64,8 @@ public List download() throws IOException, InterruptedException, N List downloadFiles = new ArrayList<>(); downloadFiles.addAll(downloadRegulatoryaAndMotifFeatures()); - downloadFiles.add(downloadMiRTarBase()); downloadFiles.add(downloadMirna()); + downloadFiles.add(downloadMiRTarBase()); return downloadFiles; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java new file mode 100644 index 0000000000..7586505d21 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/download/VariationDownloadManager.java @@ -0,0 +1,66 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.download; + +import org.opencb.cellbase.core.config.CellBaseConfiguration; +import org.opencb.cellbase.core.config.DownloadProperties; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.lib.EtlCommons; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.List; + +import static org.opencb.cellbase.lib.EtlCommons.*; + +public class VariationDownloadManager extends AbstractDownloadManager { + + public VariationDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration) + throws IOException, CellBaseException { + super(species, assembly, targetDirectory, configuration); + } + + @Override + public List download() throws IOException, InterruptedException { + return Collections.singletonList(downloadDbSnp()); + } + + public DownloadFile downloadDbSnp() throws IOException, InterruptedException { + if (!speciesHasInfoToDownload(speciesConfiguration, VARIATION_DATA)) { + return null; + } + if (speciesConfiguration.getScientificName().equals(EtlCommons.HOMO_SAPIENS_NAME)) { + logger.info("Downloading dbSNP information ..."); + + Path variation = downloadFolder.resolve(VARIATION_DATA); + Files.createDirectories(variation); + + DownloadProperties.URLProperties dbSNP = configuration.getDownload().getDbSNP(); + String url = dbSNP.getHost(); + saveVersionData(VARIATION_DATA, DBSNP_NAME, dbSNP.getVersion(), getTimeStamp(), + Collections.singletonList(url), variation.resolve(DBSNP_VERSION_FILENAME)); + + Path outPath = variation.resolve(Paths.get(url).getFileName()); + logger.info("Downloading {} to {} ...", url, outPath); + return downloadFile(url, outPath.toString()); + } + return null; + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java index 5d7dbc65d0..e5cd4d38cc 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MetaMongoDBAdaptor.java @@ -16,8 +16,6 @@ package org.opencb.cellbase.lib.impl.core; - -import com.fasterxml.jackson.databind.ObjectMapper; import com.mongodb.ReadPreference; import com.mongodb.WriteConcern; import com.mongodb.client.model.Filters; @@ -25,6 +23,7 @@ import org.bson.BsonDocument; import org.bson.Document; import org.bson.conversions.Bson; +import org.codehaus.jackson.map.ObjectMapper; import org.opencb.cellbase.core.api.key.ApiKeyStats; import org.opencb.cellbase.core.api.query.AbstractQuery; import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java index e120e0ae51..8912840bd5 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/MongoDBAdaptorFactory.java @@ -95,6 +95,10 @@ public PharmacogenomicsMongoDBAdaptor getPharmacogenomicsMongoDBAdaptor() { return new PharmacogenomicsMongoDBAdaptor(mongoDatastore); } + public SnpMongoDBAdaptor getSnpDBAdaptor() { + return new SnpMongoDBAdaptor(mongoDatastore); + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("MongoDBAdaptorFactory{"); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/OntologyMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/OntologyMongoDBAdaptor.java index cacf8457d5..f1e664a508 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/OntologyMongoDBAdaptor.java +++ 
b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/OntologyMongoDBAdaptor.java @@ -85,11 +85,6 @@ public List> info(List ids, ProjectionQ return results; } - @Override - public CellBaseDataResult count(OntologyQuery query) { - return null; - } - @Override public CellBaseDataResult distinct(OntologyQuery query) throws CellBaseException { Bson bsonDocument = parseQuery(query); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PublicationMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PublicationMongoDBAdaptor.java index a279f07653..5c8fcb571f 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PublicationMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/PublicationMongoDBAdaptor.java @@ -86,11 +86,6 @@ public List> info(List ids, Projection return results; } - @Override - public CellBaseDataResult count(PublicationQuery query) { - return null; - } - @Override public CellBaseDataResult distinct(PublicationQuery query) throws CellBaseException { Bson bsonDocument = parseQuery(query); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/SnpMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/SnpMongoDBAdaptor.java new file mode 100644 index 0000000000..6b3d78ce83 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/SnpMongoDBAdaptor.java @@ -0,0 +1,158 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.impl.core; + +import com.mongodb.client.model.Filters; +import com.mongodb.client.model.Projections; +import org.bson.Document; +import org.bson.conversions.Bson; +import org.opencb.biodata.models.core.Snp; +import org.opencb.cellbase.core.api.SnpQuery; +import org.opencb.cellbase.core.api.query.ProjectionQueryOptions; +import org.opencb.cellbase.core.exception.CellBaseException; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.iterator.CellBaseIterator; +import org.opencb.cellbase.lib.iterator.CellBaseMongoDBIterator; +import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.commons.datastore.core.QueryParam; +import org.opencb.commons.datastore.mongodb.GenericDocumentComplexConverter; +import org.opencb.commons.datastore.mongodb.MongoDBCollection; +import org.opencb.commons.datastore.mongodb.MongoDBIterator; +import org.opencb.commons.datastore.mongodb.MongoDataStore; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.regex.Pattern; + +import static org.opencb.cellbase.core.ParamConstants.API_KEY_PARAM; +import static org.opencb.cellbase.core.ParamConstants.DATA_RELEASE_PARAM; + +public class SnpMongoDBAdaptor extends CellBaseDBAdaptor implements CellBaseCoreDBAdaptor { + + private static final GenericDocumentComplexConverter CONVERTER; + + static { + CONVERTER = new GenericDocumentComplexConverter<>(Snp.class); + } + + public SnpMongoDBAdaptor(MongoDataStore mongoDataStore) { + super(mongoDataStore); + + this.init(); + } + + private void init() { + logger.debug("SnpMongoDBAdaptor: in 'constructor'"); + + mongoDBCollectionByRelease = buildCollectionByReleaseMap("snp"); + } + + @Override + public CellBaseIterator iterator(SnpQuery query) throws CellBaseException { + Bson bson = parseQuery(query); + Bson projection = 
getProjection(query); + QueryOptions queryOptions = query.toQueryOptions(); + + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, query.getDataRelease()); + MongoDBIterator iterator = mongoDBCollection.iterator(null, bson, projection, CONVERTER, queryOptions); + return new CellBaseMongoDBIterator<>(iterator); + } + + @Override + public List> info(List ids, ProjectionQueryOptions queryOptions, int dataRelease, String apiKey) + throws CellBaseException { + List> results = new ArrayList<>(); + Bson projection = getProjection(queryOptions); + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease); + for (String id : ids) { + List orBsonList = new ArrayList<>(); + orBsonList.add(Filters.eq("id", id)); + Bson bson = Filters.or(orBsonList); + results.add(new CellBaseDataResult<>(mongoDBCollection.find(bson, projection, CONVERTER, new QueryOptions()))); + } + return results; + } + + @Override + public CellBaseDataResult distinct(SnpQuery query) throws CellBaseException { + Bson bsonQuery = parseQuery(query); + logger.info("snpQuery distinct: {}", bsonQuery.toBsonDocument().toJson()); + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, query.getDataRelease()); + return new CellBaseDataResult<>(mongoDBCollection.distinct(query.getFacet(), bsonQuery, String.class)); + } + + @Override + public CellBaseDataResult aggregationStats(SnpQuery query) { + return null; + } + + @Override + public CellBaseDataResult groupBy(SnpQuery query) throws CellBaseException { + Bson bsonQuery = parseQuery(query); + logger.info("snpQuery groupBy: {}", bsonQuery.toBsonDocument().toJson()); + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, query.getDataRelease()); + return groupBy(bsonQuery, query, "name", mongoDBCollection); + } + + public CellBaseDataResult startsWith(String id, QueryOptions options, int dataRelease) throws 
CellBaseException { + Bson regex = Filters.regex("id", Pattern.compile("^" + id)); + Bson projection = null; + if (options.containsKey(QueryOptions.INCLUDE)) { + projection = Projections.include(options.getAsStringList(QueryOptions.INCLUDE)); + } else if (options.containsKey(QueryOptions.EXCLUDE)) { + projection = Projections.exclude(options.getAsStringList(QueryOptions.EXCLUDE)); + } + + MongoDBCollection mongoDBCollection = getCollectionByRelease(mongoDBCollectionByRelease, dataRelease); + return new CellBaseDataResult<>(mongoDBCollection.find(regex, projection, CONVERTER, options)); + } + + public Bson parseQuery(SnpQuery query) throws CellBaseException { + List andBsonList = new ArrayList<>(); + try { + for (Map.Entry entry : query.toObjectMap().entrySet()) { + String dotNotationName = entry.getKey(); + Object value = entry.getValue(); + switch (dotNotationName) { + case "position": { + createAndOrQuery(value, dotNotationName, QueryParam.Type.INTEGER, andBsonList); + break; + } + case DATA_RELEASE_PARAM: + case API_KEY_PARAM: { + // Do nothing + break; + } + default: { + createAndOrQuery(value, dotNotationName, QueryParam.Type.STRING, andBsonList); + break; + } + } + } + } catch (IllegalAccessException e) { + throw new CellBaseException("Error parsing SNP query: " + query, e); + } + + logger.info("SnpMongoDBAdaptor parsed query: {}", andBsonList); + if (!andBsonList.isEmpty()) { + return Filters.and(andBsonList); + } else { + return new Document(); + } + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/VariantMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/VariantMongoDBAdaptor.java index fc4b602cd9..3c33266f7e 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/VariantMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/VariantMongoDBAdaptor.java @@ -18,15 +18,20 @@ import com.mongodb.bulk.BulkWriteResult; import 
com.mongodb.client.model.Filters; +import com.mongodb.client.model.Projections; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.bson.Document; import org.bson.conversions.Bson; import org.opencb.biodata.models.core.GenomicScoreRegion; import org.opencb.biodata.models.core.Region; +import org.opencb.biodata.models.core.Snp; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.Score; import org.opencb.biodata.models.variant.avro.StructuralVariantType; import org.opencb.biodata.models.variant.avro.VariantType; +import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField; +import org.opencb.biodata.tools.variant.VariantNormalizer; import org.opencb.cellbase.core.ParamConstants; import org.opencb.cellbase.core.api.VariantQuery; import org.opencb.cellbase.core.api.query.CellBaseQueryOptions; @@ -40,6 +45,7 @@ import org.opencb.cellbase.lib.iterator.CellBaseIterator; import org.opencb.cellbase.lib.iterator.CellBaseMongoDBIterator; import org.opencb.cellbase.lib.iterator.VariantMongoDBIterator; +import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.datastore.core.QueryParam; @@ -50,6 +56,7 @@ import java.util.*; import java.util.function.Consumer; +import static org.opencb.cellbase.core.ParamConstants.API_KEY_PARAM; import static org.opencb.cellbase.core.ParamConstants.DATA_RELEASE_PARAM; import static org.opencb.cellbase.lib.MongoDBCollectionConfiguration.VARIATION_FUNCTIONAL_SCORE_CHUNK_SIZE; @@ -68,6 +75,7 @@ public class VariantMongoDBAdaptor extends CellBaseDBAdaptor implements CellBase private Map caddDBCollectionByRelease; + private Map snpDBCollectionByRelease; public VariantMongoDBAdaptor(MongoDataStore mongoDataStore) { super(mongoDataStore); @@ -80,6 +88,7 @@ private void init() { mongoDBCollectionByRelease 
= buildCollectionByReleaseMap("variation"); caddDBCollectionByRelease = buildCollectionByReleaseMap("variation_functional_score"); + snpDBCollectionByRelease = buildCollectionByReleaseMap("snp"); } public CellBaseDataResult next(Query query, QueryOptions options) { @@ -106,10 +115,10 @@ public CellBaseDataResult update(List objectList, String field, String[] i CellBaseDataResult nLoadedObjects = null; switch (field) { case POP_FREQUENCIES_FIELD: - nLoadedObjects = updatePopulationFrequencies((List) objectList, dataRelease); + nLoadedObjects = updatePopulationFrequencies(objectList, dataRelease); break; case ANNOTATION_FIELD: - nLoadedObjects = updateAnnotation((List) objectList, innerFields, dataRelease); + nLoadedObjects = updateAnnotation(objectList, innerFields, dataRelease); break; default: logger.error("Invalid field {}: no action implemented for updating this field.", field); @@ -207,7 +216,7 @@ public CellBaseDataResult groupBy(Query query, List fields, QueryOptions } @Deprecated - private Bson parseQuery(Query query) { + private Bson parseQuery(Query query) throws CellBaseException { List andBsonList = new ArrayList<>(); createOrQuery(query, ParamConstants.QueryParams.CHROMOSOME.key(), "chromosome", andBsonList); @@ -221,7 +230,12 @@ private Bson parseQuery(Query query) { } createRegionQuery(query, ParamConstants.QueryParams.REGION.key(), MongoDBCollectionConfiguration.VARIATION_CHUNK_SIZE, andBsonList); - createOrQuery(query, ParamConstants.QueryParams.ID.key(), "id", andBsonList); + + if (StringUtils.isNotEmpty(query.getString(ParamConstants.QueryParams.ID.key()))) { + List variantIds = getVariantIds(query.getAsStringList(ParamConstants.QueryParams.ID.key()), + query.getInt(DATA_RELEASE_PARAM)); + createOrQuery(variantIds, "id", andBsonList); + } createImprecisePositionQuery(query, ParamConstants.QueryParams.CI_START_LEFT.key(), ParamConstants.QueryParams.CI_START_RIGHT.key(), @@ -236,23 +250,29 @@ private Bson parseQuery(Query query) { 
"annotation.consequenceTypes.sequenceOntologyTerms.name", andBsonList); createGeneOrQuery(query, ParamConstants.QueryParams.GENE.key(), andBsonList); - if (andBsonList.size() > 0) { + if (!andBsonList.isEmpty()) { return Filters.and(andBsonList); } else { return new Document(); } } - public Bson parseQuery(VariantQuery query) { + public Bson parseQuery(VariantQuery query) throws CellBaseException { List andBsonList = new ArrayList<>(); try { for (Map.Entry entry : query.toObjectMap().entrySet()) { String dotNotationName = entry.getKey(); Object value = entry.getValue(); switch (dotNotationName) { + case "id": + // Both variant IDs and dbSNP IDs are allowed + List variantIds = getVariantIds(Arrays.asList(query.getId().split(",")), query.getDataRelease()); + createAndOrQuery(variantIds, dotNotationName, QueryParam.Type.STRING, andBsonList); + break; case "region": createRegionQuery(query, query.getRegions(), MongoDBCollectionConfiguration.VARIATION_CHUNK_SIZE, andBsonList); break; + case API_KEY_PARAM: case DATA_RELEASE_PARAM: case "svType": // don't do anything, this is parsed later @@ -265,11 +285,13 @@ public Bson parseQuery(VariantQuery query) { break; case "ciStartLeft": createImprecisePositionQueryStart(query, andBsonList); + break; case "ciEndRight": // don't do anything, this is parsed later break; case "ciEndLeft": createImprecisePositionQueryEnd(query, andBsonList); + break; case "gene": createGeneOrQuery(query, andBsonList); break; @@ -283,11 +305,11 @@ public Bson parseQuery(VariantQuery query) { } } } catch (IllegalAccessException e) { - e.printStackTrace(); + throw new CellBaseException("Error parsing variant query: " + query, e); } - logger.debug("variant parsed query: " + andBsonList.toString()); - if (andBsonList.size() > 0) { + logger.debug("variant parsed query: {}", andBsonList); + if (!andBsonList.isEmpty()) { return Filters.and(andBsonList); } else { return new Document(); @@ -339,7 +361,7 @@ private void createTypeQuery(VariantQuery query, 
String typeMongoField, String s andBsonList.add(Filters.or(orBsonList)); // Inversion or just CNV (without subtype) } else { - andBsonList.add(Filters.eq(typeMongoField, variantTypeString.toString())); + andBsonList.add(Filters.eq(typeMongoField, variantTypeString)); } } } @@ -430,7 +452,7 @@ private CellBaseDataResult updateAnnotation(List variantDocument for (Document variantDBObject : variantDocumentList) { Document annotationDBObject = (Document) variantDBObject.get(ANNOTATION_FIELD); Document toOverwrite = new Document(); - if (innerFields != null & innerFields.length > 0) { + if (innerFields != null && innerFields.length > 0) { for (String field : innerFields) { if (annotationDBObject.get(field) != null) { toOverwrite.put(ANNOTATION_FIELD + "." + field, annotationDBObject.get(field)); @@ -569,21 +591,11 @@ public CellBaseDataResult getFunctionalScoreVariant(Variant variant, Quer if (position >= chunkStart && position <= chunkEnd) { int offset = (position - chunkStart); ArrayList basicDBList = dbObject.get("values", ArrayList.class); - -// long l1 = 0L; // TODO: delete -// try { // TODO: delete long l1 = Long.parseLong(basicDBList.get(offset).toString()); -// l1 = (Long) basicDBList.get(offset); -// } catch (Exception e) { // TODO: delete -// logger.error("problematic variant: {}", variant.toString()); -// throw e; -// } - if (dbObject.getString("source").equalsIgnoreCase("cadd_raw")) { float value = 0f; switch (alternate.toLowerCase()) { case "a": -// value = ((short) (l1 >> 48) - 10000) / DECIMAL_RESOLUTION; value = (((short) (l1 >> 48)) / DECIMAL_RESOLUTION) - 10; break; case "c": @@ -602,7 +614,6 @@ public CellBaseDataResult getFunctionalScoreVariant(Variant variant, Quer .setScore(value) .setSource(dbObject.getString("source")) .setDescription(null) - // .setDescription("") .build()); } @@ -745,13 +756,14 @@ public List> info(List ids, ProjectionQueryO throws CellBaseException { List> results = new ArrayList<>(); MongoDBCollection mongoDBCollection = 
getCollectionByRelease(mongoDBCollectionByRelease, dataRelease); - for (String id : ids) { - Bson projection = getProjection(queryOptions); - List orBsonList = new ArrayList<>(ids.size()); - orBsonList.add(Filters.eq("id", id)); - Bson bson = Filters.or(orBsonList); - results.add(new CellBaseDataResult<>(mongoDBCollection.find(bson, projection, Variant.class, new QueryOptions()))); + Bson projection = getProjection(queryOptions); + List variantIds = getVariantIds(ids, dataRelease); + List orBsonList = new ArrayList<>(variantIds.size()); + for (String variantId : variantIds) { + orBsonList.add(Filters.eq("id", variantId)); } + Bson bson = Filters.or(orBsonList); + results.add(new CellBaseDataResult<>(mongoDBCollection.find(bson, projection, Variant.class, new QueryOptions()))); return results; } @@ -776,6 +788,60 @@ public CellBaseDataResult getFunctionalScoreRegion(List(mongoDBCollection.find(bson, projection, GenomicScoreRegion.class, new QueryOptions())); } + + private List getVariantIds(List ids, int dataRelease) throws CellBaseException { + List variantIds = new ArrayList<>(ids.size()); + List snpIds = new ArrayList<>(); + // Split dbSNP IDs and variant IDs + for (String id : ids) { + if (id.startsWith("rs")) { + snpIds.add(id); + } else { + variantIds.add(id); + } + } + + // Get the variant ID for the dbSNP ID + if (CollectionUtils.isNotEmpty(snpIds)) { + // 1. Prepare the query + List orBsonList = new ArrayList<>(); + for (String snpId : snpIds) { + orBsonList.add(Filters.eq("id", snpId)); + } + Bson query = Filters.or(orBsonList); + + // 2. We must exclude as much information as possible to improve performance + MongoDBCollection mongoDBCollection = getCollectionByRelease(snpDBCollectionByRelease, dataRelease); + DataResult snpDataResult = mongoDBCollection.find(query, Projections.exclude(ANNOTATION_FIELD), Snp.class, + new QueryOptions()); + + // 3. 
Build the variant IDs + Set results = new HashSet<>(); + if (snpDataResult.getNumResults() > 0) { + // Create variant normalizer + VariantNormalizer variantNormalizer = new VariantNormalizer(); + + for (Snp snp : snpDataResult.getResults()) { + for (String alternate : snp.getAlternates()) { + Variant inputVariant = new Variant(snp.getChromosome(), snp.getPosition(), snp.getReference(), alternate); + try { + Variant normalizedVariant = variantNormalizer.normalize(Collections.singletonList(inputVariant), true).get(0); + results.add(normalizedVariant.toString()); + } catch (NonStandardCompliantSampleField e) { + throw new CellBaseException("Error normalizing variant " + inputVariant, e); + } + } + } + } + + // 4. Add new variant IDs, if necessary + if (CollectionUtils.isNotEmpty(results)) { + variantIds.addAll(results); + } + } + + return variantIds; + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java index 28f5c70fa7..670585204d 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/VariantManager.java @@ -16,10 +16,7 @@ package org.opencb.cellbase.lib.managers; -import org.opencb.biodata.models.core.Gene; -import org.opencb.biodata.models.core.GenomicScoreRegion; -import org.opencb.biodata.models.core.Region; -import org.opencb.biodata.models.core.SpliceScore; +import org.opencb.biodata.models.core.*; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantBuilder; import org.opencb.biodata.models.variant.avro.SampleEntry; @@ -27,7 +24,9 @@ import org.opencb.biodata.models.variant.avro.VariantAnnotation; import org.opencb.biodata.models.variant.avro.VariantType; import org.opencb.cellbase.core.ParamConstants; +import org.opencb.cellbase.core.api.SnpQuery; import 
org.opencb.cellbase.core.api.VariantQuery; +import org.opencb.cellbase.core.api.key.ApiKeyLicensedDataUtils; import org.opencb.cellbase.core.api.query.CellBaseQueryOptions; import org.opencb.cellbase.core.api.query.QueryException; import org.opencb.cellbase.core.config.CellBaseConfiguration; @@ -36,9 +35,9 @@ import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.core.variant.AnnotationBasedPhasedQueryManager; import org.opencb.cellbase.lib.impl.core.CellBaseCoreDBAdaptor; +import org.opencb.cellbase.lib.impl.core.SnpMongoDBAdaptor; import org.opencb.cellbase.lib.impl.core.SpliceScoreMongoDBAdaptor; import org.opencb.cellbase.lib.impl.core.VariantMongoDBAdaptor; -import org.opencb.cellbase.core.api.key.ApiKeyLicensedDataUtils; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; import org.opencb.cellbase.lib.variant.annotation.CellBaseNormalizerSequenceAdaptor; import org.opencb.cellbase.lib.variant.annotation.VariantAnnotationCalculator; @@ -59,6 +58,7 @@ public class VariantManager extends AbstractManager implements AggregationApi getFunctionalScoreRegion(List(chunkIdSet), options, dataRelease); } + + public CellBaseDataResult searchSnp(SnpQuery query) throws CellBaseException { + return snpDBAdaptor.query(query); + } + + public CellBaseDataResult startsWithSnp(String id, QueryOptions options, int dataRelease) throws CellBaseException { + return snpDBAdaptor.startsWith(id, options, dataRelease); + } } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java index a503ba7045..1b86b49367 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/VariantAnnotationCalculator.java @@ -40,6 +40,8 @@ import 
org.opencb.cellbase.lib.managers.*; import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; import org.opencb.cellbase.lib.variant.annotation.futures.FuturePharmacogenomicsAnnotator; +import org.opencb.cellbase.lib.variant.annotation.futures.FutureSnpAnnotator; +import org.opencb.cellbase.lib.variant.annotation.futures.FutureSpliceScoreAnnotator; import org.opencb.cellbase.lib.variant.hgvs.HgvsCalculator; import org.opencb.commons.datastore.core.QueryOptions; import org.slf4j.Logger; @@ -465,6 +467,13 @@ private List runAnnotationProcess(List normalizedVar variationFuture = CACHED_THREAD_POOL.submit(futureVariationAnnotator); } + FutureSnpAnnotator futureSnpAnnotator = null; + Future>> snpFuture = null; + if (annotatorSet.contains("xrefs") && dataRelease.getCollections().containsKey(EtlCommons.SNP_COLLECTION_NAME)) { + futureSnpAnnotator = new FutureSnpAnnotator(normalizedVariantList, dataRelease.getRelease(), variantManager, logger); + snpFuture = CACHED_THREAD_POOL.submit(futureSnpAnnotator); + } + FutureConservationAnnotator futureConservationAnnotator = null; Future>> conservationFuture = null; if (annotatorSet.contains("conservation")) { @@ -510,8 +519,8 @@ private List runAnnotationProcess(List normalizedVar FutureSpliceScoreAnnotator futureSpliceScoreAnnotator = null; Future>> spliceScoreFuture = null; if (annotatorSet.contains("consequenceType")) { - futureSpliceScoreAnnotator = new FutureSpliceScoreAnnotator(normalizedVariantList, QueryOptions.empty(), - dataRelease.getRelease()); + futureSpliceScoreAnnotator = new FutureSpliceScoreAnnotator(normalizedVariantList, dataRelease.getRelease(), apiKey, + variantManager, logger); spliceScoreFuture = CACHED_THREAD_POOL.submit(futureSpliceScoreAnnotator); } @@ -643,6 +652,9 @@ private List runAnnotationProcess(List normalizedVar if (futureVariationAnnotator != null) { futureVariationAnnotator.processResults(variationFuture, variantAnnotationList, annotatorSet); } + if (futureSnpAnnotator != null) { + 
futureSnpAnnotator.processResults(snpFuture, variantAnnotationList); + } if (futureConservationAnnotator != null) { futureConservationAnnotator.processResults(conservationFuture, variantAnnotationList); } @@ -1171,7 +1183,7 @@ private Set getAnnotatorSet(QueryOptions queryOptions) { // 'expression' removed in CB 5.0 annotatorSet = new HashSet<>(Arrays.asList("variation", "traitAssociation", "conservation", "functionalScore", "consequenceType", "geneDisease", "drugInteraction", "geneConstraints", "mirnaTargets", "pharmacogenomics", - "cancerGeneAssociation", "cancerHotspots", "populationFrequencies", "repeats", "cytoband", "hgvs")); + "cancerGeneAssociation", "cancerHotspots", "populationFrequencies", "repeats", "cytoband", "hgvs", "xrefs")); List excludeList = queryOptions.getAsStringList("exclude"); excludeList.forEach(annotatorSet::remove); } @@ -1909,74 +1921,6 @@ public void processResults(Future>> cytobandFu } } - class FutureSpliceScoreAnnotator implements Callable>> { - private List variantList; - private QueryOptions queryOptions; - private int dataRelease; - - FutureSpliceScoreAnnotator(List variantList, QueryOptions queryOptions, int dataRelease) { - this.variantList = variantList; - this.queryOptions = queryOptions; - this.dataRelease = dataRelease; - } - - @Override - public List> call() throws Exception { - long startTime = System.currentTimeMillis(); - - List> cellBaseDataResultList = new ArrayList<>(variantList.size()); - - logger.debug("Query splice"); - // Want to return only one CellBaseDataResult object per Variant - for (Variant variant : variantList) { - cellBaseDataResultList.add(variantManager.getSpliceScoreVariant(variant, apiKey, dataRelease)); - } - logger.debug("Splice score query performance is {}ms for {} variants", System.currentTimeMillis() - startTime, - variantList.size()); - return cellBaseDataResultList; - } - - public void processResults(Future>> spliceFuture, - List variantAnnotationList) - throws InterruptedException, 
ExecutionException { - List> spliceCellBaseDataResults; - try { - spliceCellBaseDataResults = spliceFuture.get(30, TimeUnit.SECONDS); - } catch (TimeoutException e) { - spliceFuture.cancel(true); - throw new ExecutionException("Unable to finish splice score query on time", e); - } - - if (CollectionUtils.isNotEmpty(spliceCellBaseDataResults)) { - for (int i = 0; i < variantAnnotationList.size(); i++) { - CellBaseDataResult spliceScoreResult = spliceCellBaseDataResults.get(i); - if (spliceScoreResult != null && CollectionUtils.isNotEmpty(spliceScoreResult.getResults())) { - for (SpliceScore spliceScore : spliceScoreResult.getResults()) { - for (ConsequenceType ct : variantAnnotationList.get(i).getConsequenceTypes()) { - for (SpliceScoreAlternate spliceScoreAlt : spliceScore.getAlternates()) { - String alt = StringUtils.isEmpty(variantAnnotationList.get(i).getAlternate()) - ? "-" - : variantAnnotationList.get(i).getAlternate(); - if (alt.equals(spliceScoreAlt.getAltAllele())) { - if (StringUtils.isEmpty(spliceScore.getTranscriptId()) - || StringUtils.isEmpty(ct.getTranscriptId()) - || spliceScore.getTranscriptId().equals(ct.getTranscriptId())) { - SpliceScores scores = new SpliceScores(spliceScore.getSource(), spliceScoreAlt.getScores()); - if (ct.getSpliceScores() == null) { - ct.setSpliceScores(new ArrayList<>()); - } - ct.getSpliceScores().add(scores); - } - } - } - } - } - } - } - } - } - } - public VariantNormalizer getNormalizer() { return normalizer; } diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSnpAnnotator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSnpAnnotator.java new file mode 100644 index 0000000000..a14dd62e69 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSnpAnnotator.java @@ -0,0 +1,123 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + 
* you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.cellbase.lib.variant.annotation.futures; + +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.Snp; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.biodata.models.variant.avro.Xref; +import org.opencb.cellbase.core.api.SnpQuery; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.managers.VariantManager; +import org.slf4j.Logger; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.*; + +public class FutureSnpAnnotator implements Callable>> { + private VariantManager variantManager; + + private List variantList; + private int dataRelease; + + private Logger logger; + + public FutureSnpAnnotator(List variantList, int dataRelease, VariantManager variantManager, Logger logger) { + this.variantManager = variantManager; + + this.variantList = variantList; + this.dataRelease = dataRelease; + + this.logger = logger; + } + + @Override + public List> call() throws Exception { + long startTime = System.currentTimeMillis(); + + List> cellBaseDataResultList = new ArrayList<>(variantList.size()); + + logger.debug("SNP queries..."); + // Want to return only one CellBaseDataResult object per Variant + List includes = new ArrayList<>(); + includes.add("id"); + includes.add("source"); + String logMsg = StringUtils.join(includes, 
","); + logger.info("SNP annotation/search includes: {}", logMsg); + for (Variant variant : variantList) { + SnpQuery query = new SnpQuery(); + query.setChromosome(variant.getChromosome()); + query.setPosition(variant.getStart()); + query.setReference(variant.getReference()); + query.setDataRelease(dataRelease); + query.setIncludes(includes); + cellBaseDataResultList.add(variantManager.searchSnp(query)); + } + logger.info("SNP queries performance in {} ms for {} variants", System.currentTimeMillis() - startTime, variantList.size()); + return cellBaseDataResultList; + } + + public void processResults(Future>> snpFuture, List variantAnnotationList) + throws InterruptedException, ExecutionException { + List> snpCellBaseDataResults; + try { + snpCellBaseDataResults = snpFuture.get(30, TimeUnit.SECONDS); + } catch (TimeoutException e) { + snpFuture.cancel(true); + throw new ExecutionException("Unable to finish SNP query on time", e); + } + + if (CollectionUtils.isNotEmpty(snpCellBaseDataResults)) { + for (int i = 0; i < variantAnnotationList.size(); i++) { + CellBaseDataResult snpResult = snpCellBaseDataResults.get(i); + if (snpResult != null && CollectionUtils.isNotEmpty(snpResult.getResults())) { + if (CollectionUtils.isEmpty(variantAnnotationList.get(i).getXrefs())) { + // Add all dbSNP to the xrefs + variantAnnotationList.get(i).setXrefs(new ArrayList<>()); + for (Snp snp : snpResult.getResults()) { + variantAnnotationList.get(i).getXrefs().add(new Xref(snp.getId(), snp.getSource())); + } + } else { + // Check if the xrefs are already in the annotation (e.g., GWAS builder might add dbSNP IDs) + List newXrefs = new ArrayList<>(); + for (Snp snp : snpResult.getResults()) { + // Sanity check + if (StringUtils.isNotEmpty(snp.getId()) && StringUtils.isNotEmpty(snp.getSource())) { + boolean found = false; + for (Xref xref : variantAnnotationList.get(i).getXrefs()) { + if (snp.getId().equalsIgnoreCase(xref.getId()) && snp.getSource().equalsIgnoreCase(xref.getSource())) { + 
found = true; + break; + } + } + if (!found) { + newXrefs.add(new Xref(snp.getId(), snp.getSource())); + } + } + } + if (CollectionUtils.isNotEmpty(newXrefs)) { + variantAnnotationList.get(i).getXrefs().addAll(newXrefs); + } + } + } + } + } + } +} diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSpliceScoreAnnotator.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSpliceScoreAnnotator.java new file mode 100644 index 0000000000..40523fdbc8 --- /dev/null +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/variant/annotation/futures/FutureSpliceScoreAnnotator.java @@ -0,0 +1,109 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.opencb.cellbase.lib.variant.annotation.futures; + +import org.apache.commons.collections4.CollectionUtils; +import org.apache.commons.lang3.StringUtils; +import org.opencb.biodata.models.core.SpliceScore; +import org.opencb.biodata.models.core.SpliceScoreAlternate; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.avro.ConsequenceType; +import org.opencb.biodata.models.variant.avro.SpliceScores; +import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.cellbase.core.result.CellBaseDataResult; +import org.opencb.cellbase.lib.managers.VariantManager; +import org.slf4j.Logger; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.*; + +public class FutureSpliceScoreAnnotator implements Callable>> { + private List variantList; + private int dataRelease; + private String apiKey; + + private VariantManager variantManager; + + private Logger logger; + + public FutureSpliceScoreAnnotator(List variantList, int dataRelease, String apiKey, VariantManager variantManager, + Logger logger) { + this.variantList = variantList; + this.dataRelease = dataRelease; + this.apiKey = apiKey; + + this.variantManager = variantManager; + + this.logger = logger; + } + + @Override + public List> call() throws Exception { + long startTime = System.currentTimeMillis(); + + List> cellBaseDataResultList = new ArrayList<>(variantList.size()); + + logger.debug("Query splice"); + // Want to return only one CellBaseDataResult object per Variant + for (Variant variant : variantList) { + cellBaseDataResultList.add(variantManager.getSpliceScoreVariant(variant, apiKey, dataRelease)); + } + logger.debug("Splice score query performance is {}ms for {} variants", System.currentTimeMillis() - startTime, + variantList.size()); + return cellBaseDataResultList; + } + + public void processResults(Future>> spliceFuture, List variantAnnotationList) + throws InterruptedException, ExecutionException 
{ + List> spliceCellBaseDataResults; + try { + spliceCellBaseDataResults = spliceFuture.get(30, TimeUnit.SECONDS); + } catch (TimeoutException e) { + spliceFuture.cancel(true); + throw new ExecutionException("Unable to finish splice score query on time", e); + } + + if (CollectionUtils.isNotEmpty(spliceCellBaseDataResults)) { + for (int i = 0; i < variantAnnotationList.size(); i++) { + CellBaseDataResult spliceScoreResult = spliceCellBaseDataResults.get(i); + if (spliceScoreResult != null && CollectionUtils.isNotEmpty(spliceScoreResult.getResults())) { + for (SpliceScore spliceScore : spliceScoreResult.getResults()) { + for (ConsequenceType ct : variantAnnotationList.get(i).getConsequenceTypes()) { + for (SpliceScoreAlternate spliceScoreAlt : spliceScore.getAlternates()) { + String alt = StringUtils.isEmpty(variantAnnotationList.get(i).getAlternate()) + ? "-" + : variantAnnotationList.get(i).getAlternate(); + if (alt.equals(spliceScoreAlt.getAltAllele())) { + if (StringUtils.isEmpty(spliceScore.getTranscriptId()) + || StringUtils.isEmpty(ct.getTranscriptId()) + || spliceScore.getTranscriptId().equals(ct.getTranscriptId())) { + SpliceScores scores = new SpliceScores(spliceScore.getSource(), spliceScoreAlt.getScores()); + if (ct.getSpliceScores() == null) { + ct.setSpliceScores(new ArrayList<>()); + } + ct.getSpliceScores().add(scores); + } + } + } + } + } + } + } + } + } +} diff --git a/cellbase-lib/src/main/resources/mongodb-indexes.json b/cellbase-lib/src/main/resources/mongodb-indexes.json index de81c7b83b..93b86af2df 100644 --- a/cellbase-lib/src/main/resources/mongodb-indexes.json +++ b/cellbase-lib/src/main/resources/mongodb-indexes.json @@ -145,3 +145,6 @@ {"collection": "pharmacogenomics", "fields": {"variants.phenotypeType": 1}, "options": {"background": true}} {"collection": "pharmacogenomics", "fields": {"variants.confidence": 1}, "options": {"background": true}} {"collection": "pharmacogenomics", "fields": {"variants.evidences.pubmed": 1}, "options": 
{"background": true}} + +{"collection": "snp", "fields": {"id": 1}, "options": {"background": true}} +{"collection": "snp", "fields": {"chromosome": 1, "position": 1, "reference": 1}, "options": {"background": true}} \ No newline at end of file diff --git a/cellbase-server/pom.xml b/cellbase-server/pom.xml index 304cca8aa2..fe4509c6fc 100644 --- a/cellbase-server/pom.xml +++ b/cellbase-server/pom.xml @@ -6,7 +6,7 @@ org.opencb.cellbase cellbase - 6.2.1-SNAPSHOT + 6.3.0-SNAPSHOT ../pom.xml diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/MetaWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/MetaWSServer.java index 331f562585..d8bb3a9f6d 100644 --- a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/MetaWSServer.java +++ b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/MetaWSServer.java @@ -28,6 +28,7 @@ import org.opencb.cellbase.core.config.SpeciesProperties; import org.opencb.cellbase.core.exception.CellBaseException; import org.opencb.cellbase.core.models.DataRelease; +import org.opencb.cellbase.core.models.DataReleaseSource; import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.core.utils.SpeciesUtils; import org.opencb.cellbase.lib.managers.DataReleaseManager; @@ -56,6 +57,9 @@ import java.text.SimpleDateFormat; import java.util.*; +import static org.opencb.cellbase.lib.EtlCommons.COSMIC_DATA; +import static org.opencb.cellbase.lib.EtlCommons.HGMD_DATA; + /** * Created by imedina on 04/08/15. 
*/ @@ -84,13 +88,17 @@ public MetaWSServer(@PathParam("apiVersion") @GET @Path("/{species}/versions") @ApiOperation(httpMethod = "GET", value = "Returns source version metadata, including source urls from which " - + "data files were downloaded.", response = DownloadProperties.class, responseContainer = "QueryResponse") + + "data files were downloaded.", response = DataReleaseSource.class, responseContainer = "QueryResponse") public Response getVersion(@PathParam("species") @ApiParam(name = "species", value = ParamConstants.SPECIES_DESCRIPTION, defaultValue = ParamConstants.DEFAULT_SPECIES, required = true) String species, @ApiParam(name = "assembly", value = ParamConstants.ASSEMBLY_DESCRIPTION, - defaultValue = ParamConstants.DEFAULT_ASSEMBLY) @QueryParam("assembly") String assembly) { + defaultValue = ParamConstants.DEFAULT_ASSEMBLY) @QueryParam("assembly") String assembly, + @ApiParam(name = "dataRelease", value = ParamConstants.DATA_RELEASE_DESCRIPTION) @QueryParam("dataRelease") + int dataRelease) { try { + long dbTimeStart; + dbTimeStart = System.currentTimeMillis(); if (StringUtils.isEmpty(assembly)) { SpeciesConfiguration.Assembly assemblyObject = SpeciesUtils.getDefaultAssembly(cellBaseConfiguration, species); if (assemblyObject != null) { @@ -98,12 +106,24 @@ public Response getVersion(@PathParam("species") } } if (!SpeciesUtils.validateSpeciesAndAssembly(cellBaseConfiguration, species, assembly)) { - return createErrorResponse("getVersion", "Invalid species: '" + species + "' or assembly: '" + return createErrorResponse("/versions", "Invalid species: '" + species + "' or assembly: '" + assembly + "'"); } - logger.error("species " + species); - CellBaseDataResult queryResult = metaManager.getVersions(species, assembly); - return createOkResponse(queryResult); + DataReleaseManager dataReleaseManager = cellBaseManagerFactory.getDataReleaseManager(species, assembly); + DataRelease dr = dataReleaseManager.get(dataRelease); + if (dr == null) { + return 
createErrorResponse("/versions", "Could not find data release '" + dataRelease + "'"); + } + // Remove some sources + List sources = new ArrayList<>(); + for (DataReleaseSource source : dr.getSources()) { + if (!COSMIC_DATA.equalsIgnoreCase(source.getName()) && !HGMD_DATA.equalsIgnoreCase(source.getName())) { + sources.add(source); + } + } + int dbTime = Long.valueOf(System.currentTimeMillis() - dbTimeStart).intValue(); + return createOkResponse(new CellBaseDataResult<>("versions", dbTime, Collections.emptyList(), sources.size(), sources, + sources.size())); } catch (CellBaseException e) { return createErrorResponse(e); } @@ -135,7 +155,7 @@ public Response getDataRelease(@PathParam("species") } } if (!SpeciesUtils.validateSpeciesAndAssembly(cellBaseConfiguration, species, assembly)) { - return createErrorResponse("getVersion", "Invalid species: '" + species + "' or assembly: '" + return createErrorResponse("/dataReleases", "Invalid species: '" + species + "' or assembly: '" + assembly + "'"); } DataReleaseManager dataReleaseManager = cellBaseManagerFactory.getDataReleaseManager(species, assembly); diff --git a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/genomic/VariantWSServer.java b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/genomic/VariantWSServer.java index eba949398d..03e9f515f2 100755 --- a/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/genomic/VariantWSServer.java +++ b/cellbase-server/src/main/java/org/opencb/cellbase/server/rest/genomic/VariantWSServer.java @@ -18,8 +18,10 @@ import io.swagger.annotations.*; import org.apache.commons.lang.StringUtils; +import org.opencb.biodata.models.core.Snp; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.cellbase.core.api.SnpQuery; import org.opencb.cellbase.core.api.VariantQuery; import org.opencb.cellbase.core.models.DataRelease; import 
org.opencb.cellbase.core.result.CellBaseDataResult; @@ -50,9 +52,9 @@ public VariantWSServer(@PathParam("apiVersion") @ApiParam(name = "apiVersion", v defaultValue = DEFAULT_VERSION) String apiVersion, @PathParam("species") @ApiParam(name = "species", value = SPECIES_DESCRIPTION) String species, @ApiParam(name = "assembly", value = ASSEMBLY_DESCRIPTION) @DefaultValue("") @QueryParam("assembly") - String assembly, + String assembly, @ApiParam(name = "dataRelease", value = DATA_RELEASE_DESCRIPTION) @DefaultValue("0") @QueryParam("dataRelease") - int dataRelease, + int dataRelease, @ApiParam(name = "apiKey", value = API_KEY_DESCRIPTION) @DefaultValue("") @QueryParam("apiKey") String apiKey, @Context UriInfo uriInfo, @Context HttpServletRequest hsr) throws CellBaseServerException { @@ -117,20 +119,6 @@ public Response getNormalization(@PathParam("variants") @ApiParam(name = "varian } - // @GET -// @Path("/{phenotype}/phenotype") -// @ApiOperation(httpMethod = "GET", -// value = "Not implemented yet", -// response = CellBaseDataResponse.class, hidden = true) -// public Response getVariantsByPhenotype(@PathParam("phenotype") String phenotype) { -// try { -// parseQueryParams(); -// return Response.ok("Not implemented").build(); -// } catch (Exception e) { -// return createErrorResponse(e); -// } -// } - @POST @Consumes("text/plain") @Path("/annotation") @@ -289,7 +277,7 @@ public Response getAnnotationByVariantsGET(@PathParam("variants") @QueryParam("checkAminoAcidChange") @ApiParam(name = "checkAminoAcidChange", value = "", allowableValues = "false,true", defaultValue = "false", required = false) - Boolean checkAminoAcidChange, + Boolean checkAminoAcidChange, @QueryParam("consequenceTypeSource") @ApiParam(name = "consequenceTypeSource", value = "Gene set, either ensembl (default) " + "or refseq", allowableValues = "ensembl,refseq", allowMultiple = true, @@ -358,29 +346,6 @@ private Response getAnnotationByVariant(String variants, } } -// @GET -// @Deprecated -// 
@Path("/{variants}/cadd") -// @ApiOperation(httpMethod = "GET", value = "Get CADD scores for a (list of) variant(s)", response = Score.class, -// responseContainer = "QueryResponse", hidden = true) -// public Response getCaddScoreByVariant(@PathParam("variants") -// @ApiParam(name = "variants", value = "Comma separated list of variants for" -// + "which CADD socores will be returned, e.g. " -// + "19:45411941:T:C,14:38679764:-:GATCTG,1:6635210:G:-," -// + "2:114340663:GCTGGGCATCCT:ACTGGGCATCCT", -// required = true) String variants) { -// try { -// parseQueryParams(); -// VariantDBAdaptor variantDBAdaptor = dbAdaptorFactory.getVariationDBAdaptor(this.species, this.assembly); -// -// List> functionalScoreVariant = -// variantDBAdaptor.getFunctionalScoreVariant(Variant.parseVariants(variants), queryOptions); -// return createOkResponse(functionalScoreVariant); -// } catch (Exception e) { -// return createErrorResponse(e); -// } -// } - // @GET // @Path("/stats") // @Override @@ -493,58 +458,64 @@ public Response getAllConsequenceTypes() { } } - // FIXME: 29/04/16 GET and POST web services to be fixed -// @GET -// @Path("/{variants}/consequenceType") -// @ApiOperation(httpMethod = "GET", value = "Get the biological impact of the variant(s)", response = String.class, -// responseContainer = "QueryResponse") -// public Response getConsequenceTypeByGetMethod(@PathParam("variants") String variants) { -// return getConsequenceType(variants); -// } -// -// private Response getConsequenceType(String variants) { -// try { -// parseQueryParams(); -// VariantDBAdaptor variationDBAdaptor = dbAdaptorFactory.getVariationDBAdaptor(this.species, this.assembly); -// query.put(VariantDBAdaptor.QueryParams.ID.key(), variants); -// queryOptions.put(QueryOptions.INCLUDE, "annotation.displayConsequenceType"); -// CellBaseDataResult queryResult = variationDBAdaptor.get(query, queryOptions); -// CellBaseDataResult queryResult1 = new CellBaseDataResult<>( -// queryResult.getId(), 
queryResult.getTime(), queryResult.getEvents(), queryResult.getNumResults(), -// Collections.singletonList(queryResult.getResults().get(0).getAnnotation().getDisplayConsequenceType()), 1); -// return createOkResponse(queryResult1); -// } catch (Exception e) { -// return createErrorResponse("getConsequenceTypeByPostMethod", e.toString()); -// } -// } - - // FIXME: 29/04/16 GET and POST methods to be fixed -// @GET -// @Path("/{variants}/regulatory") -// @ApiOperation(httpMethod = "GET", value = "Get the regulatory impact of the variant(s)", hidden = true) -// public Response getRegulatoryByGetMethod(@PathParam("variants") String variants) { -// return getRegulatoryType(variants); -// } -// -// private Response getRegulatoryType(String variants) { -// try { -// parseQueryParams(); -// VariantDBAdaptor variationDBAdaptor = dbAdaptorFactory.getVariationDBAdaptor(this.species, this.assembly); -// return null; -// } catch (Exception e) { -// return createErrorResponse(e); -// } -// } + //------------------------------------------------------------------------- + // S N P + //------------------------------------------------------------------------- + @GET + @Path("/snp/search") + @ApiOperation(httpMethod = "GET", value = "Get SNPs", response = Snp.class, responseContainer = "QueryResponse") + @ApiImplicitParams({ + @ApiImplicitParam(name = "exclude", value = EXCLUDE_DESCRIPTION, + required = false, dataType = "java.util.List", paramType = "query"), + @ApiImplicitParam(name = "include", value = INCLUDE_DESCRIPTION, + required = false, dataType = "java.util.List", paramType = "query"), + @ApiImplicitParam(name = "sort", value = SORT_DESCRIPTION, + required = false, dataType = "java.util.List", paramType = "query"), + @ApiImplicitParam(name = "order", value = ORDER_DESCRIPTION, + required = false, dataType = "java.util.List", paramType = "query", + defaultValue = "", allowableValues="ASCENDING,DESCENDING"), + @ApiImplicitParam(name = "limit", value = LIMIT_DESCRIPTION, + 
required = false, defaultValue = DEFAULT_LIMIT, dataType = "java.util.List", paramType = "query"), + @ApiImplicitParam(name = "skip", value = SKIP_DESCRIPTION, + required = false, defaultValue = DEFAULT_SKIP, dataType = "java.util.List", paramType = "query") + }) + public Response searchSnp(@QueryParam("id") @ApiParam(name = "id", value = "SNP ID") String id, + @QueryParam("chromosome") @ApiParam(name = "chromosome", value = "Chromosome") String chromosome, + @QueryParam("position") @ApiParam(name = "position", value = "Position") Integer position, + @QueryParam("reference") @ApiParam(name = "reference", value = "Reference") String reference) { + try { + SnpQuery query = new SnpQuery(uriParams); + CellBaseDataResult queryResult = variantManager.searchSnp(query); + return createOkResponse(queryResult); + } catch (Exception e) { + return createErrorResponse(e); + } + } -// @GET -// @Path("/{variants}/sequence") -// @ApiOperation(httpMethod = "GET", value = "Get the adjacent sequence to the SNP(s) - Not yet implemented", -// hidden = true) -// public Response getSequence(@PathParam("variants") String query) { -// try { -// return null; -// } catch (Exception e) { -// return createErrorResponse(e); -// } -// } + @GET + @Path("/snp/startsWith") + @ApiOperation(httpMethod = "GET", value = "Get SNPs starting with the input SNP ID", response = Snp.class, + responseContainer = "QueryResponse") + @ApiImplicitParams({ + @ApiImplicitParam(name = "exclude", value = EXCLUDE_DESCRIPTION, + required = false, dataType = "java.util.List", paramType = "query"), + @ApiImplicitParam(name = "include", value = INCLUDE_DESCRIPTION, + required = false, dataType = "java.util.List", paramType = "query"), + @ApiImplicitParam(name = "limit", value = LIMIT_DESCRIPTION, + required = false, defaultValue = DEFAULT_LIMIT, dataType = "java.util.List", + paramType = "query") + }) + public Response startsWithSnp(@QueryParam("id") @ApiParam(name = "id", value = "SNP ID, e.g.: rs15703916") String id) { 
+ try { + try { + SnpQuery query = new SnpQuery(uriParams); + CellBaseDataResult queryResult = variantManager.startsWithSnp(id, query.toQueryOptions(), getDataRelease()); + return createOkResponse(queryResult); + } catch (Exception e) { + return createErrorResponse(e); + } + } catch (Exception e) { + return createErrorResponse(e); + } + } } diff --git a/pom.xml b/pom.xml index 1c619a886b..35498a9a40 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ org.opencb.cellbase cellbase - 6.2.1-SNAPSHOT + 6.3.0-SNAPSHOT pom CellBase project @@ -23,20 +23,19 @@ ${project.version} - 5.2.1-SNAPSHOT - 3.2.1-SNAPSHOT - 0.1.0 - 9.4.51.v20230217 - - 2.14.3 - 3.14.0 - 1.7.36 + 5.3.0-SNAPSHOT + 3.3.0-SNAPSHOT + 0.1.0 + 2.11.4 + 1.9.13 2.30.1 + 1.7.32 2.17.2 1.5.2 5.5.2 0.8.8 + 9.4.17.v20190418 0.11.5 1.6.5 3.1.0 @@ -52,6 +51,7 @@ 1.48.0 2.4 2.4 + 3.12.0 2.1.6 4.4 1.69 @@ -413,11 +413,11 @@ swagger-annotations ${swagger-annotations.version} - + io.jsonwebtoken jjwt-jackson