From a4bbb37f5a658200373ac49bc077a45ff334ab8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joaqu=C3=ADn=20T=C3=A1rraga=20Gim=C3=A9nez?= Date: Fri, 12 Jan 2024 11:24:08 +0100 Subject: [PATCH] app: update protein manager and DB adaptor to retrieve protein substitution predictions (sift, polyphen, revel and alphamissense), #TASK-TASK-5421, #TASK-5388 --- .../lib/impl/core/ProteinMongoDBAdaptor.java | 98 ++++++++----------- .../cellbase/lib/managers/ProteinManager.java | 19 ---- 2 files changed, 43 insertions(+), 74 deletions(-) diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java index 353b4042c..3a30b50f9 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/impl/core/ProteinMongoDBAdaptor.java @@ -18,11 +18,13 @@ import com.mongodb.BasicDBList; import com.mongodb.client.model.Filters; -import com.mongodb.client.model.Projections; +import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.bson.Document; import org.bson.conversions.Bson; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.Entry; +import org.opencb.biodata.models.core.ProteinSubstitutionPrediction; +import org.opencb.biodata.models.core.ProteinSubstitutionPredictionScore; import org.opencb.biodata.models.variant.avro.ProteinFeature; import org.opencb.biodata.models.variant.avro.ProteinVariantAnnotation; import org.opencb.biodata.models.variant.avro.Score; @@ -34,7 +36,7 @@ import org.opencb.cellbase.core.result.CellBaseDataResult; import org.opencb.cellbase.lib.iterator.CellBaseIterator; import org.opencb.cellbase.lib.iterator.CellBaseMongoDBIterator; -import org.opencb.cellbase.lib.variant.VariantAnnotationUtils; +import org.opencb.commons.datastore.core.DataResult; import 
org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.datastore.core.QueryParam; @@ -90,69 +92,55 @@ private void init() { logger.debug("ProteinMongoDBAdaptor: in 'constructor'"); mongoDBCollectionByRelease = buildCollectionByReleaseMap("protein"); - proteinSubstitutionMongoDBCollectionByRelease = buildCollectionByReleaseMap("protein_functional_prediction"); + proteinSubstitutionMongoDBCollectionByRelease = buildCollectionByReleaseMap("protein_substitution_predictions"); } - public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, Integer position, String aa) throws CellBaseException { - CellBaseDataResult result = null; + public CellBaseDataResult getSubstitutionScores(TranscriptQuery query, Integer aaPosition, String aa) throws CellBaseException { + long dbTimeStart = System.currentTimeMillis(); + Map scoreSet = new HashMap<>(); + + // transcriptId, aaPosition, aaAlternate are needed for this collection + if (query.getTranscriptsId() != null && query.getTranscriptsId().get(0) != null && aaPosition != null + && StringUtils.isNotEmpty(aa)) { - // Ensembl transcript id is needed for this collection - if (query.getTranscriptsId() != null && query.getTranscriptsId().get(0) != null) { - String transcriptId = query.getTranscriptsId().get(0).split("\\.")[0]; - Bson transcript = Filters.eq("transcriptId", transcriptId); MongoDBCollection mongoDBCollection = getCollectionByRelease(proteinSubstitutionMongoDBCollectionByRelease, query.getDataRelease()); - String aaShortName = null; - // If position and aa change are provided we create a 'projection' to return only the required data from the database - if (position != null) { - String projectionString = "aaPositions." + position; - - // If aa change is provided we only return that information - if (StringUtils.isNotEmpty(aa)) { - aaShortName = aaShortNameMap.get(aa.toUpperCase()); - projectionString += "." 
+ aaShortName; - } - - // Projection is used to minimize the returned data - Bson positionProjection = Projections.include(projectionString); - result = new CellBaseDataResult<>(mongoDBCollection.find(transcript, positionProjection, query.toQueryOptions())); - } else { - // Return the whole transcript data - result = new CellBaseDataResult<>(mongoDBCollection.find(transcript, query.toQueryOptions())); - } - - if (result != null && !result.getResults().isEmpty()) { - Document document = (Document) result.getResults().get(0); - Document aaPositionsDocument = (Document) document.get("aaPositions"); - - // Position or aa change were not provided, returning whole transcript data - if (position == null || position == -1 || aaShortName == null) { - // Return only the inner Document, not the whole document projected - result.setResults(Collections.singletonList(aaPositionsDocument)); - // Position and aa were provided, return only corresponding Score objects - } else { - List scoreList = null; - if (result.getNumResults() == 1 && aaPositionsDocument != null) { - scoreList = new ArrayList<>(NUM_PROTEIN_SUBSTITUTION_SCORE_METHODS); - Document positionDocument = (Document) aaPositionsDocument.get(Integer.toString(position)); - Document aaDocument = (Document) positionDocument.get(aaShortName); - if (aaDocument.get("ss") != null) { - scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ss")), - "sift", VariantAnnotationUtils.SIFT_DESCRIPTIONS.get(aaDocument.get("se")))); - } - if (aaDocument.get("ps") != null) { - scoreList.add(new Score(Double.parseDouble("" + aaDocument.get("ps")), - "polyphen", VariantAnnotationUtils.POLYPHEN_DESCRIPTIONS.get(aaDocument.get("pe")))); + List andBsonList = new ArrayList<>(); + // Sanity check, protein substitution predictions do not contain the transcript ID version + String transcriptId = query.getTranscriptsId().get(0).split("\\.")[0]; + andBsonList.add(Filters.eq("transcriptId", transcriptId)); + 
andBsonList.add(Filters.eq("aaPosition", aaPosition)); + String aaAlternate = aaShortNameMap.get(aa.toUpperCase()); + andBsonList.add(Filters.eq("scores.aaAlternate", aaAlternate)); + Bson bson = Filters.and(andBsonList); + + System.out.println("transcriptId = " + transcriptId + ", aaPosition = " + aaPosition + ", aa = " + aa + ", aaAlternate = " + + aaAlternate); + + DataResult predictions = mongoDBCollection.find(bson, null, ProteinSubstitutionPrediction.class, + new QueryOptions()); + + if (predictions != null && CollectionUtils.isNotEmpty(predictions.getResults())) { + for (ProteinSubstitutionPrediction prediction : predictions.getResults()) { + for (ProteinSubstitutionPredictionScore predictionScore : prediction.getScores()) { + System.out.println("predictionScore = " + predictionScore.toString()); + if (StringUtils.isNotEmpty(predictionScore.getAaAlternate()) && StringUtils.isNotEmpty(aaAlternate) + && predictionScore.getAaAlternate().equals(aaAlternate)) { + String key = prediction.getSource() + ":" + predictionScore.getScore() + ":" + predictionScore.getEffect(); + if (!scoreSet.containsKey(key)) { + Score score = new Score(predictionScore.getScore(), prediction.getSource(), predictionScore.getEffect()); + scoreSet.put(key, score); + } } } - result.setResults(scoreList); } } } - // Return null if no transcript id is provided - return result; + int dbTime = Long.valueOf(System.currentTimeMillis() - dbTimeStart).intValue(); + return new CellBaseDataResult<>("getSubstitutionScores", dbTime, new ArrayList<>(), scoreSet.size(), + new ArrayList<>(scoreSet.values()), scoreSet.size()); } // public CellBaseDataResult getSubstitutionScores(Query query, QueryOptions options) { @@ -231,12 +219,12 @@ public CellBaseDataResult getVariantAnnotation(String // Stop_gain/lost variants do not have SIFT/POLYPHEN scores // System.out.println("aaReference = " + aaReference); // System.out.println("aaAlternate = " + aaAlternate); - if (!aaAlternate.equals("STOP") && 
!aaReference.equals("STOP")) { +// if (!aaAlternate.equals("STOP") && !aaReference.equals("STOP")) { TranscriptQuery query = new TranscriptQuery(); query.setTranscriptsId(Collections.singletonList(ensemblTranscriptId)); query.setDataRelease(dataRelease); proteinVariantAnnotation.setSubstitutionScores(getSubstitutionScores(query, position, aaAlternate).getResults()); - } +// } CellBaseDataResult proteinVariantData; String shortAlternativeAa = aaShortNameMap.get(aaAlternate); diff --git a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java index 0505c80ad..e1a068147 100644 --- a/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java +++ b/cellbase-lib/src/main/java/org/opencb/cellbase/lib/managers/ProteinManager.java @@ -18,9 +18,7 @@ import com.fasterxml.jackson.core.JsonProcessingException; import org.opencb.biodata.formats.protein.uniprot.v202003jaxb.Entry; -import org.opencb.biodata.models.core.MissenseVariantFunctionalScore; import org.opencb.biodata.models.core.Transcript; -import org.opencb.biodata.models.core.TranscriptMissenseVariantFunctionalScore; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.ProteinVariantAnnotation; import org.opencb.biodata.models.variant.avro.Score; @@ -105,17 +103,6 @@ public CellBaseDataResult getVariantAnnotation(Variant int dataRelease) throws CellBaseException { CellBaseDataResult proteinVariantAnnotation = proteinDBAdaptor.getVariantAnnotation(ensemblTranscriptId, aaPosition, aaReference, aaAlternate, options, dataRelease); - CellBaseDataResult revelResults = - missenseVariationFunctionalScoreMongoDBAdaptor.getScores( - variant.getChromosome(), variant.getStart(), variant.getReference(), variant.getAlternate(), - aaReference, aaAlternate, dataRelease); - if (proteinVariantAnnotation.getResults() != null && revelResults.getResults() != null) { - if 
(proteinVariantAnnotation.getResults().get(0).getSubstitutionScores() == null) { - proteinVariantAnnotation.getResults().get(0).setSubstitutionScores(new ArrayList<>()); - } - proteinVariantAnnotation.getResults().get(0).getSubstitutionScores().add( - new Score(revelResults.first().getScore(), "revel", "")); - } return proteinVariantAnnotation; } @@ -123,12 +110,6 @@ public CellBaseDataResult getProteinSubstitutionRawData(List tra int dataRelease) throws CellBaseException { return proteinDBAdaptor.getProteinSubstitutionRawData(transcriptIds, options, dataRelease); } - - public CellBaseDataResult getMissenseVariantFunctionalScores(String chromosome, List positions, - CellBaseQueryOptions options, - int dataRelease) throws CellBaseException { - return missenseVariationFunctionalScoreMongoDBAdaptor.getScores(chromosome, positions, options, dataRelease); - } }