Skip to content

Commit

Permalink
app: major improvements to support splice scores (e.g., MMSplice spli…
Browse files Browse the repository at this point in the history
…ces), #558
  • Loading branch information
jtarraga committed Jan 21, 2022
1 parent 9fc6744 commit 8497984
Show file tree
Hide file tree
Showing 16 changed files with 72 additions and 140 deletions.
8 changes: 4 additions & 4 deletions cellbase-app/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
</properties>

<dependencies>
<dependency>
<groupId>org.opencb.cellbase</groupId>
<artifactId>cellbase-lib</artifactId>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.opencb.cellbase</groupId>-->
<!-- <artifactId>cellbase-lib</artifactId>-->
<!-- </dependency>-->
<dependency>
<groupId>org.opencb.cellbase</groupId>
<artifactId>cellbase-client</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,8 +72,6 @@ public CommandExecutor(String logLevel, String conf) {
this.conf = this.appHome + "/conf";
}

System.out.println(">>>>> " + this.conf);

if (logLevel != null && !logLevel.isEmpty()) {
// We must call this method
setLogLevel(logLevel);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ public class BuildCommandOptions {

@Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, "
+ "gene, variation, variation_functional_score, regulation, protein, ppi, conservation, drug, "
+ "clinical_variants, repeats, svs, splice. 'all' builds everything.", required = true, arity = 1)
+ "clinical_variants, repeats, svs, splice_score. 'all' builds everything.", required = true, arity = 1)
public String data;

@Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1)
Expand Down Expand Up @@ -128,7 +128,7 @@ public class LoadCommandOptions {
public CommonCommandOptions commonOptions = commonCommandOptions;

@Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation, "
+ "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice. 'all' loads everything",
+ "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score. 'all' loads everything",
required = true, arity = 1)
public String data;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,9 @@ public void execute() {
}
// <output>/<species>_<assembly>/generated_json
buildFolder = output.resolve(spFolder + "/generated_json");
makeDir(buildFolder);
if (!buildFolder.toFile().exists()) {
makeDir(buildFolder);
}

if (buildCommandOptions.data != null) {
String[] buildOptions;
Expand Down Expand Up @@ -155,7 +157,7 @@ public void execute() {
case EtlCommons.OBO_DATA:
parser = buildObo();
break;
case EtlCommons.SPLICE_DATA:
case EtlCommons.SPLICE_SCORE_DATA:
parser = buildSplice();
break;
default:
Expand Down Expand Up @@ -376,8 +378,8 @@ private Path getFastaReferenceGenome() {
}

private CellBaseBuilder buildSplice() throws IOException {
Path spliceInputFolder = downloadFolder.resolve(EtlCommons.SPLICE_DATA);
Path spliceOutputFolder = buildFolder.resolve(EtlCommons.SPLICE_DATA);
Path spliceInputFolder = downloadFolder.resolve(EtlCommons.SPLICE_SCORE_DATA);
Path spliceOutputFolder = buildFolder.resolve(EtlCommons.SPLICE_SCORE_DATA);
if (!spliceOutputFolder.toFile().exists()) {
spliceOutputFolder.toFile().mkdirs();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ public void execute() {
EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA,
EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA,
EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA,
EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_DATA};
EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA};
} else {
loadOptions = loadCommandOptions.data.split(",");
}
Expand Down Expand Up @@ -188,9 +188,9 @@ public void execute() {
loadIfExists(input.resolve(EtlCommons.DO_VERSION_FILE), METADATA);
createIndex("ontology");
break;
case EtlCommons.SPLICE_DATA:
case EtlCommons.SPLICE_SCORE_DATA:
loadSplice();
createIndex("splice");
createIndex("splice_score");
break;
default:
logger.warn("Not valid 'data'. We should not reach this point");
Expand Down Expand Up @@ -357,16 +357,16 @@ private void loadSplice() throws NoSuchMethodException, InterruptedException, Ex
logger.info("Loading splice scores from '{}'", input);

// MMSplice scores
Path mmspliceFolder = input.resolve(EtlCommons.SPLICE_DATA + "/" + EtlCommons.MMSPLICE_SUBDIRECTORY);
Path mmspliceFolder = input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.MMSPLICE_SUBDIRECTORY);
DirectoryStream<Path> stream = Files.newDirectoryStream(mmspliceFolder, entry -> {
return entry.getFileName().toString().startsWith("mmsplice_");
});

for (Path entry : stream) {
logger.info("Loading file '{}'", entry.toString());
loadRunner.load(mmspliceFolder.resolve(entry.getFileName()), "splice");
loadRunner.load(mmspliceFolder.resolve(entry.getFileName()), EtlCommons.SPLICE_SCORE_DATA);
}
loadIfExists(input.resolve(EtlCommons.SPLICE_DATA + "/" + EtlCommons.MMSPLICE_VERSION_FILENAME), METADATA);
loadIfExists(input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.MMSPLICE_VERSION_FILENAME), METADATA);
}

private void createIndex(String collectionName) {
Expand Down
2 changes: 1 addition & 1 deletion cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ species:
- regulation
- repeats
- variation_functional_score
- splice
- splice_score
shards:
- collection: "variation"
key:
Expand Down
1 change: 1 addition & 0 deletions cellbase-core/src/test/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ species:
- regulation
- repeats
- variation_functional_score
- splice_score
shards:
- collection: "variation"
key:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public class EtlCommons {
public static final String PROTEIN_DATA = "protein";
public static final String CONSERVATION_DATA = "conservation";
public static final String CLINICAL_VARIANTS_DATA = "clinical_variants";
public static final String SPLICE_DATA = "splice";
public static final String SPLICE_SCORE_DATA = "splice_score";

public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2021-07.xml.gz";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ public GeneBuilder(Path geneDirectoryPath, Path genomeSequenceFastaFile, Species
boolean flexibleGTFParsing, CellBaseSerializer serializer) throws CellBaseException {
this(null, geneDirectoryPath.resolve("description.txt"),
geneDirectoryPath.resolve("xrefs.txt"),
geneDirectoryPath.resolve("MANE.GRCh38.v0.91.summary.txt.gz"),
geneDirectoryPath.resolve("MANE.GRCh38.v0.93.summary.txt.gz"),
geneDirectoryPath.resolve("list_LRGs_transcripts_xrefs.txt"),
geneDirectoryPath.resolve("idmapping_selected.tab.gz"),
geneDirectoryPath.getParent().resolve("regulation/motif_features.gff.gz"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ private void mmspliceParser(Path mmsplicePath) throws IOException {
} else {
if (spliceScore != null) {
// Write the current splice score
fileSerializer.serialize(spliceScore, EtlCommons.MMSPLICE_SUBDIRECTORY + "/mmsplice_"
fileSerializer.serialize(spliceScore, EtlCommons.MMSPLICE_SUBDIRECTORY + "/mmsplice_chr"
+ spliceScore.getChromosome());
}

Expand All @@ -146,7 +146,7 @@ private void mmspliceParser(Path mmsplicePath) throws IOException {

if (spliceScore != null) {
// Write the last splice score
fileSerializer.serialize(spliceScore, EtlCommons.MMSPLICE_SUBDIRECTORY + "/mmsplice_"
fileSerializer.serialize(spliceScore, EtlCommons.MMSPLICE_SUBDIRECTORY + "/mmsplice_chr"
+ spliceScore.getChromosome());
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ public GenomeDownloadManager(String species, String assembly, Path targetDirecto
public List<DownloadFile> download() throws IOException, InterruptedException {
List<DownloadFile> downloadFiles = new ArrayList<>();
downloadFiles.addAll(downloadReferenceGenome());
downloadFiles.addAll(downloadConservation());
downloadFiles.addAll(downloadRepeats());
// downloadFiles.addAll(downloadConservation());
// downloadFiles.addAll(downloadRepeats());

// cytobands
runGenomeInfo();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ public SpliceScoreMongoDBAdaptor(MongoDataStore mongoDataStore) {
private void init() {
logger.debug("SpliceScoreMongoDBAdaptor: in 'constructor'");

mongoDBCollection = mongoDataStore.getCollection(EtlCommons.SPLICE_DATA);
mongoDBCollection = mongoDataStore.getCollection(EtlCommons.SPLICE_SCORE_DATA);
}

public CellBaseDataResult<SpliceScore> query(String chromosome, int position, String reference) {
List<Bson> andBsonList = new ArrayList<>();
andBsonList.add(Filters.eq("chromosome", chromosome));
andBsonList.add(Filters.eq("position", position));
andBsonList.add(Filters.eq("reference", reference));
andBsonList.add(Filters.eq("refAllele", reference));
Bson query = Filters.and(andBsonList);
return new CellBaseDataResult<>(mongoDBCollection.find(query, null, SpliceScore.class, new QueryOptions()));

Expand All @@ -58,14 +58,14 @@ public CellBaseDataResult<SpliceScore> getScores(String chromosome, int position
List<Bson> andBsonList = new ArrayList<>();
andBsonList.add(Filters.eq("chromosome", chromosome));
andBsonList.add(Filters.eq("position", position));
andBsonList.add(Filters.eq("reference", reference));
andBsonList.add(Filters.eq("refAllele", reference));
Bson query = Filters.and(andBsonList);

final String id = chromosome + ":" + position + ":" + reference + ":" + alternate;

DataResult<SpliceScore> spliceScoreDataResult = mongoDBCollection.find(query, null, SpliceScore.class, new QueryOptions());

// Search for the right aa change
// Search for the right splice score
if (spliceScoreDataResult.getNumResults() > 0) {
for (SpliceScore score : spliceScoreDataResult.getResults()) {
for (SpliceScoreAlternate scoreAlternate : score.getAlternates()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -265,8 +265,8 @@ private String getCollectionName(String data) throws LoaderException {
case "ontology":
collection = "ontology";
break;
case "splice":
collection = "splice";
case "splice_score":
collection = "splice_score";
break;
default:
throw new LoaderException("Unknown data to load: '" + data + "'");
Expand Down Expand Up @@ -298,7 +298,7 @@ private void getChunkSizes() {
case "repeats":
chunkSizes = new int[]{MongoDBCollectionConfiguration.REPEATS_CHUNK_SIZE};
break;
case "splice":
case "splice_score":
chunkSizes = new int[]{MongoDBCollectionConfiguration.SPLICE_CHUNK_SIZE};
break;
default:
Expand Down Expand Up @@ -700,8 +700,11 @@ private void addChunkId(Document document) {
if (chunkSizes != null && chunkSizes.length > 0) {
List<String> chunkIds = new ArrayList<>();
for (int chunkSize : chunkSizes) {
int chunkStart = (Integer) document.get("start") / chunkSize;
int chunkEnd = (Integer) document.get("end") / chunkSize;
int start = document.get("position") != null ? (Integer) document.get("position") : (Integer) document.get("start");
int end = document.get("position") != null ? (Integer) document.get("position") : (Integer) document.get("end");

int chunkStart = start / chunkSize;
int chunkEnd = end / chunkSize;
String chunkIdSuffix = chunkSize / 1000 + "k";
for (int i = chunkStart; i <= chunkEnd; i++) {
if (document.containsKey("chromosome")) {
Expand Down Expand Up @@ -781,8 +784,8 @@ private Path getIndexFilePath(String data) throws LoaderException {
case "repeats":
indexFileName = "repeat-indexes.js";
break;
case "splice":
indexFileName = "splice-indexes.js";
case "splice_score":
indexFileName = "splice_score-indexes.js";
break;
default:
break;
Expand Down
Loading

0 comments on commit 8497984

Please sign in to comment.