Skip to content

Commit

Permalink
Resolve conflicts after merging #558
Browse files Browse the repository at this point in the history
  • Loading branch information
jtarraga committed Feb 10, 2022
2 parents 14081ab + 7c48633 commit 27cc1f7
Show file tree
Hide file tree
Showing 24 changed files with 1,072 additions and 285 deletions.
8 changes: 4 additions & 4 deletions cellbase-app/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
</properties>

<dependencies>
<dependency>
<groupId>org.opencb.cellbase</groupId>
<artifactId>cellbase-lib</artifactId>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.opencb.cellbase</groupId>-->
<!-- <artifactId>cellbase-lib</artifactId>-->
<!-- </dependency>-->
<dependency>
<groupId>org.opencb.cellbase</groupId>
<artifactId>cellbase-client</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ public class BuildCommandOptions {

@Parameter(names = {"-d", "--data"}, description = "Comma separated list of data to build: genome, genome_info, "
+ "gene, variation, variation_functional_score, regulation, protein, ppi, conservation, drug, "
+ "clinical_variants, repeats, svs, splice. 'all' builds everything.", required = true, arity = 1)
+ "clinical_variants, repeats, svs, splice_score. 'all' builds everything.", required = true, arity = 1)
public String data;

@Parameter(names = {"-s", "--species"}, description = "Name of the species to be built, valid formats include 'Homo sapiens' or 'hsapiens'", required = false, arity = 1)
Expand Down Expand Up @@ -128,7 +128,7 @@ public class LoadCommandOptions {
public CommonCommandOptions commonOptions = commonCommandOptions;

@Parameter(names = {"-d", "--data"}, description = "Data model type to be loaded: genome, gene, variation, "
+ "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice. 'all' loads everything",
+ "conservation, regulation, protein, clinical_variants, repeats, regulatory_pfm, splice_score. 'all' loads everything",
required = true, arity = 1)
public String data;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
import org.opencb.cellbase.core.serializer.CellBaseFileSerializer;
import org.opencb.cellbase.core.serializer.CellBaseJsonFileSerializer;
import org.opencb.cellbase.core.serializer.CellBaseSerializer;
import org.opencb.cellbase.core.utils.SpeciesUtils;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.cellbase.lib.MongoDBCollectionConfiguration;
import org.opencb.cellbase.core.utils.SpeciesUtils;
import org.opencb.cellbase.lib.builders.*;
import org.opencb.cellbase.lib.builders.clinical.variant.ClinicalVariantBuilder;

Expand Down Expand Up @@ -100,7 +100,9 @@ public void execute() {
}
// <output>/<species>_<assembly>/generated_json
buildFolder = output.resolve(spFolder + "/generated_json");
makeDir(buildFolder);
if (!buildFolder.toFile().exists()) {
makeDir(buildFolder);
}

if (buildCommandOptions.data != null) {
String[] buildOptions;
Expand Down Expand Up @@ -155,7 +157,7 @@ public void execute() {
case EtlCommons.OBO_DATA:
parser = buildObo();
break;
case EtlCommons.SPLICE_DATA:
case EtlCommons.SPLICE_SCORE_DATA:
parser = buildSplice();
break;
default:
Expand Down Expand Up @@ -375,11 +377,20 @@ private Path getFastaReferenceGenome() {
return fastaFile;
}

private CellBaseBuilder buildSplice() {
Path genePath = buildFolder.resolve("gene.json.gz");
Path genomeInfoPath = buildFolder.resolve("genome_info.json");
Path fastaPath = getFastaReferenceGenome();
CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(buildFolder, "splice");
return new SpliceBuilder(genePath, genomeInfoPath, fastaPath, serializer);
/**
 * Prepares the splice score build and returns the builder that performs it.
 *
 * <p>Input is read from {@code <downloadFolder>/splice_score} and output is written to
 * {@code <buildFolder>/splice_score}. The output folder is created when missing, and the MMSplice
 * version file, when present in the input folder, is copied next to the generated files
 * (overwriting any previous copy).
 *
 * @return a {@link SpliceBuilder} reading from the splice score input folder and serializing into
 *         the splice score output folder
 * @throws IOException if the output folder cannot be created or the version file cannot be copied
 */
private CellBaseBuilder buildSplice() throws IOException {
    Path spliceInputFolder = downloadFolder.resolve(EtlCommons.SPLICE_SCORE_DATA);
    Path spliceOutputFolder = buildFolder.resolve(EtlCommons.SPLICE_SCORE_DATA);
    if (!spliceOutputFolder.toFile().exists()) {
        // Unlike File.mkdirs(), whose boolean result was being discarded, this fails loudly
        // when the folder cannot be created instead of deferring the error to a later step.
        Files.createDirectories(spliceOutputFolder);
    }

    // Keep the MMSplice version metadata alongside the generated files.
    Path mmspliceVersionSource = spliceInputFolder.resolve(EtlCommons.MMSPLICE_VERSION_FILENAME);
    if (mmspliceVersionSource.toFile().exists()) {
        Files.copy(mmspliceVersionSource,
                spliceOutputFolder.resolve(EtlCommons.MMSPLICE_VERSION_FILENAME),
                StandardCopyOption.REPLACE_EXISTING);
    }

    CellBaseFileSerializer serializer = new CellBaseJsonFileSerializer(spliceOutputFolder);
    return new SpliceBuilder(spliceInputFolder, serializer);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ public void execute() {
EtlCommons.CONSERVATION_DATA, EtlCommons.REGULATION_DATA, EtlCommons.PROTEIN_DATA,
EtlCommons.PROTEIN_FUNCTIONAL_PREDICTION_DATA, EtlCommons.VARIATION_DATA,
EtlCommons.VARIATION_FUNCTIONAL_SCORE_DATA, EtlCommons.CLINICAL_VARIANTS_DATA, EtlCommons.REPEATS_DATA,
EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_DATA};
EtlCommons.OBO_DATA, EtlCommons.MISSENSE_VARIATION_SCORE_DATA, EtlCommons.SPLICE_SCORE_DATA};
} else {
loadOptions = loadCommandOptions.data.split(",");
}
Expand Down Expand Up @@ -188,9 +188,9 @@ public void execute() {
loadIfExists(input.resolve(EtlCommons.DO_VERSION_FILE), METADATA);
createIndex("ontology");
break;
case EtlCommons.SPLICE_DATA:
loadIfExists(input.resolve("splice.json.gz"), "splice");
createIndex("splice");
case EtlCommons.SPLICE_SCORE_DATA:
loadSpliceScores();
createIndex("splice_score");
break;
default:
logger.warn("Not valid 'data'. We should not reach this point");
Expand Down Expand Up @@ -350,6 +350,35 @@ private void loadRepeats() {
}
}

/**
 * Loads the splice scores of every supported source found under the input folder.
 *
 * <p>MMSplice scores are loaded first from {@code <input>/splice_score/mmsplice}, followed by
 * SpliceAI scores from {@code <input>/splice_score/spliceai}; each source is loaded together with
 * its own version file name.
 */
private void loadSpliceScores() throws NoSuchMethodException, InterruptedException, ExecutionException,
        InstantiationException, IllegalAccessException, InvocationTargetException, ClassNotFoundException,
        IOException {
    logger.info("Loading splice scores from '{}'", input);

    // MMSplice scores
    Path mmspliceFolder = input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.MMSPLICE_SUBDIRECTORY);
    loadSpliceScores(mmspliceFolder, EtlCommons.MMSPLICE_VERSION_FILENAME);

    // SpliceAI scores
    Path spliceaiFolder = input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + EtlCommons.SPLICEAI_SUBDIRECTORY);
    loadSpliceScores(spliceaiFolder, EtlCommons.SPLICEAI_VERSION_FILENAME);
}

/**
 * Loads every splice score file found in the given folder, then the matching version metadata.
 *
 * <p>Only entries whose file name starts with {@code splice_score_} are loaded. Each one is
 * loaded into the splice score collection. The version file is looked up in
 * {@code <input>/splice_score/<versionFilename>} (not inside {@code spliceFolder}) and is loaded
 * into the metadata collection via {@code loadIfExists}.
 *
 * @param spliceFolder    folder containing the {@code splice_score_*} files of one score source
 * @param versionFilename name of the version file of that score source
 * @throws IOException if the folder cannot be listed or a file cannot be read
 */
private void loadSpliceScores(Path spliceFolder, String versionFilename) throws IOException, ExecutionException, InterruptedException,
        ClassNotFoundException, InvocationTargetException, NoSuchMethodException, InstantiationException, IllegalAccessException {
    // A DirectoryStream holds an open directory handle; try-with-resources releases it even when
    // loading one of the files throws.
    try (DirectoryStream<Path> stream = Files.newDirectoryStream(spliceFolder,
            entry -> entry.getFileName().toString().startsWith("splice_score_"))) {
        // Load from JSON files
        for (Path entry : stream) {
            logger.info("Loading file '{}'", entry.toString());
            loadRunner.load(spliceFolder.resolve(entry.getFileName()), EtlCommons.SPLICE_SCORE_DATA);
        }
    }
    loadIfExists(input.resolve(EtlCommons.SPLICE_SCORE_DATA + "/" + versionFilename), METADATA);
}


private void createIndex(String collectionName) {
if (!createIndexes) {
return;
Expand All @@ -358,7 +387,7 @@ private void createIndex(String collectionName) {
try {
indexManager.createMongoDBIndexes(collectionName, true);
} catch (IOException e) {
logger.error("Error creating indexes:" + e.toString());
logger.error("Error creating indexes:" + e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -227,8 +227,4 @@ protected void reset(CellBaseDataResult<Variant> variantCellBaseDataResult) {
variantCellBaseDataResult.setNumMatches(0);
}





}
5 changes: 3 additions & 2 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
version: v5

version: "v5"
apiVersion: "${cellbase.version}"
wiki: https://github.com/opencb/cellbase/wiki
maintenanceFlagFile: "/tmp/maintenance"
Expand Down Expand Up @@ -170,7 +171,7 @@ species:
- regulation
- repeats
- variation_functional_score
- splice
- splice_score
shards:
- collection: "variation"
key:
Expand Down
1 change: 1 addition & 0 deletions cellbase-core/src/test/resources/configuration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ species:
- regulation
- repeats
- variation_functional_score
- splice_score
shards:
- collection: "variation"
key:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ public class EtlCommons {
public static final String PROTEIN_DATA = "protein";
public static final String CONSERVATION_DATA = "conservation";
public static final String CLINICAL_VARIANTS_DATA = "clinical_variants";
public static final String SPLICE_DATA = "splice";
public static final String SPLICE_SCORE_DATA = "splice_score";

public static final String CLINICAL_VARIANTS_FOLDER = "clinicalVariant";
public static final String CLINVAR_XML_FILE = "ClinVarFullRelease_2021-07.xml.gz";
Expand Down Expand Up @@ -87,10 +87,15 @@ public class EtlCommons {

// Path and file names
public static final String GERP_SUBDIRECTORY = "gerp";
public static final String MMSPLICE_SUBDIRECTORY = "mmsplice";
public static final String MMSPLICE_VERSION_FILENAME = "mmspliceVersion.json";
public static final String SPLICEAI_SUBDIRECTORY = "spliceai";
public static final String SPLICEAI_VERSION_FILENAME = "spliceaiVersion.json";

// binary bigwig file
public static final String GERP_FILE = "gerp_conservation_scores.homo_sapiens.GRCh38.bw";
// bigwig file manually transformed to bedGraph file
public static final String GERP_PROCESSED_FILE = "gerp_conservation_scores.homo_sapiens.GRCh38.bedGraph.gz";
public static final String GERP_PROCESSED_FILE = "gerp.bedGraph.gz"; //"gerp_conservation_scores.homo_sapiens.GRCh38.bedGraph.gz";
public static final String CLINICAL_VARIANTS_JSON_FILE = "clinical_variants.json.gz";
public static final String CLINICAL_VARIANTS_ANNOTATED_JSON_FILE = "clinical_variants.full.json.gz";
public static final String DOCM_FILE = "docm.json.gz";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,5 @@ public class MongoDBCollectionConfiguration {
public static final int REGULATORY_REGION_CHUNK_SIZE = 2000;
public static final int VARIATION_FUNCTIONAL_SCORE_CHUNK_SIZE = 1000;
public static final int REPEATS_CHUNK_SIZE = 2000;
public static final int SPLICE_CHUNK_SIZE = 2000;
}
Loading

0 comments on commit 27cc1f7

Please sign in to comment.