Skip to content

Commit

Permalink
lib: use small chunks to load pharma and pubmed, #TASK-4768, #TASK-4761
Browse files Browse the repository at this point in the history
  • Loading branch information
jtarraga committed Aug 31, 2023
1 parent a8bada3 commit 1a7b19d
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.opencb.cellbase.core.config.CellBaseConfiguration;
import org.opencb.cellbase.core.exception.CellBaseException;
import org.opencb.cellbase.core.models.DataRelease;
import org.opencb.cellbase.lib.EtlCommons;
import org.opencb.cellbase.lib.managers.DataReleaseManager;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -100,7 +101,9 @@ public void load(Path filePath, String data, int dataRelease, String field, Stri
// protein_functional_prediction documents are extremely big: increasing the batch size will probably
// lead to an OutOfMemory error for that collection. The pharmacogenomics and pubmed collections are
// loaded with the same small batch size. The batch size can be much higher for the remaining collections.
if (data.equals(PROTEIN_FUNCTIONAL_PREDICTION)) {
if (data.equals(PROTEIN_FUNCTIONAL_PREDICTION)
|| data.equals(EtlCommons.PHARMACOGENOMICS_DATA)
|| data.equals(EtlCommons.PUBMED_DATA)) {
batchSize = 50;
} else {
batchSize = 200;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
import java.util.List;
import java.util.concurrent.ExecutionException;

import static org.opencb.cellbase.lib.EtlCommons.PHARMACOGENOMICS_DATA;
import static org.opencb.cellbase.lib.EtlCommons.PUBMED_DATA;
import static org.opencb.cellbase.lib.db.MongoDBManager.DBNAME_SEPARATOR;

/**
Expand Down Expand Up @@ -209,10 +211,10 @@ private void downloadAndPopulate() throws IOException, ExecutionException, Class
loadData("clinical_variants", "clinical_variants", baseDir.resolve("clinical_variants.full.json.gz"));

// pharmacogenomics.json.gz
loadData("pharmacogenomics", "pharmacogenomics", baseDir.resolve("pharmacogenomics/pharmacogenomics.json.gz"));
loadData(PHARMACOGENOMICS_DATA, PHARMACOGENOMICS_DATA, baseDir.resolve("pharmacogenomics/pharmacogenomics.json.gz"));

// pubmed.json.gz
loadData("pubmed", "pubmed", baseDir.resolve("pubmed/pubmed.json.gz"));
loadData(PUBMED_DATA, PUBMED_DATA, baseDir.resolve("pubmed/pubmed.json.gz"));

// Clean temporary dir
}
Expand Down

0 comments on commit 1a7b19d

Please sign in to comment.