Skip to content

Commit

Permalink
download: fix protein interpro and intact data downloader
Browse files (browse the repository at this point in the history)
  • Loading branch information
imedina committed Jan 3, 2024
1 parent 5c83213 commit 531c646
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 19 deletions.
12 changes: 8 additions & 4 deletions cellbase-core/src/main/resources/configuration.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -97,20 +97,24 @@ download:
miRTarBase:
host: https://mirtarbase.cuhk.edu.cn/~miRTarBase/miRTarBase_2022/cache/download/9.0/hsa_MTI.xlsx
version: "9.0"

## Protein Data
uniprot:
host: https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.xml.gz
version: "2023-11-08"
uniprotRelNotes:
host: https://ftp.uniprot.org/pub/databases/uniprot/relnotes.txt
version: "2023-11-08"
intact:
host: ftp://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt
interpro:
# host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/protein2ipr.dat.gz
host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/protein2ipr.dat.gz
version: "2023-11-08"
interproRelNotes:
# host: ftp://ftp.ebi.ac.uk/pub/databases/interpro/current/release_notes.txt
host: https://ftp.ebi.ac.uk/pub/databases/interpro/current_release/release_notes.txt
intact:
host: https://ftp.ebi.ac.uk/pub/databases/intact/current/psimitab/intact.txt
version: "2023-10-07"

## Conservation Scores
conservation:
host: https://hgdownload.cse.ucsc.edu/goldenPath/
version: "2022-08-30"
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -22,7 +22,6 @@
import org.opencb.commons.utils.FileUtils;

import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.file.Files;
Expand All @@ -34,6 +33,8 @@
public class ProteinDownloadManager extends AbstractDownloadManager {

private static final String UNIPROT_NAME = "UniProt";
private static final String INTERPRO_NAME = "InterPro";
private static final String INTACT_NAME = "IntAct";

public ProteinDownloadManager(String species, String assembly, Path targetDirectory, CellBaseConfiguration configuration)
throws IOException, CellBaseException {
Expand All @@ -56,30 +57,33 @@ public List<DownloadFile> download() throws IOException, InterruptedException {
Files.createDirectories(proteinFolder);
List<DownloadFile> downloadFiles = new ArrayList<>();

// Uniprot
String url = configuration.getDownload().getUniprot().getHost();
downloadFiles.add(downloadFile(url, proteinFolder.resolve("uniprot_sprot.xml.gz").toString()));
Files.createDirectories(proteinFolder.resolve("uniprot_chunks"));
splitUniprot(proteinFolder.resolve("uniprot_sprot.xml.gz"), proteinFolder.resolve("uniprot_chunks"));

String relNotesUrl = configuration.getDownload().getUniprotRelNotes().getHost();
downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("uniprotRelnotes.txt").toString()));

saveVersionData(EtlCommons.PROTEIN_DATA, UNIPROT_NAME, getLine(proteinFolder.resolve("uniprotRelnotes.txt"), 1),
getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("uniprotVersion.json"));

return downloadFiles;
// Interpro
String interproUrl = configuration.getDownload().getInterpro().getHost();
downloadFiles.add(downloadFile(interproUrl, proteinFolder.resolve("protein2ipr.dat.gz").toString()));

relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost();
downloadFiles.add(downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString()));
saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5),
getTimeStamp(), Collections.singletonList(interproUrl), proteinFolder.resolve("interproVersion.json"));

// url = configuration.getDownload().getIntact().getHost();
// downloadFile(url, proteinFolder.resolve("intact.txt").toString());
// saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, null, getTimeStamp(), Collections.singletonList(url),
// proteinFolder.resolve("intactVersion.json"));
//
// url = configuration.getDownload().getInterpro().getHost();
// downloadFile(url, proteinFolder.resolve("protein2ipr.dat.gz").toString());
// relNotesUrl = configuration.getDownload().getInterproRelNotes().getHost();
// downloadFile(relNotesUrl, proteinFolder.resolve("interproRelnotes.txt").toString());
// saveVersionData(EtlCommons.PROTEIN_DATA, INTERPRO_NAME, getLine(proteinFolder.resolve("interproRelnotes.txt"), 5),
// getTimeStamp(), Collections.singletonList(url), proteinFolder.resolve("interproVersion.json"));
// Intact
String intactUrl = configuration.getDownload().getIntact().getHost();
downloadFiles.add(downloadFile(intactUrl, proteinFolder.resolve("intact.txt").toString()));
saveVersionData(EtlCommons.PROTEIN_DATA, INTACT_NAME, configuration.getDownload().getIntact().getVersion(),
getTimeStamp(), Collections.singletonList(intactUrl), proteinFolder.resolve("intactVersion.json"));

return downloadFiles;
}

private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOException {
Expand All @@ -96,7 +100,7 @@ private void splitUniprot(Path uniprotFilePath, Path splitOutdirPath) throws IOE
inEntry = true;
beforeEntry = false;
if (count % 10000 == 0) {
pw = new PrintWriter(new FileOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile()));
pw = new PrintWriter(Files.newOutputStream(splitOutdirPath.resolve("chunk_" + chunk + ".xml").toFile().toPath()));
pw.println(header.toString().trim());
}
count++;
Expand Down

0 comments on commit 531c646

Please sign in to comment.