diff --git a/README.md b/README.md index 938aa83..d83da22 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Public ENA Assembly uploader +# ENA Assembly uploader Upload of metagenome and metatranscriptome assemblies to the [European Nucleotide Archive (ENA)](https://www.ebi.ac.uk/ena) Pre-requisites: @@ -25,12 +25,13 @@ Install the package: pip install assembly-uploader ``` -## Register study and generate pre-upload files +## Usage +### From the command line +#### Register study and generate pre-upload files **If you already have a registered study accession for your assembly files skip to step 3.** -### Step 1 - +#### Step 1: generate XML files for a new assembly study submission This step will generate a folder STUDY_upload and a project XML and submission XML within it: ```bash @@ -45,7 +46,7 @@ study_xmls pubmed ID for connected publication if available ``` -### Step 2 +#### Step 2: submit the new assembly study to ENA This step submit the XML to ENA and generate a new assembly study accession. Keep note of the newly generated study accession: @@ -55,8 +56,7 @@ submit_study --test run test submission only ``` - -### Step 3 +#### Step 3: make a manifest file for each assembly This step will generate manifest files in the folder STUDY_UPLOAD for runs specified in the metadata file: @@ -69,7 +69,7 @@ assembly_manifest --force overwrite all existing manifests ``` -## Upload assemblies +#### Step 4: upload assemblies Once manifest files are generated, it is necessary to use ENA's webin-cli resource to upload genomes. @@ -85,4 +85,58 @@ ena-webin-cli \ -submit ``` -More information on ENA's webin-cli can be found [here](). +More information on ENA's webin-cli can be found [in the ENA docs](https://ena-docs.readthedocs.io/en/latest/submit/general-guide/webin-cli.html). + +### From a Python script +This `assembly_uploader` can also be used as a Python library, so that you can integrate the steps into another Python workflow or tool. 
+ +```python +from pathlib import Path + +from assembly_uploader.study_xmls import StudyXMLGenerator, METAGENOME +from assembly_uploader.submit_study import submit_study +from assembly_uploader.assembly_manifest import AssemblyManifestGenerator + +# Generate new assembly study XML files +StudyXMLGenerator( + study="SRP272267", + center_name="EMG", + library=METAGENOME, + tpa=True, + output_dir=Path("my-study"), +).write() + +# Submit new assembly study to ENA +new_study_accession = submit_study("SRP272267", is_test=True, directory=Path("my-study") / "SRP272267_upload") +print(f"My assembly study has the accession {new_study_accession}") + +# Create manifest files for the assemblies to be uploaded +# This assumes you have a CSV file detailing the assemblies with their assembler and coverage metadata +# see tests/fixtures/test_metadata for an example +AssemblyManifestGenerator( + study="SRP272267", + assembly_study=new_study_accession, + assemblies_csv=Path("/path/to/my/assemblies.csv"), + output_dir=Path("my-study"), +).write() +``` + +The ENA submission requires `webin-cli`, so follow [Step 4](#step-4-upload-assemblies) above. +(You could still call this from Python, e.g. with `subprocess.Popen`.) + +## Development setup +Prerequisites: a functioning conda or pixi installation. 
+ +To install the assembly uploader codebase in "editable" mode: + +```bash +conda env create -f requirements.yml +conda activate assemblyuploader +pip install -e '.[dev,test]' +pre-commit install +``` + +### Testing +``` +pytest +``` diff --git a/assembly_uploader/assembly_manifest.py b/assembly_uploader/assembly_manifest.py index 20a3a31..fe153aa 100644 --- a/assembly_uploader/assembly_manifest.py +++ b/assembly_uploader/assembly_manifest.py @@ -20,6 +20,7 @@ import logging import os import sys +from pathlib import Path from .ena_queries import EnaQuery @@ -64,21 +65,31 @@ def parse_args(argv): return parser.parse_args(argv) -class AssemblyManifest: - def __init__(self, argv=sys.argv[1:]): - self.args = parse_args(argv) - self.study = self.args.study - self.metadata = parse_info(self.args.data) - self.new_project = self.args.assembly_study - if self.args.output_dir: - self.upload_dir = os.path.join(self.args.output_dir, f"{self.study}_upload") - else: - self.upload_dir = os.path.join(os.getcwd(), f"{self.study}_upload") - if not os.path.exists(self.upload_dir): - os.mkdir(self.upload_dir) - self.force = self.args.force - if not os.path.exists(self.upload_dir): - os.makedirs(self.upload_dir) +class AssemblyManifestGenerator: + def __init__( + self, + study: str, + assembly_study: str, + assemblies_csv: Path, + output_dir: Path = None, + force: bool = False, + ): + """ + Create an assembly manifest file for uploading assemblies detailed in assemblies_csv into the assembly_study. + :param study: study accession of the raw reads study + :param assembly_study: study accession of the assembly study (e.g. 
created by Study XMLs) + :param assemblies_csv: path to assemblies CSV file, listing run_id, coverage, assembler, version, filepath of each assembly + :param output_dir: path to output directory, otherwise CWD + :param force: overwrite existing manifests + """ + self.study = study + self.metadata = parse_info(assemblies_csv) + self.new_project = assembly_study + + self.upload_dir = (output_dir or Path(".")) / Path(f"{self.study}_upload") + self.upload_dir.mkdir(exist_ok=True, parents=True) + + self.force = force def generate_manifest( self, @@ -146,9 +157,19 @@ def write_manifests(self): row["Filepath"], ) + # alias for convenience + write = write_manifests + def main(): - gen_manifest = AssemblyManifest() + args = parse_args(sys.argv[1:]) + + gen_manifest = AssemblyManifestGenerator( + study=args.study, + assembly_study=args.assembly_study, + assemblies_csv=args.data, + force=args.force, + ) gen_manifest.write_manifests() logging.info("Completed") diff --git a/assembly_uploader/study_xmls.py b/assembly_uploader/study_xmls.py index 62852c1..e045d29 100644 --- a/assembly_uploader/study_xmls.py +++ b/assembly_uploader/study_xmls.py @@ -15,20 +15,28 @@ # limitations under the License. import argparse -import os import sys import xml.dom.minidom as minidom import xml.etree.ElementTree as ET from datetime import datetime +from pathlib import Path from .ena_queries import EnaQuery +METAGENOME = "metagenome" +METATRANSCRIPTOME = "metatranscriptome" + def parse_args(argv): parser = argparse.ArgumentParser(description="Study XML generation") parser.add_argument("--study", help="raw reads study ID", required=True) - parser.add_argument("--library", help="metagenome or metatranscriptome") - parser.add_argument("--center", help="center for upload e.g. EMG") + parser.add_argument( + "--library", + help="Library ", + choices=["metagenome", "metatranscriptome"], + required=True, + ) + parser.add_argument("--center", help="center for upload e.g. 
EMG", required=True) parser.add_argument( "--hold", help="hold date (private) if it should be different from the provided study in " @@ -45,48 +53,79 @@ def parse_args(argv): parser.add_argument( "--publication", help="pubmed ID for connected publication if available", + type=int, required=False, ) parser.add_argument("--output-dir", help="Path to output directory", required=False) return parser.parse_args(argv) -class RegisterStudy: - def __init__(self, argv=sys.argv[1:]): - self.args = parse_args(argv) - self.study = self.args.study - if self.args.output_dir: - self.upload_dir = os.path.join(self.args.output_dir, f"{self.study}_upload") - else: - self.upload_dir = os.path.join(os.getcwd(), f"{self.study}_upload") - self.study_xml_path = os.path.join(self.upload_dir, f"{self.study}_reg.xml") - self.submission_xml_path = os.path.join( - self.upload_dir, f"{self.study}_submission.xml" +class StudyXMLGenerator: + def __init__( + self, + study: str, + center_name: str, + library: str, + hold_date: datetime = None, + tpa: bool = False, + output_dir: Path = None, + publication: int = None, + ): + f""" + Build submission files for an assembly study. + + :param study: raw reads study ID/accession + :param center_name: submission centre name, e.g. EMG + :param library: {METAGENOME} or {METATRANSCRIPTOME} + :param hold_date: hold date for the data to remain private, if it should be different from the provided study" + :param tpa: is this a third-party assembly? 
+ :param output_dir: path to output directory (default is CWD) + :param publication: pubmed ID for connected publication if available + :return: StudyXMLGenerator object + """ + self.study = study + + self.upload_dir = (output_dir or Path(".")) / Path(f"{self.study}_upload") + self.upload_dir = self.upload_dir.absolute() + self.upload_dir.mkdir(parents=True, exist_ok=True) + + self.study_xml_path = self.upload_dir / Path(f"{self.study}_reg.xml") + self.submission_xml_path = self.upload_dir / Path( + f"{self.study}_submission.xml" ) - self.center = self.args.center - self.hold = self.args.hold + + self.center = center_name + self.hold_date = hold_date + + assert library in [METAGENOME, METATRANSCRIPTOME] + + self.library = library + self.tpa = tpa + self.publication = publication ena_query = EnaQuery(self.study) self.study_obj = ena_query.build_query() - if not os.path.exists(self.upload_dir): - os.makedirs(self.upload_dir) + self._title = None + self._abstract = None def write_study_xml(self): - subtitle = self.args.library.lower() - if self.args.tpa: + subtitle = self.library.title() + if self.tpa: sub_abstract = "Third Party Annotation (TPA) " else: sub_abstract = "" title = ( f"{subtitle} assembly of {self.study_obj['study_accession']} data " - f"set ({self.study_obj['study_title']})." + f"set ({self.study_obj['study_title']})" ) + self._title = title abstract = ( f"The {sub_abstract}assembly was derived from the primary data " - f"set {self.study_obj['study_accession']}." 
+ f"set {self.study_obj['study_accession']}" ) + self._abstract = abstract project_alias = self.study_obj["study_accession"] + "_assembly" with open(self.study_xml_path, "wb") as study_file: @@ -103,16 +142,16 @@ def write_study_xml(self): ET.SubElement(submission_project, "SEQUENCING_PROJECT") # publication links - if self.args.publication: + if self.publication: project_links = ET.SubElement(project, "PROJECT_LINKS") project_link = ET.SubElement(project_links, "PROJECT_LINK") xref_link = ET.SubElement(project_link, "XREF_LINK") ET.SubElement(xref_link, "DB").text = "PUBMED" - ET.SubElement(xref_link, "ID").text = self.args.publication + ET.SubElement(xref_link, "ID").text = self.publication # project attributes: TPA and assembly type project_attributes = ET.SubElement(project, "PROJECT_ATTRIBUTES") - if self.args.tpa: + if self.tpa: project_attribute_tpa = ET.SubElement( project_attributes, "PROJECT_ATTRIBUTE" ) @@ -124,7 +163,7 @@ def write_study_xml(self): ) ET.SubElement(project_attribute_type, "TAG").text = "new_study_type" ET.SubElement(project_attribute_type, "VALUE").text = ( - f"{self.args.library} assembly" + f"{self.library} assembly" ) dom = minidom.parseString(ET.tostring(project_set, encoding="utf-8")) @@ -143,11 +182,11 @@ def write_submission_xml(self): # attributes: function and hold date public = self.study_obj["first_public"] today = datetime.today().strftime("%Y-%m-%d") - if self.hold: + if self.hold_date: action_hold = ET.SubElement(actions, "ACTION") hold = ET.SubElement(action_hold, "HOLD") - hold.set("HoldUntilDate", self.hold) - elif public > today and not self.hold: + hold.set("HoldUntilDate", self.hold_date.strftime("%d-%m-%Y")) + elif public > today and not self.hold_date: action_hold = ET.SubElement(actions, "ACTION") hold = ET.SubElement(action_hold, "HOLD") hold.set("HoldUntilDate", public) @@ -155,9 +194,25 @@ def write_submission_xml(self): dom = minidom.parseString(ET.tostring(submission, encoding="utf-8")) 
submission_file.write(dom.toprettyxml().encode("utf-8")) + def write(self): + """ + Write registration and submission XML files. + """ + self.write_study_xml() + self.write_submission_xml() + def main(): - study_reg = RegisterStudy() + args = parse_args(sys.argv[1:]) + study_reg = StudyXMLGenerator( + study=args.study, + center_name=args.center, + library=args.library, + hold_date=args.hold, + tpa=args.tpa, + output_dir=Path(args.output_dir) if args.output_dir else None, + publication=args.publication, + ) study_reg.write_study_xml() study_reg.write_submission_xml() diff --git a/assembly_uploader/submit_study.py b/assembly_uploader/submit_study.py index 1641295..983c173 100644 --- a/assembly_uploader/submit_study.py +++ b/assembly_uploader/submit_study.py @@ -16,16 +16,19 @@ import argparse import logging -import os import re import xml.etree.ElementTree as ET +from pathlib import Path import requests +from assembly_uploader.webin_utils import ( + ensure_webin_credentials_exist, + get_webin_credentials, +) + logging.basicConfig(level=logging.INFO) -ENA_WEBIN = os.environ.get("ENA_WEBIN") -ENA_WEBIN_PASSWORD = os.environ.get("ENA_WEBIN_PASSWORD") DROPBOX_DEV = "https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit" DROPBOX_PROD = "https://www.ebi.ac.uk/ena/submit/drop-box/submit/" @@ -53,15 +56,14 @@ def parse_success_study_acc(report): return new_acc[0] -def project_submission(study_id, test=False, directory=None): - endpoint = DROPBOX_DEV if test else DROPBOX_PROD +def submit_study(study_id: str, is_test: bool = False, directory: Path = None): + endpoint = DROPBOX_DEV if is_test else DROPBOX_PROD logging.info(f"Submitting study xml {study_id}") - if directory: - workdir = directory - else: - workdir = os.path.join(os.getcwd(), f"{study_id}_upload") - submission_xml = os.path.join(workdir, f"{study_id}_submission.xml") - study_xml = os.path.join(workdir, f"{study_id}_reg.xml") + workdir = directory or Path.cwd() / Path(f"{study_id}_upload") + assert workdir.exists() 
+ + submission_xml = workdir / Path(f"{study_id}_submission.xml") + study_xml = workdir / Path(f"{study_id}_reg.xml") files = { "SUBMISSION": open(submission_xml, "rb"), "ACTION": (None, "ADD"), @@ -69,7 +71,7 @@ def project_submission(study_id, test=False, directory=None): } submission_report = requests.post( - endpoint, files=files, auth=(ENA_WEBIN, ENA_WEBIN_PASSWORD) + endpoint, files=files, auth=get_webin_credentials() ) receipt_xml_str = submission_report.content.decode("utf-8") @@ -113,12 +115,9 @@ def main(): ) args = parser.parse_args() - if "ENA_WEBIN_PASSWORD" not in os.environ: - raise Exception("The variable ENA_WEBIN_PASSWORD is missing from the env.") - if "ENA_WEBIN" not in os.environ: - raise Exception("The variable ENA_WEBIN is missing from the env") + ensure_webin_credentials_exist() - project_submission(args.study, args.test, args.directory) + submit_study(args.study, args.test, Path(args.directory)) if __name__ == "__main__": diff --git a/assembly_uploader/webin_utils.py b/assembly_uploader/webin_utils.py new file mode 100644 index 0000000..3b444b9 --- /dev/null +++ b/assembly_uploader/webin_utils.py @@ -0,0 +1,18 @@ +import os + +ENA_WEBIN = "ENA_WEBIN" +ENA_WEBIN_PASSWORD = "ENA_WEBIN_PASSWORD" + + +def ensure_webin_credentials_exist(): + if ENA_WEBIN not in os.environ: + raise Exception(f"The variable {ENA_WEBIN} is missing from the env.") + if ENA_WEBIN_PASSWORD not in os.environ: + raise Exception(f"The variable {ENA_WEBIN_PASSWORD} is missing from the env") + + +def get_webin_credentials(): + ensure_webin_credentials_exist() + webin = os.environ.get(ENA_WEBIN) + password = os.environ.get(ENA_WEBIN_PASSWORD) + return webin, password diff --git a/pyproject.toml b/pyproject.toml index 464ec92..7e6dd9f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,7 @@ test = [ "pytest==8.2.2", "pytest-md==0.2.0", "pytest-workflow==2.1.0", + "pytest-responses==0.5.1" ] [tool.isort] diff --git a/pytest.ini b/pytest.ini index fd3dd05..9a4cb0f 
100644 --- a/pytest.ini +++ b/pytest.ini @@ -2,3 +2,4 @@ [pytest] testpaths = tests required_plugins = pytest-workflow +addopts = --git-aware diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..d3a571a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,46 @@ +from pathlib import Path + +import pytest + + +@pytest.fixture(scope="module") +def study_submission_xml_dir(): + return Path(__file__).resolve().parent / Path("fixtures/SRP272267_upload") + + +@pytest.fixture(scope="module") +def study_reg_xml(study_submission_xml_dir): + return study_submission_xml_dir / Path("SRP272267_reg.xml") + + +@pytest.fixture(scope="module") +def study_reg_xml_content(study_reg_xml): + with study_reg_xml.open() as f: + return f.readlines() + + +@pytest.fixture(scope="module") +def study_submission_xml(study_submission_xml_dir): + return study_submission_xml_dir / Path("SRP272267_submission.xml") + + +@pytest.fixture(scope="module") +def study_submission_xml_content(study_submission_xml): + with study_submission_xml.open() as f: + return f.readlines() + + +@pytest.fixture(scope="module") +def assemblies_metadata(): + return Path(__file__).resolve().parent / Path("fixtures/test_metadata") + + +@pytest.fixture(scope="module") +def run_manifest(study_submission_xml_dir): + return study_submission_xml_dir / Path("SRR12240187.manifest") + + +@pytest.fixture(scope="module") +def run_manifest_content(run_manifest): + with run_manifest.open() as f: + return f.readlines() diff --git a/tests/fixtures/SRP272267_upload/SRP272267_reg.xml b/tests/fixtures/SRP272267_upload/SRP272267_reg.xml index c2d2d65..cf18f29 100644 --- a/tests/fixtures/SRP272267_upload/SRP272267_reg.xml +++ b/tests/fixtures/SRP272267_upload/SRP272267_reg.xml @@ -1,8 +1,8 @@ - metagenome assembly of PRJNA646656 data set (Metagenomic data reveal diverse fungal and algal communities associated with the lichen symbiosis). 
- The Third Party Annotation (TPA) assembly was derived from the primary data set PRJNA646656. + Metagenome assembly of PRJNA646656 data set (Metagenomic data reveal diverse fungal and algal communities associated with the lichen symbiosis) + The Third Party Annotation (TPA) assembly was derived from the primary data set PRJNA646656 diff --git a/tests/unit/test_assembly_manifest.py b/tests/unit/test_assembly_manifest.py new file mode 100644 index 0000000..d45174a --- /dev/null +++ b/tests/unit/test_assembly_manifest.py @@ -0,0 +1,33 @@ +from pathlib import Path + +import responses + +from assembly_uploader.assembly_manifest import AssemblyManifestGenerator + + +def test_assembly_manifest(assemblies_metadata, tmp_path, run_manifest_content): + responses.add( + responses.POST, + "https://www.ebi.ac.uk/ena/portal/api/v2.0/search", + json=[ + { + "run_accession": "SRR12240187", + "sample_accession": "SAMN15548970", + "instrument_model": "Illumina HiSeq 2500", + "instrument_platform": "ILLUMINA", + } + ], + ) + assembly_manifest_gen = AssemblyManifestGenerator( + study="SRP272267", + assembly_study="PRJ1", + assemblies_csv=assemblies_metadata, + output_dir=tmp_path, + ) + assembly_manifest_gen.write_manifests() + + manifest_file = tmp_path / Path("SRP272267_upload/SRR12240187.manifest") + assert manifest_file.exists() + + with manifest_file.open() as f: + assert f.readlines() == run_manifest_content diff --git a/tests/unit/test_study_xmls.py b/tests/unit/test_study_xmls.py new file mode 100644 index 0000000..20badff --- /dev/null +++ b/tests/unit/test_study_xmls.py @@ -0,0 +1,47 @@ +import responses + +from assembly_uploader import study_xmls + + +def test_study_xmls(tmp_path, study_reg_xml_content, study_submission_xml_content): + ena_api = responses.add( + responses.POST, + "https://www.ebi.ac.uk/ena/portal/api/v2.0/search", + json=[ + { + "study_accession": "PRJNA646656", + "study_title": "Metagenomic data reveal diverse fungal and algal communities associated with 
the lichen symbiosis", + "study_description": "short, metagnomic reads from lichen thalli", + "first_public": "2020-07-18", + } + ], + ) + study_reg = study_xmls.StudyXMLGenerator( + study="SRP272267", + center_name="EMG", + library=study_xmls.METAGENOME, + tpa=True, + output_dir=tmp_path, + ) + assert ena_api.call_count == 1 + + study_reg.write_study_xml() + assert ( + study_reg._title + == "Metagenome assembly of PRJNA646656 data set (Metagenomic data reveal diverse fungal and algal communities associated with the lichen symbiosis)" + ) + + assert study_reg.study_xml_path.is_relative_to(tmp_path) + assert study_reg.study_xml_path.is_file() + + with study_reg.study_xml_path.open() as f: + content = f.readlines() + assert content == study_reg_xml_content + + study_reg.write_submission_xml() + assert study_reg.submission_xml_path.is_relative_to(tmp_path) + assert study_reg.submission_xml_path.is_file() + + with study_reg.submission_xml_path.open() as f: + content = f.readlines() + assert content == study_submission_xml_content diff --git a/tests/unit/test_study_xmls.yml b/tests/unit/test_study_xmls.yml index 874831c..93e119f 100644 --- a/tests/unit/test_study_xmls.yml +++ b/tests/unit/test_study_xmls.yml @@ -6,4 +6,4 @@ - path: "SRP272267_upload/SRP272267_submission.xml" md5sum: bdae52720209d2e70e2911c650a64901 - path: "SRP272267_upload/SRP272267_reg.xml" - md5sum: d2c815fd6ad33847465e55ca5945f0ba + md5sum: 1b818a33b786fc9491759f03814b08bf diff --git a/tests/unit/test_submit_study.py b/tests/unit/test_submit_study.py new file mode 100644 index 0000000..f3df91f --- /dev/null +++ b/tests/unit/test_submit_study.py @@ -0,0 +1,35 @@ +import pytest +import responses + +from assembly_uploader.submit_study import submit_study +from assembly_uploader.webin_utils import ( + ENA_WEBIN, + ENA_WEBIN_PASSWORD, + ensure_webin_credentials_exist, +) + + +def test_submit_study(study_submission_xml_dir, monkeypatch): + ena_dropbox = responses.add( + responses.POST, + 
"https://wwwdev.ebi.ac.uk/ena/submit/drop-box/submit", + body=""" + This is a long receipt from the dropbox. + success="true" + Your new study has accession="PRJEA1" + """, + ) + + with pytest.raises(Exception): + ensure_webin_credentials_exist() + + monkeypatch.setenv(ENA_WEBIN, "fake-webin-999") + monkeypatch.setenv(ENA_WEBIN_PASSWORD, "fakewebinpw") + + ensure_webin_credentials_exist() + + new_study = submit_study( + "SRP272267", is_test=True, directory=study_submission_xml_dir + ) + assert ena_dropbox.call_count == 1 + assert new_study == "PRJEA1"