EBI-Metagenomics · SandyRogers · Oct 31, 2024 · Oct 3, 2024 · Oct 30, 2024 · Oct 30, 2024
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# Public ENA Assembly uploader
+# ENA Assembly uploader
 Upload of metagenome and metatranscriptome assemblies to the [European Nucleotide Archive (ENA)](https://www.ebi.ac.uk/ena)
 
 Pre-requisites:
@@ -25,12 +25,13 @@ Install the package:
 pip install assembly-uploader
 ```
 
-## Register study and generate pre-upload files
+## Usage
+### From the command line
+#### Register study and generate pre-upload files
 
 **If you already have a registered study accession for your assembly files skip to step 3.**
 
-### Step 1
-
+#### Step 1: generate XML files for a new assembly study submission
 This step will generate a folder STUDY_upload and a project XML and submission XML within it:
 
 ```bash
@@ -45,7 +46,7 @@ study_xmls
                         pubmed ID for connected publication if available
 ```
 
-### Step 2
+#### Step 2: submit the new assembly study to ENA
 
 This step submit the XML to ENA and generate a new assembly study accession. Keep note of the newly generated study accession:
 
@@ -55,8 +56,7 @@ submit_study
   --test                run test submission only
 ```
 
-
-### Step 3
+#### Step 3: make a manifest file for each assembly
 
 This step will generate manifest files in the folder STUDY_UPLOAD for runs specified in the metadata file:
 
@@ -69,7 +69,7 @@ assembly_manifest
   --force               overwrite all existing manifests
 ```
 
-## Upload assemblies
+#### Step 4: upload assemblies
 
 Once manifest files are generated, it is necessary to use ENA's webin-cli resource to upload genomes.
 
@@ -85,4 +85,58 @@ ena-webin-cli \
   -submit
 ```
 
-More information on ENA's webin-cli can be found [here](<https://ena-docs.readthedocs.io/en/latest/submit/general-guide/webin-cli.html>).
+More information on ENA's webin-cli can be found [in the ENA docs](<https://ena-docs.readthedocs.io/en/latest/submit/general-guide/webin-cli.html>).
+
+### From a Python script
+The `assembly_uploader` package can also be used as a Python library, so that you can integrate the steps into another Python workflow or tool.
+
+```python
+from pathlib import Path
+
+from assembly_uploader.study_xmls import StudyXMLGenerator, METAGENOME
+from assembly_uploader.submit_study import submit_study
+from assembly_uploader.assembly_manifest import AssemblyManifestGenerator
+
+# Generate new assembly study XML files
+StudyXMLGenerator(
+    study="SRP272267",
+    center_name="EMG",
+    library=METAGENOME,
+    tpa=True,
+    output_dir=Path("my-study"),
+).write()
+
+# Submit new assembly study to ENA
+new_study_accession = submit_study("SRP272267", is_test=True, directory=Path("my-study"))
+print(f"My assembly study has the accession {new_study_accession}")
+
+# Create manifest files for the assemblies to be uploaded
+# This assumes you have a CSV file detailing the assemblies with their assembler and coverage metadata
+# see tests/fixtures/test_metadata for an example
+AssemblyManifestGenerator(
+    study="SRP272267",
+    assembly_study=new_study_accession,
+    assemblies_csv=Path("/path/to/my/assemblies.csv"),
+    output_dir=Path("my-study"),
+).write()
+```
+
+The ENA submission requires `webin-cli`, so follow [Step 4](#step-4-upload-assemblies) above.
+(You could still call this from Python, e.g. with `subprocess.Popen`.)
+
+## Development setup
+Prerequisites: a functioning conda or pixi installation.
+
+To install the assembly uploader codebase in "editable" mode:
+
+```bash
+conda env create -f requirements.yml
+conda activate assemblyuploader
+pip install -e '.[dev,test]'
+pre-commit install
+```
+
+### Testing
+```
+pytest
+```
diff --git a/assembly_uploader/assembly_manifest.py b/assembly_uploader/assembly_manifest.py
@@ -20,6 +20,7 @@
 import logging
 import os
 import sys
+from pathlib import Path
 
 from .ena_queries import EnaQuery
 
@@ -64,21 +65,31 @@ def parse_args(argv):
     return parser.parse_args(argv)
 
 
-class AssemblyManifest:
-    def __init__(self, argv=sys.argv[1:]):
-        self.args = parse_args(argv)
-        self.study = self.args.study
-        self.metadata = parse_info(self.args.data)
-        self.new_project = self.args.assembly_study
-        if self.args.output_dir:
-            self.upload_dir = os.path.join(self.args.output_dir, f"{self.study}_upload")
-        else:
-            self.upload_dir = os.path.join(os.getcwd(), f"{self.study}_upload")
-        if not os.path.exists(self.upload_dir):
-            os.mkdir(self.upload_dir)
-        self.force = self.args.force
-        if not os.path.exists(self.upload_dir):
-            os.makedirs(self.upload_dir)
+class AssemblyManifestGenerator:
+    def __init__(
+        self,
+        study: str,
+        assembly_study: str,
+        assemblies_csv: Path,
+        output_dir: Path = None,
+        force: bool = False,
+    ):
+        """
+        Prepare manifest generation for the assemblies listed in assemblies_csv, destined for assembly_study.
+        :param study: accession of the raw reads study
+        :param assembly_study: accession of the assembly study (e.g. one registered via the study XMLs)
+        :param assemblies_csv: CSV file giving run_id, coverage, assembler, version and filepath per assembly
+        :param output_dir: directory in which <study>_upload is created; defaults to the CWD
+        :param force: whether existing manifests are overwritten
+        """
+        self.study = study
+        self.new_project = assembly_study
+        self.force = force
+        self.metadata = parse_info(assemblies_csv)
+
+        base_dir = output_dir if output_dir else Path(".")
+        self.upload_dir = base_dir / Path(f"{study}_upload")
+        self.upload_dir.mkdir(parents=True, exist_ok=True)
 
     def generate_manifest(
         self,
@@ -146,9 +157,19 @@ def write_manifests(self):
                 row["Filepath"],
             )
 
+    # alias for convenience
+    write = write_manifests
+
 
 def main():
-    gen_manifest = AssemblyManifest()
+    args = parse_args(sys.argv[1:])  # CLI entry point: build the generator from command-line options
+    gen_manifest = AssemblyManifestGenerator(
+        study=args.study,
+        assembly_study=args.assembly_study,
+        assemblies_csv=args.data,
+        output_dir=Path(args.output_dir) if args.output_dir else None,  # honour --output-dir
+        force=args.force,
+    )
     gen_manifest.write_manifests()
     logging.info("Completed")
 

diff --git a/assembly_uploader/study_xmls.py b/assembly_uploader/study_xmls.py
@@ -15,20 +15,28 @@
 # limitations under the License.
 
 import argparse
-import os
 import sys
 import xml.dom.minidom as minidom
 import xml.etree.ElementTree as ET
 from datetime import datetime
+from pathlib import Path
 
 from .ena_queries import EnaQuery
 
+METAGENOME = "metagenome"
+METATRANSCRIPTOME = "metatranscriptome"
+
 
 def parse_args(argv):
     parser = argparse.ArgumentParser(description="Study XML generation")
     parser.add_argument("--study", help="raw reads study ID", required=True)
-    parser.add_argument("--library", help="metagenome or metatranscriptome")
-    parser.add_argument("--center", help="center for upload e.g. EMG")
+    parser.add_argument(
+        "--library",
+        help="library type of the assembly study",
+        choices=[METAGENOME, METATRANSCRIPTOME],  # same constants StudyXMLGenerator validates against
+        required=True,
+    )
+    parser.add_argument("--center", help="center for upload e.g. EMG", required=True)
     parser.add_argument(
         "--hold",
         help="hold date (private) if it should be different from the provided study in "
@@ -45,48 +53,79 @@ def parse_args(argv):
     parser.add_argument(
         "--publication",
         help="pubmed ID for connected publication if available",
+        type=int,
         required=False,
     )
     parser.add_argument("--output-dir", help="Path to output directory", required=False)
     return parser.parse_args(argv)
 
 
-class RegisterStudy:
-    def __init__(self, argv=sys.argv[1:]):
-        self.args = parse_args(argv)
-        self.study = self.args.study
-        if self.args.output_dir:
-            self.upload_dir = os.path.join(self.args.output_dir, f"{self.study}_upload")
-        else:
-            self.upload_dir = os.path.join(os.getcwd(), f"{self.study}_upload")
-        self.study_xml_path = os.path.join(self.upload_dir, f"{self.study}_reg.xml")
-        self.submission_xml_path = os.path.join(
-            self.upload_dir, f"{self.study}_submission.xml"
+class StudyXMLGenerator:
+    def __init__(
+        self,
+        study: str,
+        center_name: str,
+        library: str,
+        hold_date: datetime = None,
+        tpa: bool = False,
+        output_dir: Path = None,
+        publication: int = None,
+    ):
+        """
+        Build submission files for an assembly study.
+
+        :param study: raw reads study ID/accession
+        :param center_name: submission centre name, e.g. EMG
+        :param library: library type, either METAGENOME ("metagenome") or METATRANSCRIPTOME ("metatranscriptome")
+        :param hold_date: hold date for the data to remain private, if it should be different from the provided study
+        :param tpa: is this a third-party assembly?
+        :param output_dir: path to output directory (default is CWD)
+        :param publication: pubmed ID for connected publication if available
+        :raises ValueError: if library is not one of the supported library types
+        """
+        self.study = study
+
+        self.upload_dir = (output_dir or Path(".")) / Path(f"{self.study}_upload")
+        self.upload_dir = self.upload_dir.absolute()
+        self.upload_dir.mkdir(parents=True, exist_ok=True)
+
+        self.study_xml_path = self.upload_dir / Path(f"{self.study}_reg.xml")
+        self.submission_xml_path = self.upload_dir / Path(
+            f"{self.study}_submission.xml"
         )
-        self.center = self.args.center
-        self.hold = self.args.hold
+
+        self.center = center_name
+        self.hold_date = hold_date
+
+        if library not in (METAGENOME, METATRANSCRIPTOME):  # explicit raise: asserts vanish under -O
+            raise ValueError(f"library must be {METAGENOME!r} or {METATRANSCRIPTOME!r}, not {library!r}")
+        self.library = library
+        self.tpa = tpa
+        self.publication = publication
 
         ena_query = EnaQuery(self.study)
         self.study_obj = ena_query.build_query()
 
-        if not os.path.exists(self.upload_dir):
-            os.makedirs(self.upload_dir)
+        self._title = None
+        self._abstract = None
 
     def write_study_xml(self):
-        subtitle = self.args.library.lower()
-        if self.args.tpa:
+        subtitle = self.library.title()
+        if self.tpa:
             sub_abstract = "Third Party Annotation (TPA) "
         else:
             sub_abstract = ""
 
         title = (
             f"{subtitle} assembly of {self.study_obj['study_accession']} data "
-            f"set ({self.study_obj['study_title']})."
+            f"set ({self.study_obj['study_title']})"
         )
+        self._title = title
         abstract = (
             f"The {sub_abstract}assembly was derived from the primary data "
-            f"set {self.study_obj['study_accession']}."
+            f"set {self.study_obj['study_accession']}"
         )
+        self._abstract = abstract
 
         project_alias = self.study_obj["study_accession"] + "_assembly"
         with open(self.study_xml_path, "wb") as study_file:
@@ -103,16 +142,16 @@ def write_study_xml(self):
             ET.SubElement(submission_project, "SEQUENCING_PROJECT")
 
             # publication links
-            if self.args.publication:
+            if self.publication:
                 project_links = ET.SubElement(project, "PROJECT_LINKS")
                 project_link = ET.SubElement(project_links, "PROJECT_LINK")
                 xref_link = ET.SubElement(project_link, "XREF_LINK")
                 ET.SubElement(xref_link, "DB").text = "PUBMED"
-                ET.SubElement(xref_link, "ID").text = self.args.publication
+                ET.SubElement(xref_link, "ID").text = str(self.publication)  # Element text must be str; publication is an int
 
             # project attributes: TPA and assembly type
             project_attributes = ET.SubElement(project, "PROJECT_ATTRIBUTES")
-            if self.args.tpa:
+            if self.tpa:
                 project_attribute_tpa = ET.SubElement(
                     project_attributes, "PROJECT_ATTRIBUTE"
                 )
@@ -124,7 +163,7 @@ def write_study_xml(self):
             )
             ET.SubElement(project_attribute_type, "TAG").text = "new_study_type"
             ET.SubElement(project_attribute_type, "VALUE").text = (
-                f"{self.args.library} assembly"
+                f"{self.library} assembly"
             )
 
             dom = minidom.parseString(ET.tostring(project_set, encoding="utf-8"))
@@ -143,21 +182,37 @@ def write_submission_xml(self):
             # attributes: function and hold date
             public = self.study_obj["first_public"]
             today = datetime.today().strftime("%Y-%m-%d")
-            if self.hold:
+            if self.hold_date:
                 action_hold = ET.SubElement(actions, "ACTION")
                 hold = ET.SubElement(action_hold, "HOLD")
-                hold.set("HoldUntilDate", self.hold)
-            elif public > today and not self.hold:
+                hold.set("HoldUntilDate", self.hold_date.strftime("%d-%m-%Y"))
+            elif public > today and not self.hold_date:
                 action_hold = ET.SubElement(actions, "ACTION")
                 hold = ET.SubElement(action_hold, "HOLD")
                 hold.set("HoldUntilDate", public)
 
             dom = minidom.parseString(ET.tostring(submission, encoding="utf-8"))
             submission_file.write(dom.toprettyxml().encode("utf-8"))
 
+    def write(self):
+        """
+        Produce the study registration XML, then the submission XML.
+        """
+        for write_step in (self.write_study_xml, self.write_submission_xml):
+            write_step()
+
 
 def main():
-    study_reg = RegisterStudy()
+    args = parse_args(sys.argv[1:])
+    # --hold arrives as text; assumed day-month-year, the format write_submission_xml() emits - TODO confirm
+    hold_date = datetime.strptime(args.hold, "%d-%m-%Y") if args.hold else None
+    study_reg = StudyXMLGenerator(
+        study=args.study, center_name=args.center, library=args.library,
+        hold_date=hold_date,
+        tpa=args.tpa,
+        output_dir=Path(args.output_dir) if args.output_dir else None,
+        publication=args.publication,
+    )
     study_reg.write_study_xml()
     study_reg.write_submission_xml()