Merge pull request #3 from Arcadia-Science/ter/cleavage
Add cleavage peptide prediction via NLPPrecursor and DeepPeptide
taylorreiter authored Feb 15, 2024
2 parents 1fdaabb + 893941d commit ac6b76f
Showing 7 changed files with 405 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -8,6 +8,7 @@ envs/src/**
/demo/output/
/demo/input/
/logs
cloned_repositories/

# Byte-compiled / optimized / DLL files
__pycache__/
168 changes: 156 additions & 12 deletions Snakefile
@@ -5,9 +5,14 @@ from pathlib import Path
## Configuration
################################################################################

# Retrieves the absolute path of the directory snakemake is launched in.
# Used by DeepPeptide to simplify output file paths.
WORKING_DIRPATH = Path(os.getcwd())


# Default pipeline configuration parameters are in the config file.
# If you create a new yml file and use the --configfile flag, options in that new file overwrite the defaults.
# If you create a new yml file and use the --configfile flag,
# options in that new file overwrite the defaults.
configfile: "./config.yml"
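# For example, a run that overrides the defaults with a custom file
# (hypothetical file name) might look like:
#   snakemake --configfile custom_config.yml --software-deployment-method conda -j 8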


@@ -41,7 +46,9 @@ rule filter_nt_contigs_to_short:
"""


# TER TODO: Add a rule for sORF prediction, either once smallesm is developed, when there is an accurate sORF rnasamba model, or using another tool from Singh & Roy.
# TER TODO: Add a rule for sORF prediction, either once smallesm is developed,
# when there is an accurate sORF rnasamba model,
# or using another tool from Singh & Roy.


rule filter_nt_contigs_to_long:
@@ -59,8 +66,10 @@ rule filter_nt_contigs_to_long:

rule get_coding_contig_names:
"""
Extract amino acid contig names and remove everything after the first period, which are isoform labels.
This file will be used to select all contigs that DO NOT encode ORFs, according to transdecoder.
Extract amino acid contig names and remove everything after the first period,
which are isoform labels.
This file will be used to select all contigs that DO NOT encode ORFs,
according to transdecoder.
"""
input:
ORFS_AMINO_ACIDS,
@@ -77,8 +86,9 @@ rule get_coding_contig_names:
rule filter_long_contigs_to_no_predicted_ORF:
"""
Many of the contigs in the full transcriptome have predicted ORFs.
The names of these contigs are recorded in the transdecoder input files (*pep and *cds, orfs_*).
By definition, these contigs are not noncoding RNAs, so they don't need to be considered for classification as long noncoding RNAs (lncRNA).
The names of these contigs are recorded in the transdecoder input files (*pep & *cds, orfs_*).
By definition, these contigs are not noncoding RNAs,
so they don't need to be considered for classification as long noncoding RNAs (lncRNA).
This step removes the contigs that contain ORFs.
"""
input:
@@ -97,7 +107,8 @@ rule filter_long_contigs_to_no_predicted_ORF:
rule download_rnasamba_model:
"""
Placeholder rule.
For now, the workflow uses the model output by build_rnasamba_euk_model.snakefile, which is available locally from running it.
For now, the workflow uses the model output by build_rnasamba_euk_model.snakefile,
which is available locally from running it.
"""
output:
model=OUTPUT_DIR / "models/rnasamba/build/3_model/eu_rnasamba.hdf5",
@@ -109,9 +120,11 @@ rule download_rnasamba_model:

rule pip_install_rnasamba_no_deps:
"""
To take advantage of nvidia GPU on AWS instance "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122" (ami-07eb000b3340966b0),
To take advantage of the Nvidia GPU on an AWS instance
("Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122", ami-07eb000b3340966b0),
we need to install specific versions of tensorflow and other dependencies.
This is accomplished in part in the envs/rnasamba.yml file, however rnasamba itself is not installed there because we need to use the command:
This is accomplished in part in the envs/rnasamba.yml file;
however, rnasamba itself is not installed there because we need to use the command:
pip install --no-deps rnasamba
and there is no way to specify the "--no-deps" flag in a yaml file.
This rule installs rnasamba into the conda-generated environment.
@@ -132,10 +145,11 @@

rule rnasamba:
"""
The eu_rnasamba.hdf5 model is only accurate on longer contigs.
The eukaryote_rnasamba.hdf5 model is only accurate on longer contigs.
It assesses whether they are long noncoding RNAs.
However, lncRNAs often have sORFs that encode peptides.
This rule runs RNAsamba on longer contigs (>300nt) that were not predicted by transdecoder to contain ORFs.
This rule runs RNAsamba on longer contigs (>300nt) that were not predicted by transdecoder to
contain ORFs.
"""
input:
# TER TODO: update path when model is downloaded
@@ -155,6 +169,124 @@

## TER TODO: predict sORFs from lncRNAs

################################################################################
## cleavage prediction
################################################################################


rule remove_stop_codon_asterisk_from_transdecoder_ORFs:
input:
ORFS_AMINO_ACIDS,
output:
faa=OUTPUT_DIR / "cleavage/preprocessing/noasterisk.faa",
shell:
"""
sed '/^[^>]/s/\*//g' {input} > {output}
"""


# Ribosomally synthesized and post-translationally modified peptide prediction


rule download_nlpprecursor_models:
output:
tar=INPUT_DIR / "models/nlpprecursor/nlpprecursor_models.tar.gz",
model=INPUT_DIR / "models/nlpprecursor/models/annotation/model.p",
params:
outdir=INPUT_DIR / "models/nlpprecursor",
shell:
"""
curl -JLo {output.tar} https://github.com/magarveylab/NLPPrecursor/releases/download/1.0/nlpprecursor_models.tar.gz
tar xf {output.tar} -C {params.outdir}
"""


rule nlpprecursor:
"""
The nlpprecursor tool is part of [DeepRiPP](https://doi.org/10.1073/pnas.1901493116),
which predicts ribosomally synthesized and post-translationally modified peptides (RiPPs),
a subclass of cleavage peptides.
Unlike many tools in the RiPP prediction space, "NLPPrecursor identifies RiPPs independent of
genomic context and neighboring biosynthetic genes."
Note that the paper suggests, "the precursor cleavage algorithm predicted N-terminal cleavage sites
with 90% accuracy, when considering cleavage points ±5 amino acids from the true prediction
site, a range within which all possible complete chemical structures can be elaborated in
silico by combinatorial structure prediction."
From the DeepRiPP paper supplement:
"Protein sequences of open reading frames are used as input. The output of the model consists
of a classification of each ORF as either a precursor peptide (further subclassified according
to RiPP family), or a non-precursor peptide. A total of 14 classes are identified (n_class)."
"""
input:
faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa,
model=rules.download_nlpprecursor_models.output.model,
output:
tsv=OUTPUT_DIR / "cleavage/nlpprecursor/nlpprecursor_ripp_predictions.tsv",
params:
modelsdir=INPUT_DIR / "models/nlpprecursor/models/",
conda:
"envs/nlpprecursor.yml"
shell:
"""
python scripts/run_nlpprecursor.py {params.modelsdir} {input.faa} {output}
"""


# General cleavage peptide prediction


rule clone_deeppeptide:
output:
src=touch("cloned_repositories/DeepPeptide/deeppeptide_cloned.txt"),
shell:
"""
# only clone the repo if it isn't already present
if [ ! -d cloned_repositories/DeepPeptide ]; then
git clone https://github.com/fteufel/DeepPeptide.git cloned_repositories/DeepPeptide
fi
cd cloned_repositories/DeepPeptide
git checkout 2657f5dca38e6417c65da5913c1974ed932746e3
"""


rule deeppeptide:
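    """
    DeepPeptide predicts peptides that are cleaved out of larger precursor proteins.
    The predictor is run from inside the cloned repository (note the cd in the shell
    directive), so the input FASTA and final output location are addressed through
    the absolute WORKING_DIRPATH defined at the top of this Snakefile.
    """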
input:
src=rules.clone_deeppeptide.output.src,
faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa,
output:
json=OUTPUT_DIR / "cleavage/deeppeptide/peptide_predictions.json",
conda:
"envs/deeppeptide.yml"
params:
outdir=OUTPUT_DIR / "cleavage/deeppeptide",
shell:
"""
cd cloned_repositories/DeepPeptide/predictor && python3 predict.py --fastafile {WORKING_DIRPATH}/{input.faa} --output_dir {params.outdir} --output_fmt json
mv {params.outdir}/* {WORKING_DIRPATH}/{params.outdir}/
"""


rule extract_deeppeptide_sequences:
"""
DeepPeptide outputs a json file of peptide predictions and locations,
but does not output the sequences themselves.
This step parses the JSON file and the protein FASTA from which peptides were predicted.
It outputs the propeptide (full ORF, uncleaved) and the predicted peptide sequence (cleaved)
in FASTA format.
"""
input:
faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa,
json=rules.deeppeptide.output.json,
output:
propeptide=OUTPUT_DIR / "cleavage/deeppeptide/propeptides.faa",
peptide=OUTPUT_DIR / "cleavage/deeppeptide/peptides.faa",
conda:
"envs/biopython.yml"
shell:
"""
python scripts/extract_deeppeptide_sequences.py {input.json} {input.faa} {output.propeptide} {output.peptide}
"""


################################################################################
## Target rule all
@@ -165,12 +297,24 @@ rule all:
default_target: True
input:
rules.rnasamba.output.tsv,
rules.nlpprecursor.output.tsv,
rules.extract_deeppeptide_sequences.output.peptide,


rule sORF:
"""
Defines a target rule for sORF prediction so a user can run only sORF prediction if they desire.
Defines a target rule for sORF prediction so a user can run only sORF prediction.
snakemake sORF --software-deployment-method conda -j 8
"""
input:
rules.rnasamba.output.tsv,


rule cleavage:
"""
Defines a target rule for cleavage prediction so a user can run only cleavage prediction.
snakemake cleavage --software-deployment-method conda -j 8
"""
input:
rules.nlpprecursor.output.tsv,
rules.extract_deeppeptide_sequences.output.peptide,
6 changes: 6 additions & 0 deletions envs/biopython.yml
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- biopython=1.83
13 changes: 13 additions & 0 deletions envs/deeppeptide.yml
@@ -0,0 +1,13 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- matplotlib=3.5.1
- numpy=1.22.3
- tqdm=4.64.0
- pytorch=1.11.0
- tabulate=0.8.9
- pandas=1.4.2
- fair-esm=2.0.0
- seaborn=0.11.2
12 changes: 12 additions & 0 deletions envs/nlpprecursor.yml
@@ -0,0 +1,12 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3.7
- biopython=1.79
- pip=24.0
- pip:
- torch==1.0.0
- git+https://github.com/nishanthmerwin/fastai.git@deepripp_install
- git+https://github.com/magarveylab/nlpprecursor
99 changes: 99 additions & 0 deletions scripts/extract_deeppeptide_sequences.py
@@ -0,0 +1,99 @@
import argparse
import json

from Bio import SeqIO


def read_fasta(fasta_file):
"""Read a FASTA file using BioPython and return a dictionary of sequences."""
sequences = {}
for record in SeqIO.parse(fasta_file, "fasta"):
sequences[record.id] = str(record.seq)
return sequences


def extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file):
"""
Extract protein and peptide sequences based on the provided data dictionary and FASTA file,
then write them to separate output files.
The data object is created from a JSON file output by deeppeptide.
The protein sequences are extracted from the FASTA file using IDs in the data dictionary.
Peptide sequences are then extracted from these protein sequences based on start and end
positions specified for each peptide within the data dictionary.
The extracted protein and peptide sequences are written to separate output files.
Parameters:
- data (dict): A dictionary containing prediction data, where each key is a protein ID and
the associated value is another dictionary with details including peptides' start and end
positions.
- fasta_file (str): The path to a FASTA file containing protein sequences.
This should be the same file used to make the DeepPeptide predictions.
- proteins_output_file (str): The path to the output file where protein sequences will be saved.
Each sequence is written in FASTA format with its ID as the header.
- peptides_output_file (str): The path to the output file where peptide sequences will be saved.
Peptide sequences are also written in FASTA format,
with headers indicating their source transcript ID and their start and end positions within
the protein sequence.
Returns:
None
Raises:
- FileNotFoundError: If the fasta_file does not exist or cannot be read.
- KeyError: If the expected keys are not found in the data dictionary.
Example usage:
extract_peptide_sequences(data={'PREDICTIONS': {'>1': {'peptides': [{'start': 1, 'end': 9}]}}},
fasta_file='path/to/fasta_file.fasta',
proteins_output_file='path/to/proteins_output.fasta',
peptides_output_file='path/to/peptides_output.fasta')
"""
sequences = read_fasta(fasta_file)

with open(proteins_output_file, "w") as proteins_out, open(
peptides_output_file, "w"
) as peptides_out:
for protein_key, protein_info in data["PREDICTIONS"].items():
protein_id = protein_key.split()[0][1:] # Extract the ID part
peptides = protein_info.get("peptides")
if peptides: # Check if there are peptides
protein_sequence = sequences.get(protein_id)
if protein_sequence: # If the protein sequence is found in the FASTA
proteins_out.write(f">{protein_id}\n{protein_sequence}\n")
for peptide in peptides:
start, end = peptide["start"], peptide["end"]
peptide_sequence = protein_sequence[
start - 1 : end
] # Extract peptide sequence
peptides_out.write(
f">{protein_id}_peptide_{start}_{end}\n{peptide_sequence}\n"
)


def main(json_file, fasta_file, proteins_output_file, peptides_output_file):
with open(json_file) as f:
data = json.load(f)

extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Extract peptide sequences from DeepPeptide JSON.")

# Add the arguments
parser.add_argument("json_file", type=str, help="The JSON file output by DeepPeptide.")
parser.add_argument("fasta_file", type=str, help="The protein FASTA file input to DeepPeptide.")
parser.add_argument("proteins_output_file", type=str, help="The output file path for proteins.")
parser.add_argument("peptides_output_file", type=str, help="The output file path for peptides.")

# Execute the parse_args() method
args = parser.parse_args()

main(
args.json_file,
args.fasta_file,
args.proteins_output_file,
args.peptides_output_file,
)