Merge pull request #3 from Arcadia-Science/ter/cleavage
Add cleavage peptide prediction via NLPPrecursor and DeepPeptide
taylorreiter authored Feb 15, 2024
2 parents 1fdaabb + 893941d commit ac6b76f
Showing 7 changed files with 405 additions and 12 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -8,6 +8,7 @@ envs/src/**
/demo/output/
/demo/input/
/logs
cloned_repositories/

# Byte-compiled / optimized / DLL files
__pycache__/
168 changes: 156 additions & 12 deletions Snakefile
@@ -5,9 +5,14 @@ from pathlib import Path
## Configuration
################################################################################

# Retrieves the absolute path of the directory snakemake is launched in.
# Used by DeepPeptide to simplify output file paths.
WORKING_DIRPATH = Path(os.getcwd())


# Default pipeline configuration parameters are in the config file.
# If you create a new yml file and use the --configfile flag, options in that new file overwrite the defaults.
# If you create a new yml file and use the --configfile flag,
# options in that new file overwrite the defaults.
configfile: "./config.yml"
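# For example, a run that overrides the defaults with a custom file
# (hypothetical file name) might look like:
#   snakemake --configfile custom_config.yml --software-deployment-method conda -j 8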


@@ -41,7 +46,9 @@ rule filter_nt_contigs_to_short:
"""


# TER TODO: Add a rule for sORF prediction, either once smallesm is developed, when there is an accurate sORF rnasamba model, or using another tool from Singh & Roy.
# TER TODO: Add a rule for sORF prediction, either once smallesm is developed,
# when there is an accurate sORF rnasamba model,
# or using another tool from Singh & Roy.


rule filter_nt_contigs_to_long:
@@ -59,8 +66,10 @@ rule filter_nt_contigs_to_long:

rule get_coding_contig_names:
"""
Extract amino acid contig names and remove everything after the first period, which are isoform labels.
This file will be used to select all contigs that DO NOT encode ORFs, according to transdecoder.
Extract amino acid contig names and remove everything after the first period,
which are isoform labels.
This file will be used to select all contigs that DO NOT encode ORFs,
according to transdecoder.
"""
input:
ORFS_AMINO_ACIDS,
@@ -77,8 +86,9 @@ rule get_coding_contig_names:
rule filter_long_contigs_to_no_predicted_ORF:
"""
Many of the contigs in the full transcriptome have predicted ORFs.
The names of these contigs are recorded in the transdecoder input files (*pep and *cds, orfs_*).
By definition, these contigs are not noncoding RNAs, so they don't need to be considered for classification as long noncoding RNAs (lncRNA).
The names of these contigs are recorded in the transdecoder input files (*pep & *cds, orfs_*).
By definition, these contigs are not noncoding RNAs,
so they don't need to be considered for classification as long noncoding RNAs (lncRNA).
This step removes the contigs that contain ORFs.
"""
input:
@@ -97,7 +107,8 @@ rule filter_long_contigs_to_no_predicted_ORF:
rule download_rnasamba_model:
"""
Placeholder rule.
For now, the workflow uses the model output by build_rnasamba_euk_model.snakefile, which is available locally from running it.
For now, the workflow uses the model output by build_rnasamba_euk_model.snakefile,
which is available locally from running it.
"""
output:
model=OUTPUT_DIR / "models/rnasamba/build/3_model/eu_rnasamba.hdf5",
@@ -109,9 +120,11 @@ rule download_rnasamba_model:

rule pip_install_rnasamba_no_deps:
"""
To take advantage of nvidia GPU on AWS instance "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122" (ami-07eb000b3340966b0),
To take advantage of the Nvidia GPU on an AWS instance
("Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122", ami-07eb000b3340966b0),
we need to install specific versions of tensorflow and other dependencies.
This is accomplished in part in the envs/rnasamba.yml file, however rnasamba itself is not installed there because we need to use the command:
This is accomplished in part in the envs/rnasamba.yml file;
however, rnasamba itself is not installed there because we need to use the command:
pip install --no-deps rnasamba
and there is no way to specify the "--no-deps" flag in a yaml file.
This rule installs rnasamba into the conda-generated environment.
@@ -132,10 +145,11 @@

rule rnasamba:
"""
The eu_rnasamba.hdf5 model is only accurate on longer contigs.
The eukaryote_rnasamba.hdf5 model is only accurate on longer contigs.
It assesses whether they are long noncoding RNAs.
However, lncRNAs often have sORFs that encode peptides.
This rule runs RNAsamba on longer contigs (>300nt) that were not predicted by transdecoder to contain ORFs.
This rule runs RNAsamba on longer contigs (>300nt) that were not predicted by transdecoder to
contain ORFs.
"""
input:
# TER TODO: update path when model is downloaded
@@ -155,6 +169,124 @@

## TER TODO: predict sORFs from lncRNAs

################################################################################
## cleavage prediction
################################################################################


rule remove_stop_codon_asterisk_from_transdecoder_ORFs:
input:
ORFS_AMINO_ACIDS,
output:
faa=OUTPUT_DIR / "cleavage/preprocessing/noasterisk.faa",
shell:
"""
sed '/^[^>]/s/\*//g' {input} > {output}
"""


# Ribosomally synthesized and post-translationally modified peptide prediction


rule download_nlpprecursor_models:
output:
tar=INPUT_DIR / "models/nlpprecursor/nlpprecursor_models.tar.gz",
model=INPUT_DIR / "models/nlpprecursor/models/annotation/model.p",
params:
outdir=INPUT_DIR / "models/nlpprecursor",
shell:
"""
curl -JLo {output.tar} https://github.com/magarveylab/NLPPrecursor/releases/download/1.0/nlpprecursor_models.tar.gz
tar xf {output.tar} -C {params.outdir}
"""


rule nlpprecursor:
"""
The nlpprecursor tool is part of [DeepRiPP](https://doi.org/10.1073/pnas.1901493116),
which predicts ribosomally synthesized and post-translationally modified peptides (RiPPs),
a subclass of cleavage peptides.
Unlike many tools in the RiPP prediction space, "NLPPrecursor identifies RiPPs independent of
genomic context and neighboring biosynthetic genes."
Note that the paper suggests, "the precursor cleavage algorithm predicted N-terminal cleavage sites
with 90% accuracy, when considering cleavage points ±5 amino acids from the true prediction
site, a range within which all possible complete chemical structures can be elaborated in
silico by combinatorial structure prediction."
From the DeepRiPP paper supplement:
"Protein sequences of open reading frames are used as input. The output of the model consists
of a classification of each ORF as either a precursor peptide (further subclassified according
to RiPP family), or a non-precursor peptide. A total of 14 classes are identified (n_class)."
"""
input:
faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa,
model=rules.download_nlpprecursor_models.output.model,
output:
tsv=OUTPUT_DIR / "cleavage/nlpprecursor/nlpprecursor_ripp_predictions.tsv",
params:
modelsdir=INPUT_DIR / "models/nlpprecursor/models/",
conda:
"envs/nlpprecursor.yml"
shell:
"""
python scripts/run_nlpprecursor.py {params.modelsdir} {input.faa} {output}
"""


# General cleavage peptide prediction


rule clone_deeppeptide:
output:
src=touch("cloned_repositories/DeepPeptide/deeppeptide_cloned.txt"),
shell:
"""
# only clone the repo if it isn't already present
if [ ! -d cloned_repositories/DeepPeptide ]; then
git clone https://github.com/fteufel/DeepPeptide.git cloned_repositories/DeepPeptide
fi
cd cloned_repositories/DeepPeptide
git checkout 2657f5dca38e6417c65da5913c1974ed932746e3
"""


rule deeppeptide:
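    """
    DeepPeptide predicts peptides that are cleaved out of larger precursor proteins.
    The predictor is run from inside the cloned repository (note the cd in the shell
    directive), so the input FASTA and final output location are addressed through
    the absolute WORKING_DIRPATH defined at the top of this Snakefile.
    """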
input:
src=rules.clone_deeppeptide.output.src,
faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa,
output:
json=OUTPUT_DIR / "cleavage/deeppeptide/peptide_predictions.json",
conda:
"envs/deeppeptide.yml"
params:
outdir=OUTPUT_DIR / "cleavage/deeppeptide",
shell:
"""
cd cloned_repositories/DeepPeptide/predictor && python3 predict.py --fastafile {WORKING_DIRPATH}/{input.faa} --output_dir {params.outdir} --output_fmt json
mv {params.outdir}/* {WORKING_DIRPATH}/{params.outdir}/
"""


rule extract_deeppeptide_sequences:
"""
DeepPeptide outputs a json file of peptide predictions and locations,
but does not output the sequences themselves.
This step parses the JSON file and the protein FASTA from which peptides were predicted.
It outputs the propeptide (full ORF, uncleaved) and the predicted peptide sequence (cleaved)
in FASTA format.
"""
input:
faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa,
json=rules.deeppeptide.output.json,
output:
propeptide=OUTPUT_DIR / "cleavage/deeppeptide/propeptides.faa",
peptide=OUTPUT_DIR / "cleavage/deeppeptide/peptides.faa",
conda:
"envs/biopython.yml"
shell:
"""
python scripts/extract_deeppeptide_sequences.py {input.json} {input.faa} {output.propeptide} {output.peptide}
"""


################################################################################
## Target rule all
@@ -165,12 +297,24 @@ rule all:
default_target: True
input:
rules.rnasamba.output.tsv,
rules.nlpprecursor.output.tsv,
rules.extract_deeppeptide_sequences.output.peptide,


rule sORF:
"""
Defines a target rule for sORF prediction so a user can run only sORF prediction if they desire.
Defines a target rule for sORF prediction so a user can run only sORF prediction.
snakemake sORF --software-deployment-method conda -j 8
"""
input:
rules.rnasamba.output.tsv,


rule cleavage:
"""
Defines a target rule for cleavage prediction so a user can run only cleavage prediction.
snakemake cleavage --software-deployment-method conda -j 8
"""
input:
rules.nlpprecursor.output.tsv,
rules.extract_deeppeptide_sequences.output.peptide,
6 changes: 6 additions & 0 deletions envs/biopython.yml
@@ -0,0 +1,6 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- biopython=1.83
13 changes: 13 additions & 0 deletions envs/deeppeptide.yml
@@ -0,0 +1,13 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- matplotlib=3.5.1
- numpy=1.22.3
- tqdm=4.64.0
- pytorch=1.11.0
- tabulate=0.8.9
- pandas=1.4.2
- fair-esm=2.0.0
- seaborn=0.11.2
12 changes: 12 additions & 0 deletions envs/nlpprecursor.yml
@@ -0,0 +1,12 @@
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- python=3.7
- biopython=1.79
- pip=24.0
- pip:
- torch==1.0.0
- git+https://github.com/nishanthmerwin/fastai.git@deepripp_install
- git+https://github.com/magarveylab/nlpprecursor
99 changes: 99 additions & 0 deletions scripts/extract_deeppeptide_sequences.py
@@ -0,0 +1,99 @@
import argparse
import json

from Bio import SeqIO


def read_fasta(fasta_file):
"""Read a FASTA file using BioPython and return a dictionary of sequences."""
sequences = {}
for record in SeqIO.parse(fasta_file, "fasta"):
sequences[record.id] = str(record.seq)
return sequences


def extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file):
"""
Extract protein and peptide sequences based on the provided data dictionary and FASTA file,
then write them to separate output files.
The data object is created from a JSON file output by deeppeptide.
The protein sequences are extracted from the FASTA file using IDs in the data dictionary.
Peptide sequences are then extracted from these protein sequences based on start and end
positions specified for each peptide within the data dictionary.
The extracted protein and peptide sequences are written to separate output files.
Parameters:
- data (dict): A dictionary containing prediction data, where each key is a protein ID and
the associated value is another dictionary with details including peptides' start and end
positions.
- fasta_file (str): The path to a FASTA file containing protein sequences.
This should be the same file used to make the DeepPeptide predictions.
- proteins_output_file (str): The path to the output file where protein sequences will be saved.
Each sequence is written in FASTA format with its ID as the header.
- peptides_output_file (str): The path to the output file where peptide sequences will be saved.
Peptide sequences are also written in FASTA format,
with headers indicating their source transcript ID and their start and end positions within
the protein sequence.
Returns:
None
Raises:
- FileNotFoundError: If the fasta_file does not exist or cannot be read.
- KeyError: If the expected keys are not found in the data dictionary.
Example usage:
extract_peptide_sequences(data={'PREDICTIONS': {'>1': {'peptides': [{'start': 1, 'end': 9}]}}},
fasta_file='path/to/fasta_file.fasta',
proteins_output_file='path/to/proteins_output.fasta',
peptides_output_file='path/to/peptides_output.fasta')
"""
sequences = read_fasta(fasta_file)

with open(proteins_output_file, "w") as proteins_out, open(
peptides_output_file, "w"
) as peptides_out:
for protein_key, protein_info in data["PREDICTIONS"].items():
protein_id = protein_key.split()[0][1:] # Extract the ID part
peptides = protein_info.get("peptides")
if peptides: # Check if there are peptides
protein_sequence = sequences.get(protein_id)
if protein_sequence: # If the protein sequence is found in the FASTA
proteins_out.write(f">{protein_id}\n{protein_sequence}\n")
for peptide in peptides:
start, end = peptide["start"], peptide["end"]
peptide_sequence = protein_sequence[
start - 1 : end
] # Extract peptide sequence
peptides_out.write(
f">{protein_id}_peptide_{start}_{end}\n{peptide_sequence}\n"
)


def main(json_file, fasta_file, proteins_output_file, peptides_output_file):
with open(json_file) as f:
data = json.load(f)

extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Extract peptide sequences from DeepPeptide JSON.")

# Add the arguments
parser.add_argument("json_file", type=str, help="The JSON file output by DeepPeptide.")
parser.add_argument("fasta_file", type=str, help="The protein FASTA file input to DeepPeptide.")
parser.add_argument("proteins_output_file", type=str, help="The output file path for proteins.")
parser.add_argument("peptides_output_file", type=str, help="The output file path for peptides.")

# Execute the parse_args() method
args = parser.parse_args()

main(
args.json_file,
args.fasta_file,
args.proteins_output_file,
args.peptides_output_file,
)