diff --git a/.gitignore b/.gitignore
index eeb440c..d98f209 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,7 @@ envs/src/**
 /demo/output/
 /demo/input/
 /logs
+cloned_repositories/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/Snakefile b/Snakefile
index dbe1eff..f113050 100644
--- a/Snakefile
+++ b/Snakefile
@@ -5,9 +5,14 @@ from pathlib import Path
 ## Configuration
 ################################################################################
 
+# Retrieves the absolute path of the directory snakemake is launched in.
+# Used by DeepPeptide to simplify output file paths.
+WORKING_DIRPATH = Path(os.getcwd())
+
 # Default pipeline configuration parameters are in the config file.
-# If you create a new yml file and use the --configfile flag, options in that new file overwrite the defaults.
+# If you create a new yml file and use the --configfile flag,
+# options in that new file overwrite the defaults.
 configfile: "./config.yml"
 
 
@@ -41,7 +46,9 @@ rule filter_nt_contigs_to_short:
     """
 
 
-# TER TODO: Add a rule for sORF prediction, either once smallesm is developed, when there is an accurate sORF rnasamba model, or using another tool from Singh & Roy.
+# TER TODO: Add a rule for sORF prediction, either once smallesm is developed,
+# once there is an accurate sORF rnasamba model,
+# or using another tool from Singh & Roy.
 
 
 rule filter_nt_contigs_to_long:
@@ -59,8 +66,10 @@ rule filter_nt_contigs_to_long:
 
 rule get_coding_contig_names:
     """
-    Extract amino acid contig names and remove everything after the first period, which are isoform labels.
-    This file will be used to select all contigs that DO NOT encode ORFs, according to transdecoder.
+    Extract amino acid contig names and remove everything after the first period,
+    which are isoform labels.
+    This file will be used to select all contigs that DO NOT encode ORFs,
+    according to transdecoder.
     """
     input:
         ORFS_AMINO_ACIDS,
@@ -77,8 +86,9 @@ rule get_coding_contig_names:
 rule filter_long_contigs_to_no_predicted_ORF:
     """
     Many of the contigs in the full transcriptome have predicted ORFs.
-    The names of these contigs are recorded in the transdecoder input files (*pep and *cds, orfs_*).
-    By definition, these contigs are not noncoding RNAs, so they don't need to be considered for classification as long noncoding RNAs (lncRNA).
+    The names of these contigs are recorded in the transdecoder input files (*pep & *cds, orfs_*).
+    By definition, these contigs are not noncoding RNAs,
+    so they don't need to be considered for classification as long noncoding RNAs (lncRNA).
     This step removes the contigs that contain ORFs.
     """
     input:
@@ -97,7 +107,8 @@ rule filter_long_contigs_to_no_predicted_ORF:
 rule download_rnasamba_model:
     """
     Place holder rule.
-    For now, the workflow uses the model output by build_rnasamba_euk_model.snakefile, which is available locally from running it.
+    For now, the workflow uses the model output by build_rnasamba_euk_model.snakefile,
+    which is available locally from running it.
     """
     output:
         model=OUTPUT_DIR / "models/rnasamba/build/3_model/eu_rnasamba.hdf5",
@@ -109,9 +120,11 @@ rule pip_install_rnasamba_no_deps:
     """
-    To take advantage of nvidia GPU on AWS instance "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122" (ami-07eb000b3340966b0),
+    To take advantage of the NVIDIA GPU on the AWS instance
+    ("Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122", ami-07eb000b3340966b0),
     we need to install specific versions of tensorflow and other dependencies.
-    This is accomplished in part in the envs/rnasamba.yml file, however rnasamba itself is not installed there because we need to use the command:
+    This is accomplished in part in the envs/rnasamba.yml file;
+    however, rnasamba itself is not installed there because we need to use the command:
     pip install --no-deps rnasamba
     and there is no way to specify the "--no-deps" flag in a yaml file.
     This rule installs rnasamba into the conda-generated environment.
@@ -132,10 +145,11 @@ rule pip_install_rnasamba_no_deps:
 rule rnasamba:
     """
-    The eu_rnasamba.hdf5 model is only accurate on longer contigs.
+    The eu_rnasamba.hdf5 eukaryote model is only accurate on longer contigs.
     It assesses whether they are long noncoding RNAs.
     However, lncRNAs often have sORFs that encode peptides.
-    This rule runs RNAsamba on longer contigs (>300nt) that were not predicted by transdecoder to contain ORFs.
+    This rule runs RNAsamba on longer contigs (>300nt) that were not predicted by transdecoder to
+    contain ORFs.
     """
     input:
         # TER TODO: update path when model is downloaded
@@ -155,6 +169,124 @@ rule rnasamba:
 ## TER TODO: predict sORFs from lncRNAs
 
 
+################################################################################
+## Cleavage prediction
+################################################################################
+
+
+rule remove_stop_codon_asterisk_from_transdecoder_ORFs:
+    input:
+        ORFS_AMINO_ACIDS,
+    output:
+        faa=OUTPUT_DIR / "cleavage/preprocessing/noasterisk.faa",
+    shell:
+        """
+        sed '/^[^>]/s/\*//g' {input} > {output}
+        """
+
+
+# Ribosomally synthesized and post-translationally modified peptide prediction
+
+
+rule download_nlpprecursor_models:
+    output:
+        tar=INPUT_DIR / "models/nlpprecursor/nlpprecursor_models.tar.gz",
+        model=INPUT_DIR / "models/nlpprecursor/models/annotation/model.p",
+    params:
+        outdir=INPUT_DIR / "models/nlpprecursor",
+    shell:
+        """
+        curl -JLo {output.tar} https://github.com/magarveylab/NLPPrecursor/releases/download/1.0/nlpprecursor_models.tar.gz
+        tar xf {output.tar} -C {params.outdir}
+        """
+
+
+rule nlpprecursor:
+    """
+    The nlpprecursor tool is part of [DeepRiPP](https://doi.org/10.1073/pnas.1901493116),
+    which predicts ribosomally synthesized and post-translationally modified peptides,
+    a subclass of cleavage peptides.
+    Unlike many tools in the RiPP prediction space, "NLPPrecursor identifies RiPPs independent of
+    genomic context and neighboring biosynthetic genes."
+    Note the paper suggests, "the precursor cleavage algorithm predicted N-terminal cleavage sites
+    with 90% accuracy, when considering cleavage points ±5 amino acids from the true prediction
+    site, a range within which all possible complete chemical structures can be elaborated in
+    silico by combinatorial structure prediction."
+    From the DeepRiPP paper supplement:
+    "Protein sequences of open reading frames are used as input. The output of the model consists
+    of a classification of each ORF as either a precursor peptide (further subclassified according
+    to RiPP family), or a non-precursor peptide. A total of 14 classes are identified (n_class)."
+ """ + input: + faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa, + model=rules.download_nlpprecursor_models.output.model, + output: + tsv=OUTPUT_DIR / "cleavage/nlpprecursor/nlpprecursor_ripp_predictions.tsv", + params: + modelsdir=INPUT_DIR / "models/nlpprecursor/models/", + conda: + "envs/nlpprecursor.yml" + shell: + """ + python scripts/run_nlpprecursor.py {params.modelsdir} {input.faa} {output} + """ + + +# General Cleavage peptide prediction + + +rule clone_deeppeptide: + output: + src=touch("cloned_repositories/DeepPeptide/deeppeptide_cloned.txt"), + shell: + """ + # only clone the repo if it isn't already present + if [ ! -d cloned_repositories/DeepPeptide ]; then + git clone https://github.com/fteufel/DeepPeptide.git cloned_repositories/DeepPeptide + fi + cd cloned_repositories/DeepPeptide + git checkout 2657f5dca38e6417c65da5913c1974ed932746e3 + """ + + +rule deeppeptide: + input: + src=rules.clone_deeppeptide.output.src, + faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa, + output: + json=OUTPUT_DIR / "cleavage/deeppeptide/peptide_predictions.json", + conda: + "envs/deeppeptide.yml" + params: + outdir=OUTPUT_DIR / "cleavage/deeppeptide", + shell: + """ + cd cloned_repositories/DeepPeptide/predictor && python3 predict.py --fastafile {WORKING_DIRPATH}/{input.faa} --output_dir {params.outdir} --output_fmt json + mv {params.outdir}/* {WORKING_DIRPATH}/{params.outdir}/ + """ + + +rule extract_deeppeptide_sequences: + """ + DeepPeptide outputs a json file of peptide predictions and locations, + but does not output the sequences themselves. + This step parses the JSON file and the protein FASTA from which peptides were predicted. + It outputs the propeptide (full ORF, uncleaved) and the predicted peptide sequence (cleaved) + in FASTA format. + """ + input: + faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa, + json=rules.deeppeptide.output.json, + output: + propeptide=OUTPUT_DIR / "cleavage/deeppeptide/propeptides.faa", + peptide=OUTPUT_DIR / "cleavage/deeppeptide/peptides.faa", + conda: + "envs/biopython.yml" + shell: + """ + python scripts/extract_deeppeptide_sequences.py {input.json} {input.faa} {output.propeptide} {output.peptide} + """ + ################################################################################ ## Target rule all @@ -165,12 +297,24 @@ rule all: default_target: True input: rules.rnasamba.output.tsv, + rules.nlpprecursor.output.tsv, + rules.extract_deeppeptide_sequences.output.peptide, rule sORF: """ - Defines a target rule for sORF prediction so a user can run only sORF prediction if they desire. + Defines a target rule for sORF prediction so a user can run only sORF prediction. snakemake sORF --software-deployment-method conda -j 8 """ input: rules.rnasamba.output.tsv, + + +rule cleavage: + """ + Defines a target rule for cleavage prediction so a user can run only cleavage prediction. 
+    snakemake cleavage --software-deployment-method conda -j 8
+    """
+    input:
+        rules.nlpprecursor.output.tsv,
+        rules.extract_deeppeptide_sequences.output.peptide,
diff --git a/envs/biopython.yml b/envs/biopython.yml
new file mode 100644
index 0000000..493f4fb
--- /dev/null
+++ b/envs/biopython.yml
@@ -0,0 +1,6 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - biopython=1.83
diff --git a/envs/deeppeptide.yml b/envs/deeppeptide.yml
new file mode 100644
index 0000000..c1cf0f0
--- /dev/null
+++ b/envs/deeppeptide.yml
@@ -0,0 +1,13 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - matplotlib=3.5.1
+  - numpy=1.22.3
+  - tqdm=4.64.0
+  - pytorch=1.11.0
+  - tabulate=0.8.9
+  - pandas=1.4.2
+  - fair-esm=2.0.0
+  - seaborn=0.11.2
diff --git a/envs/nlpprecursor.yml b/envs/nlpprecursor.yml
new file mode 100644
index 0000000..924ed49
--- /dev/null
+++ b/envs/nlpprecursor.yml
@@ -0,0 +1,12 @@
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - python=3.7
+  - biopython=1.79
+  - pip=24.0
+  - pip:
+      - torch==1.0.0
+      - git+https://github.com/nishanthmerwin/fastai.git@deepripp_install
+      - git+https://github.com/magarveylab/nlpprecursor
diff --git a/scripts/extract_deeppeptide_sequences.py b/scripts/extract_deeppeptide_sequences.py
new file mode 100644
index 0000000..1b63f6f
--- /dev/null
+++ b/scripts/extract_deeppeptide_sequences.py
@@ -0,0 +1,99 @@
+import argparse
+import json
+
+from Bio import SeqIO
+
+
+def read_fasta(fasta_file):
+    """Read a FASTA file using BioPython and return a dictionary of sequences."""
+    sequences = {}
+    for record in SeqIO.parse(fasta_file, "fasta"):
+        sequences[record.id] = str(record.seq)
+    return sequences
+
+
+def extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file):
+    """
+    Extract protein and peptide sequences based on the data dictionary and FASTA file,
+    then write them to separate files.
+
+    Extracts protein and peptide sequences based on the provided data dictionary and a FASTA file.
+    The data object is created from a JSON file output by DeepPeptide.
+    The protein sequences are extracted from the FASTA file using IDs in the data dictionary.
+    Peptide sequences are then extracted from these protein sequences based on start and end
+    positions specified for each peptide within the data dictionary.
+    The extracted protein and peptide sequences are written to separate output files.
+
+    Parameters:
+    - data (dict): A dictionary containing prediction data, where each key is a protein ID and
+      the associated value is another dictionary with details including peptides' start and end
+      positions.
+    - fasta_file (str): The path to a FASTA file containing protein sequences.
+      This should be the same file used to make the DeepPeptide predictions.
+    - proteins_output_file (str): The path to the output file where protein sequences will be saved.
+      Each sequence is written in FASTA format with its ID as the header.
+    - peptides_output_file (str): The path to the output file where peptide sequences will be saved.
+      Peptide sequences are also written in FASTA format,
+      with headers indicating their source transcript ID and their start and end positions within
+      the protein sequence.
+
+    Returns:
+    None
+
+    Raises:
+    - FileNotFoundError: If the fasta_file does not exist or cannot be read.
+    - KeyError: If the expected keys are not found in the data dictionary.
+
+    Example usage:
+        extract_peptide_sequences(data={'PREDICTIONS': {'>1': {'peptides': [{'start': 1, 'end': 9}]}}},
+                                  fasta_file='path/to/fasta_file.fasta',
+                                  proteins_output_file='path/to/proteins_output.fasta',
+                                  peptides_output_file='path/to/peptides_output.fasta')
+    """
+    sequences = read_fasta(fasta_file)
+
+    with open(proteins_output_file, "w") as proteins_out, open(
+        peptides_output_file, "w"
+    ) as peptides_out:
+        for protein_key, protein_info in data["PREDICTIONS"].items():
+            protein_id = protein_key.split()[0][1:]  # Drop the leading ">" and any description
+            peptides = protein_info.get("peptides")
+            if peptides:  # Check if there are peptides
+                protein_sequence = sequences.get(protein_id)
+                if protein_sequence:  # If the protein sequence is found in the FASTA
+                    proteins_out.write(f">{protein_id}\n{protein_sequence}\n")
+                    for peptide in peptides:
+                        start, end = peptide["start"], peptide["end"]
+                        peptide_sequence = protein_sequence[
+                            start - 1 : end
+                        ]  # Extract peptide sequence
+                        peptides_out.write(
+                            f">{protein_id}_peptide_{start}_{end}\n{peptide_sequence}\n"
+                        )
+
+
+def main(json_file, fasta_file, proteins_output_file, peptides_output_file):
+    with open(json_file) as f:
+        data = json.load(f)
+
+    extract_peptide_sequences(data, fasta_file, proteins_output_file, peptides_output_file)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Extract peptide sequences from DeepPeptide JSON.")
+
+    # Add the arguments
+    parser.add_argument("json_file", type=str, help="The JSON file output by DeepPeptide.")
+    parser.add_argument("fasta_file", type=str, help="The protein FASTA file input to DeepPeptide.")
+    parser.add_argument("proteins_output_file", type=str, help="The output file path for proteins.")
+    parser.add_argument("peptides_output_file", type=str, help="The output file path for peptides.")
+
+    # Execute the parse_args() method
+    args = parser.parse_args()
+
+    main(
+        args.json_file,
+        args.fasta_file,
+        args.proteins_output_file,
+        args.peptides_output_file,
+    )
diff --git a/scripts/run_nlpprecursor.py b/scripts/run_nlpprecursor.py
new file mode 100644
index 0000000..60239c4
--- /dev/null
+++ b/scripts/run_nlpprecursor.py
@@ -0,0 +1,122 @@
+import csv
+import sys
+import time
+from pathlib import Path
+
+import nlpprecursor
+from Bio import SeqIO
+from nlpprecursor.annotation.data import DatasetGenerator as ADG
+from nlpprecursor.classification.data import DatasetGenerator as CDG
+
+# This allows for backwards compatibility of the pickled models.
+sys.modules["protai"] = nlpprecursor
+
+
+def robust_predict(predict_function, *args, max_attempts=2, sleep_time=1):
+    """
+    Attempts to call the predict function up to a maximum number of attempts.
+    TODO: debug this problem if this pipeline becomes widely used.
+    On the first attempt to call this function on a GPU,
+    the function produces a RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED.
+    The second execution succeeds.
+    Execution fails on the @classmethod predict(), on the line `predictions = model(tokens)[0]`.
+    Args:
+        predict_function: The prediction function to call (CDG.predict since it's the first called).
+        *args: Arguments to pass to the prediction function.
+        max_attempts (int): Maximum number of attempts to make.
+        sleep_time (int or float): Time to wait between attempts, in seconds.
+    Returns:
+        The result of the prediction function if successful.
+    Raises:
+        Exception: Re-raises the last exception if all attempts fail.
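+
+    Example (hypothetical call, mirroring main() below):
+        class_predictions = robust_predict(CDG.predict, class_model_path, class_vocab_path, sequences)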
+ """ + for attempt in range(max_attempts): + try: + # Try to make the prediction + return predict_function(*args) + except RuntimeError as e: + print(f"Attempt {attempt + 1} failed with error: {e}") + if attempt + 1 < max_attempts: + print(f"Retrying in {sleep_time} seconds...") + time.sleep(sleep_time) # Wait a bit before retrying + else: + print("All attempts failed. Raising the last exception.") + raise # Re-raise the last exception if out of attempts + + +def main(models_dir, multifasta_file, output_tsv): + models_dir = Path(models_dir) + + class_model_dir = models_dir / "classification" + class_model_path = class_model_dir / "model.p" + class_vocab_path = class_model_dir / "vocab.pkl" + + annot_model_dir = models_dir / "annotation" + annot_model_path = annot_model_dir / "model.p" + annot_vocab_path = annot_model_dir / "vocab.pkl" + + sequences = [] + + # Read sequences from the multifasta file + for record in SeqIO.parse(multifasta_file, "fasta"): + sequences.append({"sequence": str(record.seq), "name": record.id}) + + # Predict class and cleavage for each sequence + try: + class_predictions = robust_predict( + CDG.predict, class_model_path, class_vocab_path, sequences + ) + except Exception as final_error: + print(f"Failed to predict class after several attempts: {final_error}") + cleavage_predictions = ADG.predict(annot_model_path, annot_vocab_path, sequences) + + # The output of nlpprecursor predictions are in JSON format. + # The code below parses the JSON into a TSV format. + + with open(output_tsv, "w", newline="\n") as file: + writer = csv.writer(file, delimiter="\t") + + writer.writerow( + [ + "name", + "class", + "class_score", + "cleavage_sequence", + "cleavage_start", + "cleavage_stop", + "cleavage_score", + ] + ) + + for ind, sequence in enumerate(sequences): + name = sequence["name"] + class_pred = class_predictions[ind]["class_predictions"][0] + cleavage_pred = cleavage_predictions[ind]["cleavage_prediction"] + + writer.writerow( + [ + name, + class_pred["class"], + class_pred["score"], + cleavage_pred["sequence"], + cleavage_pred["start"], + cleavage_pred["stop"], + cleavage_pred["score"], + ] + ) + + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("Usage: python run_nlpprecursor.py ") + sys.exit(1) + + models_dir = sys.argv[1] + multifasta_file = sys.argv[2] + output_tsv = sys.argv[3] + + main(models_dir, multifasta_file, output_tsv)