From 0fe633e64f800ee7c24673fef2237eda67679e0e Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 21 Feb 2024 13:16:22 -0500 Subject: [PATCH 1/6] add env file --- envs/autopeptideml.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 envs/autopeptideml.yml diff --git a/envs/autopeptideml.yml b/envs/autopeptideml.yml new file mode 100644 index 0000000..53703cc --- /dev/null +++ b/envs/autopeptideml.yml @@ -0,0 +1,9 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.12.2 + - pip=24.0 + - pip: + - autopeptideml==0.2.9 From 72747aa35b45f98444b805ef40772a413dae4a0e Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 21 Feb 2024 21:15:29 +0000 Subject: [PATCH 2/6] update deps, add snakefile rule, and run script for autopeptideml --- Snakefile | 37 ++++++++++++++++- envs/autopeptideml.yml | 4 +- scripts/run_autopeptideml.py | 79 ++++++++++++++++++++++++++++++++++++ 3 files changed, 117 insertions(+), 3 deletions(-) create mode 100644 scripts/run_autopeptideml.py diff --git a/Snakefile b/Snakefile index d20807c..0a1fc71 100644 --- a/Snakefile +++ b/Snakefile @@ -423,7 +423,40 @@ rule characterize_peptides: python scripts/characterize_peptides.py {input.peptide} {output.tsv} """ - +AUTOPEPTIDEML_MODEL_NAMES = ['AB', 'ACE', 'ACP', 'AF', 'AMAP', 'AMP', 'AOX', 'APP', 'AV', + 'BBP', 'DPPIV', 'MRSA', 'Neuro', 'QS', 'TOX', 'TTCA'] +rule run_autopeptideml: + """ + AutoPeptideML predicts the bioactivity of a peptide based on user-supplied models. + The tool is a binary classifier, so each bioactivity has its own model. + As defined by AUTOPEPTIDEML_MODEL_NAMES, we use models trained in the autopeptideml preprint. 
+ The abbreviations are AB: Antibacterial; ACE: ACE inhibitor; ACP: Anticancer; AF: Antifungal; + AMAP: Antimalarial; AMP: Antimicrobial; AOX: Antioxidant; APP: Antiparasitic; AV: Antiviral; + BBP: Blood-brain barrier crossing; DPPIV: DPPIV inhibitor; MRSA: Anti-MRSA; Neuro: Neuropeptide; + QS: Quorum sensing; TOX: Toxic; TTCA: Tumor T-cell antigens. + + The script below only predicts the bioactive classification against these models. + However, autopeptideml was built to train new binary classifiers and peptipedia contains a lot + of labelled peptides, so one could develop new models if the ones included above are + insufficient. + """ + input: + peptide=rules.combine_peptide_predictions.output.peptide, + # TER TODO: the authors of autopeptideml sent me these models. + # They said they're working on uploading them. + # Once they're available, I need to add a rule to download them and update the input here + # to be the rules syntax + model=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json" + output: + tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv", + params: + modelsdir=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/" + conda: + "envs/autopeptideml.yml" + shell: + """ + python scripts/run_autopeptideml.py --input_fasta {input.peptide} --model_folder {params.modelsdir}/{wildcards.autopeptideml_model_name}_1/ensemble --model_name {wildcards.autopeptideml_model_name} --output_tsv {output.tsv} + """ ################################################################################ ## Target rule all ################################################################################ @@ -458,7 +491,7 @@ rule predict_cleavage: rules.diamond_blastp_peptide_predictions_against_peptipedia_database.output.tsv, rules.run_deepsig.output.tsv, rules.characterize_peptides.output.tsv, - + expand(rules.run_autopeptideml.output.tsv, autopeptideml_model_name = AUTOPEPTIDEML_MODEL_NAMES) rule 
predict_nrps: """ diff --git a/envs/autopeptideml.yml b/envs/autopeptideml.yml index 53703cc..e5711cb 100644 --- a/envs/autopeptideml.yml +++ b/envs/autopeptideml.yml @@ -3,7 +3,9 @@ channels: - bioconda - defaults dependencies: - - python=3.12.2 + - scikit-learn=1.3.0 + - biopython=1.83 + - python=3.11.8 - pip=24.0 - pip: - autopeptideml==0.2.9 diff --git a/scripts/run_autopeptideml.py b/scripts/run_autopeptideml.py new file mode 100644 index 0000000..d9ac07d --- /dev/null +++ b/scripts/run_autopeptideml.py @@ -0,0 +1,79 @@ +import os +from pathlib import Path +import argparse +from Bio import SeqIO +import pandas as pd +from autopeptideml.autopeptideml import AutoPeptideML +from autopeptideml.utils.embeddings import RepresentationEngine + +def read_fasta(input_fasta): + """ + Reads a FASTA file and returns a pandas DataFrame with IDs and sequences. + + Args: + input_fasta (str): Path to the FASTA file. + + Returns: + pd.DataFrame: DataFrame with columns 'ID' and 'sequence'. + """ + sequences = [] + for seq_record in SeqIO.parse(input_fasta, "fasta"): + sequences.append({"ID": seq_record.id, "sequence": str(seq_record.seq)}) + return pd.DataFrame(sequences) + +def predict_sequences(df, model_folder, model_name, threads = 6, seed = 42, batch_size = 64, + delete = True, tmp_dirname = "tmp"): + """ + Predicts peptide sequence bioactivity using AutoPeptideML and returns the predictions DataFrame. + + Args: + df (pd.DataFrame): DataFrame with sequences to predict. + model_folder (str): Path to the model folder. + model_name (str): Name of the model. Used to rename "prediction" column to output name. + threads (int): Number of threads used to run the prediction. + seed (int): Random seed. + batch_size (int): Number of peptide sequences to compute in each batch. + delete (bool): Whether to delete the unmodified CSV file output by autopeptideml. + tmp_dirname (str): Directory name supplied to AutoPeptideML's outputdir argument. 
+ + Returns: + pd.DataFrame: DataFrame with predictions. + """ + autopeptideml = AutoPeptideML(verbose=True, threads=threads, seed=seed) + representation_engine = RepresentationEngine(model="esm2-8m", batch_size=batch_size) + + predictions = autopeptideml.predict(df=df, re=representation_engine, ensemble_path=model_folder, outputdir=tmp_dirname) + predictions.rename(columns={'prediction': model_name}, inplace=True) + if delete: + # autopeptideml writes prediction dataframe with uninformative column names to a + # user-specified outputdir. This function specifies that folder as a temporary directory. + # When delete == True, this function removes the output file written there. + tmp_dirname = Path(tmp_dirname) + os.remove(tmp_dirname / "predictions.csv" ) + return predictions + +def save_predictions(predictions, output_path): + """ + Saves the predictions DataFrame to a TSV file. + + Args: + predictions (pd.DataFrame): DataFrame with predictions produced by predicut_sequences(). + output_path (str): Path to save the TSV file. 
+ """ + predictions.to_csv(output_path, sep='\t', index=False) + +def main(): + parser = argparse.ArgumentParser(description='Predict sequences using AutoPeptideML.') + parser.add_argument('--input_fasta', required=True, help='Path to the FASTA file.') + parser.add_argument('--model_folder', required=True, help='Path to the model folder.') + parser.add_argument('--model_name', required=True, help='Name of the model.') + parser.add_argument('--output_tsv', required=True, help='Path to the output TSV file.') + + args = parser.parse_args() + + df = read_fasta(args.input_fasta) + predictions = predict_sequences(df, args.model_folder, args.model_name) + save_predictions(predictions, args.output_tsv) + +if __name__ == '__main__': + main() From c90ca0acc5154057dc4804d92608250c561a91b0 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Wed, 21 Feb 2024 21:18:56 +0000 Subject: [PATCH 3/6] linting --- Snakefile | 35 +++++++++++++++++++++++++----- scripts/run_autopeptideml.py | 41 ++++++++++++++++++++++-------------- 2 files changed, 55 insertions(+), 21 deletions(-) diff --git a/Snakefile b/Snakefile index 0a1fc71..4571aec 100644 --- a/Snakefile +++ b/Snakefile @@ -423,8 +423,27 @@ rule characterize_peptides: python scripts/characterize_peptides.py {input.peptide} {output.tsv} """ -AUTOPEPTIDEML_MODEL_NAMES = ['AB', 'ACE', 'ACP', 'AF', 'AMAP', 'AMP', 'AOX', 'APP', 'AV', - 'BBP', 'DPPIV', 'MRSA', 'Neuro', 'QS', 'TOX', 'TTCA'] + +AUTOPEPTIDEML_MODEL_NAMES = [ + "AB", + "ACE", + "ACP", + "AF", + "AMAP", + "AMP", + "AOX", + "APP", + "AV", + "BBP", + "DPPIV", + "MRSA", + "Neuro", + "QS", + "TOX", + "TTCA", +] + + rule run_autopeptideml: """ AutoPeptideML predicts the bioactivity of a peptide based on user-supplied models. @@ -446,17 +465,20 @@ rule run_autopeptideml: # They said they're working on uploading them. 
# Once they're available, I need to add a rule to download them and update the input here # to be the rules syntax - model=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json" + model=INPUT_DIR + / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json", output: tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv", params: - modelsdir=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/" + modelsdir=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/", conda: "envs/autopeptideml.yml" shell: """ python scripts/run_autopeptideml.py --input_fasta {input.peptide} --model_folder {params.modelsdir}/{wildcards.autopeptideml_model_name}_1/ensemble --model_name {wildcards.autopeptideml_model_name} --output_tsv {output.tsv} """ + + ################################################################################ ## Target rule all ################################################################################ @@ -491,7 +513,10 @@ rule predict_cleavage: rules.diamond_blastp_peptide_predictions_against_peptipedia_database.output.tsv, rules.run_deepsig.output.tsv, rules.characterize_peptides.output.tsv, - expand(rules.run_autopeptideml.output.tsv, autopeptideml_model_name = AUTOPEPTIDEML_MODEL_NAMES) + expand( + rules.run_autopeptideml.output.tsv, autopeptideml_model_name=AUTOPEPTIDEML_MODEL_NAMES + ), + rule predict_nrps: """ diff --git a/scripts/run_autopeptideml.py b/scripts/run_autopeptideml.py index d9ac07d..5c228e7 100644 --- a/scripts/run_autopeptideml.py +++ b/scripts/run_autopeptideml.py @@ -1,10 +1,12 @@ +import argparse import os from pathlib import Path -import argparse -from Bio import SeqIO + import pandas as pd from autopeptideml.autopeptideml import AutoPeptideML from autopeptideml.utils.embeddings import RepresentationEngine +from Bio import SeqIO + def read_fasta(input_fasta): """ @@ -21,8 +23,10 @@ def read_fasta(input_fasta): sequences.append({"ID": 
seq_record.id, "sequence": str(seq_record.seq)}) return pd.DataFrame(sequences) -def predict_sequences(df, model_folder, model_name, threads = 6, seed = 42, batch_size = 64, - delete = True, tmp_dirname = "tmp"): + +def predict_sequences( + df, model_folder, model_name, threads=6, seed=42, batch_size=64, delete=True, tmp_dirname="tmp" +): """ Predicts peptide sequence bioactivity using AutoPeptideML and returns the predictions DataFrame. @@ -42,16 +46,19 @@ def predict_sequences(df, model_folder, model_name, threads = 6, seed = 42, batc autopeptideml = AutoPeptideML(verbose=True, threads=threads, seed=seed) representation_engine = RepresentationEngine(model="esm2-8m", batch_size=batch_size) - predictions = autopeptideml.predict(df=df, re=representation_engine, ensemble_path=model_folder, outputdir=tmp_dirname) - predictions.rename(columns={'prediction': model_name}, inplace=True) + predictions = autopeptideml.predict( + df=df, re=representation_engine, ensemble_path=model_folder, outputdir=tmp_dirname + ) + predictions.rename(columns={"prediction": model_name}, inplace=True) if delete: # autopeptideml writes prediction dataframe with uninformative column names to a # user-specified outputdir. This function specifies that folder as a temporary directory. # When delete == True, this function removes the output file written there. tmp_dirname = Path(tmp_dirname) - os.remove(tmp_dirname / "predictions.csv" ) + os.remove(tmp_dirname / "predictions.csv") return predictions + def save_predictions(predictions, output_path): """ Saves the predictions DataFrame to a TSV file. @@ -60,20 +67,22 @@ def save_predictions(predictions, output_path): predictions (pd.DataFrame): DataFrame with predictions produced by predicut_sequences(). output_path (str): Path to save the TSV file. 
""" - predictions.to_csv(output_path, sep='\t', index=False) + predictions.to_csv(output_path, sep="\t", index=False) + def main(): - parser = argparse.ArgumentParser(description='Predict sequences using AutoPeptideML.') - parser.add_argument('--input_fasta', required=True, help='Path to the FASTA file.') - parser.add_argument('--model_folder', required=True, help='Path to the model folder.') - parser.add_argument('--model_name', required=True, help='Name of the model.') - parser.add_argument('--output_tsv', required=True, help='Path to the output TSV file.') - + parser = argparse.ArgumentParser(description="Predict sequences using AutoPeptideML.") + parser.add_argument("--input_fasta", required=True, help="Path to the FASTA file.") + parser.add_argument("--model_folder", required=True, help="Path to the model folder.") + parser.add_argument("--model_name", required=True, help="Name of the model.") + parser.add_argument("--output_tsv", required=True, help="Path to the output TSV file.") + args = parser.parse_args() - + df = read_fasta(args.input_fasta) predictions = predict_sequences(df, args.model_folder, args.model_name) save_predictions(predictions, args.output_tsv) -if __name__ == '__main__': + +if __name__ == "__main__": main() From e818fe5f92e1119a3f925d30ef60deeb7b35b631 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Thu, 22 Feb 2024 20:32:32 +0000 Subject: [PATCH 4/6] update line spacing --- Snakefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 4571aec..a842b5b 100644 --- a/Snakefile +++ b/Snakefile @@ -475,7 +475,10 @@ rule run_autopeptideml: "envs/autopeptideml.yml" shell: """ - python scripts/run_autopeptideml.py --input_fasta {input.peptide} --model_folder {params.modelsdir}/{wildcards.autopeptideml_model_name}_1/ensemble --model_name {wildcards.autopeptideml_model_name} --output_tsv {output.tsv} + python scripts/run_autopeptideml.py --input_fasta {input.peptide} \ + --model_folder 
{params.modelsdir}/{wildcards.autopeptideml_model_name}_1/ensemble \ + --model_name {wildcards.autopeptideml_model_name} \ + --output_tsv {output.tsv} """ From 457f1ad538b594dc8c29f56722909da6466eecd9 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Fri, 23 Feb 2024 13:16:25 -0500 Subject: [PATCH 5/6] update to pathlib Co-authored-by: Keith Cheveralls Signed-off-by: Taylor Reiter --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index a842b5b..524367e 100644 --- a/Snakefile +++ b/Snakefile @@ -466,7 +466,7 @@ rule run_autopeptideml: # Once they're available, I need to add a rule to download them and update the input here # to be the rules syntax model=INPUT_DIR - / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json", + / "models" / "autopeptideml" / "HPO_NegSearch_HP" / f"{autopeptideml_model_name}_1" / "apml_config.json", output: tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv", params: From 74fc9e2089cf7f5ed0d1c8d7682cfe14098d2e5b Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Fri, 23 Feb 2024 18:57:39 +0000 Subject: [PATCH 6/6] revert to tmp model path --- Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Snakefile b/Snakefile index 524367e..a842b5b 100644 --- a/Snakefile +++ b/Snakefile @@ -466,7 +466,7 @@ rule run_autopeptideml: # Once they're available, I need to add a rule to download them and update the input here # to be the rules syntax model=INPUT_DIR - / "models" / "autopeptideml" / "HPO_NegSearch_HP" / f"{autopeptideml_model_name}_1" / "apml_config.json", + / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json", output: tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv", params: