From 0fe633e64f800ee7c24673fef2237eda67679e0e Mon Sep 17 00:00:00 2001
From: Taylor Reiter <taylorreiter@gmail.com>
Date: Wed, 21 Feb 2024 13:16:22 -0500
Subject: [PATCH 1/6] add env file

---
 envs/autopeptideml.yml | 9 +++++++++
 1 file changed, 9 insertions(+)
 create mode 100644 envs/autopeptideml.yml

diff --git a/envs/autopeptideml.yml b/envs/autopeptideml.yml
new file mode 100644
index 0000000..53703cc
--- /dev/null
+++ b/envs/autopeptideml.yml
@@ -0,0 +1,9 @@
+channels:
+   - conda-forge
+   - bioconda
+   - defaults
+dependencies:
+   - python=3.12.2
+   - pip=24.0
+   - pip:
+      - autopeptideml==0.2.9

From 72747aa35b45f98444b805ef40772a413dae4a0e Mon Sep 17 00:00:00 2001
From: Taylor Reiter <taylorreiter@gmail.com>
Date: Wed, 21 Feb 2024 21:15:29 +0000
Subject: [PATCH 2/6] update deps, add snakefile rule, and run script for
 autopeptideml

---
 Snakefile                    | 37 ++++++++++++++++-
 envs/autopeptideml.yml       |  4 +-
 scripts/run_autopeptideml.py | 79 ++++++++++++++++++++++++++++++++++++
 3 files changed, 117 insertions(+), 3 deletions(-)
 create mode 100644 scripts/run_autopeptideml.py

diff --git a/Snakefile b/Snakefile
index d20807c..0a1fc71 100644
--- a/Snakefile
+++ b/Snakefile
@@ -423,7 +423,40 @@ rule characterize_peptides:
         python scripts/characterize_peptides.py {input.peptide} {output.tsv}
         """
 
-
+AUTOPEPTIDEML_MODEL_NAMES = ['AB', 'ACE', 'ACP', 'AF',  'AMAP',  'AMP',  'AOX', 'APP',  'AV',  
+                             'BBP',  'DPPIV',  'MRSA',  'Neuro',  'QS', 'TOX',  'TTCA']
+rule run_autopeptideml:
+    """
+    AutoPeptideML predicts the bioactivity of a peptide based on user-supplied models.
+    The tool is a binary classifier, so each bioactivity has its own model.
+    As defined by AUTOPEPTIDEML_MODEL_NAMES, we use models trained in the autopeptideml preprint.
+    The abbreviations are AB: Antibacterial; ACE: ACE inhibitor; ACP: Anticancer; AF: Antifungal;
+    AMAP: Antimalarial; AMP: Antimicrobial; AOX: Antioxidant; APP: Antiparasitic; AV: Antiviral; 
+    BBP: Blood-brain barrier crossing; DPPIV: DPPIV inhibitor; MRSA: Anti-MRSA; Neuro: Neuropeptide;
+    QS: Quorum sensing; TOX: Toxic; TTCA: Tumor T-cell antigens.
+    
+    The script below only predicts the bioactive classification against these models.
+    However, autopeptideml was built to train new binary classifiers and peptipedia contains a lot
+    of labelled peptides, so one could develop new models if the ones included above are 
+    insufficient.
+    """
+    input:
+        peptide=rules.combine_peptide_predictions.output.peptide,
+        # TER TODO: the authors of autopeptideml sent me these models.
+        # They said they're working on uploading them.
+        # Once they're available, I need to add a rule to download them and update the input here
+        # to be the rules syntax
+        model=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json"
+    output:
+        tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv",
+    params:
+        modelsdir=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/" 
+    conda:
+        "envs/autopeptideml.yml"
+    shell:
+        """
+        python scripts/run_autopeptideml.py --input_fasta {input.peptide} --model_folder {params.modelsdir}/{wildcards.autopeptideml_model_name}_1/ensemble --model_name {wildcards.autopeptideml_model_name} --output_tsv {output.tsv}
+        """
 ################################################################################
 ## Target rule all
 ################################################################################
@@ -458,7 +491,7 @@ rule predict_cleavage:
         rules.diamond_blastp_peptide_predictions_against_peptipedia_database.output.tsv,
         rules.run_deepsig.output.tsv,
         rules.characterize_peptides.output.tsv,
-
+        expand(rules.run_autopeptideml.output.tsv, autopeptideml_model_name = AUTOPEPTIDEML_MODEL_NAMES)
 
 rule predict_nrps:
     """
diff --git a/envs/autopeptideml.yml b/envs/autopeptideml.yml
index 53703cc..e5711cb 100644
--- a/envs/autopeptideml.yml
+++ b/envs/autopeptideml.yml
@@ -3,7 +3,9 @@ channels:
    - bioconda
    - defaults
 dependencies:
-   - python=3.12.2
+   - scikit-learn=1.3.0
+   - biopython=1.83
+   - python=3.11.8
    - pip=24.0
    - pip:
       - autopeptideml==0.2.9
diff --git a/scripts/run_autopeptideml.py b/scripts/run_autopeptideml.py
new file mode 100644
index 0000000..d9ac07d
--- /dev/null
+++ b/scripts/run_autopeptideml.py
@@ -0,0 +1,79 @@
+import os
+from pathlib import Path
+import argparse
+from Bio import SeqIO
+import pandas as pd
+from autopeptideml.autopeptideml import AutoPeptideML
+from autopeptideml.utils.embeddings import RepresentationEngine
+
+def read_fasta(input_fasta):
+    """
+    Reads a FASTA file and returns a pandas DataFrame with IDs and sequences.
+
+    Args:
+    input_fasta (str): Path to the FASTA file.
+
+    Returns:
+    pd.DataFrame: DataFrame with columns 'ID' and 'sequence'.
+    """
+    sequences = []
+    for seq_record in SeqIO.parse(input_fasta, "fasta"):
+        sequences.append({"ID": seq_record.id, "sequence": str(seq_record.seq)})
+    return pd.DataFrame(sequences)
+
+def predict_sequences(df, model_folder, model_name, threads = 6, seed = 42, batch_size = 64, 
+                      delete = True, tmp_dirname = "tmp"):
+    """
+    Predicts peptide sequence bioactivity using AutoPeptideML and returns the predictions DataFrame.
+
+    Args:
+    df (pd.DataFrame): DataFrame with sequences to predict.
+    model_folder (str): Path to the model folder.
+    model_name (str): Name of the model. Used to rename "prediction" column to output name.
+    threads (int): Number of threads used to run the prediction.
+    seed (int): Random seed.
+    batch_size (int): Number of peptide sequences to compute in each batch.
+    delete (bool): Whether to delete the unmodified CSV file output by autopeptideml.
+    tmp_dirname (str): Directory name supplied to AutoPeptideML's outputdir argument.
+
+    Returns:
+    pd.DataFrame: DataFrame with predictions.
+    """
+    autopeptideml = AutoPeptideML(verbose=True, threads=threads, seed=seed)
+    representation_engine = RepresentationEngine(model="esm2-8m", batch_size=batch_size)
+
+    predictions = autopeptideml.predict(df=df, re=representation_engine, ensemble_path=model_folder, outputdir=tmp_dirname)
+    predictions.rename(columns={'prediction': model_name}, inplace=True)
+    if delete:
+        # autopeptideml writes prediction dataframe with uninformative column names to a
+        # user-specified outputdir. This function specifies that folder as a temporary directory.
+        # When delete == True, this function removes the output file written there.
+        tmp_dirname = Path(tmp_dirname)
+        os.remove(tmp_dirname / "predictions.csv" )
+    return predictions
+
+def save_predictions(predictions, output_path):
+    """
+    Saves the predictions DataFrame to a TSV file.
+
+    Args:
+    predictions (pd.DataFrame): DataFrame with predictions produced by predicut_sequences().
+    output_path (str): Path to save the TSV file.
+    """
+    predictions.to_csv(output_path, sep='\t', index=False)
+
+def main():
+    parser = argparse.ArgumentParser(description='Predict sequences using AutoPeptideML.')
+    parser.add_argument('--input_fasta', required=True, help='Path to the FASTA file.')
+    parser.add_argument('--model_folder', required=True, help='Path to the model folder.')
+    parser.add_argument('--model_name', required=True, help='Name of the model.')
+    parser.add_argument('--output_tsv', required=True, help='Path to the output TSV file.')
+    
+    args = parser.parse_args()
+    
+    df = read_fasta(args.input_fasta)
+    predictions = predict_sequences(df, args.model_folder, args.model_name)
+    save_predictions(predictions, args.output_tsv)
+
+if __name__ == '__main__':
+    main()

From c90ca0acc5154057dc4804d92608250c561a91b0 Mon Sep 17 00:00:00 2001
From: Taylor Reiter <taylorreiter@gmail.com>
Date: Wed, 21 Feb 2024 21:18:56 +0000
Subject: [PATCH 3/6] linting

---
 Snakefile                    | 35 +++++++++++++++++++++++++-----
 scripts/run_autopeptideml.py | 41 ++++++++++++++++++++++--------------
 2 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/Snakefile b/Snakefile
index 0a1fc71..4571aec 100644
--- a/Snakefile
+++ b/Snakefile
@@ -423,8 +423,27 @@ rule characterize_peptides:
         python scripts/characterize_peptides.py {input.peptide} {output.tsv}
         """
 
-AUTOPEPTIDEML_MODEL_NAMES = ['AB', 'ACE', 'ACP', 'AF',  'AMAP',  'AMP',  'AOX', 'APP',  'AV',  
-                             'BBP',  'DPPIV',  'MRSA',  'Neuro',  'QS', 'TOX',  'TTCA']
+
+AUTOPEPTIDEML_MODEL_NAMES = [
+    "AB",
+    "ACE",
+    "ACP",
+    "AF",
+    "AMAP",
+    "AMP",
+    "AOX",
+    "APP",
+    "AV",
+    "BBP",
+    "DPPIV",
+    "MRSA",
+    "Neuro",
+    "QS",
+    "TOX",
+    "TTCA",
+]
+
+
 rule run_autopeptideml:
     """
     AutoPeptideML predicts the bioactivity of a peptide based on user-supplied models.
@@ -446,17 +465,20 @@ rule run_autopeptideml:
         # They said they're working on uploading them.
         # Once they're available, I need to add a rule to download them and update the input here
         # to be the rules syntax
-        model=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json"
+        model=INPUT_DIR
+        / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json",
     output:
         tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv",
     params:
-        modelsdir=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/" 
+        modelsdir=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/",
     conda:
         "envs/autopeptideml.yml"
     shell:
         """
         python scripts/run_autopeptideml.py --input_fasta {input.peptide} --model_folder {params.modelsdir}/{wildcards.autopeptideml_model_name}_1/ensemble --model_name {wildcards.autopeptideml_model_name} --output_tsv {output.tsv}
         """
+
+
 ################################################################################
 ## Target rule all
 ################################################################################
@@ -491,7 +513,10 @@ rule predict_cleavage:
         rules.diamond_blastp_peptide_predictions_against_peptipedia_database.output.tsv,
         rules.run_deepsig.output.tsv,
         rules.characterize_peptides.output.tsv,
-        expand(rules.run_autopeptideml.output.tsv, autopeptideml_model_name = AUTOPEPTIDEML_MODEL_NAMES)
+        expand(
+            rules.run_autopeptideml.output.tsv, autopeptideml_model_name=AUTOPEPTIDEML_MODEL_NAMES
+        ),
+
 
 rule predict_nrps:
     """
diff --git a/scripts/run_autopeptideml.py b/scripts/run_autopeptideml.py
index d9ac07d..5c228e7 100644
--- a/scripts/run_autopeptideml.py
+++ b/scripts/run_autopeptideml.py
@@ -1,10 +1,12 @@
+import argparse
 import os
 from pathlib import Path
-import argparse
-from Bio import SeqIO
+
 import pandas as pd
 from autopeptideml.autopeptideml import AutoPeptideML
 from autopeptideml.utils.embeddings import RepresentationEngine
+from Bio import SeqIO
+
 
 def read_fasta(input_fasta):
     """
@@ -21,8 +23,10 @@ def read_fasta(input_fasta):
         sequences.append({"ID": seq_record.id, "sequence": str(seq_record.seq)})
     return pd.DataFrame(sequences)
 
-def predict_sequences(df, model_folder, model_name, threads = 6, seed = 42, batch_size = 64, 
-                      delete = True, tmp_dirname = "tmp"):
+
+def predict_sequences(
+    df, model_folder, model_name, threads=6, seed=42, batch_size=64, delete=True, tmp_dirname="tmp"
+):
     """
     Predicts peptide sequence bioactivity using AutoPeptideML and returns the predictions DataFrame.
 
@@ -42,16 +46,19 @@ def predict_sequences(df, model_folder, model_name, threads = 6, seed = 42, batc
     autopeptideml = AutoPeptideML(verbose=True, threads=threads, seed=seed)
     representation_engine = RepresentationEngine(model="esm2-8m", batch_size=batch_size)
 
-    predictions = autopeptideml.predict(df=df, re=representation_engine, ensemble_path=model_folder, outputdir=tmp_dirname)
-    predictions.rename(columns={'prediction': model_name}, inplace=True)
+    predictions = autopeptideml.predict(
+        df=df, re=representation_engine, ensemble_path=model_folder, outputdir=tmp_dirname
+    )
+    predictions.rename(columns={"prediction": model_name}, inplace=True)
     if delete:
         # autopeptideml writes prediction dataframe with uninformative column names to a
         # user-specified outputdir. This function specifies that folder as a temporary directory.
         # When delete == True, this function removes the output file written there.
         tmp_dirname = Path(tmp_dirname)
-        os.remove(tmp_dirname / "predictions.csv" )
+        os.remove(tmp_dirname / "predictions.csv")
     return predictions
 
+
 def save_predictions(predictions, output_path):
     """
     Saves the predictions DataFrame to a TSV file.
@@ -60,20 +67,22 @@ def save_predictions(predictions, output_path):
     predictions (pd.DataFrame): DataFrame with predictions produced by predicut_sequences().
     output_path (str): Path to save the TSV file.
     """
-    predictions.to_csv(output_path, sep='\t', index=False)
+    predictions.to_csv(output_path, sep="\t", index=False)
+
 
 def main():
-    parser = argparse.ArgumentParser(description='Predict sequences using AutoPeptideML.')
-    parser.add_argument('--input_fasta', required=True, help='Path to the FASTA file.')
-    parser.add_argument('--model_folder', required=True, help='Path to the model folder.')
-    parser.add_argument('--model_name', required=True, help='Name of the model.')
-    parser.add_argument('--output_tsv', required=True, help='Path to the output TSV file.')
-    
+    parser = argparse.ArgumentParser(description="Predict sequences using AutoPeptideML.")
+    parser.add_argument("--input_fasta", required=True, help="Path to the FASTA file.")
+    parser.add_argument("--model_folder", required=True, help="Path to the model folder.")
+    parser.add_argument("--model_name", required=True, help="Name of the model.")
+    parser.add_argument("--output_tsv", required=True, help="Path to the output TSV file.")
+
     args = parser.parse_args()
-    
+
     df = read_fasta(args.input_fasta)
     predictions = predict_sequences(df, args.model_folder, args.model_name)
     save_predictions(predictions, args.output_tsv)
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()

From e818fe5f92e1119a3f925d30ef60deeb7b35b631 Mon Sep 17 00:00:00 2001
From: Taylor Reiter <taylorreiter@gmail.com>
Date: Thu, 22 Feb 2024 20:32:32 +0000
Subject: [PATCH 4/6] update line spacing

---
 Snakefile | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 4571aec..a842b5b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -475,7 +475,10 @@ rule run_autopeptideml:
         "envs/autopeptideml.yml"
     shell:
         """
-        python scripts/run_autopeptideml.py --input_fasta {input.peptide} --model_folder {params.modelsdir}/{wildcards.autopeptideml_model_name}_1/ensemble --model_name {wildcards.autopeptideml_model_name} --output_tsv {output.tsv}
+        python scripts/run_autopeptideml.py --input_fasta {input.peptide} \
+            --model_folder {params.modelsdir}/{wildcards.autopeptideml_model_name}_1/ensemble \
+            --model_name {wildcards.autopeptideml_model_name} \
+            --output_tsv {output.tsv}
         """
 
 

From 457f1ad538b594dc8c29f56722909da6466eecd9 Mon Sep 17 00:00:00 2001
From: Taylor Reiter <taylorreiter@gmail.com>
Date: Fri, 23 Feb 2024 13:16:25 -0500
Subject: [PATCH 5/6] update to pathlib

Co-authored-by: Keith Cheveralls <keith.chev@gmail.com>
Signed-off-by: Taylor Reiter <taylorreiter@gmail.com>
---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index a842b5b..524367e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -466,7 +466,7 @@ rule run_autopeptideml:
         # Once they're available, I need to add a rule to download them and update the input here
         # to be the rules syntax
         model=INPUT_DIR
-        / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json",
+        / "models" / "autopeptideml" / "HPO_NegSearch_HP" / f"{autopeptideml_model_name}_1" / "apml_config.json",
     output:
         tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv",
     params:

From 74fc9e2089cf7f5ed0d1c8d7682cfe14098d2e5b Mon Sep 17 00:00:00 2001
From: Taylor Reiter <taylorreiter@gmail.com>
Date: Fri, 23 Feb 2024 18:57:39 +0000
Subject: [PATCH 6/6] revert to tmp model path

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 524367e..a842b5b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -466,7 +466,7 @@ rule run_autopeptideml:
         # Once they're available, I need to add a rule to download them and update the input here
         # to be the rules syntax
         model=INPUT_DIR
-        / "models" / "autopeptideml" / "HPO_NegSearch_HP" / f"{autopeptideml_model_name}_1" / "apml_config.json",
+        / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json",
     output:
         tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv",
     params: