Arcadia-Science · taylorreiter · Feb 23, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 21, 2024
diff --git a/Snakefile b/Snakefile
@@ -424,6 +424,64 @@ rule characterize_peptides:
         """
 
 
+AUTOPEPTIDEML_MODEL_NAMES = [
+    "AB",
+    "ACE",
+    "ACP",
+    "AF",
+    "AMAP",
+    "AMP",
+    "AOX",
+    "APP",
+    "AV",
+    "BBP",
+    "DPPIV",
+    "MRSA",
+    "Neuro",
+    "QS",
+    "TOX",
+    "TTCA",
+]
+
+
+rule run_autopeptideml:
+    """
+    AutoPeptideML predicts the bioactivity of a peptide based on user-supplied models.
+    The tool is a binary classifier, so each bioactivty has it's own model.
+    As defined by AUTOPEPTIDEML_MODEL_NAMES, we use models trained in the autopeptideml preprint.
+    The abbreviations are AB: Antibacterial; ACE: ACE inhibitor; ACP: Anticancer; AF: Antifungal;
+    AMAP: Antimalarial; AMP: Antimicrobial; AOX: Antioxidant; APP: Antiparasitic; AV: Antiviral; 
+    BBB: Brain-blood barrier crossing; DPPIV: DPPIV inhibitor; MRSA: Anti-MRSA; NP: Neuropeptide; 
+    QS: Quorum sensing; TOX: Toxic; TTCA: Tumor T-cell antigens.
+
+    The script below only predicts the bioactive classification against these models.
+    However, autopeptideml was built to train new binary classifiers and peptipedia contains a lot
+    of labelled peptides, so one could develop new models if the ones included above are 
+    insufficient.
+    """
+    input:
+        peptide=rules.combine_peptide_predictions.output.peptide,
+        # TER TODO: the authors of autopeptideml sent me these models.
+        # They said they're working on uploading them.
+        # Once they're available, I need to add a rule to download them and update the input here
+        # to be the rules syntax
+        model=INPUT_DIR
+        / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json",
+    output:
+        tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv",
+    params:
+        modelsdir=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/",
+    conda:
+        "envs/autopeptideml.yml"
+    shell:
+        """
+        python scripts/run_autopeptideml.py --input_fasta {input.peptide} \
+            --model_folder {params.modelsdir}/{wildcards.autopeptideml_model_name}_1/ensemble \
+            --model_name {wildcards.autopeptideml_model_name} \
+            --output_tsv {output.tsv}
+        """
+
+
 ################################################################################
 ## Target rule all
 ################################################################################
@@ -458,6 +516,9 @@ rule predict_cleavage:
         rules.diamond_blastp_peptide_predictions_against_peptipedia_database.output.tsv,
         rules.run_deepsig.output.tsv,
         rules.characterize_peptides.output.tsv,
+        expand(
+            rules.run_autopeptideml.output.tsv, autopeptideml_model_name=AUTOPEPTIDEML_MODEL_NAMES
+        ),
 
 
 rule predict_nrps:

diff --git a/envs/autopeptideml.yml b/envs/autopeptideml.yml
@@ -0,0 +1,11 @@
+channels:
+   - conda-forge
+   - bioconda
+   - defaults
+dependencies:
+   - scikit-learn=1.3.0
+   - biopython=1.83
+   - python=3.11.8
+   - pip=24.0
+   - pip:
+      - autopeptideml==0.2.9
diff --git a/scripts/run_autopeptideml.py b/scripts/run_autopeptideml.py
@@ -0,0 +1,88 @@
+import argparse
+import os
+from pathlib import Path
+
+import pandas as pd
+from autopeptideml.autopeptideml import AutoPeptideML
+from autopeptideml.utils.embeddings import RepresentationEngine
+from Bio import SeqIO
+
+
+def read_fasta(input_fasta):
+    """
+    Reads a FASTA file and returns a pandas DataFrame with IDs and sequences.
+
+    Args:
+    input_fasta (str): Path to the FASTA file.
+
+    Returns:
+    pd.DataFrame: DataFrame with columns 'ID' and 'sequence'.
+    """
+    sequences = []
+    for seq_record in SeqIO.parse(input_fasta, "fasta"):
+        sequences.append({"ID": seq_record.id, "sequence": str(seq_record.seq)})
+    return pd.DataFrame(sequences)
+
+
+def predict_sequences(
+    df, model_folder, model_name, threads=6, seed=42, batch_size=64, delete=True, tmp_dirname="tmp"
+):
+    """
+    Predicts peptide sequence bioactivity using AutoPeptideML and returns the predictions DataFrame.
+
+    Args:
+    df (pd.DataFrame): DataFrame with sequences to predict.
+    model_folder (str): Path to the model folder.
+    model_name (str): Name of the model. Used to rename "prediction" column to output name.
+    threads (int): Number of threads used to run the prediction.
+    seed (int): Random seed.
+    batch_size (int): Number of peptide sequences to compute in each batch.
+    delete (log): Whether to delete the unmodified CSV file output by autopeptideml.
+    tmp_dirname (str): Directory name supplied to AutoPeptideML's outputdir argument.
+
+    Returns:
+    pd.DataFrame: DataFrame with predictions.
+    """
+    autopeptideml = AutoPeptideML(verbose=True, threads=threads, seed=seed)
+    representation_engine = RepresentationEngine(model="esm2-8m", batch_size=batch_size)
+
+    predictions = autopeptideml.predict(
+        df=df, re=representation_engine, ensemble_path=model_folder, outputdir=tmp_dirname
+    )
+    predictions.rename(columns={"prediction": model_name}, inplace=True)
+    if delete:
+        # autopeptideml writes prediction dataframe with uninformative column names to a
+        # user-specified outputdir. This function specifies that folder as a temporary directory.
+        # When delete == True, this function removes the output file written there.
+        tmp_dirname = Path(tmp_dirname)
+        os.remove(tmp_dirname / "predictions.csv")
+    return predictions
+
+
+def save_predictions(predictions, output_path):
+    """
+    Saves the predictions DataFrame to a TSV file.
+
+    Args:
+    predictions (pd.DataFrame): DataFrame with predictions produced by predicut_sequences().
+    output_path (str): Path to save the TSV file.
+    """
+    predictions.to_csv(output_path, sep="\t", index=False)
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Predict sequences using AutoPeptideML.")
+    parser.add_argument("--input_fasta", required=True, help="Path to the FASTA file.")
+    parser.add_argument("--model_folder", required=True, help="Path to the model folder.")
+    parser.add_argument("--model_name", required=True, help="Name of the model.")
+    parser.add_argument("--output_tsv", required=True, help="Path to the output TSV file.")
+
+    args = parser.parse_args()
+
+    df = read_fasta(args.input_fasta)
+    predictions = predict_sequences(df, args.model_folder, args.model_name)
+    save_predictions(predictions, args.output_tsv)
+
+
+if __name__ == "__main__":
+    main()