diff --git a/Snakefile b/Snakefile index 3baa5f2..0dabd94 100644 --- a/Snakefile +++ b/Snakefile @@ -19,48 +19,40 @@ configfile: "./config.yml" INPUT_DIR = Path(config["input_dir"]) OUTPUT_DIR = Path(config["output_dir"]) -SHORT_CONTIGS = Path(config["short_contigs"]) ORFS_AMINO_ACIDS = Path(config["orfs_amino_acids"]) ORFS_NUCLEOTIDES = Path(config["orfs_nucleotides"]) -ALL_CONTIGS = Path(config["all_contigs"]) - +CONTIGS_SHORTER = Path(config["contigs_shorter_than_r2t_minimum_length"]) +CONTIGS_LONGER = Path(config["contigs_longer_than_r2t_minimum_length"]) +PLMUTILS_MODEL_DIR = Path(config["plmutils_model_dir"]) ################################################################################ ## sORF prediction ################################################################################ -rule filter_nt_contigs_to_short: - input: - all_contigs=ALL_CONTIGS, - short_contigs=SHORT_CONTIGS, - output: - contigs300=temp(OUTPUT_DIR / "outputs/sORF/short_contigs/contigs300.fa"), - all_short_contigs=OUTPUT_DIR / "sORF/short_contigs/short_contigs.fa", - conda: - "envs/seqkit.yml" - shell: - """ - seqkit seq --max-len 300 -o {output.contigs300} {input.all_contigs} - cat {input.short_contigs} {output.contigs300} > {output.all_short_contigs} - """ - - -# TER TODO: Add a rule for sORF prediction, either once smallesm is developed, -# when there is an accurate sORF rnasamba model, -# or using another tool from Singh & Roy. - - -rule filter_nt_contigs_to_long: +rule combine_contigs: + """ + By default we assume that files provided to this pipeline are reads2transcriptome outputs. + Reads2transcriptome outputs two files that contain contigs. + The first we refer to as contigs_shorter_than_r2t_minimum (or CONTIGS_SHORTER), + which are transcripts output by assemblers that did not meet the r2t runs minimum contig length. + The second we refer to as contigs_longer_than_r2t_minimum (or CONTIGS_LONGER) and are assembled + transcripts that passed the isoform clustering and decontamination steps of r2t. + We no longer need these pools of transcripts differentiated, so we combine them in this rule. + If your input transcriptome only has one file of assembled contigs, sequences only need to be + supplied in contigs_longer_than_r2t_minimum_length. + contigs_shorter_than_r2t_minimum_length can be an empty file. + """ input: - all_contigs=ALL_CONTIGS, + contigs_shorter=CONTIGS_SHORTER, + contigs_longer=CONTIGS_LONGER, output: - long_contigs=temp(OUTPUT_DIR / "sORF/long_contigs/contigs300.fa"), + all_contigs=OUTPUT_DIR / "sORF" / "contigs" / "all_input_contigs.fa", conda: "envs/seqkit.yml" shell: """ - seqkit seq --min-len 301 -o {output.long_contigs} {input.all_contigs} + cat {input.contigs_shorter} {input.contigs_longer} > {output.all_contigs} """ @@ -68,13 +60,12 @@ rule get_coding_contig_names: """ Extract amino acid contig names and remove everything after the first period, which are isoform labels. - This file will be used to select all contigs that DO NOT encode ORFs, - according to transdecoder. + This file will be used to select all contigs that DID NOT have a transdecoder-detected ORFs. """ input: ORFS_AMINO_ACIDS, output: - names=OUTPUT_DIR / "sORF/long_contigs/orfs_amino_acid_names.txt", + names=OUTPUT_DIR / "sORF" / "contigs" / "orfs_amino_acid_names.txt", conda: "envs/seqkit.yml" shell: @@ -83,19 +74,20 @@ rule get_coding_contig_names: """ -rule filter_long_contigs_to_no_predicted_ORF: +rule filter_contigs_to_no_predicted_ORF: """ - Many of the contigs in the full transcriptome have predicted ORFs. - The names of these contigs are recorded in the transdecoder input files (*pep & *cds, orfs_*). - By definition, these contigs are not noncoding RNAs, - so they don't need to be considered for classification as long noncoding RNAs (lncRNA). - This step removes the contigs that contain ORFs. + The r2t pipeline runs transdecoder to predict open reading frames (ORFs) from transcripts. + By default, only ORFs that are longer than 100 amino acids are kept by transdecoder. + The peptigate pipeline predicts peptides that are 100 amino acids or shorter. + This rule eliminates transcripts that contained a transdecoder-predicted ORF. + It keeps all other transcripts, regardless of length, to investigate the presence of an sORF + later in the pipeline. """ input: - fa=rules.filter_nt_contigs_to_long.output.long_contigs, + fa=rules.combine_contigs.output.all_contigs, names=rules.get_coding_contig_names.output.names, output: - fa=OUTPUT_DIR / "sORF/long_contigs/long_contigs_no_predicted_orf.fa", + fa=OUTPUT_DIR / "sORF" / "contigs" / "contigs_with_no_transdecoder_predicted_orf.fa", conda: "envs/seqkit.yml" shell: @@ -104,70 +96,92 @@ rule filter_long_contigs_to_no_predicted_ORF: """ -rule download_rnasamba_model: +rule plmutils_translate: """ - Place holder rule. - For now, the workflow uses the model output by build_rnasamba_euk_model.snakefile, - which is available locally from running it. + This rule takes input nucleotide transcripts, detects the longest open reading frame, and + translates it into amino acid sequences. """ + input: + rules.filter_contigs_to_no_predicted_ORF.output.fa, output: - model=OUTPUT_DIR / "models/rnasamba/build/3_model/eu_rnasamba.hdf5", + faa=OUTPUT_DIR / "sORF" / "plmutils" / "translated_contigs.faa", + conda: + "envs/plmutils.yml" shell: """ - curl -JLo {output.model} # TODO add URL for download + plmutils translate --longest-only --output-filepath {output} {input} """ -rule pip_install_rnasamba_no_deps: - """ - To take advantage of nvidia GPU on AWS instance - ("Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 20.04) 20240122" ami-07eb000b3340966b0), - we need to install specific versions of tensorflow and other dependencies. - This is accomplished in part in the envs/rnasamba.yml file, - however rnasamba itself is not installed there because we need to use the command: - pip install --no-deps rnasamba - and there is no way to specify the "--no-deps" flag in a yaml file. - This rule installs rnasamba into the conda-generated environment. - Note the output path is the same as that used by curate_datasets_and_build_models.snakefile, - as this conda env will already be present and configured if that snakefile has been executed. - """ +rule length_filter_plmutils_translate_output: + input: + rules.plmutils_translate.output.faa, output: - pip="outputs/models/build/rnasamba/rnasamba_installed.txt", + faa=OUTPUT_DIR / "sORF" / "plmutils" / "translated_contigs_filtered.faa", conda: - "envs/rnasamba.yml" + "envs/seqkit.yml" shell: """ - pip install 'nvidia-tensorflow~=1.15' - pip install --no-deps rnasamba # used version 0.2.5 - touch {output} + seqkit seq --max-len 100 -o {output} {input} """ -rule rnasamba: +rule plmutils_embed: """ - The eukaryote_rnasamba.hdf5 model is only accurate on longer contigs. - It assesses whether they are long noncoding RNAs. - However, lncRNAs often have sORFs that encode peptides. - This rule runs RNAsamba on longer contigs (>300nt) that were not predicted by transdecoder to - contain ORFs. + This rule embeds amino acid sequences produced by plmutils translated into the embedding space + of a protein large language model. + For now, plmutils only supports ESM. + We use the 8M model because it is fast and there is some evidence that it works well for other + peptide prediction tasks (https://www.biorxiv.org/content/10.1101/2023.11.13.566825v2). + The parameter --layer-ind -1 means to extract the embedding from the last layer of the model. """ input: - # TER TODO: update path when model is downloaded - pip=rules.pip_install_rnasamba_no_deps.output.pip, - model=rules.download_rnasamba_model.output.model, - contigs=rules.filter_long_contigs_to_no_predicted_ORF.output.fa, + rules.length_filter_plmutils_translate_output.output.faa, output: - tsv=OUTPUT_DIR / "sORF/long_contigs/rnasamba/classification.tsv", - fa=OUTPUT_DIR / "sORF/long_contigs/rnasamba/predicted_proteins.fa", + npy=OUTPUT_DIR / "sORF" / "plmutils" / "embedded_contigs_filtered.npy", conda: - "envs/rnasamba.yml" + "envs/plmutils.yml" shell: """ - rnasamba classify -p {output.fa} {output.tsv} {input.contigs} {input.model} + plmutils embed --model-name esm2_t6_8M_UR50D \ + --layer-ind -1 \ + --output-filepath {output.npy} \ + {input} """ -## TER TODO: predict sORFs from lncRNAs +rule plmutils_predict: + input: + embeddings=rules.plmutils_embed.output.npy, + faa=rules.length_filter_plmutils_translate_output.output.faa, + model=PLMUTILS_MODEL_DIR, + output: + csv=OUTPUT_DIR / "sORF" / "plmutils" / "predictions.csv", + conda: + "envs/plmutils.yml" + shell: + """ + plmutils predict --model-dirpath {input.model} \ + --embeddings-filepath {input.embeddings} \ + --fasta-filepath {input.faa} \ + --output-filepath {output.csv} + """ + + +rule extract_plmutils_predicted_peptides: + input: + csv=rules.plmutils_predict.output.csv, + faa=rules.length_filter_plmutils_translate_output.output.faa, + output: + names=OUTPUT_DIR / "sORF" / "plmutils" / "peptide_names.faa", + faa=OUTPUT_DIR / "sORF" / "plmutils" / "peptides.faa", + conda: + "envs/seqkit.yml" + shell: + """ + grep "positive" {input.csv} | cut -d, -f1 > {output.names} + seqkit grep -f {output.names} {input.faa} -o {output.faa} + """ ################################################################################ @@ -179,7 +193,7 @@ rule remove_stop_codon_asterisk_from_transdecoder_ORFs: input: ORFS_AMINO_ACIDS, output: - faa=OUTPUT_DIR / "cleavage/preprocessing/noasterisk.faa", + faa=OUTPUT_DIR / "cleavage" / "preprocessing" / "noasterisk.faa", shell: """ sed '/^[^>]/s/\*//g' {input} > {output} @@ -191,10 +205,10 @@ rule remove_stop_codon_asterisk_from_transdecoder_ORFs: rule download_nlpprecursor_models: output: - tar=INPUT_DIR / "models/nlpprecursor/nlpprecursor_models.tar.gz", - model=INPUT_DIR / "models/nlpprecursor/models/annotation/model.p", + tar=INPUT_DIR / "models" / "nlpprecursor" / "nlpprecursor_models.tar.gz", + model=INPUT_DIR / "models" / "nlpprecursor" / "models" / "annotation" / "model.p", params: - outdir=INPUT_DIR / "models/nlpprecursor", + outdir=INPUT_DIR / "models" / "nlpprecursor", shell: """ curl -JLo {output.tar} https://github.com/magarveylab/NLPPrecursor/releases/download/1.0/nlpprecursor_models.tar.gz @@ -222,10 +236,10 @@ rule nlpprecursor: faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa, model=rules.download_nlpprecursor_models.output.model, output: - tsv=OUTPUT_DIR / "cleavage/nlpprecursor/nlpprecursor_predictions.tsv", - peptide=OUTPUT_DIR / "cleavage/nlpprecursor/nlpprecursor_peptides.fasta", + tsv=OUTPUT_DIR / "cleavage" / "nlpprecursor" / "nlpprecursor_predictions.tsv", + peptide=OUTPUT_DIR / "cleavage" / "nlpprecursor" / "nlpprecursor_peptides.fasta", params: - modelsdir=INPUT_DIR / "models/nlpprecursor/models/", + modelsdir=INPUT_DIR / "models" / "nlpprecursor" / "models/", conda: "envs/nlpprecursor.yml" shell: @@ -256,11 +270,11 @@ rule deeppeptide: src=rules.clone_deeppeptide.output.src, faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa, output: - json=OUTPUT_DIR / "cleavage/deeppeptide/peptide_predictions.json", + json=OUTPUT_DIR / "cleavage" / "deeppeptide" / "peptide_predictions.json", conda: "envs/deeppeptide.yml" params: - outdir=OUTPUT_DIR / "cleavage/deeppeptide", + outdir=OUTPUT_DIR / "cleavage" / "deeppeptide", shell: """ cd cloned_repositories/DeepPeptide/predictor && python3 predict.py --fastafile {WORKING_DIRPATH}/{input.faa} --output_dir {params.outdir} --output_fmt json @@ -280,9 +294,9 @@ rule extract_deeppeptide_sequences: faa=rules.remove_stop_codon_asterisk_from_transdecoder_ORFs.output.faa, json=rules.deeppeptide.output.json, output: - propeptide=OUTPUT_DIR / "cleavage/deeppeptide/propeptides.faa", - peptide=OUTPUT_DIR / "cleavage/deeppeptide/peptides.faa", - tsv=OUTPUT_DIR / "cleavage/deeppeptide/predictions.tsv", + propeptide=OUTPUT_DIR / "cleavage" / "deeppeptide" / "propeptides.faa", + peptide=OUTPUT_DIR / "cleavage" / "deeppeptide" / "peptides.faa", + tsv=OUTPUT_DIR / "cleavage" / "deeppeptide" / "predictions.tsv", conda: "envs/biopython.yml" shell: @@ -296,15 +310,13 @@ rule extract_deeppeptide_sequences: ################################################################################ -# TER TODO: add sORF predictions - - rule combine_peptide_predictions: input: + sorf=rules.extract_plmutils_predicted_peptides.output.faa, nlpprecursor=rules.nlpprecursor.output.peptide, deeppeptide=rules.extract_deeppeptide_sequences.output.peptide, output: - peptide=OUTPUT_DIR / "annotation/combined_peptide_predictions/peptides.faa", + peptide=OUTPUT_DIR / "annotation" / "combined_peptide_predictions" / "peptides.faa", shell: """ cat {input} > {output.peptide} @@ -324,7 +336,7 @@ rule download_peptipedia_database: done some quality control on those sequences. """ output: - db=INPUT_DIR / "databases/peptipedia.fasta.gz", + db=INPUT_DIR / "databases" / "peptipedia.fasta.gz", shell: """ curl -JLo {output} https://osf.io/dzycu/download @@ -335,9 +347,9 @@ rule make_diamond_db_from_peptipedia_database: input: db=rules.download_peptipedia_database.output.db, output: - db=OUTPUT_DIR / "annotation/peptipedia/0_diamond_db/peptipedia.dmnd", + db=OUTPUT_DIR / "annotation" / "peptipedia" / "0_diamond_db" / "peptipedia.dmnd", params: - dbprefix=OUTPUT_DIR / "annotation/peptipedia/0_diamond_db/peptipedia", + dbprefix=OUTPUT_DIR / "annotation" / "peptipedia" / "0_diamond_db" / "peptipedia", conda: "envs/diamond.yml" shell: @@ -351,9 +363,9 @@ rule diamond_blastp_peptide_predictions_against_peptipedia_database: db=rules.make_diamond_db_from_peptipedia_database.output.db, peptide=rules.combine_peptide_predictions.output.peptide, output: - tsv=OUTPUT_DIR / "annotation/peptipedia/1_blastp/matches.tsv", + tsv=OUTPUT_DIR / "annotation" / "peptipedia" / "1_blastp" / "matches.tsv", params: - dbprefix=OUTPUT_DIR / "annotation/peptipedia/0_diamond_db/peptipedia", + dbprefix=OUTPUT_DIR / "annotation" / "peptipedia" / "0_diamond_db" / "peptipedia", conda: "envs/diamond.yml" shell: @@ -375,7 +387,7 @@ rule run_deepsig: input: peptide=rules.combine_peptide_predictions.output.peptide, output: - tsv=OUTPUT_DIR / "annotation/deepsig/deepsig.tsv", + tsv=OUTPUT_DIR / "annotation" / "deepsig" / "deepsig.tsv", conda: "envs/deepsig.yml" shell: @@ -389,7 +401,7 @@ rule characterize_peptides: input: peptide=rules.combine_peptide_predictions.output.peptide, output: - tsv=OUTPUT_DIR / "annotation/characteristics/peptide_characteristics.tsv", + tsv=OUTPUT_DIR / "annotation" / "characteristics" / "peptide_characteristics.tsv", conda: "envs/peptides.yml" shell: @@ -418,6 +430,25 @@ AUTOPEPTIDEML_MODEL_NAMES = [ ] +rule download_autopeptideml_models: + output: + # TER TODO: the authors of autopeptideml sent me these models. + # They said they're working on uploading them. + # Once they're available, I need to add a rule to download them and update the input here + # to be the rules syntax + # https://github.com/IBM/AutoPeptideML/issues/6#issuecomment-1989494559 + model=INPUT_DIR + / "models" + / "autopeptideml" + / "HPO_NegSearch_HP" + / "{autopeptideml_model_name}_1" + / "apml_config.json", + shell: + """ + touch {output} # will become a curl command or something, depending on how models are packaged + """ + + rule run_autopeptideml: """ AutoPeptideML predicts the bioactivity of a peptide based on user-supplied models. @@ -435,16 +466,14 @@ rule run_autopeptideml: """ input: peptide=rules.combine_peptide_predictions.output.peptide, - # TER TODO: the authors of autopeptideml sent me these models. - # They said they're working on uploading them. - # Once they're available, I need to add a rule to download them and update the input here - # to be the rules syntax - model=INPUT_DIR - / "models/autopeptideml/HPO_NegSearch_HP/{autopeptideml_model_name}_1/apml_config.json", + model=rules.download_autopeptideml_models.output.model, output: - tsv=OUTPUT_DIR / "annotation/autopeptideml/autopeptideml_{autopeptideml_model_name}.tsv", + tsv=OUTPUT_DIR + / "annotation" + / "autopeptideml" + / "autopeptideml_{autopeptideml_model_name}.tsv", params: - modelsdir=INPUT_DIR / "models/autopeptideml/HPO_NegSearch_HP/", + modelsdir=INPUT_DIR / "models" / "autopeptideml" / "HPO_NegSearch_HP/", conda: "envs/autopeptideml.yml" shell: @@ -461,6 +490,7 @@ rule combine_peptide_annotations: input: nlpprecursor=rules.nlpprecursor.output.tsv, deeppeptide=rules.extract_deeppeptide_sequences.output.tsv, + plmutils=rules.plmutils_predict.output.csv, autopeptideml=expand( rules.run_autopeptideml.output.tsv, autopeptideml_model_name=AUTOPEPTIDEML_MODEL_NAMES ), @@ -468,9 +498,9 @@ rule combine_peptide_annotations: peptipedia=rules.diamond_blastp_peptide_predictions_against_peptipedia_database.output.tsv, characteristics=rules.characterize_peptides.output.tsv, output: - tsv=OUTPUT_DIR / "annotation/peptide_annotations.tsv", + tsv=OUTPUT_DIR / "annotation" / "peptide_annotations.tsv", params: - autopeptidemldir=OUTPUT_DIR / "annotation/autopeptideml/", + autopeptidemldir=OUTPUT_DIR / "annotation" / "autopeptideml/", conda: "envs/tidyverse.yml" shell: @@ -478,6 +508,7 @@ rule combine_peptide_annotations: Rscript scripts/combine_peptide_annotations.R \ --nlpprecursor_path {input.nlpprecursor} \ --deeppeptide_path {input.deeppeptide} \ + --plmutils_path {input.plmutils} \ --autopeptideml_dir {params.autopeptidemldir} \ --deepsig_path {input.deepsig} \ --peptipedia_path {input.peptipedia} \ @@ -494,9 +525,7 @@ rule combine_peptide_annotations: rule all: default_target: True input: - rules.rnasamba.output.tsv, - rules.nlpprecursor.output.peptide, - rules.extract_deeppeptide_sequences.output.peptide, + rules.combine_peptide_annotations.output.tsv, rule predict_sORF: @@ -505,7 +534,7 @@ rule predict_sORF: snakemake predict_sORF --software-deployment-method conda -j 8 """ input: - rules.rnasamba.output.tsv, + sorf=rules.extract_plmutils_predicted_peptides.output.faa, rule predict_cleavage: @@ -514,4 +543,5 @@ rule predict_cleavage: snakemake predict_cleavage --software-deployment-method conda -j 8 """ input: - rules.combine_peptide_annotations.output.tsv, + rules.nlpprecursor.output.peptide, + rules.extract_deeppeptide_sequences.output.peptide, diff --git a/config.yml b/config.yml index 1a2bbd8..572dc03 100644 --- a/config.yml +++ b/config.yml @@ -17,12 +17,23 @@ output_dir: "outputs/" # All input files are produced by reads2transcriptome. # While this is not a strict requirement, using these output files gives us the ability to look at very short contiguous sequences. # TER TODO: update names of output files based on Nextflow of reads2transcriptome. -# - short_contigs: contigs that are shorter than X nucleotides, which do not progress through the assembly pipeline. These may contain sORFs and so are included. -# - orfs_amino_acids: predicted ORFs translated into amino acids. Output by transdecoder. Used for cleavage peptide prediction and annotation of nonribosomal peptide synthetases. -# - orfs_nucleotides: predicted ORFs as nucleotide sequences. Output by transdecoder. Used to compare peptide nucleotide sequences (clustering, dn/ds estimation, etc.). -# - all_contigs: all contigs as nucleotide sequences. Used to identify contigs shorter than X nucleotides (300) to scan for sORFs and to predict lncRNAs, which may have sORFs embedded in them. +# - orfs_amino_acids: predicted ORFs translated into amino acids. +# Output by transdecoder. +# Used for cleavage peptide prediction and annotation of nonribosomal peptide synthetases. +# - orfs_nucleotides: predicted ORFs as nucleotide sequences. Output by transdecoder. +# Used to compare peptide nucleotide sequences (clustering, dn/ds estimation, etc.). +# - contigs_shorter_than_r2t_minimum_length: contigs that are shorter than X nucleotides (by default 75bp). +# The reads2transcriptome pipeline assembles RNA-seq reads into contigs (transcripts) using multiple assemblers and then merges those assemblies together. +# Before merging, very short contigs are removed (<75bp). +# However, reads2transcriptome outputs a FASTA file containing these transcripts, which is used as input to peptigate here. +# These contigs may contain sORFs and so are included as an input to the peptigate pipeline. +# - contigs_longer_than_r2t_minimum_length: contigs longer than X nucleotides (default is 75bp). +# If the user did not use the reads2transcriptome file to generate their input files, this would be a transcriptome assembly FASTA file in nucleotide format containing transcripts. +# Note that the first step of sORF prediction combines the contigs_shorter_than_r2t_minimum_length and contigs_longer_than_r2t_minimum_length files, so there is no need to perform any pre-processing by length. +# - plmutils_model_dir: path to the directory for the plmutils model that will predict whether sORFs are coding or non-coding. -short_contigs: "demo/short_contigs.fa" orfs_amino_acids: "demo/orfs_amino_acids.faa" orfs_nucleotides: "demo/orfs_nucleotides.fa" -all_contigs: "demo/all_contigs.fa" +contigs_shorter_than_r2t_minimum_length: "demo/contigs_shorter_than_r2t_minimum_length.fa" +contigs_longer_than_r2t_minimum_length: "demo/contigs_longer_than_r2t_minimum_length.fa" +plmutils_model_dir: "inputs/models/plmutils/" diff --git a/demo/all_contigs.fa b/demo/contigs_longer_than_r2t_minimum_length.fa similarity index 100% rename from demo/all_contigs.fa rename to demo/contigs_longer_than_r2t_minimum_length.fa diff --git a/demo/short_contigs.fa b/demo/contigs_shorter_than_r2t_minimum_length.fa similarity index 100% rename from demo/short_contigs.fa rename to demo/contigs_shorter_than_r2t_minimum_length.fa diff --git a/inputs/models/plmutils/classifier.joblib b/inputs/models/plmutils/classifier.joblib new file mode 100644 index 0000000..9ec044b Binary files /dev/null and b/inputs/models/plmutils/classifier.joblib differ diff --git a/inputs/models/plmutils/pca.joblib b/inputs/models/plmutils/pca.joblib new file mode 100644 index 0000000..af00de4 Binary files /dev/null and b/inputs/models/plmutils/pca.joblib differ diff --git a/scripts/combine_peptide_annotations.R b/scripts/combine_peptide_annotations.R index 081d39a..881c07c 100644 --- a/scripts/combine_peptide_annotations.R +++ b/scripts/combine_peptide_annotations.R @@ -9,6 +9,9 @@ option_list <- list( make_option(c("--deeppeptide_path"), type="character", default="outputs/cleavage/deeppeptide/predictions.tsv", help="Path to DeepPeptide predictions TSV file."), + make_option(c("--plmutils_path"), type="character", + default="outputs/sORF/plmutils/predictions.csv", + help="Path to plmutils sORF predictions CSV file."), make_option(c("--autopeptideml_dir"), type="character", default="outputs/annotation/autopeptideml/", help="Path to directory containing AutoPeptideML TSV files."), @@ -37,13 +40,14 @@ args <- parse_args(OptionParser(option_list=option_list)) #' #' @param nlpprecursor_path Path to the NLPprecursor predictions TSV file. #' @param deeppeptide_path Path to the DeepPeptide predictions TSV file. +#' @param plmutils_path Path to the plmutils predictions CSV file. #' @param autopeptideml_dir Path to the directory containing AutoPeptideML TSV files. #' @param deepsig_path Path to the DeepSig annotations TSV file. #' @param peptipedia_path Path to the Peptipedia BLAST matches TSV file. #' @param characteristics_path Path to the peptide characteristics TSV file. #' #' @return A data frame with peptide predictions merged with various annotations. -combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path, +combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path, plmutils_path, autopeptideml_dir, deepsig_path, peptipedia_path, characteristics_path) { @@ -51,8 +55,15 @@ combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path, select(-nlpprecursor_cleavage_sequence) deeppeptide <- read_tsv(deeppeptide_path) + + plmutils <- read_csv(plmutils_path) %>% + filter(predicted_label == "positive") %>% + select(peptide_id = sequence_id) %>% + mutate(peptide_type = "sORF", + prediction_tool = "plmutils") - peptide_predictions <- bind_rows(nlpprecursor, deeppeptide) + peptide_predictions <- bind_rows(nlpprecursor, deeppeptide) %>% + bind_rows(plmutils) autopeptideml_files <- Sys.glob(paste0(autopeptideml_dir, "/*tsv")) autopeptideml <- map_dfr(autopeptideml_files, read_tsv) %>% @@ -85,6 +96,7 @@ combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path, annotations_df<- combine_peptide_annotations(nlpprecursor_path = args$nlpprecursor_path, deeppeptide_path = args$deeppeptide_path, + plmutils_path = args$plmutils_path, autopeptideml_dir = args$autopeptideml_dir, deepsig_path = args$deepsig_path, peptipedia_path = args$peptipedia_path,