Merge pull request #22 from Arcadia-Science/ter/plmutils-in-peptigate

Add in sORF prediction to peptigate using plmutils
Arcadia-Science · Mar 13, 2024 · b85f930 · b85f930
2 parents 1f1c091 + e81c63d
commit b85f930
Show file tree

Hide file tree

Showing 7 changed files with 177 additions and 124 deletions.
diff --git a/Snakefile b/Snakefile
diff --git a/config.yml b/config.yml
@@ -17,12 +17,23 @@ output_dir: "outputs/"
 # All input files are produced by reads2transcriptome.
 # While this is not a strict requirement, using these output files gives us the ability to look at very short contiguous sequences.
 # TER TODO: update names of output files based on Nextflow of reads2transcriptome.
-# - short_contigs: contigs that are shorter than X nucleotides, which do not progress through the assembly pipeline. These may contain sORFs and so are included.
-# - orfs_amino_acids: predicted ORFs translated into amino acids. Output by transdecoder. Used for cleavage peptide prediction and annotation of nonribosomal peptide synthetases.
-# - orfs_nucleotides: predicted ORFs as nucleotide sequences. Output by transdecoder. Used to compare peptide nucleotide sequences (clustering, dn/ds estimation, etc.).
-# - all_contigs: all contigs as nucleotide sequences. Used to identify contigs shorter than X nucleotides (300) to scan for sORFs and to predict lncRNAs, which may have sORFs embedded in them.
+# - orfs_amino_acids: predicted ORFs translated into amino acids.
+#   Output by transdecoder.
+#   Used for cleavage peptide prediction and annotation of nonribosomal peptide synthetases.
+# - orfs_nucleotides: predicted ORFs as nucleotide sequences. Output by transdecoder.
+#   Used to compare peptide nucleotide sequences (clustering, dN/dS estimation, etc.).
+# - contigs_shorter_than_r2t_minimum_length: contigs that are shorter than X nucleotides (by default 75bp).
+#   The reads2transcriptome pipeline assembles RNA-seq reads into contigs (transcripts) using multiple assemblers and then merges those assemblies together.
+#   Before merging, very short contigs are removed (<75bp).
+#   However, reads2transcriptome still outputs a FASTA file containing these removed short contigs, which is used as input to peptigate here.
+#   These contigs may contain sORFs and so are included as an input to the peptigate pipeline.
+# - contigs_longer_than_r2t_minimum_length: contigs longer than X nucleotides (default is 75bp).
+#   If the user did not use the reads2transcriptome pipeline to generate their input files, this would be a transcriptome assembly FASTA file in nucleotide format containing transcripts.
+#   Note that the first step of sORF prediction combines the contigs_shorter_than_r2t_minimum_length and contigs_longer_than_r2t_minimum_length files, so there is no need to perform any pre-processing by length.
+# - plmutils_model_dir: path to the directory for the plmutils model that will predict whether sORFs are coding or non-coding.
 
-short_contigs: "demo/short_contigs.fa"
 orfs_amino_acids: "demo/orfs_amino_acids.faa"
 orfs_nucleotides: "demo/orfs_nucleotides.fa"
-all_contigs: "demo/all_contigs.fa"
+contigs_shorter_than_r2t_minimum_length: "demo/contigs_shorter_than_r2t_minimum_length.fa"
+contigs_longer_than_r2t_minimum_length: "demo/contigs_longer_than_r2t_minimum_length.fa"
+plmutils_model_dir: "inputs/models/plmutils/"
diff --git a/demo/all_contigs.fa → ...contigs_longer_than_r2t_minimum_length.fa b/demo/all_contigs.fa → ...contigs_longer_than_r2t_minimum_length.fa
diff --git a/demo/short_contigs.fa → ...ontigs_shorter_than_r2t_minimum_length.fa b/demo/short_contigs.fa → ...ontigs_shorter_than_r2t_minimum_length.fa
diff --git a/inputs/models/plmutils/classifier.joblib b/inputs/models/plmutils/classifier.joblib
diff --git a/inputs/models/plmutils/pca.joblib b/inputs/models/plmutils/pca.joblib
diff --git a/scripts/combine_peptide_annotations.R b/scripts/combine_peptide_annotations.R
@@ -9,6 +9,9 @@ option_list <- list(
   make_option(c("--deeppeptide_path"), type="character",
               default="outputs/cleavage/deeppeptide/predictions.tsv", 
               help="Path to DeepPeptide predictions TSV file."),
+  make_option(c("--plmutils_path"), type="character",
+              default="outputs/sORF/plmutils/predictions.csv", 
+              help="Path to plmutils sORF predictions CSV file."),
   make_option(c("--autopeptideml_dir"), type="character",
               default="outputs/annotation/autopeptideml/", 
               help="Path to directory containing AutoPeptideML TSV files."),
@@ -37,22 +40,30 @@ args <- parse_args(OptionParser(option_list=option_list))
 #'
 #' @param nlpprecursor_path Path to the NLPprecursor predictions TSV file.
 #' @param deeppeptide_path Path to the DeepPeptide predictions TSV file.
+#' @param plmutils_path Path to the plmutils predictions CSV file.
 #' @param autopeptideml_dir Path to the directory containing AutoPeptideML TSV files.
 #' @param deepsig_path Path to the DeepSig annotations TSV file.
 #' @param peptipedia_path Path to the Peptipedia BLAST matches TSV file.
 #' @param characteristics_path Path to the peptide characteristics TSV file.
 #'
 #' @return A data frame with peptide predictions merged with various annotations.
-combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path, 
+combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path, plmutils_path, 
                                         autopeptideml_dir, deepsig_path,
                                         peptipedia_path, characteristics_path) {
 
   nlpprecursor <- read_tsv(nlpprecursor_path) %>%
     select(-nlpprecursor_cleavage_sequence)
 
   deeppeptide <- read_tsv(deeppeptide_path)
+
+  plmutils <- read_csv(plmutils_path) %>%
+    filter(predicted_label == "positive") %>%
+    select(peptide_id = sequence_id) %>%
+    mutate(peptide_type = "sORF", 
+           prediction_tool = "plmutils")
 
-  peptide_predictions <- bind_rows(nlpprecursor, deeppeptide)
+  peptide_predictions <- bind_rows(nlpprecursor, deeppeptide) %>%
+    bind_rows(plmutils)
 
   autopeptideml_files <- Sys.glob(paste0(autopeptideml_dir, "/*tsv"))
   autopeptideml <- map_dfr(autopeptideml_files, read_tsv) %>%
@@ -85,6 +96,7 @@ combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path,
 
 annotations_df<- combine_peptide_annotations(nlpprecursor_path = args$nlpprecursor_path,
                                              deeppeptide_path = args$deeppeptide_path,
+                                             plmutils_path = args$plmutils_path,
                                              autopeptideml_dir = args$autopeptideml_dir,
                                              deepsig_path = args$deepsig_path,
                                              peptipedia_path = args$peptipedia_path,