Skip to content

Commit

Permalink
Merge pull request #22 from Arcadia-Science/ter/plmutils-in-peptigate
Browse files Browse the repository at this point in the history
Add in sORF prediction to peptigate using plmutils
  • Loading branch information
taylorreiter authored Mar 13, 2024
2 parents 1f1c091 + e81c63d commit b85f930
Show file tree
Hide file tree
Showing 7 changed files with 177 additions and 124 deletions.
262 changes: 146 additions & 116 deletions Snakefile

Large diffs are not rendered by default.

23 changes: 17 additions & 6 deletions config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,23 @@ output_dir: "outputs/"
# All input files are produced by reads2transcriptome.
# While this is not a strict requirement, using these output files gives us the ability to look at very short contiguous sequences.
# TER TODO: update names of output files based on Nextflow of reads2transcriptome.
# - short_contigs: contigs that are shorter than X nucleotides, which do not progress through the assembly pipeline. These may contain sORFs and so are included.
# - orfs_amino_acids: predicted ORFs translated into amino acids. Output by transdecoder. Used for cleavage peptide prediction and annotation of nonribosomal peptide synthetases.
# - orfs_nucleotides: predicted ORFs as nucleotide sequences. Output by transdecoder. Used to compare peptide nucleotide sequences (clustering, dn/ds estimation, etc.).
# - all_contigs: all contigs as nucleotide sequences. Used to identify contigs shorter than X nucleotides (300) to scan for sORFs and to predict lncRNAs, which may have sORFs embedded in them.
# - orfs_amino_acids: predicted ORFs translated into amino acids.
# Output by transdecoder.
# Used for cleavage peptide prediction and annotation of nonribosomal peptide synthetases.
# - orfs_nucleotides: predicted ORFs as nucleotide sequences. Output by transdecoder.
# Used to compare peptide nucleotide sequences (clustering, dn/ds estimation, etc.).
# - contigs_shorter_than_r2t_minimum_length: contigs that are shorter than X nucleotides (by default 75bp).
# The reads2transcriptome pipeline assembles RNA-seq reads into contigs (transcripts) using multiple assemblers and then merges those assemblies together.
# Before merging, very short contigs are removed (<75bp).
# However, reads2transcriptome writes these removed short contigs to a separate FASTA file, which is used as input to peptigate here.
# These contigs may contain sORFs and so are included as an input to the peptigate pipeline.
# - contigs_longer_than_r2t_minimum_length: contigs longer than X nucleotides (default is 75bp).
# If the user did not use the reads2transcriptome pipeline to generate their input files, this would be a transcriptome assembly FASTA file in nucleotide format containing transcripts.
# Note that the first step of sORF prediction combines the contigs_shorter_than_r2t_minimum_length and contigs_longer_than_r2t_minimum_length files, so there is no need to perform any pre-processing by length.
# - plmutils_model_dir: path to the directory for the plmutils model that will predict whether sORFs are coding or non-coding.

short_contigs: "demo/short_contigs.fa"
orfs_amino_acids: "demo/orfs_amino_acids.faa"
orfs_nucleotides: "demo/orfs_nucleotides.fa"
all_contigs: "demo/all_contigs.fa"
contigs_shorter_than_r2t_minimum_length: "demo/contigs_shorter_than_r2t_minimum_length.fa"
contigs_longer_than_r2t_minimum_length: "demo/contigs_longer_than_r2t_minimum_length.fa"
plmutils_model_dir: "inputs/models/plmutils/"
File renamed without changes.
File renamed without changes.
Binary file added inputs/models/plmutils/classifier.joblib
Binary file not shown.
Binary file added inputs/models/plmutils/pca.joblib
Binary file not shown.
16 changes: 14 additions & 2 deletions scripts/combine_peptide_annotations.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ option_list <- list(
make_option(c("--deeppeptide_path"), type="character",
default="outputs/cleavage/deeppeptide/predictions.tsv",
help="Path to DeepPeptide predictions TSV file."),
make_option(c("--plmutils_path"), type="character",
default="outputs/sORF/plmutils/predictions.csv",
help="Path to plmutils sORF predictions CSV file."),
make_option(c("--autopeptideml_dir"), type="character",
default="outputs/annotation/autopeptideml/",
help="Path to directory containing AutoPeptideML TSV files."),
Expand Down Expand Up @@ -37,22 +40,30 @@ args <- parse_args(OptionParser(option_list=option_list))
#'
#' @param nlpprecursor_path Path to the NLPprecursor predictions TSV file.
#' @param deeppeptide_path Path to the DeepPeptide predictions TSV file.
#' @param plmutils_path Path to the plmutils predictions CSV file.
#' @param autopeptideml_dir Path to the directory containing AutoPeptideML TSV files.
#' @param deepsig_path Path to the DeepSig annotations TSV file.
#' @param peptipedia_path Path to the Peptipedia BLAST matches TSV file.
#' @param characteristics_path Path to the peptide characteristics TSV file.
#'
#' @return A data frame with peptide predictions merged with various annotations.
combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path,
combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path, plmutils_path,
autopeptideml_dir, deepsig_path,
peptipedia_path, characteristics_path) {

nlpprecursor <- read_tsv(nlpprecursor_path) %>%
select(-nlpprecursor_cleavage_sequence)

deeppeptide <- read_tsv(deeppeptide_path)

plmutils <- read_csv(plmutils_path) %>%
filter(predicted_label == "positive") %>%
select(peptide_id = sequence_id) %>%
mutate(peptide_type = "sORF",
prediction_tool = "plmutils")

peptide_predictions <- bind_rows(nlpprecursor, deeppeptide)
peptide_predictions <- bind_rows(nlpprecursor, deeppeptide) %>%
bind_rows(plmutils)

autopeptideml_files <- Sys.glob(paste0(autopeptideml_dir, "/*tsv"))
autopeptideml <- map_dfr(autopeptideml_files, read_tsv) %>%
Expand Down Expand Up @@ -85,6 +96,7 @@ combine_peptide_annotations <- function(nlpprecursor_path, deeppeptide_path,

annotations_df<- combine_peptide_annotations(nlpprecursor_path = args$nlpprecursor_path,
deeppeptide_path = args$deeppeptide_path,
plmutils_path = args$plmutils_path,
autopeptideml_dir = args$autopeptideml_dir,
deepsig_path = args$deepsig_path,
peptipedia_path = args$peptipedia_path,
Expand Down

0 comments on commit b85f930

Please sign in to comment.