fix(grohmm): Support gxf
Scrap all of the CHM13 attempts

I think it's the GTF conversion that's causing the memory overload, but I can get them all to load locally. So I'm going to try to use GFF3 files if they're supplied, and otherwise fall back to GTF.
edmundmiller committed Oct 20, 2024
1 parent c5d7b3b commit b6a93ca
Showing 9 changed files with 280 additions and 74 deletions.
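Before the per-file diff, the gist of the change: both R scripts now hand the annotation file straight to GenomicFeatures::makeTxDbFromGFF(), whose default format = "auto" infers GFF3 vs. GTF from the file extension, which is what lets a single --gxf argument accept either format. A minimal sketch of the new annotation path (file path and mc.cores value borrowed from the test changes below):

suppressPackageStartupMessages(library(GenomicFeatures))
suppressPackageStartupMessages(library(groHMM))

gxf <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" # a .gtf works here too
kg_db <- makeTxDbFromGFF(gxf) # format = "auto" by default
kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name"))
kg_consensus <- makeConsensusAnnotations(kg_tx, mc.cores = 2)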
@@ -165,6 +165,7 @@ suppressPackageStartupMessages(library(groHMM))
suppressPackageStartupMessages(library(testthat))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(rtracklayer))
+library(GenomicFeatures)

# test_that("custom_makeConsensusAnnotations maintains accuracy", {
# # Load the GTF file
@@ -216,6 +217,17 @@ suppressPackageStartupMessages(library(rtracklayer))
# print(head(refactored_result, 5))
# })

+gff <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz"
+kg_db <- makeTxDbFromGFF(gff)
+kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name"))
+print("Collapse annotations in preparation for overlap")
+kg_consensus <- makeConsensusAnnotations(
+    kg_tx,
+    mc.cores = 2
+)
+print("Kg consensus:")
+print(kg_consensus)
+
test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", {
# Load the GFF3 file
gff_url <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz"
@@ -239,6 +251,8 @@ test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", {
original_result <- groHMM::makeConsensusAnnotations(sample_ranges)
refactored_result <- custom_makeConsensusAnnotations(sample_ranges)

+print("Original result:")
+print(original_result)
print("Refactored result:")
print(refactored_result)

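The hunks above elide the sampling step, but the shape of the test is: draw a subset of transcripts, run both the original groHMM implementation and the refactored one, and require identical output. A sketch of that pattern (the seed, sample size, and sort-based comparison are hypothetical stand-ins for the elided lines; custom_makeConsensusAnnotations comes from the test's helpers):

set.seed(42) # hypothetical seed
sample_ranges <- kg_tx[sample(length(kg_tx), 100)] # hypothetical sample size
original_result <- groHMM::makeConsensusAnnotations(sample_ranges)
refactored_result <- custom_makeConsensusAnnotations(sample_ranges)
testthat::expect_equal(sort(original_result), sort(refactored_result))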
71 changes: 7 additions & 64 deletions bin/parameter_tuning.R
@@ -56,11 +56,11 @@ parser$add_argument(
)
parser$add_argument(
"-g",
-"--gtf",
+"--gxf",
type = "character",
default = NULL,
metavar = "string",
-help = "GTF File to create TxDb",
+help = "GFF/GTF File to create TxDb",
required = TRUE
)
parser$add_argument(
@@ -75,7 +75,6 @@ parser$add_argument(
"-m",
"--memory",
type = "integer",
-default = 56000,
metavar = "integer",
help = "Amount of memory in MB"
)
@@ -91,7 +90,6 @@ if (is.null(args$bam_files)) {
stop("Please provide a bam file", call. = FALSE)
}
-

# Load alignment files
# TODO? CHANGE BASED ON PAIRED OR SINGLE END
alignments <- c()
@@ -104,65 +102,12 @@ for (bam in args$bam_files) {
}

print("Input transcript annotations")
-# Import the GTF file using rtracklayer
-gtf <- import(args$gtf)
-
-# Exclude any transcripts located on chromosomes labeled with "random"
-gtf <- gtf[!grepl("random", seqnames(gtf)), ]
-
-# Extract transcript-level features
-transcripts_gtf <- gtf[gtf$type == "transcript", ]
-# Extract exon features
-exons_gtf <- gtf[gtf$type == "exon", ]
-
-# Ensure that the 'transcript_id' and 'gene_id' columns are present
-if (!all(c("transcript_id", "gene_id") %in% names(mcols(exons_gtf)))) {
-    stop("The GTF file lacks 'transcript_id' or 'gene_id' in its attributes.")
-}
-
-# Group exons by transcript_id
-exons_by_transcript <- split(exons_gtf, exons_gtf$transcript_id)
-
-# Diagnostic prints
-print(paste("Number of transcripts:", length(exons_by_transcript)))
-
-# Reduce exons to create transcript ranges
-transcripts_ranges <- GenomicRanges::reduce(exons_by_transcript)
-transcripts_ranges <- unlist(transcripts_ranges, use.names = TRUE)
-
-# Diagnostic prints after reduction
-print(paste("Number of transcripts_ranges after reduction:", length(transcripts_ranges)))
-
-# Create mapping dataframe
-mapping_df <- data.frame(
-    transcript_id = names(transcripts_ranges),
-    gene_id = vapply(exons_by_transcript[names(transcripts_ranges)], function(x) unique(x$gene_id)[1], character(1)),
-    stringsAsFactors = FALSE
-)
-
-# Check for length mismatch
-if (nrow(mapping_df) != length(transcripts_ranges)) {
-    stop(paste("Length mismatch between mapping_df and transcripts_ranges:", nrow(mapping_df), length(transcripts_ranges)))
-}
-
-# Assign metadata
-mcols(transcripts_ranges)$transcript_id <- mapping_df$transcript_id
-mcols(transcripts_ranges)$gene_id <- mapping_df$gene_id
-
-# Assign seqnames and strand from the exons
-seqnames(transcripts_ranges) <- seqnames(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)])
-strand(transcripts_ranges) <- strand(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)])
-
-# Ensure that seqlevels are set correctly
-seqlevels(transcripts_ranges) <- seqlevels(gtf)
-
-# Remove any transcripts with NA values
-transcripts_ranges <- transcripts_ranges[!is.na(start(transcripts_ranges)) & !is.na(end(transcripts_ranges))]
-
+kg_db <- makeTxDbFromGFF(args$gxf)
+kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name"))
print("Collapse annotations in preparation for overlap")
-kg_consensus <- custom_makeConsensusAnnotations(
-    transcripts_ranges,
-    mc.cores = min(args$cores, 10) # 10 the number they had hardcoded in the grohmm package for some reason
+kg_consensus <- makeConsensusAnnotations(
+    kg_tx,
+    mc.cores = args$cores
)
print("Finished consensus annotations")

@@ -223,5 +168,3 @@ if (file.exists(r_log_file) == FALSE) {
print(a)
sink()
}
-
-
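For context on the ~55 lines deleted above: they rebuilt per-transcript ranges by importing the GTF with rtracklayer, splitting exons by transcript_id, reducing them, and reattaching gene_id, strand, and seqlevels by hand, and that bookkeeping is exactly what a TxDb does internally. A self-contained toy illustration (the GFF3 content and IDs are made up for the example):

suppressPackageStartupMessages(library(GenomicFeatures))

toy_gff <- tempfile(fileext = ".gff3")
writeLines(c(
    "##gff-version 3",
    "chr1\ttoy\tgene\t1\t3000\t.\t+\t.\tID=gene1",
    "chr1\ttoy\tmRNA\t1\t1500\t.\t+\t.\tID=tx1;Parent=gene1",
    "chr1\ttoy\texon\t1\t600\t.\t+\t.\tParent=tx1",
    "chr1\ttoy\texon\t1000\t1500\t.\t+\t.\tParent=tx1",
    "chr1\ttoy\tmRNA\t1200\t3000\t.\t+\t.\tID=tx2;Parent=gene1",
    "chr1\ttoy\texon\t1200\t3000\t.\t+\t.\tParent=tx2"
), toy_gff)

txdb <- makeTxDbFromGFF(toy_gff)
transcripts(txdb, columns = c("gene_id", "tx_id", "tx_name"))
# A GRanges of 2 transcripts; gene_id, tx_id, and tx_name come back as
# metadata columns, with no manual split()/reduce()/reattachment needed.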
6 changes: 3 additions & 3 deletions bin/transcriptcalling_grohmm.R
@@ -64,11 +64,11 @@ parser$add_argument(
)
parser$add_argument(
"-g",
-"--gtf",
+"--gxf",
type = "character",
default = NULL,
metavar = "string",
-help = "GTF File to create TxDb",
+help = "GFF/GTF File to create TxDb",
required = TRUE
)
parser$add_argument(
@@ -143,7 +143,7 @@ write.table(
)

print("Input transcript annotations")
-kg_db <- makeTxDbFromGFF(args$gtf)
+kg_db <- makeTxDbFromGFF(args$gxf)
kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name"))
# TODO I wonder if I could speed things up by filtering by chromosome at the Nextflow level
# https://github.com/google/deepvariant/issues/744
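On the TODO above about filtering by chromosome: a hedged sketch of the R-side version, assuming the kg_tx object from this script and chr21 to match the chr21-only test annotation added below:

# keepSeqlevels() is from GenomeInfoDb (loaded with GenomicFeatures);
# pruning.mode = "coarse" drops ranges on the discarded chromosomes.
kg_tx <- keepSeqlevels(kg_tx, "chr21", pruning.mode = "coarse")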
4 changes: 2 additions & 2 deletions modules/local/grohmm/parametertuning/main.nf
@@ -10,7 +10,7 @@ process GROHMM_PARAMETERTUNING {

input:
tuple val(meta), path(bams), path(bais)
-path gtf
+path gxf
each UTS
each LtProbB

@@ -29,7 +29,7 @@
parameter_tuning.R \\
--bam_file ${bams} \\
--outprefix ${prefix} \\
-    --gtf $gtf \\
+    --gxf $gxf \\
--uts $UTS \\
--ltprobb $LtProbB \\
--outdir ./ \\
4 changes: 2 additions & 2 deletions modules/local/grohmm/transcriptcalling/main.nf
@@ -10,7 +10,7 @@ process GROHMM_TRANSCRIPTCALLING {

input:
tuple val(meta), path(bams), path(bais), path(tuning_file)
-path gtf
+path gxf

output:
tuple val(meta), path("*.transcripts.txt"), emit: transcripts
@@ -32,7 +32,7 @@
--bam_file ${bams} \\
--tuning_file ${tuning_file} \\
--outprefix ${prefix} \\
-    --gtf $gtf \\
+    --gxf $gxf \\
--outdir ./ \\
--cores $task.cpus \\
--memory ${task.memory.toMega()} \\
4 changes: 2 additions & 2 deletions subworkflows/local/transcript_identification.nf
@@ -14,7 +14,7 @@ include { BEDTOOLS_INTERSECT } from '../../modules/nf-core/bedtools/intersect/ma
workflow TRANSCRIPT_INDENTIFICATION {
take:
group_bam_bai
-gtf
+gxf
fasta
chrom_sizes

@@ -25,7 +25,7 @@ workflow TRANSCRIPT_INDENTIFICATION {

grohmm_td_plot = Channel.empty()
if(!params.skip_grohmm && params.assay_type == "GROseq") {
-GROHMM ( group_bam_bai, gtf )
+GROHMM ( group_bam_bai, gxf )
ch_identification_bed = ch_identification_bed.mix(GROHMM.out.bed)
grohmm_td_plot = GROHMM.out.td_plot
ch_versions = ch_versions.mix(GROHMM.out.versions.first())
4 changes: 3 additions & 1 deletion workflows/nascent.nf
@@ -286,9 +286,11 @@ workflow NASCENT {

ch_group_bam_bai = ch_group_bam.join(ch_group_bai, by: [0])

+ch_gxf = ch_gff ? ch_gff : PREPARE_GENOME.out.gtf
+
TRANSCRIPT_INDENTIFICATION (
ch_group_bam_bai,
-PREPARE_GENOME.out.gtf,
+ch_gxf,
PREPARE_GENOME.out.fasta,
PREPARE_GENOME.out.chrom_sizes,
)
@@ -0,0 +1,57 @@
nextflow_pipeline {

name "GROHMM"
script "../../../../../main.nf"
tag "input"
tag "grohmm"
tag "gff"
tag "chm13"
// triggers 'bin/parameter_tuning.R', 'bin/transcriptcalling_grohmm.R'

test("Should run groHMM with only a GFF file") {
when {
params {
outdir = "$outputDir"
skip_grohmm = false
gff = 'https://huggingface.co/datasets/edmundmiller/nascent-test-data/resolve/main/chm13v2.0_RefSeq_Liftoff_v5.1.chr21.gff3.gz'
// TODO Use CHM13 Fasta
gtf = null
bed = null
}
}

then {
assertAll(
{ assert workflow.success },
{ assert snapshot(UTILS.removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml")).match("software_versions") },
{ assert snapshot(
workflow.trace.tasks().size(),
path("$outputDir/transcript_identification/homer/cd4.bed"),
path("$outputDir/transcript_identification/homer/jurkat.bed"),
// FIXME Not deterministic because of the order of files
// Add to the other tests when fixed
// UTILS.getAllFilesFromDir("$outputDir/transcript_identification/pints/", ".bed"),
path("$outputDir/transcript_identification/intersect/").list(),
path("$outputDir/transcript_identification/filtered/").list(),
path("$outputDir/transcript_identification/grohmm/cd4.eval.txt"),
path("$outputDir/transcript_identification/grohmm/cd4.final.transcripts.bed"),
path("$outputDir/transcript_identification/grohmm/cd4.tdFinal.txt"),
path("$outputDir/transcript_identification/grohmm/cd4.tdplot_mqc.png").exists(),
path("$outputDir/transcript_identification/grohmm/cd4.transcripts.txt"),
path("$outputDir/transcript_identification/grohmm/jurkat.eval.txt"),
path("$outputDir/transcript_identification/grohmm/jurkat.final.transcripts.bed"),
path("$outputDir/transcript_identification/grohmm/jurkat.tdFinal.txt"),
path("$outputDir/transcript_identification/grohmm/jurkat.tdplot_mqc.png").exists(),
path("$outputDir/transcript_identification/grohmm/jurkat.transcripts.txt"),
// FIXME Not deterministic because of the order of files
// Add to the other tests when fixed
// path("$outputDir/quantification/").list(),
path("$outputDir/multiqc/multiqc_report.html").exists(),
).match("output_files")
}
)
}
}
}