From b6a93caae8c2748b9d4d53a9b00b3db3c9bf83aa Mon Sep 17 00:00:00 2001 From: Edmund Miller Date: Sat, 19 Oct 2024 17:47:18 -0500 Subject: [PATCH] fix(grohmm): Support gxf Scrap all of the CHM13 attempts I think it's the gtf conversion stuff that's causing the memory overload. But I can get them all to load locally. So I'm going to try to use gff3 files, if they're supplied, otherwise fall back to gtf. --- ...ions => custom_makeConsensusAnnotations.R} | 14 ++ bin/parameter_tuning.R | 71 +------ bin/transcriptcalling_grohmm.R | 6 +- modules/local/grohmm/parametertuning/main.nf | 4 +- .../local/grohmm/transcriptcalling/main.nf | 4 +- .../local/transcript_identification.nf | 4 +- workflows/nascent.nf | 4 +- .../grohmm/only_gff/main.nf.test | 57 ++++++ .../grohmm/only_gff/main.nf.test.snap | 190 ++++++++++++++++++ 9 files changed, 280 insertions(+), 74 deletions(-) rename bin/{custom_makeConsensusAnnotations => custom_makeConsensusAnnotations.R} (96%) create mode 100644 workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test create mode 100644 workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test.snap diff --git a/bin/custom_makeConsensusAnnotations b/bin/custom_makeConsensusAnnotations.R similarity index 96% rename from bin/custom_makeConsensusAnnotations rename to bin/custom_makeConsensusAnnotations.R index 74c5a43e..f51eda7d 100644 --- a/bin/custom_makeConsensusAnnotations +++ b/bin/custom_makeConsensusAnnotations.R @@ -165,6 +165,7 @@ suppressPackageStartupMessages(library(groHMM)) suppressPackageStartupMessages(library(testthat)) suppressPackageStartupMessages(library(GenomicRanges)) suppressPackageStartupMessages(library(rtracklayer)) +library(GenomicFeatures) # test_that("custom_makeConsensusAnnotations maintains accuracy", { # # Load the GTF file @@ -216,6 +217,17 @@ suppressPackageStartupMessages(library(rtracklayer)) # print(head(refactored_result, 5)) # }) +gff <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" +kg_db <- makeTxDbFromGFF(gff) +kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) +print("Collapse annotations in preparation for overlap") +kg_consensus <- makeConsensusAnnotations( + kg_tx, + mc.cores = 2 +) +print("Kg consensus:") +print(kg_consensus) + test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", { # Load the GTF file gff_url <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" @@ -239,6 +251,8 @@ test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", { original_result <- groHMM::makeConsensusAnnotations(sample_ranges) refactored_result <- custom_makeConsensusAnnotations(sample_ranges) + print("Original result:") + print(original_result) print("Refactored result:") print(refactored_result) diff --git a/bin/parameter_tuning.R b/bin/parameter_tuning.R index b630f003..de323bf5 100755 --- a/bin/parameter_tuning.R +++ b/bin/parameter_tuning.R @@ -56,11 +56,11 @@ parser$add_argument( ) parser$add_argument( "-g", - "--gtf", + "--gxf", type = "character", default = NULL, metavar = "string", - help = "GTF File to create TxDb", + help = "GFF/GTF File to create TxDb", required = TRUE ) parser$add_argument( @@ -75,7 +75,6 @@ parser$add_argument( "-m", "--memory", type = "integer", - default = 56000, metavar = "integer", help = "Amount of memory in MB" ) @@ -91,7 +90,6 @@ if (is.null(args$bam_files)) { stop("Please provide a bam file", call. = FALSE) } - # Load alignment files # TODO? CHANGE BASED ON PAIRED OR SINGLE END alignments <- c() @@ -104,65 +102,12 @@ for (bam in args$bam_files) { } print("Input transcript annotations") -# Import the GTF file using rtracklayer -gtf <- import(args$gtf) - -# Exclude any transcripts located on chromosomes labeled with "random" -gtf <- gtf[!grepl("random", seqnames(gtf)), ] - -# Extract transcript-level features -transcripts_gtf <- gtf[gtf$type == "transcript", ] -# Extract exon features -exons_gtf <- gtf[gtf$type == "exon", ] - -# Ensure that the 'transcript_id' and 'gene_id' columns are present -if (!all(c("transcript_id", "gene_id") %in% names(mcols(exons_gtf)))) { - stop("The GTF file lacks 'transcript_id' or 'gene_id' in its attributes.") -} - -# Group exons by transcript_id -exons_by_transcript <- split(exons_gtf, exons_gtf$transcript_id) - -# Diagnostic prints -print(paste("Number of transcripts:", length(exons_by_transcript))) - -# Reduce exons to create transcript ranges -transcripts_ranges <- GenomicRanges::reduce(exons_by_transcript) -transcripts_ranges <- unlist(transcripts_ranges, use.names = TRUE) - -# Diagnostic prints after reduction -print(paste("Number of transcripts_ranges after reduction:", length(transcripts_ranges))) - -# Create mapping dataframe -mapping_df <- data.frame( - transcript_id = names(transcripts_ranges), - gene_id = vapply(exons_by_transcript[names(transcripts_ranges)], function(x) unique(x$gene_id)[1], character(1)), - stringsAsFactors = FALSE -) - -# Check for length mismatch -if (nrow(mapping_df) != length(transcripts_ranges)) { - stop(paste("Length mismatch between mapping_df and transcripts_ranges:", nrow(mapping_df), length(transcripts_ranges))) -} - -# Assign metadata -mcols(transcripts_ranges)$transcript_id <- mapping_df$transcript_id -mcols(transcripts_ranges)$gene_id <- mapping_df$gene_id - -# Assign seqnames and strand from the exons -seqnames(transcripts_ranges) <- seqnames(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)]) -strand(transcripts_ranges) <- strand(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)]) - -# Ensure that seqlevels are set correctly -seqlevels(transcripts_ranges) <- seqlevels(gtf) - -# Remove any transcripts with NA values -transcripts_ranges <- transcripts_ranges[!is.na(start(transcripts_ranges)) & !is.na(end(transcripts_ranges))] - +kg_db <- makeTxDbFromGFF(args$gxf) +kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) print("Collapse annotations in preparation for overlap") -kg_consensus <- custom_makeConsensusAnnotations( - transcripts_ranges, - mc.cores = min(args$cores, 10) # 10 the number they had hardcoded in the grohmm package for some reason +kg_consensus <- makeConsensusAnnotations( + kg_tx, + mc.cores = args$cores ) print("Finished consensus annotations") @@ -223,5 +168,3 @@ if (file.exists(r_log_file) == FALSE) { print(a) sink() } - - diff --git a/bin/transcriptcalling_grohmm.R b/bin/transcriptcalling_grohmm.R index 883daa18..5c48c64e 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/transcriptcalling_grohmm.R @@ -64,11 +64,11 @@ parser$add_argument( ) parser$add_argument( "-g", - "--gtf", + "--gxf", type = "character", default = NULL, metavar = "string", - help = "GTF File to create TxDb", + help = "GFF/GTF File to create TxDb", required = TRUE ) parser$add_argument( @@ -143,7 +143,7 @@ write.table( ) print("Input transcript annotations") -kg_db <- makeTxDbFromGFF(args$gtf) +kg_db <- makeTxDbFromGFF(args$gxf) kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) # TODO I wonder if I could speed things up by filtering by chromosome at the Nextflow level # https://github.com/google/deepvariant/issues/744 diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index fef7cab3..ad5ac5c7 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -10,7 +10,7 @@ process GROHMM_PARAMETERTUNING { input: tuple val(meta), path(bams), path(bais) - path gtf + path gxf each UTS each LtProbB @@ -29,7 +29,7 @@ process GROHMM_PARAMETERTUNING { parameter_tuning.R \\ --bam_file ${bams} \\ --outprefix ${prefix} \\ - --gtf $gtf \\ + --gxf $gxf \\ --uts $UTS \\ --ltprobb $LtProbB \\ --outdir ./ \\ diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 7935daae..07cba0ca 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -10,7 +10,7 @@ process GROHMM_TRANSCRIPTCALLING { input: tuple val(meta), path(bams), path(bais), path(tuning_file) - path gtf + path gxf output: tuple val(meta), path("*.transcripts.txt"), emit: transcripts @@ -32,7 +32,7 @@ process GROHMM_TRANSCRIPTCALLING { --bam_file ${bams} \\ --tuning_file ${tuning_file} \\ --outprefix ${prefix} \\ - --gtf $gtf \\ + --gxf $gxf \\ --outdir ./ \\ --cores $task.cpus \\ --memory ${task.memory.toMega()} \\ diff --git a/subworkflows/local/transcript_identification.nf b/subworkflows/local/transcript_identification.nf index b1217a90..b2f4883d 100644 --- a/subworkflows/local/transcript_identification.nf +++ b/subworkflows/local/transcript_identification.nf @@ -14,7 +14,7 @@ include { BEDTOOLS_INTERSECT } from '../../modules/nf-core/bedtools/intersect/ma workflow TRANSCRIPT_INDENTIFICATION { take: group_bam_bai - gtf + gxf fasta chrom_sizes @@ -25,7 +25,7 @@ workflow TRANSCRIPT_INDENTIFICATION { grohmm_td_plot = Channel.empty() if(!params.skip_grohmm && params.assay_type == "GROseq") { - GROHMM ( group_bam_bai, gtf ) + GROHMM ( group_bam_bai, gxf ) ch_identification_bed = ch_identification_bed.mix(GROHMM.out.bed) grohmm_td_plot = GROHMM.out.td_plot ch_versions = ch_versions.mix(GROHMM.out.versions.first()) diff --git a/workflows/nascent.nf b/workflows/nascent.nf index d4675c1d..06e7f16d 100644 --- a/workflows/nascent.nf +++ b/workflows/nascent.nf @@ -286,9 +286,11 @@ workflow NASCENT { ch_group_bam_bai = ch_group_bam.join(ch_group_bai, by: [0]) + ch_gxf = ch_gff ? ch_gff : PREPARE_GENOME.out.gtf + TRANSCRIPT_INDENTIFICATION ( ch_group_bam_bai, - PREPARE_GENOME.out.gtf, + ch_gxf, PREPARE_GENOME.out.fasta, PREPARE_GENOME.out.chrom_sizes, ) diff --git a/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test new file mode 100644 index 00000000..4e8ab43d --- /dev/null +++ b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test @@ -0,0 +1,57 @@ +nextflow_pipeline { + + name "GROHMM" + script "../../../../../main.nf" + tag "input" + tag "grohmm" + tag "gff" + tag "chm13" + // triggers 'bin/parameter_tuning.R', 'bin/transcriptcalling_grohmm.R' + + test("Should run groHMM with only a GFF file") { + when { + params { + outdir = "$outputDir" + skip_grohmm = false + gff = 'https://huggingface.co/datasets/edmundmiller/nascent-test-data/resolve/main/chm13v2.0_RefSeq_Liftoff_v5.1.chr21.gff3.gz' + // TODO Use CHM13 Fasta + gtf = null + bed = null + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(UTILS.removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml")).match("software_versions") }, + { assert snapshot( + workflow.trace.tasks().size(), + path("$outputDir/transcript_identification/homer/cd4.bed"), + path("$outputDir/transcript_identification/homer/jurkat.bed"), + // FIXME Not determinstic because of the order of files + // Add to the other tests when fixed + // UTILS.getAllFilesFromDir("$outputDir/transcript_identification/pints/", ".bed"), + path("$outputDir/transcript_identification/intersect/").list(), + path("$outputDir/transcript_identification/filtered/").list(), + path("$outputDir/transcript_identification/intersect/").list(), + path("$outputDir/transcript_identification/filtered/").list(), + path("$outputDir/transcript_identification/grohmm/cd4.eval.txt"), + path("$outputDir/transcript_identification/grohmm/cd4.final.transcripts.bed"), + path("$outputDir/transcript_identification/grohmm/cd4.tdFinal.txt"), + path("$outputDir/transcript_identification/grohmm/cd4.tdplot_mqc.png").exists(), + path("$outputDir/transcript_identification/grohmm/cd4.transcripts.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.eval.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.final.transcripts.bed"), + path("$outputDir/transcript_identification/grohmm/jurkat.tdFinal.txt"), + path("$outputDir/transcript_identification/grohmm/jurkat.tdplot_mqc.png").exists(), + path("$outputDir/transcript_identification/grohmm/jurkat.transcripts.txt"), + // FIXME Not determinstic because of the order of files + // Add to the other tests when fixed + // path("$outputDir/quantification/").list(), + path("$outputDir/multiqc/multiqc_report.html").exists(), + ).match("output_files") + } + ) + } + } +} diff --git a/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test.snap b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test.snap new file mode 100644 index 00000000..95ce3a3c --- /dev/null +++ b/workflows/tests/transcript_indentification/grohmm/only_gff/main.nf.test.snap @@ -0,0 +1,190 @@ +{ + "output_files": { + "content": [ + 206, + [ + "cd4_REP1.trimmed.fastp.json:md5,bd9a3344c1591d6be4d524451f9dca53", + "cd4_REP2.trimmed.fastp.json:md5,bc6e3f9ff7835f220535cc393b8eb25f", + "jurkat.trimmed.fastp.json:md5,65f7247a87479c7157c7357d575336e2" + ], + "c529a16c839e85e119b98354f109352d", + "9e63b682af88fa902cf92b5c485845b1", + "c2adf5327ff6d4edda2fdad00c7cb9bf", + [ + "cd4_REP1.sorted.bam.flagstat:md5,863e2d506d5cc4239af98a5f31bbc906", + "cd4_REP1.sorted.bam.idxstats:md5,b1dd8bcbd23c53c21f0e11082d9315f2", + "cd4_REP1.sorted.bam.stats:md5,1536c80bae78b2062508e1de210f6387", + "cd4_REP2.sorted.bam.flagstat:md5,0fd86dbf8f799fad49ba471702979bdc", + "cd4_REP2.sorted.bam.idxstats:md5,53204e4c6a9f68664087e4a8123be46a", + "cd4_REP2.sorted.bam.stats:md5,561610e53fb676ac83252712dcac30d4", + "jurkat.sorted.bam.flagstat:md5,fd5f02b0f02a407447b804b1d80f5421", + "jurkat.sorted.bam.idxstats:md5,c61af0847c1ad76c06a8de2815975b32", + "jurkat.sorted.bam.stats:md5,898c6b259ef686bf797ef15646faf29d" + ], + [ + "cd4_REP1.coverage.hist.txt:md5,7cbb473be8d3b32ff2e52fdf4e5d10d2", + "cd4_REP1.coverage.stats.txt:md5,f1471b61ac17dba283d80e08450c7e55", + "cd4_REP2.coverage.hist.txt:md5,8d5258a0882494bc4e3f1aa6aa5ed685", + "cd4_REP2.coverage.stats.txt:md5,3cdb4473211f9da44166ffa6aaa5b602", + "jurkat.coverage.hist.txt:md5,71a893d9d1d55fd47399f6d47b628d6e", + "jurkat.coverage.stats.txt:md5,381c69a30099d82066a959deab1a2569" + ], + [ + "cd4_REP1.c_curve.txt:md5,cf4743abdd355595d6ec1fb3f38e66e5", + "cd4_REP1.lc_extrap.txt:md5,8633f84ccd5cc725db9af4b33edd63b0", + "cd4_REP2.c_curve.txt:md5,cf4743abdd355595d6ec1fb3f38e66e5", + "cd4_REP2.lc_extrap.txt:md5,3ad9e4028c3711e6d46831c10ed04200", + "jurkat.c_curve.txt:md5,cf4743abdd355595d6ec1fb3f38e66e5", + "jurkat.lc_extrap.txt:md5,84faa937faa88476c942f330c5762cb5" + ], + [ + "cd4_REP1.pos.DupRate.xls:md5,a80db2d20096ca839a7847ec5b11bf75", + "cd4_REP1.seq.DupRate.xls:md5,c34531fd7578c6f62cbad53b96a7feb9", + "cd4_REP2.pos.DupRate.xls:md5,06200ab67a60bee71fd168de88c15369", + "cd4_REP2.seq.DupRate.xls:md5,c82f6d687eacabbab045db34647c3254", + "jurkat.pos.DupRate.xls:md5,0721c91ab7c640b046689095047657f8", + "jurkat.seq.DupRate.xls:md5,aba941b1bf0e93f99e39bd507d1c02de" + ], + [ + "cd4_REP1.DupRate_plot.r:md5,a6f96b5b87a142dca2e09868deb8222b", + "cd4_REP2.DupRate_plot.r:md5,a0686d22ba07f33a627c1a106d442a03", + "jurkat.DupRate_plot.r:md5,acb2fdffc578643503503bf0081eb7ae" + ], + [ + "cd4_REP1.infer_experiment.txt:md5,7e73a21e1f8658241cd10aeab556c14d", + "cd4_REP2.infer_experiment.txt:md5,34089eb17ea1b32442718b484d19ae8c", + "jurkat.infer_experiment.txt:md5,c15341e640bfb39e4d588121db196989" + ], + [ + "cd4_REP1.read_distribution.txt:md5,34f891f50b686b0857ae8107786a197a", + "cd4_REP2.read_distribution.txt:md5,ded803207b4a9ebf4ba57ecda9f9227f", + "jurkat.read_distribution.txt:md5,6bdb2f60fcadc143bace6a8276145d25" + ], + [ + "cd4_REP1.dreg.bedGraph:md5,8948a8fa86d8f6d413b77983189ff56e", + "cd4_REP1.minus.bedGraph:md5,2a1c34f9d9ef9ff1b9da7874b9e3aaad", + "cd4_REP1.minus.bigWig:md5,5280319275c98dcce023779fa389884d", + "cd4_REP1.plus.bedGraph:md5,1509ec3a921e3109c5914e1bcef8cf33", + "cd4_REP1.plus.bigWig:md5,72ccab3173f2018a22a4b36841247ba2", + "cd4_REP2.dreg.bedGraph:md5,29f865a5668fae4a52a41589ac2b3179", + "cd4_REP2.minus.bedGraph:md5,c530bc34fa3ec7ac49e88ff65f9c2f92", + "cd4_REP2.minus.bigWig:md5,5e748c794e037f441741f7f409c8c5ad", + "cd4_REP2.plus.bedGraph:md5,a675141da2874ec08d91591e5ea8242b", + "cd4_REP2.plus.bigWig:md5,08674d52e9eeb08807c33ed3e4b3d504", + "jurkat.dreg.bedGraph:md5,c25a4fb095e9f7d6766a3ce33e08f7d8", + "jurkat.minus.bedGraph:md5,8d5d9a41df6eb6c56b1bfd3f39dc1fc6", + "jurkat.minus.bigWig:md5,d06c8015c996bf520fff17266fd01f84", + "jurkat.plus.bedGraph:md5,6ed63e5983edaa74fb3965676efdb674", + "jurkat.plus.bigWig:md5,7a02334f2c7300ffdb5a2253c0937390" + ], + "cd4.bed:md5,b55e5290d78941f36c3d1ecfef8e0062", + "jurkat.bed:md5,383cfaf10535dbe5d7f47607e345f4cb", + [ + "cd4_intersect.bed:md5,d41d8cd98f00b204e9800998ecf8427e", + "jurkat_intersect.bed:md5,d41d8cd98f00b204e9800998ecf8427e" + ], + [ + "cd4_filtered.bed:md5,7596b9c555cf05614153ee4b539f6b2e", + "jurkat_filtered.bed:md5,5cc005df15b1f03d21191f0f5d37f0e1" + ], + false + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-19T19:16:53.18655" + }, + "software_versions": { + "content": [ + { + "BBMAP_PILEUP": { + "bbmap": 39.01, + "samtools": "1.16.1", + "pigz": 2.6 + }, + "BEDTOOLS_GENOMECOV_MINUS": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_GENOMECOV_PLUS": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_INTERSECT": { + "bedtools": "2.31.1" + }, + "BEDTOOLS_INTERSECT_FILTER": { + "bedtools": "2.31.1" + }, + "BWA_INDEX": { + "bwa": "0.7.18-r1243-dirty" + }, + "BWA_MEM": { + "bwa": "0.7.18-r1243-dirty", + "samtools": 1.2 + }, + "CUSTOM_GETCHROMSIZES": { + "getchromsizes": 1.2 + }, + "DEEPTOOLS_BAMCOVERAGE_MINUS": { + "deeptools": "3.5.1" + }, + "DEEPTOOLS_BAMCOVERAGE_PLUS": { + "deeptools": "3.5.1" + }, + "FASTP": { + "fastp": "0.23.4" + }, + "FASTQC": { + "fastqc": "0.12.1" + }, + "GFFREAD": { + "gffread": "0.12.7" + }, + "GROHMM_PARAMETERTUNING": { + "r-base": "4.3.3", + "bioconductor-grohmm": "1.39.0" + }, + "GTF2BED": { + "perl": "5.26.2" + }, + "GUNZIP_GFF": { + "gunzip": 1.1 + }, + "HOMER_MAKETAGDIRECTORY": { + "homer": 4.11, + "samtools": 1.11 + }, + "PINTS_CALLER": { + "python": "3.10.6", + "pints": "1.1.8" + }, + "PRESEQ_CCURVE": { + "preseq": "3.1.1" + }, + "PRESEQ_LCEXTRAP": { + "preseq": "3.1.1" + }, + "RSEQC_INFEREXPERIMENT": { + "rseqc": "5.0.2" + }, + "RSEQC_READDISTRIBUTION": { + "rseqc": "5.0.2" + }, + "RSEQC_READDUPLICATION": { + "rseqc": "5.0.2" + }, + "SUBREAD_FEATURECOUNTS_GENE": { + "subread": "2.0.1" + }, + "Workflow": { + "nf-core/nascent": "v2.3.0dev" + } + } + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-19T19:16:52.664403" + } +} \ No newline at end of file