fix(grohmm): Support gxf
Scrap all of the CHM13 attempts

I think it's the GTF conversion that's causing the memory overload, but I can get them all to load locally. So I'm going to try to use GFF3 files if they're supplied, and otherwise fall back to GTF.
edmundmiller committed Oct 20, 2024
1 parent c5d7b3b commit b6a93ca
Showing 9 changed files with 280 additions and 74 deletions.
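Before the per-file diff, the gist of the change: both R scripts now hand the annotation file straight to GenomicFeatures::makeTxDbFromGFF(), whose default format = "auto" infers GFF3 vs. GTF from the file extension, which is what lets a single --gxf argument accept either format. A minimal sketch of the new annotation path (file path and mc.cores value borrowed from the test changes below):

suppressPackageStartupMessages(library(GenomicFeatures))
suppressPackageStartupMessages(library(groHMM))

gxf <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" # a .gtf works here too
kg_db <- makeTxDbFromGFF(gxf) # format = "auto" by default
kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name"))
kg_consensus <- makeConsensusAnnotations(kg_tx, mc.cores = 2)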
@@ -165,6 +165,7 @@ suppressPackageStartupMessages(library(groHMM))
suppressPackageStartupMessages(library(testthat))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(rtracklayer))
+library(GenomicFeatures)

# test_that("custom_makeConsensusAnnotations maintains accuracy", {
# # Load the GTF file
@@ -216,6 +217,17 @@ suppressPackageStartupMessages(library(rtracklayer))
# print(head(refactored_result, 5))
# })

+gff <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz"
+kg_db <- makeTxDbFromGFF(gff)
+kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name"))
+print("Collapse annotations in preparation for overlap")
+kg_consensus <- makeConsensusAnnotations(
+    kg_tx,
+    mc.cores = 2
+)
+print("Kg consensus:")
+print(kg_consensus)
+
test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", {
# Load the GFF3 file
gff_url <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz"
@@ -239,6 +251,8 @@ test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", {
original_result <- groHMM::makeConsensusAnnotations(sample_ranges)
refactored_result <- custom_makeConsensusAnnotations(sample_ranges)

+print("Original result:")
+print(original_result)
print("Refactored result:")
print(refactored_result)

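The hunks above elide the sampling step, but the shape of the test is: draw a subset of transcripts, run both the original groHMM implementation and the refactored one, and require identical output. A sketch of that pattern (the seed, sample size, and sort-based comparison are hypothetical stand-ins for the elided lines; custom_makeConsensusAnnotations comes from the test's helpers):

set.seed(42) # hypothetical seed
sample_ranges <- kg_tx[sample(length(kg_tx), 100)] # hypothetical sample size
original_result <- groHMM::makeConsensusAnnotations(sample_ranges)
refactored_result <- custom_makeConsensusAnnotations(sample_ranges)
testthat::expect_equal(sort(original_result), sort(refactored_result))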
71 changes: 7 additions & 64 deletions bin/parameter_tuning.R
@@ -56,11 +56,11 @@ parser$add_argument(
)
parser$add_argument(
"-g",
-"--gtf",
+"--gxf",
type = "character",
default = NULL,
metavar = "string",
-help = "GTF File to create TxDb",
+help = "GFF/GTF File to create TxDb",
required = TRUE
)
parser$add_argument(
@@ -75,7 +75,6 @@ parser$add_argument(
"-m",
"--memory",
type = "integer",
-default = 56000,
metavar = "integer",
help = "Amount of memory in MB"
)
@@ -91,7 +90,6 @@ if (is.null(args$bam_files)) {
stop("Please provide a bam file", call. = FALSE)
}
-

# Load alignment files
# TODO? CHANGE BASED ON PAIRED OR SINGLE END
alignments <- c()
@@ -104,65 +102,12 @@ for (bam in args$bam_files) {
}

print("Input transcript annotations")
-# Import the GTF file using rtracklayer
-gtf <- import(args$gtf)
-
-# Exclude any transcripts located on chromosomes labeled with "random"
-gtf <- gtf[!grepl("random", seqnames(gtf)), ]
-
-# Extract transcript-level features
-transcripts_gtf <- gtf[gtf$type == "transcript", ]
-# Extract exon features
-exons_gtf <- gtf[gtf$type == "exon", ]
-
-# Ensure that the 'transcript_id' and 'gene_id' columns are present
-if (!all(c("transcript_id", "gene_id") %in% names(mcols(exons_gtf)))) {
-    stop("The GTF file lacks 'transcript_id' or 'gene_id' in its attributes.")
-}
-
-# Group exons by transcript_id
-exons_by_transcript <- split(exons_gtf, exons_gtf$transcript_id)
-
-# Diagnostic prints
-print(paste("Number of transcripts:", length(exons_by_transcript)))
-
-# Reduce exons to create transcript ranges
-transcripts_ranges <- GenomicRanges::reduce(exons_by_transcript)
-transcripts_ranges <- unlist(transcripts_ranges, use.names = TRUE)
-
-# Diagnostic prints after reduction
-print(paste("Number of transcripts_ranges after reduction:", length(transcripts_ranges)))
-
-# Create mapping dataframe
-mapping_df <- data.frame(
-    transcript_id = names(transcripts_ranges),
-    gene_id = vapply(exons_by_transcript[names(transcripts_ranges)], function(x) unique(x$gene_id)[1], character(1)),
-    stringsAsFactors = FALSE
-)
-
-# Check for length mismatch
-if (nrow(mapping_df) != length(transcripts_ranges)) {
-    stop(paste("Length mismatch between mapping_df and transcripts_ranges:", nrow(mapping_df), length(transcripts_ranges)))
-}
-
-# Assign metadata
-mcols(transcripts_ranges)$transcript_id <- mapping_df$transcript_id
-mcols(transcripts_ranges)$gene_id <- mapping_df$gene_id
-
-# Assign seqnames and strand from the exons
-seqnames(transcripts_ranges) <- seqnames(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)])
-strand(transcripts_ranges) <- strand(exons_gtf[match(names(transcripts_ranges), exons_gtf$transcript_id)])
-
-# Ensure that seqlevels are set correctly
-seqlevels(transcripts_ranges) <- seqlevels(gtf)
-
-# Remove any transcripts with NA values
-transcripts_ranges <- transcripts_ranges[!is.na(start(transcripts_ranges)) & !is.na(end(transcripts_ranges))]
-
+kg_db <- makeTxDbFromGFF(args$gxf)
+kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name"))
print("Collapse annotations in preparation for overlap")
-kg_consensus <- custom_makeConsensusAnnotations(
-    transcripts_ranges,
-    mc.cores = min(args$cores, 10) # 10 the number they had hardcoded in the grohmm package for some reason
+kg_consensus <- makeConsensusAnnotations(
+    kg_tx,
+    mc.cores = args$cores
)
print("Finished consensus annotations")

@@ -223,5 +168,3 @@ if (file.exists(r_log_file) == FALSE) {
print(a)
sink()
}
-
-
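For context on the ~55 lines deleted above: they rebuilt per-transcript ranges by importing the GTF with rtracklayer, splitting exons by transcript_id, reducing them, and reattaching gene_id, strand, and seqlevels by hand, and that bookkeeping is exactly what a TxDb does internally. A self-contained toy illustration (the GFF3 content and IDs are made up for the example):

suppressPackageStartupMessages(library(GenomicFeatures))

toy_gff <- tempfile(fileext = ".gff3")
writeLines(c(
    "##gff-version 3",
    "chr1\ttoy\tgene\t1\t3000\t.\t+\t.\tID=gene1",
    "chr1\ttoy\tmRNA\t1\t1500\t.\t+\t.\tID=tx1;Parent=gene1",
    "chr1\ttoy\texon\t1\t600\t.\t+\t.\tParent=tx1",
    "chr1\ttoy\texon\t1000\t1500\t.\t+\t.\tParent=tx1",
    "chr1\ttoy\tmRNA\t1200\t3000\t.\t+\t.\tID=tx2;Parent=gene1",
    "chr1\ttoy\texon\t1200\t3000\t.\t+\t.\tParent=tx2"
), toy_gff)

txdb <- makeTxDbFromGFF(toy_gff)
transcripts(txdb, columns = c("gene_id", "tx_id", "tx_name"))
# A GRanges of 2 transcripts; gene_id, tx_id, and tx_name come back as
# metadata columns, with no manual split()/reduce()/reattachment needed.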
6 changes: 3 additions & 3 deletions bin/transcriptcalling_grohmm.R
@@ -64,11 +64,11 @@ parser$add_argument(
)
parser$add_argument(
"-g",
-"--gtf",
+"--gxf",
type = "character",
default = NULL,
metavar = "string",
-help = "GTF File to create TxDb",
+help = "GFF/GTF File to create TxDb",
required = TRUE
)
parser$add_argument(
@@ -143,7 +143,7 @@ write.table(
)

print("Input transcript annotations")
-kg_db <- makeTxDbFromGFF(args$gtf)
+kg_db <- makeTxDbFromGFF(args$gxf)
kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name"))
# TODO I wonder if I could speed things up by filtering by chromosome at the Nextflow level
# https://github.com/google/deepvariant/issues/744
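On the TODO above about filtering by chromosome: a hedged sketch of the R-side version, assuming the kg_tx object from this script and chr21 to match the chr21-only test annotation added below:

# keepSeqlevels() is from GenomeInfoDb (loaded with GenomicFeatures);
# pruning.mode = "coarse" drops ranges on the discarded chromosomes.
kg_tx <- keepSeqlevels(kg_tx, "chr21", pruning.mode = "coarse")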
4 changes: 2 additions & 2 deletions modules/local/grohmm/parametertuning/main.nf
@@ -10,7 +10,7 @@ process GROHMM_PARAMETERTUNING {

input:
tuple val(meta), path(bams), path(bais)
-path gtf
+path gxf
each UTS
each LtProbB

@@ -29,7 +29,7 @@
parameter_tuning.R \\
--bam_file ${bams} \\
--outprefix ${prefix} \\
-    --gtf $gtf \\
+    --gxf $gxf \\
--uts $UTS \\
--ltprobb $LtProbB \\
--outdir ./ \\
4 changes: 2 additions & 2 deletions modules/local/grohmm/transcriptcalling/main.nf
@@ -10,7 +10,7 @@ process GROHMM_TRANSCRIPTCALLING {

input:
tuple val(meta), path(bams), path(bais), path(tuning_file)
-path gtf
+path gxf

output:
tuple val(meta), path("*.transcripts.txt"), emit: transcripts
@@ -32,7 +32,7 @@
--bam_file ${bams} \\
--tuning_file ${tuning_file} \\
--outprefix ${prefix} \\
-    --gtf $gtf \\
+    --gxf $gxf \\
--outdir ./ \\
--cores $task.cpus \\
--memory ${task.memory.toMega()} \\
4 changes: 2 additions & 2 deletions subworkflows/local/transcript_identification.nf
@@ -14,7 +14,7 @@ include { BEDTOOLS_INTERSECT } from '../../modules/nf-core/bedtools/intersect/ma
workflow TRANSCRIPT_INDENTIFICATION {
take:
group_bam_bai
-gtf
+gxf
fasta
chrom_sizes

@@ -25,7 +25,7 @@ workflow TRANSCRIPT_INDENTIFICATION {

grohmm_td_plot = Channel.empty()
if(!params.skip_grohmm && params.assay_type == "GROseq") {
-GROHMM ( group_bam_bai, gtf )
+GROHMM ( group_bam_bai, gxf )
ch_identification_bed = ch_identification_bed.mix(GROHMM.out.bed)
grohmm_td_plot = GROHMM.out.td_plot
ch_versions = ch_versions.mix(GROHMM.out.versions.first())
4 changes: 3 additions & 1 deletion workflows/nascent.nf
@@ -286,9 +286,11 @@ workflow NASCENT {

ch_group_bam_bai = ch_group_bam.join(ch_group_bai, by: [0])

+ch_gxf = ch_gff ? ch_gff : PREPARE_GENOME.out.gtf
+
TRANSCRIPT_INDENTIFICATION (
ch_group_bam_bai,
-PREPARE_GENOME.out.gtf,
+ch_gxf,
PREPARE_GENOME.out.fasta,
PREPARE_GENOME.out.chrom_sizes,
)
@@ -0,0 +1,57 @@
nextflow_pipeline {

name "GROHMM"
script "../../../../../main.nf"
tag "input"
tag "grohmm"
tag "gff"
tag "chm13"
// triggers 'bin/parameter_tuning.R', 'bin/transcriptcalling_grohmm.R'

test("Should run groHMM with only a GFF file") {
when {
params {
outdir = "$outputDir"
skip_grohmm = false
gff = 'https://huggingface.co/datasets/edmundmiller/nascent-test-data/resolve/main/chm13v2.0_RefSeq_Liftoff_v5.1.chr21.gff3.gz'
// TODO Use CHM13 Fasta
gtf = null
bed = null
}
}

then {
assertAll(
{ assert workflow.success },
{ assert snapshot(UTILS.removeNextflowVersion("$outputDir/pipeline_info/nf_core_pipeline_software_mqc_versions.yml")).match("software_versions") },
{ assert snapshot(
workflow.trace.tasks().size(),
path("$outputDir/transcript_identification/homer/cd4.bed"),
path("$outputDir/transcript_identification/homer/jurkat.bed"),
// FIXME Not deterministic because of the order of files
// Add to the other tests when fixed
// UTILS.getAllFilesFromDir("$outputDir/transcript_identification/pints/", ".bed"),
path("$outputDir/transcript_identification/intersect/").list(),
path("$outputDir/transcript_identification/filtered/").list(),
path("$outputDir/transcript_identification/grohmm/cd4.eval.txt"),
path("$outputDir/transcript_identification/grohmm/cd4.final.transcripts.bed"),
path("$outputDir/transcript_identification/grohmm/cd4.tdFinal.txt"),
path("$outputDir/transcript_identification/grohmm/cd4.tdplot_mqc.png").exists(),
path("$outputDir/transcript_identification/grohmm/cd4.transcripts.txt"),
path("$outputDir/transcript_identification/grohmm/jurkat.eval.txt"),
path("$outputDir/transcript_identification/grohmm/jurkat.final.transcripts.bed"),
path("$outputDir/transcript_identification/grohmm/jurkat.tdFinal.txt"),
path("$outputDir/transcript_identification/grohmm/jurkat.tdplot_mqc.png").exists(),
path("$outputDir/transcript_identification/grohmm/jurkat.transcripts.txt"),
// FIXME Not deterministic because of the order of files
// Add to the other tests when fixed
// path("$outputDir/quantification/").list(),
path("$outputDir/multiqc/multiqc_report.html").exists(),
).match("output_files")
}
)
}
}
}