diff --git a/.editorconfig b/.editorconfig index 72dda289..e957201e 100644 --- a/.editorconfig +++ b/.editorconfig @@ -8,7 +8,7 @@ trim_trailing_whitespace = true indent_size = 4 indent_style = space -[*.{md,yml,yaml,html,css,scss,js}] +[*.{md,yml,yaml,html,css,scss,js,R,Rmd}] indent_size = 2 # These files are edited and tested upstream in nf-core/modules @@ -31,3 +31,7 @@ indent_size = unset # ignore python and markdown [*.{py,md}] indent_style = unset + +# Follow tidyverse style for R +[*.{R,Rmd}] +indent_size = 2 diff --git a/bin/custom_makeConsensusAnnotations.R b/bin/custom_makeConsensusAnnotations.R deleted file mode 100644 index f51eda7d..00000000 --- a/bin/custom_makeConsensusAnnotations.R +++ /dev/null @@ -1,285 +0,0 @@ -#' custom_makeConsensusAnnotations Makes a consensus annotation #' -#' Makes a non-overlapping consensus annotation. Gene annotations are often -#' overalpping due to #' multiple isoforms for a gene. -#' In consensus annotation, isoforms are first reduced so that only -#' redundant intervals are used to represent a genomic interval for a gene, -#' i.e., a gene id. -#' Remaining unresolved annotations are further reduced by truncating 3' -#' end of annotations. -#' -#' Supports parallel processing using mclapply in the 'parallel' package. -#' To change the number of processors, use the argument 'mc.cores'. -#' -#' @param ar GRanges of annotations to be collapsed. -#' @param minGap Minimun gap between overlapped annotations after truncated. -#' Default: 1L -#' @param minWidth Minimun width of consensus annotations. Default: 1000L -#' @param ... Extra argument passed to mclapply. -#' @return Returns GRanges object of annotations. -#' @author Minho Chae -#' @examples -#' ## Not run: -#' # library(TxDb.Hsapiens.UCSC.hg19.knownGene) -#' # txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene -#' # tx <- transcripts(txdb, columns=c("gene_id", "tx_id", "tx_name"), -#' filter=list(tx_chrom="chr7")) -#' # tx <- tx[grep("random", as.character(seqnames(tx)), invert=TRUE),] -#' # ca <- makeConsensusAnnotations(tx) -custom_makeConsensusAnnotations <- function(ar, minGap=1L, minWidth=1000L, ...) { - # Check and remove ranges with missing or multiple gene_ids - valid_ranges <- vapply(mcols(ar)$gene_id, function(x) length(x) == 1, logical(1)) - if (any(!valid_ranges)) { - warning(sum(!valid_ranges), " ranges have missing or multiple gene_id and they are dropped") - ar <- ar[valid_ranges] - } - - # Split into single-isoform and multi-isoform genes - gene_counts <- table(mcols(ar)$gene_id) - singles <- ar[mcols(ar)$gene_id %in% names(gene_counts[gene_counts == 1])] - multi_isoform_genes <- names(gene_counts[gene_counts > 1]) - - # Process multi-isoform genes in chunks - chunk_size <- 1000 # Adjust based on available memory - num_chunks <- ceiling(length(multi_isoform_genes) / chunk_size) - - isoforms <- GRangesList() - for (i in 1:num_chunks) { - chunk_start <- (i - 1) * chunk_size + 1 - chunk_end <- min(i * chunk_size, length(multi_isoform_genes)) - chunk_genes <- multi_isoform_genes[chunk_start:chunk_end] - - chunk_isoforms <- ar[mcols(ar)$gene_id %in% chunk_genes] - chunk_list <- split(chunk_isoforms, mcols(chunk_isoforms)$gene_id) - - message("Reduce isoforms (chunk ", i, "/", num_chunks, ") ... ", appendLF=FALSE) - chunk_result <- GRangesList(mclapply(chunk_list, function(x) { - # For mixed strands or chrom, choose the longest - if ((length(seqlevelsInUse(x)) > 1) || - (length(unique(strand(x))) > 1)) { - result <- x[which.max(width(x)), "gene_id"] - } else { - dx <- disjoin(x) - mcols(dx)$gene_id <- mcols(x)$gene_id[1] - olcnt <- countOverlaps(dx, x) - - multi <- dx[olcnt > 1] # Use the disjoint ranges - # covered more than once - if (length(multi) == 0) { # For non-overlapping isoforms, - # choose the longest - result <- x[which.max(width(x)), "gene_id"] - } else if (length(multi) == 1) { - result <- multi - } else { - reduced <- reduce(multi) - if (length(reduced) == 1) - result <- reduced - else (length(reduced) > 1) - result <- reduced[which.max(width(reduced)),] - - } - mcols(result)$gene_id <- mcols(x)$gene_id[1] - } - return(result) - }, ...)) - isoforms <- c(isoforms, chunk_result) - message("OK") - } - isoforms <- unlist(isoforms) - - # Check redundancy - isoforms <- removeRedundant(isoforms) - singles <- removeRedundant(singles) - - o <- findOverlaps(singles, isoforms, type = "equal") - if (length(o) != 0) { - singles <- singles[-queryHits(o), ] - } - - o <- findOverlaps(singles, isoforms, type = "within") - if (length(o) != 0) { - singles <- singles[-queryHits(o), ] - } - - o <- findOverlaps(isoforms, singles, type = "within") - if (length(o) != 0) { - isoforms <- isoforms[-queryHits(o), ] - } - - noiso <- sort(c(isoforms, singles[, "gene_id"])) - message("Truncate overlapped ranges ... ", appendLF=FALSE) - # with different gene_ids - while(!isDisjoint(noiso)) { - ol <- findOverlaps(noiso, drop.self=TRUE, drop.redundant=TRUE) - ol_gr <- GRangesList(lapply(1:length(ol), function(x) { - sort(c( - noiso[queryHits(ol)[x]], - noiso[subjectHits(ol)[x]] - )) - })) - - # Truncate 3' end - ol_gr <- unlist(endoapply(ol_gr, function(x) { - if (as.character(strand(x[1, ])) == "+") { - end(x[1, ]) <- start(x[2, ]) - minGap - # first range's end is truncated - } else { - start(x[2, ]) <- end(x[1, ]) + minGap - # sencond range's end is truncated - } - x - })) - - # Remove any ranges with duplicated names since they already adujsted - # in the previous call - ol_gr <- ol_gr[!duplicated(names(ol_gr)), ] - - noiso <- noiso[-unique(c(queryHits(ol), subjectHits(ol))), ] - # update noiso - noiso <- c(noiso, ol_gr) - } - message("OK") - - noiso <- noiso[width(noiso) >= minWidth, ] - return(sort(noiso)) -} - -removeRedundant <- function(annox) { - o <- findOverlaps(annox, drop.self=TRUE, type="equal", - drop.redundant=TRUE) - if(length(o) != 0) - annox <- annox[-subjectHits(o),] - - o <- findOverlaps(annox, drop.self=TRUE, type="within", - drop.redundant=TRUE) - if(length(o) != 0) - annox <- annox[-queryHits(o),] - - return(annox) -} - - - - -## TESTING -suppressPackageStartupMessages(library(groHMM)) -suppressPackageStartupMessages(library(testthat)) -suppressPackageStartupMessages(library(GenomicRanges)) -suppressPackageStartupMessages(library(rtracklayer)) -library(GenomicFeatures) - -# test_that("custom_makeConsensusAnnotations maintains accuracy", { -# # Load the GTF file -# gtf_url <- "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf" -# gtf <- import(gtf_url) - -# # Extract transcripts from the GTF -# transcripts <- gtf[gtf$type == "transcript"] - -# # Create a sample GRanges object -# set.seed(42) -# n <- 100 -# sample_ranges <- transcripts -# print("Sample ranges:") -# print(sample_ranges) - -# # Run both original and refactored functions -# original_result <- groHMM::makeConsensusAnnotations(sample_ranges) -# refactored_result <- custom_makeConsensusAnnotations(sample_ranges) - -# print("Refactored result:") -# print(refactored_result) - -# # Compare results -# expect_equal(length(original_result), length(refactored_result)) -# expect_equal(sum(width(original_result)), sum(width(refactored_result))) - -# if (length(original_result) > 0 && length(refactored_result) > 0) { -# expect_equal( -# sort(unique(mcols(original_result)$gene_id)), -# sort(unique(mcols(refactored_result)$gene_id)) -# ) -# } else { -# print("Warning: One or both results are empty.") -# } - -# # Additional detailed comparisons -# print(paste("Original result length:", length(original_result))) -# print(paste("Refactored result length:", length(refactored_result))) -# print(paste("Original result total width:", sum(width(original_result)))) -# print(paste("Refactored result total width:", sum(width(refactored_result)))) -# print(paste("Original result unique gene_ids:", length(unique(mcols(original_result)$gene_id)))) -# print(paste("Refactored result unique gene_ids:", length(unique(mcols(refactored_result)$gene_id)))) - -# # Compare the first few entries -# print("First 5 entries of original result:") -# print(head(original_result, 5)) -# print("First 5 entries of refactored result:") -# print(head(refactored_result, 5)) -# }) - -gff <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" -kg_db <- makeTxDbFromGFF(gff) -kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) -print("Collapse annotations in preparation for overlap") -kg_consensus <- makeConsensusAnnotations( - kg_tx, - mc.cores = 2 -) -print("Kg consensus:") -print(kg_consensus) - -test_that("chm13 custom_makeConsensusAnnotations maintains accuracy", { - # Load the GTF file - gff_url <- "./chm13v2.0_RefSeq_Liftoff_v5.1.gff3.gz" - gff <- import(gff_url) - - print("GFF:") - print(gff) - # Extract transcripts from the GTF - transcripts <- gff[gff$type == "transcript"] - print("Transcripts:") - print(transcripts) - - # Create a sample GRanges object - set.seed(42) - n <- 100 - sample_ranges <- transcripts - print("Sample ranges:") - print(sample_ranges) - - # Run both original and refactored functions - original_result <- groHMM::makeConsensusAnnotations(sample_ranges) - refactored_result <- custom_makeConsensusAnnotations(sample_ranges) - - print("Original result:") - print(original_result) - print("Refactored result:") - print(refactored_result) - - # Compare results - expect_equal(length(original_result), length(refactored_result)) - expect_equal(sum(width(original_result)), sum(width(refactored_result))) - - if (length(original_result) > 0 && length(refactored_result) > 0) { - expect_equal( - sort(unique(mcols(original_result)$gene_id)), - sort(unique(mcols(refactored_result)$gene_id)) - ) - } else { - print("Warning: One or both results are empty.") - } - - # Additional detailed comparisons - print(paste("Original result length:", length(original_result))) - print(paste("Refactored result length:", length(refactored_result))) - print(paste("Original result total width:", sum(width(original_result)))) - print(paste("Refactored result total width:", sum(width(refactored_result)))) - print(paste("Original result unique gene_ids:", length(unique(mcols(original_result)$gene_id)))) - print(paste("Refactored result unique gene_ids:", length(unique(mcols(refactored_result)$gene_id)))) - - # Compare the first few entries - print("First 5 entries of original result:") - print(head(original_result, 5)) - print("First 5 entries of refactored result:") - print(head(refactored_result, 5)) -}) diff --git a/bin/parameter_tuning.R b/bin/grohmm_parametertuning.R similarity index 96% rename from bin/parameter_tuning.R rename to bin/grohmm_parametertuning.R index de323bf5..482b8741 100755 --- a/bin/parameter_tuning.R +++ b/bin/grohmm_parametertuning.R @@ -119,8 +119,8 @@ tune <- data.frame( LtProbB = args$ltprobb, UTS = args$uts ) -Fp <- windowAnalysis(alignments, strand = "+", windowSize = 50) -Fm <- windowAnalysis(alignments, strand = "-", windowSize = 50) +fp <- windowAnalysis(alignments, strand = "+", windowSize = 50) +fm <- windowAnalysis(alignments, strand = "-", windowSize = 50) hmm <- detectTranscripts( Fp = Fp, Fm = Fm, diff --git a/bin/transcriptcalling_grohmm.R b/bin/grohmm_transcriptcalling.R similarity index 88% rename from bin/transcriptcalling_grohmm.R rename to bin/grohmm_transcriptcalling.R index 3f362275..cdc4ca6b 100755 --- a/bin/transcriptcalling_grohmm.R +++ b/bin/grohmm_transcriptcalling.R @@ -145,7 +145,8 @@ write.table( print("Input transcript annotations") kg_db <- makeTxDbFromGFF(args$gxf) kg_tx <- transcripts(kg_db, columns = c("gene_id", "tx_id", "tx_name")) -# TODO I wonder if I could speed things up by filtering by chromosome at the Nextflow level +# TODO I wonder if I could speed things up by filtering +# by chromosome at the Nextflow level... # https://github.com/google/deepvariant/issues/744 # filter=list(tx_chrom="chr7")) # exclude any transcripts that are located on chromosomes labeled with "random". @@ -170,8 +171,10 @@ get_expressed_annotations <- function(features, reads) { f_limit <- limitToXkb(features) count <- countOverlaps(f_limit, reads) features <- features[count != 0, ] - return(features[(quantile(width(features), .05) < width(features)) & - (width(features) < quantile(width(features), .95)), ]) + return(features[ + (quantile(width(features), .05) < width(features)) & + (width(features) < quantile(width(features), .95)), + ]) } con_expressed <- get_expressed_annotations( features = kg_consensus, @@ -181,15 +184,20 @@ b_plus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "+") b_minus <- breakTranscriptsOnGenes(tx_hmm, kg_consensus, strand = "-") tx_broken <- c(b_plus, b_minus) # Assign unique IDs if they're missing -if (is.null(mcols(tx_broken)$transcript_id) || any(is.na(mcols(tx_broken)$transcript_id))) { - mcols(tx_broken)$transcript_id <- paste0("TX", seq_along(tx_broken)) +if ( + is.null(mcols(tx_broken)$transcript_id) || + any(is.na(mcols(tx_broken)$transcript_id)) +) { + mcols(tx_broken)$transcript_id <- paste0("TX", seq_along(tx_broken)) } # Filter out any transcripts with NA values in start or end positions -tx_broken_filtered <- tx_broken[!is.na(start(tx_broken)) & !is.na(end(tx_broken))] +tx_broken_filtered <- + tx_broken[!is.na(start(tx_broken)) & !is.na(end(tx_broken))] # Ensure that kg_consensus also doesn't contain NA values -kg_consensus_filtered <- kg_consensus[!is.na(start(kg_consensus)) & !is.na(end(kg_consensus))] +kg_consensus_filtered <- + kg_consensus[!is.na(start(kg_consensus)) & !is.na(end(kg_consensus))] # Now call combineTranscripts with the filtered data tx_final <- combineTranscripts(tx_broken_filtered, kg_consensus_filtered) @@ -212,7 +220,10 @@ capture.output(td_final, file = paste0(args$outprefix, ".tdFinal.txt")) # Write the data used in the plot to a CSV file data_to_write <- data.frame(x = td_final$x, profile = td_final$profile) -write.csv(data_to_write, file = paste0(args$outprefix, ".tdFinal_mqc.csv"), row.names = FALSE) +write.csv(data_to_write, + file = paste0(args$outprefix, ".tdFinal_mqc.csv"), + row.names = FALSE +) ######################## ## CITE PACKAGES USED ## diff --git a/modules/local/grohmm/parametertuning/main.nf b/modules/local/grohmm/parametertuning/main.nf index ad5ac5c7..0d2a5069 100644 --- a/modules/local/grohmm/parametertuning/main.nf +++ b/modules/local/grohmm/parametertuning/main.nf @@ -26,7 +26,7 @@ process GROHMM_PARAMETERTUNING { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}_${UTS}_${LtProbB}" """ - parameter_tuning.R \\ + grohmm_parametertuning.R \\ --bam_file ${bams} \\ --outprefix ${prefix} \\ --gxf $gxf \\ diff --git a/modules/local/grohmm/transcriptcalling/main.nf b/modules/local/grohmm/transcriptcalling/main.nf index 07cba0ca..984d1800 100644 --- a/modules/local/grohmm/transcriptcalling/main.nf +++ b/modules/local/grohmm/transcriptcalling/main.nf @@ -28,7 +28,7 @@ process GROHMM_TRANSCRIPTCALLING { def args = task.ext.args ?: '' def prefix = task.ext.prefix ?: "${meta.id}" """ - transcriptcalling_grohmm.R \\ + grohmm_transcriptcalling.R \\ --bam_file ${bams} \\ --tuning_file ${tuning_file} \\ --outprefix ${prefix} \\ diff --git a/subworkflows/local/grohmm/tests/main.nf.test b/subworkflows/local/grohmm/tests/main.nf.test index 29382e3f..d62fc0cd 100644 --- a/subworkflows/local/grohmm/tests/main.nf.test +++ b/subworkflows/local/grohmm/tests/main.nf.test @@ -17,15 +17,15 @@ nextflow_workflow { workflow { """ input[0] = Channel.of([ - [ id: 'Sall' ], + [ id: 'Sall' ], [ file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S0mR1.bam", checkIfExists: true), file("https://raw.githubusercontent.com/Kraus-Lab/groHMM/master/inst/extdata/S40mR1.bam", checkIfExists: true) ], [], - ]) + ]) input[1] = Channel.of([file( "https://gist.github.com/edmundmiller/c142801995689ed8d15ebcf40b2fb042/raw/eca3b955312209b5845cca084bb506d5250b3d33/hg19.chr7.refGene.gtf", checkIfExists: true - )]).first() + )]).first() """ } }