diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml index 1f8f6db..3fffa1f 100644 --- a/.github/workflows/R-CMD-check.yaml +++ b/.github/workflows/R-CMD-check.yaml @@ -4,19 +4,12 @@ name: R-CMD-check jobs: R-CMD-check: - runs-on: macOS-latest + runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: r-lib/actions/setup-r@master - - uses: r-lib/actions/setup-pandoc@master - - name: Install dependencies - run: | - install.packages(c("remotes", "rcmdcheck", "knitr")) - deps <- remotes::dev_package_deps(dependencies = TRUE) - install.packages(deps$package[!is.na(deps$available)]) - if (!requireNamespace("BiocManager", quietly = TRUE)) {install.packages("BiocManager")} - BiocManager::install(deps$package[is.na(deps$available)]) - shell: Rscript {0} - - name: Check - run: rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error") - shell: Rscript {0} + - uses: actions/checkout@v3 + - uses: r-lib/actions/setup-r@v2 + - uses: r-lib/actions/setup-r-dependencies@v2 + with: + extra-packages: any::rcmdcheck + needs: check + - uses: r-lib/actions/check-r-package@v2 diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..3e6c483 --- /dev/null +++ b/.lintr @@ -0,0 +1,2 @@ +linters: linters_with_defaults( + indentation_linter(hanging_indent_style="tidy")) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..0c8bc3c --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,52 @@ +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.5.0 + hooks: + - id: check-merge-conflict + - id: debug-statements + - id: mixed-line-ending + - id: detect-private-key + - id: check-case-conflict + - id: check-yaml + - id: trailing-whitespace + - repo: https://github.com/DavidAnson/markdownlint-cli2 + rev: v0.11.0 + hooks: + - id: markdownlint-cli2 + files: \.(md|qmd)$ + types: [file] + exclude: LICENSE.md + - id: markdownlint-cli2-fix + files: \.(md|qmd)$ + types: [file] + exclude: LICENSE.md + - repo: https://github.com/lorenzwalthert/precommit + rev: v0.3.2.9027 + hooks: + - id: style-files + name: style-files + description: style files with {styler} + entry: Rscript inst/hooks/exported/style-files.R + language: r + files: '(\.[rR]profile|\.[rR]|\.[rR]md|\.[rR]nw|\.[qQ]md)$' + exclude: | + (?x)^( + renv/activate\.R| + )$ + minimum_pre_commit_version: "2.13.0" + - id: parsable-R + name: parsable-R + description: check if a .R file is parsable + entry: Rscript inst/hooks/exported/parsable-R.R + language: r + files: '\.[rR](md)?$' + minimum_pre_commit_version: "2.13.0" + - id: lintr + args: [--warn_only] + name: lintr + description: check if a `.R` file is lint free (using {lintr}) + entry: Rscript inst/hooks/exported/lintr.R + language: r + files: '(\.[rR]profile|\.R|\.Rmd|\.Rnw|\.r|\.rmd|\.rnw)$' + exclude: 'renv/activate\.R' + minimum_pre_commit_version: "2.13.0" diff --git a/DESCRIPTION b/DESCRIPTION index 4450307..fb778ef 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: genecovr Title: Gene body coverage analysis to evaluate genome assemblies -Version: 0.0.0.9013 -Authors@R: +Version: 0.1.0 +Authors@R: person(given = "Per", family = "Unneberg", role = c("aut", "cre"), diff --git a/README.Rmd b/README.Rmd index d6dd96d..bed3633 100644 --- a/README.Rmd +++ b/README.Rmd @@ -19,8 +19,15 @@ knitr::opts_chunk$set( [![R build status](https://github.com/NBISweden/genecovr/workflows/R-CMD-check/badge.svg)](https://github.com/NBISweden/genecovr/actions) -Perform gene body coverage analyses in R to evaluate genome assembly -quality. +`genecovr` is an `R` package that provides plotting functions that +summarize gene transcript to genome alignments. The main purpose is to +assess the effect of polishing and scaffolding operations has on the +quality of a genome assembly. The gene transcript set is a large +sequence set consisting of assembled transcripts from RNA-seq data +generated in relation to a genome assembly project. Therefore, +`genecovr` serves as a complement to software such as +[BUSCO](https://busco.ezlab.org/), which evaluates genome assembly +quality using a smaller set of well-defined single-copy orthologs. ## Installation @@ -28,12 +35,14 @@ You can install the released version of genecovr from [NBIS GitHub](https://github.com/nbis) with: ``` r +# If necessary, uncomment to install devtools # install.packages("devtools") devtools::install_github("NBISweden/genecovr") ``` +## Usage -## Quick usage +### genecovr script quick start There is a helper script for generating basic plots located in PACKAGE_DIR/bin/genecovr. Create a data input csv-delimited file with @@ -52,8 +61,66 @@ script to generate plots: PACKAGE_DIR/bin/genecovr indata.csv ``` -## Vignette +#### Example -Alternatively, import the library as usual in an R script and use the -package functions. See the vignette for a minimum working example. +There are example files located in PACKAGE_DIR/inst/extdata consisting +of two psl alignment files containing gmap alignments and fasta +indices for the transcript sequences and two for different assembly +versions: +- nonpolished.fai - fasta index for raw assembly +- polished.fai - fasta index for polished assembly +- transcripts.fai - fasta index for transcript sequences +- transcripts2nonpolished.psl - gmap alignments, transcripts to raw assembly +- transcripts2polished.psl - gmap alignments, transcripts to polished + assembly + +Using these files and the labels `non` and `pol` for the different +assemblies, a `genecovr` input file (called e.g., `assemblies.csv`) +would look as follows: + +``` +nonpol,transcripts2nonpolished.psl,nonpolished.fai,transcripts.fai +pol,transcripts2polished.psl,polished.fai,transcripts.fai +``` + +and the command to run would be: + +``` +genecovr assemblies.csv +``` + +#### genecovr options + +To list genecovr script options, type 'genecovr -h`: + +``` +usage: genecovr [-h] [-v] [-p number] + [-d OUTPUT_DIRECTORY] [--height HEIGHT] + [--width WIDTH] + csvfile + +positional arguments: + csvfile csv-delimited file with columns + 1. data label + 2. mapping file (supported formats: psl) + 3. assembly file (fasta or fasta index) + 4. transcript file (fasta or fasta index) + +optional arguments: + -h, --help show this help message and exit + -v, --verbose print extra output + -p number, --cpus number + number of cpus [default 1] + -d OUTPUT_DIRECTORY, --output-directory OUTPUT_DIRECTORY + output directory + --height HEIGHT figure height in inches [default 6.0] + --width WIDTH figure width in inches [default 6.0] +``` + + + +### R package vignette + +Alternatively, import the library in an R script and use the package +functions. See the vignette for a minimum working example. diff --git a/inst/bin/genecovr b/inst/bin/genecovr index ecb20f5..8abe49e 100755 --- a/inst/bin/genecovr +++ b/inst/bin/genecovr @@ -131,6 +131,7 @@ apl <- AlignmentPairsList( seqinfo.query=transcripts.sinfo[[x]]) }, BPPARAM=bpparam) ) + names(apl) <- names(psl.fn) ##------------------------------ @@ -183,8 +184,9 @@ save_plot(p, outfile) ## FIXME: number of levels should be parametrized via option +suppressPackageStartupMessages(library(dplyr)) outfile <- file.path(outdir, "qnuminsert") -x <- insertionSummary(apl, reduce=FALSE) +x <- dplyr::tibble(insertionSummary(apl, reduce=FALSE)) p <- ggplot(x, aes(id)) + geom_bar(aes(fill=cuts)) + scale_fill_viridis_d(name="qNumInsert", begin=1, end=0) @@ -199,9 +201,8 @@ message("saving ", outfile) write.csv(x, file=gzfile(outfile), row.names=FALSE) ## Also make plot based on gbc -suppressPackageStartupMessages(library(dplyr)) outfile <- file.path(outdir, "qnuminsert.gbc") -x <- insertionSummary(apl) +x <- dplyr::tibble(insertionSummary(apl)) p <- ggplot(x, aes(id)) + geom_bar(aes(fill=cuts)) + scale_fill_viridis_d(name="qNumInsert", begin=1, end=0) @@ -211,6 +212,10 @@ save_plot(p, outfile) ##-------------------- ## Save gbc data ##-------------------- +x$revmap <- as.character(x$revmap) +x$hitCoverage <- as.character(x$hitCoverage) +x$hitStart <- as.character(x$hitStart) +x$hitEnd <- as.character(x$hitEnd) outfile <- file.path(outdir, "gbcdata.tsv.gz") message("saving ", outfile) write.table(x, file=gzfile(outfile), row.names=FALSE, sep="\t") @@ -313,11 +318,13 @@ p <- ggplot(data=subset(data, n.subjects>1), outfile <- file.path(outdir, paste0("depth_breadth_seqlengths.mm", mm)) save_plot(p, outfile) - +data$revmap <- as.character(data$revmap) +data$hitCoverage <- as.character(data$hitCoverage) +data$hitStart <- as.character(data$hitStart) +data$hitEnd <- as.character(data$hitEnd) outfile <- file.path(outdir, "gene_body_coverage.csv.gz") message("saving ", outfile) -write.csv(data, gzfile(outfile), row.names=FALSE) - +write.csv(dplyr::tibble(data), gzfile(outfile), row.names=FALSE) ############################## ## Save Rdata of analysis diff --git a/tests/testthat/test-genecovr.R b/tests/testthat/test-genecovr.R index 1114979..dfac9be 100644 --- a/tests/testthat/test-genecovr.R +++ b/tests/testthat/test-genecovr.R @@ -1,29 +1,36 @@ -skip("Skip until resolve issue of testing package binary") pkg_path <- getwd() tmp <- tempdir(check = TRUE) withr::local_dir(tmp) withr::local_temp_libpaths() withr::local_path(file.path(.libPaths()[1], "genecovr", "bin")) -devtools::install(pkg=pkg_path, quick = TRUE, upgrade = "never") +devtools::install(pkg = pkg_path, quick = TRUE, upgrade = "never") create_file <- function(fn) system.file("extdata", fn, package = "genecovr") data <- as.data.frame(rbind( - c("nonpol", create_file("transcripts2nonpolished.psl"), - create_file("nonpolished.fai"), create_file("transcripts.fai")), - c("pol", create_file("transcripts2polished.psl"), - create_file("polished.fai"), create_file("transcripts.fai")) + c( + "nonpol", create_file("transcripts2nonpolished.psl"), + create_file("nonpolished.fai"), create_file("transcripts.fai") + ), + c( + "pol", create_file("transcripts2polished.psl"), + create_file("polished.fai"), create_file("transcripts.fai") + ) )) colnames(data) <- NULL withr::local_file(write.csv(data, file = "assemblies.csv", row.names = FALSE)) test_that("genecovr runs as expected", { - system(paste(paste0("R_LIBS_USER=", .libPaths()[1]), - "genecovr", "assemblies.csv")) - expect_equal(length(list.files(pattern = "*.png")), 13) - expect_equal(length(grep("Rplots", list.files(pattern = "*.pdf"), - invert = TRUE)), 13) - expect_equal(length(list.files(pattern = "*.csv.gz")), 4) + system(paste( + paste0("R_LIBS_USER=", .libPaths()[1]), + "genecovr", "assemblies.csv" + )) + expect_equal(length(list.files(pattern = "*.png")), 15) + expect_equal(length(grep("Rplots", list.files(pattern = "*.pdf"), + invert = TRUE + )), 15) + expect_equal(length(list.files(pattern = "*.csv.gz")), 4) + expect_equal(length(list.files(pattern = "*.tsv.gz")), 1) }) -withr::defer(unlink(dir, recursive = TRUE)) +withr::defer(unlink(tmp, recursive = TRUE)) diff --git a/vignettes/genecovr.Rmd b/vignettes/genecovr.Rmd index 19d4404..dcc71a7 100644 --- a/vignettes/genecovr.Rmd +++ b/vignettes/genecovr.Rmd @@ -15,21 +15,24 @@ bibliography: bibliography.bib This vignette describes analyses of gene body coverage and other genome assembly evaluation metrics with in R using the `genecovr` -package. `ripr` contains functionality for parsing alignment files, -calculating gene body coverages, and generating simple QC metrics to -assess assembly quality output. Before we start with the example -analysis, we describe how `genecovr` represents pairwise alignments. +package. `genecovr` contains functionality for parsing alignment +files, calculating gene body coverages, and generating simple QC +metrics to assess assembly quality output. Before we start with the +example analysis, we describe how `genecovr` represents pairwise +alignments. ## R setup ```{r knitr-setup, echo=FALSE, cache=FALSE} library(knitr) -knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE, - fig.width=8, fig.height=6, autodep=TRUE, - cache=FALSE, include=TRUE, eval=TRUE, tidy=FALSE, - dev=c('png')) +knitr::opts_chunk$set( + echo = TRUE, warning = FALSE, message = FALSE, + fig.width = 8, fig.height = 6, autodep = TRUE, + cache = FALSE, include = TRUE, eval = TRUE, tidy = FALSE, + dev = c("png") +) knitr::knit_hooks$set(inline = function(x) { - prettyNum(x, big.mark=" ") + prettyNum(x, big.mark = " ") }) ``` @@ -45,9 +48,10 @@ library(rlang) ```{r ggplot-theme, cache=FALSE} library(viridis) library(RColorBrewer) -bw <- theme_bw(base_size=18) %+replace% theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +bw <- theme_bw(base_size = 18) %+replace% + theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) theme_set(bw) -color.pal.4 <- brewer.pal(name = "Paired", n = 4) +color_pal_4 <- brewer.pal(name = "Paired", n = 4) psize <- 3 ``` @@ -77,29 +81,44 @@ the psl files with `readPsl`, storing the results in an `AlignmentPairsList` for convenience. ``` {r gbc-load-data} -assembly.fai.fn <- list( - nonpol = system.file("extdata", "nonpolished.fai", package="genecovr"), - pol = system.file("extdata", "polished.fai", package="genecovr") +assembly_fai_fn <- list( + nonpol = system.file("extdata", "nonpolished.fai", + package = "genecovr" + ), + pol = system.file("extdata", "polished.fai", + package = "genecovr" + ) ) -transcripts.fai.fn <- list( - nonpol = system.file("extdata", "transcripts.fai", package="genecovr"), - pol = system.file("extdata", "transcripts.fai", package="genecovr") +transcripts_fai_fn <- list( + nonpol = system.file("extdata", "transcripts.fai", + package = "genecovr" + ), + pol = system.file("extdata", "transcripts.fai", + package = "genecovr" + ) ) -assembly.sinfo <- endoapply(assembly.fai.fn, readFastaIndex) -transcripts.sinfo <- endoapply(transcripts.fai.fn, readFastaIndex) +assembly_sinfo <- endoapply(assembly_fai_fn, readFastaIndex) +transcripts_sinfo <- endoapply(transcripts_fai_fn, readFastaIndex) -psl.fn <- list( - nonpol = system.file("extdata", "transcripts2nonpolished.psl", package="genecovr"), - pol = system.file("extdata", "transcripts2polished.psl", package="genecovr") +psl_fn <- list( + nonpol = system.file("extdata", "transcripts2nonpolished.psl", + package = "genecovr" + ), + pol = system.file("extdata", "transcripts2polished.psl", + package = "genecovr" + ) ) apl <- AlignmentPairsList( - lapply(names(psl.fn), function(x) { - readPsl(psl.fn[[x]], seqinfo.sbjct=assembly.sinfo[[x]], seqinfo.query=transcripts.sinfo[[x]]) - }) + lapply(names(psl_fn), function(x) { + readPsl(psl_fn[[x]], + seqinfo.sbjct = assembly_sinfo[[x]], + seqinfo.query = transcripts_sinfo[[x]] + ) + }) ) -names(apl) <- names(psl.fn) +names(apl) <- names(psl_fn) ``` ## Plot ratio matches to width of alignment regions @@ -108,7 +127,8 @@ We first plot the ratio of matches to width of alignments with respect to the transcripts. ``` {r gbc-plot-alignment-width} -plot(apl, aes(x=id, y=matches/query.width, fill=id), which="violin") + ylim(0.8, 1) + scale_fill_viridis_d() +plot(apl, aes(x = id, y = matches / query.width, fill = id), which = "violin") + + ylim(0.8, 1) + scale_fill_viridis_d() ``` There is a clear shift to higher percentage matches in the polished @@ -121,15 +141,18 @@ We can also select multiple columns to plot in the ```{r gbc-plot-match-indel} cnames <- c("misMatches", "query.NumInsert", "query.BaseInsert") -plot(apl, aes(x=id, y=get_expr(enquo(cnames))), which="violin") + facet_wrap(. ~ name, scales="free") +plot(apl, aes(x = id, y = get_expr(enquo(cnames))), which = "violin") + + facet_wrap(. ~ name, scales = "free") ``` ```{r gbc-plot-match-indel-boxplot} -plot(apl, aes(x=id, y=get_expr(enquo(cnames))), which="boxplot") + facet_wrap(. ~ name, scales="free") +plot(apl, aes(x = id, y = get_expr(enquo(cnames))), which = "boxplot") + + facet_wrap(. ~ name, scales = "free") ``` ```{r gbc-plot-match-indel-log10-boxplot} -plot(apl, aes(x=id, y=get_expr(enquo(cnames))), which="boxplot") + facet_wrap(. ~ name, scales="free") + scale_y_continuous(trans="log10") +plot(apl, aes(x = id, y = get_expr(enquo(cnames))), which = "boxplot") + + facet_wrap(. ~ name, scales = "free") + scale_y_continuous(trans = "log10") ``` ## Plot number of indels @@ -144,10 +167,10 @@ consequence of this is that as a transcript may be split in multiple alignments, the bars are of unequal height. ```{r gbc-plot-qnuminsert} -x <- insertionSummary(apl, reduce=FALSE) +x <- insertionSummary(apl, reduce = FALSE) ggplot(x, aes(id)) + - geom_bar(aes(fill=cuts)) + - scale_fill_viridis_d(name="qNumInsert", begin=1, end=0) + geom_bar(aes(fill = cuts)) + + scale_fill_viridis_d(name = "qNumInsert", begin = 1, end = 0) ``` An alternative is to summarize the number of insertions over a @@ -159,52 +182,55 @@ the fewest number of insertions. ```{r gbc-plot-qnuminsert-gbc} x <- insertionSummary(apl) ggplot(x, aes(id)) + - geom_bar(aes(fill=cuts)) + - scale_fill_viridis_d(name="qNumInsert", begin=1, end=0) + geom_bar(aes(fill = cuts)) + + scale_fill_viridis_d(name = "qNumInsert", begin = 1, end = 0) ``` - - ## Gene body coverage - The function `geneBodyCoverage` takes an `AlignmentPairs` object and summarizes breadth of coverage and number of subject hits per transcript. ``` {r gbc-make-gene-body-coverage} -gbc <- lapply(apl, geneBodyCoverage, min.match=0.1) +gbc <- lapply(apl, geneBodyCoverage, min.match = 0.1) ``` A summary can be obtained with the `summarizeGeneBodyCoverage` function. We define a range of minimum match hit cutoffs to filter out -hits with too few matches in the aligned region. +hits with too few matches in the aligned region. ``` {r gbc-summarize-gene-body-coverage} min.match <- c(0.25, 0.5, 0.75, 0.9) names(min.match) <- min.match -gbc.summary <- lapply(apl, function(x) { - y <- do.call("rbind", lapply(min.match, function(mm) { - summarizeGeneBodyCoverage(x, min.match=mm) - })) - }) +gbc_summary <- lapply(apl, function(x) { + y <- do.call("rbind", lapply(min.match, function(mm) { + summarizeGeneBodyCoverage(x, min.match = mm) + })) +}) ``` We combine the data ``` {r gbc-plot-gene-body-coverage-combine-data} library(dplyr) -data <- dplyr::bind_rows(lapply(gbc.summary, data.frame), .id="dataset") +data <- dplyr::bind_rows(lapply(gbc_summary, data.frame), .id = "dataset") ``` + and plot the resulting coverages ```{r gbc-plot-gene-body-coverage} h <- max(data$total) -hmax <- ceiling(h/100) * 100 -ggplot(subset(data, min.match==0.25), aes(x=min.coverage, y=count, group=dataset, color=dataset)) + - geom_abline(slope=0, intercept=h) + - geom_point(aes(shape=dataset, color=dataset), size=psize) + geom_line() + - scale_color_viridis_d() + scale_y_continuous(breaks=c(pretty(data$count)), limits=c(0, hmax)) +hmax <- ceiling(h / 100) * 100 +ggplot( + subset(data, min.match == 0.25), + aes(x = min.coverage, y = count, group = dataset, color = dataset) +) + + geom_abline(slope = 0, intercept = h) + + geom_point(aes(shape = dataset, color = dataset), size = psize) + + geom_line() + + scale_color_viridis_d() + + scale_y_continuous(breaks = c(pretty(data$count)), limits = c(0, hmax)) ``` ## Number of contigs per transcript @@ -214,17 +240,25 @@ contigs. We calculate the number of subjects by coverage cutoff with the function `countSubjectsByCoverage` ```{r gbc-ncontigs-per-transcript} -data <- dplyr::bind_rows(lapply( - lapply(apl, countSubjectsByCoverage), - data.frame), - .id="dataset") - +data <- dplyr::bind_rows( + lapply( + lapply(apl, countSubjectsByCoverage), + data.frame + ), + .id = "dataset" +) ``` and plot the results ```{r gbc-plot-ncontigs-per-transcript} -ggplot(data=data, aes(x=factor(min.coverage), y=Freq, fill=n.subjects)) + geom_bar(stat='identity', position=position_stack()) + scale_fill_viridis_d(begin=1, end=0) + facet_wrap(. ~ dataset, nrow=1, labeller=label_both) +ggplot(data = data, aes( + x = factor(min.coverage), + y = Freq, fill = n.subjects +)) + + geom_bar(stat = "identity", position = position_stack()) + + scale_fill_viridis_d(begin = 1, end = 0) + + facet_wrap(. ~ dataset, nrow = 1, labeller = label_both) ``` ## Duplicated versus split transcripts @@ -238,33 +272,61 @@ of coverage against length-normalized coverage. First combine the data: ```{r gbc-depth-by-breadth-data} -data <- dplyr::bind_rows(lapply( - lapply(apl, geneBodyCoverage), - data.frame), - .id="dataset") +data <- dplyr::bind_rows( + lapply( + lapply(apl, geneBodyCoverage), + data.frame + ), + .id = "dataset" +) ``` -and plot the ratio depthOfCoverage / breadthOfCoverage against length-normalized coverage +and plot the ratio depthOfCoverage / breadthOfCoverage against +length-normalized coverage + ```{r gbc-plot-depth-by-breadth-normalized} -ggplot(data=data, aes(x=coverage, y=depthOfCoverage / breadthOfCoverage, color=factor(n.subjects))) + geom_point(size=psize) + scale_color_viridis_d(alpha=.8) + xlim(0, 1) + ylim(0, 5) + facet_wrap(. ~ dataset) +ggplot(data = data, aes( + x = coverage, y = depthOfCoverage / breadthOfCoverage, + color = factor(n.subjects) +)) + + geom_point(size = psize) + + scale_color_viridis_d(alpha = .8) + + xlim(0, 1) + + ylim(0, 5) + + facet_wrap(. ~ dataset) ``` Alternatively we can make a jitter plot of the depthOfCoverage by breadthOfCoverage against number of subjects. ```{r gbc-plot-depth-by-breadth-by-contigs} -ggplot(data=subset(data, n.subjects>1), aes(x=factor(dataset), y=depthOfCoverage / breadthOfCoverage)) + geom_jitter(size=psize, alpha=.6) + facet_wrap(. ~ n.subjects) +ggplot( + data = subset(data, n.subjects > 1), + aes(x = factor(dataset), y = depthOfCoverage / breadthOfCoverage) +) + + geom_jitter(size = psize, alpha = .6) + + facet_wrap(. ~ n.subjects) ``` A similar picture is obtained via a histogram plot. ```{r gbc-plot-depth-by-breadth-by-contigs-hist} -ggplot(data=subset(data, n.subjects>1), aes(depthOfCoverage/breadthOfCoverage)) + geom_histogram() + facet_grid(vars(dataset)) +ggplot( + data = subset(data, n.subjects > 1), + aes(depthOfCoverage / breadthOfCoverage) +) + + geom_histogram() + + facet_grid(vars(dataset)) ``` Finally, we assess whether there is a length bias in the ratio condition on the number of subjects per transcript. ```{r gbc-plot-seqlengths-by-contigs-hist} -ggplot(data=subset(data, n.subjects>1), aes(x=factor(dataset), y=seqlengths)) + geom_boxplot() + facet_grid(. ~ n.subjects) +ggplot( + data = subset(data, n.subjects > 1), + aes(x = factor(dataset), y = seqlengths) +) + + geom_boxplot() + + facet_grid(. ~ n.subjects) ```