diff --git a/README.md b/README.md
index 498a4f7..938d5e9 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,20 @@
 # gaga2 - automated 16S amplicon analysis with Figaro/DADA2
+![](docs/img/gaga2_flow.png)
+
 ## Installation instructions
-`TBD`
+`gaga2` requires a working `nextflow` installation (v20.4+).
+
+Other dependencies:
+* bbmap
+* fastqc
+* figaro
+* R v4+ with dada2, devtools, tidyverse, and cowplot installed
+
+For convenience, `gaga2` comes with a Singularity container with all dependencies installed.
+
+```singularity pull oras://ghcr.io/zellerlab/gaga2:latest```
+
 
 ## Usage instructions
@@ -9,8 +22,8 @@
 `gaga2` takes as input Illumina paired-end 16S amplicon sequences (e.g. sequenced on a MiSeq).
 
 #### Prerequisites
-* Read files need to be named according the typical pattern `_*[12].{fastq,fq,fastq.gz,fq.gz}`
-and need to be arranged in a sample-based directory structure:
+* Read files need to be named according to the typical pattern `_R?[12].{fastq,fq,fastq.gz,fq.gz}`.
+They should, but don't have to, be arranged in a sample-based directory structure:
 
 ```
 <project_dir> (aka "input_dir")
 |____ <sample_1>
       |____ <sample_1_reads>_R1.{fastq,fq,fastq.gz,fq.gz}
       |____ <sample_1_reads>_R2.{fastq,fq,fastq.gz,fq.gz}
 |____ ...
 |____ <sample_n>
       |____ <sample_n_reads>_R1.{fastq,fq,fastq.gz,fq.gz}
       |____ <sample_n_reads>_R2.{fastq,fq,fastq.gz,fq.gz}
 ```
@@ -26,29 +39,42 @@ and need to be arranged in a sample-based directory structure:
-* If input reads have been "invasively" preprocessed (as evidenced by length differences between reads),
-`gaga2` will generate an empty sentinel file `<output_dir>/SKIP_FIGARO`.
-This will prevent `Figaro` from executing and should automatically advance to `dada2` processing.
+A flat directory structure (with all read files in the same directory) or a deeply-branched one (with read files scattered over multiple directory levels) should also work.
+
+If `gaga2` preprocesses the reads, it will automatically use `_R1/2` endings internally.
+
+* If input reads have already been preprocessed, you can set the `--preprocessed` flag. In this case, `gaga2` will do no preprocessing at all and instruct `dada2` to perform no trimming. Otherwise, `gaga2` will assess the read lengths for uniformity. If read lengths differ within and between samples, preprocessing with `figaro` is not possible and `dada2` will be run without trimming.
 * Samples with fewer than `110` reads after `dada2` preprocessing will be discarded.
 
 ### Running gaga2
-The typical command for running `gaga2` is
-`gaga2.nf --input_dir <input_dir> --output_dir <output_dir> --amplicon_length <amplicon_length> --left_primer <left_primer> --right_primer <right_primer>`
+`gaga2` can be run directly from github:
+
+`nextflow run zellerlab/gaga2 <parameters>`
+
+To obtain the latest version, run
+
+`nextflow pull zellerlab/gaga2`
+
+beforehand.
+
+In addition, you should obtain a copy of the `run.config` from the `gaga2` github repo and modify it according to your environment.
 
 #### Mandatory arguments
-* `input_dir` is the project directory mentioned above. **Recommended: absolute path**
-* `output_dir` will be created automatically. **Recommended: absolute path**
-* `amplicon_length`, `left_primer`, and `right_primer` are derived from the experiment parameters
+* `--input_dir` is the project directory mentioned above.
+* `--output_dir` will be created automatically.
+* `--amplicon_length` is derived from your experiment parameters (note: this is not the read length, but the length of the amplicon itself).
+* `--single_end` is only required for single-end libraries (auto-detection of the library type is in progress).
 
 #### Optional arguments
 * `--min_overlap` of read pairs is `20bp` by default
-* `-w, -work-dir` should be set to `<scratch_dir>/work`, otherwise it will be automatically placed in the current directory.
+* `--primers <primer_seqs>` or `--left_primer` and `--right_primer`: if primer sequences are provided via `--primers`, `gaga2` will use them to remove primers and upstream sequences, such as adapters, with `bbduk`. If non-zero primer lengths are provided instead (via `--left_primer` and `--right_primer`), `figaro` will take those into account when determining the best trim positions.
+* `--preprocessed` will prevent any further preprocessing by `gaga2` - this flag should only be used if the read data is reliably clean.
 
 ### internal beta-testing instructions
-* `source /g/scb2/zeller/schudoma/software/wrappers/gaga2_wrapper` **before** submitting job to cluster
-* please report issues/requests/feedback in the github issue tracker
-* If you want to run `gaga2` on the cluster, `nextflow` alone requires `>4GB memory` just for 'managing'.
+* The old gaga2 version can be run by `source`-ing `/g/scb2/zeller/schudoma/software/wrappers/gaga2_wrapper` **before** submitting the job to the cluster.
+* Please report issues/requests/feedback in the github issue tracker.
+* If you want to run `gaga2` on the cluster, note that `nextflow` alone requires `>=5GB` of memory just for 'managing'.
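A note on `--amplicon_length` vs. read length, since the README flags it as a common point of confusion: for paired-end merging to work, the forward and reverse reads must overlap by at least `--min_overlap` (20bp by default). The sketch below is a hypothetical helper, not part of the pipeline; the 464bp value is just an illustrative V3-V4-style amplicon size.

```python
# Hypothetical helper (not part of gaga2): check whether paired-end reads of a
# given length can still be merged over an amplicon of a given length.

def merge_overlap(read_length: int, amplicon_length: int) -> int:
    # forward + reverse reads jointly cover 2 * read_length bases of the amplicon
    return 2 * read_length - amplicon_length

def can_merge(read_length: int, amplicon_length: int, min_overlap: int = 20) -> bool:
    # min_overlap mirrors gaga2's default --min_overlap of 20bp
    return merge_overlap(read_length, amplicon_length) >= min_overlap

assert can_merge(250, 464)      # 2*250 - 464 = 36bp overlap >= 20bp
assert not can_merge(230, 464)  # 2*230 - 464 = -4 -> pairs cannot overlap
```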
diff --git a/R_scripts/dada2.R b/R_scripts/dada2.R
deleted file mode 100644
index ab15f74..0000000
--- a/R_scripts/dada2.R
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/usr/bin/env Rscript
-.libPaths(c('/g/scb2/zeller/SHARED/software/R/3.5', .libPaths()))
-library("dada2");packageVersion("dada2")
-
-args = commandArgs(trailingOnly=TRUE)
-input_dir = args[1]
-output_dir = args[2]
-
-length_cut = as.numeric(c(args[3], args[4]))
-print(length_cut)
-if (length_cut[1] == -1 || length_cut[2] == -1) {
-    length_cut = 0
-}
-exp_err = as.numeric(c(2,2)) # get from cargs? apparently not.
-
-list.files(input_dir)
-sample_ids = basename(list.files(input_dir))
-print(sample_ids)
-
-r1_raw = sort(list.files(file.path(input_dir, sample_ids), pattern="_R?1.(fastq|fq)(.gz)?", full.names=TRUE))
-r2_raw = sort(list.files(file.path(input_dir, sample_ids), pattern="_R?2.(fastq|fq)(.gz)?", full.names=TRUE))
-
-print(r1_raw)
-print(r2_raw)
-
-#TODO: this only generates one plot
-pdf("read_quality.pdf", useDingbats=FALSE, width=12, height=6)
-for (i in 1:length(r1_raw)) {
-    print(plotQualityProfile(c(r1_raw[i], r2_raw[i])))
-}
-dev.off()
-
-r1_filtered = file.path(output_dir, "filtered", basename(r1_raw))
-r2_filtered = file.path(output_dir, "filtered", basename(r2_raw))
-
-print(r1_filtered)
-print(r2_filtered)
-
-
-out = filterAndTrim(r1_raw, r1_filtered, r2_raw, r2_filtered, truncLen=length_cut,
-                    maxN=0, maxEE=exp_err, truncQ=2, rm.phix=TRUE,
-                    compress=TRUE, multithread=TRUE)
-write.table(out, file="filter_trim_table.tsv", sep="\t")
-
-# update read files
-keep = file.exists(r1_filtered) & file.exists(r2_filtered) & out[,"reads.out"] > 100
-r1_filtered = r1_filtered[keep]
-r2_filtered = r2_filtered[keep]
-sample_ids = sample_ids[keep]
-
-r1_error = learnErrors(r1_filtered, multithread=TRUE)
-r2_error = learnErrors(r2_filtered, multithread=TRUE)
-
-pdf("error_model.pdf", width=12, height=12, paper='special')
-print(plotErrors(r1_error, nominalQ=TRUE))
-print(plotErrors(r2_error, nominalQ=TRUE))
-dev.off()
-
-
-r1_uniq = derepFastq(r1_filtered, verbose=TRUE)
-r2_uniq = derepFastq(r2_filtered, verbose=TRUE)
-names(r1_uniq) = sample_ids
-names(r2_uniq) = sample_ids
-
-r1_dada = dada(r1_uniq, err=r1_error, multithread=TRUE)
-r2_dada = dada(r2_uniq, err=r2_error, multithread=TRUE)
-
-#Inspecting the returned dada-class object:
-r1_dada[[1]]
-
-
-merged = mergePairs(r1_dada,
r1_uniq, r2_dada, r2_uniq, verbose=TRUE) - -# Inspect the merger data.frame from the first sample -head(merged[[1]]) - -seqtab = makeSequenceTable(merged) -n_OTU = dim(seqtab)[2] -# Inspect distribution of sequence lengths -table(nchar(getSequences(seqtab))) - -seqtab.nochim = removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=TRUE) -n_OTU_after_removing_chimeras = dim(seqtab.nochim)[2] -# relative abundance of the chimeric sequences -rel_ab_chimeras = 1 - (sum(seqtab.nochim)/sum(seqtab)) -table(nchar(getSequences(seqtab.nochim))) - -getN = function(x) sum(getUniques(x)) -track = cbind(out[keep,], sapply(r1_dada, getN), sapply(r2_dada, getN), sapply(merged, getN), rowSums(seqtab.nochim)) -colnames(track) = c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim") -rownames(track) = sample_ids -head(track) -write.table(track, file = "summary_table.tsv", sep="\t") - -save(seqtab, seqtab.nochim, r1_error, r2_error, track, file = "result.RData") diff --git a/R_scripts/dada2_analysis.R b/R_scripts/dada2_analysis.R deleted file mode 100644 index a351b7f..0000000 --- a/R_scripts/dada2_analysis.R +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/env Rscript - -library("dada2");packageVersion("dada2") -library("tidyverse") -library("cowplot") - -print("YYY") - -# handle command line args -args = commandArgs(trailingOnly=TRUE) -input_dir = args[1] -output_dir = args[2] - -filter_table = read.table(args[3]) - -if (length(args) >= 4) { - nthreads = as.numeric(c(args[4])) -} else { - nthreads = TRUE -} - -# get the read files and sample names -list.files(input_dir) -sample_ids = basename(list.files(input_dir)) -print(sample_ids) - -r1_filtered = sort(list.files(file.path(input_dir, sample_ids), pattern = "_R?1.(fastq|fq)(.gz)?", full.names = TRUE)) -r2_filtered = sort(list.files(file.path(input_dir, sample_ids), pattern = "_R?2.(fastq|fq)(.gz)?", full.names = TRUE)) -r1_filtered = sort(list.files(file.path(input_dir), pattern = "_R?1.(fastq|fq)(.gz)?", full.names = TRUE)) -r2_filtered = sort(list.files(file.path(input_dir), pattern = "_R?2.(fastq|fq)(.gz)?", full.names = TRUE)) - -print(r1_filtered) -print(r2_filtered) - -# learn error rate -r1_error = learnErrors(r1_filtered, multithread=nthreads) -r2_error = learnErrors(r2_filtered, multithread=nthreads) - -pdf("error_model.pdf", width=12, height=12, paper='special') -print(plotErrors(r1_error, nominalQ=TRUE)) -print(plotErrors(r2_error, nominalQ=TRUE)) -dev.off() - -# remove replicates -r1_uniq = derepFastq(r1_filtered, verbose=TRUE) -r2_uniq = derepFastq(r2_filtered, verbose=TRUE) -print(r1_uniq[[1]]) -print(c("LENGTHCHECK:", length(r1_uniq), length(r2_uniq), length(sample_ids))) -#names(r1_uniq) = sample_ids -#names(r2_uniq) = sample_ids -r1_dada = dada(r1_uniq, err=r1_error, multithread=nthreads) -r2_dada = dada(r2_uniq, err=r2_error, multithread=nthreads) - -#Inspecting the returned dada-class object: -r1_dada[[1]] - -# merge read pairs -print("merging") -merged = mergePairs(r1_dada, r1_uniq, r2_dada, r2_uniq, verbose=TRUE) -# Inspect the merger data.frame from the first sample -head(merged[[1]]) - -# construct sequence table -print("making sequence table") -seqtab = makeSequenceTable(merged) -n_OTU = dim(seqtab)[2] -# Inspect distribution of sequence lengths -print(dim(seqtab)) -print("ASV length") -print(table(nchar(getSequences(seqtab)))) - -# remove chimeras -seqtab.nochim = removeBimeraDenovo(seqtab, method="consensus", multithread=nthreads, verbose=TRUE) -n_OTU_after_removing_chimeras = dim(seqtab.nochim)[2] 
-print(dim(seqtab.nochim)) -asv.table = t(seqtab.nochim) - -# export asv table -asvs = tibble(id=paste0('ASV_', seq_len(nrow(asv.table))), - ASV=rownames(asv.table)) -write_tsv(asvs, 'ASVs.tsv') - -saved_rownames = rownames(asv.table) -rownames(asv.table) = asvs$id -write.table(asv.table, file = 'asv_table.tsv', - sep='\t', quote = FALSE, row.names = TRUE, col.names = TRUE) -rownames(asv.table) = saved_rownames - -# relative abundance of the chimeric sequences -rel_ab_chimeras = 1 - (sum(seqtab.nochim)/sum(seqtab)) -table(nchar(getSequences(seqtab.nochim))) - -# track reads -getN = function(x) sum(getUniques(x)) -track = cbind(filter_table, sapply(r1_dada, getN), sapply(r2_dada, getN), sapply(merged, getN), rowSums(seqtab.nochim)) -print(c("TRACK", length(track), length(sample_ids))) -colnames(track) = c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim") -# rownames(track) = sample_ids -print(head(track)) -print("Filtering") -print(summary(track[,6] / track[,1])) -write.table(track, file = "summary_table.tsv", sep="\t") - -save(seqtab, seqtab.nochim, r1_error, r2_error, track, file = "result.RData") - -# prevalance sanity check -pdf('dada2_figures.pdf') -temp = prop.table(asv.table, 2) -print(temp) -hist(log10(temp), 100, main='abundance') -prev = rowMeans(asv.table!=0) -hist(prev, 50, main='prevalence') -mean.ab = rowMeans(log10(temp + 1e-05)) -hist(mean.ab, 50, main='log.abundance') -print("Prevalence") -print(summary(prev)) -print("Mean abundance") -print(summary(mean.ab)) -plot(prev, mean.ab, main='prevalence vs abundance') -plot(nchar(rownames(asv.table)), prev, main='ASV length vs prevalence') -dev.off() diff --git a/R_scripts/dada2_analysis_paired.R b/R_scripts/dada2_analysis_paired.R index a351b7f..f01ed3d 100644 --- a/R_scripts/dada2_analysis_paired.R +++ b/R_scripts/dada2_analysis_paired.R @@ -19,6 +19,9 @@ if (length(args) >= 4) { nthreads = TRUE } +dada2_chimera_method = args[5] # +dada2_chimera_min_fold_parent_over_abundance = as.numeric(c(args[6])) + # get the read files and sample names list.files(input_dir) sample_ids = basename(list.files(input_dir)) @@ -70,7 +73,8 @@ print("ASV length") print(table(nchar(getSequences(seqtab)))) # remove chimeras -seqtab.nochim = removeBimeraDenovo(seqtab, method="consensus", multithread=nthreads, verbose=TRUE) +#seqtab.nochim = removeBimeraDenovo(seqtab, method="consensus", multithread=nthreads, verbose=TRUE) +seqtab.nochim = removeBimeraDenovo(seqtab, method=dada2_chimera_method, minFoldParentOverAbundance=dada2_chimera_min_fold_parent_over_abundance, multithread=nthreads, verbose=TRUE) n_OTU_after_removing_chimeras = dim(seqtab.nochim)[2] print(dim(seqtab.nochim)) asv.table = t(seqtab.nochim) @@ -103,19 +107,19 @@ write.table(track, file = "summary_table.tsv", sep="\t") save(seqtab, seqtab.nochim, r1_error, r2_error, track, file = "result.RData") -# prevalance sanity check -pdf('dada2_figures.pdf') -temp = prop.table(asv.table, 2) -print(temp) -hist(log10(temp), 100, main='abundance') -prev = rowMeans(asv.table!=0) -hist(prev, 50, main='prevalence') -mean.ab = rowMeans(log10(temp + 1e-05)) -hist(mean.ab, 50, main='log.abundance') -print("Prevalence") -print(summary(prev)) -print("Mean abundance") -print(summary(mean.ab)) -plot(prev, mean.ab, main='prevalence vs abundance') -plot(nchar(rownames(asv.table)), prev, main='ASV length vs prevalence') -dev.off() +## prevalance sanity check +#pdf('dada2_figures.pdf') +#temp = prop.table(asv.table, 2) +#print(temp) +#hist(log10(temp), 100, main='abundance') +#prev = 
rowMeans(asv.table!=0) +#hist(prev, 50, main='prevalence') +#mean.ab = rowMeans(log10(temp + 1e-05)) +#hist(mean.ab, 50, main='log.abundance') +#print("Prevalence") +#print(summary(prev)) +#print("Mean abundance") +#print(summary(mean.ab)) +#plot(prev, mean.ab, main='prevalence vs abundance') +#plot(nchar(rownames(asv.table)), prev, main='ASV length vs prevalence') +#dev.off() diff --git a/R_scripts/dada2_analysis_single.R b/R_scripts/dada2_analysis_single.R index 6787896..c9845a7 100644 --- a/R_scripts/dada2_analysis_single.R +++ b/R_scripts/dada2_analysis_single.R @@ -19,6 +19,9 @@ if (length(args) >= 4) { nthreads = TRUE } +dada2_chimera_method = args[5] # +dada2_chimera_min_fold_parent_over_abundance = as.numeric(c(args[6])) + # get the read files and sample names list.files(input_dir) sample_ids = basename(list.files(input_dir)) @@ -56,7 +59,8 @@ print("ASV length") print(table(nchar(getSequences(seqtab)))) # remove chimeras -seqtab.nochim = removeBimeraDenovo(seqtab, method="consensus", multithread=nthreads, verbose=TRUE) +#seqtab.nochim = removeBimeraDenovo(seqtab, method="consensus", multithread=nthreads, verbose=TRUE) +seqtab.nochim = removeBimeraDenovo(seqtab, method=dada2_chimera_method, minFoldParentOverAbundance=dada2_chimera_min_fold_parent_over_abundance, multithread=nthreads, verbose=TRUE) n_OTU_after_removing_chimeras = dim(seqtab.nochim)[2] print(dim(seqtab.nochim)) asv.table = t(seqtab.nochim) @@ -78,6 +82,9 @@ table(nchar(getSequences(seqtab.nochim))) # track reads getN = function(x) sum(getUniques(x)) +print(r1_dada) +print(filter_table) +print(seqtab.nochim) track = cbind(filter_table, sapply(r1_dada, getN), rowSums(seqtab.nochim)) print(c("TRACK", length(track), length(sample_ids))) colnames(track) = c("input", "filtered", "denoisedF", "nonchim") @@ -89,19 +96,19 @@ write.table(track, file = "summary_table.tsv", sep="\t") save(seqtab, seqtab.nochim, r1_error, track, file = "result.RData") -# prevalance sanity check -pdf('dada2_figures.pdf') -temp = prop.table(asv.table, 2) -print(temp) -hist(log10(temp), 100, main='abundance') -prev = rowMeans(asv.table!=0) -hist(prev, 50, main='prevalence') -mean.ab = rowMeans(log10(temp + 1e-05)) -hist(mean.ab, 50, main='log.abundance') -print("Prevalence") -print(summary(prev)) -print("Mean abundance") -print(summary(mean.ab)) -plot(prev, mean.ab, main='prevalence vs abundance') -plot(nchar(rownames(asv.table)), prev, main='ASV length vs prevalence') -dev.off() +## prevalance sanity check +#pdf('dada2_figures.pdf') +#temp = prop.table(asv.table, 2) +#print(temp) +#hist(log10(temp), 100, main='abundance') +#prev = rowMeans(asv.table!=0) +#hist(prev, 50, main='prevalence') +#mean.ab = rowMeans(log10(temp + 1e-05)) +#hist(mean.ab, 50, main='log.abundance') +#print("Prevalence") +#print(summary(prev)) +#print("Mean abundance") +#print(summary(mean.ab)) +#plot(prev, mean.ab, main='prevalence vs abundance') +#plot(nchar(rownames(asv.table)), prev, main='ASV length vs prevalence') +#dev.off() diff --git a/R_scripts/dada2_preprocess.R b/R_scripts/dada2_preprocess.R deleted file mode 100644 index 5c9b975..0000000 --- a/R_scripts/dada2_preprocess.R +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env Rscript - -library("dada2");packageVersion("dada2") -library("tidyverse") -library("cowplot") - -MIN_READ_THRESHOLD = 110 - -# handle command line args -args = commandArgs(trailingOnly=TRUE) -input_dir = args[1] -output_dir = args[2] - -length_cut = as.numeric(c(args[3], args[4])) -print(length_cut) -if (length_cut[1] == -1 || length_cut[2] == -1) { 
- length_cut = 0 -} -exp_err = as.numeric(c(2,2)) # get from cargs? apparently not. - -if (length(args) >= 5) { - nthreads = as.numeric(c(args[5])) -} else { - nthreads = TRUE -} - -# get the read files and sample names -list.files(input_dir) -#sample_ids = basename(list.files(input_dir)) -#print(sample_ids) - -r1_raw = sort(list.files(input_dir, pattern = "_R?1.(fastq|fq)(.gz)?", full.names = TRUE)) -r2_raw = sort(list.files(input_dir, pattern = "_R?2.(fastq|fq)(.gz)?", full.names = TRUE)) -print(r1_raw) -print(r2_raw) - -# get the quality profiles -x.f = plotQualityProfile(r1_raw, aggregate = TRUE) -x.r = plotQualityProfile(r2_raw, aggregate = TRUE) -g = plot_grid(x.f, x.r, labels = c('forward', 'reverse')) -ggsave(g, filename = "read_quality.pdf", - width = 12, height = 7, useDingbats=FALSE) - -# perform filtering and trimming -r1_filtered = file.path(output_dir, basename(r1_raw)) -r2_filtered = file.path(output_dir, basename(r2_raw)) - -print(r1_filtered) -print(r2_filtered) - -out = filterAndTrim(r1_raw, r1_filtered, r2_raw, r2_filtered, truncLen=length_cut, - maxN=0, maxEE=exp_err, truncQ=2, rm.phix=TRUE, - compress=TRUE, multithread=nthreads) -# colnames(out) = c("sample", "reads.in", "reads.out") -rownames(out) = str_replace(rownames(out), "_R?[12].(fastq|fq)(.gz)?", "") -write.table(out, file="filter_trim_table.tsv", sep="\t") - -# remove -keep = file.exists(r1_filtered) & file.exists(r2_filtered) & out[,"reads.out"] >= MIN_READ_THRESHOLD -r1_remove = r1_filtered[!keep] #file.exists(r1_filtered) & out["reads.out"] < MIN_READ_THRESHOLD] -r2_remove = r2_filtered[!keep] #file.exists(r2_filtered) & out["reads.out"] < MIN_READ_THRESHOLD] -sapply(r1_remove, file.remove) -sapply(r2_remove, file.remove) - -out = out[keep,] -write.table(out, file="filter_trim_table.final.tsv", sep="\t") - -r1_filtered = r1_filtered[keep] -r2_filtered = r2_filtered[keep] - -# get the quality profiles post-qc -x.f = plotQualityProfile(r1_filtered, aggregate = TRUE) -x.r = plotQualityProfile(r2_filtered, aggregate = TRUE) -g = plot_grid(x.f, x.r, labels = c('forward', 'reverse')) -ggsave(g, filename = "read_quality_postqc.pdf", - width = 12, height = 7, useDingbats=FALSE) - -# update read files -#keep = file.exists(r1_filtered) & file.exists(r2_filtered) & out[,"reads.out"] > 100 -#r1_filtered = r1_filtered[keep] -#r2_filtered = r2_filtered[keep] -#sample_ids = sample_ids[keep] diff --git a/R_scripts/dada2_preprocess_single.R b/R_scripts/dada2_preprocess_single.R index 83e9945..8ecd75e 100644 --- a/R_scripts/dada2_preprocess_single.R +++ b/R_scripts/dada2_preprocess_single.R @@ -55,7 +55,7 @@ keep = file.exists(r1_filtered) & out[,"reads.out"] >= MIN_READ_THRESHOLD r1_remove = r1_filtered[!keep] #file.exists(r1_filtered) & out["reads.out"] < MIN_READ_THRESHOLD] sapply(r1_remove, file.remove) -out = out[keep,] +out = out[keep,,drop=FALSE] write.table(out, file="filter_trim_table.final.tsv", sep="\t") r1_filtered = r1_filtered[keep] diff --git a/assets/adapters.fa b/assets/adapters.fa new file mode 100755 index 0000000..a87d425 --- /dev/null +++ b/assets/adapters.fa @@ -0,0 +1,317 @@ +>Reverse_adapter +AGATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Universal_Adapter +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>pcr_dimer +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG +>PCR_Primers +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCTCAAGCAGAAGACGGCATACGAGCTCTTCCGATCT 
+>TruSeq_Adapter_Index_1_6 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_2 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_3 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_4 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_5 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_6 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_7 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_8 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_9 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_10 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_11 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_12 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_13 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_14 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_15 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_16 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_18_7 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCACATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_19 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACGATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_20 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_21 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_22 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTAATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_23 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_25 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATATCTCGTATGCCGTCTTCTGCTTG +>TruSeq_Adapter_Index_27 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTATCTCGTATGCCGTCTTCTGCTTG +>I5_Nextera_Transposase_1 +CTGTCTCTTATACACATCTGACGCTGCCGACGA +>I7_Nextera_Transposase_1 +CTGTCTCTTATACACATCTCCGAGCCCACGAGAC +>I5_Nextera_Transposase_2 +CTGTCTCTTATACACATCTCTGATGGCGCGAGGGAGGC +>I7_Nextera_Transposase_2 +CTGTCTCTTATACACATCTCTGAGCGGGCTGGCAAGGC +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]501 +GACGCTGCCGACGAGCGATCTAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]502 +GACGCTGCCGACGAATAGAGAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]503 +GACGCTGCCGACGAAGAGGATAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]504 +GACGCTGCCGACGATCTACTCTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]505 +GACGCTGCCGACGACTCCTTACGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]506 +GACGCTGCCGACGATATGCAGTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]507 +GACGCTGCCGACGATACTCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]508 +GACGCTGCCGACGAAGGCTTAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_and_Nextera_Enrichment_[N/S/E]517 +GACGCTGCCGACGATCTTACGCGTGTAGATCTCGGTGGTCGCCGTATCATT +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N701 +CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG 
+>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N702 +CCGAGCCCACGAGACCGTACTAGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N703 +CCGAGCCCACGAGACAGGCAGAAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N704 +CCGAGCCCACGAGACTCCTGAGCATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N705 +CCGAGCCCACGAGACGGACTCCTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N706 +CCGAGCCCACGAGACTAGGCATGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N707 +CCGAGCCCACGAGACCTCTCTACATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N708 +CCGAGCCCACGAGACCAGAGAGGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N709 +CCGAGCCCACGAGACGCTACGCTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N710 +CCGAGCCCACGAGACCGAGGCTGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N711 +CCGAGCCCACGAGACAAGAGGCAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_and_Nextera_Enrichment_N712 +CCGAGCCCACGAGACGTAGAGGAATCTCGTATGCCGTCTTCTGCTTG +>I5_Primer_Nextera_XT_Index_Kit_v2_S502 +GACGCTGCCGACGAATAGAGAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S503 +GACGCTGCCGACGAAGAGGATAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S505 +GACGCTGCCGACGACTCCTTACGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S506 +GACGCTGCCGACGATATGCAGTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S507 +GACGCTGCCGACGATACTCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S508 +GACGCTGCCGACGAAGGCTTAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S510 +GACGCTGCCGACGAATTAGACGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S511 +GACGCTGCCGACGACGGAGAGAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S513 +GACGCTGCCGACGACTAGTCGAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S515 +GACGCTGCCGACGAAGCTAGAAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S516 +GACGCTGCCGACGAACTCTAGGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S517 +GACGCTGCCGACGATCTTACGCGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S518 +GACGCTGCCGACGACTTAATAGGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S520 +GACGCTGCCGACGAATAGCCTTGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S521 +GACGCTGCCGACGATAAGGCTCGTGTAGATCTCGGTGGTCGCCGTATCATT +>I5_Primer_Nextera_XT_Index_Kit_v2_S522 +GACGCTGCCGACGATCGCATAAGTGTAGATCTCGGTGGTCGCCGTATCATT +>I7_Primer_Nextera_XT_Index_Kit_v2_N701 +CCGAGCCCACGAGACTAAGGCGAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N702 +CCGAGCCCACGAGACCGTACTAGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N703 +CCGAGCCCACGAGACAGGCAGAAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N704 +CCGAGCCCACGAGACTCCTGAGCATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N705 +CCGAGCCCACGAGACGGACTCCTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N706 +CCGAGCCCACGAGACTAGGCATGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N707 +CCGAGCCCACGAGACCTCTCTACATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N710 +CCGAGCCCACGAGACCGAGGCTGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N711 +CCGAGCCCACGAGACAAGAGGCAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N712 +CCGAGCCCACGAGACGTAGAGGAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N714 
+CCGAGCCCACGAGACGCTCATGAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N715 +CCGAGCCCACGAGACATCTCAGGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N716 +CCGAGCCCACGAGACACTCGCTAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N718 +CCGAGCCCACGAGACGGAGCTACATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N719 +CCGAGCCCACGAGACGCGTAGTAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N720 +CCGAGCCCACGAGACCGGAGCCTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N721 +CCGAGCCCACGAGACTACGCTGCATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N722 +CCGAGCCCACGAGACATGCGCAGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N723 +CCGAGCCCACGAGACTAGCGCTCATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N724 +CCGAGCCCACGAGACACTGAGCGATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N726 +CCGAGCCCACGAGACCCTAAGACATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N727 +CCGAGCCCACGAGACCGATCAGTATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N728 +CCGAGCCCACGAGACTGCAGCTAATCTCGTATGCCGTCTTCTGCTTG +>I7_Primer_Nextera_XT_Index_Kit_v2_N729 +CCGAGCCCACGAGACTCGACGTCATCTCGTATGCCGTCTTCTGCTTG +>I5_Adapter_Nextera +CTGATGGCGCGAGGGAGGCGTGTAGATCTCGGTGGTCGCCGTATCATT +>I7_Adapter_Nextera_No_Barcode +CTGAGCGGGCTGGCAAGGCAGACCGATCTCGTATGCCGTCTTCTGCTTG +>Nextera_LMP_Read1_External_Adapter +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>Nextera_LMP_Read2_External_Adapter +GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>RNA_Adapter_(RA5)_part_#_15013205 +GATCGTCGGACTGTAGAACTCTGAAC +>RNA_Adapter_(RA3)_part_#_15013207 +CCTTGGCACCCGAGAATTCCA +>Stop_Oligo_(STP)_8 +CCACGGGAACGTGGTGGAATTC +>RNA_RT_Primer_(RTP)_part_#_15013981 +TGGAATTCTCGGGTGCCAAGGC +>RNA_PCR_Primer_(RP1)_part_#_15013198 +TCGGACTGTAGAACTCTGAACGTGTAGATCTCGGTGGTCGCCGTATCATT +>RNA_PCR_Primer_Index_1_(RPI1)_2,9 +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_2_(RPI2) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_3_(RPI3) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_4_(RPI4) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_5_(RPI5) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_6_(RPI6) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_7_(RPI7) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_8_(RPI8) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_9_(RPI9) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_10_(RPI10) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_11_(RPI11) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_12_(RPI12) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_13_(RPI13) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTCAAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_14_(RPI14) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACAGTTCCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_15_(RPI15) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGTCAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_16_(RPI16) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCGTCCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_17_(RPI17) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTAGAGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_18_(RPI18) 
+TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTCCGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_19_(RPI19) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGAAAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_20_(RPI20) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTGGCCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_21_(RPI21) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGTTTCGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_22_(RPI22) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGTACGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_23_(RPI23) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGAGTGGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_24_(RPI24) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGGTAGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_25_(RPI25) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACACTGATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_26_(RPI26) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATGAGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_27_(RPI27) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACATTCCTATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_28_(RPI28) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAAAAGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_29_(RPI29) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAACTAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_30_(RPI30) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACCGGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_31_(RPI31) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACGATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_32_(RPI32) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCACTCAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_33_(RPI33) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCAGGCGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_34_(RPI34) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATGGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_35_(RPI35) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCATTTTATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_36_(RPI36) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCCAACAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_37_(RPI37) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCGGAATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_38_(RPI38) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTAGCTATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_39_(RPI39) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTATACATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_40_(RPI40) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACCTCAGAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_41_(RPI41) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACGACGACATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_42_(RPI42) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTAATCGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_43_(RPI43) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTACAGCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_44_(RPI44) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTATAATATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_45_(RPI45) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCATTCATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_46_(RPI46) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCCCGAATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_47_(RPI47) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGAAGATCTCGTATGCCGTCTTCTGCTTG +>RNA_PCR_Primer_Index_48_(RPI48) +TGGAATTCTCGGGTGCCAAGGAACTCCAGTCACTCGGCAATCTCGTATGCCGTCTTCTGCTTG +>PhiX_read1_adapter +AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTGAAA +>PhiX_read2_adapter +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATTAAAAAA +>Bisulfite_R1 +AGATCGGAAGAGCACACGTCTGAAC +>Bisulfite_R2 +AGATCGGAAGAGCGTCGTGTAGGGA +>Illumina Small RNA v1.5 3p Adapter +ATCTCGTATGCCGTCTTCTGCTTG +>Illumina RNA 3p Adapter (RA3) +TGGAATTCTCGGGTGCCAAGG +>Illumina RNA 5p Adapter (RA5) +GTTCAGAGTTCTACAGTCCGACGATC +>Illumina 3p RNA Adapter +TCGTATGCCGTCTTCTGCTTGT + diff --git a/config/run.config 
b/config/run.config
index f48623b..05baa3a 100644
--- a/config/run.config
+++ b/config/run.config
@@ -4,6 +4,28 @@ params {
     min_overlap = 20
     left_primer = 0
    right_primer = 0
+
+    /*
+    bbduk qc parameters
+    see https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/bbduk-guide/
+    qtrim=rl trimq=3 : gentle quality trimming (only discard bases < phred 3; phred 2 = junk marker) on either side (rl) of the read
+    maq=25 : discard reads below an average quality of phred 25
+    minlen=45 : discard reads < 45bp
+    ref=?? ktrim=r k=23 mink=11 hdist=1 tpe tbo : right-side k-mer based adapter clipping with 1 mismatch allowed, try overlap-detection (tbo), and trim pairs to same length (tpe) upon adapter detection
+    ftm=5 : get rid of the (n*5)+1st base (last sequencing cycle illumina garbage)
+    entropy=0.5 entropywindow=50 entropyk=5 : discard low complexity sequences
+    */
+
+    qc_params_primers = "qtrim=rl trimq=3 ktrim=l k=14 mink=1 hdist=1 cu=t"
+    qc_params_adapters = "qtrim=rl trimq=3 ktrim=l k=23 mink=1 hdist=1 tpe tbo cu=t"
+    qc_minlen = 100
+
+    mapseq_bin = "mapseq"
+    mapseq_db_path = projectDir
+    mapseq_db_name = ""
+
+    dada2_chimera_method = "consensus" // can be "consensus" (the dada2 default since 1.4) or "pooled"
+    dada2_chimera_min_fold_parent_over_abundance = 2
 }
 
 /* section below needs to be adjusted to local cluster */
@@ -19,39 +41,69 @@ executor {
 
 process {
     cache = 'lenient'
-    withName: ltrim {
+    withName: figaro {
+        container = "oras://ghcr.io/zellerlab/gaga2:latest"
+        // conda = "/g/scb/zeller/schudoma/software/conda/figaro"
         executor = "slurm"
         errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
-        memory = '8.GB'
+        memory = '16.GB'
         time = '4d'
         maxRetries = 3
+        cpus = 8
     }
-    withName: figaro {
+    withLabel: dada2 {
         container = "oras://ghcr.io/zellerlab/gaga2:latest"
-        // conda = "/g/scb/zeller/schudoma/software/conda/figaro"
         executor = "slurm"
         errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
-        memory = '16.GB'
+        memory = '16.GB'
         time = '4d'
         maxRetries = 3
+        cpus = 8
+    }
+    withLabel: bbduk {
+        container = "oras://ghcr.io/zellerlab/gaga2:latest"
+        executor = "slurm"
+        errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
+        cpus = 4
+        memory = {8.GB * task.attempt}
+        time = '2h'
+        maxRetries = 3
+    }
+    withName: fastqc {
+        container = "oras://ghcr.io/zellerlab/gaga2:latest"
+        executor = "slurm"
+        errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
+        cpus = 2
+        memory = {4.GB * task.attempt}
+        time = '7d'
+        maxRetries = 3
     }
-    withName: dada2_preprocess {
+    withName: mapseq_otutable {
         container = "oras://ghcr.io/zellerlab/gaga2:latest"
         executor = "slurm"
         errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
-        memory = '16.GB'
-        time = '4d'
+        cpus = 1
+        memory = {2.GB * task.attempt}
+        time = '24h'
         maxRetries = 3
-        cpus = 8
     }
-    withName: dada2_analysis {
+    withName: collate_mapseq_tables {
         container = "oras://ghcr.io/zellerlab/gaga2:latest"
         executor = "slurm"
         errorStrategy = {task.attempt <= 3 ? "retry" : "ignore"}
-        memory = '16.GB'
-        time = '4d'
+        cpus = 1
+        memory = {2.GB * task.attempt}
+        time = '24h'
         maxRetries = 3
+    }
+    withName: mapseq {
+        container = "oras://ghcr.io/zellerlab/gaga2:latest"
+        executor = "slurm"
+        errorStrategy = {task.attempt <= 3 ?
"retry" : "ignore"} cpus = 8 + memory = {8.GB * task.attempt} + time = '7d' + maxRetries = 3 } } diff --git a/docs/img/gaga2_flow.png b/docs/img/gaga2_flow.png new file mode 100644 index 0000000..51cc168 Binary files /dev/null and b/docs/img/gaga2_flow.png differ diff --git a/main.nf b/main.nf index c02871d..08024a4 100644 --- a/main.nf +++ b/main.nf @@ -2,6 +2,11 @@ nextflow.enable.dsl = 2 +include { qc_bbduk } from "./modules/nevermore/qc/bbduk" +include { fastqc } from "./modules/nevermore/qc/fastqc" +include { classify_sample } from "./modules/nevermore/functions" +include { mapseq; mapseq_otutable } from "./modules/profilers/mapseq" + //def helpMessage() { // log.info """ @@ -37,37 +42,8 @@ nextflow.enable.dsl = 2 // helpMessage() // exit 0 //} -// -if ( !params.min_overlap ) { - params.min_overlap = 20 -} - -if ( !params.left_primer ) { - params.left_primer = 0 -} - -if ( !params.right_primer ) { - params.right_primer = 0 -} -process ltrim { - publishDir "${params.output_dir}", mode: params.publish_mode - - input: - path input_reads - - output: - path("ltrim/*.{fastq,fq,fastq.gz,fq.gz}"), emit: ltrimmed_reads - - script: - """ - mkdir -p ltrim/ - python ${projectDir}/scripts/ltrim.py . ltrim/ - """ - -} - process figaro { publishDir "${params.output_dir}", mode: params.publish_mode @@ -81,28 +57,29 @@ process figaro { script: def paired_params = (is_paired_end == true) ? "-r ${params.right_primer} -m ${params.min_overlap}" : "" + """ figaro -i . -o figaro/ -a ${params.amplicon_length} -f ${params.left_primer} ${paired_params} """ - // -r ${params.right_primer} -m ${params.min_overlap} } + process extract_trim_parameters { input: path(trim_params) output: - //tuple val(left), val(right), emit: trim_params path("trim_params.txt"), emit: trim_params script: """ python $projectDir/scripts/trim_params.py $trim_params > trim_params.txt """ - } + process dada2_preprocess { + label 'dada2' publishDir "${params.output_dir}", mode: params.publish_mode input: @@ -130,6 +107,7 @@ process dada2_preprocess { process dada2_analysis { + label 'dada2' publishDir "${params.output_dir}", mode: params.publish_mode input: @@ -143,8 +121,8 @@ process dada2_analysis { path("error_model.pdf") path("summary_table.tsv") path("result.RData") - path("dada2_figures.pdf") - path("ASVs.tsv") + path("dada2_figures.pdf"), optional: true + path("ASVs.tsv"), emit: asv_sequences path("asv_table.tsv") script: @@ -152,8 +130,142 @@ process dada2_analysis { mkdir -p dada2_in/ for f in \$(find . 
-maxdepth 1 -type l); do ln -s ../\$f dada2_in/; done rm dada2_in/*.R dada2_in/filter_trim_table.final.tsv - Rscript --vanilla ${dada2_script} dada2_in/ dada2_analysis/ filter_trim_table.final.tsv $task.cpus > dada2_analysis.log + Rscript --vanilla ${dada2_script} dada2_in/ dada2_analysis/ filter_trim_table.final.tsv $task.cpus ${params.dada2_chimera_method} ${params.dada2_chimera_min_fold_parent_over_abundance} > dada2_analysis.log + """ +} + +process asv2fasta { + publishDir "${params.output_dir}", mode: params.publish_mode + + input: + path(asv_seqs) + + output: + tuple val(meta), path("ASVs.fasta"), emit: asv_fasta + + script: + meta = [:] + meta.id = "all" + meta.is_paired = false + """ + tail -n +2 ${asv_seqs} | sed 's/^/>/' | tr '\t' '\n' > ASVs.fasta """ + +} + +process assess_read_length_distribution { + input: + path(fastq_reports) + + output: + path("read_length_thresholds.txt"), emit: read_length + path("READSET_HOMOGENEOUS"), emit: hom_reads_marker, optional: true + + script: + """ + python ${projectDir}/scripts/assess_readlengths.py . > read_length_thresholds.txt + """ +} + + +process homogenise_readlengths { + label 'bbduk' + + input: + tuple val(sample), path(reads) + path(read_lengths) + + output: + path("${sample.id}/*.{fastq,fq,fastq.gz,fq.gz}"), emit: reads + + script: + def maxmem = task.memory.toString().replace(/ GB/, "g") + + if (sample.is_paired) { + """ + mkdir -p ${sample.id} + r1len=\$(head -n 1 ${read_lengths} | cut -f 1) + r2len=\$(head -n 1 ${read_lengths} | cut -f 4) + bbduk.sh -Xmx${maxmem} t=${task.cpus} ordered=t minlength=\$((r1len-1)) ftr=\$((r1len-1)) stats=${sample.id}/${sample.id}.homr_stats_1.txt in=${sample.id}_R1.fastq.gz out=${sample.id}/${sample.id}_R1.fastq.gz + bbduk.sh -Xmx${maxmem} t=${task.cpus} ordered=t minlength=\$((r2len-1)) ftr=\$((r2len-1)) stats=${sample.id}/${sample.id}.homr_stats_2.txt in=${sample.id}_R2.fastq.gz out=${sample.id}/${sample.id}_R2.fastq.gz + """ + } else { + """ + mkdir -p ${sample.id} + r1len=\$(head -n 1 ${read_lengths} | cut -f 1) + bbduk.sh -Xmx${maxmem} t=${task.cpus} ordered=t minlength=\$((r1len-1)) ftr=\$((r1len-1)) stats=${sample.id}/${sample.id}.homr_stats_1.txt in=${sample.id}_R1.fastq.gz out=${sample.id}/${sample.id}_R1.fastq.gz + """ + } +} + + +workflow raw_reads_figaro { + + take: + reads + run_figaro + + main: + qc_bbduk(reads, "${projectDir}/assets/adapters.fa", run_figaro) + + fastqc(qc_bbduk.out.reads) + fastqc_ch = fastqc.out.reports + .map { sample, report -> return report } + .collect() + + assess_read_length_distribution(fastqc_ch) + homogenise_readlengths(qc_bbduk.out.reads, assess_read_length_distribution.out.read_length) + + hom_reads = homogenise_readlengths.out.reads.collect() + figaro(hom_reads, !params.single_end) + extract_trim_parameters(figaro.out.trim_params) + + emit: + reads = hom_reads + trim_params = extract_trim_parameters.out.trim_params + +} + + +workflow check_for_preprocessing { + take: + reads + + main: + reads.view() + fastqc(reads) + assess_read_length_distribution( + fastqc.out.reports + .map { sample, report -> return report } + .collect() + ) + + emit: + readlen_dist = assess_read_length_distribution.out.read_length + hom_reads_marker = assess_read_length_distribution.out.hom_reads_marker +} + + +process prepare_fastqs { + input: + tuple val(sample), path(fq) + + output: + tuple val(sample), path("fastq/${sample.id}/${sample.id}_R*.fastq.gz"), emit: reads + + script: + if (sample.is_paired) { + """ + mkdir -p fastq/${sample.id} + ln -sf ../../${fq[0]} 
fastq/${sample.id}/${sample.id}_R1.fastq.gz + ln -sf ../../${fq[1]} fastq/${sample.id}/${sample.id}_R2.fastq.gz + """ + } else { + """ + mkdir -p fastq/${sample.id} + ln -sf ../../${fq[0]} fastq/${sample.id}/${sample.id}_R1.fastq.gz + """ + } } @@ -166,34 +278,57 @@ workflow { return tuple(sample, file) } .groupTuple(sort: true) + .map { classify_sample(it[0], it[1]) } - if (!params.single_end) { - library_layout = "PAIRED"; - dada2_preprocess_script = "$projectDir/R_scripts/dada2_preprocess_paired.R" - dada2_analysis_script = "$projectDir/R_scripts/dada2_analysis_paired.R" - is_paired_end = true - } else { + prepare_fastqs(fastq_ch) + + if (params.single_end) { library_layout = "SINGLE"; dada2_preprocess_script = "$projectDir/R_scripts/dada2_preprocess_single.R" dada2_analysis_script = "$projectDir/R_scripts/dada2_analysis_single.R" is_paired_end = false + } else { + library_layout = "PAIRED"; + dada2_preprocess_script = "$projectDir/R_scripts/dada2_preprocess_paired.R" + dada2_analysis_script = "$projectDir/R_scripts/dada2_analysis_paired.R" + is_paired_end = true } print library_layout - fastq_ch.collect().view() + trim_params_ch = Channel.empty() + dada_reads_ch = Channel.empty() + + if (!params.preprocessed) { - files_only_ch = fastq_ch - .map { sample, files -> return files } - .collect() + /* check if dataset was preprocessed */ - files_only_ch.view() + check_for_preprocessing(prepare_fastqs.out.reads) - ltrim(files_only_ch) + raw_reads_figaro(prepare_fastqs.out.reads, check_for_preprocessing.out.hom_reads_marker) + trim_params_ch = trim_params_ch + .concat(raw_reads_figaro.out.trim_params) - figaro(ltrim.out.ltrimmed_reads, is_paired_end) - extract_trim_parameters(figaro.out.trim_params) + dada_reads_ch = dada_reads_ch.concat(raw_reads_figaro.out.reads) - dada2_preprocess(ltrim.out.ltrimmed_reads, extract_trim_parameters.out.trim_params, dada2_preprocess_script, is_paired_end) + } + + trim_params = file("${workDir}/trim_params.txt") + trim_params.text = "-1 -1\n" + + trim_params_ch = trim_params_ch + .concat(Channel.fromPath("${workDir}/trim_params.txt")) + + dada_reads_ch = dada_reads_ch.concat( + prepare_fastqs.out.reads + .map { sample, reads -> reads } + .collect() + ) + + dada2_preprocess(dada_reads_ch.first(), trim_params_ch.first(), dada2_preprocess_script, is_paired_end) dada2_analysis(dada2_preprocess.out.filtered_reads, dada2_preprocess.out.trim_table, dada2_analysis_script, is_paired_end) + asv2fasta(dada2_analysis.out.asv_sequences) + + mapseq(asv2fasta.out.asv_fasta, params.mapseq_db_path, params.mapseq_db_name) + mapseq_otutable(mapseq.out.bac_ssu) } diff --git a/gaga2_conda.yml b/misc/gaga2_conda.yml similarity index 100% rename from gaga2_conda.yml rename to misc/gaga2_conda.yml diff --git a/gaga2_install.sh b/misc/gaga2_install.sh similarity index 100% rename from gaga2_install.sh rename to misc/gaga2_install.sh diff --git a/scripts/check_readlengths.py b/misc/legacy_scripts/check_readlengths.py similarity index 94% rename from scripts/check_readlengths.py rename to misc/legacy_scripts/check_readlengths.py index ad172cb..6a69878 100644 --- a/scripts/check_readlengths.py +++ b/misc/legacy_scripts/check_readlengths.py @@ -36,6 +36,7 @@ def main(): print("WARNING: read lengths are not homogenous in {fq}. 
Figaro will not be executed.".format(fq=fn))
             open(os.path.join(args.output_dir, "SKIP_FIGARO"), "wt").close()
             return None
 
+    open(os.path.join(args.output_dir, "RUN_FIGARO"), "wt").close()
diff --git a/scripts/gather_fastq_files.py b/misc/legacy_scripts/gather_fastq_files.py
similarity index 100%
rename from scripts/gather_fastq_files.py
rename to misc/legacy_scripts/gather_fastq_files.py
diff --git a/misc/legacy_scripts/hltrim.py b/misc/legacy_scripts/hltrim.py
new file mode 100644
index 0000000..f9f572e
--- /dev/null
+++ b/misc/legacy_scripts/hltrim.py
@@ -0,0 +1,88 @@
+import os
+import sys
+import gzip
+import argparse
+import pathlib
+import contextlib
+
+from collections import Counter
+
+
+def read_fastq_stream(fq_in):
+    for line in fq_in:
+        _id, seq, _, qual = line, next(fq_in), next(fq_in), next(fq_in)
+        yield _id.strip(), seq.strip(), qual.strip()
+
+
+def process_fastq_record(fq_rec, cutlen):
+    if fq_rec is None:
+        return None
+
+    _id, seq, qual = fq_rec
+    _len = len(seq)
+
+    if _len < cutlen:
+        return None
+
+    return _id, seq[:cutlen], qual[:cutlen]
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    # ap.add_argument("--r1", type=str)
+    # ap.add_argument("--r2", type=str)
+    ap.add_argument("reads", nargs="*", type=str)
+    ap.add_argument("--outdir", "-o", type=str, default="output")
+    ap.add_argument("--cutlen", "-c", type=str)
+    args = ap.parse_args()
+
+    pathlib.Path(args.outdir).mkdir(exist_ok=True, parents=True)
+
+    cutlen = args.cutlen.split(",")
+    if len(cutlen) == 1:
+        r1cut, r2cut = int(cutlen[0]), None
+    else:
+        r1cut, r2cut, *_ = map(int, cutlen)
+
+    reads = sorted(args.reads)
+    r1 = reads[0]
+    r2 = reads[1] if len(reads) > 1 else None
+
+    _open = gzip.open if r1.endswith(".gz") else open
+
+
+    r1_in = _open(r1, "rt")
+    r2_in = _open(r2, "rt") if r2 else contextlib.nullcontext()
+
+    r1_out = _open(os.path.join(args.outdir, r1), "wt")
+    r2_out = _open(os.path.join(args.outdir, r2), "wt") if r2 else contextlib.nullcontext()
+
+    with r1_in, r2_in, r1_out, r2_out:
+        r1_stream = read_fastq_stream(r1_in)
+        r2_stream = read_fastq_stream(r2_in) if r2 else None
+
+        while True:
+            try:
+                r1_rec = process_fastq_record(next(r1_stream), r1cut)
+            except StopIteration:
+                break
+            if r2_stream:
+                try:
+                    r2_rec = process_fastq_record(next(r2_stream), r2cut)
+                except StopIteration:
+                    raise ValueError("r2 cannot be exhausted before r1")
+            else:
+                r2_rec = None
+
+            if r1_rec is not None and (r2_rec is not None or r2_stream is None):
+                print(*r1_rec[0:2], "+", r1_rec[-1], sep="\n", file=r1_out)
+                if r2_rec is not None:
+                    print(*r2_rec[0:2], "+", r2_rec[-1], sep="\n", file=r2_out)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/ltrim.py b/misc/legacy_scripts/ltrim.py
similarity index 100%
rename from scripts/ltrim.py
rename to misc/legacy_scripts/ltrim.py
diff --git a/modules/nevermore/functions.nf b/modules/nevermore/functions.nf
new file mode 100644
index 0000000..799253e
--- /dev/null
+++ b/modules/nevermore/functions.nf
@@ -0,0 +1,9 @@
+def classify_sample(sample, files) {
+
+    def meta = [:]
+    meta.is_paired = (files instanceof Collection && files.size() == 2)
+    meta.id = sample
+
+    return [meta, files]
+
+}
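For reference, the paired/single-end decision that `classify_sample` encodes is small enough to restate. Below is a standalone Python sketch (hypothetical, mirroring the Groovy function above, not part of the pipeline): a sample is treated as paired-end exactly when two read files were grouped under its name.

```python
# Hypothetical mirror of modules/nevermore/functions.nf::classify_sample:
# derive a sample "meta" record from the read files grouped for that sample.

def classify_sample(sample_id, files):
    # normalize a single file into a one-element list
    files = files if isinstance(files, (list, tuple)) else [files]
    meta = {
        "id": sample_id,
        "is_paired": len(files) == 2,  # exactly two files -> paired-end
    }
    return meta, files

meta, _ = classify_sample("sampleA", ["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz"])
assert meta["is_paired"]

meta, _ = classify_sample("sampleB", ["sampleB_R1.fastq.gz"])
assert not meta["is_paired"]
```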
diff --git a/modules/nevermore/qc/bbduk.nf b/modules/nevermore/qc/bbduk.nf
new file mode 100644
index 0000000..22a91b6
--- /dev/null
+++ b/modules/nevermore/qc/bbduk.nf
@@ -0,0 +1,33 @@
+
+process qc_bbduk {
+    label 'bbduk'
+    publishDir path: params.output_dir, mode: params.publish_mode, pattern: "${sample.id}/${sample.id}.bbduk_stats.txt"
+
+    input:
+    tuple val(sample), path(reads)
+    path(adapters)
+    path(run_sentinel)
+
+    output:
+    tuple val(sample), path("${sample.id}/${sample.id}_R*.fastq.gz"), emit: reads
+    tuple val(sample), path("${sample.id}/${sample.id}_O.fastq.gz"), emit: orphans, optional: true
+    path("${sample.id}/${sample.id}.bbduk_stats.txt")
+
+    script:
+    def maxmem = task.memory.toString().replace(/ GB/, "g")
+    def read1 = "in1=${sample.id}_R1.fastq.gz out1=${sample.id}/${sample.id}_R1.fastq.gz"
+    def read2 = sample.is_paired ? "in2=${sample.id}_R2.fastq.gz out2=${sample.id}/${sample.id}_R2.fastq.gz outs=${sample.id}/${sample.id}_O.fastq.gz" : ""
+
+    if (params.primers) {
+        qc_params = params.qc_params_primers
+        trim_params = "literal=${params.primers} minlen=${params.qc_minlen}"
+    } else {
+        qc_params = params.qc_params_adapters
+        trim_params = "ref=${adapters} minlen=${params.qc_minlen}"
+    }
+
+    """
+    mkdir -p ${sample.id}
+    bbduk.sh -Xmx${maxmem} t=${task.cpus} ordered=t ${qc_params} ${trim_params} stats=${sample.id}/${sample.id}.bbduk_stats.txt ${read1} ${read2}
+    """
+}
diff --git a/modules/nevermore/qc/fastqc.nf b/modules/nevermore/qc/fastqc.nf
new file mode 100644
index 0000000..2afd3fa
--- /dev/null
+++ b/modules/nevermore/qc/fastqc.nf
@@ -0,0 +1,22 @@
+process fastqc {
+    publishDir params.output_dir, mode: params.publish_mode, pattern: "raw_counts/*.txt"
+
+    input:
+    tuple val(sample), path(reads)
+
+    output:
+    tuple val(sample), path("fastqc/*/*fastqc_data.txt"), emit: reports
+    tuple val(sample), path("raw_counts/${sample.id}.txt"), emit: counts
+
+    script:
+    def process_r2 = (sample.is_paired) ? "fastqc -t $task.cpus --extract --outdir=fastqc ${sample.id}_R2.fastq.gz && mv fastqc/${sample.id}_R2_fastqc/fastqc_data.txt fastqc/${sample.id}_R2_fastqc/${sample.id}_R2_fastqc_data.txt" : "";
+
+    """
+    mkdir -p fastqc
+    mkdir -p raw_counts
+    fastqc -t $task.cpus --extract --outdir=fastqc ${sample.id}_R1.fastq.gz && mv fastqc/${sample.id}_R1_fastqc/fastqc_data.txt fastqc/${sample.id}_R1_fastqc/${sample.id}_R1_fastqc_data.txt
+    ${process_r2}
+    grep "Total Sequences" fastqc/*/*data.txt > seqcount.txt
+    echo \$(wc -l seqcount.txt)\$'\t'\$(head -n1 seqcount.txt | cut -f 2) > raw_counts/${sample.id}.txt
+    """
+}
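The `fastqc` process above derives its `raw_counts` output by grepping `Total Sequences` out of the plain-text `fastqc_data.txt` that FastQC writes in `--extract` mode. A minimal Python sketch of that parsing step (hypothetical helper, same file format):

```python
# Hypothetical helper: extract the read count from a FastQC fastqc_data.txt,
# which contains a line of the form "Total Sequences\t<n>".

def total_sequences(fastqc_data_txt: str) -> int:
    with open(fastqc_data_txt) as f:
        for line in f:
            if line.startswith("Total Sequences"):
                return int(line.rstrip("\n").split("\t")[1])
    raise ValueError(f"no 'Total Sequences' entry in {fastqc_data_txt}")

# e.g. total_sequences("fastqc/sampleA_R1_fastqc/sampleA_R1_fastqc_data.txt")
```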
"" : "${db_path}/${db_name}.fasta ${db_path}/*.tax*" + + """ + mkdir -p ${sample.id} + ${params.mapseq_bin} ${seqs} > ${sample.id}/${sample.id}_bac_ssu.mseq + """ +} + + +process mapseq_otutable { + publishDir params.output_dir, mode: params.publish_mode + + input: + path(mapped_seqs) + + output: + path("mapseq.otucounts.txt") + + script: + """ + ${params.mapseq_bin} -otucounts ${mapped_seqs} > mapseq.otucounts.txt + """ +} + + +process collate_mapseq_tables { + publishDir params.output_dir, mode: params.publish_mode + + input: + path(mapped_seqs) + + output: + path("mapseq_tables/mapseq_counts_genus_*_bac_ssu.tsv"), emit: ssu_tables + + script: + if (mapped_seqs.size() == 2) { + """ + mkdir -p mapseq_tables + ${params.mapseq_bin} -otutable -tl 5 \$(ls *_R1_bac_ssu.mseq) | sed 's/_R1_bac_ssu.mseq//g' > mapseq_tables/mapseq_counts_genus_fwd_bac_ssu.tsv + ${params.mapseq_bin} -otutable -tl 5 \$(ls *_R2_bac_ssu.mseq) | sed 's/_R2_bac_ssu.mseq//g' > mapseq_tables/mapseq_counts_genus_rev_bac_ssu.tsv + """ + } else { + """ + mkdir -p mapseq_tables + ${params.mapseq_bin} -otutable -tl 5 \$(ls *_R1_bac_ssu.mseq) | sed 's/_R1_bac_ssu.mseq//g' > mapseq_tables/mapseq_counts_genus_fwd_bac_ssu.tsv + """ + } +} diff --git a/nextflow.config b/nextflow.config index 8d27654..904d854 100644 --- a/nextflow.config +++ b/nextflow.config @@ -4,5 +4,5 @@ manifest { description = "DADA2/figaro-based 16S amplicon analysis pipeline" name = "gaga2" nextflowVersion = ">=21.0" - version = "0.3" + version = "0.4" } diff --git a/scripts/assess_readlengths.py b/scripts/assess_readlengths.py new file mode 100644 index 0000000..3135796 --- /dev/null +++ b/scripts/assess_readlengths.py @@ -0,0 +1,64 @@ +import argparse +import glob +import os + +from collections import Counter + +def parse_fastqc_report(f): + readlengths = list() + active = False + for line in open(f): + active = active or (not active and line.startswith(">>Sequence Length Distribution")) + if active: + if line.startswith(">>END_MODULE"): + break + if line[0] not in (">", "#"): + length, count = line.split("\t") + readlengths.append((int(length.split("-")[0]), float(count))) + return dict(item for item in readlengths if item[1] > 0) + + +def main(): + + ap = argparse.ArgumentParser() + ap.add_argument("input_dir", type=str, default=".") + args = ap.parse_args() + + read_yields = {} + for r in (1, 2): + read_lengths = Counter() + for fastq_report in glob.glob(os.path.join(args.input_dir, f"*{r}_fastqc_data.txt")): + read_lengths.update(parse_fastqc_report(fastq_report)) + + yields = list() + for length, count in read_lengths.items(): + yields.append( + (length, sum(v for k, v in read_lengths.items() if k >= length), sum(v * length for k, v in read_lengths.items() if k >= length)) + ) + for length, reads, bases in sorted(yields, key=lambda x:(x[2], x[1]), reverse=True): + read_yields.setdefault(r, list()).append((length, reads, bases)) + + r1_lengths = read_yields.get(1, list()) + r2_lengths = read_yields.get(2, list()) + + if not r2_lengths: + all_lengths = r1_lengths + is_hom = len(r1_lengths) == 1 + else: + if len(r1_lengths) < len(r2_lengths): + r1_lengths.extend(("NA", "NA", "NA") for i in range(len(r2_lengths) - len(r1_lengths))) + elif len(r2_lengths) < len(r1_lengths): + r2_lengths.extend(("NA", "NA", "NA") for i in range(len(r1_lengths) - len(r2_lengths))) + all_lengths = [x + y for x, y in zip(r1_lengths, r2_lengths)] + is_hom = len(r1_lengths) == len(r2_lengths) == 1 + + for item in all_lengths: + print(*item, sep="\t") + + if is_hom: + 
open("READSET_HOMOGENEOUS", "wt").close() + + + +if __name__ == "__main__": + main()