diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index e7d98850..4dc32a60 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -14,10 +14,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Launch workflow via tower +<<<<<<< HEAD uses: seqeralabs/action-tower-launch@922e5c8d5ac4e918107ec311d2ebbd65e5982b3d # v2 # TODO nf-core: You can customise AWS full pipeline tests as required # Add full size test data (but still relatively small datasets for few samples) # on the `test_full.config` test runs with only one set of parameters +======= + uses: seqeralabs/action-tower-launch@v2 +>>>>>>> dev with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f9546d96..89155ce2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,10 +22,24 @@ jobs: if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/taxprofiler') }}" runs-on: ubuntu-latest strategy: + fail-fast: false matrix: NXF_VER: - "23.04.0" - "latest-everything" + tags: + - "test" + - "test_nopreprocessing" + - "test_noprofiling" + - "test_krakenuniq" + - "test_malt" + - "test_motus" + - "test_falco" + - "test_fastp" + - "test_adapterremoval" + - "test_bbduk" + - "test_prinseqplusplus" + steps: - name: Check out pipeline code uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 @@ -35,12 +49,27 @@ jobs: with: version: "${{ matrix.NXF_VER }}" +<<<<<<< HEAD - name: Disk space cleanup uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 +======= + - name: Show current locale + run: locale + + - name: Set UTF-8 enabled locale + run: | + sudo locale-gen en_US.UTF-8 + sudo update-locale LANG=en_US.UTF-8 +>>>>>>> dev - name: Run pipeline with test data - # TODO nf-core: You can customise CI pipeline run tests as required - # For example: adding multiple test runs with different parameters - # Remember that you can parallelise this by using strategy.matrix run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + if [[ "${{ matrix.tags }}" == "test_motus" ]]; then + wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py + python downloadDB.py > download_db_log.txt + echo 'tool,db_name,db_params,db_path' > 'database_motus.csv' + echo 'motus,db_mOTU,,db_mOTU' >> 'database_motus.csv' + nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }}; + else + nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }}; + fi diff --git a/.prettierignore b/.prettierignore index 437d763d..abb4b4d6 100644 --- a/.prettierignore +++ b/.prettierignore @@ -10,3 +10,4 @@ testing/ testing* *.pyc bin/ +tests/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 313368e1..99b7669b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,10 +3,195 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
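For context on the `ci.yml` changes above: each entry in the new `tags` matrix corresponds to one test profile, and only the `test_motus` entry has to build a database on the fly before the run. The following is a rough local equivalent of a single matrix job (a sketch only, assuming Nextflow >= 23.04.0, Docker, Python, and a checkout of the pipeline in the current working directory standing in for `${GITHUB_WORKSPACE}`):

```bash
#!/usr/bin/env bash
set -euo pipefail

# Pick any profile from the CI matrix, e.g. test, test_krakenuniq, test_falco, ...
TAG="test_motus"

if [[ "${TAG}" == "test_motus" ]]; then
    # mOTUs ships no test database, so the CI job downloads one and writes a
    # minimal database sheet pointing at it before launching the pipeline.
    wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py
    python downloadDB.py > download_db_log.txt
    echo 'tool,db_name,db_params,db_path' >  database_motus.csv
    echo 'motus,db_mOTU,,db_mOTU'         >> database_motus.csv
    nextflow run . -profile docker,"${TAG}" --databases ./database_motus.csv --outdir "./results_${TAG}"
else
    nextflow run . -profile docker,"${TAG}" --outdir "./results_${TAG}"
fi
```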
+<<<<<<< HEAD ## v1.1.6dev - [date] +======= +## v1.1.6dev - [unreleased] + +### `Added` + +### `Fixed` + +### `Dependencies` + +### `Deprecated` + +## v1.1.5 - Augmented Akita Patch [2024-02-08] + +### `Added` + +- [#439](https://github.com/nf-core/taxprofiler/pull/439) Read deduplication with fastp (added by @maxibor) +- [#440](https://github.com/nf-core/taxprofiler/pull/440) Include mention of pre-built kaiju databases in tutorial.md (added by @Joon-Klaps) +- [#442](https://github.com/nf-core/taxprofiler/pull/442) Updated to nf-core pipeline template v2.12 (added by @sofstam) + +### `Fixed` + +- [#444](https://github.com/nf-core/taxprofiler/pull/444) Centrifuge now uses dedicated tmp directory to hopefully prevent mkfifo clashes (❤️ to @erinyoung for reporting, fix by @jfy133) + +### `Dependencies` + +| Tool | Previous version | New version | +| ---------- | ---------------- | ----------- | +| Centrifuge | 1.0.4_beta | 1.0.4.1 | + +### `Deprecated` + +## v1.1.4 - Augmented Akita Patch [2024-01-24] + +### `Added` + +### `Fixed` + +- [#431](https://github.com/nf-core/modules/pull/4781#event-11555493525) Updated kaiju2table module to report taxon names (fix by @Joon-Klaps) +- [#430](https://github.com/nf-core/taxprofiler/pull/430) Fix the fastq output in the module LONGREAD_HOSTREMOVAL. (fix by @LilyAnderssonLee) + +### `Dependencies` + +| Tool | Previous version | New version | +| ----- | ---------------- | ----------- | +| kaiju | 1.8.2 | 1.10.0 | + +### `Deprecated` + +## v1.1.3 - Augmented Akita Patch [2024-01-12] + +### `Added` + +- [#424](https://github.com/nf-core/taxprofiler/pull/424) Updated to nf-core pipeline template v2.11.1 (added by @LilyAnderssonLee & @sofstam) + +### `Fixed` + +- [#419](https://github.com/nf-core/taxprofiler/pull/419) Added improved syntax highlighting for tables in documentation (fix by @mashehu) +- [#421](https://github.com/nf-core/taxprofiler/pull/421) Updated the krakenuniq/preloadedkrakenuniq module that contained a fix for saving the output reads (❤️ to @SannaAb for reporting, fix by @Midnighter) +- [#427](https://github.com/nf-core/taxprofiler/pull/427) Fixed preprint information in the recommended methods text (fix by @jfy133) + +### `Dependencies` + +| Tool | Previous version | New version | +| ------------- | ---------------- | ----------- | +| multiqc | 1.15 | 1.19 | +| fastqc | 11.9 | 12.1 | +| nf-validation | unpinned | 1.1.3 | + +## v1.1.2 - Augmented Akita Patch [2023-10-27] + +### `Added` + +- [#408](https://github.com/nf-core/taxprofiler/pull/408) Added preprint citation information to README and manifest (added by @jfy133) + +### `Fixed` + +- [#405](https://github.com/nf-core/taxprofiler/pull/405) Fix database to tool mismatching in KAIJU2KRONA input (❤️ to @MajoroMask for reporting, fix by @jfy133) +- [#406](https://github.com/nf-core/taxprofiler/pull/406) Fix overwriting of bracken-derived kraken2 outputs when the database name is shared between Bracken/Kraken2. 
(❤️ to @MajoroMask for reporting, fix by @jfy133) +- [#409](https://github.com/nf-core/taxprofiler/pull/409) Fix a NullPointerException error occurring occasionally in older version of MEGAN's rma2info (❤️ to @MajoroMask for reporting, fix by @jfy133) + +### `Dependencies` + +| Tool | Previous version | New version | +| -------------- | ---------------- | ----------- | +| megan/rma2info | 6.21.7 | 6.24.20 | + +### `Deprecated` + +## v1.1.1 - Augmented Akita Patch [2023-10-11] + +### `Added` + +- [#379](https://github.com/nf-core/taxprofiler/pull/379) Added support for previously missing Bracken-corrected Kraken2 report as output (added by @hkaspersen & @jfy133 ) +- [#380](https://github.com/nf-core/taxprofiler/pull/380) Updated to nf-core pipeline template v2.10 (added by @LilyAnderssonLee & @sofstam) +- [#393](https://github.com/nf-core/taxprofiler/pull/383) Add validation check for a taxpasta taxonomy directory if --taxpasta*add*\* parameters requested (♥️ to @alimalrashed for reporting, added by @jfy133) + +### `Fixed` + +- [#383](https://github.com/nf-core/taxprofiler/pull/383) Update the module of KrakenUniq to the latest to account for edge case bugs where FASTQ input was mis-detected as wrong format (❤️ to @asafpr for reporting and solution, fixed by @LilyAnderssonLee) +- [#392](https://github.com/nf-core/taxprofiler/pull/392) Update the module of Taxpasta to support adding taxa information to results (❤️ to @SannaAb for reporting, fixed by @Midnighter) + +### `Dependencies` + +| Tool | Previous version | New version | +| ---------- | ---------------- | ----------- | +| KrakenUniq | 1.0.2 | 1.0.4 | +| taxpasta | 0.6.0 | 0.6.1 | + +### `Deprecated` + +## v1.1.0 - Augmented Akita [2023-09-19] + +### `Added` + +- [#298](https://github.com/nf-core/taxprofiler/pull/298) **New classifier** [ganon](https://pirovc.github.io/ganon/) (added by @jfy133) +- [#312](https://github.com/nf-core/taxprofiler/pull/312) **New classifier** [KMCP](https://github.com/shenwei356/kmcp) (added by @sofstam) +- [#318](https://github.com/nf-core/taxprofiler/pull/318) **New classifier** [MetaPhlAn4](https://github.com/biobakery/MetaPhlAn) (MetaPhlAn3 support remains) (added by @LilyAnderssonLee) +- [#276](https://github.com/nf-core/taxprofiler/pull/276) Implemented batching in the KrakenUniq samples processing (added by @Midnighter) +- [#272](https://github.com/nf-core/taxprofiler/pull/272) Add saving of final 'analysis-ready-reads' to dedicated directory (❤️ to @alexhbnr for request, added by @jfy133) +- [#303](https://github.com/nf-core/taxprofiler/pull/303) Add support for taxpasta profile standardisation in single sample pipeline runs (❤️ to @artur-matysik for request, added by @jfy133) +- [#308](https://github.com/nf-core/taxprofiler/pull/308) Add citations and bibliographic information to the MultiQC methods text of tools used in a given pipeline run (added by @jfy133) +- [#315](https://github.com/nf-core/taxprofiler/pull/315) Updated to nf-core pipeline template v2.9 (added by @sofstam & @jfy133) +- [#321](https://github.com/nf-core/taxprofiler/pull/321) Added support for virus hit expansion in Kaiju (❤️ to @dnlrxn for requesting, added by @jfy133) +- [#325](https://github.com/nf-core/taxprofiler/pull/325) Add ability to skip sequencing quality control tools (❤️ to @vinisalazar for requesting, added by @jfy133) +- [#345](https://github.com/nf-core/taxprofiler/pull/345) Add simple tutorial to explain how to get up and running with an nf-core/taxprofiler run (added by @jfy133) +- 
[#355](https://github.com/nf-core/taxprofiler/pull/355) Add support for TAXPASTA's `--add-rank-lineage` to output (❤️ to @MajoroMask for request, added by @Midnighter, @sofstam, @jfy133) +- [#368](https://github.com/nf-core/taxprofiler/pull/368/) Add the ability to ignore profile errors caused by empty profiles and other validation errors when merging multiple profiles using TAXPASTA (added by @Midnighter and @LilyAnderssonLee) + +### `Fixed` + +- [#271](https://github.com/nf-core/taxprofiler/pull/271) Improved standardised table generation documentation for mOTUs manual database download tutorial (♥ to @prototaxites for reporting, fix by @jfy133) +- [#269](https://github.com/nf-core/taxprofiler/pull/269) Reduced output files in AWS full test output due to very large files (fix by @jfy133) +- [#270](https://github.com/nf-core/taxprofiler/pull/270) Fixed warning for host removal index parameter, and improved index checks (♥ to @prototaxites for reporting, fix by @jfy133) +- [#274](https://github.com/nf-core/taxprofiler/pull/274) Substituted the samtools/bam2fq module with samtools/fastq module (fix by @sofstam) +- [#275](https://github.com/nf-core/taxprofiler/pull/275) Replaced function used for error reporting to more Nextflow friendly method (fix by @jfy133) +- [#285](https://github.com/nf-core/taxprofiler/pull/285) Fixed overly large log files in Kraken2 output (♥ to @prototaxites for reporting, fix by @Midnighter & @jfy133) +- [#286](https://github.com/nf-core/taxprofiler/pull/286) Runtime optimisation of MultiQC step via improved log file processing (fix by @Midnighter & @jfy133) +- [#289](https://github.com/nf-core/taxprofiler/pull/289) Pipeline updated to nf-core template 2.8 (fix by @Midnighter & @jfy133) +- [#290](https://github.com/nf-core/taxprofiler/pull/290) Minor database input documentation improvements (♥ to @alneberg for reporting, fix by @jfy133) +- [#305](https://github.com/nf-core/taxprofiler/pull/305) Fix docker/podman registry definition for tower compatibility (fix by @adamrtalbot, @jfy133) +- [#304](https://github.com/nf-core/taxprofiler/pull/304) Correct mistake in kaiju2table documentation, only single rank can be supplied (♥ to @artur-matysik for reporting, fix by @jfy133) +- [#307](https://github.com/nf-core/taxprofiler/pull/307) Fix databases being sometimes associated with the wrong tool (e.g. 
Kaiju) (fix by @jfy133, @Midnighter and @LilyAnderssonLee) +- [#313](https://github.com/nf-core/taxprofiler/pull/313) Fix pipeline not providing error when database sheet does not have a header (♥ to @noah472 for reporting, fix by @jfy133) +- [#330](https://github.com/nf-core/taxprofiler/pull/330) Added better tagging to allow disambiguation of Kraken2 steps of Kraken2 vs Bracken (♥ to @MajoroMask for requesting, added by @jfy133) +- [#334](https://github.com/nf-core/taxprofiler/pull/334) Increase the memory of the FALCO process to 4GB (fix by @LilyAnderssonLee) +- [#332](https://github.com/nf-core/taxprofiler/pull/332) Improved meta map stability for more robust pipeline resuming (fix by @jfy133) +- [#338](https://github.com/nf-core/taxprofiler/pull/338) Fixed wrong file 'out' file going to `centrifuge kreport` module (♥ to @LilyAnderssonLee for reporting, fix by @jfy133) +- [#342](https://github.com/nf-core/taxprofiler/pull/342) Fixed docs/usage to correctly list the required database files for Bracken and tips to obtain Kraken2 databases (fix by @husensofteng) +- [#350](https://github.com/nf-core/taxprofiler/pull/350) Reorganize the CI tests into separate profiles in preparation for implementation of nf-test (fix by @LilyAnderssonLee) +- [#364](https://github.com/nf-core/taxprofiler/pull/364) Add autoMounts to apptainer profile in nextflow.config (♥ to @hkaspersen for reporting, fix by @LilyAnderssonLee) +- [#372](https://github.com/nf-core/taxprofiler/pull/372) Update modules to use quay.io nf-core mirrored containers (♥ to @maxulysse for pointing out, fix by @LilyAnderssonLee and @jfy133) + +### `Dependencies` + +| Tool | Previous version | New version | +| --------- | ---------------- | ----------- | +| MultiQC | 1.13 | 1.15 | +| TAXPASTA | 0.2.3 | 0.6.0 | +| MetaPhlAn | 3.0.12 | 4.0.6 | +| fastp | 0.23.2 | 0.23.4 | +| samtools | 1.16.1 | 1.17 | + +### `Deprecated` + +- [#338](https://github.com/nf-core/taxprofiler/pull/338) Updated Centrifuge module to not generate (undocumented) SAM alignments by default if --save_centrifuge_reads supplied, due to a Centrifuge bug modifying profile header. SAM alignments can still be generated if `--out-fmt` supplied in `database.csv` (♥ to @LilyAnderssonLee for reporting, fix by @jfy133) + +## v1.0.1 - Dodgy Dachshund Patch [2023-05-15] + +### `Added` + +### `Fixed` + +- [#291](https://github.com/nf-core/taxprofiler/pull/291) - Fix Taxpasta not receiving taxonomy directory (❤️ to @SannaAb for reporting, fix by @jfy133) + +## v1.0.0 - Dodgy Dachshund [2023-03-13] +>>>>>>> dev Initial release of nf-core/taxprofiler, created with the [nf-core](https://nf-co.re/) template. +- Add read quality control (sequencing QC, adapter removal and merging) +- Add read complexity filtering +- Add host-reads removal step +- Add run merging +- Add taxonomic classification +- Add taxon table standardisation +- Add post-classification visualisation + ### `Added` ### `Fixed` diff --git a/CITATIONS.md b/CITATIONS.md index e10d62ae..b82fe7d7 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -2,11 +2,11 @@ ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/) -> Ewels PA, Peltzer A, Fillinger S, Patel H, Alneberg J, Wilm A, Garcia MU, Di Tommaso P, Nahnsen S. The nf-core framework for community-curated bioinformatics pipelines. Nat Biotechnol. 2020 Mar;38(3):276-278. doi: 10.1038/s41587-020-0439-x. PubMed PMID: 32055031. +> Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). 
The nf-core framework for community-curated bioinformatics pipelines. In Nature Biotechnology (Vol. 38, Issue 3). https://doi.org/10.1038/s41587-020-0439-x ## [Nextflow](https://pubmed.ncbi.nlm.nih.gov/28398311/) -> Di Tommaso P, Chatzou M, Floden EW, Barja PP, Palumbo E, Notredame C. Nextflow enables reproducible computational workflows. Nat Biotechnol. 2017 Apr 11;35(4):316-319. doi: 10.1038/nbt.3820. PubMed PMID: 28398311. +> Di Tommaso, P., Chatzou, M., Floden, E. W., Barja, P. P., Palumbo, E., & Notredame, C. (2017). Nextflow enables reproducible computational workflows. In Nature Biotechnology (Vol. 35, Issue 4). https://doi.org/10.1038/nbt.3820 ## Pipeline tools @@ -16,7 +16,103 @@ - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) - > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. + > Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: Summarize analysis results for multiple tools and samples in a single report. Bioinformatics, 32(19). https://doi.org/10.1093/bioinformatics/btw354 + +- [falco](https://doi.org/10.12688/f1000research.21142.2) + + > de Sena Brandine, G., & Smith, A. D. (2021). Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research, 8(1874), 1874. https://doi.org/10.12688/f1000research.21142.2 + +- [fastp](https://doi.org/10.1093/bioinformatics/bty560) + + > Chen, S., Zhou, Y., Chen, Y., & Gu, J. (2018). fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics , 34(17), i884–i890. https://doi.org/10.1093/bioinformatics/bty560 + +- [AdapterRemoval2](https://doi.org/10.1186/s13104-016-1900-2) + + > Schubert, M., Lindgreen, S., & Orlando, L. (2016). AdapterRemoval v2: rapid adapter trimming, identification, and read merging. BMC Research Notes, 9, 88. https://doi.org/10.1186/s13104-016-1900-2 + +- [Porechop](https://github.com/rrwick/Porechop) + + > Wick, R. R., Judd, L. M., Gorrie, C. L., & Holt, K. E. (2017). Completing bacterial genome assemblies with multiplex MinION sequencing. Microbial Genomics, 3(10), e000132. https://doi.org/10.1099/mgen.0.000132 + +- [Filtlong](https://github.com/rrwick/Filtlong) + + > Wick R (2021) Filtlong, URL: https://github.com/rrwick/Filtlong + +- [BBTools](http://sourceforge.net/projects/bbmap/) + + > Bushnell B. (2022) BBMap, URL: http://sourceforge.net/projects/bbmap/ + +- [PRINSEQ++](https://doi.org/10.7287/peerj.preprints.27553v1) + + > Cantu, V. A., Sadural, J., & Edwards, R. (2019). PRINSEQ++, a multi-threaded tool for fast and efficient quality control and preprocessing of sequencing datasets (e27553v1). PeerJ Preprints. https://doi.org/10.7287/peerj.preprints.27553v1 + +- [Bowtie2](https://doi.org/10.1038/nmeth.1923) + + > Langmead, B., & Salzberg, S. L. (2012). Fast gapped-read alignment with Bowtie 2. Nature Methods, 9(4), 357–359. https://doi.org/10.1038/nmeth.1923 + +- [minimap2](https://doi.org/10.1093/bioinformatics/bty191) + + > Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics , 34(18), 3094–3100. https://doi.org/10.1093/bioinformatics/bty191 + +- [SAMTools](https://doi.org/10.1093/gigascience/giab008) + + > Danecek, P., Bonfield, J. K., Liddle, J., Marshall, J., Ohan, V., Pollard, M. O., Whitwham, A., Keane, T., McCarthy, S. A., Davies, R. M., & Li, H. (2021). 
Twelve years of SAMtools and BCFtools. GigaScience, 10(2). https://doi.org/10.1093/gigascience/giab008 + +- [Bracken](https://doi.org/10.7717/peerj-cs.104) + + > Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ. Computer Science, 3(e104), e104. https://doi.org/10.7717/peerj-cs.104 + +- [Kraken2](https://doi.org/10.1186/s13059-019-1891-0) + + > Wood, D. E., Lu, J., & Langmead, B. (2019). Improved metagenomic analysis with Kraken 2. Genome Biology, 20(1), 257. https://doi.org/10.1186/s13059-019-1891-0 + +- [KrakenUniq](https://doi.org/10.1186/s13059-018-1568-0) + + > Breitwieser, F. P., Baker, D. N., & Salzberg, S. L. (2018). KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology, 19(1), 198. https://doi.org/10.1186/s13059-018-1568-0 + +- [MetaPhlAn](https://doi.org/10.1038/s41587-023-01688-w) + + > Blanco-Míguez, A., Beghini, F., Cumbo, F., McIver, L. J., Thompson, K. N., Zolfo, M., Manghi, P., Dubois, L., Huang, K. D., Thomas, A. M., Nickols, W. A., Piccinno, G., Piperni, E., Punčochář, M., Valles-Colomer, M., Tett, A., Giordano, F., Davies, R., Wolf, J., … Segata, N. (2023). Extending and improving metagenomic taxonomic profiling with uncharacterized species using MetaPhlAn 4. Nature Biotechnology, 1–12. https://doi.org/10.1038/s41587-023-01688-w + +- [MALT](https://doi.org/10.1038/s41559-017-0446-6) + + > Vågene, Å. J., Herbig, A., Campana, M. G., Robles García, N. M., Warinner, C., Sabin, S., Spyrou, M. A., Andrades Valtueña, A., Huson, D., Tuross, N., Bos, K. I., & Krause, J. (2018). Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nature Ecology & Evolution, 2(3), 520–528. https://doi.org/10.1038/s41559-017-0446-6 + +- [MEGAN](https://doi.org/10.1371/journal.pcbi.1004957) + + > Huson, D. H., Beier, S., Flade, I., Górska, A., El-Hadidi, M., Mitra, S., Ruscheweyh, H.-J., & Tappu, R. (2016). MEGAN Community Edition - Interactive Exploration and Analysis of Large-Scale Microbiome Sequencing Data. PLoS Computational Biology, 12(6), e1004957. https://doi.org/10.1371/journal.pcbi.1004957 + +- [DIAMOND](https://doi.org/10.1038/nmeth.3176) + + > Buchfink, B., Xie, C., & Huson, D. H. (2015). Fast and sensitive protein alignment using DIAMOND. Nature Methods, 12(1), 59–60. https://doi.org/10.1038/nmeth.3176 + +- [Centrifuge](https://doi.org/10.1101/gr.210641.116) + + > Kim, D., Song, L., Breitwieser, F. P., & Salzberg, S. L. (2016). Centrifuge: rapid and sensitive classification of metagenomic sequences. Genome Research, 26(12), 1721–1729. https://doi.org/10.1101/gr.210641.116 + +- [Kaiju](https://doi.org/10.1038/ncomms11257) + + > Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. https://doi.org/10.1038/ncomms11257 + +- [mOTUs](https://doi.org/10.1186/s40168-022-01410-z) + + > Ruscheweyh, H.-J., Milanese, A., Paoli, L., Karcher, N., Clayssen, Q., Keller, M. I., Wirbel, J., Bork, P., Mende, D. R., Zeller, G., & Sunagawa, S. (2022). Cultivation-independent genomes greatly expand taxonomic-profiling capabilities of mOTUs across various environments. Microbiome, 10(1), 212. https://doi.org/10.1186/s40168-022-01410-z + +- [KMCP](https://doi.org/10.1093/bioinformatics/btac845) + + > Shen, W., Xiang, H., Huang, T., Tang, H., Peng, M., Cai, D., Hu, P., & Ren, H. (2023). 
KMCP: accurate metagenomic profiling of both prokaryotic and viral populations by pseudo-mapping. Bioinformatics (Oxford, England), 39(1). https://doi.org/10.1093/bioinformatics/btac845 + +- [ganon](https://doi.org/10.1093/bioinformatics/btaa458) + + > Piro, V. C., Dadi, T. H., Seiler, E., Reinert, K., & Renard, B. Y. (2020). Ganon: Precise metagenomics classification against large and up-to-date sets of reference sequences. Bioinformatics (Oxford, England), 36(Suppl_1), i12–i20. https://doi.org/10.1093/bioinformatics/btaa458 + +- [Krona](https://doi.org/10.1186/1471-2105-12-385) + + > Ondov, B. D., Bergman, N. H., & Phillippy, A. M. (2011). Interactive metagenomic visualization in a Web browser. BMC Bioinformatics, 12. https://doi.org/10.1186/1471-2105-12-385 + +- [TAXPASTA](https://doi.org/10.21105/joss.05627) + + > Beber, M. E., Borry, M., Stamouli, S., & Fellows Yates, J. A. (2023). TAXPASTA: TAXonomic Profile Aggregation and STAndardisation. Journal of Open Source Software, 8(87), 5627. https://doi.org/10.21105/joss.05627 ## Software packaging/containerisation tools @@ -26,11 +122,11 @@ - [Bioconda](https://pubmed.ncbi.nlm.nih.gov/29967506/) - > Grüning B, Dale R, Sjödin A, Chapman BA, Rowe J, Tomkins-Tinch CH, Valieris R, Köster J; Bioconda Team. Bioconda: sustainable and comprehensive software distribution for the life sciences. Nat Methods. 2018 Jul;15(7):475-476. doi: 10.1038/s41592-018-0046-7. PubMed PMID: 29967506. + > Dale, R., Grüning, B., Sjödin, A., Rowe, J., Chapman, B. A., Tomkins-Tinch, C. H., Valieris, R., Batut, B., Caprez, A., Cokelaer, T., Yusuf, D., Beauchamp, K. A., Brinda, K., Wollmann, T., Corguillé, G. Le, Ryan, D., Bretaudeau, A., Hoogstrate, Y., Pedersen, B. S., … Köster, J. (2018). Bioconda: Sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7). https://doi.org/10.1038/s41592-018-0046-7 - [BioContainers](https://pubmed.ncbi.nlm.nih.gov/28379341/) - > da Veiga Leprevost F, Grüning B, Aflitos SA, Röst HL, Uszkoreit J, Barsnes H, Vaudel M, Moreno P, Gatto L, Weber J, Bai M, Jimenez RC, Sachsenberg T, Pfeuffer J, Alvarez RV, Griss J, Nesvizhskii AI, Perez-Riverol Y. BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics. 2017 Aug 15;33(16):2580-2582. doi: 10.1093/bioinformatics/btx192. PubMed PMID: 28379341; PubMed Central PMCID: PMC5870671. + > Da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: An open-source and community-driven framework for software standardization. Bioinformatics, 33(16). https://doi.org/10.1093/bioinformatics/btx192 - [Docker](https://dl.acm.org/doi/10.5555/2600239.2600241) @@ -38,4 +134,14 @@ - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) - > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. + > Kurtzer, G. M., Sochat, V., & Bauer, M. W. (2017). Singularity: Scientific containers for mobility of compute. PLoS ONE, 12(5). https://doi.org/10.1371/journal.pone.0177459 + +## Data + +- [Maixner (2021)](https://doi.org/10.1016/j.cub.2021.09.031) (CI Test Data) + + > Maixner, F., Sarhan, M. S., Huang, K. 
D., Tett, A., Schoenafinger, A., Zingale, S., Blanco-Míguez, A., Manghi, P., Cemper-Kiesslich, J., Rosendahl, W., Kusebauch, U., Morrone, S. R., Hoopmann, M. R., Rota-Stabelli, O., Rattei, T., Moritz, R. L., Oeggl, K., Segata, N., Zink, A., … Kowarik, K. (2021). Hallstatt miners consumed blue cheese and beer during the Iron Age and retained a non-Westernized gut microbiome until the Baroque period. Current Biology, 31(23). https://doi.org/10.1016/j.cub.2021.09.031 + +- [Meslier (2022)](https://doi.org/10.1038/s41597-022-01762-z) (AWS Full Test data) + + > Meslier, V., Quinquis, B., Da Silva, K., Plaza Oñate, F., Pons, N., Roume, H., Podar, M., & Almeida, M. (2022). Benchmarking second and third-generation sequencing platforms for microbial metagenomics. Scientific Data, 9(1). https://doi.org/10.1038/s41597-022-01762-z diff --git a/README.md b/README.md index fe76e3c0..a991a99d 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,17 @@

  [header logo markup (<picture>/<img alt="nf-core/taxprofiler">) changed between HEAD and dev; the image source paths are not recoverable in this extract]

+<<<<<<< HEAD [![GitHub Actions CI Status](https://github.com/nf-core/taxprofiler/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/taxprofiler/actions/workflows/ci.yml) [![GitHub Actions Linting Status](https://github.com/nf-core/taxprofiler/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/taxprofiler/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/taxprofiler/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +======= +[![GitHub Actions CI Status](https://github.com/nf-core/taxprofiler/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/taxprofiler/actions?query=workflow%3A%22nf-core+CI%22) +[![GitHub Actions Linting Status](https://github.com/nf-core/taxprofiler/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/taxprofiler/actions?query=workflow%3A%22nf-core+linting%22)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/taxprofiler/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.7728364-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.7728364) +>>>>>>> dev [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) @@ -16,53 +21,79 @@ [![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23taxprofiler-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/taxprofiler)[![Follow on Twitter](http://img.shields.io/badge/twitter-%40nf__core-1DA1F2?labelColor=000000&logo=twitter)](https://twitter.com/nf_core)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core) -## Introduction - -**nf-core/taxprofiler** is a bioinformatics pipeline that ... - - +[![Cite Preprint](https://img.shields.io/badge/Cite%20Us!-Cite%20Preprint-orange)](https://doi.org/10.1101/2023.10.20.563221) - - +## Introduction -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +**nf-core/taxprofiler** is a bioinformatics best-practice analysis pipeline for taxonomic classification and profiling of shotgun short- and long-read metagenomic data. It allows for in-parallel taxonomic identification of reads or taxonomic abundance estimation with multiple classification and profiling tools against multiple databases, and produces standardised output tables for facilitating results comparison between different tools and databases. + +## Pipeline summary + +![](docs/images/taxprofiler_tube.png) + +1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) or [`falco`](https://github.com/smithlabcode/falco) as an alternative option) +2. 
Performs optional read pre-processing + - Adapter clipping and merging (short-read: [fastp](https://github.com/OpenGene/fastp), [AdapterRemoval2](https://github.com/MikkelSchubert/adapterremoval); long-read: [porechop](https://github.com/rrwick/Porechop)) + - Low complexity and quality filtering (short-read: [bbduk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/), [PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus); long-read: [Filtlong](https://github.com/rrwick/Filtlong)) + - Host-read removal (short-read: [BowTie2](http://bowtie-bio.sourceforge.net/bowtie2/); long-read: [Minimap2](https://github.com/lh3/minimap2)) + - Run merging +3. Supports statistics for host-read removal ([Samtools](http://www.htslib.org/)) +4. Performs taxonomic classification and/or profiling using one or more of: + - [Kraken2](https://ccb.jhu.edu/software/kraken2/) + - [MetaPhlAn](https://huttenhower.sph.harvard.edu/metaphlan/) + - [MALT](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/malt/) + - [DIAMOND](https://github.com/bbuchfink/diamond) + - [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) + - [Kaiju](https://kaiju.binf.ku.dk/) + - [mOTUs](https://motu-tool.org/) + - [KrakenUniq](https://github.com/fbreitwieser/krakenuniq) + - [KMCP](https://github.com/shenwei356/kmcp) + - [ganon](https://pirovc.github.io/ganon/) +5. Perform optional post-processing with: + - [bracken](https://ccb.jhu.edu/software/bracken/) +6. Standardises output tables ([`Taxpasta`](https://taxpasta.readthedocs.io)) +7. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +8. Plotting Kraken2, Centrifuge, Kaiju and MALT results ([`Krona`](https://hpc.nih.gov/apps/kronatools.html)) ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - +Additionally, you will need a database sheet that looks as follows: -Now, you can run the pipeline using: +`databases.csv`: + +``` +tool,db_name,db_params,db_path +kraken2,db2,--quick,///kraken2/testdb-kraken2.tar.gz +metaphlan,db1,,///metaphlan/metaphlan_database/ +``` + +That includes directories or `.tar.gz` archives containing databases for the tools you wish to run the pipeline against. - +Now, you can run the pipeline using: ```bash nextflow run nf-core/taxprofiler \ -profile \ --input samplesheet.csv \ - --outdir + --databases databases.csv \ + --outdir \ + --run_kraken2 --run_metaphlan ``` > [!WARNING] @@ -81,9 +112,34 @@ For more details about the output files and reports, please refer to the nf-core/taxprofiler was originally written by James A. Fellows Yates, Sofia Stamouli, Moritz E. Beber, and the nf-core/taxprofiler team. -We thank the following people for their extensive assistance in the development of this pipeline: +### Team + +- [James A. Fellows Yates](https://github.com/jfy133) +- [Sofia Stamouli](https://github.com/sofstam) +- [Moritz E. Beber](https://github.com/Midnighter) + +We thank the following people for their contributions to the development of this pipeline: + +- [Lauri Mesilaakso](https://github.com/ljmesi) +- [Tanja Normark](https://github.com/talnor) +- [Maxime Borry](https://github.com/maxibor) +- [Thomas A. 
Christensen II](https://github.com/MillironX) +- [Jianhong Ou](https://github.com/jianhong) +- [Rafal Stepien](https://github.com/rafalstepien) +- [Mahwash Jamy](https://github.com/mjamy) +- [Lily Andersson Lee](https://github.com/LilyAnderssonLee) + +### Acknowledgments - +We also are grateful for the feedback and comments from: + +- The general [nf-core/community](https://nf-co.re/community) + +And specifically to + +- [Alex Hübner](https://github.com/alexhbnr) + +❤️ also goes to [Zandra Fagernäs](https://github.com/ZandraFagernas) for the logo. ## Contributions and Support @@ -93,10 +149,11 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations - - +If you use nf-core/taxprofiler for your analysis, please cite it using the following doi: [10.1101/2023.10.20.563221](https://doi.org/10.1101/2023.10.20.563221). + +> Stamouli, S., Beber, M. E., Normark, T., Christensen II, T. A., Andersson-Li, L., Borry, M., Jamy, M., nf-core community, & Fellows Yates, J. A. (2023). nf-core/taxprofiler: Highly parallelised and flexible pipeline for metagenomic taxonomic classification and profiling. In bioRxiv (p. 2023.10.20.563221). https://doi.org/10.1101/2023.10.20.563221 - +For the latest version of the code, cite the Zenodo doi: [10.5281/zenodo.7728364](https://doi.org/10.5281/zenodo.7728364) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index d5119e4a..8b9a651b 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,8 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/taxprofiler Methods Description" section_href: "https://github.com/nf-core/taxprofiler" plot_type: "html" -## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline -## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

Data was processed using nf-core/taxprofiler v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

@@ -17,12 +15,13 @@ data: |
  • Ewels, P. A., Peltzer, A., Fillinger, S., Patel, H., Alneberg, J., Wilm, A., Garcia, M. U., Di Tommaso, P., & Nahnsen, S. (2020). The nf-core framework for community-curated bioinformatics pipelines. Nature Biotechnology, 38(3), 276-278. doi: 10.1038/s41587-020-0439-x
  • Grüning, B., Dale, R., Sjödin, A., Chapman, B. A., Rowe, J., Tomkins-Tinch, C. H., Valieris, R., Köster, J., & Bioconda Team. (2018). Bioconda: sustainable and comprehensive software distribution for the life sciences. Nature Methods, 15(7), 475–476. doi: 10.1038/s41592-018-0046-7
  • da Veiga Leprevost, F., Grüning, B. A., Alves Aflitos, S., Röst, H. L., Uszkoreit, J., Barsnes, H., Vaudel, M., Moreno, P., Gatto, L., Weber, J., Bai, M., Jimenez, R. C., Sachsenberg, T., Pfeuffer, J., Vera Alvarez, R., Griss, J., Nesvizhskii, A. I., & Perez-Riverol, Y. (2017). BioContainers: an open-source and community-driven framework for software standardization. Bioinformatics (Oxford, England), 33(16), 2580–2582. doi: 10.1093/bioinformatics/btx192
  • +
  • Stamouli, S., Beber, M. E., Normark, T., Christensen, T. A., Andersson-Li, L., Borry, M., Jamy, M., nf-core community, & Fellows Yates, J. A. (2023). nf-core/taxprofiler: Highly parallelised and flexible pipeline for metagenomic taxonomic classification and profiling. (Preprint). bioRxiv 2023.10.20.563221. doi: 10.1101/2023.10.20.563221
  • ${tool_bibliography}
    Notes:
      - ${nodoi_text}
      + ${doi_text}
    • The command above does not include parameters contained in any configs or profiles that may have been used. Ensure the config file is also uploaded with your publication!
    • You should also cite all software used within this run. Check the "Software Versions" of this report to get version information.
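To illustrate the template above: once the placeholders are substituted at runtime (assuming, for example, the v1.1.5 release and the Zenodo DOI shown in the README; the exact wording generated for `${doi_text}` comes from the pipeline's helper code and is only approximated here), the opening of the injected methods text would read roughly:

```
Data was processed using nf-core/taxprofiler v1.1.5 (doi: 10.5281/zenodo.7728364) of the nf-core
collection of workflows (Ewels et al., 2020), utilising reproducible software environments from
the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.
```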
    diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 8e75884e..58909c5e 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -2,6 +2,7 @@ report_comment: > This report has been generated by the nf-core/taxprofiler analysis pipeline. For information about how to interpret these results, please see the documentation. + report_section_order: "nf-core-taxprofiler-methods-description": order: -1000 @@ -12,4 +13,273 @@ report_section_order: export_plots: true +<<<<<<< HEAD disable_version_detection: true +======= +custom_logo: "nf-core-taxprofiler_logo_custom_light.png" +custom_logo_url: https://nf-co.re/taxprofiler +custom_logo_title: "nf-core/taxprofiler" + +run_modules: + - fastqc + - adapterRemoval + - fastp + - bbduk + - prinseqplusplus + - porechop + - filtlong + - bowtie2 + - minimap2 + - samtools + - kraken + - kaiju + - diamond + - malt + - motus + - custom_content + +sp: + diamond: + fn_re: ".*.diamond.log$" + fastqc/data: + fn_re: ".*(fastqc|falco)_data.txt$" + fastqc/zip: + fn: "*_fastqc.zip" + +top_modules: + - "fastqc": + name: "FastQC / Falco (pre-Trimming)" + path_filters: + - "*raw*" + path_filters_exclude: + - "*processed*" + extra: "If used in this run, Falco is a drop-in replacement for FastQC producing the same output, written by Guilherme de Sena Brandine and Andrew D. Smith." + - "fastqc": + name: "FastQC / Falco (post-Trimming)" + path_filters: + - "*processed*" + path_filters_exclude: + - "*raw*" + extra: "If used in this run, Falco is a drop-in replacement for FastQC producing the same output, written by Guilherme de Sena Brandine and Andrew D. Smith." + - "fastp" + - "adapterRemoval" + - "porechop": + extra: "ℹ️: if you get the error message 'Error - was not able to plot data.' this means that porechop did not detect any adapters and therefore no statistics generated." + - "bbduk" + - "prinseqplusplus" + - "filtlong" + - "bowtie2": + name: "bowtie2" + - "samtools": + name: "Samtools Stats" + - "kraken": + name: "Kraken" + path_filters: + - "*.kraken2.kraken2.report.txt" + - "kraken": + name: "Bracken" + anchor: "bracken" + target: "Bracken" + doi: "10.7717/peerj-cs.104" + info: "Estimates species abundances in metagenomics samples by probabilistically re-distributing reads in the taxonomic tree." + extra: "ℹ️: plot title will say Kraken2 due to the first step of bracken producing the same output format as Kraken. Abundance information is currently not supported in MultiQC." + path_filters: + - "*.bracken.kraken2.report.txt" + - "kraken": + name: "Centrifuge" + anchor: "centrifuge" + target: "Centrifuge" + doi: "10.1101/gr.210641.116" + info: "is a very rapid and memory-efficient system for the classification of DNA sequences from microbial samples. The system uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index. Note: Figure title" + extra: "ℹ️: plot title will say Kraken2 due to Centrifuge producing the same output format as Kraken. If activated, see the actual Kraken2 results in the section above." + path_filters: + - "*.centrifuge.txt" + - "malt": + name: "MALT" + - "diamond" + - "kaiju": + name: "Kaiju" + - "motus" + +#It is not possible to set placement for custom kraken and centrifuge columns. 
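+# Note: "fastqc" appears twice in top_modules on purpose: the raw/processed path_filters split
+# FastQC/Falco results into the pre- and post-trimming report sections, and the "kraken" module
+# is reused for Bracken and Centrifuge because both emit Kraken-style report files.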
+ +table_columns_placement: + FastQC / Falco (pre-Trimming): + total_sequences: 100 + avg_sequence_length: 110 + median_sequence_length: 120 + percent_duplicates: 130 + percent_gc: 140 + percent_fails: 150 + FastQC / Falco (post-Trimming): + total_sequences: 200 + avg_sequence_length: 210 + median_sequence_length: 220 + percent_duplicates: 230 + percent_gc: 240 + percent_fails: 250 + fastp: + pct_adapter: 300 + pct_surviving: 310 + pct_duplication: 320 + after_filtering_gc_content: 330 + after_filtering_q30_rate: 340 + after_filtering_q30_bases: 350 + filtering_result_passed_filter_reads: 360 + Adapter Removal: + aligned_total: 360 + percent_aligned: 370 + percent_collapsed: 380 + percent_discarded: 390 + Porechop: + Input Reads: 400 + Start Trimmed: 410 + Start Trimmed Percent: 420 + End Trimmed: 430 + End Trimmed Percent: 440 + Middle Split: 450 + Middle Split Percent: 460 + Filtlong: + Target bases: 500 + BBDuk: + Input reads: 800 + Total Removed bases percent: 810 + Total Removed bases: 820 + Total Removed reads percent: 830 + Total Removed reads: 840 + PRINSEQ++: + prinseqplusplus_total: 900 + bowtie2: + overall_alignment_rate: 1000 + Samtools Stats: + raw_total_sequences: 1100 + reads_mapped: 1110 + reads_mapped_percent: 1120 + reads_properly_paired_percent: 1130 + non-primary_alignments: 1140 + reads_MQ0_percent: 1150 + error_rate: 1160 + Bracken: + "% Unclassified": 1200 + "% Top 5": 1210 + Centrifuge: + "% Unclassified": 1300 + "% Top 5": 1310 + DIAMOND: + queries_aligned: 1400 + Kaiju: + assigned: 1500 + "% Assigned": 1510 + "% Unclassified": 1520 + Kraken: + "% Unclassified": 1600 + "% Top 5": 1610 + MALT: + "Num. of queries": 1700 + Total reads: 1710 + Mappability: 1720 + Assig. Taxonomy: 1730 + Taxonomic assignment success: 1740 + motus: + Total number of reads: 1800 + Number of reads after filtering: 1810 + Total number of inserts: 1820 + Unique mappers: 1830 + Multiple mappers: 1840 + Ignored multiple mapper without unique hit: 1850 + "Number of ref-mOTUs": 1860 + "Number of meta-mOTUs": 1870 + "Number of ext-mOTUs": 1880 + +table_columns_visible: + FastQC / Falco (pre-Trimming): + total_sequences: True + avg_sequence_length: True + percent_duplicates: True + percent_gc: True + percent_fails: False + FastQC / Falco (post-Trimming): + total_sequences: True + avg_sequence_length: True + percent_duplicates: False + percent_gc: False + percent_fails: False + porechop: + Input reads: False + Start Trimmed: + Start Trimmed Percent: True + End Trimmed: False + End Trimmed Percent: True + Middle Split: False + Middle Split Percent: True + fastp: + pct_adapter: True + pct_surviving: True + pct_duplication: False + after_filtering_gc_content: False + after_filtering_q30_rate: False + after_filtering_q30_bases: False + Filtlong: + Target bases: True + Adapter Removal: + aligned_total: True + percent_aligned: True + percent_collapsed: True + percent_discarded: False + BBDuk: + Input reads: False + Total Removed bases Percent: False + Total Removed bases: False + Total Removed reads percent: True + Total Removed reads: False + "PRINSEQ++": + prinseqplusplus_total: True + bowtie2: + overall_alignment_rate: True + Samtools Stats: + raw_total_sequences: True + reads_mapped: True + reads_mapped_percent: True + reads_properly_paired_percent: False + non-primary_alignments: False + reads_MQ0_percent: False + error_rate: False + Kraken: False + Bracken: False + Centrifuge: False + DIAMOND: False + Kaiju: False + MALT: False + motus: False + +table_columns_name: + FastQC / Falco (pre-Trimming): 
+ total_sequences: "Nr. Input Reads" + avg_sequence_length: "Length Input Reads" + percent_gc: "% GC Input Reads" + percent_duplicates: "% Dups Input Reads" + percent_fails: "% Failed Input Reads" + FastQC / Falco (post-Trimming): + total_sequences: "Nr. Processed Reads" + avg_sequence_length: "Length Processed Reads" + percent_gc: "% GC Processed Reads" + percent_duplicates: "% Dups Processed Reads" + percent_fails: "% Failed Processed Reads" + Samtools Stats: + raw_total_sequences: "Nr. Reads Into Mapping" + reads_mapped: "Nr. Mapped Reads" + reads_mapped_percent: "% Mapped Reads" + +extra_fn_clean_exts: + - "kraken2.report.txt" + - ".txt" + - ".settings" + - ".bbduk" + - ".unmapped" + - "_filtered" + - type: remove + pattern: "_falco" + +section_comments: + general_stats: "By default, all read count columns are displayed as millions (M) of reads." +>>>>>>> dev diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab7..82565b15 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,6 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz +2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz, +2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,, +2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz, +ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,, diff --git a/assets/schema_database.json b/assets/schema_database.json new file mode 100644 index 00000000..a9a8f13a --- /dev/null +++ b/assets/schema_database.json @@ -0,0 +1,79 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/nf-core/taxprofiler/master/assets/schema_database.json", + "title": "nf-core/taxprofiler pipeline - params.database schema", + "description": "Schema for the file provided with params.database", + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string", + "exists": true, + "pattern": "^\\S+$", + "enum": [ + "bracken", + "centrifuge", + "diamond", + "ganon", + "kaiju", + "kmcp", + "kraken2", + "krakenuniq", + "malt", + "metaphlan", + "motus" + ], + "errorMessage": "Invalid tool name. Please see documentation for all supported profilers. Currently these classifers are included: bracken, centrifuge, diamond, ganon, kaiju, kmcp, kraken2, krakenuniq, malt, metaphlan, motus.", + "meta": ["tool"] + }, + "db_name": { + "type": "string", + "exists": true, + "pattern": "^\\S+$", + "errorMessage": "The unique name of the database should be provided.", + "meta": ["db_name"] + }, + "db_params": { + "type": "string", + "pattern": "^[^\"']*$", + "anyOf": [ + { + "properties": { + "tool": { "const": "bracken" } + }, + "not": { + "pattern": ".*;" + }, + "errorMessage": "Invalid database db_params entry. Bracken requires a semi-colon if passing parameter." + }, + { + "properties": { + "tool": { "const": "kmcp" } + }, + "pattern": ".*;$", + "errorMessage": "Invalid database db_params entry. KMCP only requires a semi-colon if passing arguments to KMCP profile, in cases of which the arguments should go after the semi-colon." 
+ }, + { + "not": { + "properties": { + "tool": { "enum": ["bracken", "kmcp"] } + } + }, + "errorMessage": "Invalid database db_params entry." + } + ], + "errorMessage": "Invalid database db_params entry. No quotes allowed.", + "meta": ["db_params"] + }, + "db_path": { + "type": "string", + "exists": true, + "format": "file-path", + "errorMessage": "The database path could not be found." + } + }, + "required": ["tool", "db_name", "db_path"], + "uniqueEntries": ["tool", "db_name"] + } +} diff --git a/assets/schema_input.json b/assets/schema_input.json index fe4d4a00..6acc00f7 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -9,25 +9,49 @@ "properties": { "sample": { "type": "string", - "pattern": "^\\S+$", "errorMessage": "Sample name must be provided and cannot contain spaces", "meta": ["id"] }, + "run_accession": { + "type": "string", + "errorMessage": "Run accession must be provided and cannot contain spaces." + }, + "instrument_platform": { + "type": "string", + "enum": [ + "ABI_SOLID", + "BGISEQ", + "CAPILLARY", + "COMPLETE_GENOMICS", + "DNBSEQ", + "HELICOS", + "ILLUMINA", + "ION_TORRENT", + "LS454", + "OXFORD_NANOPORE", + "PACBIO_SMRT" + ], + "errorMessage": "Sequencing platform must be provided." + }, "fastq_1": { "type": "string", "format": "file-path", - "exists": true, "pattern": "^\\S+\\.f(ast)?q\\.gz$", "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" }, "fastq_2": { "type": "string", "format": "file-path", - "exists": true, "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'. If not applicable, leave it empty." + }, + "fasta": { + "type": "string", + "format": "file-path", + "pattern": "^\\S+\\.(f(ast)?q|fa(sta)?)\\.gz$", + "errorMessage": "FastA file must be provided, cannot contain spaces and must have extension '.fa.gz' or '.fasta.gz'. If not applicable, leave it empty." } }, - "required": ["sample", "fastq_1"] + "required": ["sample", "run_accession", "instrument_platform"] } } diff --git a/conf/base.config b/conf/base.config index 372f0798..12ca2afa 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } @@ -24,11 +23,10 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } - memory = { check_max( 6.GB * task.attempt, 'memory' ) } + memory = { check_max( 1.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } withLabel:process_low { @@ -62,4 +60,32 @@ process { withName:CUSTOM_DUMPSOFTWAREVERSIONS { cache = false } + + withName: BRACKEN_BRACKEN { + errorStrategy = 'ignore' + } + + withName: CENTRIFUGE_KREPORT { + errorStrategy = {task.exitStatus == 255 ? 
'ignore' : 'retry'} + } + + withName: KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE { + errorStrategy = { task.exitStatus in [255,1] ? 'ignore' : 'retry' } + } + + withName: MEGAN_RMA2INFO_TSV { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withName: MEGAN_RMA2INFO_KRONA { + cpus = { check_max( 1 , 'cpus' ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } + withName: FALCO { + cpus = { check_max( 6 , 'cpus' ) } + memory = { check_max( 4.GB * task.attempt, 'memory' ) } + time = { check_max( 4.h * task.attempt, 'time' ) } + } } diff --git a/conf/modules.config b/conf/modules.config index e3ea8fa6..d7488250 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -12,14 +12,759 @@ process { - publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + withName: FASTQC { + ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_raw" } + publishDir = [ + path: { "${params.outdir}/fastqc/raw" }, + mode: params.publish_dir_mode, + pattern: '*.{html,zip}' + ] + } + + withName: FASTQC_PROCESSED { + ext.args = '--quiet' + ext.prefix = { "${meta.id}_${meta.run_accession}_processed" } + publishDir = [ + path: { "${params.outdir}/fastqc/processed" }, + mode: params.publish_dir_mode, + pattern: '*.{html,zip}' + ] + } + + withName: FALCO { + ext.prefix = { "${meta.id}_${meta.run_accession}_raw_falco" } + publishDir = [ + path: { "${params.outdir}/falco/raw" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt,zip}' + ] + } + + withName: FALCO_PROCESSED { + ext.prefix = { "${meta.id}_${meta.run_accession}_processed_falco" } + publishDir = [ + path: { "${params.outdir}/falco/processed" }, + mode: params.publish_dir_mode, + pattern: '*.{html,txt,zip}' + ] + } + + withName: FASTP_SINGLE { + ext.args = [ + // trimming options + params.shortread_qc_skipadaptertrim ? "--disable_adapter_trimming" : "", + params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter_sequence ${params.shortread_qc_adapter1}" : "", + // filtering options + "--length_required ${params.shortread_qc_minlength}", + (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp') ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '', + params.shortread_qc_dedup ? "--dedup" : "" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/fastp" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ], + [ + path: { "${params.outdir}/fastp" }, + mode: params.publish_dir_mode, + pattern: '*.{log,html,json}' + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_analysis_ready_fastqs, + // Don't know why `!` doesn't work here, but `== false` makes it work... + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? 
it : null } + ] + ] + } + + withName: FASTP_PAIRED { + ext.args = [ + // collapsing options - option to retain singletons + params.shortread_qc_includeunmerged ? '--include_unmerged' : '', + // trimming options + params.shortread_qc_skipadaptertrim ? "--disable_adapter_trimming" : "", + params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter_sequence ${params.shortread_qc_adapter1}" : "", + params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter2 ? "--adapter_sequence_r2 ${params.shortread_qc_adapter2}" : "--detect_adapter_for_pe", + // filtering options + "--length_required ${params.shortread_qc_minlength}", + params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == 'fastp' ? "--low_complexity_filter --complexity_threshold ${params.shortread_complexityfilter_fastp_threshold}" : '', + params.shortread_qc_dedup ? "--dedup" : "" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/fastp" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ], + [ + path: { "${params.outdir}/fastp" }, + mode: params.publish_dir_mode, + pattern: '*.{log,html,json}' + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: params.shortread_qc_mergepairs ? '*merged.fastq.gz' : '*.fastp.fastq.gz', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null } + ] + ] + } + + withName: ADAPTERREMOVAL_SINGLE { + ext.args = [ + // trimming options + params.shortread_qc_skipadaptertrim ? "--adapter1 ''" : params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "", + // filtering options + "--minlength ${params.shortread_qc_minlength}" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/adapterremoval" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ], + [ + path: { "${params.outdir}/adapterremoval" }, + mode: params.publish_dir_mode, + pattern: '*.settings' + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*truncated.fastq.gz', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null } + ] + ] + } + + withName: ADAPTERREMOVAL_PAIRED { + ext.args = [ + // collapsing options + params.shortread_qc_mergepairs ? "--collapse" : "", + // trimming options + params.shortread_qc_skipadaptertrim ? "--adapter1 ''" : params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter1 ? "--adapter1 ${params.shortread_qc_adapter1}" : "", // adding adapter list happens at module input channel level + params.shortread_qc_skipadaptertrim ? "--adapter2 ''" : params.shortread_qc_adapterlist ? "" : params.shortread_qc_adapter2 ? 
"--adapter2 ${params.shortread_qc_adapter2}" : "", + // filtering options + "--minlength ${params.shortread_qc_minlength}" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/adapterremoval" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ], + [ + path: { "${params.outdir}/adapterremoval" }, + mode: params.publish_dir_mode, + pattern: '*.settings' + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*{truncated.fastq,singleton.truncated}.gz', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && !params.shortread_qc_mergepairs && params.save_analysis_ready_fastqs ? it : null} + ] + ] + } + + // AdapterRemoval separate output merging + withName: CAT_FASTQ { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && !params.perform_shortread_complexityfilter && params.perform_shortread_qc && params.save_analysis_ready_fastqs ? it : null } + ] + ] + } + + withName: PORECHOP_PORECHOP { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/porechop" }, + mode: params.publish_dir_mode, + pattern: '*_porechopped.fastq.gz', + enabled: params.save_preprocessed_reads + ], + [ + path: { "${params.outdir}/porechop" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*_porechopped.fastq.gz', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && params.longread_qc_skipqualityfilter && !params.longread_qc_skipadaptertrim && params.perform_longread_qc && params.save_analysis_ready_fastqs ? it : null } + ] + ] + } + + withName: FILTLONG { + ext.args = [ + "--min_length ${params.longread_qc_qualityfilter_minlength}", + "--keep_percent ${params.longread_qc_qualityfilter_keeppercent}", + "--target_bases ${params.longread_qc_qualityfilter_targetbases}" + ] + .join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}_filtered" } + publishDir = [ + [ + path: { "${params.outdir}/filtlong" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_preprocessed_reads + ], + [ + path: { "${params.outdir}/filtlong" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_longread_hostremoval && !params.longread_qc_skipqualityfilter && params.perform_longread_qc && params.save_analysis_ready_fastqs ? 
it : null } + ] + ] + } + + withName: BBMAP_BBDUK { + ext.args = [ + "entropy=${params.shortread_complexityfilter_entropy}", + "entropywindow=${params.shortread_complexityfilter_bbduk_windowsize}", + params.shortread_complexityfilter_bbduk_mask ? "entropymask=t" : "entropymask=f" + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/bbduk/" }, + mode: params.publish_dir_mode, + pattern: '*.{fastq.gz}', + enabled: params.save_complexityfiltered_reads + ], + [ + path: { "${params.outdir}/bbduk/" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && params.shortread_complexityfilter_tool && params.save_analysis_ready_fastqs ? it : null } + ] + ] + } + + withName: PRINSEQPLUSPLUS { + ext.args = [ + params.shortread_complexityfilter_prinseqplusplus_mode == 'dust' ? "-lc_dust=${params.shortread_complexityfilter_prinseqplusplus_dustscore}" : "-lc_entropy=${params.shortread_complexityfilter_entropy}", + "-trim_qual_left=0 -trim_qual_left=0 -trim_qual_window=0 -trim_qual_step=0", + ].join(' ').trim() + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/prinseqplusplus/" }, + mode: params.publish_dir_mode, + pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}', + enabled: params.save_complexityfiltered_reads + ], + [ + path: { "${params.outdir}/prinseqplusplus/" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*{_good_out.fastq.gz,_good_out_R1.fastq.gz,_good_out_R2.fastq.gz}', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && !params.perform_shortread_hostremoval && params.shortread_complexityfilter_tool && params.save_analysis_ready_fastqs ? it : null } + ] + ] + } + + withName: BOWTIE2_BUILD { + publishDir = [ + [ + path: { "${params.outdir}/bowtie2/build" }, + mode: params.publish_dir_mode, + pattern: 'bowtie2', + enabled: params.save_hostremoval_index + ] + ] + } + + // Saving unmapped reads as FQ comes via input channel! + withName: BOWTIE2_ALIGN { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + [ + path: { "${params.outdir}/bowtie2/align" }, + mode: params.publish_dir_mode, + pattern: '*.log' + ], + [ + path: { "${params.outdir}/bowtie2/align" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + enabled: params.save_hostremoval_bam + ], + [ + path: { "${params.outdir}/bowtie2/align" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_hostremoval_unmapped + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + enabled: params.perform_shortread_hostremoval, + pattern: '*.fastq.gz', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun ) ) && params.perform_shortread_hostremoval && params.save_analysis_ready_fastqs ? 
it : null } + ] + ] + } + + withName: MINIMAP2_INDEX { + ext.args = '-x map-ont' + publishDir = [ + path: { "${params.outdir}/minimap2/index" }, + mode: params.publish_dir_mode, + pattern: '*.mmi', + enabled: params.save_hostremoval_index + ] + } + + withName: MINIMAP2_ALIGN { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/minimap2/align" }, + mode: params.publish_dir_mode, + pattern: '*.bam', + enabled: params.save_hostremoval_bam + ] + } + + withName: SAMTOOLS_VIEW { + ext.args = '-f 4' + ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" } + } + + withName: SAMTOOLS_FASTQ { + ext.prefix = { "${meta.id}_${meta.run_accession}.unmapped" } + publishDir = [ + [ + path: { "${params.outdir}/samtools/fastq" }, + mode: params.publish_dir_mode, + pattern: '*_other.fastq.gz', + enabled: params.save_hostremoval_unmapped + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*.fq.gz', + enabled: params.save_analysis_ready_fastqs, + saveAs: { ( params.perform_runmerging == false || ( params.perform_runmerging && !meta.is_multirun) ) && params.perform_longread_hostremoval && params.save_analysis_ready_fastqs ? it : null } + ] + ] + } + + withName: SAMTOOLS_STATS { + ext.prefix = { "${meta.id}_${meta.run_accession}" } + publishDir = [ + path: { "${params.outdir}/samtools/stats" }, + mode: params.publish_dir_mode, + pattern: '*stats' + ] + } + + withName: MERGE_RUNS { + ext.prefix = { "${meta.id}" } + publishDir = [ + [ + path: { "${params.outdir}/run_merging/" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.save_runmerged_reads + ], + [ + path: { "${params.outdir}/analysis_ready_fastqs" }, + mode: params.publish_dir_mode, + pattern: '*.fastq.gz', + enabled: params.perform_runmerging && params.save_analysis_ready_fastqs + ] + ] + } + + withName: MALT_RUN { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "${meta.db_params} -m ${params.malt_mode}" } + // one run with multiple samples, so fix ID to just db name to ensure clean log name + ext.prefix = { "${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/malt/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{rma6,log,sam}' + ] + } + + withName: 'MEGAN_RMA2INFO_TSV' { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = "-c2c Taxonomy" + ext.prefix = { "${meta.id}" } + publishDir = [ + path: { "${params.outdir}/malt/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt.gz,megan}' + ] + } + + withName: KRAKEN2_KRAKEN2 { + tag = { "${meta.db_name}|${meta.tool}|${meta.id}" } + ext.args = params.kraken2_save_minimizers ? { "${meta.db_params} --report-minimizer-data" } : { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { meta.tool == "bracken" ? "${meta.id}_${meta.db_name}.bracken" : "${meta.id}_${meta.db_name}.kraken2" } : { meta.tool == "bracken" ? "${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" : "${meta.id}_${meta.run_accession}_${meta.db_name}.kraken2" } + publishDir = [ + path: { "${params.outdir}/kraken2/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt,fastq.gz}' + ] + } + + withName: KRAKEN2_STANDARD_REPORT { + tag = { "${meta.db_name}|${meta.tool}|${meta.id}" } + ext.prefix = params.perform_runmerging ? { meta.tool == "bracken" ? "${meta.id}_${meta.db_name}.bracken" : "${meta.id}_${meta.db_name}.kraken2" } : { meta.tool == "bracken" ? 
"${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" : "${meta.id}_${meta.run_accession}_${meta.db_name}.kraken2" } + publishDir = [ + path: { "${params.outdir}/kraken2/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.report.txt' + ] + } + + withName: BRACKEN_BRACKEN { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.bracken" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.bracken" } + publishDir = [ + path: { "${params.outdir}/bracken/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*{.tsv,.txt}' ] + } +<<<<<<< HEAD withName: FASTQC { ext.args = '--quiet' +======= + withName: BRACKEN_COMBINEBRACKENOUTPUTS { + ext.prefix = { "bracken_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/bracken/" }, + mode: params.publish_dir_mode, + pattern: '*.txt' + ] + } + + withName: KRAKENTOOLS_COMBINEKREPORTS_KRAKEN { + ext.prefix = { "kraken2_${meta.db_name}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/kraken2/" }, + mode: params.publish_dir_mode, + pattern: '*.txt' + ] + } + + withName: KRAKENUNIQ_PRELOADEDKRAKENUNIQ { + ext.args = { "${meta.db_params}" } + // one run with multiple samples, so fix ID to just db name to ensure clean log name + ext.prefix = { "${meta.db_name}.krakenuniq" } + publishDir = [ + path: { "${params.outdir}/krakenuniq/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt,fasta.gz}' + ] + } + + withName: KRONA_CLEANUP { + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/krona/" }, + mode: params.publish_dir_mode, + pattern: '*.{html}' + ] + } + + withName: KRONA_KTIMPORTTEXT { + ext.prefix = { "${meta.tool}_${meta.id}" } + publishDir = [ + path: { "${params.outdir}/krona/" }, + mode: params.publish_dir_mode, + pattern: '*.{html}' + ] + } + + withName: 'MEGAN_RMA2INFO_KRONA' { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "--read2class Taxonomy" } + ext.prefix = { "${meta.id}_${meta.db_name}" } + } + + withName: KRONA_KTIMPORTTAXONOMY { + ext.args = "-i" + ext.prefix = { "${meta.tool}_${meta.id}" } + publishDir = [ + path: { "${params.outdir}/krona/" }, + mode: params.publish_dir_mode, + pattern: '*.{html}' + ] + } + + withName: METAPHLAN_METAPHLAN { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.metaphlan" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.metaphlan" } + publishDir = [ + path: { "${params.outdir}/metaphlan/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{biom,txt}' + ] + } + + withName: METAPHLAN_MERGEMETAPHLANTABLES { + ext.prefix = { "metaphlan_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/metaphlan/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: CENTRIFUGE_CENTRIFUGE { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? 
{ "${meta.id}_${meta.db_name}.centrifuge" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.centrifuge" } + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt,sam,tab,gz}' + ] + } + + withName: CENTRIFUGE_KREPORT { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.centrifuge" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.centrifuge" } + publishDir = [ + path: { "${params.outdir}/centrifuge/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE { + ext.prefix = { "centrifuge_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/centrifuge/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: KAIJU_KAIJU { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.kaiju" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.kaiju" } + publishDir = [ + path: { "${params.outdir}/kaiju/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' + ] + } + + withName: 'KAIJU_KAIJU2TABLE_SINGLE' { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = {[ + params.kaiju_expand_viruses ? "-e" : "" + ].join(' ').trim() } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.kaijutable" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.kaijutable" } + publishDir = [ + path: { "${params.outdir}/kaiju/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: 'KAIJU_KAIJU2TABLE_COMBINED' { + ext.prefix = { "kaiju_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/kaiju/" }, + mode: params.publish_dir_mode, + pattern: '*.{txt}' + ] + } + + withName: KAIJU_KAIJU2KRONA { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = '-v -u' + } + + withName: DIAMOND_BLASTX { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.diamond" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.diamond" } + publishDir = [ + path: { "${params.outdir}/diamond/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{blast,xml,txt,daa,sam,tsv,paf,log}' + ] + } + + withName: MOTUS_PROFILE { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { + [ + params.motus_remove_ncbi_ids ? "" : "-p", + params.motus_use_relative_abundance ? "" : "-c", + params.motus_save_mgc_read_counts ? "-M ${task.ext.prefix}.mgc" : "" + ].join(',').replaceAll(','," ") + } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}" } + publishDir = [ + path: { "${params.outdir}/motus/${meta.db_name}/" }, + mode: params.publish_dir_mode + ] + } + + withName: MOTUS_MERGE { + ext.args = { params.standardisation_motus_generatebiom ? "-B" : "" } + ext.prefix = { "motus_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/motus/" }, + mode: params.publish_dir_mode + ] + } + + withName: KMCP_SEARCH { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? 
{ "${meta.id}_${meta.db_name}.kmcp_search" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.kmcp_search" } + publishDir = [ + path: { "${params.outdir}/kmcp/${meta.db_name}/" }, + mode: params.publish_dir_mode, + enabled: params.kmcp_save_search + ] + } + + withName: KMCP_PROFILE { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.kmcp" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.kmcp" } + publishDir = [ + path: { "${params.outdir}/kmcp/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{profile}' + ] + } + + withName: GANON_CLASSIFY { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = params.ganon_save_readclassifications ? { "${meta.db_params} --output-all --output-lca --output-unclassified" } : { "${meta.db_params}" } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.ganon" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.ganon" } + publishDir = [ + path: { "${params.outdir}/ganon/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{tre,rep,lca,all,unc}' + ] + } + + withName: GANON_REPORT { + tag = {"${meta.db_name}|${meta.id}"} + ext.args = {[ + "--report-type ${params.ganon_report_type}", + ganon_report_rank != 'default' ? "--ranks ${params.ganon_report_rank}" : "", + "--top-percentile ${params.ganon_report_toppercentile}", + "--min-count ${params.ganon_report_mincount}", + "--max-count ${params.ganon_report_maxcount}" + ].join(' ').trim() } + ext.prefix = params.perform_runmerging ? { "${meta.id}_${meta.db_name}.ganon_report" } : { "${meta.id}_${meta.run_accession}_${meta.db_name}.ganon_report" } + publishDir = [ + path: { "${params.outdir}/ganon/${meta.db_name}/" }, + mode: params.publish_dir_mode, + pattern: '*.{tre}' + ] + } + + withName: GANON_TABLE { + ext.prefix = { "ganon_${meta.id}_combined_reports" } + publishDir = [ + path: { "${params.outdir}/ganon/" }, + mode: params.publish_dir_mode, + pattern: '*.txt' + ] + } + + withName: TAXPASTA_MERGE { + tag = { "${meta.tool}|${meta.id}" } + ext.args = { + [ + "-p ${meta.tool} -o ${meta.tool}_${meta.id}.${params.standardisation_taxpasta_format}", + params.taxpasta_add_name ? "--add-name" : "", + params.taxpasta_add_rank ? "--add-rank" : "", + params.taxpasta_add_lineage ? "--add-lineage" : "", + params.taxpasta_add_idlineage ? "--add-id-lineage" : "", + params.taxpasta_add_ranklineage ? "--add-rank-lineage" : "", + params.taxpasta_ignore_errors ? "--ignore-errors" : "" + ].join(' ').trim() + } + publishDir = [ + path: { "${params.outdir}/taxpasta/" }, + mode: params.publish_dir_mode, + pattern: '*.{tsv,csv,arrow,parquet,biom}' + ] + } + + withName: TAXPASTA_STANDARDISE { + tag = { "${meta.tool}|${meta.id}" } + ext.args = { + [ + "-p ${meta.tool} -o ${meta.tool}_${meta.id}.${params.standardisation_taxpasta_format}", + params.taxpasta_add_name ? "--add-name" : "", + params.taxpasta_add_rank ? "--add-rank" : "", + params.taxpasta_add_lineage ? "--add-lineage" : "", + params.taxpasta_add_idlineage ? 
"--add-id-lineage" : "" + ].join(' ').trim() + } + publishDir = [ + path: { "${params.outdir}/taxpasta/" }, + mode: params.publish_dir_mode, + pattern: '*.{tsv,csv,arrow,parquet,biom}' + ] +>>>>>>> dev } withName: CUSTOM_DUMPSOFTWAREVERSIONS { diff --git a/conf/test.config b/conf/test.config index 42772cfe..c11f27b6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,10 +20,45 @@ params { max_time = '6.h' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_mergepairs = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = true + run_malt = false + run_metaphlan = true + run_centrifuge = true + run_diamond = true + run_krakenuniq = true + run_motus = false + run_ganon = true + run_krona = true + run_kmcp = true + kmcp_mode = 0 + krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab' + malt_save_reads = true + kraken2_save_reads = true + centrifuge_save_reads = true + run_profile_standardisation = true +} - // Genome references - genome = 'R64-1-1' +process { + withName: MALT_RUN { + maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } + } + withName: MEGAN_RMA2INFO_TSV { + maxForks = 1 + } + withName: MEGAN_RMA2INFO_KRONA { + maxForks = 1 + } } diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config new file mode 100644 index 00000000..c3422d02 --- /dev/null +++ b/conf/test_adapterremoval.config @@ -0,0 +1,52 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile for adapterremoval' + config_profile_description = "Minimal test to check the alternative short-read QC function, adapterremoval" + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_tool = 'adapterremoval' + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_ganon = false + run_kmcp = false + kmcp_mode = 0 +} + +process { + withName: MALT_RUN { + maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } + } +} diff --git a/conf/test_bbduk.config b/conf/test_bbduk.config new file mode 100644 index 00000000..623fe191 --- /dev/null +++ b/conf/test_bbduk.config @@ -0,0 +1,52 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile for bbduk' + config_profile_description = "Minimal test to check the default tool of short-read complexity filtering, bbduk" + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + perform_shortread_qc = true + perform_longread_qc = true + perform_shortread_complexityfilter = true + shortread_complexityfilter_tool = 'bbduk' + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_ganon = false + run_kmcp = false + kmcp_mode = 0 +} + +process { + withName: MALT_RUN { + maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } + } +} diff --git a/conf/test_falco.config b/conf/test_falco.config new file mode 100644 index 00000000..3fb77c03 --- /dev/null +++ b/conf/test_falco.config @@ -0,0 +1,52 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile for Falco' + config_profile_description = "Minimal test dataset without performing any preprocessing nor profiling to check pipeline function but running falco instead of fastqc. 
Useful when you only wish to test a single profiler without having to 'opt-out' of all the others" + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + preprocessing_qc_tool = 'falco' + perform_shortread_qc = true + perform_longread_qc = true + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_ganon = false + run_kmcp = false + kmcp_mode = 0 +} + +process { + withName: MALT_RUN { + maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } + } +} diff --git a/conf/test_fastp.config b/conf/test_fastp.config new file mode 100644 index 00000000..3feeae7a --- /dev/null +++ b/conf/test_fastp.config @@ -0,0 +1,53 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile for fastp' + config_profile_description = "Minimal test to check the default short-read QC function, fastp" + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_tool = 'fastp' + perform_shortread_complexityfilter = true + shortread_complexityfilter_tool = 'fastp' + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_ganon = false + run_kmcp = false + kmcp_mode = 0 +} + +process { + withName: MALT_RUN { + maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } + } +} diff --git a/conf/test_full.config b/conf/test_full.config index 49a10a0f..2a74a80b 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -1,12 +1,10 @@ /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for running full-size tests 
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Defines input files and everything required to run a full size pipeline test. - Use as follows: nextflow run nf-core/taxprofiler -profile test_full, --outdir - ---------------------------------------------------------------------------------------- */ @@ -15,10 +13,66 @@ params { config_profile_description = 'Full test dataset to check pipeline function' // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + input = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/samplesheet_full.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_full_v1.1.csv' // Genome references - genome = 'R64-1-1' + hostremoval_reference = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/819/615/GCA_000819615.1_ViralProj14015/GCA_000819615.1_ViralProj14015_genomic.fna.gz' + + save_preprocessed_reads = false + + perform_shortread_qc = true + shortread_qc_mergepairs = true + perform_shortread_complexityfilter = false + save_complexityfiltered_reads = false + + perform_longread_qc = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + save_hostremoval_index = false + save_hostremoval_bam = false + save_hostremoval_unmapped = false + + perform_runmerging = true + save_runmerged_reads = false + + save_analysis_ready_fastqs = true + + run_centrifuge = true + centrifuge_save_reads = false + + run_diamond = true + + run_kaiju = true + + run_kraken2 = true + kraken2_save_reads = false + kraken2_save_readclassifications = false + kraken2_save_minimizers = false + + run_krakenuniq = true + krakenuniq_save_reads = false + krakenuniq_save_readclassifications = false + + run_bracken = true + + run_malt = true + malt_save_reads = false + malt_generate_megansummary = true + + run_metaphlan = true + + run_motus = true + motus_save_mgc_read_counts = true + + run_ganon = true + ganon_save_readclassifications = true + + run_kmcp = true + kmcp_save_search = true + + run_profile_standardisation = true + run_krona = true } + +cleanup = true diff --git a/conf/test_krakenuniq.config b/conf/test_krakenuniq.config new file mode 100644 index 00000000..e93de158 --- /dev/null +++ b/conf/test_krakenuniq.config @@ -0,0 +1,68 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// +// Separate test as KrakenUniq database can sometimes be too big for GHA +// + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test to check KrakenUniq function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_krakenuniq.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_mergepairs = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = true + run_motus = false + run_kmcp = false + kmcp_mode = 0 + run_ganon = false + run_krona = true + krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab' + malt_save_reads = false + kraken2_save_reads = false + centrifuge_save_reads = false + diamond_save_reads = false + run_profile_standardisation = true +} + +process { + withName: MALT_RUN { + maxForks = 1 + } + withName: MEGAN_RMA2INFO_TSV { + maxForks = 1 + } + withName: MEGAN_RMA2INFO_KRONA { + maxForks = 1 + } +} diff --git a/conf/test_malt.config b/conf/test_malt.config new file mode 100644 index 00000000..7e5f2df3 --- /dev/null +++ b/conf/test_malt.config @@ -0,0 +1,54 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// +// Separate test for malt +// + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test to check malt function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/samplesheet_malt.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + perform_shortread_qc = false + perform_longread_qc = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_bracken = false + run_malt = true + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_ganon = false + run_kmcp = false + kmcp_mode = 0 +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/conf/test_motus.config b/conf/test_motus.config new file mode 100644 index 00000000..ef1a2276 --- /dev/null +++ b/conf/test_motus.config @@ -0,0 +1,52 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +// +// Separate test as mOTUs database download can be flaky +// + +params { + config_profile_name = 'mOTUs Test profile' + config_profile_description = 'Minimal test to check mOTUs function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'database_motus.csv' + perform_shortread_qc = false + perform_longread_qc = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = true + run_kmcp = false + kmcp_mode = 0 + run_ganon = false + motus_save_mgc_read_counts = false + motus_remove_ncbi_ids = false + motus_use_relative_abundance = false + run_profile_standardisation = true +} diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config new file mode 100644 index 00000000..004a49e8 --- /dev/null +++ b/conf/test_nopreprocessing.config @@ -0,0 +1,52 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset skipping all preprocessing to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + perform_shortread_qc = false + perform_longread_qc = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = true + run_malt = false // too big with other profiles on GHA + run_metaphlan = true + run_centrifuge = true + run_diamond = true + run_krakenuniq = true + run_motus = false + run_kmcp = true + kmcp_mode = 0 + run_ganon = true + run_krona = true +} + +process { + withName: MALT_RUN { + maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } + } +} diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config new file mode 100644 index 00000000..7cf2317d --- /dev/null +++ b/conf/test_noprofiling.config @@ -0,0 +1,51 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset without performing any profiling to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_mergepairs = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_kmcp = false + kmcp_mode = 0 + run_ganon = false +} + +process { + withName: MALT_RUN { + maxForks = 1 + } +} diff --git a/conf/test_nothing.config b/conf/test_nothing.config new file mode 100644 index 00000000..ed247ef4 --- /dev/null +++ b/conf/test_nothing.config @@ -0,0 +1,51 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile' + config_profile_description = "Minimal test dataset without performing any preprocessing nor profiling to check pipeline function. 
Useful when you only wish to test a single profiler without having to 'opt-out' of all the others" + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + perform_shortread_qc = false + perform_longread_qc = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_kmcp = false + kmcp_mode = 0 + run_ganon = false +} + +process { + withName: MALT_RUN { + maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } + } +} diff --git a/conf/test_prinseqplusplus.config b/conf/test_prinseqplusplus.config new file mode 100644 index 00000000..acc23aa8 --- /dev/null +++ b/conf/test_prinseqplusplus.config @@ -0,0 +1,52 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. + + Use as follows: + nextflow run nf-core/taxprofiler -profile test, --outdir + +---------------------------------------------------------------------------------------- +*/ + +params { + config_profile_name = 'Test profile for prinseqplusplus' + config_profile_description = "Minimal test to check the alternative tool of short-read complexity filtering, prinseqplusplus" + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + perform_shortread_qc = true + perform_longread_qc = true + perform_shortread_complexityfilter = true + shortread_complexityfilter_tool = 'prinseqplusplus' + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_ganon = false + run_kmcp = false + kmcp_mode = 0 +} + +process { + withName: MALT_RUN { + maxForks = 1 + ext.args = { "-m ${params.malt_mode} -J-Xmx12G" } + } +} diff --git a/docs/images/nf-core-taxprofiler_icon.png b/docs/images/nf-core-taxprofiler_icon.png new file mode 100644 index 00000000..c639fb67 Binary files /dev/null and b/docs/images/nf-core-taxprofiler_icon.png differ diff --git a/docs/images/nf-core-taxprofiler_icon.svg b/docs/images/nf-core-taxprofiler_icon.svg new file mode 100644 index 00000000..24e615ff --- /dev/null +++ 
b/docs/images/nf-core-taxprofiler_icon.svg @@ -0,0 +1,444 @@ [444 added lines of SVG markup omitted: nf-core/taxprofiler icon]
diff --git a/docs/images/nf-core-taxprofiler_icon_border.svg b/docs/images/nf-core-taxprofiler_icon_border.svg new file mode 100644 index 00000000..887e8e82 --- /dev/null +++ b/docs/images/nf-core-taxprofiler_icon_border.svg @@ -0,0 +1,445 @@ [445 added lines of SVG markup omitted: nf-core/taxprofiler icon with border]
diff --git a/docs/images/nf-core-taxprofiler_logo_custom_dark.png b/docs/images/nf-core-taxprofiler_logo_custom_dark.png new file mode 100644 index 00000000..6b089fc1 Binary files /dev/null and b/docs/images/nf-core-taxprofiler_logo_custom_dark.png differ
diff --git a/docs/images/nf-core-taxprofiler_logo_custom_dark.svg b/docs/images/nf-core-taxprofiler_logo_custom_dark.svg new file mode 100644 index 00000000..3d47b4c6 --- /dev/null +++ b/docs/images/nf-core-taxprofiler_logo_custom_dark.svg @@ -0,0 +1,2302 @@ [2302 added lines of SVG markup omitted: 'taxprofiler' wordmark logo, dark theme]
diff --git a/docs/images/nf-core-taxprofiler_logo_custom_light.png b/docs/images/nf-core-taxprofiler_logo_custom_light.png new file mode 100644 index 00000000..2dc85b81 Binary files /dev/null and b/docs/images/nf-core-taxprofiler_logo_custom_light.png differ
diff --git a/docs/images/nf-core-taxprofiler_logo_custom_light.svg b/docs/images/nf-core-taxprofiler_logo_custom_light.svg new file mode 100644 index 00000000..dae1fbe0 --- /dev/null +++ b/docs/images/nf-core-taxprofiler_logo_custom_light.svg @@ -0,0 +1,2305 @@ [2305 added lines of SVG markup omitted: 'taxprofiler' wordmark logo, light theme]
diff --git a/docs/images/nf_core_taxprofiler_icon_border.png b/docs/images/nf_core_taxprofiler_icon_border.png new file mode 100644 index 00000000..c513de0c Binary files /dev/null and b/docs/images/nf_core_taxprofiler_icon_border.png differ
diff --git a/docs/images/taxprofiler_logo.svg b/docs/images/taxprofiler_logo.svg new file mode 100644 index 00000000..c9aefbd2 --- /dev/null +++ b/docs/images/taxprofiler_logo.svg @@ -0,0 +1,3223 @@ [3223 added lines of SVG markup omitted: 'taxprofiler' wordmark logo]
diff --git a/docs/images/taxprofiler_tube.pdf b/docs/images/taxprofiler_tube.pdf new file mode 100644 index 00000000..024d4aca Binary files /dev/null and b/docs/images/taxprofiler_tube.pdf differ
diff --git a/docs/images/taxprofiler_tube.png b/docs/images/taxprofiler_tube.png new file mode 100644 index 00000000..b9119af4 Binary files /dev/null and b/docs/images/taxprofiler_tube.png differ
diff --git a/docs/images/taxprofiler_tube.svg b/docs/images/taxprofiler_tube.svg new file mode 100644 index 00000000..b7d52428 --- /dev/null +++ b/docs/images/taxprofiler_tube.svg @@ -0,0 +1,5266 @@ [5266 added lines of SVG markup omitted: pipeline 'tube map' diagram showing sequencing input (short and long reads), quality control (adapter trimming & merging, complexity filtering, length filtering, host removal, run merging), taxonomic classification with multiple profilers and databases, and summary statistics]
diff --git a/docs/output.md b/docs/output.md index 0287a907..cf4678c3 100644 --- a/docs/output.md +++ b/docs/output.md @@ -6,29 +6,61 @@ This document describes the output produced by the pipeline. Most of the plots a The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: - [FastQC](#fastqc) - Raw read QC +- [falco](#fastqc) - Alternative to FastQC for raw read QC +- [fastp](#fastp) - Adapter trimming for Illumina data +- [AdapterRemoval](#adapterremoval) - Adapter trimming for Illumina data +- [Porechop](#porechop) - Adapter removal for Oxford Nanopore data +- [BBDuk](#bbduk) - Quality trimming and filtering for Illumina data +- [PRINSEQ++](#prinseq) - Quality trimming and filtering for Illumina data +- [Filtlong](#filtlong) - Quality trimming and filtering for Nanopore data +- [Bowtie2](#bowtie2) - Host removal for Illumina reads +- [minimap2](#minimap2) - Host removal for Nanopore reads +- [SAMtools stats](#samtools-stats) - Statistics from host removal +- [SAMtools fastq](#samtools-fastq) - Converts unmapped BAM file to fastq format (minimap2 only) +- [Analysis Ready Reads](#analysis-ready-reads) - Optional results directory containing the final processed reads used as input for classification/profiling. +- [Bracken](#bracken) - Taxonomic classifier using k-mers and abundance estimations +- [Kraken2](#kraken2) - Taxonomic classifier using exact k-mer matches +- [KrakenUniq](#krakenuniq) - Taxonomic classifier that combines the k-mer-based classification and the number of unique k-mers found in each species +- [Centrifuge](#centrifuge) - Taxonomic classifier that uses a novel indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index. +- [Kaiju](#kaiju) - Taxonomic classifier that finds maximum (in-)exact matches on the protein-level. +- [Diamond](#diamond) - Sequence aligner for protein and translated DNA searches. +- [MALT](#malt) - Sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics +- [MetaPhlAn](#metaphlan) - Genome-level marker gene based taxonomic classifier +- [mOTUs](#motus) - Tool for marker gene-based OTU (mOTU) profiling. +- [KMCP](#kmcp) - Taxonomic classifier that utilizes genome coverage information by splitting the reference genomes into chunks and storing k-mers in a modified and optimized COBS index for fast alignment-free sequence searching. +- [ganon](#ganon) - Taxonomic classifier and profiler that uses Interleaved Bloom Filters as indices based on k-mers/minimizers.
+- [TAXPASTA](#taxpasta) - Tool to standardise taxonomic profiles as well as merge profiles across samples from the same database and classifier/profiler. - [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC +![](images/taxprofiler_tube.png) + +### FastQC or Falco
Output files -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `{fastqc,falco}/` + - `{raw,processed}/` + - `*.html`: FastQC or Falco report containing quality metrics in HTML format. + - `*.txt`: FastQC or Falco report containing quality metrics in TXT format. + - `*.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images (FastQC only).
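Both tools write into the directory structure above. As a minimal sketch of selecting falco instead of FastQC (the samplesheet and database CSV names are placeholders; `--preprocessing_qc_tool` is the parameter set by the `test_falco` profile in this PR):

```bash
# Hypothetical example: produce raw/processed read QC reports with falco rather than FastQC
nextflow run nf-core/taxprofiler -profile docker \
    --input samplesheet.csv \
    --databases databases.csv \
    --outdir ./results \
    --preprocessing_qc_tool falco
```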
[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +If preprocessing is turned on, nf-core/taxprofiler runs FastQC/Falco twice: once before and once after adapter removal/read merging, to allow evaluation of the performance of these preprocessing steps. Note that in the General Stats table, the columns of these two instances of FastQC/Falco are placed next to each other to make it easier to evaluate. However, the columns of the actual preprocessing steps (i.e., fastp, AdapterRemoval, and Porechop) will be displayed _after_ the two FastQC/Falco columns, even if they were run 'between' the two FastQC/Falco jobs in the pipeline itself. + +:::info +Falco produces identical output to FastQC but in the `falco/` directory. +::: + ![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) ![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) @@ -39,6 +71,542 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d The FastQC plots displayed in the MultiQC report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. ::: +### fastp + +[fastp](https://github.com/OpenGene/fastp) is a FASTQ pre-processing tool for quality control, trimming of adapters, quality filtering and other features. + +It is used in nf-core/taxprofiler for adapter trimming of short reads. +
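As a rough sketch (the samplesheet and database CSV names are placeholders), fastp-based short-read QC corresponds to the `--perform_shortread_qc` flag used by the test profiles in this PR, with fastp selectable via `--shortread_qc_tool`:

```bash
# Hypothetical example: enable short-read adapter trimming with fastp (the default QC tool)
nextflow run nf-core/taxprofiler -profile docker \
    --input samplesheet.csv \
    --databases databases.csv \
    --outdir ./results \
    --perform_shortread_qc \
    --shortread_qc_tool fastp
```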
    +Output files + +- `fastp/` + - `.fastp.fastq.gz`: File with the trimmed unmerged fastq reads. + - `.merged.fastq.gz`: File with the reads that were successfully merged. + - `.*{log,html,json}`: Log files in different formats. + +
    + +By default nf-core/taxprofiler will only provide the `.fastp.fastq.gz` file if fastp is selected. The file `.merged.fastq.gz` will be available in the output folder if you provide the argument ` --shortread_qc_mergepairs` (optionally retaining un-merged pairs when in combination with `--shortread_qc_includeunmerged`). + +You can change the default value for low complexity filtering by using the argument `--shortread_complexityfilter_fastp_threshold`. + +### AdapterRemoval + +[AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/) searches for and removes remnant adapter sequences from High-Throughput Sequencing (HTS) data and (optionally) trims low quality bases from the 3' end of reads following adapter removal. It is popular in the field of palaeogenomics. The output logs are stored in the results folder, and as a part of the MultiQC report. + +
+Output files + +- `adapterremoval/` + - `.settings`: AdapterRemoval log file containing general adapter removal, read trimming and merging statistics + - `.collapsed.fastq.gz` - read-pairs that merged and did not undergo trimming (only when `--shortread_qc_mergepairs` supplied) + - `.collapsed.truncated.fastq.gz` - read-pairs that merged and underwent quality trimming (only when `--shortread_qc_mergepairs` supplied) + - `.pair1.truncated.fastq.gz` - read 1 of pairs that underwent quality trimming + - `.pair2.truncated.fastq.gz` - read 2 of pairs that underwent quality trimming (and could not merge if `--shortread_qc_mergepairs` supplied) + - `.singleton.truncated.fastq.gz` - orphaned read pairs where one of the pair was discarded + - `.discard.fastq.gz` - reads that were discarded due to length or quality filtering + +
+ +By default nf-core/taxprofiler will only provide the `.settings` file if AdapterRemoval is selected. + +You will only find the `.fastq` files in the results directory if you provide `--save_preprocessed_reads`. If this is selected, you may receive different combinations of `.fastq` files for each sample depending on the input types - e.g. whether you have merged or not, or if you're supplying both single- and paired-end reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::warning +The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as complexity filtering, host removal, run merging, etc. +::: + +### Porechop + +[Porechop](https://github.com/rrwick/Porechop) is a tool for finding and removing adapters from Oxford Nanopore reads. Adapters on the ends of reads are trimmed, and if a read has an adapter in its middle, it is considered chimeric and is chopped into separate reads. + +
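For Oxford Nanopore data, an illustrative sketch of enabling long-read adapter removal (and optionally keeping the processed reads) could look like this; the samplesheet, database sheet, and `--run_<your_tool>` flag are again placeholders:

```bash
# Sketch: long-read (ONT) preprocessing with Porechop, keeping the trimmed reads
nextflow run nf-core/taxprofiler \
    -profile docker \
    --input samplesheet.csv \
    --databases databases.csv \
    --outdir ./results \
    --perform_longread_qc \
    --save_preprocessed_reads \
    --run_<your_tool>
```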
    +Output files + +- `porechop/` + - `.log`: Log file containing trimming statistics + - `.fastq.gz`: Adapter-trimmed file + +
+ +The output logs are saved in the output folder and are part of the MultiQC report. You do not normally need to check these manually. + +You will only find the `.fastq` files in the results directory if you provide `--save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::warning +We do **not** recommend using Porechop if you are already trimming the adapters with ONT's basecaller Guppy. +::: + +### BBDuk + +[BBDuk](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/) stands for Decontamination Using Kmers. BBDuk was developed to combine most common data-quality-related trimming, filtering, and masking operations into a single high-performance tool. + +It is used in nf-core/taxprofiler for complexity filtering using different algorithms. This means that it will remove reads with low sequence diversity (e.g. mono- or dinucleotide repeats). + +
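A sketch of turning on short-read complexity filtering with BBDuk follows; note that `--shortread_complexityfilter_tool` is an assumed parameter name for selecting the filtering tool - please verify it against the pipeline's parameter documentation:

```bash
# Sketch: complexity filtering with BBDuk
# --shortread_complexityfilter_tool is an assumed parameter name; check the parameter docs
nextflow run nf-core/taxprofiler \
    -profile docker \
    --input samplesheet.csv \
    --databases databases.csv \
    --outdir ./results \
    --perform_shortread_complexityfilter \
    --shortread_complexityfilter_tool bbduk \
    --save_complexityfiltered_reads \
    --run_<your_tool>
```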
    +Output files + +- `bbduk/` + - `.bbduk.log`: log file containing filtering statistics + - `.fastq.gz`: resulting FASTQ file without low-complexity reads + +
    + +By default nf-core/taxprofiler will only provide the `.log` file if BBDuk is selected as the complexity filtering tool. You will only find the complexity filtered reads in your results directory if you provide ` --save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::warning +The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging etc.. +::: + +### PRINSEQ++ + +[PRINSEQ++](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus) is a C++ implementation of the [prinseq-lite.pl](https://prinseq.sourceforge.net/) program. It can be used to filter, reformat or trim genomic and metagenomic sequence data. + +It is used in nf-core/taxprofiler for complexity filtering using different algorithms. This means that it will remove reads with low sequence diversity (e.g. mono- or dinucleotide repeats). + +
    +Output files + +- `prinseqplusplus/` + - `.log`: log file containing number of reads. Row IDs correspond to: `min_len, max_len, min_gc, max_gc, min_qual_score, min_qual_mean, ns_max_n, noiupac, derep, lc_entropy, lc_dust, trim_tail_left, trim_tail_right, trim_qual_left, trim_qual_right, trim_left, trim_right` + - `_good_out.fastq.gz`: resulting FASTQ file without low-complexity reads + +
+ +By default nf-core/taxprofiler will only provide the `.log` file if PRINSEQ++ is selected as the complexity filtering tool. You will only find the complexity filtered `.fastq` files in your results directory if you supply `--save_complexityfiltered_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::warning +The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as host removal, run merging, etc. +::: + +### Filtlong + +[Filtlong](https://github.com/rrwick/Filtlong) is a quality filtering tool for long reads. It can take a set of long reads and produce a smaller, better subset. + +
+Output files + +- `filtlong/` + - `_filtered.fastq.gz`: FASTQ file of reads retained after quality and/or length filtering + - `_filtered.log`: log file containing summary statistics + +
    + +You will only find the `.fastq` files in the results directory if you provide ` --save_preprocessed_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::warning +We do _not_ recommend using Filtlong if you are performing filtering of low quality reads with ONT's basecaller Guppy. +::: + +### Bowtie2 + +[Bowtie 2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) is an ultrafast and memory-efficient tool for aligning sequencing reads to long reference sequences. It is particularly good at aligning reads of about 50 up to 100s or 1,000s of characters, and particularly good at aligning to relatively long (e.g. mammalian) genomes. + +It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) and/or other possible contaminant reads (e.g. Phi X) from short-read `.fastq` files prior to profiling. + +
+Output files + +- `bowtie2/` + - `build/` + - `*.bt2`: Bowtie2 indices of reference genome, only if `--save_hostremoval_index` supplied. + - `align/` + - `.bam`: BAM file containing reads that aligned against the user-supplied reference genome as well as unmapped reads + - `.bowtie2.log`: log file about the mapped reads + - `.unmapped.fastq.gz`: the off-target reads from the mapping that are used in downstream steps. + +
    + +By default nf-core/taxprofiler will only provide the `.log` file if host removal is turned on. You will only have a `.bam` file if you specify `--save_hostremoval_bam`. This will contain _both_ mapped and unmapped reads. You will only get FASTQ files if you specify to save `--save_hostremoval_unmapped` - these contain only unmapped reads. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::info +Unmapped reads in FASTQ are only found in this directory for short-reads, for long-reads see [`samtools/fastq/`](#samtools-fastq). +::: + +:::info +The resulting `.fastq` files may _not_ always be the 'final' reads that go into taxprofiling, if you also run other steps such as run merging etc.. +::: + +:::info +While there is a dedicated section in the MultiQC HTML for Bowtie2, these values are not displayed by default in the General Stats table. Rather, alignment statistics to host genome is reported via samtools stats module in MultiQC report for direct comparison with minimap2 (see below). +::: + +### minimap2 + +[minimap2](https://github.com/lh3/minimap2) is an alignment tool suited to mapping long reads to reference sequences. + +It is used with nf-core/taxprofiler to allow removal of 'host' (e.g. human) or other possible contaminant reads from long-read `.fastq` files prior to taxonomic classification/profiling. + +
+Output files + +- `minimap2/` + - `build/` + - `*.mmi`: minimap2 indices of reference genome, only if `--save_hostremoval_index` supplied. + - `align/` + - `.bam`: Alignment file in BAM format containing both mapped and unmapped reads. + +
    + +By default, nf-core/taxprofiler will only provide the `.bam` file containing mapped and unmapped reads if saving of host removal for long reads is turned on via `--save_hostremoval_bam`. + +:::info +minimap2 is not yet supported as a module in MultiQC and therefore there is no dedicated section in the MultiQC HTML. Rather, alignment statistics to host genome is reported via samtools stats module in MultiQC report. +::: + +:::info +Unlike Bowtie2, minimap2 does not produce an unmapped FASTQ file by itself. See [`samtools/fastq`](#samtools-fastq). +::: + +### SAMtools fastq + +[SAMtools fastq](http://www.htslib.org/doc/1.1/samtools.html) converts a `.sam`, `.bam`, or `.cram` alignment file to FASTQ format + +
+Output files + +- `samtools/fastq/` + - `_interleaved.fq.gz`: Unmapped reads only in FASTQ gzip format + +
    + +This directory will be present and contain the unmapped reads from the `.fastq` format from long-read minimap2 host removal, if `--save_hostremoval_unmapped` is supplied. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +:::info +For short-read unmapped reads, see [bowtie2](#bowtie2). +::: + +### Analysis Ready Reads + +:::info +This optional results directory will only be present in the pipeline results when supplying `--save_analysis_ready_reads`. +::: + +
+Output files + +- `analysis_ready_reads/` + - `_{fq,fastq}.gz`: Final reads that underwent preprocessing and were sent for classification/profiling. + +
+ +The results directory will contain the 'final' processed reads used as input for classification/profiling. It will _only_ include the output of the _last_ step of any combinations of preprocessing steps that may have been specified in the run configuration. For example, if you perform the read QC and host-removal preprocessing steps, the final reads that are sent to classification/profiling are the host-removed FASTQ files - those will be the ones present in this directory. + +:::warning +If you turn off all preprocessing steps, then no results will be present in this directory. This applies independently to short- and long-reads, i.e. if you skip all short-read preprocessing, no short-read FASTQ files will be present in this directory even if long-read preprocessing was performed (and vice versa). +::: + +### SAMtools stats + +[SAMtools stats](http://www.htslib.org/doc/samtools-stats.html) collects statistics from a `.sam`, `.bam`, or `.cram` alignment file and outputs them in a text format. + +
    +Output files + +- `samtools/stats/` + - `.stats`: File containing samtools stats output. + +
    + +In most cases you do not need to check this file, as it is rendered in the MultiQC run report. + +### Run Merging + +nf-core/taxprofiler offers the option to merge FASTQ files of multiple sequencing runs or libraries that derive from the same sample, as specified in the input samplesheet. + +This is the last possible preprocessing step, so if you have multiple runs or libraries (and run merging turned on), this will represent the final reads that will go into classification/profiling steps. + +
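As a brief sketch, activating run merging and additionally keeping the merged FASTQ files could look like this (samplesheet, database sheet, and profiler flag are placeholders):

```bash
# Sketch: merge FASTQ files of multiple runs/libraries per sample and keep the merged reads
nextflow run nf-core/taxprofiler \
    -profile docker \
    --input samplesheet.csv \
    --databases databases.csv \
    --outdir ./results \
    --perform_runmerging \
    --save_runmerged_reads \
    --run_<your_tool>
```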
    +Output files + +- `run_merging/` + - `*.fastq.gz`: Concatenated FASTQ files on a per-sample basis + +
+ +Note that you will only find samples that went through the run merging step in this directory. Samples that had only a single run or library do not go through this step of the pipeline and thus will not be present in this directory. + +This directory and its FASTQ files will only be present if you supply `--save_runmerged_reads`. Alternatively, if you wish only to have the 'final' reads that go into classification/profiling (i.e., that may have additional processing), do not specify this flag but rather specify `--save_analysis_ready_reads`, in which case the reads will be in the folder `analysis_ready_reads`. + +### Bracken + +[Bracken](https://ccb.jhu.edu/software/bracken/) (Bayesian Reestimation of Abundance with Kraken) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. Bracken uses the taxonomy labels assigned by Kraken, a highly accurate metagenomics classification algorithm, to estimate the number of reads originating from each species present in a sample. + +:::info +The first step of using Bracken requires running Kraken2, therefore the initial results before abundance estimation will be found in `/kraken2/`. +::: + +
    +Output files + +- `bracken/` + - `/` + - `bracken__combined_reports.txt`: combined bracken results as output from Bracken's `combine_bracken_outputs.py` script + - `/` + - `_.tsv`: TSV file containing per-sample summary of Bracken results with abundance information + - `_.report_bracken_species.txt`: Kraken2 style report with Bracken abundance information + +
+ +The main taxonomic profiling file from Bracken is the `*.tsv` file. This provides the basic results from Kraken2 but with the corrected abundance information. Note that the raw Kraken2 version of the upstream step of Bracken can be found in the `kraken2/` directory with the suffix of `_.bracken.report.txt` (with a 6 column variant when `--save_minimizers` specified). + +### Kraken2 + +[Kraken](https://ccb.jhu.edu/software/kraken2/) is a taxonomic sequence classifier that assigns taxonomic labels to DNA sequences. Kraken examines the k-mers within a query sequence and uses the information within those k-mers to query a database. That database maps k-mers to the lowest common ancestor (LCA) of all genomes known to contain a given k-mer. + 
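As an illustrative sketch, a Kraken2 run that additionally saves the classified/unclassified reads and the per-read classification lists might look as follows; `--run_kraken2` follows the pipeline's `--run_<tool>` convention and assumes a `kraken2` entry is present in the database sheet:

```bash
# Sketch: Kraken2 classification, also saving reads and per-read classifications
nextflow run nf-core/taxprofiler \
    -profile docker \
    --input samplesheet.csv \
    --databases databases.csv \
    --outdir ./results \
    --run_kraken2 \
    --kraken2_save_reads \
    --kraken2_save_readclassifications
```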
    +Output files + +- `kraken2/` + - `_combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `krakentools`) + - If you have also run Bracken, the original Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2--bracken.tsv`. However in most cases you want to use the actual Bracken file (i.e., `bracken_.tsv`). + - `/` + - `_.classified.fastq.gz`: FASTQ file containing all reads that had a hit against a reference in the database for a given sample + - `_.unclassified.fastq.gz`: FASTQ file containing all reads that did not have a hit in the database for a given sample + - `_.report.txt`: A Kraken2 report that summarises the fraction abundance, taxonomic ID, number of Kmers, taxonomic path of all the hits in the Kraken2 run for a given sample. Will be 6 column rather than 8 if `--save_minimizers` specified. + - `_.classifiedreads.txt`: A list of read IDs and the hits each read had against each database for a given sample + +
+ +The main taxonomic classification file from Kraken2 is the `_combined_reports.txt` or `*report.txt` file. The former provides the broadest overview of the taxonomic classification results across all samples against a single database, where you get two columns for each sample e.g. `2_all` and `2_lvl`, as well as summarised columns summing up across all samples, `tot_all` and `tot_lvl`. The latter gives you the most information for a single sample. The report file is also used for the taxpasta step. + +You will only receive the `.fastq` and `*classifiedreads.txt` files if you supply the `--kraken2_save_reads` and/or `--kraken2_save_readclassifications` parameters to the pipeline. + +### KrakenUniq + +[KrakenUniq](https://github.com/fbreitwieser/krakenuniq) (formerly KrakenHLL) is an extension to the fast k-mer-based classification performed by [Kraken](https://github.com/DerrickWood/kraken) with an efficient algorithm for additionally assessing the coverage of unique k-mers found in each species in a dataset. + +
+Output files + +- `krakenuniq/` + - `/` + - `_[.merged].classified.fasta.gz`: Optional FASTA file containing all reads that had a hit against a reference in the database for a given sample. Paired-end input reads are merged in this output. + - `_[.merged].unclassified.fasta.gz`: Optional FASTA file containing all reads that did not have a hit in the database for a given sample. Paired-end input reads are merged in this output. + - `_.krakenuniq.report.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits, with an additional column for k-mer coverage, that allows for more accurate distinguishing between false-positive/true-positive hits. + - `_.krakenuniq.classified.txt`: An optional list of read IDs and the hits each read had against each database for a given sample. + +
+ +The main taxonomic classification file from KrakenUniq is the `*.krakenuniq.report.txt` file. This is an extension of the Kraken2 report with the additional k-mer coverage information that provides more information about the accuracy of hits. + +You will only receive the `.fasta.gz` and `*.krakenuniq.classified.txt` files if you supply the `--krakenuniq_save_reads` and/or `--krakenuniq_save_readclassification` parameters to the pipeline. + +:::info +The output system of KrakenUniq can result in other `stdout` or `stderr` logging information being saved in the report file, therefore you must check your report files before downstream use! +::: + +### Centrifuge + +[Centrifuge](https://github.com/DaehwanKimLab/centrifuge) is a taxonomic sequence classifier that uses a Burrows-Wheeler transform and Ferragina-Manzini index for storing and mapping sequences. + +
    +Output files + +- `centrifuge/` + - `/` + - `.centrifuge.mapped.fastq.gz`: `FASTQ` files containing all mapped reads + - `.centrifuge.report.txt`: A classification report that summarises the taxonomic ID, the taxonomic rank, length of genome sequence, number of classified and uniquely classified reads + - `.centrifuge.results.txt`: A file that summarises the classification assignment for a read, i.e read ID, sequence ID, score for the classification, score for the next best classification, number of classifications for this read + - `.centrifuge.txt`: A Kraken2-style report that summarises the fraction abundance, taxonomic ID, number of k-mers, taxonomic path of all the hits in the centrifuge run for a given sample + - `.centrifuge.unmapped.fastq.gz`: FASTQ file containing all unmapped reads + +
    + +The main taxonomic classification files from Centrifuge are the `_combined_reports.txt`, `*report.txt`, `*results.txt` and the `*centrifuge.txt`. The latter is used by the taxpasta step. You will receive the `.fastq` files if you supply `--centrifuge_save_reads`. + +### Kaiju + +[Kaiju](https://github.com/bioinformatics-centre/kaiju) is a taxonomic classifier that finds maximum exact matches on the protein-level using the Burrows-Wheeler transform. + +
+Output files + +- `kaiju/` + - `kaiju__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by kaiju2table) + - `/` + - `_.kaiju.tsv`: Raw output from Kaiju with taxonomic rank, read ID and taxonomic ID + - `_.kaijutable.txt`: Summarised Kaiju output with fraction abundance, taxonomic ID, number of reads, and taxonomic names (as generated by `kaiju2table`) + +
+ +The most useful summary file is the `_combined_reports.txt` file, which summarises hits across all reads and samples. Separate per-sample summaries can be seen in `/*.txt`. However, if you wish to look at more precise information on a per-read basis, see the `*tsv` file. The default taxonomic rank is `species`. You can provide a different one by updating the argument `--kaiju_taxon_rank`. + +### DIAMOND + +[DIAMOND](https://github.com/bbuchfink/diamond) is a sequence aligner for translated DNA searches or protein sequences against a protein reference database such as NR. It is a replacement for the NCBI BLAST software tools. It has many key features and is used as a taxonomic classifier in nf-core/taxprofiler. + +
    +Output files + +- `diamond/` + - `/` + - `.log`: A log file containing stdout information + - `*.{blast,xml,txt,daa,sam,tsv,paf}`: A file containing alignment information in various formats, or taxonomic information in a text-based format. Exact output depends on user choice. + +
    + +By default you will receive a TSV output. Alternatively, you will receive a `*.sam` file if you provide the parameter `--diamond_save_reads` but in this case no taxonomic classification will be available(!), only the aligned reads in sam format. + +:::info +DIAMOND has many output formats, so depending on your [choice](https://github.com/bbuchfink/diamond/wiki/3.-Command-line-options) with ` --diamond_output_format` you will receive the taxonomic information in a different format. +::: + +### MALT + +[MALT](https://software-ab.cs.uni-tuebingen.de/download/malt) is a fast replacement for BLASTX, BLASTP and BLASTN, and provides both local and semi-global alignment capabilities. + +
    +Output files + +- `malt/` + - `/` + - `.blastn.sam`: sparse SAM file containing alignments of each hit + - `.megan`: summary file that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer. Generated by MEGAN6 companion tool `rma2info` + - `.rma6`: binary file containing all alignments and taxonomic information of hits that can be loaded into the [MEGAN6](https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/) interactive viewer + - `.txt.gz`: text file containing taxonomic IDs and read counts against each taxon. Generated by MEGAN6 companion tool `rma2info` + +
+ +The main output of MALT is the `.rma6` file format, which can only be loaded into MEGAN and its related tools. We provide the `rma2info` text files for improved compatibility with spreadsheet programs and other programmatic data manipulation tools, however these have only limited information compared to the 'binary' RMA6 file format (the `.txt` file only contains taxonomic ID and count, whereas RMA6 has taxonomic lineage information). + +You will only receive the `.sam` and `.megan` files if you supply the `--malt_save_reads` and/or `--malt_generate_megansummary` parameters to the pipeline. + +### MetaPhlAn + +[MetaPhlAn](https://github.com/biobakery/metaphlan) is a computational tool for profiling the composition of microbial communities (Bacteria, Archaea and Eukaryotes) from metagenomic shotgun sequencing data (i.e. not 16S) with species-level resolution via marker genes. + +
    +Output files + +- `metaphlan/` + - `metaphlan__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `metaphlan_merge_tables`) + - `/` + - `.biom`: taxonomic profile in BIOM format + - `.bowtie2out.txt`: BowTie2 alignment information (can be re-used for skipping alignment when re-running MetaPhlAn with different parameters) + - `_profile.txt`: MetaPhlAn taxonomic profile including abundance estimates + +
    + +The output contains a file named `*_combined_reports.txt`, which provides an overview of the classification results for all samples. The main taxonomic profiling file from MetaPhlAn is the `*_profile.txt` file. This provides the abundance estimates from MetaPhlAn however does not include raw counts by default. Additionally, it contains intermediate Bowtie2 output `.bowtie2out.txt`, which presents a condensed representation of the mapping results of your sequencing reads to MetaPhlAn's marker gene sequences. The alignments are listed in tab-separated columns, including Read ID and Marker Gene ID, with each alignment represented on a separate line. + +### mOTUs + +[mOTUS](https://github.com/motu-tool/mOTUs) is a taxonomic profiler that maps reads to a unique marker specific database and estimates the relative abundance of known and unknown species. + +
    +Output files + +- `motus/` + - `/` + - `.log`: A log file that contains summary statistics + - `.out`: A classification file that summarises taxonomic identifiers, by default at the rank of mOTUs (i.e., species level), and their relative abundances in the profiled sample. + - `motus__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `motus_merge`) + +
    + +Normally `*_combined_reports.txt` is the most useful file for downstream analyses, but the per sample `.out` file can provide additional more specific information. By default, nf-core/taxprofiler is providing a column describing NCBI taxonomic ID as this is used in the taxpasta step. You can disable this column by activating the argument `--motus_remove_ncbi_ids`. +You will receive the relative abundance instead of read counts if you provide the argument `--motus_use_relative_abundance`. + +### KMCP + +[KMCP](https://github.com/shenwei356/kmcp) utilises genome coverage information by splitting the reference genomes into chunks and stores k-mers in a modified and optimised COBS index for fast alignment-free sequence searching. KMCP combines k-mer similarity and genome coverage information to reduce the false positive rate of k-mer-based taxonomic classification and profiling methods. + +
    +Output files + +- `kmcp/` + + - `/` + - `.gz`: output of `kmcp_search` containing search sequences against a database in tab-delimited format with 15 columns. + - `_kmcp.profile`: output of `kmcp_profile` containing the taxonomic profile from search results. + +
    + +You will receive the `.gz` file if you supply `--kmcp_save_search`. Please note that there is no taxonomic label assignment in this output file. + +The main taxonomic classification file from KMCP is the `*kmcp.profile` which is also used by the taxpasta step. + +### ganon + +[ganon](https://pirovc.github.io/ganon/) is designed to index large sets of genomic reference sequences and to classify reads against them efficiently. The tool uses Interleaved Bloom Filters as indices based on k-mers/minimizers. It was mainly developed, but not limited, to the metagenomics classification problem: quickly assign sequence fragments to their closest reference among thousands of references. After classification, taxonomic abundance is estimated and reported. + +
    +Output files + +- `ganon/` + + - `/` + + - `_report.tre`: output of `ganon report` containing taxonomic classifications with possible formatting and/or filtering depending on options specified. + - ``.tre: output of `ganon classify` containing raw taxonomic classifications and abundance estimations with no additional formatting or filtering. + - ``.rep: 'raw' report of counts against each taxon. + - ``.all: per-read summary of all hits of each reads. + - ``.lca: per-read summary of the best single hit after LCA for each read. + - ``.unc: list of read IDs with no hits. + - ``.log: the stdout console messages printed by `ganon classify`, containing some classification summary information + + - `ganon__combined_reports.txt`: A combined profile of all samples aligned to a given database (as generated by `ganon table`) + +
    + +Generally you will want to refer to the `combined_reports.txt` or `_report.tre` file. For further descriptions of the contents of each file, see the [ganon documentation](https://pirovc.github.io/ganon/outputfiles/). + +You will only receive the `.all`, `.lca`, and `.unc` files if you supply the `--ganon_save_readclassifications` parameter to the pipeline. + +### Krona + +[Krona](https://github.com/marbl/Krona) allows the exploration of (metagenomic) hierarchical data with interactive zooming, multi-layered pie charts. + +Krona charts will be generated by the pipeline for supported tools (Kraken2, Centrifuge, Kaiju, and MALT) + +
    +Output files + +- `krona/` + - `_.html`: per-tool/per-database interactive HTML file containing hierarchical piecharts + +
    + +The resulting HTML files can be loaded into your web browser for exploration. Each file will have a dropdown to allow you to switch between each sample aligned against the given database of the tool. + +### TAXPASTA + +[TAXPASTA](https://github.com/taxprofiler/taxpasta) standardises and optionally merges two or more taxonomic profiles across samples into one single table. It supports multiple different classifiers simplifying comparison of taxonomic classification results between tools and databases. + +
    +Output files + +- `taxpasta/` + + - `_*.{tsv,csv,arrow,parquet,biom}`: Standardised taxon table containing multiple samples. The standard format is the `tsv`. + - The first column describes the taxonomy ID and the rest of the columns describe the read counts for each sample. + - Note that the file naming scheme will apply regardless of whether `TAXPASTA_MERGE` (multiple sample run) or `TAXPASTA_STANDARDISE` (single sample run) are executed. + - If you have also run Bracken, the initial Kraken report (i.e., _before_ read re-assignment) will also be included in this directory with `-bracken` suffixed to your Bracken database name. For example: `kraken2--bracken.tsv`. However in most cases you want to use the actual Bracken file (i.e., `bracken_.tsv`). + +
+ +By providing the path to a directory containing taxdump files to `--taxpasta_taxonomy_dir`, the taxon name, the taxon rank, the taxon's entire lineage including taxon names and/or the taxon's entire lineage including taxon identifiers can also be added to the output in addition to just the taxon ID. Addition of this extra information can be turned on by using the parameters `--taxpasta_add_name`, `--taxpasta_add_rank`, `--taxpasta_add_lineage` and `--taxpasta_add_idlineage` respectively. + +These files will likely be the most useful files for the comparison of differences in classification between different tools or for building consensuses, with the caveat that they have slightly less information than the actual output from each tool (which may have non-standard information e.g. taxonomic rank, percentage of hits, abundance estimations). + +The following report files are used for the taxpasta step: + +- Bracken: `_.tsv` Taxpasta uses the `new_est_reads` column for the standardised profile. +- Centrifuge: `.centrifuge.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile. +- Diamond: `` Taxpasta summarises the number of reads per NCBI taxonomy ID for the standardised profile. +- Kaiju: `_.kaijutable.txt` Taxpasta uses the `reads` column from kaiju2table for the standardised profile. +- KrakenUniq: `_.report.txt` Taxpasta uses the `reads` column for the standardised profile. +- Kraken2: `_.report.txt` Taxpasta uses the `direct_assigned_reads` column for the standardised profile. +- MALT: `.txt.gz` Taxpasta uses the `count` (second) column from the output of MEGAN6's rma2info for the standardised profile. +- MetaPhlAn: `_profile.txt` Taxpasta uses the `relative_abundance` column multiplied with a fixed number to yield an integer for the standardised profile. +- mOTUs: `.out` Taxpasta uses the `read_count` column for the standardised profile. + +:::warning +Please be aware that the outputs of each tool's standardised profile _may not_ be directly comparable between tools. Some may report raw read counts, whereas others may report abundance information. Please always refer to the list above for which information is used for each tool. +::: + ### MultiQC
    @@ -55,6 +623,32 @@ The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They m Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +All tools in taxprofiler supported by MultiQC will have a dedicated section showing summary statistics of each tool based on information stored in log files. + +You can expect in the MultiQC reports either sections and/or general stats columns for the following tools: + +- fastqc +- adapterRemoval +- fastp +- bbduk +- prinseqplusplus +- porechop +- filtlong +- bowtie2 +- minimap2 +- samtools (stats) +- kraken +- bracken +- centrifuge +- kaiju +- diamond +- malt +- motus + +:::info +The 'General Stats' table by default will only show statistics referring to pre-processing steps, and will not display possible values from each classifier/profiler, unless turned on by the user within the 'Configure Columns' menu or via a custom MultiQC config file (`--multiqc_config`) +::: + ### Pipeline information
diff --git a/docs/usage.md b/docs/usage.md index 286da89c..dc398350 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,62 +6,178 @@ ## Introduction - +nf-core/taxprofiler is a pipeline for highly-parallelised taxonomic classification and profiling of shotgun metagenomic data across multiple tools simultaneously. In addition to running multiple classification and profiling tools, it also allows you to perform taxonomic classification and profiling across multiple databases and settings per tool, as well as producing standardised output tables to allow immediate cross-comparison of results between tools. -## Samplesheet input +In addition to this page, you can find additional usage information on the following pages: -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +- [Tutorials](usage/tutorials.md) +- [FAQ and Troubleshooting](usage/faq-troubleshooting.md) -```bash ---input '[path to samplesheet file]' +## General Usage + +To run nf-core/taxprofiler, at a minimum you require two inputs: + +- a sequencing read samplesheet +- a database samplesheet + +Both contain metadata and paths to the data of your input samples and databases. + +When running nf-core/taxprofiler, every step and tool is 'opt in'. To run a given classifier or profiler you must make sure to supply both a database in your `.csv` and the corresponding `--run_<tool>` flag to your command. Omitting either will result in the profiling tool not executing. + +nf-core/taxprofiler also includes optional pre-processing (adapter clipping, merge running etc.) or post-processing (visualisation) steps. These are also opt in with a `--perform_<step>` flag. In some cases, the pre- and post-processing steps may also require additional files. Please check the parameters tab of this documentation for more information. + +Please see the rest of this page for information about how to prepare input samplesheets and databases and how to run Nextflow pipelines. See the [parameters](https://nf-co.re/taxprofiler/parameters) documentation for more information about specific options the pipeline also offers. + +## Samplesheet inputs + +nf-core/taxprofiler can accept as input raw or preprocessed single- or paired-end short-read (e.g. Illumina) FASTQ files, long-read FASTQ files (e.g. Oxford Nanopore), or FASTA sequences (available for a subset of profilers). + +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row as shown in the examples below. Furthermore, nf-core/taxprofiler also requires a second comma-separated file of 3 columns with a header row as in the examples below. + +This samplesheet is then specified on the command line as follows: + +```console +--input '[path to samplesheet file]' --databases '[path to database sheet file]' ``` ### Multiple runs of the same sample -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis.
Below is an example for the same sample sequenced across 3 lanes: +The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate different runs FASTQ files of the same sample before performing profiling, when `--perform_runmerging` is supplied. Below is an example for the same sample sequenced across 3 lanes: ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +2612,run1,ILLUMINA,2612_run1_R1.fq.gz,, +2612,run2,ILLUMINA,2612_run2_R1.fq.gz,, +2612,run3,ILLUMINA,2612_run3_R1.fq.gz,2612_run3_R2.fq.gz, ``` +:::warning +Runs of the same sample sequenced on Illumina platforms with a combination of single and paired-end data will **not** be run-wise concatenated, unless pair-merging is specified. In the example above, `run3` will be profiled independently of `run1` and `run2` if pairs are not merged. +::: + ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 6 columns to match those defined in the table below. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +A final samplesheet file consisting of both single- and paired-end data, as well as long-read FASTA files may look something like the one below. This is for 6 samples, where `2612` has been sequenced twice. ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +2611,ERR5766174,ILLUMINA,,,///fasta/ERX5474930_ERR5766174_1.fa.gz +2612,ERR5766176,ILLUMINA,///fastq/ERX5474932_ERR5766176_1.fastq.gz,///fastq/ERX5474932_ERR5766176_2.fastq.gz, +2612,ERR5766180,ILLUMINA,///fastq/ERX5474936_ERR5766180_1.fastq.gz,, +2613,ERR5766181,ILLUMINA,///fastq/ERX5474937_ERR5766181_1.fastq.gz,///fastq/ERX5474937_ERR5766181_2.fastq.gz, +ERR3201952,ERR3201952,OXFORD_NANOPORE,///fastq/ERR3201952.fastq.gz,, ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. 
Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +:::warning +Input FASTQ and FASTA files _must_ be gzipped. +::: + +:::warning +While one can include both short-read and long-read data in one run, we recommend that you split these across _two_ pipeline runs and database sheets (see below). This will allow classification optimisation for each data type, and make MultiQC run-reports more readable (due to very large differences in run statistics between the data types). +::: + +| Column | Description | +| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Unique sample name [required]. | +| `run_accession` | Run ID or name unique for each (pairs of) file(s). Can also supply the sample name again here if only a single run was generated [required]. | +| `instrument_platform` | Sequencing platform reads generated on, selected from the EBI ENA [controlled vocabulary](https://www.ebi.ac.uk/ena/portal/api/controlledVocab?field=instrument_platform) [required]. | +| `fastq_1` | Path or URL to sequencing reads, or Illumina R1 sequencing reads, in FASTQ format. GZipped compressed files accepted. Can be left empty if data in FASTA is specified. Cannot be combined with `fasta`. | +| `fastq_2` | Path or URL to Illumina R2 sequencing reads in FASTQ format. GZipped compressed files accepted. Can be left empty if data is single-end. Cannot be combined with `fasta`. | +| `fasta` | Path or URL to long-reads or contigs in FASTA format. GZipped compressed files accepted. Can be left empty if data in FASTQ is specified. Cannot be combined with `fastq_1` or `fastq_2`. | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +### Full database sheet + +nf-core/taxprofiler supports multiple databases being classified/profiled against in parallel for each tool. + +Databases can be supplied either in the form of a compressed `.tar.gz` archive of a directory containing all relevant database files or the path to a directory on the filesystem. + +:::warning +nf-core/taxprofiler does not provide any databases by default, nor does it currently generate them for you. This must be performed manually by the user. See the bottom of this section for more information on the expected database files, or the [building custom database](usage/tutorials#retrieving-databases-or-building-custom-databases) tutorials. +::: + +The pipeline takes the paths to these databases, and the specific classification/profiling parameters to use with them, as input via a four-column comma-separated sheet. + +:::warning +To allow user freedom, nf-core/taxprofiler does not check the presence or validity of non-file database parameters required for correct execution of the tool - excluding options offered via pipeline level parameters! Please validate your database parameters (cross-referencing [parameters](https://nf-co.re/taxprofiler/parameters), and the given tool documentation) before submitting the database sheet! For example, if you don't use the default read length - Bracken will require `-r <readlength>` in the `db_params` column.
+::: + +An example database sheet can look as follows, where multiple tools are being used, and `malt` and `kraken2` will be used against two databases each. + +`kraken2` will be run twice even though it only has a single 'dedicated' database, because specifying `bracken` implies first running `kraken2` on the `bracken` database, as required by `bracken`. + +```csv +tool,db_name,db_params,db_path +malt,malt85,-id 85,///malt/testdb-malt/ +malt,malt95,-id 90,///malt/testdb-malt.tar.gz +bracken,db1,;-r 150,///bracken/testdb-bracken.tar.gz +kraken2,db2,--quick,///kraken2/testdb-kraken2.tar.gz +krakenuniq,db3,,///krakenuniq/testdb-krakenuniq.tar.gz +centrifuge,db1,,///centrifuge/minigut_cf.tar.gz +metaphlan,db1,,///metaphlan/metaphlan_database/ +motus,db_mOTU,,///motus/motus_database/ +ganon,db1,,///ganon/test-db-ganon.tar.gz +kmcp,db1,;-I 20,///kmcp/test-db-kmcp.tar.gz +``` + +:::warning +For Bracken and KMCP, which are two-step profilers, nf-core/taxprofiler has a special way of passing parameters to each step! + +For Bracken, if you wish to supply any parameters to both the Kraken2 and Bracken steps, or to just the Bracken step, you **must** have a _semi-colon_ (`;`) separated list in the `db_params` column. This allows you to specify the Kraken2 parameters before, and the Bracken parameters after, the `;`. This is particularly important if you supply a Bracken database with a non-default read length parameter. If you do not have any parameters to specify, you can leave this column empty. If you wish to provide settings to _just_ the Kraken2 step of the Bracken profiling, you can supply a normal string to the column without a semi-colon. If you wish to supply parameters to only Bracken (and keep default Kraken2 parameters), then you supply a string to the column starting with `;` and the Bracken parameters _after_. + +Similarly, for KMCP, if you want to supply parameters for both the first (KMCP search) and second (KMCP profile) steps, you **must** have a _semi-colon_ (`;`) separated list in `db_params`. If you wish to provide parameters to just KMCP search, you do not need the `;`. If you want to supply parameters to just KMCP profile (and keep search parameters at default), then you must start the string with `;` and the KMCP profile parameters come _after_ the semi-colon. If you do not wish to modify any parameters, you can leave the column empty (i.e. the `;` is not necessary). + +This allows you to specify the KMCP search and the KMCP profile parameters, separated by `;`. If you do not have any parameters to specify, you can leave this empty. +::: + +Column specifications are as follows: + +| Column | Description | +| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `tool` | Taxonomic profiling tool (supported by nf-core/taxprofiler) that the database has been indexed for [required]. Please note that `bracken` also implies running `kraken2` on the same database. | +| `db_name` | A unique name per tool for the particular database [required].
Please note that names need to be unique across both `kraken2` and `bracken` as well, even if re-using the same database. | +| `db_params` | Any parameters of the given taxonomic classifier/profiler that you wish to specify that the taxonomic classifier/profiling tool should use when profiling against this specific database. Can be empty to use taxonomic classifier/profiler defaults. Must not be surrounded by quotes [required]. We generally do not recommend specifying parameters here that turn on/off saving of output files or specifying particular file extensions - this should be already addressed via pipeline parameters. For Bracken databases, must at a minimum contain a `;` separating Kraken2 from Bracken parameters. | +| `db_path` | Path to the database. Can either be a path to a directory containing the database index files or a `.tar.gz` file which contains the compressed database directory with the same name as the tar archive, minus `.tar.gz` [required]. | + +:::tip +You can also specify the same database directory/file twice (ensuring unique `db_name`s) and specify different parameters for each database to compare the effect of different parameters during classification/profiling. +::: + +nf-core/taxprofiler will automatically decompress and extract any compressed archives for you. + +The (uncompressed) database paths (`db_path`) for each tool are expected to contain: + +- [**Bracken**:](usage/tutorials.md#bracken-custom-database) output of the combined `kraken2-build` and `bracken-build` process. +- [**Centrifuge**:](usage/tutorials.md#centrifuge-custom-database) output of `centrifuge-build`. +- [**DIAMOND**:](usage/tutorials.md#diamond-custom-database) output of `diamond makedb`. +- [**Kaiju**:](usage/tutorials.md#kaiju-custom-database) output of `kaiju-makedb`. +- [**Kraken2**:](usage/tutorials.md#kraken2-custom-database) output of `kraken2-build` command(s). +- [**KrakenUniq**:](usage/tutorials.md#krakenuniq-custom-database) output of `krakenuniq-build` command(s). +- [**MALT**](usage/tutorials.md#malt-custom-database) output of `malt-build`. +- [**MetaPhlAn**:](usage/tutorials.md#metaphlan-custom-database) output of with `metaphlan --install` or downloaded from links on the [MetaPhlAn wiki](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-4#customizing-the-database). +- [**mOTUs**:](usage/tutorials.md#motus-custom-database) the directory `db_mOTU/` that is downloaded via `motus downloadDB`. +- [**ganon**:](usage/tutorials.md#ganon-custom-database) output of `ganon build` or `ganon build-custom`. +- [**KMCP**:](usage/tutorials.md#kmcp-custom-database) output of `kmcp index`. Note: `kmcp index` uses the output of an upstream `kmcp compute` step. + +:::info +Click the links in the list above for short quick-reference tutorials how to generate custom databases for each tool. +::: + ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/taxprofiler --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/taxprofiler --input samplesheet.csv --databases databases.csv --outdir -profile docker --run_ --run_ ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +When running nf-core/taxprofiler, every step and tool is 'opt in'. To run a given classifier/profiler you must make sure to supply both a database in your `.csv` and supply `--run_` flag to your command. 
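For example, a run classifying/profiling with both Kraken2 and MetaPhlAn (illustrative tool choices - each requires a corresponding entry in `databases.csv`) might look like:

```bash
# Illustrative sketch of a full run with two opt-in profilers
nextflow run nf-core/taxprofiler \
    --input samplesheet.csv \
    --databases databases.csv \
    --outdir ./results \
    -profile docker \
    --run_kraken2 \
    --run_metaphlan
```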
Omitting either will result in the classification/profiling tool not executing. If you wish to perform pre-processing (adapter clipping, merge running etc.) or post-processing (visualisation) steps, these are also opt in with a `--perform_` flag. In some cases, the pre- and post-processing steps may also require additional files. Please check the parameters tab of this documentation for more information. + Note that the pipeline will create the following files in your working directory: ```bash @@ -96,6 +212,185 @@ genome: 'GRCh37' You can also generate such `YAML`/`JSON` files via [nf-core/launch](https://nf-co.re/launch). +### Sequencing quality control + +[`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. nf-core taxprofiler offers [`falco`](https://github.com/smithlabcode/falco) as an drop-in replacement, with supposedly better improvement particularly for long reads. + +### Preprocessing Steps + +nf-core/taxprofiler offers four main preprocessing steps for preprocessing raw sequencing reads: + +- [**Read processing**](#read-processing): adapter clipping and pair-merging. +- [**Complexity filtering**](#complexity-filtering): removal of low-sequence complexity reads. +- [**Host read-removal**](#host-read-removal): removal of reads aligning to reference genome(s) of a host. +- [**Run merging**](#run-merging): concatenation of multiple FASTQ chunks/sequencing runs/libraries of a sample. + +:::info +You can save the 'final' reads used for classification/profiling from any combination of these steps with `--save_analysis_ready_reads`. +::: + +#### Read Processing + +Raw sequencing read processing in the form of adapter clipping and paired-end read merging can be activated via the `--perform_shortread_qc` or `--perform_longread_qc` flags. + +It is highly recommended to run this on raw reads to remove artifacts from sequencing that can cause false positive identification of taxa (e.g. contaminated reference genomes) and/or skews in taxonomic abundance profiles. If you have public data, normally these should have been corrected for, however you should still check that these steps have indeed been already performed. + +There are currently two options for short-read preprocessing: [`fastp`](https://github.com/OpenGene/fastp) or [`adapterremoval`](https://github.com/MikkelSchubert/adapterremoval). + +For adapter clipping, you can either rely on the tool's default adapter sequences, or supply your own adapters (`--shortread_qc_adapter1` and `--shortread_qc_adapter2`) +By default, paired-end merging is not activated. In this case paired-end 'alignment' against the reference databases is performed where supported, and if not, supported pairs will be independently classified/profiled. If paired-end merging is activated you can also specify whether to include unmerged reads in the reads sent for classification/profiling (`--shortread_qc_mergepairs` and `--shortread_qc_includeunmerged`). +You can also turn off clipping and only perform paired-end merging, if requested. This can be useful when processing data downloaded from the ENA, SRA, or DDBJ (`--shortread_qc_skipadaptertrim`). +Both tools support length filtering of reads and can be tuned with `--shortread_qc_minlength`. 
Performing length filtering can be useful to remove short (often low sequencing complexity) sequences that result in unspecific classification and therefore slow down runtime during classification/profiling, with minimal gain. + +There is currently one option for long-read Oxford Nanopore processing: [`porechop`](https://github.com/rrwick/Porechop). + +For both short-read and long-read preprocessing, you can optionally save the resulting processed reads with `--save_preprocessed_reads`. + +#### Complexity Filtering + +Complexity filtering can be activated via the `--perform_shortread_complexityfilter` flag. + +Complexity filtering is primarily a run-time optimisation step. It is not necessary for accurate taxonomic classification/profiling, however it can speed up run-time of each tool by removing reads with low-diversity of nucleotides (e.g. with mono-nucleotide - `AAAAAAAA`, or di-nucleotide repeats `GAGAGAGAGAGAGAG`) that have a low-chance of giving an informative taxonomic ID as they can be associated with many different taxa. Removing these reads therefore saves computational time and resources. + +There are currently three options for short-read complexity filtering: [`bbduk`](https://jgi.doe.gov/data-and-tools/software-tools/bbtools/bb-tools-user-guide/bbduk-guide/), [`prinseq++`](https://github.com/Adrian-Cantu/PRINSEQ-plus-plus), and [`fastp`](https://github.com/OpenGene/fastp#low-complexity-filter). + +There is one option for long-read quality filtering: [`Filtlong`](https://github.com/rrwick/Filtlong) + +The tools offer different algorithms and parameters for removing low complexity reads and quality filtering. We therefore recommend reviewing the pipeline's [parameter documentation](https://nf-co.re/taxprofiler/parameters) and the documentation of the tools (see links above) to decide on optimal methods and parameters for your dataset. + +You can optionally save the FASTQ output of the run merging with the `--save_complexityfiltered_reads`. If running with `fastp`, complexity filtering happens inclusively within the earlier shortread preprocessing step. Therefore there will not be an independent pipeline step for complexity filtering, and no independent FASTQ file (i.e. `--save_complexityfiltered_reads` will be ignored) - your complexity filtered reads will also be in the `fastp/` folder in the same file(s) as the preprocessed read. + +:::warning +For nanopore data: we do not recommend performing any read preprocessing or complexity filtering if you are using ONTs Guppy toolkit for basecalling and post-processing. +::: + +#### Host-Read Removal + +Removal of possible-host reads from FASTQ files prior classification/profiling can be activated with `--perform_shortread_hostremoval` or `--perform_longread_hostremoval`. + +Similarly to complexity filtering, host-removal can be useful for runtime optimisation and reduction in misclassified reads. It is not always necessary to report classification of reads from a host when you already know the host of the sample, therefore you can gain a run-time and computational advantage by removing these prior typically resource-heavy classification/profiling with more efficient methods. Furthermore, particularly with human samples, you can reduce the number of false positives during classification/profiling that occur due to host-sequence contamination in reference genomes on public databases. 
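As a sketch (using the reference and saving flags described in the following paragraphs), host removal for a run containing both short- and long-read data could be activated like this; `host_plus_phix.fasta` is a placeholder for your own (possibly concatenated) reference:

```bash
# Sketch: host read removal for short and long reads against a user-supplied reference
nextflow run nf-core/taxprofiler \
    -profile docker \
    --input samplesheet.csv \
    --databases databases.csv \
    --outdir ./results \
    --perform_shortread_hostremoval \
    --perform_longread_hostremoval \
    --hostremoval_reference host_plus_phix.fasta \
    --save_hostremoval_unmapped \
    --run_<your_tool>
```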
+
+nf-core/taxprofiler currently offers host removal via alignment against a reference genome with Bowtie2 for short reads and minimap2 for long reads, with the unaligned reads then used for downstream classification/profiling.
+
+You can supply your reference genome in FASTA format with `--hostremoval_reference`. You can also optionally supply a directory containing pre-built Bowtie2 index files with `--shortread_hostremoval_index`, or a minimap2 `.mmi` file with `--longread_hostremoval_index`, however nf-core/taxprofiler will generate these for you if necessary. Pre-supplying the index directory or files can greatly speed up the process, and these can be re-used.
+
+:::tip
+If you have multiple taxa or sequences you wish to remove (e.g., the host genome and also PhiX, a common quality-control reagent during sequencing), you can simply concatenate the FASTA files of each taxon or sequence into a single reference file.
+:::
+
+#### Run Merging
+
+For samples that may have been sequenced over multiple runs, or for FASTQ files split into multiple chunks, you can activate the ability to merge across all runs or chunks with `--perform_runmerging`.
+
+For more information on how to set up your input samplesheet, see [Multiple runs of the same sample](#multiple-runs-of-the-same-sample).
+
+Activating this functionality will concatenate the FASTQ files with the same sample name _after_ the optional preprocessing steps and _before_ classification/profiling. Note that libraries with runs of different pairing types will **not** be merged; this will be indicated on output files with a `_se` or `_pe` suffix to the sample name accordingly.
+
+You can optionally save the FASTQ output of the run merging with `--save_runmerged_reads`.
+
+#### Classification and Profiling
+
+The following sections provide tips and suggestions for running the different taxonomic classification and profiling tools _within the pipeline_. For advice and/or guidance on whether you should run a particular tool on your specific data, please see the documentation of each tool!
+
+An important distinction between the different tools included in the pipeline is classification versus profiling. Taxonomic _classification_ is concerned with simply detecting the presence of species in a given sample. Taxonomic _profiling_ involves additionally estimating the _abundance_ of each species.
+
+Note that not all taxonomic classification tools (e.g. Kraken, MALT, Kaiju) perform _profiling_, but all taxonomic profilers (e.g. MetaPhlAn, mOTUs, Bracken) must perform some form of _classification_ prior to profiling.
+
+For advice as to which tool to run in your context, please see the documentation of each tool.
+
+:::note
+If you would like to change this behaviour, please contact us on the [nf-core slack](https://nf-co.re/join) and we can discuss this.
+:::
+
+Not all tools currently have dedicated tips, suggestions and/or recommendations, however we welcome further contributions for existing and additional tools via pull requests to the [nf-core/taxprofiler repository](https://github.com/nf-core/taxprofiler)!
+
+##### Bracken
+
+You must make sure to also activate Kraken2 to run Bracken in the pipeline.
+
+It is unclear whether Bracken is suitable for running on long reads, as it makes certain assumptions about read lengths. Furthermore, during testing we found issues where Bracken would fail on the long-read test data.
+
+Therefore, nf-core/taxprofiler currently does not run Bracken on data specified as being sequenced with `OXFORD_NANOPORE` in the input samplesheet.
+
+##### Centrifuge
+
+Centrifuge currently does not accept FASTA files as input, therefore no output will be produced for these input files.
+
+##### DIAMOND
+
+DIAMOND only allows output of a single file format at a time. Therefore, if a parameter such as `--diamond_save_reads` is supplied, only aligned reads in SAM format will be produced and no taxonomic profiles will be available. Be aware of this when setting up your pipeline runs, depending on your particular use case.
+
+##### Kaiju
+
+Currently, no specific tips or suggestions.
+
+##### Kraken2
+
+Currently, no specific tips or suggestions.
+
+##### KrakenUniq
+
+Currently, no specific tips or suggestions.
+
+##### MALT
+
+MALT does not support paired-end read alignment (unlike other tools), therefore nf-core/taxprofiler aligns these as independent files if read-merging is skipped. If you skip merging, you can sum or average the results of the counts of the pairs.
+
+Krona can only be run on MALT output if a path to a Krona taxonomy database is supplied via `--krona_taxonomy_directory`. Therefore, if you do not supply a Krona directory, Krona plots will not be produced for MALT.
+
+##### MetaPhlAn
+
+MetaPhlAn4 is compatible with the MetaPhlAn3 database: add `--mpa3` to the `db_params` column of the `database.csv`.
+
+##### mOTUs
+
+mOTUs currently does not accept FASTA files as input, therefore no output will be produced for these input files.
+
+##### ganon
+
+It is unclear whether ganon is suitable for running on long reads - during testing we found issues where ganon would fail on the long-read test data.
+
+Therefore, nf-core/taxprofiler currently does not run ganon on data specified as being sequenced with `OXFORD_NANOPORE` in the input samplesheet.
+
+##### KMCP
+
+KMCP is only suitable for short-read metagenomic profiling, with much lower sensitivity on long-read datasets. Therefore, nf-core/taxprofiler does not currently run KMCP on data specified as being sequenced with `OXFORD_NANOPORE` in the input samplesheet.
+
+#### Post Processing
+
+##### Visualisation
+
+nf-core/taxprofiler supports generation of Krona interactive pie chart plots for the following compatible tools:
+
+- Kraken2
+- Centrifuge
+- Kaiju
+- MALT
+
+:::warning
+MALT Krona plots cannot be generated automatically; you must also specify a Krona taxonomy directory with `--krona_taxonomy_directory` if you wish to generate these.
+:::
+
+##### Multi-Table Generation
+
+The main multi-sample table from nf-core/taxprofiler is generated by a dedicated standalone tool originally developed for the pipeline - [Taxpasta](https://taxpasta.readthedocs.io/en/latest/). When providing `--run_profile_standardisation`, every classifier/profiler and database combination will get a standardised and (if present) multi-sample taxon table in the [`taxpasta/`](https://nf-co.re/taxprofiler/output) directory. These tables are structured in the same way, to facilitate comparison between the results of the classifiers/profilers. If multiple samples are provided, `taxpasta merge` will be executed, whereas if only a single sample is provided, `taxpasta standardise` will be executed - the file naming scheme will be the same for both.
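+
+For orientation only, the merged table is a 'wide' table with one row per taxon and one column of counts per sample; the column names and values below are purely illustrative, and the exact format is described in the Taxpasta documentation:
+
+```
+taxonomy_id	SAMPLE_A	SAMPLE_B
+562	1050	872
+1280	210	0
+```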
+ +In addition to per-sample profiles and standardised Taxpasta output, the pipeline also supports generation of 'native' multi-sample taxonomic profiles (i.e., those generated by the taxonomic profiling tools themselves or additional utility scripts provided by the tool authors), when providing `--run_profile_standardisation` to your pipeline. + +These are executed on a per-database level. I.e., you will get a multi-sample taxon table for each database you provide for each tool and will be placed in the same directory as the directories containing the per-sample profiles. + +The following tools will produce multi-sample taxon tables: + +- **Bracken** (via bracken's `combine_bracken_outputs.py` script) +- **Centrifuge** (via KrakenTools' `combine_kreports.py` script) +- **Kaiju** (via Kaiju's `kaiju2table` tool) +- **Kraken2** (via KrakenTools' `combine_kreports.py` script) +- **MetaPhlAn** (via MetaPhlAn's `merge_metaphlan_tables.py` script) +- **mOTUs** (via the `motus merge` command) +- **ganon** (via the `ganon table` command) + +Note that the multi-sample tables from the 'native' tools in each folders are [not inter-operable](https://taxpasta.readthedocs.io/en/latest/tutorials/getting-started/) with each other as they can have different formats and can contain additional and different data. In this case we refer you to use the standardised and merged output from Taxpasta, as described above. + ### Updating the pipeline When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: @@ -112,7 +407,7 @@ First, go to the [nf-core/taxprofiler releases page](https://github.com/nf-core/ This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. -To further assist in reproducbility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. +To further assist in reproducibility, you can use share and re-use [parameter files](#running-the-pipeline) to repeat pipeline runs with the same settings without having to write out a command with every single parameter. :::tip If you wish to share such profile (such as upload as supplementary material for academic publications), make sure to NOT include cluster specific paths to files, nor institutional specific profiles. @@ -139,7 +434,7 @@ The pipeline also dynamically loads configurations from [https://github.com/nf-c Note that multiple profiles can be loaded, for example: `-profile test,docker` - the order of arguments is important! They are loaded in sequence, so later profiles can overwrite earlier profiles. -If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. This is _not_ recommended, since it can lead to different results on different machines dependent on the computer enviroment. +If `-profile` is not specified, the pipeline will run locally and expect all software to be installed and available on the `PATH`. 
This is _not_ recommended, since it can lead to different results on different machines dependent on the computer environment. - `test` - A profile with a complete configuration for automated testing diff --git a/docs/usage/faq-troubleshooting.md b/docs/usage/faq-troubleshooting.md new file mode 100644 index 00000000..3c80725d --- /dev/null +++ b/docs/usage/faq-troubleshooting.md @@ -0,0 +1,9 @@ +# Troubleshooting and FAQs + +## I get a warning during centrifuge_kreport process with exit status 255 + +When a sample has insufficient hits for abundance estimation, the resulting `report.txt` file will be empty. + +When trying to convert this to a kraken-style report, the conversion tool will exit with a status code `255`, and provide a `WARN`. + +This is _not_ an error nor a failure of the pipeline, just your sample has no hits to the provided database when using centrifuge. diff --git a/docs/usage/tutorials.md b/docs/usage/tutorials.md new file mode 100644 index 00000000..000736b5 --- /dev/null +++ b/docs/usage/tutorials.md @@ -0,0 +1,620 @@ +# nf-core/taxprofiler: Tutorials + +This page provides a range of tutorials to help give you a bit more guidance on how to set up nf-core/taxprofiler runs in the wild. + +## Simple Tutorial + +In this tutorial we will run you through a simple set up of a small nf-core/taxprofiler run. +It assumes that you have basic knowledge of metagenomic classification input and output files. + +### Preparation + +#### Hardware + +The datasets used should be small enough to run on your own laptop or a single server node. + +If you wish to use a HPC cluster or cloud, and don't wish to use an 'interactive' session submitted to your scheduler, please see the [nf-core documentation](https://nf-co.re/docs/usage/configuration#introduction) on how to make a relevant config file. + +You will need internet access and at least 1.5 GB of hardrive space. + +#### Software + +The tutorial assumes you are on a Unix based operating system, and have already installed Nextflow as well a software environment system such as [Conda](https://docs.conda.io/en/latest/miniconda.html), [Docker](https://www.docker.com/), or [Singularity/Apptainer](https://apptainer.org/). +The tutorial will use Docker, however you can simply replace references to `docker` with `conda`, `singularity`, or `apptainer` accordingly. + +#### Data + +First we will make a directory to run the whole tutorial in. + +```bash +mkdir taxprofiler-tutorial +cd taxprofiler-tutorial/ +``` + +We will use very small short-read (pre-subset) metagenomes used for testing. +nf-core/taxprofiler accepts FASTQ or FASTA files as input formats, however we will use FASTQ here as the more common format in taxonomic classification. +You can download these metagenomes with the following command. + +```bash +curl -O https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474932_ERR5766176_1.fastq.gz +curl -O https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474932_ERR5766176_2.fastq.gz +curl -O https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474932_ERR5766176_B_1.fastq.gz +curl -O https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474932_ERR5766176_B_2.fastq.gz +``` + +In this tutorial we will demonstrate running with three different profilers, and in one of those cases, running the same database twice but with different parameters. 
+The database consists of two genomes of species known to be present in the metagenomes. +You can download the databases for Kraken2, Centrifuge, and Kaiju with the following commands. + +```bash +curl -O https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/database/kraken2/testdb-kraken2.tar.gz +curl -O https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/database/centrifuge/test-db-centrifuge.tar.gz +curl -O https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/database/kaiju/kaiju.tar.gz +``` + +To demonstrate that nf-core/taxprofiler can also accept databases as uncompressed folders, we can extract one of them. + +```bash +tar -xzf kaiju.tar.gz +``` + +:::note +You have provide these databases pre-built to the pipeline, nf-core/taxprofiler neither comes with default databases not can generate databases for you. +For guidance on how to build databases, see the [Retrieving databases or building custom databases](#retrieving-databases-or-building-custom-databases) tutorial. +::: + +Finally, an important step of any metagenomic classification is to remove contamination. +Contamination can come from many places, typically from the host of a host-associated sample, however this can also come from laboratory processing samples. +A common contaminant in Illumina sequencing is a spike-in control of the genome of PhiX virus, which we can download with the following command. + +```bash +curl -O https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/819/615/GCF_000819615.1_ViralProj14015/GCF_000819615.1_ViralProj14015_genomic.fna.gz +``` + +### Preparing Input + +#### Sample sheet + +You provide the sequencing data FASTQ files to nf-core/taxprofiler via a input 'sample sheet' `.csv` file. +This is a 6 column table, that includes sample and library names, instrument platform, and paths to the sequencing data. + +Open a text editor, and create a file called `samplesheet.csv`. +Copy and paste the following lines into the file and save it. + +```csv title="samplesheet.csv" +sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta +ERX5474932,ERR5766176,ILLUMINA,ERX5474932_ERR5766176_1.fastq.gz,ERX5474932_ERR5766176_2.fastq.gz, +ERX5474932,ERR5766176_B,ILLUMINA,ERX5474932_ERR5766176_B_1.fastq.gz,ERX5474932_ERR5766176_B_2.fastq.gz, +``` + +Here we have specified two libraries of the same sample, that they were sequencing on Illumina platforms, and the paths to the FASTQ files. +If you had placed your FASTQ files elsewhere, you would give the full path (i.e., with relevant directories) to the `fastq_1`, `fastq_2` and `fasta` columns. + +#### Database sheet + +For the database(s), you also supply these via a `.csv` file. +This 4 column table contains the tool the database has been built for, a database name, the parameters you wish reads to be queried against the given database with, and a path to a `.tar.gz` archive file or a directory containing the database files. + +Open a text editor, and create a file called `database.csv`. +Copy and paste the following csv file into the file and save it. + +```csv title="database.csv" +tool,db_name,db_params,db_path +kraken2,db1,--quick,testdb-kraken2.tar.gz +centrifuge,db2,,test-db-centrifuge.tar.gz +centrifuge,db2_trimmed,--trim5 2 --trim3 2,test-db-centrifuge.tar.gz +kaiju,db3,,kaiju/ +``` + +You can see here we have specified the Centrifuge database twice, to allow comparison of different settings. +Note that the each database of the same tool has a unique name. 
+Furthermore, while the Kraken2 and Centrifuge databases have been supplied as `.tar.gz` archives, the Kaiju database has been supplied as a directory. + +### Running the pipeline + +Now that we have the sequencing reads (in FASTQ format), the databases (directory or `.tar.gz`), and a reference genome (FASTA, optionally gzipped), we can now run them with the pipeline. The following command will perform short read quality control, remove contaminant reads, merge multiple libraries for each sample, run the three profilers, and finally generate standardised profiles. + +```bash +nextflow run nf-core/taxprofiler -r 1.1.0 -profile docker \ +--input samplesheet.csv --databases database.csv --outdir ./results \ +--perform_shortread_qc \ +--perform_shortread_hostremoval --hostremoval_reference GCF_000819615.1_ViralProj14015_genomic.fna.gz \ +--perform_runmerging --save_runmerged_reads \ +--run_centrifuge --run_kaiju --run_kraken2 \ +--run_profile_standardisation \ +--max_cpus 2 --max_memory '6.GB' +``` + +:::info +With all Docker containers pre-downloaded, this run took 2 minutes and 31 seconds on a laptop running Ubuntu 22.04.2 with 32 GB RAM and 16 CPUs. +If you are running nf-core/taxprofiler for the first time, expect this command to take longer as Nextflow will have to download each software container for each step of the pipeline. +::: + +To break down each line of the command: + +- Tell Nextflow to run nf-core/taxprofiler with the particular version and using the Docker container system +- Specify the input and outputs, i.e., paths to the `samplesheet.csv`, `database.csv`, and directory where to save the results +- Turn on basic quality control of input reads: adapter clipping, length filtering, etc +- Turn on the removal of host or contaminant reads, and specify the path to reference genome of this +- Turn on run merging, i.e., combine the processed input reads of the multiple libraries into each sample, and save these reads (e.g. for downstream use) +- Turn on the different taxonomic profiling tools you wish to use +- Turn on profile standardisation and multi-sample taxon tables +- (Optional) provide a _cap_ to the maximum amount of resources each step/job of the pipeline can use + +:::warning +The `--max_cpu`, `--max_memory`, `--max_time` parameters _do not_ increase the amount of memory a step of the pipeline uses! +They simply prevent Nextflow requesting more than this threshold, e.g. more than available on your machine. +To learn how to increase computational resource to the pipeline, see the central [nf-core documentation](https://nf-co.re/docs/usage/configuration). +::: + +The pipeline run can be represented (in a simplified format!) as follows + +```mermaid +graph LR +0([FASTQs]) --> X[FastQC] --> A[FastP] --> Y[FastQC] --> B[BowTie2] +0([FASTQs]) --> X[FastQC] --> A[FastP] --> Y[FastQC] --> B[BowTie2] + +3([Reference FASTA]) -----> B + +2([Databases]) -------> D[Kraken2] +2([Databases]) -------> E[Centrifuge] +2([Databases]) -------> E[Centrifuge] +2([Databases]) -------> F[Kaiju] + +B--> C[Run Merging] +B--> C[Run Merging] +C --> D[Kraken2] +C --> E[Centrifuge] +C --> F[Kaiju] + +D --> G[combinekreports] +E --> H[combinekreports] +F --> I[Kaiju2Table] + +D ---> J[TAXPASTA] +E ---> J[TAXPASTA] +F ---> J[TAXPASTA] + +X ---> K[MultiQC] +A ---> K[MultiQC] +B ---> K[MultiQC] +D ---> K[MultiQC] +E ---> K[MultiQC] +F ---> K[MultiQC] +``` + +:::tip{title=""} +We hope you see the benefit of using pipelines for such a task! 
+::: + +### Output + +In the resulting directory `results/` you will find a range of directories. + +```tree +results/ +├── bowtie2 +├── centrifuge +├── fastp +├── fastqc +├── kaiju +├── kraken2 +├── multiqc +├── pipeline_info +├── run_merging +├── samtools +└── taxpasta +``` + +To follow the same order as the command construction above + +- Pipeline run report is found in `multiqc/` and resource statistics in `pipeline_info` +- Short-read QC results are found in `fastqc/` and `fastp/` +- Host/contaminant removal results are found in `bowtie2/` and `samtools/` +- Lane merged preprocessed reads are found in `run_merging/` +- Raw profiling results are found in `kraken2/`, `centrifuge/`, and `kaiju/` +- Standardised profiles for all profiling tools and databases are found in `taxpasta` + +:::info +Within each classifier results directory, there will be one directory and 'combined samples table' per database. +::: + +:::info +For read-preprocessing steps, only log files are stored in the `results/` directories by default. Refer to the parameters tab of the [nf-core/taxprofiler documentation](https://nf-co.re/taxprofiler/) for more options. +::: + +The general 'workflow' of going through the results will typically be reviewing the `multiqc/multiqc_report.html` file to get general statistics of the entire run, particularly of the preprocessing. +You would then use the taxon tables in the `taxpasta/` directory for downstream analysis, but referring to the classifier specific results directories when you require more detailed information on each classification. + +Detailed descriptions of all results files can be found in the output tab of the [nf-core/taxprofiler documentation](https://nf-co.re/taxprofiler/). + +### Clean up + +Once you have completed the tutorial, you can run the following command to delete all downloaded and output files. + +```bash +rm -r taxprofiler-tutorial/ +``` + +:::warning +Don't forget to change out of the directory above before trying to delete it! +::: + +## Retrieving databases or building custom databases + +Not all taxonomic profilers provide ready-made or default databases. Here we will give brief guidance on how to build custom databases for each supported taxonomic profiler. + +You should always consult the documentation of each tool for more information, as here we only provide short minimal-tutorials as quick reference guides (with no guarantee they are up to date). + +The following tutorials assumes you already have the tool available (e.g. installed locally, or via conda, docker etc.), and you have already downloaded the FASTA files you wish to build into a database. + +### Bracken custom database + +Bracken does not require an independent database but rather builds upon Kraken2 databases. [The pre-built Kraken2 databases hosted by Ben Langmead](https://benlangmead.github.io/aws-indexes/k2) already contain the required files to run Bracken. + +However, to build custom databases, you will need a Kraken2 database, the (average) read lengths (in bp) of your sequencing experiment, the K-mer size used to build the Kraken2 database, and Kraken2 available on your machine. + +```bash +bracken-build -d -k -l +``` + +:::tip +You can speed up database construction by supplying the threads parameter (`-t`). +::: + +:::tip +If you do not have Kraken2 in your `$PATH` you can point to the binary with `-x ///kraken2`. +::: + +
    +Expected files in database directory + +- `bracken` + - `hash.k2d` + - `opts.k2d` + - `taxo.k2d` + - `database100mers.kmer_distrib` + - `database150mers.kmer_distrib` + +
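+
+Once built, such a database directory can be supplied to the pipeline via the `--databases` CSV in the same way as in the tutorial above. A hedged sketch (the database names and path are placeholders, and any tool-specific `db_params` conventions should be checked against the pipeline parameter documentation):
+
+```csv title="database.csv"
+tool,db_name,db_params,db_path
+kraken2,db_kraken2_custom,,/path/to/bracken_db/
+bracken,db_bracken_custom,,/path/to/bracken_db/
+```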
    + +You can follow Bracken [tutorial](https://ccb.jhu.edu/software/bracken/index.shtml?t=manual) for more information. + +### Centrifuge custom database + +To build a custom Centrifuge database, a user needs to download taxonomy files, make a custom `seqid2taxid.map` and combine the fasta files together. + +In total, you need four components: a tab-separated file mapping sequence IDs to taxonomy IDs (`--conversion-table`), a tab-separated file mapping taxonomy IDs to their parents and rank, up to the root of the tree (`--taxonomy-tree`), a pipe-separated file mapping taxonomy IDs to a name (`--name-table`), and the reference sequences. + +An example of custom `seqid2taxid.map`: + +```csv title="seqid2taxid.map" + NC_001133.9 4392 + NC_012920.1 9606 + NC_001134.8 4392 + NC_001135.5 4392 +``` + +```bash +centrifuge-download -o taxonomy taxonomy +cat *.{fa,fna} > input-sequences.fna +centrifuge-build -p 4 --conversion-table seqid2taxid.map --taxonomy-tree taxonomy/nodes.dmp --name-table taxonomy/names.dmp input-sequences.fna taxprofiler_cf +``` + +
    +Expected files in database directory + +- `centrifuge` + - `..cf` + - `..cf` + - `..cf` + - `..cf` + +
    + +For the Centrifuge custom database documentation, see [here](https://ccb.jhu.edu/software/centrifuge/manual.shtml#custom-database). + +### DIAMOND custom database + +To create a custom database for DIAMOND, the user should download and unzip the NCBI's taxonomy files and the input FASTA files. + +The download and build steps are as follows: + +```bash +wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip +unzip taxdmp.zip + +## warning: large file! +wget ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.gz + +## warning: takes a long time! +cat ../raw/*.faa | diamond makedb -d testdb-diamond --taxonmap prot.accession2taxid.FULL.gz --taxonnodes nodes.dmp --taxonnames names.dmp + +## clean up +rm *dmp *txt *gz *prt *zip +``` + +
    +Expected files in database directory + +- `diamond` + - `.dmnd` + +
    + +A detailed description can be found [here](https://github.com/bbuchfink/diamond/wiki/1.-Tutorial). + +### Kaiju custom database + +A number of kaiju pre-built indexes for reference datasets are maintained by the developers of kaiju and made available on the [kaiju website](https://bioinformatics-centre.github.io/kaiju/downloads.html). These databases can directly be used to run the workflow with Kaiju. + +In case the databases above do not contain your desired libraries, you can build a custom kaiju database. To build a kaiju database, you need three components: a FASTA file with the protein sequences, the NCBI taxonomy dump files, and you need to define the uppercase characters of the standard 20 amino acids you wish to include. + +:::warning +The headers of the protein fasta file must be numeric NCBI taxon identifiers of the protein sequences. +::: + +To download the NCBI taxonomy files, please run the following commands: + +```bash +wget https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip +unzip new_taxdump.zip +``` + +To build the database, run the following command (the contents of taxdump must be in the same location where you run the command): + +```bash +kaiju-mkbwt -a ACDEFGHIKLMNPQRSTVWY -o proteins proteins.faa +kaiju-mkfmi proteins +``` + +:::tip +You can speed up database construction by supplying the threads parameter (`-t`). +::: + +
    +Expected files in database directory + +- `kaiju` + - `kaiju_db_*.fmi` + - `nodes.dmp` + - `names.dmp` + +
    + +For the Kaiju database construction documentation, see [here](https://github.com/bioinformatics-centre/kaiju#custom-database). + +### Kraken2 custom database + +A number of database indexes have already been generated and maintained by [@BenLangmead Lab](https://github.com/BenLangmead), see [here](https://benlangmead.github.io/aws-indexes/k2). These databases can directly be used to run the workflow with Kraken2 as well as Bracken. + +In case the databases above do not contain your desired libraries, you can build a custom Kraken2 database. This requires two components: a taxonomy (consisting of `names.dmp`, `nodes.dmp`, and `*accession2taxid`) files, and the FASTA files you wish to include. + +To pull the NCBI taxonomy, you can run the following: + +```bash +kraken2-build --download-taxonomy --db +``` + +You can then add your FASTA files with the following build command. + +```bash +kraken2-build --add-to-library *.fna --db +``` + +You can repeat this step multiple times to iteratively add more genomes prior building. + +Once all genomes are added to the library, you can build the database (and optionally clean it up): + +```bash +kraken2-build --build --db +kraken2-build --clean --db +``` + +You can then add the `/` path to your nf-core/taxprofiler database input sheet. + +
    +Expected files in database directory + +- `kraken2` + - `opts.k2d` + - `hash.k2d` + - `taxo.k2d` + +
    + +You can follow the Kraken2 [tutorial](https://github.com/DerrickWood/kraken2/blob/master/docs/MANUAL.markdown#custom-databases) for a more detailed description. + +### KrakenUniq custom database + +For any KrakenUniq database, you require: taxonomy files, the FASTA files you wish to include, a `seqid2mapid` file, and a k-mer length. + +First you must make a `seqid2taxid.map` file which is a two column text file containing the FASTA sequence header and the NCBI taxonomy ID for each sequence: + +``` +MT192765.1 2697049 +``` + +Then make a directory (`/`), containing the `seqid2taxid.map` file, and your FASTA files in a subdirectory called `library/` (these FASTA files can be symlinked). You must then run the `taxonomy` command on the `/` directory, and then build it. + +```bash +mkdir -p /library +mv `seqid2taxid.map` / +mv *.fna /library +krakenuniq-download --db taxonomy +krakenuniq-build --db --kmer-len 31 +``` + +:::tip +You can speed up database construction by supplying the threads parameter (`--threads`) to `krakenuniq-build`. +::: + +
    +Expected files in database directory + +- `krakenuniq` + - `opts.k2d` + - `hash.k2d` + - `taxo.k2d` + - `database.idx` + - `taxDB` + +
    + +Please see the [KrakenUniq documentation](https://github.com/fbreitwieser/krakenuniq#database-building) for more information. + +### MALT custom database + +To build a MALT database, you need the FASTA files to include, and an (unzipped) [MEGAN mapping 'db' file](https://software-ab.informatik.uni-tuebingen.de/download/megan6/) for your FASTA type. In addition to the input directory, output directory, and the mapping file database, you also need to specify the sequence type (DNA or Protein) with the `-s` flag. + +```bash +malt-build -i ///*.{fna,fa,fasta} -a2t //.db -d / -s DNA +``` + +You can then add the `/` path to your nf-core/taxprofiler database input sheet. + +:::warning +MALT generates very large database files and requires large amounts of RAM. You can reduce both by increasing the step size `-st` (with a reduction in sensitivity). +::: + +:::tip +MALT-build can be multi-threaded with `-t` to speed up building. +::: + +
    +Expected files in database directory + +- `malt` + - `ref.idx` + - `taxonomy.idx` + - `taxonomy.map` + - `index0.idx` + - `table0.idx` + - `table0.db` + - `ref.inf` + - `ref.db` + - `taxonomy.tre` + +
    + +See the [MALT manual](https://software-ab.informatik.uni-tuebingen.de/download/malt/manual.pdf) for more information. + +### MetaPhlAn custom database + +MetaPhlAn does not allow (easy) construction of custom databases. Therefore we recommend to use the prebuilt database of marker genes that is provided by the developers. + +To perform this task, ensure that you have installed `MetaPhlAn` on your machine. Keep in mind that each version of MetaPhlAn aligns with a specific version of the database. Therefore, if you download the MetaPhlAn3 database, remember to include `--mpa3` as a parameter for the database in the `--databases` CSV file. + +```bash +metaphlan --install --bowtie2db / +``` + +You can then add the `/` path to your nf-core/taxprofiler database input sheet. + +:::warning +It is generally not recommended to modify this database yourself, thus this is currently not supported in the pipeline. However, it is possible to customise the existing database by adding your own marker genomes following the instructions [here](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-4#customizing-the-database). +::: + +:::note +If using your own database is relevant for you, please contact the nf-core/taxprofiler developers on the [nf-core slack](https://nf-co.re/join) and we will investigate supporting this. +::: + +
    +Expected files in database directory + +- `metaphlan4` + - `mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.pkl` + - `mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.fna.bz2` + - `mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.1.bt2l` + - `mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.2.bt2l` + - `mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.3.bt2l` + - `mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.4.bt2l` + - `mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.rev.1.bt2l` + - `mpa_vJan21_TOY_CHOCOPhlAnSGB_202103.rev.2.bt2l` + - `mpa_latest` + +
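+
+As noted in the classification tips above, if you install a MetaPhlAn3 database rather than a MetaPhlAn4 one, add `--mpa3` to that database's `db_params`. An illustrative sketch only (the database name and path are placeholders; check the pipeline usage documentation for the exact `tool` keyword expected):
+
+```csv title="database.csv"
+tool,db_name,db_params,db_path
+metaphlan,db_mpa3_custom,--mpa3,/path/to/metaphlan3_db/
+```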
    + +More information on the MetaPhlAn database can be found [here](https://github.com/biobakery/MetaPhlAn/wiki/MetaPhlAn-4#Pre-requisites). + +### mOTUs custom database + +mOTUs does not provide the ability to construct custom databases. Therefore we recommend to use the the prebuilt database of marker genes provided by the developers. + +:::warning +**Do not change the directory name of the resulting database if moving to a central location** The database name of `db_mOTU/` is hardcoded in the mOTUs tool +::: + +To do this you need to have `mOTUs` installed on your machine. + +```bash +motus downloadDB +``` + +Then supply the `db_mOTU/` path to your nf-core/taxprofiler database input sheet. + +:::warning +The `db_mOTU/` directory may be downloaded to somewhere in your Python's `site-package` directory. You will have to find this yourself as the exact location varies depends on installation method. +::: + +More information on the mOTUs database can be found [here](https://motu-tool.org/installation.html). + +#### ganon custom database + +To build a custom ganon database you need two components: the FASTA files you wish to include, and the file extension of those FASTA files. + +:::tip +You can also use [`ganon build`](https://pirovc.github.io/ganon/default_databases/) to download and generate pre-defined databases for you. +::: + +You can optionally include your own taxonomy files, however `ganon build-custom` will download these for you if not provided. + +```bash +ganon build-custom --threads 4 --input *.fa --db-prefix +``` + +You can then add the `/` path to your nf-core/taxprofiler database input sheet. + +:::tip +`ganon build-custom` can be multi-threaded with `-t` to speed up building. +::: + +
    +Expected files in database directory + +- `ganon` + - `*.ibf` or `.hibf` + - `*.tax` + +
    + +More information on custom ganon database construction can be found [here](https://pirovc.github.io/ganon/custom_databases/). + +#### KMCP custom database + +To build a KMCP database you need four components: the FASTA files you wish to include in gzip-compressed format and one genome per file with the reference identifier in the file name, the taxid mapping file, NCBI taxonomy dump files (names.dmp, nodes.dmp), and the range of k-mers to build the database with. + +1. You need to compute the k-mers with [`kmcp compute`](https://bioinf.shenwei.me/kmcp/usage/#compute) and by providing as input the FASTA files you wish to include. +2. You need to build index for k-mers with [`kmcp index`](https://bioinf.shenwei.me/kmcp/usage/#index) by providing as input the output of `kmcp compute` + +For example + +```bash +kmcp compute -k 21 -n 10 -l 150 -O ///*.{fna,fa,fasta} +kmcp index -I / --threads 8 --num-hash 1 --false-positive-rate 0.3 --out-dir / +``` + +You can then add the `/` path to your nf-core/taxprofiler database input sheet. + +
    +Expected files in database directory + +- `kmcp` + - `.unik` + - `_info.txt` + - `*.kmcp/` + - `__db.yml` + +
    + +More information on custom KMCP database construction can be found [here](https://bioinf.shenwei.me/kmcp/database/#building-custom-databases). diff --git a/main.nf b/main.nf index 4819e4ef..8a68a8df 100644 --- a/main.nf +++ b/main.nf @@ -17,22 +17,45 @@ nextflow.enable.dsl = 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +<<<<<<< HEAD include { TAXPROFILER } from './workflows/taxprofiler' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_taxprofiler_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_taxprofiler_pipeline' include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_taxprofiler_pipeline' +======= +>>>>>>> dev /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GENOME PARAMETER VALUES ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +<<<<<<< HEAD // TODO nf-core: Remove this line if you don't need a FASTA file // This is an example of how to use getGenomeAttribute() to fetch parameters // from igenomes.config using `--genome` params.fasta = getGenomeAttribute('fasta') +======= +include { validateParameters; paramsHelp } from 'plugin/nf-validation' + +// Print help message if needed +if (params.help) { + def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) + def citation = '\n' + WorkflowMain.citation(workflow) + '\n' + def String command = "nextflow run ${workflow.manifest.name} --input samplesheet.csv --databases databases.csv --outdir results/ -profile docker --run_kraken2" + log.info logo + paramsHelp(command) + citation + NfcoreTemplate.dashedLine(params.monochrome_logs) + System.exit(0) +} + +// Validate input parameters +if (params.validate_params) { + validateParameters() +} + +WorkflowMain.initialise(workflow, params, log, args) +>>>>>>> dev /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules.json b/modules.json index 50c537e9..002f8bba 100644 --- a/modules.json +++ b/modules.json @@ -5,15 +5,250 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { +<<<<<<< HEAD +======= + "adapterremoval": { + "branch": "master", + "git_sha": "5add1e8e11af620c779462936ce8bbcc1abcef2d", + "installed_by": ["modules"] + }, + "bbmap/bbduk": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "bowtie2/align": { + "branch": "master", + "git_sha": "fe54581f8bed20e4c4a51c616c93fd3379d89820", + "installed_by": ["modules"] + }, + "bowtie2/build": { + "branch": "master", + "git_sha": "6a24fbe314bb2e6fe6306c29a63076ea87e8eb3c", + "installed_by": ["modules"] + }, + "bracken/bracken": { + "branch": "master", + "git_sha": "093e35505936bce5127e1d14966b6cac91cd137f", + "installed_by": ["modules"] + }, + "bracken/combinebrackenoutputs": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "cat/fastq": { + "branch": "master", + "git_sha": "5c460c5a4736974abde2843294f35307ee2b0e5e", + "installed_by": ["modules"] + }, + "centrifuge/centrifuge": { + "branch": "master", + "git_sha": "9a07a1293d9b818d1e06d0f7b58152f74d462012", + "installed_by": ["modules"] + }, + "centrifuge/kreport": { + "branch": "master", + "git_sha": "9a07a1293d9b818d1e06d0f7b58152f74d462012", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": 
"8ec825f465b9c17f9d83000022995b4f7de6fe93", + "installed_by": ["modules"] + }, + "diamond/blastx": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "falco": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"], + "patch": "modules/nf-core/falco/falco.diff" + }, + "fastp": { + "branch": "master", + "git_sha": "d497a4868ace3302016ea8ed4b395072d5e833cd", + "installed_by": ["modules"] + }, +>>>>>>> dev "fastqc": { "branch": "master", "git_sha": "f4ae1d942bd50c5c0b9bd2de1393ce38315ba57c", "installed_by": ["modules"] }, + "filtlong": { + "branch": "master", + "git_sha": "b4c8ac68ffcdabac3f63aaa5e7420b175e1d8d76", + "installed_by": ["modules"] + }, + "ganon/classify": { + "branch": "master", + "git_sha": "3b9d0fd0431442facdc816ca0c731f9807c47ebd", + "installed_by": ["modules"] + }, + "ganon/report": { + "branch": "master", + "git_sha": "f61e38c82ca2bbb8dfa740822df6f2b75f2d8a86", + "installed_by": ["modules"] + }, + "ganon/table": { + "branch": "master", + "git_sha": "c02373677641897c9744c0f55f0c12fd79232c71", + "installed_by": ["modules"] + }, + "gunzip": { + "branch": "master", + "git_sha": "e06548bfa36ee31869b81041879dd6b3a83b1d57", + "installed_by": ["modules"] + }, + "kaiju/kaiju": { + "branch": "master", + "git_sha": "3db50674956b1fb3741a44eb917458d788a50197", + "installed_by": ["modules"] + }, + "kaiju/kaiju2krona": { + "branch": "master", + "git_sha": "3db50674956b1fb3741a44eb917458d788a50197", + "installed_by": ["modules"] + }, + "kaiju/kaiju2table": { + "branch": "master", + "git_sha": "3db50674956b1fb3741a44eb917458d788a50197", + "installed_by": ["modules"] + }, + "kmcp/profile": { + "branch": "master", + "git_sha": "e198734cc3be18af5f64f6d7734c7f1a7c3af5a6", + "installed_by": ["modules"] + }, + "kmcp/search": { + "branch": "master", + "git_sha": "e198734cc3be18af5f64f6d7734c7f1a7c3af5a6", + "installed_by": ["modules"] + }, + "kraken2/kraken2": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"] + }, + "krakentools/combinekreports": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "krakentools/kreport2krona": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "krakenuniq/preloadedkrakenuniq": { + "branch": "master", + "git_sha": "8bbaa881ab9e59f3e18680550d65d52339640630", + "installed_by": ["modules"] + }, + "krona/ktimporttaxonomy": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "krona/ktimporttext": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "malt/run": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "megan/rma2info": { + "branch": "master", + "git_sha": "dbce8951ff9a39ad08d87e563636bbcc6ef34032", + "installed_by": ["modules"] + }, + "metaphlan/mergemetaphlantables": { + "branch": "master", + "git_sha": "efae1c431e539d6a6d323ee2e9223c4b81a152ce", + "installed_by": ["modules"] + }, + "metaphlan/metaphlan": { + "branch": "master", + "git_sha": "1038d3de36263159b4138324a646105941ac271a", + "installed_by": ["modules"] + }, + "minimap2/align": { + "branch": "master", + "git_sha": "603ecbd9f45300c9788f197d2a15a005685b4220", + "installed_by": ["modules"] + }, + 
"minimap2/index": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "motus/merge": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "motus/profile": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "ccacf6f5de6df3bc6d73b665c1fd2933d8bbc290", "installed_by": ["modules"] + }, + "porechop/porechop": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"], + "patch": "modules/nf-core/porechop/porechop/porechop-porechop.diff" + }, + "prinseqplusplus": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "samtools/fastq": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "samtools/index": { + "branch": "master", + "git_sha": "911696ea0b62df80e900ef244d7867d177971f73", + "installed_by": ["modules"] + }, + "samtools/stats": { + "branch": "master", + "git_sha": "735e1e04e7e01751d2d6e97055bbdb6f70683cc1", + "installed_by": ["modules"] + }, + "samtools/view": { + "branch": "master", + "git_sha": "3ffae3598260a99e8db3207dead9f73f87f90d1f", + "installed_by": ["modules"] + }, + "taxpasta/merge": { + "branch": "master", + "git_sha": "48019785051ba491e82dce910273c2eca61bd5b7", + "installed_by": ["modules"] + }, + "taxpasta/standardise": { + "branch": "master", + "git_sha": "48019785051ba491e82dce910273c2eca61bd5b7", + "installed_by": ["modules"] + }, + "untar": { + "branch": "master", + "git_sha": "d0b4fc03af52a1cc8c6fb4493b921b57352b1dd8", + "installed_by": ["modules"] } } }, diff --git a/modules/local/kraken2_standard_report.nf b/modules/local/kraken2_standard_report.nf new file mode 100644 index 00000000..109503ab --- /dev/null +++ b/modules/local/kraken2_standard_report.nf @@ -0,0 +1,32 @@ +process KRAKEN2_STANDARD_REPORT { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(report) + + output: + tuple val(meta), path(result), emit: report + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + result = "${prefix}_standardized.kraken2.report.txt" + """ + cut -f1-3,6-8 '${report}' > '${result}' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cut: \$(echo \$(cut --version 2>&1) | sed 's/^.*(GNU coreutils) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} + diff --git a/modules/local/krona_cleanup.nf b/modules/local/krona_cleanup.nf new file mode 100644 index 00000000..d5da8e5e --- /dev/null +++ b/modules/local/krona_cleanup.nf @@ -0,0 +1,40 @@ +process KRONA_CLEANUP { + tag "$meta.id" + label 'process_low' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(krona, stageAs: 'uncleaned.krona.txt') + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + # Copy the file to a new name + cp ${krona} ${prefix}.txt + + # Remove ugly 'x__' prefixes for each of the taxonomic levels + LEVELS=(d k p c o f g s) + for L in "\${LEVELS[@]}"; do + sed -i "s/\${L}__//g" ${prefix}.txt + done + + # Remove underscores that are standing in place of spaces + sed -i "s/_/ /g" ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sed: \$(echo \$(sed --version 2>&1) | sed 's/^.*GNU sed) //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/adapterremoval/environment.yml b/modules/nf-core/adapterremoval/environment.yml new file mode 100644 index 00000000..1737b14b --- /dev/null +++ b/modules/nf-core/adapterremoval/environment.yml @@ -0,0 +1,7 @@ +name: adapterremoval +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::adapterremoval=2.3.2 diff --git a/modules/nf-core/adapterremoval/main.nf b/modules/nf-core/adapterremoval/main.nf new file mode 100644 index 00000000..d128d324 --- /dev/null +++ b/modules/nf-core/adapterremoval/main.nf @@ -0,0 +1,92 @@ +process ADAPTERREMOVAL { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/adapterremoval:2.3.2--hb7ba0dd_0' : + 'biocontainers/adapterremoval:2.3.2--hb7ba0dd_0' }" + + input: + tuple val(meta), path(reads) + path(adapterlist) + + output: + tuple val(meta), path("${prefix}.truncated.fastq.gz") , optional: true, emit: singles_truncated + tuple val(meta), path("${prefix}.discarded.fastq.gz") , optional: true, emit: discarded + tuple val(meta), path("${prefix}.pair{1,2}.truncated.fastq.gz") , optional: true, emit: paired_truncated + tuple val(meta), path("${prefix}.collapsed.fastq.gz") , optional: true, emit: collapsed + tuple val(meta), path("${prefix}.collapsed.truncated.fastq.gz") , optional: true, emit: collapsed_truncated + tuple val(meta), path("${prefix}.paired.fastq.gz") , optional: true, emit: paired_interleaved + tuple val(meta), path('*.settings') , emit: settings + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def list = adapterlist ? "--adapter-list ${adapterlist}" : "" + prefix = task.ext.prefix ?: "${meta.id}" + + if (meta.single_end) { + """ + AdapterRemoval \\ + --file1 $reads \\ + $args \\ + $list \\ + --basename ${prefix} \\ + --threads ${task.cpus} \\ + --seed 42 \\ + --gzip + + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. 
//g") + END_VERSIONS + """ + } else { + """ + AdapterRemoval \\ + --file1 ${reads[0]} \\ + --file2 ${reads[1]} \\ + $args \\ + $list \\ + --basename ${prefix} \\ + --threads $task.cpus \\ + --seed 42 \\ + --gzip + + ensure_fastq() { + if [ -f "\${1}" ]; then + mv "\${1}" "\${1::-3}.fastq.gz" + fi + + } + + ensure_fastq '${prefix}.truncated.gz' + ensure_fastq '${prefix}.discarded.gz' + ensure_fastq '${prefix}.pair1.truncated.gz' + ensure_fastq '${prefix}.pair2.truncated.gz' + ensure_fastq '${prefix}.collapsed.gz' + ensure_fastq '${prefix}.collapsed.truncated.gz' + ensure_fastq '${prefix}.paired.gz' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + adapterremoval: \$(AdapterRemoval --version 2>&1 | sed -e "s/AdapterRemoval ver. //g") + END_VERSIONS + """ + } + +} diff --git a/modules/nf-core/adapterremoval/meta.yml b/modules/nf-core/adapterremoval/meta.yml new file mode 100644 index 00000000..2deb613b --- /dev/null +++ b/modules/nf-core/adapterremoval/meta.yml @@ -0,0 +1,80 @@ +name: adapterremoval +description: Trim sequencing adapters and collapse overlapping reads +keywords: + - trimming + - adapters + - merging + - fastq +tools: + - adapterremoval: + description: The AdapterRemoval v2 tool for merging and clipping reads. + homepage: https://github.com/MikkelSchubert/adapterremoval + documentation: https://adapterremoval.readthedocs.io + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" + - adapterlist: + type: file + description: Optional text file containing list of adapters to look for for removal with one adapter per line. Otherwise will look for default adapters (see AdapterRemoval man page), or can be modified to remove user-specified adapters via ext.args. +output: + - singles_truncated: + type: file + description: | + Adapter trimmed FastQ files of either single-end reads, or singleton + 'orphaned' reads from merging of paired-end data (i.e., one of the pair + was lost due to filtering thresholds). + pattern: "*.truncated.fastq.gz" + - discarded: + type: file + description: | + Adapter trimmed FastQ files of reads that did not pass filtering + thresholds. + pattern: "*.discarded.fastq.gz" + - paired_truncated: + type: file + description: | + Adapter trimmed R{1,2} FastQ files of paired-end reads that did not merge + with their respective R{1,2} pair due to long templates. The respective pair + is stored in 'pair{1,2}_truncated'. + pattern: "*.pair{1,2}.truncated.fastq.gz" + - collapsed: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair but were not trimmed. + pattern: "*.collapsed.fastq.gz" + - collapsed_truncated: + type: file + description: | + Collapsed FastQ of paired-end reads that successfully merged with their + respective R1 pair and were trimmed of adapter due to sufficient overlap. 
+ pattern: "*.collapsed.truncated.fastq.gz" + - paired_interleaved: + type: file + description: | + Write paired-end reads to a single file, interleaving mate 1 and mate 2 reads + pattern: "*.paired.fastq.gz" + - settings: + type: file + description: AdapterRemoval log file + pattern: "*.settings" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@maxibor" + - "@jfy133" +maintainers: + - "@maxibor" + - "@jfy133" diff --git a/modules/nf-core/adapterremoval/tests/main.nf.test b/modules/nf-core/adapterremoval/tests/main.nf.test new file mode 100644 index 00000000..91c07b7e --- /dev/null +++ b/modules/nf-core/adapterremoval/tests/main.nf.test @@ -0,0 +1,120 @@ +nextflow_process { + + name "Test Process ADAPTERREMOVAL" + script "../main.nf" + process "ADAPTERREMOVAL" + + tag "modules" + tag "modules_nfcore" + tag "adapterremoval" + + test("single-end - sarscov2 - [fastq]") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true, collapse:false ], // meta map + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.singles_truncated, + process.out.settings, + process.out.versions).match() }, + ) + } + } + + test("paired-end - sarscov2 - [fastq]") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false, collapse:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.paired_truncated, + process.out.settings, + process.out.versions).match() }, + { assert process.out.discarded } + ) + } + + } + + test("paired-end collapse - sarscov2 - [fastq]") { + + options "--collapse" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.paired_truncated, + process.out.collapsed, + process.out.settings, + process.out.versions).match() }, + { assert process.out.discarded } + ) + } + + } + + test("paired-end adapterlist - sarscov2 - [fastq]") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false, collapse:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = file("https://github.com/nf-core/test-datasets/raw/modules/data/delete_me/adapterremoval/adapterremoval_adapterlist.txt", + checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.paired_truncated, + process.out.settings, + process.out.versions).match() }, + { assert process.out.discarded } + ) + } + + } + +} diff --git a/modules/nf-core/adapterremoval/tests/main.nf.test.snap b/modules/nf-core/adapterremoval/tests/main.nf.test.snap new file mode 100644 index 00000000..f890167a --- /dev/null +++ b/modules/nf-core/adapterremoval/tests/main.nf.test.snap @@ 
-0,0 +1,124 @@ +{ + "single-end - sarscov2 - [fastq]": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true, + "collapse": false + }, + "test.truncated.fastq.gz:md5,119d1b1a0a71ca6e080ff7c53ee0b690" + ] + ], + [ + [ + { + "id": "test", + "single_end": true, + "collapse": false + }, + "test.settings:md5,2fd3d5d703b63ba33a83021fccf25f77" + ] + ], + [ + "versions.yml:md5,00bcc9f0b864b96eeee21bc11773ee67" + ] + ], + "timestamp": "2023-12-09T19:19:36.429445996" + }, + "paired-end - sarscov2 - [fastq]": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false, + "collapse": false + }, + [ + "test.pair1.truncated.fastq.gz:md5,e3da014fbb9b428e952c62e8f0fb6402", + "test.pair2.truncated.fastq.gz:md5,2ebae722295ea66d84075a3b042e2b42" + ] + ] + ], + [ + [ + { + "id": "test", + "single_end": false, + "collapse": false + }, + "test.settings:md5,b8a451d3981b327f3fdb44f40ba2d6d1" + ] + ], + [ + "versions.yml:md5,00bcc9f0b864b96eeee21bc11773ee67" + ] + ], + "timestamp": "2023-12-09T19:19:42.88672676" + }, + "paired-end collapse - sarscov2 - [fastq]": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + [ + "test.pair1.truncated.fastq.gz:md5,e3da014fbb9b428e952c62e8f0fb6402", + "test.pair2.truncated.fastq.gz:md5,2ebae722295ea66d84075a3b042e2b42" + ] + ] + ], + [ + + ], + [ + [ + { + "id": "test", + "single_end": false + }, + "test.settings:md5,b8a451d3981b327f3fdb44f40ba2d6d1" + ] + ], + [ + "versions.yml:md5,00bcc9f0b864b96eeee21bc11773ee67" + ] + ], + "timestamp": "2023-12-09T19:19:49.299792876" + }, + "paired-end adapterlist - sarscov2 - [fastq]": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false, + "collapse": false + }, + [ + "test.pair1.truncated.fastq.gz:md5,e3da014fbb9b428e952c62e8f0fb6402", + "test.pair2.truncated.fastq.gz:md5,2ebae722295ea66d84075a3b042e2b42" + ] + ] + ], + [ + [ + { + "id": "test", + "single_end": false, + "collapse": false + }, + "test.settings:md5,36d47d9b40dbc178167d1ae0274d18f3" + ] + ], + [ + "versions.yml:md5,00bcc9f0b864b96eeee21bc11773ee67" + ] + ], + "timestamp": "2023-12-09T19:19:57.26567964" + } +} \ No newline at end of file diff --git a/modules/nf-core/adapterremoval/tests/tags.yml b/modules/nf-core/adapterremoval/tests/tags.yml new file mode 100644 index 00000000..d3375ec5 --- /dev/null +++ b/modules/nf-core/adapterremoval/tests/tags.yml @@ -0,0 +1,2 @@ +adapterremoval: + - "modules/nf-core/adapterremoval/**" diff --git a/modules/nf-core/bbmap/bbduk/main.nf b/modules/nf-core/bbmap/bbduk/main.nf new file mode 100644 index 00000000..001e27d4 --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/main.nf @@ -0,0 +1,43 @@ +process BBMAP_BBDUK { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::bbmap=39.01" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bbmap:39.01--h5c4e2a8_0': + 'biocontainers/bbmap:39.01--h5c4e2a8_0' }" + + input: + tuple val(meta), path(reads) + path contaminants + + output: + tuple val(meta), path('*.fastq.gz'), emit: reads + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def raw = meta.single_end ? "in=${reads[0]}" : "in1=${reads[0]} in2=${reads[1]}" + def trimmed = meta.single_end ? "out=${prefix}.fastq.gz" : "out1=${prefix}_1.fastq.gz out2=${prefix}_2.fastq.gz" + def contaminants_fa = contaminants ? 
"ref=$contaminants" : '' + """ + maxmem=\$(echo \"$task.memory\"| sed 's/ GB/g/g') + bbduk.sh \\ + -Xmx\$maxmem \\ + $raw \\ + $trimmed \\ + threads=$task.cpus \\ + $args \\ + $contaminants_fa \\ + &> ${prefix}.bbduk.log + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset") + END_VERSIONS + """ +} diff --git a/modules/nf-core/bbmap/bbduk/meta.yml b/modules/nf-core/bbmap/bbduk/meta.yml new file mode 100644 index 00000000..c1719918 --- /dev/null +++ b/modules/nf-core/bbmap/bbduk/meta.yml @@ -0,0 +1,52 @@ +name: bbmap_bbduk +description: Adapter and quality trimming of sequencing reads +keywords: + - trimming + - adapter trimming + - quality trimming + - fastq +tools: + - bbmap: + description: BBMap is a short read aligner, as well as various other bioinformatic tools. + homepage: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + documentation: https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide/ + + licence: ["UC-LBL license (see package)"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - contaminants: + type: file + description: | + Reference files containing adapter and/or contaminant sequences for sequence kmer matching + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified fastq reads + pattern: "*fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: Bbduk log file + pattern: "*bbduk.log" + +authors: + - "@MGordon09" diff --git a/modules/nf-core/bowtie2/align/main.nf b/modules/nf-core/bowtie2/align/main.nf new file mode 100644 index 00000000..a77114d2 --- /dev/null +++ b/modules/nf-core/bowtie2/align/main.nf @@ -0,0 +1,94 @@ +process BOWTIE2_ALIGN { + tag "$meta.id" + label "process_high" + + conda "bioconda::bowtie2=2.4.4 bioconda::samtools=1.16.1 conda-forge::pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:a0ffedb52808e102887f6ce600d092675bf3528a-0' : + 'biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:a0ffedb52808e102887f6ce600d092675bf3528a-0' }" + + input: + tuple val(meta) , path(reads) + tuple val(meta2), path(index) + val save_unaligned + val sort_bam + + output: + tuple val(meta), path("*.{bam,sam}"), emit: aligned + tuple val(meta), path("*.log") , emit: log + tuple val(meta), path("*fastq.gz") , emit: fastq, optional:true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def args2 = task.ext.args2 ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + + def unaligned = "" + def reads_args = "" + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : "" + reads_args = "-U ${reads}" + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : "" + reads_args = "-1 ${reads[0]} -2 ${reads[1]}" + } + + def samtools_command = sort_bam ? 
'sort' : 'view' + def extension_pattern = /(--output-fmt|-O)+\s+(\S+)/ + def extension = (args2 ==~ extension_pattern) ? (args2 =~ extension_pattern)[0][2].toLowerCase() : "bam" + + """ + INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"` + [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/\\.rev.1.bt2l\$//"` + [ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1 + + bowtie2 \\ + -x \$INDEX \\ + $reads_args \\ + --threads $task.cpus \\ + $unaligned \\ + $args \\ + 2> ${prefix}.bowtie2.log \\ + | samtools $samtools_command $args2 --threads $task.cpus -o ${prefix}.${extension} - + + if [ -f ${prefix}.unmapped.fastq.1.gz ]; then + mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz + fi + + if [ -f ${prefix}.unmapped.fastq.2.gz ]; then + mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + + stub: + def args2 = task.ext.args2 ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + def extension_pattern = /(--output-fmt|-O)+\s+(\S+)/ + def extension = (args2 ==~ extension_pattern) ? (args2 =~ extension_pattern)[0][2].toLowerCase() : "bam" + + """ + touch ${prefix}.${extension} + touch ${prefix}.bowtie2.log + touch ${prefix}.unmapped_1.fastq.gz + touch ${prefix}.unmapped_2.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ + +} diff --git a/modules/nf-core/bowtie2/align/meta.yml b/modules/nf-core/bowtie2/align/meta.yml new file mode 100644 index 00000000..60d04c12 --- /dev/null +++ b/modules/nf-core/bowtie2/align/meta.yml @@ -0,0 +1,67 @@ +name: bowtie2_align +description: Align reads to a reference genome using bowtie2 +keywords: + - align + - map + - fasta + - fastq + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test', single_end:false ] + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.ebwt" + - save_unaligned: + type: boolean + description: | + Save reads that do not map to the reference (true) or discard them (false) + (default: false) + - sort_bam: + type: boolean + description: use samtools sort (true) or samtools view (false) + pattern: "true or false" +output: + - aligned: + type: file + description: Output BAM/SAM file containing read alignments + pattern: "*.{bam,sam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Unaligned FastQ files + pattern: "*.fastq.gz" + - log: + type: file + description: Aligment log + pattern: "*.log" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/bowtie2/build/main.nf b/modules/nf-core/bowtie2/build/main.nf new file mode 100644 index 00000000..069d9c12 --- /dev/null +++ b/modules/nf-core/bowtie2/build/main.nf @@ -0,0 +1,42 @@ +process BOWTIE2_BUILD { + tag "$fasta" + label 'process_high' + + conda "bioconda::bowtie2=2.4.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bowtie2:2.4.4--py39hbb4e92a_0' : + 'biocontainers/bowtie2:2.4.4--py39hbb4e92a_0' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path('bowtie2') , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + mkdir bowtie2 + bowtie2-build $args --threads $task.cpus $fasta bowtie2/${fasta.baseName} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ + + stub: + """ + mkdir bowtie2 + touch bowtie2/${fasta.baseName}.{1..4}.bt2 + touch bowtie2/${fasta.baseName}.rev.{1,2}.bt2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/bowtie2/build/meta.yml b/modules/nf-core/bowtie2/build/meta.yml new file mode 100644 index 00000000..0240224d --- /dev/null +++ b/modules/nf-core/bowtie2/build/meta.yml @@ -0,0 +1,43 @@ +name: bowtie2_build +description: Builds bowtie index for reference genome +keywords: + - build + - index + - fasta + - genome + - reference +tools: + - bowtie2: + description: | + Bowtie 2 is an ultrafast and memory-efficient tool for aligning + sequencing reads to long reference sequences. + homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml + documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml + doi: 10.1038/nmeth.1923 + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input genome fasta file +output: + - meta: + type: map + description: | + Groovy Map containing reference information + e.g. 
[ id:'test', single_end:false ] + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.bt2" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/bracken/bracken/main.nf b/modules/nf-core/bracken/bracken/main.nf new file mode 100644 index 00000000..ec49c10e --- /dev/null +++ b/modules/nf-core/bracken/bracken/main.nf @@ -0,0 +1,43 @@ +process BRACKEN_BRACKEN { + tag "$meta.id" + label 'process_low' + + // WARN: Version information not provided by tool on CLI. + // Please update version string below when bumping container versions. + conda "bioconda::bracken=2.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bracken:2.7--py39hc16433a_0': + 'biocontainers/bracken:2.7--py39hc16433a_0' }" + + input: + tuple val(meta), path(kraken_report) + path database + + output: + tuple val(meta), path(bracken_report) , emit: reports + tuple val(meta), path("*bracken_species.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: "" + def prefix = task.ext.prefix ?: "${meta.id}" + bracken_report = "${prefix}.tsv" + // WARN: Version information not provided by tool on CLI. + // Please update version string below when bumping container versions. + def VERSION = '2.7' + """ + bracken \\ + ${args} \\ + -d '${database}' \\ + -i '${kraken_report}' \\ + -o '${bracken_report}' + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bracken: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/bracken/bracken/meta.yml b/modules/nf-core/bracken/bracken/meta.yml new file mode 100644 index 00000000..43f0455a --- /dev/null +++ b/modules/nf-core/bracken/bracken/meta.yml @@ -0,0 +1,52 @@ +name: bracken_bracken +description: Re-estimate taxonomic abundance of metagenomic samples analyzed by kraken. +keywords: + - bracken + - metagenomics + - abundance + - kraken2 +tools: + - bracken: + description: Bracken (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. + homepage: https://ccb.jhu.edu/software/bracken/ + documentation: https://ccb.jhu.edu/software/bracken/index.shtml?t=manual + tool_dev_url: https://github.com/jenniferlu717/Bracken + doi: "10.7717/peerj-cs.104" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - kraken_report: + type: file + description: TSV file with six columns coming from kraken2 output + pattern: "*.{tsv}" + - database: + type: file + description: Directory containing the kraken2/Bracken files for analysis + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reports: + type: file + description: TSV output report of the re-estimated abundances + pattern: "*.{tsv}" + - txt: + type: file + description: TXT file of bracken corrected results of Kraken2 report output + pattern: "*.txt" + +authors: + - "@Midnighter" diff --git a/modules/nf-core/bracken/combinebrackenoutputs/main.nf b/modules/nf-core/bracken/combinebrackenoutputs/main.nf new file mode 100644 index 00000000..6a3a22e2 --- /dev/null +++ b/modules/nf-core/bracken/combinebrackenoutputs/main.nf @@ -0,0 +1,37 @@ +process BRACKEN_COMBINEBRACKENOUTPUTS { + tag "$meta.id" + label 'process_low' + + conda "bioconda::bracken=2.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bracken:2.7--py39hc16433a_0': + 'biocontainers/bracken:2.7--py39hc16433a_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + // WARN: Version information not provided by tool on CLI. + // Please update version string below when bumping container versions. + def VERSION = '2.7' + """ + combine_bracken_outputs.py \\ + $args \\ + --files ${input} \\ + -o ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + combine_bracken_output: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/bracken/combinebrackenoutputs/meta.yml b/modules/nf-core/bracken/combinebrackenoutputs/meta.yml new file mode 100644 index 00000000..9ad53859 --- /dev/null +++ b/modules/nf-core/bracken/combinebrackenoutputs/meta.yml @@ -0,0 +1,41 @@ +name: "bracken_combinebrackenoutputs" +description: Combine output of metagenomic samples analyzed by bracken. +keywords: + - sort +tools: + - "bracken": + description: Bracken (Bayesian Reestimation of Abundance with KrakEN) is a highly accurate statistical method that computes the abundance of species in DNA sequences from a metagenomics sample. + homepage: https://ccb.jhu.edu/software/bracken/ + documentation: https://ccb.jhu.edu/software/bracken/index.shtml?t=manual + tool_dev_url: https://github.com/jenniferlu717/Bracken + doi: "10.7717/peerj-cs.104" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: List of output files from bracken + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Combined output in table format + pattern: "*.txt" + +authors: + - "@jfy133" diff --git a/modules/nf-core/cat/fastq/main.nf b/modules/nf-core/cat/fastq/main.nf new file mode 100644 index 00000000..5021e6fc --- /dev/null +++ b/modules/nf-core/cat/fastq/main.nf @@ -0,0 +1,80 @@ +process CAT_FASTQ { + tag "$meta.id" + label 'process_single' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(reads, stageAs: "input*/*") + + output: + tuple val(meta), path("*.merged.fastq.gz"), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size >= 1) { + """ + cat ${readList.join(' ')} > ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size >= 2) { + def read1 = [] + def read2 = [] + readList.eachWithIndex{ v, ix -> ( ix & 1 ? read2 : read1 ) << v } + """ + cat ${read1.join(' ')} > ${prefix}_1.merged.fastq.gz + cat ${read2.join(' ')} > ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def readList = reads instanceof List ? reads.collect{ it.toString() } : [reads.toString()] + if (meta.single_end) { + if (readList.size > 1) { + """ + touch ${prefix}.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } else { + if (readList.size > 2) { + """ + touch ${prefix}_1.merged.fastq.gz + touch ${prefix}_2.merged.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cat: \$(echo \$(cat --version 2>&1) | sed 's/^.*coreutils) //; s/ .*\$//') + END_VERSIONS + """ + } + } + +} diff --git a/modules/nf-core/cat/fastq/meta.yml b/modules/nf-core/cat/fastq/meta.yml new file mode 100644 index 00000000..8a39e309 --- /dev/null +++ b/modules/nf-core/cat/fastq/meta.yml @@ -0,0 +1,40 @@ +name: cat_fastq +description: Concatenates fastq files +keywords: + - cat + - fastq + - concatenate +tools: + - cat: + description: | + The cat utility reads files sequentially, writing them to the standard output. + documentation: https://www.gnu.org/software/coreutils/manual/html_node/cat-invocation.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files to be concatenated. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: Merged fastq file + pattern: "*.{merged.fastq.gz}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/centrifuge/centrifuge/environment.yml b/modules/nf-core/centrifuge/centrifuge/environment.yml new file mode 100644 index 00000000..cf34dc0e --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/environment.yml @@ -0,0 +1,7 @@ +name: centrifuge_centrifuge +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::centrifuge=1.0.4.1 diff --git a/modules/nf-core/centrifuge/centrifuge/main.nf b/modules/nf-core/centrifuge/centrifuge/main.nf new file mode 100644 index 00000000..d9a5653d --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/main.nf @@ -0,0 +1,91 @@ +process CENTRIFUGE_CENTRIFUGE { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4.1--hdcf5f25_1' : + 'biocontainers/centrifuge:1.0.4.1--hdcf5f25_1' }" + + input: + tuple val(meta), path(reads) + path db + val save_unaligned + val save_aligned + + output: + tuple val(meta), path('*report.txt') , emit: report + tuple val(meta), path('*results.txt') , emit: results + tuple val(meta), path('*.{sam,tab}') , optional: true, emit: sam + tuple val(meta), path('*.mapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_mapped + tuple val(meta), path('*.unmapped.fastq{,.1,.2}.gz') , optional: true, emit: fastq_unmapped + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def unaligned = '' + def aligned = '' + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' + } else { + unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' + } + """ + ## we add "-no-name ._" to ensure silly Mac OSX metafiles files aren't included + db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/\\.1.cf\$//'` + + ## make a directory for placing the pipe files in somewhere other than default /tmp + ## otherwise get pipefile name clashes when multiple centrifuge runs on same node + ## use /tmp at the same time + mkdir ./temp + + centrifuge \\ + -x \$db_name \\ + --temp-directory ./temp \\ + -p $task.cpus \\ + $paired \\ + --report-file ${prefix}.report.txt \\ + -S ${prefix}.results.txt \\ + $unaligned \\ + $aligned \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}" + def unaligned = '' + def aligned = '' + if (meta.single_end) { + unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-gz ${prefix}.mapped.fastq.gz" : '' + } else { + unaligned = save_unaligned ? 
"--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' + aligned = save_aligned ? "--al-conc-gz ${prefix}.mapped.fastq.gz" : '' + } + """ + touch ${prefix}.report.txt + touch ${prefix}.results.txt + touch ${prefix}.sam + echo | gzip -n > ${prefix}.unmapped.fastq.gz + echo | gzip -n > ${prefix}.mapped.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/centrifuge/centrifuge/meta.yml b/modules/nf-core/centrifuge/centrifuge/meta.yml new file mode 100644 index 00000000..a06104e1 --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/meta.yml @@ -0,0 +1,75 @@ +name: centrifuge_centrifuge +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Path to directory containing centrifuge database files + - save_unaligned: + type: boolean + description: If true unmapped fastq files are saved + - save_aligned: + type: boolean + description: If true mapped fastq files are saved +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: | + File containing a classification summary + pattern: "*.{report.txt}" + - results: + type: file + description: | + File containing classification results + pattern: "*.{results.txt}" + - sam: + type: file + description: | + Optional output file containing read alignments (SAM format )or a table of per-read hit information (TAB)s + pattern: "*.{sam,tab}" + - fastq_unmapped: + type: file + description: Unmapped fastq files + pattern: "*.unmapped.fastq.gz" + - fastq_mapped: + type: file + description: Mapped fastq files + pattern: "*.mapped.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" +maintainers: + - "@sofstam" + - "@jfy133" + - "@sateeshperi" diff --git a/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test b/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test new file mode 100644 index 00000000..d83b522a --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process CENTRIFUGE_CENTRIFUGE" + script "../main.nf" + process "CENTRIFUGE_CENTRIFUGE" + + tag "modules" + tag "modules_nfcore" + tag "centrifuge" + tag "centrifuge/centrifuge" + tag "untar" + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = db = [ [], file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/minigut_cf.tar.gz', checkIfExists: true) ] + """ + } + } + } + + test("sarscov2_fastq_se") { + + when { + process { + """ + input[0] = [ [id: 'test', single_end: true], file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) 
] + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = true + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.report[0][1]).name, + file(process.out.results[0][1]).name, + file(process.out.fastq_mapped[0][1][0]).name, + file(process.out.fastq_unmapped[0][1][0]).name, + ).match() } + ) + } + + } + + test("sarscov2_fastq_pe") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ] + ] + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = true + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.report[0][1]).name, + file(process.out.results[0][1]).name, + file(process.out.fastq_mapped[0][1][0]).name, + file(process.out.fastq_unmapped[0][1][0]).name, + ).match() } + ) + } + + } + + test("sarscov2_fastq_se_stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [id: 'test'], file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = true + input[3] = true + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test.snap b/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test.snap new file mode 100644 index 00000000..f8a2ef7b --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/tests/main.nf.test.snap @@ -0,0 +1,125 @@ +{ + "sarscov2_fastq_se_stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.report.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.results.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + [ + { + "id": "test" + }, + "test.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "3": [ + [ + { + "id": "test" + }, + "test.mapped.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "4": [ + [ + { + "id": "test" + }, + "test.unmapped.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "5": [ + "versions.yml:md5,1ce028d9f968eca6df31586fe3b77c84" + ], + "fastq_mapped": [ + [ + { + "id": "test" + }, + "test.mapped.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "fastq_unmapped": [ + [ + { + "id": "test" + }, + "test.unmapped.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "report": [ + [ + { + "id": "test" + }, + "test.report.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "results": [ + [ + { + "id": "test" + }, + "test.results.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "sam": [ + [ + { + "id": "test" + }, + "test.sam:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,1ce028d9f968eca6df31586fe3b77c84" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T07:47:36.886757827" + }, + "sarscov2_fastq_se": { + "content": [ + "test.report.txt", + "test.results.txt", + "", + "" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T08:22:31.470316024" + }, + "sarscov2_fastq_pe": { + "content": [ + "test.report.txt", + "test.results.txt", + "test.mapped.fastq.1.gz", + "test.unmapped.fastq.1.gz" + ], + "meta": { + "nf-test": "0.8.4", + 
"nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T08:22:48.866073154" + } +} \ No newline at end of file diff --git a/modules/nf-core/centrifuge/centrifuge/tests/tags.yml b/modules/nf-core/centrifuge/centrifuge/tests/tags.yml new file mode 100644 index 00000000..53444cd2 --- /dev/null +++ b/modules/nf-core/centrifuge/centrifuge/tests/tags.yml @@ -0,0 +1,2 @@ +centrifuge/centrifuge: + - "modules/nf-core/centrifuge/centrifuge/**" diff --git a/modules/nf-core/centrifuge/kreport/environment.yml b/modules/nf-core/centrifuge/kreport/environment.yml new file mode 100644 index 00000000..5c8fb451 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/environment.yml @@ -0,0 +1,7 @@ +name: centrifuge_kreport +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::centrifuge=1.0.4.1 diff --git a/modules/nf-core/centrifuge/kreport/main.nf b/modules/nf-core/centrifuge/kreport/main.nf new file mode 100644 index 00000000..25eb7167 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/main.nf @@ -0,0 +1,45 @@ +process CENTRIFUGE_KREPORT { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/centrifuge:1.0.4.1--hdcf5f25_1' : + 'biocontainers/centrifuge:1.0.4.1--hdcf5f25_1' }" + + input: + tuple val(meta), path(report) + path db + + output: + tuple val(meta), path('*.txt'), emit: kreport + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + db_name=`find -L ${db} -name "*.1.cf" -not -name "._*" | sed 's/\\.1.cf\$//'` + centrifuge-kreport -x \$db_name ${report} > ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + centrifuge: \$( centrifuge --version | sed -n 1p | sed 's/^.*centrifuge-class version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/centrifuge/kreport/meta.yml b/modules/nf-core/centrifuge/kreport/meta.yml new file mode 100644 index 00000000..5641152b --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/meta.yml @@ -0,0 +1,51 @@ +name: "centrifuge_kreport" +description: Creates Kraken-style reports from centrifuge out files +keywords: + - classify + - metagenomics + - fastq + - db + - report + - kraken +tools: + - centrifuge: + description: Centrifuge is a classifier for metagenomic sequences. + homepage: https://ccb.jhu.edu/software/centrifuge/ + documentation: https://ccb.jhu.edu/software/centrifuge/manual.shtml + doi: 10.1101/gr.210641.116 + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - report: + type: file + description: File containing the centrifuge classification report + pattern: "*.{txt}" + - db: + type: directory + description: Path to directory containing centrifuge database files +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - kreport: + type: file + description: | + File containing kraken-style report from centrifuge + out files. + pattern: "*.{txt}" +authors: + - "@sofstam" + - "@jfy133" +maintainers: + - "@sofstam" + - "@jfy133" diff --git a/modules/nf-core/centrifuge/kreport/tests/main.nf.test b/modules/nf-core/centrifuge/kreport/tests/main.nf.test new file mode 100644 index 00000000..6347bd7c --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/tests/main.nf.test @@ -0,0 +1,81 @@ +// nf-core modules test centrifuge/kreport +nextflow_process { + + name "Test Process CENTRIFUGE_KREPORT" + script "../main.nf" + process "CENTRIFUGE_KREPORT" + + tag "modules" + tag "modules_nfcore" + tag "centrifuge" + tag "centrifuge/centrifuge" + tag "centrifuge/kreport" + tag "untar" + + setup { + run("UNTAR") { + script "../../../untar/main.nf" + process { + """ + input[0] = db = [ [], file('https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/delete_me/minigut_cf.tar.gz', checkIfExists: true) ] + """ + } + } + run("CENTRIFUGE_CENTRIFUGE") { + script "../../../centrifuge/centrifuge/main.nf" + process { + """ + input[0] = [ [id: 'test', single_end: true], file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ] + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = true + input[3] = true + """ + } + } + } + + test("sarscov2_fastq_se") { + + when { + process { + """ + input[0] = CENTRIFUGE_CENTRIFUGE.out.results + input[1] = UNTAR.out.untar.map{it[1]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + file(process.out.kreport[0][1]).name, + ).match() } + ) + } + + } + + test("sarscov2 - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = CENTRIFUGE_CENTRIFUGE.out.results + input[1] = UNTAR.out.untar.map{it[1]} + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/centrifuge/kreport/tests/main.nf.test.snap b/modules/nf-core/centrifuge/kreport/tests/main.nf.test.snap new file mode 100644 index 00000000..4e0aaa79 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/tests/main.nf.test.snap @@ -0,0 +1,47 @@ +{ + "sarscov2 - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,43c766a19f2edf7e05d1a2a0b1816b13" + ], + "kreport": [ + [ + { + "id": "test", + "single_end": true + }, + "test.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,43c766a19f2edf7e05d1a2a0b1816b13" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T06:18:36.794405448" + }, + "sarscov2_fastq_se": { + "content": [ + "test.txt" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-02T06:28:20.461891873" + } +} \ No newline at end of file diff --git a/modules/nf-core/centrifuge/kreport/tests/tags.yml b/modules/nf-core/centrifuge/kreport/tests/tags.yml new file mode 100644 index 00000000..a3823d76 --- /dev/null +++ b/modules/nf-core/centrifuge/kreport/tests/tags.yml @@ -0,0 +1,2 @@ +centrifuge/kreport: + - "modules/nf-core/centrifuge/kreport/**" diff --git a/modules/nf-core/diamond/blastx/main.nf b/modules/nf-core/diamond/blastx/main.nf new file mode 100644 index 
00000000..e08fb0d9 --- /dev/null +++ b/modules/nf-core/diamond/blastx/main.nf @@ -0,0 +1,68 @@ +process DIAMOND_BLASTX { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::diamond=2.0.15" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/diamond:2.0.15--hb97b32f_0' : + 'biocontainers/diamond:2.0.15--hb97b32f_0' }" + + input: + tuple val(meta), path(fasta) + path db + val out_ext + val blast_columns + + output: + tuple val(meta), path('*.blast'), optional: true, emit: blast + tuple val(meta), path('*.xml') , optional: true, emit: xml + tuple val(meta), path('*.txt') , optional: true, emit: txt + tuple val(meta), path('*.daa') , optional: true, emit: daa + tuple val(meta), path('*.sam') , optional: true, emit: sam + tuple val(meta), path('*.tsv') , optional: true, emit: tsv + tuple val(meta), path('*.paf') , optional: true, emit: paf + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def columns = blast_columns ? "${blast_columns}" : '' + switch ( out_ext ) { + case "blast": outfmt = 0; break + case "xml": outfmt = 5; break + case "txt": outfmt = 6; break + case "daa": outfmt = 100; break + case "sam": outfmt = 101; break + case "tsv": outfmt = 102; break + case "paf": outfmt = 103; break + default: + outfmt = '6'; + out_ext = 'txt'; + log.warn("Unknown output file format provided (${out_ext}): selecting DIAMOND default of tabular BLAST output (txt)"); + break + } + """ + DB=`find -L ./ -name "*.dmnd" | sed 's/\\.dmnd\$//'` + + diamond \\ + blastx \\ + --threads $task.cpus \\ + --db \$DB \\ + --query $fasta \\ + --outfmt ${outfmt} ${columns} \\ + $args \\ + --out ${prefix}.${out_ext} \\ + --log + + mv diamond.log ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS + """ +} diff --git a/modules/nf-core/diamond/blastx/meta.yml b/modules/nf-core/diamond/blastx/meta.yml new file mode 100644 index 00000000..a2a6013d --- /dev/null +++ b/modules/nf-core/diamond/blastx/meta.yml @@ -0,0 +1,81 @@ +name: diamond_blastx +description: Queries a DIAMOND database using blastx mode +keywords: + - fasta + - diamond + - blastx + - DNA sequence +tools: + - diamond: + description: Accelerated BLAST compatible local sequence aligner + homepage: https://github.com/bbuchfink/diamond + documentation: https://github.com/bbuchfink/diamond/wiki + tool_dev_url: https://github.com/bbuchfink/diamond + doi: "10.1038/s41592-021-01101-x" + licence: ["GPL v3.0"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Input fasta file containing query sequences + pattern: "*.{fa,fasta}" + - db: + type: directory + description: Directory containing the nucelotide blast database + pattern: "*" + - out_ext: + type: string + description: | + Specify the type of output file to be generated. `blast` corresponds to + BLAST pairwise format. `xml` corresponds to BLAST xml format. + `txt` corresponds to to BLAST tabular format. `tsv` corresponds to + taxonomic classification format. 
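+ # Note: main.nf maps these values to DIAMOND --outfmt 0 (blast), 5 (xml), 6 (txt), 100 (daa),
+ # 101 (sam), 102 (tsv) and 103 (paf); any unrecognised value falls back to tabular (txt) output.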
+ pattern: "blast|xml|txt|daa|sam|tsv|paf" + +output: + - blast: + type: file + description: File containing blastp hits + pattern: "*.{blast}" + - xml: + type: file + description: File containing blastp hits + pattern: "*.{xml}" + - txt: + type: file + description: File containing hits in tabular BLAST format. + pattern: "*.{txt}" + - daa: + type: file + description: File containing hits DAA format + pattern: "*.{daa}" + - sam: + type: file + description: File containing aligned reads in SAM format + pattern: "*.{sam}" + - tsv: + type: file + description: Tab separated file containing taxonomic classification of hits + pattern: "*.{tsv}" + - paf: + type: file + description: File containing aligned reads in pairwise mapping format format + pattern: "*.{paf}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - log: + type: file + description: Log file containing stdout information + pattern: "*.{log}" + +authors: + - "@spficklin" + - "@jfy133" + - "@mjamy" diff --git a/modules/nf-core/falco/falco.diff b/modules/nf-core/falco/falco.diff new file mode 100644 index 00000000..4c726b9a --- /dev/null +++ b/modules/nf-core/falco/falco.diff @@ -0,0 +1,16 @@ +Changes in module 'nf-core/falco' +--- modules/nf-core/falco/main.nf ++++ modules/nf-core/falco/main.nf +@@ -33,7 +33,9 @@ + """ + } else { + """ +- falco $args --threads $task.cpus ${reads} ++ [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz ++ [ ! -f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz ++ falco $args --threads $task.cpus ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + +************************************************************ diff --git a/modules/nf-core/falco/main.nf b/modules/nf-core/falco/main.nf new file mode 100644 index 00000000..29ccc4bc --- /dev/null +++ b/modules/nf-core/falco/main.nf @@ -0,0 +1,59 @@ +process FALCO { + tag "$meta.id" + label 'process_single' + + + conda "bioconda::falco=1.2.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/falco:1.2.1--h867801b_3': + 'biocontainers/falco:1.2.1--h867801b_3' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.html"), emit: html + tuple val(meta), path("*.txt") , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + if ( reads.toList().size() == 1 ) { + """ + falco $args --threads $task.cpus ${reads} -D ${prefix}_data.txt -S ${prefix}_summary.txt -R ${prefix}_report.html + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } else { + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -s ${reads[0]} ${prefix}_1.fastq.gz + [ ! 
-f ${prefix}_2.fastq.gz ] && ln -s ${reads[1]} ${prefix}_2.fastq.gz + falco $args --threads $task.cpus ${prefix}_1.fastq.gz ${prefix}_2.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco:\$( falco --version | sed -e "s/falco//g" ) + END_VERSIONS + """ + } + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_data.txt + touch ${prefix}_fastqc_data.html + touch ${prefix}_summary.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + falco: \$( falco --version | sed -e "s/falco v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/falco/meta.yml b/modules/nf-core/falco/meta.yml new file mode 100644 index 00000000..63846491 --- /dev/null +++ b/modules/nf-core/falco/meta.yml @@ -0,0 +1,51 @@ +name: falco +description: Run falco on sequenced reads +keywords: + - quality control + - qc + - adapters + - fastq +tools: + - fastqc: + description: "falco is a drop-in C++ implementation of FastQC to assess the quality of sequence reads." + + homepage: "https://falco.readthedocs.io/" + documentation: "https://falco.readthedocs.io/" + + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - html: + type: file + description: FastQC like report + pattern: "*_{fastqc_report.html}" + - txt: + type: file + description: falco report data + pattern: "*_{data.txt}" + - txt: + type: file + description: falco summary file + pattern: "*_{summary.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@lucacozzuto" diff --git a/modules/nf-core/fastp/main.nf b/modules/nf-core/fastp/main.nf new file mode 100644 index 00000000..831b7f12 --- /dev/null +++ b/modules/nf-core/fastp/main.nf @@ -0,0 +1,102 @@ +process FASTP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::fastp=0.23.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/fastp:0.23.4--h5f740d0_0' : + 'biocontainers/fastp:0.23.4--h5f740d0_0' }" + + input: + tuple val(meta), path(reads) + path adapter_fasta + val save_trimmed_fail + val save_merged + + output: + tuple val(meta), path('*.fastp.fastq.gz') , optional:true, emit: reads + tuple val(meta), path('*.json') , emit: json + tuple val(meta), path('*.html') , emit: html + tuple val(meta), path('*.log') , emit: log + path "versions.yml" , emit: versions + tuple val(meta), path('*.fail.fastq.gz') , optional:true, emit: reads_fail + tuple val(meta), path('*.merged.fastq.gz'), optional:true, emit: reads_merged + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def adapter_list = adapter_fasta ? "--adapter_fasta ${adapter_fasta}" : "" + def fail_fastq = save_trimmed_fail && meta.single_end ? "--failed_out ${prefix}.fail.fastq.gz" : save_trimmed_fail && !meta.single_end ? "--unpaired1 ${prefix}_1.fail.fastq.gz --unpaired2 ${prefix}_2.fail.fastq.gz" : '' + // Added soft-links to original fastqs for consistent naming in MultiQC + // Use single ended for interleaved. Add --interleaved_in in config. 
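+ // Three branches follow: interleaved input (supplied as single-end and enabled by adding --interleaved_in to ext.args),
+ // plain single-end input, and paired-end input with optional read merging (-m --merged_out) when save_merged is true.
+ // Each branch runs fastp on the soft-linked ${prefix}* file names so MultiQC reports use consistent sample names.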
+ if ( task.ext.args?.contains('--interleaved_in') ) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --stdout \\ + --in1 ${prefix}.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log \\ + | gzip -c > ${prefix}.fastp.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else if (meta.single_end) { + """ + [ ! -f ${prefix}.fastq.gz ] && ln -sf $reads ${prefix}.fastq.gz + + fastp \\ + --in1 ${prefix}.fastq.gz \\ + --out1 ${prefix}.fastp.fastq.gz \\ + --thread $task.cpus \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } else { + def merge_fastq = save_merged ? "-m --merged_out ${prefix}.merged.fastq.gz" : '' + """ + [ ! -f ${prefix}_1.fastq.gz ] && ln -sf ${reads[0]} ${prefix}_1.fastq.gz + [ ! -f ${prefix}_2.fastq.gz ] && ln -sf ${reads[1]} ${prefix}_2.fastq.gz + fastp \\ + --in1 ${prefix}_1.fastq.gz \\ + --in2 ${prefix}_2.fastq.gz \\ + --out1 ${prefix}_1.fastp.fastq.gz \\ + --out2 ${prefix}_2.fastp.fastq.gz \\ + --json ${prefix}.fastp.json \\ + --html ${prefix}.fastp.html \\ + $adapter_list \\ + $fail_fastq \\ + $merge_fastq \\ + --thread $task.cpus \\ + --detect_adapter_for_pe \\ + $args \\ + 2> ${prefix}.fastp.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + fastp: \$(fastp --version 2>&1 | sed -e "s/fastp //g") + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/fastp/meta.yml b/modules/nf-core/fastp/meta.yml new file mode 100644 index 00000000..197ea7ca --- /dev/null +++ b/modules/nf-core/fastp/meta.yml @@ -0,0 +1,73 @@ +name: fastp +description: Perform adapter/quality trimming on sequencing reads +keywords: + - trimming + - quality control + - fastq +tools: + - fastp: + description: | + A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. + documentation: https://github.com/OpenGene/fastp + doi: 10.1093/bioinformatics/bty560 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information. Use 'single_end: true' to specify single ended or interleaved FASTQs. Use 'single_end: false' for paired-end reads. + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. If you wish to run interleaved paired-end data, supply as single-end data + but with `--interleaved_in` in your `modules.conf`'s `ext.args` for the module. + - adapter_fasta: + type: file + description: File in FASTA format containing possible adapters to remove. + pattern: "*.{fasta,fna,fas,fa}" + - save_trimmed_fail: + type: boolean + description: Specify true to save files that failed to pass trimming thresholds ending in `*.fail.fastq.gz` + - save_merged: + type: boolean + description: Specify true to save all merged reads to the a file ending in `*.merged.fastq.gz` + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - reads: + type: file + description: The trimmed/modified/unmerged fastq reads + pattern: "*fastp.fastq.gz" + - json: + type: file + description: Results in JSON format + pattern: "*.json" + - html: + type: file + description: Results in HTML format + pattern: "*.html" + - log: + type: file + description: fastq log file + pattern: "*.log" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads_fail: + type: file + description: Reads the failed the preprocessing + pattern: "*fail.fastq.gz" + - reads_merged: + type: file + description: Reads that were successfully merged + pattern: "*.{merged.fastq.gz}" +authors: + - "@drpatelh" + - "@kevinmenden" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test index 70edae4d..83586e4a 100644 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ b/modules/nf-core/fastqc/tests/main.nf.test @@ -94,6 +94,32 @@ nextflow_process { } } + test("sarscov2 paired-end [bam]") { + + when { + process { + """ + input[0] = [ + [id: 'test', single_end: false], // meta map + file(params.test_data['sarscov2']['illumina']['test_paired_end_sorted_bam'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + + { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, + { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, + { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, + + { assert snapshot(process.out.versions).match("versions") } + ) + } + } + test("sarscov2 paired-end [bam]") { when { diff --git a/modules/nf-core/filtlong/environment.yml b/modules/nf-core/filtlong/environment.yml new file mode 100644 index 00000000..0fb7e8b4 --- /dev/null +++ b/modules/nf-core/filtlong/environment.yml @@ -0,0 +1,7 @@ +name: filtlong +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::filtlong=0.2.1 diff --git a/modules/nf-core/filtlong/main.nf b/modules/nf-core/filtlong/main.nf new file mode 100644 index 00000000..627247fe --- /dev/null +++ b/modules/nf-core/filtlong/main.nf @@ -0,0 +1,39 @@ +process FILTLONG { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/filtlong:0.2.1--h9a82719_0' : + 'biocontainers/filtlong:0.2.1--h9a82719_0' }" + + input: + tuple val(meta), path(shortreads), path(longreads) + + output: + tuple val(meta), path("*.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def short_reads = !shortreads ? "" : meta.single_end ? "-1 $shortreads" : "-1 ${shortreads[0]} -2 ${shortreads[1]}" + if ("$longreads" == "${prefix}.fastq.gz") error "Longread FASTQ input and output names are the same, set prefix in module configuration to disambiguate!" 
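+ // Short reads, when supplied, are only used by Filtlong as an external reference for scoring the long reads;
+ // the filtered long reads themselves are written to ${prefix}.fastq.gz. stderr is duplicated via tee so the
+ // filtering summary is captured in ${prefix}.log while still being passed through on stderr.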
+ """ + filtlong \\ + $short_reads \\ + $args \\ + $longreads \\ + 2> >(tee ${prefix}.log >&2) \\ + | gzip -n > ${prefix}.fastq.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + filtlong: \$( filtlong --version | sed -e "s/Filtlong v//g" ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/filtlong/meta.yml b/modules/nf-core/filtlong/meta.yml new file mode 100644 index 00000000..4a0f072a --- /dev/null +++ b/modules/nf-core/filtlong/meta.yml @@ -0,0 +1,53 @@ +name: filtlong +description: Filtlong filters long reads based on quality measures or short read data. +keywords: + - nanopore + - quality control + - QC + - filtering + - long reads + - short reads +tools: + - filtlong: + description: Filtlong is a tool for filtering long reads. It can take a set of long reads and produce a smaller, better subset. It uses both read length (longer is better) and read identity (higher is better) when choosing which reads pass the filter. + homepage: https://anaconda.org/bioconda/filtlong + tool_dev_url: https://github.com/rrwick/Filtlong + licence: ["GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - shortreads: + type: file + description: fastq file + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" + - longreads: + type: file + description: fastq file + pattern: "*.{fq,fastq,fq.gz,fastq.gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Filtered (compressed) fastq file + pattern: "*.fastq.gz" + - log: + type: file + description: Standard error logging file containing summary statistics + pattern: "*.log" +authors: + - "@d4straub" + - "@sofstam" +maintainers: + - "@d4straub" + - "@sofstam" diff --git a/modules/nf-core/filtlong/tests/main.nf.test b/modules/nf-core/filtlong/tests/main.nf.test new file mode 100644 index 00000000..b7d73e79 --- /dev/null +++ b/modules/nf-core/filtlong/tests/main.nf.test @@ -0,0 +1,99 @@ +nextflow_process { + + name "Test Process FILTLONG" + script "../main.nf" + process "FILTLONG" + config "./nextflow.config" + tag "filtlong" + tag "modules" + tag "modules_nfcore" + + test("sarscov2 nanopore [fastq]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [], + [ file(params.test_data['sarscov2']['nanopore']['test_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.log.get(0).get(1)).readLines().contains("Scoring long reads")}, + { assert snapshot(process.out.reads).match("nanopore") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + + test("sarscov2 nanopore [fastq] + Illumina single-end [fastq]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) ], + [ file(params.test_data['sarscov2']['nanopore']['test_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.log.get(0).get(1)).readLines().contains("Scoring long reads")}, + { assert snapshot(process.out.reads).match("nanopore_illumina_se") }, + { assert 
snapshot(process.out.versions).match("versions") } + ) + } + + } + + + test("sarscov2 nanopore [fastq] + Illumina paired-end [fastq]") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + [ file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) ], + [ file(params.test_data['sarscov2']['nanopore']['test_fastq_gz'], checkIfExists: true) ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.log.get(0).get(1)).readLines().contains("Scoring long reads")}, + { assert snapshot(process.out.reads).match("nanopore_illumina_pe") }, + { assert snapshot(process.out.versions).match("versions") } + ) + } + + } + + +} diff --git a/modules/nf-core/filtlong/tests/main.nf.test.snap b/modules/nf-core/filtlong/tests/main.nf.test.snap new file mode 100644 index 00000000..49d4bb6c --- /dev/null +++ b/modules/nf-core/filtlong/tests/main.nf.test.snap @@ -0,0 +1,52 @@ +{ + "nanopore": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test_lr.fastq.gz:md5,7567d853ada6ac142332619d0b541d76" + ] + ] + ], + "timestamp": "2023-12-11T14:23:36.351509" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,af5988f30157282acdb0ac50ebb4c8cc" + ] + ], + "timestamp": "2023-12-11T14:23:36.372189" + }, + "nanopore_illumina_pe": { + "content": [ + [ + [ + { + "id": "test", + "single_end": false + }, + "test_lr.fastq.gz:md5,7567d853ada6ac142332619d0b541d76" + ] + ] + ], + "timestamp": "2023-12-11T14:24:05.202047" + }, + "nanopore_illumina_se": { + "content": [ + [ + [ + { + "id": "test", + "single_end": true + }, + "test_lr.fastq.gz:md5,7567d853ada6ac142332619d0b541d76" + ] + ] + ], + "timestamp": "2023-12-11T14:23:50.607338" + } +} \ No newline at end of file diff --git a/modules/nf-core/filtlong/tests/nextflow.config b/modules/nf-core/filtlong/tests/nextflow.config new file mode 100644 index 00000000..5e4c9fbb --- /dev/null +++ b/modules/nf-core/filtlong/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + ext.args = "--min_length 10" + ext.prefix = "test_lr" + +} diff --git a/modules/nf-core/filtlong/tests/tags.yml b/modules/nf-core/filtlong/tests/tags.yml new file mode 100644 index 00000000..bf2cff9d --- /dev/null +++ b/modules/nf-core/filtlong/tests/tags.yml @@ -0,0 +1,2 @@ +filtlong: + - modules/nf-core/filtlong/** diff --git a/modules/nf-core/ganon/classify/main.nf b/modules/nf-core/ganon/classify/main.nf new file mode 100644 index 00000000..d4130a01 --- /dev/null +++ b/modules/nf-core/ganon/classify/main.nf @@ -0,0 +1,64 @@ +process GANON_CLASSIFY { + tag "$meta.id" + label 'process_high' + + conda "bioconda::ganon=1.5.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ganon:1.5.1--py310h8abeb55_0': + 'biocontainers/ganon:1.5.1--py310h8abeb55_0' }" + + input: + tuple val(meta) , path(fastqs) + path(db) + + output: + tuple val(meta), path("*.tre"), emit: tre + tuple val(meta), path("*.rep"), emit: report + tuple val(meta), path("*.lca"), emit: lca , optional: true + tuple val(meta), path("*.all"), emit: all , optional: true + tuple val(meta), path("*.unc"), emit: unc , optional: true + tuple val(meta), path("*.log"), emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = meta.single_end ? "--single-reads ${fastqs}" : "--paired-reads ${fastqs}" + """ + dbprefix=\$(find -L . -name '*.ibf' | sed 's/\\.ibf\$//') + + ganon \\ + classify \\ + --db-prefix \${dbprefix%%.ibf} \\ + $args \\ + --threads $task.cpus \\ + --output-prefix ${prefix} \\ + $input \ + 2>&1 | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ganon: \$(echo \$(ganon --version 2>1) | sed 's/.*ganon //g') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = meta.single_end ? "--single-reads ${fastqs}" : "--paired-reads ${fastqs}" + """ + touch ${prefix}.tre + touch ${prefix}.report + touch ${prefix}.lca + touch ${prefix}.all + touch ${prefix}.unc + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ganon: \$(echo \$(ganon --version 2>1) | sed 's/.*ganon //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ganon/classify/meta.yml b/modules/nf-core/ganon/classify/meta.yml new file mode 100644 index 00000000..692d4e56 --- /dev/null +++ b/modules/nf-core/ganon/classify/meta.yml @@ -0,0 +1,71 @@ +name: "ganon_classify" +description: Classify FASTQ files against ganon database +keywords: + - ganon + - metagenomics + - profiling + - taxonomy + - k-mer + - classification + - classify +tools: + - "ganon": + description: "ganon classifies short DNA sequences against large sets of genomic reference sequences efficiently" + homepage: "https://github.com/pirovc/ganon" + documentation: "https://github.com/pirovc/ganon" + tool_dev_url: "https://github.com/pirovc/ganon" + doi: "10.1093/bioinformatics/btaa458" + licence: "['MIT License']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastqs: + type: file + description: Single or paired FASTQ files, optionally gzipped + pattern: "*.{fq,fq.gz,fastq,fastq.gz}" + - dbs: + type: file + description: Ganon database files from build or build-custom + pattern: "*.{ibf,tax}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - tre: + type: file + description: Full ganon report file + pattern: "*.tre" + - report: + type: file + description: Plain ganon report file with only targets with match + pattern: "*.rep" + - lca: + type: file + description: Information about the lowest common ancestor match of a given read + pattern: "*.lca" + - all: + type: file + description: Information of all matches to a given read + pattern: "*.all" + - unc: + type: file + description: List of all reads without a hit + pattern: "*.unc" + - log: + type: file + description: Text file containing console output from ganon classify + pattern: "*.log" + +authors: + - "@jfy133" diff --git a/modules/nf-core/ganon/report/main.nf b/modules/nf-core/ganon/report/main.nf new file mode 100644 index 00000000..bd0a2207 --- /dev/null +++ b/modules/nf-core/ganon/report/main.nf @@ -0,0 +1,53 @@ +process GANON_REPORT { + tag "$meta.id" + label 'process_single' + + conda "bioconda::ganon=1.5.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ganon:1.5.1--py310h8abeb55_0': + 'biocontainers/ganon:1.5.1--py310h8abeb55_0' }" + + input: + tuple val(meta), path(rep) + path db + + output: + tuple val(meta), path("*.tre"), emit: tre + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + dbprefix=\$(find -L . -name '*.ibf' | sed 's/\\.ibf\$//') + + ganon \\ + report \\ + --input ${rep} \\ + --output-prefix ${prefix} \\ + --db-prefix \${dbprefix%%.ibf} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ganon: \$(echo \$(ganon --version 2>1) | sed 's/.*ganon //g') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.tre + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ganon: \$(echo \$(ganon --version 2>1) | sed 's/.*ganon //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ganon/report/meta.yml b/modules/nf-core/ganon/report/meta.yml new file mode 100644 index 00000000..c7852c8a --- /dev/null +++ b/modules/nf-core/ganon/report/meta.yml @@ -0,0 +1,53 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "ganon_report" +description: Generate a ganon report file from the output of ganon classify +keywords: + - ganon + - metagenomics + - profiling + - taxonomy + - k-mer + - classification + - report +tools: + - "ganon": + description: "ganon classifies short DNA sequences against large sets of genomic reference sequences efficiently" + homepage: "https://github.com/pirovc/ganon" + documentation: "https://github.com/pirovc/ganon" + tool_dev_url: "https://github.com/pirovc/ganon" + doi: "10.1093/bioinformatics/btaa458" + licence: "['MIT License']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - rep: + type: file + description: Input 'repo' files from ganon classify + pattern: "*.rep" + - dbs: + type: file + description: Ganon database files from build or build-custom + pattern: "*.{ibf,tax}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - tre: + type: file + description: Output ganon report containing taxonomic profile information. Formatting of contents depends on --output-format. + pattern: "*.tre" + +authors: + - "@jfy133" diff --git a/modules/nf-core/ganon/table/main.nf b/modules/nf-core/ganon/table/main.nf new file mode 100644 index 00000000..5adaa65c --- /dev/null +++ b/modules/nf-core/ganon/table/main.nf @@ -0,0 +1,48 @@ +process GANON_TABLE { + tag "$meta.id" + label 'process_single' + + conda "bioconda::ganon=1.5.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ganon:1.5.1--py310h8abeb55_0': + 'biocontainers/ganon:1.5.1--py310h8abeb55_0' }" + + input: + tuple val(meta), path(tre) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + ganon \\ + table \\ + --input ${tre} \\ + --output-file ${prefix}.txt \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ganon: \$(echo \$(ganon --version 2>1) | sed 's/.*ganon //g') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ganon: \$(echo \$(ganon --version 2>1) | sed 's/.*ganon //g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ganon/table/meta.yml b/modules/nf-core/ganon/table/meta.yml new file mode 100644 index 00000000..4fa19b74 --- /dev/null +++ b/modules/nf-core/ganon/table/meta.yml @@ -0,0 +1,50 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/yaml-schema.json +name: "ganon_table" +description: Generate a multi-sample report file from the output of ganon report runs +keywords: + - ganon + - metagenomics + - profiling + - taxonomy + - k-mer + - classification + - report + - table +tools: + - "ganon": + description: "ganon classifies short DNA sequences against large sets of genomic reference sequences efficiently" + homepage: "https://github.com/pirovc/ganon" + documentation: "https://github.com/pirovc/ganon" + tool_dev_url: "https://github.com/pirovc/ganon" + doi: "10.1093/bioinformatics/btaa458" + licence: "['MIT License']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tre: + type: file + description: A list of 'tre' files from ganon report + pattern: "*.tre" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Output ganon table containing taxonomic profile information of multiple samples. Formatting of contents depends on --output-format. 
+ pattern: "*.txt" + +authors: + - "@jfy133" diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 00000000..73bf08cd --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,48 @@ +process GUNZIP { + tag "$archive" + label 'process_single' + + conda "conda-forge::sed=4.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$gunzip"), emit: gunzip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + gunzip = archive.toString() - '.gz' + """ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ + $args \\ + $archive \\ + > $gunzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + gunzip = archive.toString() - '.gz' + """ + touch $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 00000000..4cdcdf4c --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,35 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression + - decompression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" +output: + - gunzip: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/kaiju/kaiju/environment.yml b/modules/nf-core/kaiju/kaiju/environment.yml new file mode 100644 index 00000000..baac450b --- /dev/null +++ b/modules/nf-core/kaiju/kaiju/environment.yml @@ -0,0 +1,7 @@ +name: kaiju_kaiju +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kaiju=1.10.0 diff --git a/modules/nf-core/kaiju/kaiju/main.nf b/modules/nf-core/kaiju/kaiju/main.nf new file mode 100644 index 00000000..2f5f6e7d --- /dev/null +++ b/modules/nf-core/kaiju/kaiju/main.nf @@ -0,0 +1,55 @@ +process KAIJU_KAIJU { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/kaiju:1.10.0--h43eeafb_0': + 'biocontainers/kaiju:1.10.0--h43eeafb_0' }" + + input: + tuple val(meta), path(reads) + path(db) + + output: + tuple val(meta), path('*.tsv'), emit: results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = meta.single_end ? "-i ${reads}" : "-i ${reads[0]} -j ${reads[1]}" + """ + dbnodes=`find -L ${db} -name "*nodes.dmp"` + dbname=`find -L ${db} -name "*.fmi" -not -name "._*"` + kaiju \\ + $args \\ + -z $task.cpus \\ + -t \$dbnodes \\ + -f \$dbname \\ + -o ${prefix}.tsv \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = meta.single_end ? "-i ${reads}" : "-i ${reads[0]} -j ${reads[1]}" + """ + touch ${prefix}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ + +} diff --git a/modules/nf-core/kaiju/kaiju/meta.yml b/modules/nf-core/kaiju/kaiju/meta.yml new file mode 100644 index 00000000..33c85690 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju/meta.yml @@ -0,0 +1,54 @@ +name: kaiju_kaiju +description: Taxonomic classification of metagenomic sequence data using a protein reference database +keywords: + - classify + - metagenomics + - fastq + - taxonomic profiling +tools: + - kaiju: + description: Fast and sensitive taxonomic classification for metagenomics + homepage: https://kaiju.binf.ku.dk/ + documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md + tool_dev_url: https://github.com/bioinformatics-centre/kaiju + doi: "10.1038/ncomms11257" + licence: ["GNU GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input fastq/fasta files of size 1 and 2 for single-end and paired-end data, + respectively. + pattern: "*.{fastq,fq,fasta,fa,fsa,fas,fna,fastq.gz,fq.gz,fasta.gz,fa.gz,fsa.gz,fas.gz,fna.gz}" + - db: + type: directory + description: | + List containing the database and nodes files for Kaiju + e.g. [ 'database.fmi', 'nodes.dmp' ] +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - results: + type: file + description: Results with taxonomic classification of each read + pattern: "*.tsv" +authors: + - "@talnor" + - "@sofstam" + - "@jfy133" +maintainers: + - "@talnor" + - "@sofstam" + - "@jfy133" diff --git a/modules/nf-core/kaiju/kaiju/tests/main.nf.test b/modules/nf-core/kaiju/kaiju/tests/main.nf.test new file mode 100644 index 00000000..ede2f952 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju/tests/main.nf.test @@ -0,0 +1,117 @@ +nextflow_process { + + name "Test Process KAIJU_KAIJU" + script "../main.nf" + process "KAIJU_KAIJU" + + tag "modules" + tag "modules_nfcore" + tag "kaiju" + tag "kaiju/kaiju" + tag "untar" + + test("sarscov2 - fastq - single-end") { + + setup { + run ("UNTAR"){ + script "../../../untar/main.nf" + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kaiju.tar.gz', checkIfExists: true) ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.results[0][1]).getText().contains("C\tERR5069949.2257580\t2697049") } + ) + } + + } + + test("sarscov2 - fastq - paired-end") { + + setup { + run ("UNTAR"){ + script "../../../untar/main.nf" + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kaiju.tar.gz', checkIfExists: true) ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert path(process.out.results[0][1]).getText().contains("C\tERR5069949.2257580\t2697049") } + ) + } + } + + test("sarscov2 - fastq - stub") { + + options '-stub' + + setup { + run ("UNTAR"){ + script "../../../untar/main.nf" + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kaiju.tar.gz', checkIfExists: true) ] + """ + } + } + } + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.results[0][1]).name).match() } + ) + } + } + +} diff --git a/modules/nf-core/kaiju/kaiju/tests/main.nf.test.snap b/modules/nf-core/kaiju/kaiju/tests/main.nf.test.snap new file mode 100644 index 00000000..08735f47 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju/tests/main.nf.test.snap @@ -0,0 +1,8 @@ +{ + "sarscov2 - fastq - stub": { + "content": [ + "test.tsv" + ], + "timestamp": "2024-01-20T14:44:57.116024519" + } +} \ No newline at end of file diff --git a/modules/nf-core/kaiju/kaiju/tests/tags.yml 
b/modules/nf-core/kaiju/kaiju/tests/tags.yml new file mode 100644 index 00000000..c73d4df5 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju/tests/tags.yml @@ -0,0 +1,2 @@ +kaiju/kaiju: + - "modules/nf-core/kaiju/kaiju/**" diff --git a/modules/nf-core/kaiju/kaiju2krona/environment.yml b/modules/nf-core/kaiju/kaiju2krona/environment.yml new file mode 100644 index 00000000..2905be97 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2krona/environment.yml @@ -0,0 +1,7 @@ +name: kaiju_kaiju2krona +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kaiju=1.10.0 diff --git a/modules/nf-core/kaiju/kaiju2krona/main.nf b/modules/nf-core/kaiju/kaiju2krona/main.nf new file mode 100644 index 00000000..85d2dfd2 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2krona/main.nf @@ -0,0 +1,52 @@ +process KAIJU_KAIJU2KRONA { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kaiju:1.10.0--h43eeafb_0': + 'biocontainers/kaiju:1.10.0--h43eeafb_0' }" + + input: + tuple val(meta), path(tsv) + path(db) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + dbnodes=`find -L ${db} -name "*nodes.dmp"` + dbnames=`find -L ${db} -name "*names.dmp"` + kaiju2krona \\ + $args \\ + -t \$dbnodes \\ + -n \$dbnames \\ + -i ${tsv} \\ + -o ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ +} + diff --git a/modules/nf-core/kaiju/kaiju2krona/meta.yml b/modules/nf-core/kaiju/kaiju2krona/meta.yml new file mode 100644 index 00000000..355416f8 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2krona/meta.yml @@ -0,0 +1,43 @@ +name: kaiju_kaiju2krona +description: Convert Kaiju's tab-separated output file into a tab-separated text file which can be imported into Krona. +keywords: + - taxonomy + - visualisation + - krona chart + - metagenomics +tools: + - "kaiju": + description: Fast and sensitive taxonomic classification for metagenomics + homepage: https://kaiju.binf.ku.dk/ + documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md + tool_dev_url: https://github.com/bioinformatics-centre/kaiju + doi: "10.1038/ncomms11257" + licence: ["GNU GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - tsv: + type: file + description: Kaiju tab-separated output file + pattern: "*.{tsv,txt}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Krona text-based input file converted from Kaiju report + pattern: "*.{txt,krona}" +authors: + - "@MillironX" +maintainers: + - "@MillironX" diff --git a/modules/nf-core/kaiju/kaiju2krona/tests/main.nf.test b/modules/nf-core/kaiju/kaiju2krona/tests/main.nf.test new file mode 100644 index 00000000..cf522cd0 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2krona/tests/main.nf.test @@ -0,0 +1,105 @@ +nextflow_process { + + name "Test Process KAIJU_KAIJU2KRONA" + script "../main.nf" + process "KAIJU_KAIJU2KRONA" + + tag "modules" + tag "modules_nfcore" + tag "kaiju" + tag "kaiju/kaiju2krona" + tag "kaiju/kaiju" + tag "untar" + + test("sarscov2 - fastq - single-end") { + + + setup { + run ("UNTAR"){ + script "../../../untar/main.nf" + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kaiju.tar.gz', checkIfExists: true) ] + """ + } + } + + run("KAIJU_KAIJU") { + script "../../kaiju/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + } + + when { + process { + """ + input[0] = KAIJU_KAIJU.out.results + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fastq - stub") { + + options "-stub" + + setup { + run ("UNTAR"){ + script "../../../untar/main.nf" + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kaiju.tar.gz', checkIfExists: true) ] + """ + } + } + + run("KAIJU_KAIJU") { + script "../../kaiju/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + } + + when { + process { + """ + input[0] = KAIJU_KAIJU.out.results + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.txt[0][1]).name).match() } + ) + } + + } + +} diff --git a/modules/nf-core/kaiju/kaiju2krona/tests/main.nf.test.snap b/modules/nf-core/kaiju/kaiju2krona/tests/main.nf.test.snap new file mode 100644 index 00000000..5532a694 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2krona/tests/main.nf.test.snap @@ -0,0 +1,39 @@ +{ + "sarscov2 - fastq - stub": { + "content": [ + "test.txt" + ], + "timestamp": "2024-01-20T15:06:32.789121011" + }, + "sarscov2 - fastq - single-end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.txt:md5,68b2309d37767e444193fa6cea7c0494" + ] + ], + "1": [ + "versions.yml:md5,f75aa349971d581981d3a0399450b395" + ], + "txt": [ + [ + { + "id": "test", + "single_end": true + }, + "test.txt:md5,68b2309d37767e444193fa6cea7c0494" + ] + ], + "versions": [ + "versions.yml:md5,f75aa349971d581981d3a0399450b395" + ] + } + ], + "timestamp": "2024-01-20T15:06:08.840865115" + } +} \ No newline at end of file diff --git a/modules/nf-core/kaiju/kaiju2krona/tests/tags.yml b/modules/nf-core/kaiju/kaiju2krona/tests/tags.yml new file mode 100644 index 00000000..661fe924 --- 
/dev/null +++ b/modules/nf-core/kaiju/kaiju2krona/tests/tags.yml @@ -0,0 +1,2 @@ +kaiju/kaiju2krona: + - "modules/nf-core/kaiju/kaiju2krona/**" diff --git a/modules/nf-core/kaiju/kaiju2table/environment.yml b/modules/nf-core/kaiju/kaiju2table/environment.yml new file mode 100644 index 00000000..18685f41 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2table/environment.yml @@ -0,0 +1,7 @@ +name: kaiju_kaiju2table +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kaiju=1.10.0 diff --git a/modules/nf-core/kaiju/kaiju2table/main.nf b/modules/nf-core/kaiju/kaiju2table/main.nf new file mode 100644 index 00000000..44049744 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2table/main.nf @@ -0,0 +1,52 @@ +process KAIJU_KAIJU2TABLE { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kaiju:1.10.0--h43eeafb_0': + 'biocontainers/kaiju:1.10.0--h43eeafb_0' }" + + input: + tuple val(meta), path(input) + path db + val taxon_rank + + output: + tuple val(meta), path('*.txt'), emit: summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + dbnodes=`find -L ${db} -name "*nodes.dmp"` + dbnames=`find -L ${db} -name "*names.dmp"` + kaiju2table $args \\ + -t \$dbnodes \\ + -n \$dbnames \\ + -r ${taxon_rank} \\ + -o ${prefix}.txt \\ + ${input} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kaiju: \$(echo \$( kaiju -h 2>&1 | sed -n 1p | sed 's/^.*Kaiju //' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kaiju/kaiju2table/meta.yml b/modules/nf-core/kaiju/kaiju2table/meta.yml new file mode 100644 index 00000000..0f62374e --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2table/meta.yml @@ -0,0 +1,52 @@ +name: "kaiju_kaiju2table" +description: write your description here +keywords: + - classify + - metagenomics + - taxonomic profiling +tools: + - kaiju: + description: Fast and sensitive taxonomic classification for metagenomics + homepage: https://kaiju.binf.ku.dk/ + documentation: https://github.com/bioinformatics-centre/kaiju/blob/master/README.md + tool_dev_url: https://github.com/bioinformatics-centre/kaiju + doi: "10.1038/ncomms11257" + licence: ["GNU GPL v3"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - results: + type: file + description: File containing the kaiju classification results + pattern: "*.{txt}" + - taxon_rank: + type: string + description: | + Taxonomic rank to display in report + pattern: "phylum|class|order|family|genus|species" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - results: + type: file + description: | + Summary table for a given taxonomic rank + pattern: "*.{tsv}" +authors: + - "@sofstam" + - "@talnor" + - "@jfy133" +maintainers: + - "@sofstam" + - "@talnor" + - "@jfy133" diff --git a/modules/nf-core/kaiju/kaiju2table/tests/main.nf.test b/modules/nf-core/kaiju/kaiju2table/tests/main.nf.test new file mode 100644 index 00000000..d93fb31a --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2table/tests/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process KAIJU_KAIJU2TABLE" + script "../main.nf" + process "KAIJU_KAIJU2TABLE" + + tag "modules" + tag "modules_nfcore" + tag "kaiju" + tag "kaiju/kaiju2table" + tag "kaiju/kaiju" + tag "untar" + + test("sarscov2 - fastq - single-end") { + + + setup { + run ("UNTAR"){ + script "../../../untar/main.nf" + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kaiju.tar.gz', checkIfExists: true) ] + """ + } + } + + run("KAIJU_KAIJU") { + script "../../kaiju/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + } + + when { + process { + """ + input[0] = KAIJU_KAIJU.out.results + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = 'species' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("sarscov2 - fastq - stub") { + + options "-stub" + + setup { + run ("UNTAR"){ + script "../../../untar/main.nf" + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kaiju.tar.gz', checkIfExists: true) ] + """ + } + } + + run("KAIJU_KAIJU") { + script "../../kaiju/main.nf" + process { + """ + input[0] = [ + [ id:'test', single_end:true ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + input[1] = UNTAR.out.untar.map{ it[1] } + """ + } + } + } + + when { + process { + """ + input[0] = KAIJU_KAIJU.out.results + input[1] = UNTAR.out.untar.map{ it[1] } + input[2] = 'species' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.summary[0][1]).name).match() } + ) + } + + } +} diff --git a/modules/nf-core/kaiju/kaiju2table/tests/main.nf.test.snap b/modules/nf-core/kaiju/kaiju2table/tests/main.nf.test.snap new file mode 100644 index 00000000..e97eb8b8 --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2table/tests/main.nf.test.snap @@ -0,0 +1,39 @@ +{ + "sarscov2 - fastq - stub": { + "content": [ + "test.txt" + ], + "timestamp": "2024-01-20T16:10:49.521322767" + }, + "sarscov2 - fastq - single-end": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test.txt:md5,0d9f8fd36fcf2888296ae12632c5f0a8" + ] + ], + "1": [ + "versions.yml:md5,2b2b98cb635a611e5eb964e5a77f6248" + ], + "summary": [ + [ + { + "id": "test", + "single_end": true + }, + "test.txt:md5,0d9f8fd36fcf2888296ae12632c5f0a8" + ] + ], + "versions": [ + "versions.yml:md5,2b2b98cb635a611e5eb964e5a77f6248" + ] + } + ], + "timestamp": "2024-01-20T16:08:47.644443775" + } +} diff --git a/modules/nf-core/kaiju/kaiju2table/tests/tags.yml 
b/modules/nf-core/kaiju/kaiju2table/tests/tags.yml new file mode 100644 index 00000000..0fa6b81e --- /dev/null +++ b/modules/nf-core/kaiju/kaiju2table/tests/tags.yml @@ -0,0 +1,2 @@ +kaiju/kaiju2table: + - "modules/nf-core/kaiju/kaiju2table/**" diff --git a/modules/nf-core/kmcp/profile/main.nf b/modules/nf-core/kmcp/profile/main.nf new file mode 100644 index 00000000..a4672122 --- /dev/null +++ b/modules/nf-core/kmcp/profile/main.nf @@ -0,0 +1,54 @@ +process KMCP_PROFILE { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::kmcp=0.9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kmcp:0.9.1--h9ee0642_0': + 'biocontainers/kmcp:0.9.1--h9ee0642_0' }" + + input: + tuple val(meta), path(search_results) + path (db) + val mode + + output: + tuple val(meta), path("*.profile"), emit: profile + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + taxid=`find -L ${db} -name "*map"` + taxdump=`find -L ${db}/*/ -type d -not -name "R001"` + kmcp \\ + profile \\ + $args \\ + -X \$taxdump \\ + -T \$taxid \\ + -m $mode \\ + -j $task.cpus \\ + -o ${prefix}.profile \\ + $search_results + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kmcp: \$(echo \$(kmcp version 2>&1) | sed -n 1p | sed 's/^.*kmcp v//') + END_VERSIONS + """ + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.profile + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kmcp: \$(echo \$(kmcp version 2>&1) | sed -n 1p | sed 's/^.*kmcp v//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/kmcp/profile/meta.yml b/modules/nf-core/kmcp/profile/meta.yml new file mode 100644 index 00000000..14f292c7 --- /dev/null +++ b/modules/nf-core/kmcp/profile/meta.yml @@ -0,0 +1,58 @@ +name: "kmcp_profile" +description: Generate taxonomic profile from search results +keywords: + - metagenomics + - classify + - taxonomic profiling + - fastq + - sequences + - kmers + - index +tools: + - "kmcp": + description: "Accurate metagenomic profiling of both prokaryotic and viral populations by pseudo-mapping" + homepage: "https://github.com/shenwei356/kmcp" + documentation: "https://bioinf.shenwei.me/kmcp/usage/#profile" + tool_dev_url: "https://github.com/shenwei356/kmcp" + doi: "10.1093/bioinformatics/btac845" + licence: "['MIT']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - db: + type: directory + description: Database directory containing taxdump files and taxid file + - search_results: + type: file + description: Gzipped file output from kmcp search module + pattern: "*.gz" + - mode: + type: integer + description: Profiling mode. + 0-pathogen detection + 1-higher recall + 2-high recall + 3-default + 4-high precision + 5-higher precision +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - profile: + type: file + description: Tab-delimited format file with 17 columns. 
+ pattern: "*.profile" + +authors: + - "@sofstam" diff --git a/modules/nf-core/kmcp/search/main.nf b/modules/nf-core/kmcp/search/main.nf new file mode 100644 index 00000000..cb2d6843 --- /dev/null +++ b/modules/nf-core/kmcp/search/main.nf @@ -0,0 +1,51 @@ +process KMCP_SEARCH { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::kmcp=0.9.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kmcp:0.9.1--h9ee0642_0': + 'biocontainers/kmcp:0.9.1--h9ee0642_0' }" + + input: + path(db) + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*.gz") , emit: result + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input = meta.single_end ? "${reads}": "-1 ${reads[0]} -2 ${reads[1]}" + """ + kmcp \\ + search \\ + $args \\ + --threads $task.cpus \\ + --db-dir $db \\ + $reads \\ + --out-file ${prefix}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kmcp: \$(echo \$(kmcp version 2>&1) | sed -n 1p | sed 's/^.*kmcp v//') + END_VERSIONS + """ + stub: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kmcp: \$(echo \$(kmcp version 2>&1) | sed -n 1p | sed 's/^.*kmcp v//') + END_VERSIONS + """ + +} diff --git a/modules/nf-core/kmcp/search/meta.yml b/modules/nf-core/kmcp/search/meta.yml new file mode 100644 index 00000000..5526a179 --- /dev/null +++ b/modules/nf-core/kmcp/search/meta.yml @@ -0,0 +1,50 @@ +name: "kmcp_search" +description: Search sequences against database +keywords: + - metagenomics + - classify + - taxonomic profiling + - fastq + - sequences + - kmers +tools: + - "kmcp": + description: "Accurate metagenomic profiling of both prokaryotic and viral populations by pseudo-mapping" + homepage: "https://github.com/shenwei356/kmcp" + documentation: "https://github.com/shenwei356/kmcp#documents" + tool_dev_url: "https://github.com/shenwei356/kmcp" + doi: "10.1093/bioinformatics/btac845" + licence: "['MIT']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - db: + type: directory + description: Database directory created by "kmcp index" + pattern: "*" + - reads: + type: file + description: gzipped fasta or fastq files + pattern: "*.{fq.gz,fastq.gz,fa.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - result: + type: file + description: Output file in tab-delimited format with 15 columns + pattern: "*.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@sofstam" diff --git a/modules/nf-core/kraken2/kraken2/main.nf b/modules/nf-core/kraken2/kraken2/main.nf new file mode 100644 index 00000000..da8d8c6d --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/main.nf @@ -0,0 +1,58 @@ +process KRAKEN2_KRAKEN2 { + tag "$meta.id" + label 'process_high' + + conda "bioconda::kraken2=2.1.2 conda-forge::pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' : + 'biocontainers/mulled-v2-5799ab18b5fc681e75923b2450abaa969907ec98:87fc08d11968d081f3e8a37131c1f1f6715b6542-0' }" + + input: + tuple val(meta), path(reads) + path db + val save_output_fastqs + val save_reads_assignment + + output: + tuple val(meta), path('*.classified{.,_}*') , optional:true, emit: classified_reads_fastq + tuple val(meta), path('*.unclassified{.,_}*') , optional:true, emit: unclassified_reads_fastq + tuple val(meta), path('*classifiedreads.txt') , optional:true, emit: classified_reads_assignment + tuple val(meta), path('*report.txt') , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def paired = meta.single_end ? "" : "--paired" + def classified = meta.single_end ? "${prefix}.classified.fastq" : "${prefix}.classified#.fastq" + def unclassified = meta.single_end ? "${prefix}.unclassified.fastq" : "${prefix}.unclassified#.fastq" + def classified_option = save_output_fastqs ? "--classified-out ${classified}" : "" + def unclassified_option = save_output_fastqs ? "--unclassified-out ${unclassified}" : "" + def readclassification_option = save_reads_assignment ? "--output ${prefix}.kraken2.classifiedreads.txt" : "--output /dev/null" + def compress_reads_command = save_output_fastqs ? "pigz -p $task.cpus *.fastq" : "" + + """ + kraken2 \\ + --db $db \\ + --threads $task.cpus \\ + --report ${prefix}.kraken2.report.txt \\ + --gzip-compressed \\ + $unclassified_option \\ + $classified_option \\ + $readclassification_option \\ + $paired \\ + $args \\ + $reads + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kraken2: \$(echo \$(kraken2 --version 2>&1) | sed 's/^.*Kraken version //; s/ .*\$//') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/kraken2/kraken2/meta.yml b/modules/nf-core/kraken2/kraken2/meta.yml new file mode 100644 index 00000000..4721f45b --- /dev/null +++ b/modules/nf-core/kraken2/kraken2/meta.yml @@ -0,0 +1,75 @@ +name: kraken2_kraken2 +description: Classifies metagenomic sequence data +keywords: + - classify + - metagenomics + - fastq + - db +tools: + - kraken2: + description: | + Kraken2 is a taxonomic sequence classifier that assigns taxonomic labels to sequence reads + homepage: https://ccb.jhu.edu/software/kraken2/ + documentation: https://github.com/DerrickWood/kraken2/wiki/Manual + doi: 10.1186/s13059-019-1891-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - db: + type: directory + description: Kraken2 database + - save_output_fastqs: + type: string + description: | + If true, optional commands are added to save classified and unclassified reads + as fastq files + - save_reads_assignment: + type: string + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - classified_reads_fastq: + type: file + description: | + Reads classified as belonging to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - unclassified_reads_fastq: + type: file + description: | + Reads not classified to any of the taxa + on the Kraken2 database. + pattern: "*{fastq.gz}" + - classified_reads_assignment: + type: file + description: | + Kraken2 output file indicating the taxonomic assignment of + each input read + - report: + type: file + description: | + Kraken2 report containing stats about classified + and not classifed reads. + pattern: "*.{report.txt}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/krakentools/combinekreports/main.nf b/modules/nf-core/krakentools/combinekreports/main.nf new file mode 100644 index 00000000..43cc3793 --- /dev/null +++ b/modules/nf-core/krakentools/combinekreports/main.nf @@ -0,0 +1,34 @@ +process KRAKENTOOLS_COMBINEKREPORTS { + label 'process_single' + + conda "bioconda::krakentools=1.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0': + 'biocontainers/krakentools:1.2--pyh5e36f6f_0' }" + + input: + tuple val(meta), path(kreports) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + combine_kreports.py \\ + -r ${kreports} \\ + -o ${prefix}.txt \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + combine_kreports.py: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/krakentools/combinekreports/meta.yml b/modules/nf-core/krakentools/combinekreports/meta.yml new file mode 100644 index 00000000..213fc8c6 --- /dev/null +++ b/modules/nf-core/krakentools/combinekreports/meta.yml @@ -0,0 +1,43 @@ +name: krakentools_combinekreports +description: Takes a Kraken report file and prints out a krona-compatible TEXT file +keywords: + - kraken + - krakentools + - metagenomics + - table + - combining + - merging +tools: + - krakentools: + description: KrakenTools is a suite of scripts to be used for post-analysis of Kraken/KrakenUniq/Kraken2/Bracken results. Please cite the relevant paper if using KrakenTools with any of the listed programs. + homepage: https://github.com/jenniferlu717/KrakenTools + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - kreports: + type: file + description: List of kraken-style report files + pattern: "*.{txt,kreport}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Combined kreport file of all input files + pattern: "*.txt" + +authors: + - "@jfy133" diff --git a/modules/nf-core/krakentools/kreport2krona/main.nf b/modules/nf-core/krakentools/kreport2krona/main.nf new file mode 100644 index 00000000..a3923afe --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/main.nf @@ -0,0 +1,36 @@ +process KRAKENTOOLS_KREPORT2KRONA { + tag "$meta.id" + label 'process_single' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda "bioconda::krakentools=1.2" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krakentools:1.2--pyh5e36f6f_0': + 'biocontainers/krakentools:1.2--pyh5e36f6f_0' }" + + input: + tuple val(meta), path(kreport) + + output: + tuple val(meta), path("*.txt"), emit: txt + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '1.2' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + kreport2krona.py \\ + -r ${kreport} \\ + -o ${prefix}.txt \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kreport2krona.py: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/krakentools/kreport2krona/meta.yml b/modules/nf-core/krakentools/kreport2krona/meta.yml new file mode 100644 index 00000000..2f8a163c --- /dev/null +++ b/modules/nf-core/krakentools/kreport2krona/meta.yml @@ -0,0 +1,41 @@ +name: krakentools_kreport2krona +description: Takes a Kraken report file and prints out a krona-compatible TEXT file +keywords: + - kraken + - krona + - metagenomics + - visualization +tools: + - krakentools: + description: KrakenTools is a suite of scripts to be used for post-analysis of Kraken/KrakenUniq/Kraken2/Bracken results. Please cite the relevant paper if using KrakenTools with any of the listed programs. + homepage: https://github.com/jenniferlu717/KrakenTools + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - kreport: + type: file + description: Kraken report + pattern: "*.{txt,kreport}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - krona: + type: file + description: Krona text-based input file converted from Kraken report + pattern: "*.{txt,krona}" + +authors: + - "@MillironX" diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/environment.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/environment.yml new file mode 100644 index 00000000..11bbb879 --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/environment.yml @@ -0,0 +1,7 @@ +name: krakenuniq_preloadedkrakenuniq +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::krakenuniq=1.0.4 diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf new file mode 100644 index 00000000..59055bdb --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/main.nf @@ -0,0 +1,235 @@ +process KRAKENUNIQ_PRELOADEDKRAKENUNIQ { + tag "$meta.id" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krakenuniq:1.0.4--pl5321h19e8d03_0': + 'biocontainers/krakenuniq:1.0.4--pl5321h19e8d03_0' }" + + input: + tuple val(meta), path(fastqs) + path db + val ram_chunk_size + val save_output_reads + val report_file + val save_output + + output: + tuple val(meta), path('*.classified.fasta.gz') , optional:true, emit: classified_reads_fasta + tuple val(meta), path('*.unclassified.fasta.gz') , optional:true, emit: unclassified_reads_fasta + tuple val(meta), path('*.krakenuniq.classified.txt'), optional:true, emit: classified_assignment + tuple val(meta), path('*.krakenuniq.report.txt') , emit: report + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args ?: '' + + def classified = meta.single_end ? '"\${PREFIX}.classified.fasta"' : '"\${PREFIX}.merged.classified.fasta"' + def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fasta"' : '"\${PREFIX}.merged.unclassified.fasta"' + def classified_option = save_output_reads ? "--classified-out ${classified}" : '' + def unclassified_option = save_output_reads ? "--unclassified-out ${unclassified}" : '' + def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' + def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' + def compress_reads_command = save_output_reads ? 'gzip --no-name *.fasta' : '' + if (meta.single_end) { + """ + krakenuniq \\ + $args \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus + + strip_suffix() { + local result=\$1 + # Strip any file extensions. 
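+                # (The expansion below drops everything from the first '.' onwards,
+                # e.g. sample.fastq.gz -> sample, so each input file gets its own output prefix.)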
+ echo "\${result%%.*}" + } + + printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + PREFIX="\$(strip_suffix "\${FASTQ}")" + + krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $args2 \\ + "\${FASTQ}" + done + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + krakenuniq \\ + $args \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus + + strip_suffix() { + local result + read result + # Strip any trailing dot or underscore. + result="\${result%_}" + echo "\${result%.}" + } + + printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + read -r -a FASTQ <<< "\${FASTQ}" + PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" + + krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + --paired \\ + $args2 \\ + "\${FASTQ[@]}" + done + + $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } + + stub: + def args = task.ext.args ?: '' + def args2 = task.ext.args ?: '' + + def classified = meta.single_end ? '"\${PREFIX}.classified.fasta"' : '"\${PREFIX}.merged.classified.fasta"' + def unclassified = meta.single_end ? '"\${PREFIX}.unclassified.fasta"' : '"\${PREFIX}.merged.unclassified.fasta"' + def classified_option = save_output_reads ? "--classified-out ${classified}" : '' + def unclassified_option = save_output_reads ? "--unclassified-out ${unclassified}" : '' + def output_option = save_output ? '--output "\${PREFIX}.krakenuniq.classified.txt"' : '' + def report = report_file ? '--report-file "\${PREFIX}.krakenuniq.report.txt"' : '' + def compress_reads_command = save_output_reads ? 'gzip --no-name *.fasta' : '' + if (meta.single_end) { + """ + echo krakenuniq \\ + $args \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus + + strip_suffix() { + local result=\$1 + # Strip any file extensions. + echo "\${result%%.*}" + } + + create_file() { + echo '<3 nf-core' > "\$1" + } + + create_gzip_file() { + echo '<3 nf-core' | gzip -n > "\$1" + } + + printf "%s\\n" ${fastqs} | while read FASTQ; do \\ + echo "\${FASTQ}" + PREFIX="\$(strip_suffix "\${FASTQ}")" + echo "\${PREFIX}" + + echo krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + $args2 \\ + "\${FASTQ}" + + create_file "\${PREFIX}.krakenuniq.classified.txt" + create_file "\${PREFIX}.krakenuniq.report.txt" + create_gzip_file "\${PREFIX}.classified.fasta.gz" + create_gzip_file "\${PREFIX}.unclassified.fasta.gz" + done + + echo $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } else { + """ + echo krakenuniq \\ + $args \\ + --db $db \\ + --preload \\ + --preload-size $ram_chunk_size \\ + --threads $task.cpus + + strip_suffix() { + local result + read result + # Strip any trailing dot or underscore. 
+ result="\${result%_}" + echo "\${result%.}" + } + + create_file() { + echo '<3 nf-core' > "\$1" + } + + create_gzip_file() { + echo '<3 nf-core' | gzip -n > "\$1" + } + + printf "%s %s\\n" ${fastqs} | while read FASTQ; do \\ + read -r -a FASTQ <<< "\${FASTQ}" + echo "\${FASTQ[@]}" + PREFIX="\$(printf "%s\\n" "\${FASTQ[@]}" | sed -e 'N;s/^\\(.*\\).*\\n\\1.*\$/\\1\\n\\1/;D' | strip_suffix)" + echo "\${PREFIX}" + + echo krakenuniq \\ + --db $db \\ + --threads $task.cpus \\ + $report \\ + $output_option \\ + $unclassified_option \\ + $classified_option \\ + --paired \\ + $args2 \\ + "\${FASTQ[@]}" + + create_file "\${PREFIX}.krakenuniq.classified.txt" + create_file "\${PREFIX}.krakenuniq.report.txt" + create_gzip_file "\${PREFIX}.merged.classified.fasta.gz" + create_gzip_file "\${PREFIX}.merged.unclassified.fasta.gz" + done + + echo $compress_reads_command + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krakenuniq: \$(echo \$(krakenuniq --version 2>&1) | sed 's/^.*KrakenUniq version //; s/ .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml new file mode 100644 index 00000000..4a6dffee --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/meta.yml @@ -0,0 +1,81 @@ +name: "krakenuniq_preloadedkrakenuniq" +description: Classifies metagenomic sequence data using unique k-mer counts +keywords: + - classify + - metagenomics + - kmers + - fastq + - db +tools: + - "krakenuniq": + description: "Metagenomics classifier with unique k-mer counting for more specific results" + homepage: https://github.com/fbreitwieser/krakenuniq + documentation: https://github.com/fbreitwieser/krakenuniq + doi: 10.1186/s13059-018-1568-0 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastqs: + type: file + description: List of input FastQ files + - db: + type: directory + description: KrakenUniq database + - ram_chunk_size: + type: string + description: Amount of maximum amount of RAM each chunk of database that should be loaded at any one time + pattern: "*GB" + - save_output_reads: + type: boolean + description: | + Optionally commands are added to save classified and unclassified reads as FASTA files. + When the input is paired-end, the single output FASTA contains merged reads. + - save_reads_assignment: + type: boolean + description: | + If true, an optional command is added to save a file reporting the taxonomic + classification of each input read +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - classified_reads_fasta: + type: file + description: | + Reads classified as belonging to any of the taxa + in the KrakenUniq reference database. + pattern: "*.classified.fasta.gz" + - unclassified_reads_fasta: + type: file + description: | + Reads not classified to any of the taxa + in the KrakenUniq reference database. + pattern: "*.unclassified.fasta.gz" + - classified_assignment: + type: file + description: | + KrakenUniq output file indicating the taxonomic assignment of + each input read ## DOUBLE CHECK!! + pattern: "*.krakenuniq.classified.txt" + - report: + type: file + description: | + KrakenUniq report containing statistics about classified + and unclassified reads. 
+ pattern: "*.krakenuniq.report.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@mjamy" + - "@Midnighter" +maintainers: + - "@mjamy" + - "@Midnighter" diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test new file mode 100644 index 00000000..a7c44707 --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test @@ -0,0 +1,77 @@ +nextflow_process { + + name "Test Process KRAKENUNIQ_PRELOADEDKRAKENUNIQ" + script "../main.nf" + process "KRAKENUNIQ_PRELOADEDKRAKENUNIQ" + tag "modules" + tag "modules_nfcore" + tag "krakenuniq" + tag "krakenuniq/preloadedkrakenuniq" + + test("sarscov2 - Illumina FASTQ single - stub-run") { + options "-stub-run" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id:'test', single_end:true], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = [] + input[2] = '8GB' + input[3] = true + input[4] = true + input[5] = true + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } + + test("sarscov2 - Illumina FASTQ paired-end - stub-run") { + options "-stub-run" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = [ + [id:'test', single_end:false], + [ + file(params.test_data['sarscov2']['illumina']['test_1_fastq_gz'], checkIfExists: true), + file(params.test_data['sarscov2']['illumina']['test_2_fastq_gz'], checkIfExists: true) + ] + ] + input[1] = [] + input[2] = '8GB' + input[3] = true + input[4] = true + input[5] = true + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() }, + ) + } + + } +} diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test.snap b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test.snap new file mode 100644 index 00000000..970865bd --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/main.nf.test.snap @@ -0,0 +1,172 @@ +{ + "sarscov2 - Illumina FASTQ paired-end - stub-run": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.merged.classified.fasta.gz:md5,34ed306e94fa7eed00b1adccd2e0de20" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.merged.unclassified.fasta.gz:md5,34ed306e94fa7eed00b1adccd2e0de20" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": false + }, + "test.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": false + }, + "test.krakenuniq.report.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] + ], + "4": [ + "versions.yml:md5,6abf6c733f53fa3b6aaaa6f06864ef0c" + ], + "classified_assignment": [ + [ + { + "id": "test", + "single_end": false + }, + "test.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] + ], + "classified_reads_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.merged.classified.fasta.gz:md5,34ed306e94fa7eed00b1adccd2e0de20" + ] + ], + "report": [ + [ + { + "id": "test", + "single_end": false + }, + "test.krakenuniq.report.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] + ], + "unclassified_reads_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.merged.unclassified.fasta.gz:md5,34ed306e94fa7eed00b1adccd2e0de20" + ] + ], + "versions": [ + 
"versions.yml:md5,6abf6c733f53fa3b6aaaa6f06864ef0c" + ] + } + ], + "timestamp": "2023-11-21T15:38:47.810576872" + }, + "sarscov2 - Illumina FASTQ single - stub-run": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.classified.fasta.gz:md5,34ed306e94fa7eed00b1adccd2e0de20" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.unclassified.fasta.gz:md5,34ed306e94fa7eed00b1adccd2e0de20" + ] + ], + "2": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] + ], + "3": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.krakenuniq.report.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] + ], + "4": [ + "versions.yml:md5,6abf6c733f53fa3b6aaaa6f06864ef0c" + ], + "classified_assignment": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.krakenuniq.classified.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] + ], + "classified_reads_fasta": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.classified.fasta.gz:md5,34ed306e94fa7eed00b1adccd2e0de20" + ] + ], + "report": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.krakenuniq.report.txt:md5,a5704c35e6b573a45e3a344768fe6975" + ] + ], + "unclassified_reads_fasta": [ + [ + { + "id": "test", + "single_end": true + }, + "test_1.unclassified.fasta.gz:md5,34ed306e94fa7eed00b1adccd2e0de20" + ] + ], + "versions": [ + "versions.yml:md5,6abf6c733f53fa3b6aaaa6f06864ef0c" + ] + } + ], + "timestamp": "2023-11-21T15:38:42.894597091" + } +} \ No newline at end of file diff --git a/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/tags.yml b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/tags.yml new file mode 100644 index 00000000..35ffde4d --- /dev/null +++ b/modules/nf-core/krakenuniq/preloadedkrakenuniq/tests/tags.yml @@ -0,0 +1,2 @@ +krakenuniq/preloadedkrakenuniq: + - modules/nf-core/krakenuniq/preloadedkrakenuniq/** diff --git a/modules/nf-core/krona/ktimporttaxonomy/main.nf b/modules/nf-core/krona/ktimporttaxonomy/main.nf new file mode 100644 index 00000000..0758a382 --- /dev/null +++ b/modules/nf-core/krona/ktimporttaxonomy/main.nf @@ -0,0 +1,41 @@ +process KRONA_KTIMPORTTAXONOMY { + tag "${meta.id}" + label 'process_single' + + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + conda "bioconda::krona=2.8" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krona:2.8--pl5262hdfd78af_2' : + 'biocontainers/krona:2.8--pl5262hdfd78af_2' }" + + input: + tuple val(meta), path(report) + path taxonomy, stageAs: 'taxonomy.tab' + + output: + tuple val(meta), path ('*.html'), emit: html + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = '2.8' // WARN: Version information not provided by tool on CLI. Please update this string when bumping container versions. + """ + TAXONOMY=\$(find -L . 
-name '*.tab' -exec dirname {} \\;) + echo \$TAXONOMY + + ktImportTaxonomy \\ + $args \\ + -o ${prefix}.html \\ + -tax \$TAXONOMY/ \\ + $report + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krona: $VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/krona/ktimporttaxonomy/meta.yml b/modules/nf-core/krona/ktimporttaxonomy/meta.yml new file mode 100644 index 00000000..dfcd2f2b --- /dev/null +++ b/modules/nf-core/krona/ktimporttaxonomy/meta.yml @@ -0,0 +1,46 @@ +name: krona_ktimporttaxonomy +description: KronaTools Import Taxonomy imports taxonomy classifications and produces an interactive Krona plot. +keywords: + - plot + - taxonomy + - interactive + - html + - visualisation + - krona chart +tools: + - krona: + description: Krona Tools is a set of scripts to create Krona charts from several Bioinformatics tools as well as from text and XML files. + homepage: https://github.com/marbl/Krona/wiki/KronaTools + documentation: http://manpages.ubuntu.com/manpages/impish/man1/ktImportTaxonomy.1.html + doi: 10.1186/1471-2105-12-385 + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - database: + type: file + description: | + Path to a Krona taxonomy .tab file normally downloaded and generated by + krona/ktUpdateTaxonomy. Custom taxonomy files can have any name, but + must end in `.tab`. + pattern: "*tab" + - report: + type: file + description: "A tab-delimited file with taxonomy IDs and (optionally) query IDs, magnitudes, and scores. Query IDs are taken from column 1, taxonomy IDs from column 2, and scores from column 3. Lines beginning with # will be ignored." + pattern: "*.{tsv}" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - html: + type: file + description: A html file containing an interactive krona plot. + pattern: "*.{html}" + +authors: + - "@mjakobs" diff --git a/modules/nf-core/krona/ktimporttext/main.nf b/modules/nf-core/krona/ktimporttext/main.nf new file mode 100644 index 00000000..43280191 --- /dev/null +++ b/modules/nf-core/krona/ktimporttext/main.nf @@ -0,0 +1,34 @@ +process KRONA_KTIMPORTTEXT { + tag "$meta.id" + label 'process_single' + + conda "bioconda::krona=2.8.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/krona:2.8.1--pl5321hdfd78af_1': + 'biocontainers/krona:2.8.1--pl5321hdfd78af_1' }" + + input: + tuple val(meta), path(report) + + output: + tuple val(meta), path ('*.html'), emit: html + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + ktImportText \\ + $args \\ + -o ${prefix}.html \\ + $report + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + krona: \$( echo \$(ktImportText 2>&1) | sed 's/^.*KronaTools //g; s/- ktImportText.*\$//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/krona/ktimporttext/meta.yml b/modules/nf-core/krona/ktimporttext/meta.yml new file mode 100644 index 00000000..a7108e0d --- /dev/null +++ b/modules/nf-core/krona/ktimporttext/meta.yml @@ -0,0 +1,47 @@ +name: "krona_ktimporttext" +description: Creates a Krona chart from text files listing quantities and lineages. 
+keywords: + - plot + - taxonomy + - interactive + - html + - visualisation + - krona chart + - metagenomics +tools: + - krona: + description: Krona Tools is a set of scripts to create Krona charts from several Bioinformatics tools as well as from text and XML files. + homepage: https://github.com/marbl/Krona/wiki/KronaTools + documentation: http://manpages.ubuntu.com/manpages/impish/man1/ktImportTaxonomy.1.html + tool_dev_url: https://github.com/marbl/Krona + doi: 10.1186/1471-2105-12-385 + licence: https://raw.githubusercontent.com/marbl/Krona/master/KronaTools/LICENSE.txt + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - report: + type: file + description: "Tab-delimited text file. Each line should be a number followed by a list of wedges to contribute to (starting from the highest level). If no wedges are listed (and just a quantity is given), it will contribute to the top level. If the same lineage is listed more than once, the values will be added. Quantities can be omitted if -q is specified. Lines beginning with '#' will be ignored." + pattern: "*.{txt}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - html: + type: file + description: A html file containing an interactive krona plot. + pattern: "*.{html}" + +authors: + - "@jianhong" diff --git a/modules/nf-core/malt/run/main.nf b/modules/nf-core/malt/run/main.nf new file mode 100644 index 00000000..3ece2a45 --- /dev/null +++ b/modules/nf-core/malt/run/main.nf @@ -0,0 +1,41 @@ +process MALT_RUN { + tag "$meta.id" + label 'process_high' + + conda "bioconda::malt=0.61" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/malt:0.61--hdfd78af_0' : + 'biocontainers/malt:0.61--hdfd78af_0' }" + + input: + tuple val(meta), path(fastqs) + path index + + output: + tuple val(meta), path("*.rma6") , emit: rma6 + tuple val(meta), path("*.{tab,text,sam}"), optional:true, emit: alignments + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + malt-run \\ + -t $task.cpus \\ + -v \\ + -o . \\ + $args \\ + --inFile ${fastqs.join(' ')} \\ + --index $index/ |&tee ${prefix}-malt-run.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + malt: \$(malt-run --help 2>&1 | grep -o 'version.* ' | cut -f 1 -d ',' | cut -f2 -d ' ') + END_VERSIONS + """ +} diff --git a/modules/nf-core/malt/run/meta.yml b/modules/nf-core/malt/run/meta.yml new file mode 100644 index 00000000..f5ee655a --- /dev/null +++ b/modules/nf-core/malt/run/meta.yml @@ -0,0 +1,54 @@ +name: malt_run +description: MALT, an acronym for MEGAN alignment tool, is a sequence alignment and analysis tool designed for processing high-throughput sequencing data, especially in the context of metagenomics. 
+keywords: + - malt + - alignment + - metagenomics + - ancient DNA + - aDNA + - palaeogenomics + - archaeogenomics + - microbiome +tools: + - malt: + description: A tool for mapping metagenomic data + homepage: https://www.wsi.uni-tuebingen.de/lehrstuehle/algorithms-in-bioinformatics/software/malt/ + documentation: https://software-ab.cs.uni-tuebingen.de/download/malt/manual.pdf + + doi: "10.1038/s41559-017-0446-6" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fastqs: + type: file + description: Input FASTQ files + pattern: "*.{fastq.gz,fq.gz}" + - index: + type: directory + description: Index/database directory from malt-build + pattern: "*/" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - rma6: + type: file + description: MEGAN6 RMA6 file + pattern: "*.rma6" + - sam: + type: file + description: Alignment files in Tab, Text or MEGAN-compatible SAM format + pattern: "*.{tab,txt,sam}" + - log: + type: file + description: Log of verbose MALT stdout + pattern: "*-malt-run.log" + +authors: + - "@jfy133" diff --git a/modules/nf-core/megan/rma2info/environment.yml b/modules/nf-core/megan/rma2info/environment.yml new file mode 100644 index 00000000..471e5507 --- /dev/null +++ b/modules/nf-core/megan/rma2info/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::megan=6.21.7 diff --git a/modules/nf-core/megan/rma2info/main.nf b/modules/nf-core/megan/rma2info/main.nf new file mode 100644 index 00000000..e91af504 --- /dev/null +++ b/modules/nf-core/megan/rma2info/main.nf @@ -0,0 +1,38 @@ +process MEGAN_RMA2INFO { + tag "$meta.id" + label 'process_single' + + conda 'modules/nf-core/megan/rma2info/environment.yml' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/megan:6.24.20--h9ee0642_0': + 'biocontainers/megan:6.24.20--h9ee0642_0' }" + + input: + tuple val(meta), path(rma6) + val(megan_summary) + + output: + tuple val(meta), path("*.txt.gz") , emit: txt + tuple val(meta), path("*.megan"), optional: true, emit: megan_summary + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def summary = megan_summary ? 
"-es ${prefix}.megan" : "" + """ + rma2info \\ + -i ${rma6} \\ + -o ${prefix}.txt.gz \\ + ${summary} \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + megan: \$(echo \$(rma2info 2>&1) | grep version | sed 's/.*version //g;s/, built.*//g') + END_VERSIONS + """ +} diff --git a/modules/nf-core/megan/rma2info/meta.yml b/modules/nf-core/megan/rma2info/meta.yml new file mode 100644 index 00000000..af3dd96c --- /dev/null +++ b/modules/nf-core/megan/rma2info/meta.yml @@ -0,0 +1,50 @@ +name: "megan_rma2info" +description: Analyses an RMA file and exports information in text format +keywords: + - megan + - rma6 + - classification + - conversion +tools: + - "megan": + description: "A tool for studying the taxonomic content of a set of DNA reads" + homepage: "https://uni-tuebingen.de/fakultaeten/mathematisch-naturwissenschaftliche-fakultaet/fachbereiche/informatik/lehrstuehle/algorithms-in-bioinformatics/software/megan6/" + documentation: "https://software-ab.cs.uni-tuebingen.de/download/megan6/welcome.html" + tool_dev_url: "https://github.com/husonlab/megan-ce" + doi: "10.1371/journal.pcbi.1004957" + licence: "['GPL >=3']" +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - rma6: + type: file + description: RMA6 file from MEGAN or MALT + pattern: "*.rma6" + - megan_summary: + type: boolean + description: Specify whether to generate an MEGAN summary file +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Compressed text file + pattern: "*.txt.gz" + - megan_summary: + type: file + description: Optionally generated MEGAN summary file + pattern: "*.megan" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/modules/nf-core/metaphlan/mergemetaphlantables/main.nf b/modules/nf-core/metaphlan/mergemetaphlantables/main.nf new file mode 100644 index 00000000..0403bee9 --- /dev/null +++ b/modules/nf-core/metaphlan/mergemetaphlantables/main.nf @@ -0,0 +1,33 @@ +process METAPHLAN_MERGEMETAPHLANTABLES { + label 'process_single' + + conda "bioconda::metaphlan=4.0.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/metaphlan:4.0.6--pyhca03a8a_0' : + 'biocontainers/metaphlan:4.0.6--pyhca03a8a_0' }" + + input: + tuple val(meta), path(profiles) + + output: + tuple val(meta), path("${prefix}.txt") , emit: txt + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + merge_metaphlan_tables.py \\ + $args \\ + -o ${prefix}.txt \\ + ${profiles} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaphlan: \$(metaphlan --version 2>&1 | awk '{print \$3}') + END_VERSIONS + """ +} diff --git a/modules/nf-core/metaphlan/mergemetaphlantables/meta.yml b/modules/nf-core/metaphlan/mergemetaphlantables/meta.yml new file mode 100644 index 00000000..3c93964b --- /dev/null +++ b/modules/nf-core/metaphlan/mergemetaphlantables/meta.yml @@ -0,0 +1,45 @@ +name: "metaphlan_mergemetaphlantables" +description: Merges output abundance tables from MetaPhlAn4 +keywords: + - metagenomics + - classification + - merge + - table + - profiles +tools: + - metaphlan4: + description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance + homepage: https://huttenhower.sph.harvard.edu/metaphlan/ + documentation: https://github.com/biobakery/MetaPhlAn + doi: "10.1038/s41587-023-01688-w" + licence: ["MIT License"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - profiles: + type: file + description: List of per-sample MetaPhlAn4 taxonomic abundance tables + pattern: "*" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: Combined MetaPhlAn4 table + pattern: "*.txt" + +authors: + - "@jfy133" + - "@LilyAnderssonLee" diff --git a/modules/nf-core/metaphlan/metaphlan/main.nf b/modules/nf-core/metaphlan/metaphlan/main.nf new file mode 100644 index 00000000..24533571 --- /dev/null +++ b/modules/nf-core/metaphlan/metaphlan/main.nf @@ -0,0 +1,50 @@ +process METAPHLAN_METAPHLAN { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::metaphlan=4.0.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/metaphlan:4.0.6--pyhca03a8a_0' : + 'biocontainers/metaphlan:4.0.6--pyhca03a8a_0' }" + + input: + tuple val(meta), path(input) + path metaphlan_db_latest + + output: + tuple val(meta), path("*_profile.txt") , emit: profile + tuple val(meta), path("*.biom") , emit: biom + tuple val(meta), path('*.bowtie2out.txt'), optional:true, emit: bt2out + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def input_type = "$input" =~ /.*\.(fastq|fq)/ ? "--input_type fastq" : "$input" =~ /.*\.(fasta|fna|fa)/ ? "--input_type fasta" : "$input".endsWith(".bowtie2out.txt") ? "--input_type bowtie2out" : "--input_type sam" + def input_data = ("$input_type".contains("fastq")) && !meta.single_end ? "${input[0]},${input[1]}" : "$input" + def bowtie2_out = "$input_type" == "--input_type bowtie2out" || "$input_type" == "--input_type sam" ? 
'' : "--bowtie2out ${prefix}.bowtie2out.txt" + + """ + BT2_DB=`find -L "${metaphlan_db_latest}" -name "*rev.1.bt2*" -exec dirname {} \\;` + BT2_DB_INDEX=`find -L ${metaphlan_db_latest} -name "*.rev.1.bt2*" | sed 's/\\.rev.1.bt2.*\$//' | sed 's/.*\\///'` + + metaphlan \\ + --nproc $task.cpus \\ + $input_type \\ + $input_data \\ + $args \\ + $bowtie2_out \\ + --bowtie2db \$BT2_DB \\ + --index \$BT2_DB_INDEX \\ + --biom ${prefix}.biom \\ + --output_file ${prefix}_profile.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + metaphlan: \$(metaphlan --version 2>&1 | awk '{print \$3}') + END_VERSIONS + """ +} diff --git a/modules/nf-core/metaphlan/metaphlan/meta.yml b/modules/nf-core/metaphlan/metaphlan/meta.yml new file mode 100644 index 00000000..cb74bd59 --- /dev/null +++ b/modules/nf-core/metaphlan/metaphlan/meta.yml @@ -0,0 +1,59 @@ +name: metaphlan_metaphlan +description: MetaPhlAn is a tool for profiling the composition of microbial communities from metagenomic shotgun sequencing data. +keywords: + - metagenomics + - classification + - fastq + - fasta + - sam +tools: + - metaphlan: + description: Identify clades (phyla to species) present in the metagenome obtained from a microbiome sample and their relative abundance + homepage: https://huttenhower.sph.harvard.edu/metaphlan/ + documentation: https://github.com/biobakery/MetaPhlAn + doi: "10.1038/s41587-023-01688-w" + licence: ["MIT License"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: Metaphlan can classify the metagenome from a variety of input data types, including FASTQ files (single-end and paired-end), FASTA, bowtie2-produced SAM files (produced from alignments to the MetaPHlAn marker database) and intermediate bowtie2 alignment files (bowtie2out) + pattern: "*.{fastq.gz, fasta, fasta.gz, sam, bowtie2out.txt}" + - metaphlan_db: + type: file + description: | + Directory containing pre-downloaded and uncompressed MetaPhlAn database downloaded from: http://cmprod1.cibio.unitn.it/biobakery4/metaphlan_databases/. + Note that you will also need to specify `--index` and the database version name (e.g. 'mpa_vJan21_TOY_CHOCOPhlAnSGB_202103') in your module.conf ext.args for METAPHLAN_METAPHLAN! + pattern: "*/" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - profile: + type: file + description: Tab-separated output file of the predicted taxon relative abundances + pattern: "*.{txt}" + - biom: + type: file + description: General-use format for representing biological sample by observation contingency tables + pattern: "*.{biom}" + - bowtie2out: + type: file + description: Intermediate Bowtie2 output produced from mapping the metagenome against the MetaPHlAn marker database ( not compatible with `bowtie2out` files generated with MetaPhlAn versions below 3 ) + pattern: "*.{bowtie2out.txt}" + +authors: + - "@MGordon09" + - "@LilyAnderssonLee" diff --git a/modules/nf-core/minimap2/align/main.nf b/modules/nf-core/minimap2/align/main.nf new file mode 100644 index 00000000..4da47c18 --- /dev/null +++ b/modules/nf-core/minimap2/align/main.nf @@ -0,0 +1,48 @@ +process MINIMAP2_ALIGN { + tag "$meta.id" + label 'process_medium' + + // Note: the versions here need to match the versions used in the mulled container below and minimap2/index + conda "bioconda::minimap2=2.24 bioconda::samtools=1.14" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' : + 'biocontainers/mulled-v2-66534bcbb7031a148b13e2ad42583020b9cd25c4:1679e915ddb9d6b4abda91880c4b48857d471bd8-0' }" + + input: + tuple val(meta), path(reads) + path reference + val bam_format + val cigar_paf_format + val cigar_bam + + output: + tuple val(meta), path("*.paf"), optional: true, emit: paf + tuple val(meta), path("*.bam"), optional: true, emit: bam + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def bam_output = bam_format ? "-a | samtools sort | samtools view -@ ${task.cpus} -b -h -o ${prefix}.bam" : "-o ${prefix}.paf" + def cigar_paf = cigar_paf_format && !bam_format ? "-c" : '' + def set_cigar_bam = cigar_bam && bam_format ? "-L" : '' + """ + minimap2 \\ + $args \\ + -t $task.cpus \\ + "${reference ?: reads}" \\ + "$reads" \\ + $cigar_paf \\ + $set_cigar_bam \\ + $bam_output + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/align/meta.yml b/modules/nf-core/minimap2/align/meta.yml new file mode 100644 index 00000000..991b39a0 --- /dev/null +++ b/modules/nf-core/minimap2/align/meta.yml @@ -0,0 +1,65 @@ +name: minimap2_align +description: A versatile pairwise aligner for genomic and spliced nucleotide sequences +keywords: + - align + - fasta + - fastq + - genome + - paf + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FASTA or FASTQ files of size 1 and 2 for single-end + and paired-end data, respectively. + - reference: + type: file + description: | + Reference database in FASTA format. 
+ - bam_format: + type: boolean + description: Specify that output should be in BAM format + - cigar_paf_format: + type: boolean + description: Specify that output CIGAR should be in PAF format + - cigar_bam: + type: boolean + description: | + Write CIGAR with >65535 ops at the CG tag. This is recommended when + doing XYZ (https://github.com/lh3/minimap2#working-with-65535-cigar-operations) +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - paf: + type: file + description: Alignment in PAF format + pattern: "*.paf" + - bam: + type: file + description: Alignment in BAM format + pattern: "*.bam" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@heuermh" + - "@sofstam" + - "@sateeshperi" + - "@jfy133" diff --git a/modules/nf-core/minimap2/index/main.nf b/modules/nf-core/minimap2/index/main.nf new file mode 100644 index 00000000..7a1bb227 --- /dev/null +++ b/modules/nf-core/minimap2/index/main.nf @@ -0,0 +1,34 @@ +process MINIMAP2_INDEX { + label 'process_medium' + + // Note: the versions here need to match the versions used in minimap2/align + conda "bioconda::minimap2=2.24" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/minimap2:2.24--h7132678_1' : + 'biocontainers/minimap2:2.24--h7132678_1' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.mmi"), emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + minimap2 \\ + -t $task.cpus \\ + -d ${fasta.baseName}.mmi \\ + $args \\ + $fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimap2: \$(minimap2 --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimap2/index/meta.yml b/modules/nf-core/minimap2/index/meta.yml new file mode 100644 index 00000000..b58f35c6 --- /dev/null +++ b/modules/nf-core/minimap2/index/meta.yml @@ -0,0 +1,40 @@ +name: minimap2_index +description: Provides fasta index required by minimap2 alignment. +keywords: + - index + - fasta + - reference +tools: + - minimap2: + description: | + A versatile pairwise aligner for genomic and spliced nucleotide sequences. + homepage: https://github.com/lh3/minimap2 + documentation: https://github.com/lh3/minimap2#uguide + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: | + Reference database in FASTA format. +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - index: + type: file + description: Minimap2 fasta index. + pattern: "*.mmi" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@yuukiiwa" + - "@drpatelh" diff --git a/modules/nf-core/motus/merge/main.nf b/modules/nf-core/motus/merge/main.nf new file mode 100644 index 00000000..adc60dc8 --- /dev/null +++ b/modules/nf-core/motus/merge/main.nf @@ -0,0 +1,45 @@ +process MOTUS_MERGE { + tag "$meta.id" + label 'process_single' + + conda "bioconda::motus=3.0.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/motus:3.0.3--pyhdfd78af_0': + 'biocontainers/motus:3.0.3--pyhdfd78af_0' }" + + input: + tuple val(meta), path(input) + path db // to stop docker saying it can't find it... would have to have the module in upstream steps anyway + path profile_version_yml, stageAs: 'profile_version.yml' + + output: + tuple val(meta), path("*.txt") , optional: true, emit: txt + tuple val(meta), path("*.biom"), optional: true, emit: biom + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def cmd_input = input.size() > 1 ? "-i ${input.join(',')}" : input.isDirectory() ? "-d ${input}" : "-i ${input}" + def suffix = task.ext.args?.contains("-B") ? "biom" : "txt" + """ + motus \\ + merge \\ + -db $db \\ + ${cmd_input} \\ + $args \\ + -o ${prefix}.${suffix} + + ## Take version from the mOTUs/profile module output, as cannot reconstruct + ## version without having database staged in this directory. + VERSION=\$(cat ${profile_version_yml} | grep '/*motus:.*' | sed 's/.*otus: //g') + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + motus: \$VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/motus/merge/meta.yml b/modules/nf-core/motus/merge/meta.yml new file mode 100644 index 00000000..24ae9399 --- /dev/null +++ b/modules/nf-core/motus/merge/meta.yml @@ -0,0 +1,54 @@ +name: "motus_merge" +description: Taxonomic meta-omics profiling using universal marker genes +keywords: + - classify + - metagenomics + - fastq + - taxonomic profiling + - merging + - merge + - otu table +tools: + - "motus": + description: "Marker gene-based OTU (mOTU) profiling" + homepage: "https://motu-tool.org/" + documentation: "https://github.com/motu-tool/mOTUs/wiki" + tool_dev_url: "https://github.com/motu-tool/mOTUs" + doi: "10.1186/s40168-022-01410-z" + licence: "['GPL v3']" + +input: + - input: + type: file + description: | + List of output files (more than one) from motus profile, + or a single directory containing motus output files. + - db: + type: directory + description: | + mOTUs database downloaded by `motus downloadDB` + pattern: "db_mOTU/" + - profile_version_yml: + type: file + description: | + A single versions.yml file output from motus/profile. motus/merge cannot reconstruct + this itself without having the motus database present and configured with the tool + so here we take it from what is already reported by the upstream module. + pattern: "versions.yml" + +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - txt: + type: file + description: OTU table in txt format, if BIOM format not requested + pattern: "*.txt" + - biom: + type: file + description: OTU table in biom format, if BIOM format requested + pattern: "*.biom" + +authors: + - "@jfy133" diff --git a/modules/nf-core/motus/profile/main.nf b/modules/nf-core/motus/profile/main.nf new file mode 100644 index 00000000..ca61865a --- /dev/null +++ b/modules/nf-core/motus/profile/main.nf @@ -0,0 +1,56 @@ +process MOTUS_PROFILE { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::motus=3.0.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/motus:3.0.3--pyhdfd78af_0': + 'biocontainers/motus:3.0.3--pyhdfd78af_0' }" + + input: + tuple val(meta), path(reads) + path db + + output: + tuple val(meta), path("*.out"), emit: out + tuple val(meta), path("*.bam"), optional: true, emit: bam + tuple val(meta), path("*.mgc"), optional: true, emit: mgc + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def inputs = reads[0].getExtension() == 'bam' ? + "-i ${reads}" : + reads[0].getExtension() == 'mgc' ? "-m $reads" : + meta.single_end ? + "-s $reads" : "-f ${reads[0]} -r ${reads[1]}" + def refdb = db ? "-db ${db}" : "" + """ + motus profile \\ + $args \\ + $inputs \\ + $refdb \\ + -t $task.cpus \\ + -n $prefix \\ + -o ${prefix}.out \\ + 2> ${prefix}.log + + ## mOTUs version number is not available from command line. + ## mOTUs save the version number in index database folder. + ## mOTUs will check the database version is same version as exec version. + if [ "$db" == "" ]; then + VERSION=\$(echo \$(motus -h 2>&1) | sed 's/^.*Version: //; s/References.*\$//') + else + VERSION=\$(grep motus $db/db_mOTU_versions | sed 's/motus\\t//g') + fi + cat <<-END_VERSIONS > versions.yml + "${task.process}": + motus: \$VERSION + END_VERSIONS + """ +} diff --git a/modules/nf-core/motus/profile/meta.yml b/modules/nf-core/motus/profile/meta.yml new file mode 100644 index 00000000..12020425 --- /dev/null +++ b/modules/nf-core/motus/profile/meta.yml @@ -0,0 +1,65 @@ +name: "motus_profile" +description: Taxonomic meta-omics profiling using universal marker genes +keywords: + - classify + - metagenomics + - fastq + - taxonomic profiling +tools: + - "motus": + description: "Marker gene-based OTU (mOTU) profiling" + homepage: "https://motu-tool.org/" + documentation: "https://github.com/motu-tool/mOTUs/wiki" + tool_dev_url: "https://github.com/motu-tool/mOTUs" + doi: "10.1186/s40168-022-01410-z" + licence: "['GPL v3']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input fastq/fasta files of size 1 and 2 for single-end and paired-end data, + respectively. + Or the intermediate bam file mapped by bwa to the mOTUs database or + the output bam file from motus profile. + Or the intermediate mgc read counts table. + pattern: "*.{fastq,fq,fasta,fa,fastq.gz,fq.gz,fasta.gz,fa.gz,.bam,.mgc}" + - db: + type: directory + description: | + mOTUs database downloaded by `motus downloadDB` + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - out: + type: file + description: Results with taxonomic classification of each read + pattern: "*.out" + - bam: + type: file + description: Optional intermediate sorted BAM file from BWA + pattern: "*.{bam}" + - mgc: + type: file + description: Optional intermediate mgc read count table file saved with `-M`. 
+ pattern: "*.{mgc}" + - log: + type: file + description: Standard error logging file containing summary statistics + pattern: "*.log" + +authors: + - "@jianhong" diff --git a/modules/nf-core/porechop/porechop/main.nf b/modules/nf-core/porechop/porechop/main.nf new file mode 100644 index 00000000..648f2029 --- /dev/null +++ b/modules/nf-core/porechop/porechop/main.nf @@ -0,0 +1,41 @@ +process PORECHOP_PORECHOP { + tag "$meta.id" + label 'process_medium' + + conda "bioconda::porechop=0.2.4" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/porechop:0.2.4--py39h7cff6ad_2' : + 'biocontainers/porechop:0.2.4--py39h7cff6ad_2' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*_porechopped.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + ## To ensure ID matches rest of pipeline based on meta.id rather than input file name + + [[ -f ${prefix}.fastq.gz ]] || ln -s $reads ${prefix}.fastq.gz + + porechop \\ + -i ${prefix}.fastq.gz \\ + -t $task.cpus \\ + $args \\ + -o ${prefix}_porechopped.fastq.gz \\ + > ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop: \$( porechop --version ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/porechop/porechop/meta.yml b/modules/nf-core/porechop/porechop/meta.yml new file mode 100644 index 00000000..98b838f6 --- /dev/null +++ b/modules/nf-core/porechop/porechop/meta.yml @@ -0,0 +1,55 @@ +name: "porechop_porechop" +description: Adapter removal and demultiplexing of Oxford Nanopore reads +keywords: + - adapter + - nanopore + - demultiplexing +tools: + - porechop: + description: Adapter removal and demultiplexing of Oxford Nanopore reads + homepage: "https://github.com/rrwick/Porechop" + documentation: "https://github.com/rrwick/Porechop" + tool_dev_url: "https://github.com/rrwick/Porechop" + doi: "10.1099/mgen.0.000132" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: fastq/fastq.gz file + pattern: "*.{fastq,fastq.gz,fq,fq.gz}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Demultiplexed and/or adapter-trimmed fastq.gz file + pattern: "*.{fastq.gz}" + - log: + type: file + description: Log file containing stdout information + pattern: "*.log" + +authors: + - "@ggabernet" + - "@jasmezz" + - "@d4straub" + - "@LaurenceKuhl" + - "@SusiJo" + - "@jonasscheid" + - "@jonoave" + - "@GokceOGUZ" + - "@jfy133" diff --git a/modules/nf-core/porechop/porechop/porechop-porechop.diff b/modules/nf-core/porechop/porechop/porechop-porechop.diff new file mode 100644 index 00000000..5d51f24a --- /dev/null +++ b/modules/nf-core/porechop/porechop/porechop-porechop.diff @@ -0,0 +1,34 @@ +Changes in module 'nf-core/porechop/porechop' +--- modules/nf-core/porechop/porechop/main.nf ++++ modules/nf-core/porechop/porechop/main.nf +@@ -11,7 +11,7 @@ + tuple val(meta), path(reads) + + output: +- tuple val(meta), path("*.fastq.gz"), emit: reads ++ tuple val(meta), path("*_porechopped.fastq.gz"), emit: reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + +@@ -22,12 +22,17 @@ + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ ++ ## To ensure ID matches rest of pipeline based on meta.id rather than input file name ++ ++ [[ -f ${prefix}.fastq.gz ]] || ln -s $reads ${prefix}.fastq.gz ++ + porechop \\ +- -i $reads \\ ++ -i ${prefix}.fastq.gz \\ + -t $task.cpus \\ + $args \\ +- -o ${prefix}.fastq.gz \\ ++ -o ${prefix}_porechopped.fastq.gz \\ + > ${prefix}.log ++ + cat <<-END_VERSIONS > versions.yml + "${task.process}": + porechop: \$( porechop --version ) + +************************************************************ diff --git a/modules/nf-core/prinseqplusplus/main.nf b/modules/nf-core/prinseqplusplus/main.nf new file mode 100644 index 00000000..63b2c723 --- /dev/null +++ b/modules/nf-core/prinseqplusplus/main.nf @@ -0,0 +1,61 @@ +process PRINSEQPLUSPLUS { + tag "$meta.id" + label 'process_low' + + conda "bioconda::prinseq-plus-plus=1.2.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/prinseq-plus-plus:1.2.3--hc90279e_1': + 'biocontainers/prinseq-plus-plus:1.2.3--hc90279e_1' }" + + input: + tuple val(meta), path(reads) + + output: + tuple val(meta), path("*_good_out*.fastq.gz") , emit: good_reads + tuple val(meta), path("*_single_out*.fastq.gz"), optional: true, emit: single_reads + tuple val(meta), path("*_bad_out*.fastq.gz") , optional: true, emit: bad_reads + tuple val(meta), path("*.log") , emit: log + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + if (meta.single_end) { + """ + prinseq++ \\ + -threads $task.cpus \\ + -fastq ${reads} \\ + -out_name ${prefix} \\ + -out_gz \\ + -VERBOSE 1 \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' )) + END_VERSIONS + """ + } else { + """ + prinseq++ \\ + -threads $task.cpus \\ + -fastq ${reads[0]} \\ + -fastq2 ${reads[1]} \\ + -out_name ${prefix} \\ + -out_gz \\ + -VERBOSE 1 \\ + $args \\ + | tee ${prefix}.log + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prinseqplusplus: \$(echo \$(prinseq++ --version | cut -f 2 -d ' ' )) + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/prinseqplusplus/meta.yml b/modules/nf-core/prinseqplusplus/meta.yml new file mode 100644 index 00000000..8155df93 --- /dev/null +++ b/modules/nf-core/prinseqplusplus/meta.yml @@ -0,0 +1,60 @@ +name: "prinseqplusplus" +description: PRINSEQ++ is a C++ implementation of the prinseq-lite.pl program. It can be used to filter, reformat or trim genomic and metagenomic sequence data +keywords: + - fastq + - fasta + - filter + - trim +tools: + - "prinseqplusplus": + description: "PRINSEQ++ - Multi-threaded C++ sequence cleaning" + homepage: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + documentation: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + tool_dev_url: "https://github.com/Adrian-Cantu/PRINSEQ-plus-plus" + doi: "10.7287/peerj.preprints.27553v1" + licence: "['GPL v2']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end + data, respectively. + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - good_reads: + type: file + description: Reads passing filter(s) in gzipped FASTQ format + pattern: "*_good_out_{R1,R2}.fastq.gz" + - single_reads: + type: file + description: | + Single reads without the pair passing filter(s) in gzipped FASTQ format + pattern: "*_single_out_{R1,R2}.fastq.gz" + - bad_reads: + type: file + description: | + Reads without not passing filter(s) in gzipped FASTQ format + pattern: "*_bad_out_{R1,R2}.fastq.gz" + - log: + type: file + description: | + Verbose level 2 STDOUT information in a log file + pattern: "*.log" + +authors: + - "@jfy133" diff --git a/modules/nf-core/samtools/fastq/main.nf b/modules/nf-core/samtools/fastq/main.nf new file mode 100644 index 00000000..15d89769 --- /dev/null +++ b/modules/nf-core/samtools/fastq/main.nf @@ -0,0 +1,44 @@ +process SAMTOOLS_FASTQ { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + val(interleave) + + output: + tuple val(meta), path("*_{1,2}.fastq.gz") , optional:true, emit: fastq + tuple val(meta), path("*_interleaved.fastq.gz"), optional:true, emit: interleaved + tuple val(meta), path("*_singleton.fastq.gz") , optional:true, emit: singleton + tuple val(meta), path("*_other.fastq.gz") , optional:true, emit: other + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def output = ( interleave && ! meta.single_end ) ? "> ${prefix}_interleaved.fastq.gz" : + meta.single_end ? "-1 ${prefix}_1.fastq.gz -s ${prefix}_singleton.fastq.gz" : + "-1 ${prefix}_1.fastq.gz -2 ${prefix}_2.fastq.gz -s ${prefix}_singleton.fastq.gz" + """ + samtools \\ + fastq \\ + $args \\ + --threads ${task.cpus-1} \\ + -0 ${prefix}_other.fastq.gz \\ + $input \\ + $output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/fastq/meta.yml b/modules/nf-core/samtools/fastq/meta.yml new file mode 100644 index 00000000..b1a1ed38 --- /dev/null +++ b/modules/nf-core/samtools/fastq/meta.yml @@ -0,0 +1,62 @@ +name: samtools_fastq +description: Converts a SAM/BAM/CRAM file to FASTQ +keywords: + - bam + - sam + - cram + - fastq +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - interleave: + type: boolean + description: Set true for interleaved fastq file + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Compressed FASTQ file(s) with reads with either the READ1 or READ2 flag set in separate files. + pattern: "*_{1,2}.fastq.gz" + - interleaved: + type: file + description: Compressed FASTQ file with reads with either the READ1 or READ2 flag set in a combined file. Needs collated input file. + pattern: "*_interleaved.fastq.gz" + - singleton: + type: file + description: Compressed FASTQ file with singleton reads + pattern: "*_singleton.fastq.gz" + - other: + type: file + description: Compressed FASTQ file with reads with either both READ1 and READ2 flags set or unset + pattern: "*_other.fastq.gz" + +authors: + - "@priyanka-surana" + - "@suzannejin" diff --git a/modules/nf-core/samtools/index/main.nf b/modules/nf-core/samtools/index/main.nf new file mode 100644 index 00000000..0b20aa4b --- /dev/null +++ b/modules/nf-core/samtools/index/main.nf @@ -0,0 +1,48 @@ +process SAMTOOLS_INDEX { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input) + + output: + tuple val(meta), path("*.bai") , optional:true, emit: bai + tuple val(meta), path("*.csi") , optional:true, emit: csi + tuple val(meta), path("*.crai"), optional:true, emit: crai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + samtools \\ + index \\ + -@ ${task.cpus-1} \\ + $args \\ + $input + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + """ + touch ${input}.bai + touch ${input}.crai + touch ${input}.csi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/index/meta.yml b/modules/nf-core/samtools/index/meta.yml new file mode 100644 index 00000000..8bd2fa6f --- /dev/null +++ b/modules/nf-core/samtools/index/meta.yml @@ -0,0 +1,53 @@ +name: samtools_index +description: Index SAM/BAM/CRAM file +keywords: + - index + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - bai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - crai: + type: file + description: BAM/CRAM/SAM index file + pattern: "*.{bai,crai,sai}" + - csi: + type: file + description: CSI index file + pattern: "*.{csi}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@ewels" + - "@maxulysse" diff --git a/modules/nf-core/samtools/stats/main.nf b/modules/nf-core/samtools/stats/main.nf new file mode 100644 index 00000000..4a2607de --- /dev/null +++ b/modules/nf-core/samtools/stats/main.nf @@ -0,0 +1,49 @@ +process SAMTOOLS_STATS { + tag "$meta.id" + label 'process_single' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(input_index) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.stats"), emit: stats + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + """ + samtools \\ + stats \\ + --threads ${task.cpus} \\ + ${reference} \\ + ${input} \\ + > ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.stats + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/stats/meta.yml b/modules/nf-core/samtools/stats/meta.yml new file mode 100644 index 00000000..90e6345f --- /dev/null +++ b/modules/nf-core/samtools/stats/meta.yml @@ -0,0 +1,59 @@ +name: samtools_stats +description: Produces comprehensive statistics from SAM/BAM/CRAM file +keywords: + - statistics + - counts + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM file from alignment + pattern: "*.{bam,cram}" + - input_index: + type: file + description: BAI/CRAI file from alignment + pattern: "*.{bai,crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - stats: + type: file + description: File containing samtools stats output + pattern: "*.{stats}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@FriederikeHanssen" + - "@ramprasadn" diff --git a/modules/nf-core/samtools/view/main.nf b/modules/nf-core/samtools/view/main.nf new file mode 100644 index 00000000..cb91facf --- /dev/null +++ b/modules/nf-core/samtools/view/main.nf @@ -0,0 +1,66 @@ +process SAMTOOLS_VIEW { + tag "$meta.id" + label 'process_low' + + conda "bioconda::samtools=1.17" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/samtools:1.17--h00cdaf9_0' : + 'biocontainers/samtools:1.17--h00cdaf9_0' }" + + input: + tuple val(meta), path(input), path(index) + tuple val(meta2), path(fasta) + path qname + + output: + tuple val(meta), path("*.bam"), emit: bam, optional: true + tuple val(meta), path("*.cram"), emit: cram, optional: true + tuple val(meta), path("*.sam"), emit: sam, optional: true + tuple val(meta), path("*.bai"), emit: bai, optional: true + tuple val(meta), path("*.csi"), emit: csi, optional: true + tuple val(meta), path("*.crai"), emit: crai, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def reference = fasta ? "--reference ${fasta}" : "" + def readnames = qname ? "--qname-file ${qname}": "" + def file_type = args.contains("--output-fmt sam") ? "sam" : + args.contains("--output-fmt bam") ? "bam" : + args.contains("--output-fmt cram") ? "cram" : + input.getExtension() + if ("$input" == "${prefix}.${file_type}") error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!" + """ + samtools \\ + view \\ + --threads ${task.cpus-1} \\ + ${reference} \\ + ${readnames} \\ + $args \\ + -o ${prefix}.${file_type} \\ + $input \\ + $args2 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.bam + touch ${prefix}.cram + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/samtools/view/meta.yml b/modules/nf-core/samtools/view/meta.yml new file mode 100644 index 00000000..3b05450b --- /dev/null +++ b/modules/nf-core/samtools/view/meta.yml @@ -0,0 +1,84 @@ +name: samtools_view +description: filter/convert SAM/BAM/CRAM file +keywords: + - view + - bam + - sam + - cram +tools: + - samtools: + description: | + SAMtools is a set of utilities for interacting with and post-processing + short DNA sequence read alignments in the SAM, BAM and CRAM formats, written by Heng Li. + These files are generated as output by short read aligners like BWA. + homepage: http://www.htslib.org/ + documentation: http://www.htslib.org/doc/samtools.html + doi: 10.1093/bioinformatics/btp352 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - input: + type: file + description: BAM/CRAM/SAM file + pattern: "*.{bam,cram,sam}" + - index: + type: file + description: BAM.BAI/BAM.CSI/CRAM.CRAI file (optional) + pattern: "*.{.bai,.csi,.crai}" + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - fasta: + type: file + description: Reference file the CRAM was created with (optional) + pattern: "*.{fasta,fa}" + - qname: + type: file + description: Optional file with read names to output only select alignments + pattern: "*.{txt,list}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - bam: + type: file + description: optional filtered/converted BAM file + pattern: "*.{bam}" + - cram: + type: file + description: optional filtered/converted CRAM file + pattern: "*.{cram}" + - sam: + type: file + description: optional filtered/converted SAM file + pattern: "*.{sam}" + # bai, csi, and crai are created with `--write-index` + - bai: + type: file + description: optional BAM file index + pattern: "*.{bai}" + - csi: + type: file + description: optional tabix BAM file index + pattern: "*.{csi}" + - crai: + type: file + description: optional CRAM file index + pattern: "*.{crai}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@drpatelh" + - "@joseespinosa" + - "@FriederikeHanssen" + - "@priyanka-surana" diff --git a/modules/nf-core/taxpasta/merge/main.nf b/modules/nf-core/taxpasta/merge/main.nf new file mode 100644 index 00000000..de135221 --- /dev/null +++ b/modules/nf-core/taxpasta/merge/main.nf @@ -0,0 +1,47 @@ +process TAXPASTA_MERGE { + tag "$meta.id" + label 'process_single' + + conda "bioconda::taxpasta=0.6.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/taxpasta:0.6.1--pyhdfd78af_0': + 'biocontainers/taxpasta:0.6.1--pyhdfd78af_0' }" + + + input: + tuple val(meta), path(profiles) + path taxonomy + path samplesheet + + output: + tuple val(meta), path("*.{tsv,csv,arrow,parquet,biom}"), emit: merged_profiles + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // N.B.: Taxpasta requires a --profiler option and will fail without it. + // This must be specified via a `nextflow.config` or `modules.config`, for + // example, as "--profiler kraken2". Additionally, it requires a --output + // option with the output file name. The desired format will be parsed from + // the name and should correspond to the output pattern specified above, + // e.g., "--output ${task.ext.prefix}.tsv". + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def taxonomy_option = taxonomy ? "--taxonomy ${taxonomy}" : '' + def samplesheet_input = samplesheet ? 
"-s ${samplesheet}" : '' + """ + taxpasta merge \\ + $args \\ + $taxonomy_option \\ + $samplesheet_input \\ + $profiles + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + taxpasta: \$(taxpasta --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/taxpasta/merge/meta.yml b/modules/nf-core/taxpasta/merge/meta.yml new file mode 100644 index 00000000..ed89e62c --- /dev/null +++ b/modules/nf-core/taxpasta/merge/meta.yml @@ -0,0 +1,58 @@ +name: "taxpasta_merge" +description: Standardise and merge two or more taxonomic profiles into a single table +keywords: + - taxonomic profile + - standardise + - standardisation + - metagenomics + - taxonomic profiling + - otu tables + - taxon tables +tools: + - "taxpasta": + description: "TAXonomic Profile Aggregation and STAndardisation" + homepage: "https://taxpasta.readthedocs.io/" + documentation: "https://taxpasta.readthedocs.io/" + tool_dev_url: "https://github.com/taxprofiler/taxpasta" + + licence: "['Apache-2.0']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - profiles: + type: file + description: A list of taxonomic profiler output files (typically in text format, mandatory) + pattern: "*.{tsv,csv,arrow,parquet,biom}" + - samplesheet: + type: file + description: + A samplesheet describing the sample name and a filepath to a taxonomic abundance profile that needs to be relative + from the work environment. The profiles must be provided even if you give a samplesheet as argument (optional) + pattern: "*.{tsv,csv,ods,xlsx,arrow,parquet}" + - taxonomy: + type: directory + description: Directory containing at a minimum nodes.dmp and names.dmp files (optional) + pattern: "*/" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - merged_profiles: + type: file + description: Output file with standardised multiple profiles in one go and have all profiles combined into a single table. + pattern: "*.{tsv,csv,ods,xlsx,arrow,parquet,biom}" + +authors: + - "@sofstam" + - "@jfy133" diff --git a/modules/nf-core/taxpasta/standardise/main.nf b/modules/nf-core/taxpasta/standardise/main.nf new file mode 100644 index 00000000..7822912a --- /dev/null +++ b/modules/nf-core/taxpasta/standardise/main.nf @@ -0,0 +1,42 @@ +process TAXPASTA_STANDARDISE { + tag "$meta.id" + label 'process_single' + + conda "bioconda::taxpasta=0.6.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/taxpasta:0.6.1--pyhdfd78af_0': + 'biocontainers/taxpasta:0.6.1--pyhdfd78af_0' }" + + input: + tuple val(meta), path(profile) + path taxonomy + + output: + tuple val(meta), path("*.{tsv,csv,arrow,parquet,biom}"), emit: standardised_profile + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // N.B.: Taxpasta requires a --profiler option and will fail without it. + // This must be specified via a `nextflow.config` or `modules.config`, for + // example, as "--profiler kraken2". Additionally, it requires a --output + // option with the output file name. The desired format will be parsed from + // the name and should correspond to the output pattern specified above, + // e.g., "--output ${task.ext.prefix}.tsv". 
+ def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def taxonomy_option = taxonomy ? "--taxonomy ${taxonomy}" : '' + """ + taxpasta standardise \\ + $args \\ + $taxonomy_option \\ + $profile + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + taxpasta: \$(taxpasta --version) + END_VERSIONS + """ +} diff --git a/modules/nf-core/taxpasta/standardise/meta.yml b/modules/nf-core/taxpasta/standardise/meta.yml new file mode 100644 index 00000000..81df6e2c --- /dev/null +++ b/modules/nf-core/taxpasta/standardise/meta.yml @@ -0,0 +1,51 @@ +name: "taxpasta_standardise" +description: "Standardise the output of a wide range of taxonomic profilers" +keywords: + - taxonomic profile + - standardise + - standardisation + - metagenomics + - taxonomic profiling + - otu tables + - taxon tables +tools: + - "taxpasta": + description: "TAXonomic Profile Aggregation and STAndardisation" + homepage: "https://taxpasta.readthedocs.io/" + documentation: "https://taxpasta.readthedocs.io/" + tool_dev_url: "https://github.com/taxprofiler/taxpasta" + + licence: "['Apache-2.0']" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - profile: + type: file + description: profiler output file (mandatory) + pattern: "*" + - taxonomy: + type: directory + description: Directory containing at a minimum nodes.dmp and names.dmp files (optional) + pattern: "*/" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - standardised_profile: + type: file + description: Standardised taxonomic profile + pattern: "*.{tsv,csv,arrow,parquet,biom}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@Midnighter" diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 00000000..61461c39 --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,63 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "conda-forge::sed=4.7 conda-forge::grep=3.11 conda-forge::tar=1.34" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 00000000..db241a6e --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,41 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index c8212265..bc40fa64 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,14 +9,18 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags // Input options input = null // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false +<<<<<<< HEAD fasta = null// MultiQC options +======= + + // MultiQC options +>>>>>>> dev multiqc_config = null multiqc_title = null multiqc_logo = null @@ -51,10 +55,139 @@ params { // Schema validation default options validationFailUnrecognisedParams = false validationLenientMode = false - validationSchemaIgnoreParams = 'genomes,igenomes_base' + validationSchemaIgnoreParams = 'genomes,igenomes_base,fasta' validationShowHiddenParams = false validate_params = true + // Databases + databases = null + + // FASTQ preprocessing + skip_preprocessing_qc = false + preprocessing_qc_tool = 'fastqc' + + perform_shortread_qc = false + shortread_qc_tool = 'fastp' + shortread_qc_skipadaptertrim = false + shortread_qc_mergepairs = false + shortread_qc_includeunmerged = false + shortread_qc_adapter1 = null + shortread_qc_adapter2 = null + shortread_qc_adapterlist = null + shortread_qc_minlength = 15 + shortread_qc_dedup = false + + perform_longread_qc = false + longread_qc_skipadaptertrim = false + longread_qc_skipqualityfilter = false + longread_qc_qualityfilter_minlength = 1000 + longread_qc_qualityfilter_keeppercent = 90 + longread_qc_qualityfilter_targetbases = 500000000 + + save_preprocessed_reads = false + + // Complexity filtering + perform_shortread_complexityfilter = false + shortread_complexityfilter_tool = 'bbduk' + shortread_complexityfilter_entropy = 0.3 + shortread_complexityfilter_bbduk_windowsize = 50 + shortread_complexityfilter_bbduk_mask = false + shortread_complexityfilter_prinseqplusplus_mode = 'entropy' + shortread_complexityfilter_prinseqplusplus_dustscore = 0.5 + shortread_complexityfilter_fastp_threshold = 30 + save_complexityfiltered_reads = false + + // run merging + perform_runmerging = false + save_runmerged_reads = false + + // Host Removal + perform_shortread_hostremoval = false + 
perform_longread_hostremoval = false + hostremoval_reference = null + shortread_hostremoval_index = null + longread_hostremoval_index = null + save_hostremoval_index = false + save_hostremoval_bam = false + save_hostremoval_unmapped = false + + // Publishing final reads going into profiling + save_analysis_ready_fastqs = false + + // MALT + run_malt = false + malt_mode = 'BlastN' + malt_generate_megansummary = false + malt_save_reads = false // added via map + database args extension in profiling.nf + + // kraken2 + run_kraken2 = false + kraken2_save_reads = false // added directly to module in profiling.nf + kraken2_save_readclassifications = false // added directly to module in profiling.nf + kraken2_save_minimizers = false + + //krakenuniq + run_krakenuniq = false + krakenuniq_ram_chunk_size = '16G' + krakenuniq_save_reads = false // added directly to module in profiling.nf + krakenuniq_save_readclassifications = false // added directly to module in profiling.nf + krakenuniq_batch_size = 20 + + // Bracken + run_bracken = false + + // centrifuge + run_centrifuge = false + centrifuge_save_reads = false // added directly to module in profiling.nf + + // metaphlan + run_metaphlan = false + + // kaiju + run_kaiju = false + kaiju_expand_viruses = false + kaiju_taxon_rank = 'species' + + // diamond + run_diamond = false + diamond_output_format = 'tsv' // TSV is only format with taxonomic information apparently + diamond_save_reads = false // this will override default diamond output format so no taxonomic profile is generated! added directly to module in profiling.nf + + // mOTUs + run_motus = false + motus_use_relative_abundance = false + motus_remove_ncbi_ids = false + motus_save_mgc_read_counts = false + + // kmcp + run_kmcp = false + kmcp_mode = 3 // default kmcp profiling value + kmcp_save_search = false + + // ganon + run_ganon = false + ganon_report_type = 'reads' + ganon_report_rank = 'default' + ganon_report_toppercentile = 0 + ganon_report_mincount = 0 + ganon_report_maxcount = 0 + ganon_save_readclassifications = false + + // krona + run_krona = false + krona_taxonomy_directory = null + + // profile standardisation + run_profile_standardisation = false + standardisation_taxpasta_format = 'tsv' + taxpasta_taxonomy_dir = null + taxpasta_add_name = false + taxpasta_add_rank = false + taxpasta_add_lineage = false + taxpasta_add_idlineage = false + taxpasta_add_ranklineage = false + taxpasta_ignore_errors = false + standardisation_motus_generatebiom = false } // Load base.config by default for all pipelines @@ -69,11 +202,12 @@ try { // Load nf-core/taxprofiler custom profiles from different institutions. // Warning: Uncomment only if a pipeline-specific institutional config already exists on nf-core/configs! 
-// try { -// includeConfig "${params.custom_config_base}/pipeline/taxprofiler.config" -// } catch (Exception e) { -// System.err.println("WARNING: Could not load nf-core/config/taxprofiler profiles: ${params.custom_config_base}/pipeline/taxprofiler.config") -// } +try { + includeConfig "${params.custom_config_base}/pipeline/taxprofiler.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config/taxprofiler profiles: ${params.custom_config_base}/pipeline/taxprofiler.config") + } + profiles { debug { dumpHashes = true @@ -166,8 +300,20 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } + test_noprofiling { includeConfig 'conf/test_noprofiling.config' } + test_nopreprocessing { includeConfig 'conf/test_nopreprocessing.config' } + test_nothing { includeConfig 'conf/test_nothing.config' } + test_motus { includeConfig 'conf/test_motus.config' } + test_krakenuniq { includeConfig 'conf/test_krakenuniq.config' } + test_malt { includeConfig 'conf/test_malt.config' } + test_falco { includeConfig 'conf/test_falco.config' } + test_fastp { includeConfig 'conf/test_fastp.config' } + test_adapterremoval { includeConfig 'conf/test_adapterremoval.config' } + test_bbduk { includeConfig 'conf/test_bbduk.config' } + test_prinseqplusplus { includeConfig 'conf/test_prinseqplusplus.config' } + } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile @@ -184,6 +330,7 @@ plugins { } // Load igenomes.config if required + if (!params.igenomes_ignore) { includeConfig 'conf/igenomes.config' } else { @@ -194,7 +341,7 @@ if (!params.igenomes_ignore) { // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. env { - PYTHONNOUSERSITE = 1 + PYTHONNOUSERSITE = '1' R_PROFILE_USER = "/.Rprofile" R_ENVIRON_USER = "/.Renviron" JULIA_DEPOT_PATH = "/usr/local/share/julia" @@ -232,7 +379,11 @@ manifest { mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' version = '1.1.6dev' +<<<<<<< HEAD doi = '' +======= + doi = '10.1101/2023.10.20.563221' +>>>>>>> dev } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index b7fd15d7..73364791 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,7 +10,7 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": ["input", "outdir"], + "required": ["input", "databases", "outdir"], "properties": { "input": { "type": "string", @@ -19,10 +19,21 @@ "schema": "assets/schema_input.json", "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. 
See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",
+            "description": "Path to comma-separated file containing information about the samples and libraries/runs.",
+            "help_text": "You will need to create a design file with information about the samples and libraries/runs you want to run in your pipeline run. Use this parameter to specify its location. It has to be a comma-separated file with 6 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/usage#samplesheet-input).",
             "fa_icon": "fas fa-file-csv"
         },
+        "databases": {
+            "type": "string",
+            "mimetype": "text/csv",
+            "format": "file-path",
+            "exists": true,
+            "schema": "assets/schema_database.json",
+            "pattern": "^\\S+\\.csv$",
+            "fa_icon": "fas fa-database",
+            "description": "Path to comma-separated file containing information about databases and profiling parameters for each taxonomic profiler",
+            "help_text": "You will need to create a design file with information about the databases you wish to use before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row. See [usage docs](https://nf-co.re/taxprofiler/dev/usage#full-database-sheet).\n\nProfilers will only be executed if a corresponding database is supplied. \n\nWe recommend storing this database sheet somewhere central and accessible by other members of your lab/institution, as this file will likely be regularly reused."
+        },
         "outdir": {
             "type": "string",
             "format": "directory-path",
@@ -43,36 +54,612 @@
             }
         }
     },
-    "reference_genome_options": {
-        "title": "Reference genome options",
+    "preprocessing_general_qc_options": {
+        "title": "Preprocessing general QC options",
         "type": "object",
-        "fa_icon": "fas fa-dna",
-        "description": "Reference genome related files and options required for the workflow.",
+        "description": "Common options across both long and short read preprocessing QC steps",
+        "default": "",
         "properties": {
-            "genome": {
+            "skip_preprocessing_qc": {
+                "type": "boolean",
+                "fa_icon": "fas fa-forward",
+                "description": "Specify to skip sequencing quality control of raw sequencing reads",
+                "help": "Skipping the running of FastQC or Falco may be useful in cases where you are already running with preprocessed data (e.g. you are also skipping the short/long read QC steps) whose quality you already know"
+            },
+            "preprocessing_qc_tool": {
                 "type": "string",
-                "description": "Name of iGenomes reference.",
-                "fa_icon": "fas fa-book",
-                "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details."
+                "default": "fastqc",
+                "enum": ["fastqc", "falco"],
+                "help_text": "Falco is designed as a drop-in replacement for FastQC but written in C++ for faster computation. 
We particularly recommend using falco when using long reads (due to reduced memory constraints), however it is also applicable for short reads.",
+                "description": "Specify the tool used for quality control of raw sequencing reads",
+                "fa_icon": "fas fa-tools"
+            },
+            "save_preprocessed_reads": {
+                "type": "boolean",
+                "fa_icon": "fas fa-save",
+                "description": "Save reads from samples that went through the adapter clipping, pair-merging, and length filtering steps for both short and long reads",
+                "help_text": "This saves the FASTQ output from the following tools:\n\n- fastp\n- AdapterRemoval\n- Porechop\n- Filtlong\n\nThese reads will be a mixture of: adapter clipped, quality trimmed, pair-merged, and length filtered, depending on the parameters you set."
+            },
+            "save_analysis_ready_fastqs": {
+                "type": "boolean",
+                "fa_icon": "fas fa-save",
+                "description": "Save only the final reads from all read processing steps (that are sent to classification/profiling) in results directory.",
+                "help_text": "This flag will generate the directory `results/analysis_read_reads` that contains the reads from the last preprocessing (QC, host removal, run merging etc.) step of the pipeline run. \n\nThis can be useful if you wish to re-use the final cleaned-up and prepared reads - the data actually used for the classification/profiling steps of the pipeline - for other analyses or purposes (e.g., to reduce redundant preprocessing between different pipelines, e.g. [nf-core/mag](https://nf-co.re/mag)).\n\nIn most cases this will be preferred over similar parameters e.g. ` --save_preprocessed_reads` or ` --save_complexityfiltered_reads`, unless you wish to explore in more detail the output of each specific preprocessing step independently.\n\nNote that if you do no preprocessing of any kind, nothing will be present in this directory. "
+            }
+        },
+        "fa_icon": "fas fa-users-cog"
+    },
+    "preprocessing_short_read_qc_options": {
+        "title": "Preprocessing short-read QC options",
+        "type": "object",
+        "description": "Options for adapter clipping, quality trimming, pair-merging, and complexity filtering",
+        "default": "",
+        "properties": {
+            "perform_shortread_qc": {
+                "type": "boolean",
+                "fa_icon": "fas fa-toggle-on",
+                "description": "Turns on short read quality control steps (adapter clipping, complexity filtering etc.)",
+                "help_text": "Turns on short read quality control steps (adapter clipping, complexity filtering etc.)\n\nThis subworkflow can perform:\n\n- Adapter removal\n- Read quality trimming\n- Read pair merging\n- Length filtering\n- Complexity filtering\n\nEither with fastp or AdapterRemoval.\n\nRemoving adapters (if present) is recommended to reduce false-positive hits that may occur from 'dirty' or 'contaminated' reference genomes in a profiling database that contain accidentally incorporated adapter sequences. Note that some, but not all, tools support paired-end alignment (utilising information about the insert covered by the pairs). However read pair merging in some cases can be recommended to increase read length (such as in aDNA). Length filtering and/or complexity filtering can speed up alignment by reducing the number of short unspecific reads that need to be aligned."
             },
-        "fasta": {
+            "shortread_qc_tool": {
             "type": "string",
-            "format": "file-path",
-            "exists": true,
-            "mimetype": "text/plain",
-            "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$",
-            "description": "Path to FASTA genome file.",
-            "help_text": "This parameter is *mandatory* if `--genome` is not specified. 
If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.",
-            "fa_icon": "far fa-file-code"
+                "default": "fastp",
+                "enum": ["fastp", "adapterremoval"],
+                "fa_icon": "fas fa-tools",
+                "description": "Specify which tool to use for short-read QC"
+            },
+            "shortread_qc_skipadaptertrim": {
-        "igenomes_ignore": {
             "type": "boolean",
-            "description": "Do not load the iGenomes reference config.",
-            "fa_icon": "fas fa-ban",
-            "hidden": true,
-            "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`."
+                "fa_icon": "fas fa-forward",
+                "description": "Skip adapter trimming",
+                "help_text": "Skip the removal of sequencing adapters. \n\nThis can often be useful to speed up run-time of the pipeline when analysing data downloaded from public databases such as the ENA or SRA, as adapters should already be removed (however we recommend checking FastQC results to ensure this is the case)."
+            },
+            "shortread_qc_adapter1": {
+                "type": "string",
+                "fa_icon": "fas fa-grip-lines",
+                "description": "Specify adapter 1 nucleotide sequence",
+                "help_text": "Specify a custom forward or R1 adapter sequence to be removed from reads. \n\nIf not set, the selected short-read QC tool's defaults will be used.\n\n> Modifies tool parameter(s):\n> - fastp: `--adapter_sequence`. fastp default: `AGATCGGAAGAGCACACGTCTGAACTCCAGTCA`\n> - AdapterRemoval: `--adapter1`. AdapterRemoval2 default: `AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG`"
+            },
+            "shortread_qc_adapter2": {
+                "type": "string",
+                "fa_icon": "fas fa-grip-lines",
+                "description": "Specify adapter 2 nucleotide sequence",
+                "help_text": "Specify a custom reverse or R2 adapter sequence to be removed from reads. \n\nIf not set, the selected short-read QC tool's defaults will be used.\n\n> Modifies tool parameter(s):\n> - fastp: `--adapter_sequence_r2`. fastp default: `AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT`\n> - AdapterRemoval: `--adapter2`. AdapterRemoval2 default: `AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT`"
+            },
+            "shortread_qc_adapterlist": {
+                "type": "string",
+                "description": "Specify a list of all possible adapters to trim. Overrides --shortread_qc_adapter1/2. Formats: .txt (AdapterRemoval) or .fasta (fastp).",
+                "help_text": "Allows you to supply a file with a list of adapter (combinations) to remove from all files. \n\nOverrides the --shortread_qc_adapter1/--shortread_qc_adapter2 parameters. \n\nFor AdapterRemoval this consists of a two column table with a `.txt` extension: the first column represents the forward strand, the second column the reverse strand. You must supply all possible combinations, one per line, and this list is applied to all files. See the AdapterRemoval documentation for more information.\n\nFor fastp this consists of a standard FASTA format with a `.fasta`/`.fa`/`.fna`/`.fas` extension. The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. 
fastp trims the adapters present in the FASTA file one by one.\n\n> Modifies AdapterRemoval parameter: --adapter-list\n> Modifies fastp parameter: --adapter_fasta",
+                "fa_icon": "fas fa-th-list"
+            },
+            "shortread_qc_mergepairs": {
+                "type": "boolean",
+                "fa_icon": "fas fa-toggle-on",
+                "description": "Turn on merging of read pairs for paired-end data",
+                "help_text": "Turn on the merging of read-pairs of paired-end short read sequencing data. \n\n> Modifies tool parameter(s):\n> - AdapterRemoval: `--collapse`\n> - fastp: `-m --merged_out`\n"
+            },
+            "shortread_qc_includeunmerged": {
+                "type": "boolean",
+                "fa_icon": "far fa-times-circle",
+                "description": "Include unmerged reads from paired-end merging in the downstream analysis",
+                "help_text": "Turns on the inclusion of unmerged reads in the resulting FASTQ file from merging paired-end sequencing data when using `fastp` and/or `AdapterRemoval`. For `fastp` this means the unmerged read pairs are directly included in the output FASTQ file. For `AdapterRemoval`, additional output files containing unmerged reads are all concatenated into one file by the workflow.\n\nExcluding unmerged reads can be useful in cases where you prefer to have very short reads (e.g. aDNA), thus excluding longer reads or possibly faulty reads where one of the pair was discarded.\n\n> Adds `fastp` option: `--include_unmerged`\n"
+            },
+            "shortread_qc_minlength": {
+                "type": "integer",
+                "default": 15,
+                "fa_icon": "fas fa-ruler-horizontal",
+                "description": "Specify the minimum length of reads to be retained",
+                "help_text": "Specifying a minimum read length filter can speed up profiling by reducing the number of short unspecific reads that need to be matched/aligned to the database.\n\n> Modifies tool parameter(s):\n> - fastp: `--length_required`\n> - AdapterRemoval: `--minlength`"
+            },
+            "shortread_qc_dedup": {
+                "type": "boolean",
+                "fa_icon": "fas fa-toggle-on",
+                "description": "Perform deduplication of the input reads (fastp only)",
+                "help_text": "This enables the deduplication of processed reads during fastp adapter removal and/or merging. It removes identical reads that are likely artefacts from laboratory protocols (e.g. amplification), and that provide no additional sequence information to the library.\n\nRemoving duplicates can increase runtime and increase accuracy of abundance calculations.\n\n> Modifies tool parameter(s):\n> fastp: ` --dedup`\n"
+            },
+            "perform_shortread_complexityfilter": {
+                "type": "boolean",
+                "fa_icon": "fas fa-toggle-on",
+                "description": "Turns on nucleotide sequence complexity filtering",
+                "help_text": "Turns on sequencing complexity filtering. Complexity filtering can be useful to speed up run-time by removing unspecific read sequences that do not provide any informative taxon ID."
+            },
+            "shortread_complexityfilter_tool": {
+                "type": "string",
+                "default": "bbduk",
+                "enum": ["bbduk", "prinseqplusplus", "fastp"],
+                "fa_icon": "fas fa-hammer",
+                "description": "Specify which tool to use for complexity filtering"
+            },
+            "shortread_complexityfilter_entropy": {
+                "type": "number",
+                "default": 0.3,
+                "fa_icon": "fas fa-random",
+                "description": "Specify the minimum sequence entropy level for complexity filtering",
+                "help_text": "Specify the minimum 'entropy' value for complexity filtering for BBDuk or PRINSEQ++.\n\nNote that this value will only be used for PRINSEQ++ if `--shortread_complexityfilter_prinseqplusplus_mode` is set to `entropy`.\n\nEntropy here corresponds to the amount of sequence variation that exists within the read. Higher values correspond to more variety, and thus will likely result in more specific matching to a taxon's reference genome. The trade-off here is fewer reads (or abundance information) available for having a confident identification.\n\n\n> Modifies tool parameter(s):\n> - BBDuk: `entropy=`\n> - PRINSEQ++: `-lc_entropy`\n\n"
+            },
+            "shortread_complexityfilter_bbduk_windowsize": {
+                "type": "integer",
+                "default": 50,
+                "fa_icon": "far fa-window-maximize",
+                "description": "Specify the window size for BBDuk complexity filtering",
+                "help_text": "Specify the window size within which to calculate the entropy level for BBDuk.\n\n> Modifies tool parameter(s):\n> - BBDuk: `entropywindow=`"
+            },
+            "shortread_complexityfilter_bbduk_mask": {
+                "type": "boolean",
+                "fa_icon": "fas fa-mask",
+                "description": "Turn on masking rather than discarding of low complexity reads for BBDuk",
+                "help_text": "Turn on masking of low-complexity reads (i.e., replacement with `N`) rather than removal.\n\n> Modifies tool parameter(s)\n> - BBDuk: `entropymask=`"
+            },
+            "shortread_complexityfilter_fastp_threshold": {
+                "type": "integer",
+                "default": 30,
+                "fa_icon": "fas fa-sort-numeric-down",
+                "description": "Specify the minimum complexity filter threshold of fastp",
+                "help_text": "Specify the minimum sequence complexity value for fastp. This value corresponds to the percentage of bases that are different from their adjacent bases.\n\n> Modifies tool parameter(s):\n> - fastp: `--complexity_threshold`"
+            },
+            "shortread_complexityfilter_prinseqplusplus_mode": {
+                "type": "string",
+                "default": "entropy",
+                "enum": ["entropy", "dust"],
+                "fa_icon": "fas fa-check-square",
+                "description": "Specify the complexity filter mode for PRINSEQ++"
+            },
+            "shortread_complexityfilter_prinseqplusplus_dustscore": {
+                "type": "number",
+                "default": 0.5,
+                "fa_icon": "fas fa-head-side-mask",
+                "description": "Specify the minimum dust score for PRINSEQ++ complexity filtering",
+                "help_text": "Specify the minimum dust score below which low-complexity reads will be removed. A DUST score is based on how often different tri-nucleotides occur along a read.\n\n> Modifies tool parameter(s):\n> - PRINSEQ++: `--lc_dust`"
+            },
+            "save_complexityfiltered_reads": {
+                "type": "boolean",
+                "fa_icon": "fas fa-save",
+                "description": "Save reads from samples that went through the complexity filtering step",
+                "help_text": "Specify whether to save the final complexity filtered reads in your results directory (`--outdir`)."
+            }
+        },
+        "fa_icon": "fas fa-compress-alt"
+    },
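(For orientation only, and not part of the schema diff itself: a minimal sketch of how the complexity-filtering parameters documented above might be enabled in a custom Nextflow config supplied with `-c`; the values shown simply mirror the defaults listed here.)

params {
    perform_shortread_complexityfilter          = true     // turn the subworkflow on
    shortread_complexityfilter_tool             = 'bbduk'  // or 'prinseqplusplus' / 'fastp'
    shortread_complexityfilter_entropy          = 0.3      // minimum entropy, as described above
    shortread_complexityfilter_bbduk_windowsize = 50       // BBDuk entropy window size
    save_complexityfiltered_reads               = true     // keep the filtered FASTQ files
}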
+    "preprocessing_long_read_qc_options": {
+        "title": "Preprocessing long-read QC options",
+        "type": "object",
+        "description": "Options for adapter clipping, quality trimming, and length filtering",
+        "default": "",
+        "properties": {
+            "perform_longread_qc": {
+                "type": "boolean",
+                "fa_icon": "fas fa-toggle-on",
+                "description": "Turns on long read quality control steps (adapter clipping, length filtering etc.)",
+                "help_text": "Turns on long read quality control steps (adapter clipping, length and/or quality filtering).\n\nRemoving adapters (if present) is recommended to reduce false-positive hits that may occur from 'dirty' or 'contaminated' reference genomes in a profiling database that contain accidentally incorporated adapter sequences.\n\nLength filtering and quality filtering can speed up alignment by reducing the number of unspecific reads that need to be aligned."
+            },
+            "longread_qc_skipadaptertrim": {
+                "type": "boolean",
+                "description": "Skip long-read trimming",
+                "fa_icon": "fas fa-forward",
+                "help_text": "Skip removal of adapters by Porechop. This can be useful in some cases to speed up run time - particularly when you are running data downloaded from public databases such as the ENA/SRA that should already have adapters removed. We recommend that you check your FastQC results to ensure this is indeed the case."
+            },
+            "longread_qc_skipqualityfilter": {
+                "type": "boolean",
+                "description": "Skip long-read length and quality filtering",
+                "fa_icon": "fas fa-forward",
+                "help_text": "Skip quality filtering with Filtlong. This will skip length, percent reads, and target bases filtering (see the other `--longread_qc_qualityfilter_*` parameters)."
+            },
+            "longread_qc_qualityfilter_minlength": {
+                "type": "integer",
+                "default": 1000,
+                "description": "Specify the minimum length of reads to be retained",
+                "fa_icon": "fas fa-ruler-horizontal",
+                "help_text": "Specify the minimum length of reads to be kept for downstream analysis.\n\n> Modifies tool parameter(s):\n> - Filtlong: `--min_length`"
+            },
+            "longread_qc_qualityfilter_keeppercent": {
+                "type": "integer",
+                "default": 90,
+                "description": "Specify the percent of high-quality bases to be retained",
+                "fa_icon": "fas fa-percentage",
+                "help_text": "Throw out the remaining percentage of reads outside the value. This is measured by bp, not by read count. So this option throws out the worst e.g. 10% of read bases if the parameter is set to `90`. _Modified from [Filtlong documentation](https://github.com/rrwick/Filtlong)_\n\n> Modifies tool parameter(s):\n> - Filtlong: `--keep_percent`"
+            },
+            "longread_qc_qualityfilter_targetbases": {
+                "type": "integer",
+                "default": 500000000,
+                "description": "Specify the number of high-quality bases in the library to be retained",
+                "fa_icon": "fas fa-bullseye",
+                "help_text": "Removes the worst reads until only the specified value of bases remain, useful for very large read sets. If the input read set is less than the specified value, this setting will have no effect. 
_Modified from [Filtlong documentation](https://github.com/rrwick/Filtlong)_\n\n> Modifies tool parameter(s):\n> - Filtlong: `--target_bases`"
+            }
+        },
+        "fa_icon": "fas fa-expand-alt"
+    },
+    "preprocessing_host_removal_options": {
+        "title": "Preprocessing host removal options",
+        "type": "object",
+        "description": "Options for pre-profiling host read removal",
+        "default": "",
+        "properties": {
+            "perform_shortread_hostremoval": {
+                "type": "boolean",
+                "fa_icon": "fas fa-toggle-on",
+                "description": "Turn on short-read host removal",
+                "help_text": "Turns on the ability to remove short reads that derive from a known organism, using Bowtie2 and samtools.\n\nThis subworkflow is useful to remove reads that may come from a host, or a known contamination like the human reference genome. Human DNA contamination of (microbial) reference genomes is well known, so removal of these prior to profiling both reduces the risk of false positives and, in _some cases_, gives a faster runtime (as fewer reads need to be profiled).\n\nAlternatively, you can include the reference genome within your profiling databases and can turn off this subworkflow, with the trade-off of a larger taxonomic profiling database."
+            },
+            "perform_longread_hostremoval": {
+                "type": "boolean",
+                "fa_icon": "fas fa-toggle-on",
+                "description": "Turn on long-read host removal",
+                "help_text": "Turns on the ability to remove long reads that derive from a known organism, using minimap2 and samtools.\n\nThis subworkflow is useful to remove reads that may come from a host, or a known contamination like the human reference genome. Human DNA contamination of (microbial) reference genomes is well known, so removal of these prior to profiling both reduces the risk of false positives and, in _some cases_, gives a faster runtime (as fewer reads need to be profiled).\n\nAlternatively, you can include the reference genome within your profiling databases and can turn off this subworkflow, with the trade-off of a larger taxonomic profiling database."
+            },
+            "hostremoval_reference": {
+                "type": "string",
+                "fa_icon": "fas fa-file-alt",
+                "description": "Specify path to single reference FASTA of host(s) genome(s)",
+                "help_text": "Specify a path to the FASTA file (optionally gzipped) of the reference genome of the organism to be removed.\n\nIf you have two or more host organisms or contaminants you wish to remove, you can concatenate the FASTAs of the different taxa into a single one to provide to the pipeline."
+            },
+            "shortread_hostremoval_index": {
+                "type": "string",
+                "fa_icon": "fas fa-address-book",
+                "description": "Specify path to the directory containing pre-made Bowtie2 indexes of the host removal reference",
+                "help_text": "Specify the path to a _directory_ containing pre-made Bowtie2 reference index files (i.e. the directory containing `.bt1`, `.bt2` files etc.). These should sit in the same directory alongside the reference file specified in `--hostremoval_reference`.\n\nSpecifying premade indices can speed up runtime of the host-removal step, however if not supplied the pipeline will generate the indices for you."
+            },
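(Again purely as an illustration, not part of the schema diff: a minimal sketch of a host-removal setup in a custom Nextflow config using the parameters defined above; the reference path is a hypothetical placeholder.)

params {
    perform_shortread_hostremoval = true                           // Bowtie2-based removal for short reads
    perform_longread_hostremoval  = true                           // minimap2-based removal for long reads
    hostremoval_reference         = '/path/to/host_genome.fasta'   // hypothetical host/contaminant FASTA
    save_hostremoval_unmapped     = true                           // keep the non-host FASTQ files
}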
+            "longread_hostremoval_index": {
+                "type": "string",
+                "fa_icon": "fas fa-address-book",
+                "description": "Specify path to a pre-made Minimap2 index file (.mmi) of the host removal reference",
+                "help_text": "Specify path to a pre-made Minimap2 index file (.mmi) of the host removal reference file given to `--hostremoval_reference`.\n\nSpecifying a premade index file can speed up runtime of the host-removal step, however if not supplied the pipeline will generate the indices for you."
+            },
+            "save_hostremoval_index": {
+                "type": "boolean",
+                "fa_icon": "fas fa-save",
+                "description": "Save mapping index of input reference when not already supplied by user",
+                "help_text": "Save the output files of the in-built indexing of the host genome.\n\nThis is recommended to be turned on if you plan to use the same reference genome multiple times, as supplying the directory or file to `--shortread_hostremoval_index` or `--longread_hostremoval_index` respectively can speed up runtime of future runs. Once generated, we recommend you place this file _outside_ of your run results directory in a central 'cache' directory you and others using your machine can access and supply to the pipeline."
+            },
+            "save_hostremoval_bam": {
+                "type": "boolean",
+                "fa_icon": "fas fa-save",
+                "description": "Save mapped and unmapped reads in BAM format from host removal",
+                "help_text": "Save the reads mapped to the reference genome and off-target reads in BAM format as output by the respective host-removal alignment tool.\n\nThis can be useful if you wish to perform other analyses on the host organism (such as host-microbe interaction), however, you should consider whether the default mapping parameters of Bowtie2 (short-read) or minimap2 (long-read) are optimised to your context."
+            },
+            "save_hostremoval_unmapped": {
+                "type": "boolean",
+                "fa_icon": "fas fa-save",
+                "description": "Save reads from samples that went through the host-removal step",
+                "help_text": "Save only the reads NOT mapped to the reference genome in FASTQ format (as exported from `samtools view` and `fastq`).\n\nThis can be useful if you wish to perform other analyses on the off-target reads from the host mapping, such as manual profiling or _de novo_ assembly."
+            }
+        },
+        "fa_icon": "fas fa-user-times"
+    },
+    "preprocessing_run_merging_options": {
+        "title": "Preprocessing run merging options",
+        "type": "object",
+        "description": "Options for per-sample run-merging",
+        "default": "",
+        "properties": {
+            "perform_runmerging": {
+                "type": "boolean",
+                "fa_icon": "fas fa-toggle-on",
+                "description": "Turn on run merging",
+                "help_text": "Turns on the concatenation of sequencing runs or libraries with the same sample name.\n\nThis can be useful to ensure you get a single profile per sample, rather than one profile per run or library. Note that in some cases comparing profiles of independent _libraries_ may be useful, so this parameter may not always be suitable. "
+            },
+            "save_runmerged_reads": {
+                "type": "boolean",
+                "fa_icon": "fas fa-save",
+                "description": "Save reads from samples that went through the run-merging step",
+                "help_text": "Save the run- and library-concatenated reads of a given sample in FASTQ format.\n\n> ⚠️ Only samples that went through the run-merging step of the pipeline will be stored in the resulting directory. 
\n\nIf you wish to save the files that go to the classification/profiling steps for samples that _did not_ go through run merging, you must supply the appropriate upstream `--save_` flag.\n\n" + } + }, + "fa_icon": "fas fa-clipboard-check" + }, + "profiling_options": { + "title": "Profiling options", + "type": "object", + "description": "", + "default": "", + "properties": { + "run_centrifuge": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with Centrifuge. Requires database to be present CSV file passed to --databases" + }, + "centrifuge_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of Centrifuge-aligned reads", + "help_text": "Save mapped (SAM, FASTQ) and unmapped (FASTQ) reads from alignment step of centrifuge in your output results directory.\n\n> Modifies tool parameter(s):\n> - centrifuge: `--un-gz`, `--al-gz`, `--un-conc-gz`, `--al-conc-gz`, `--out-fmt`" + }, + "run_diamond": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with DIAMOND. Requires database to be present CSV file passed to --databases" + }, + "diamond_output_format": { + "type": "string", + "default": "tsv", + "enum": ["blast", "xml", "txt", "daa", "sam", "tsv", "paf"], + "fa_icon": "fas fa-file", + "description": "Specify output format from DIAMOND profiling.", + "help_text": "DIAMOND can produce output in a number of different formats, you can specify here which to produce.\n\nNote that DIAMOND can only produce one format at a time, and depending on which you pick, some downstream steps may not be executed. For example, selecting `daa` or `sam` will mean you will not get a tabular taxonomic profile as with the other tools.\n\nWill be overriden by `--diamond_save_reads.`\n\n> Modifies tool parameter(s):\n> - diamond blastx: `--outfmt`" + }, + "diamond_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of DIAMOND-aligned reads. Will override --diamond_output_format and no taxon tables will be generated", + "help_text": "Save aligned reads in SAM format from alignment step of DIAMOND in your output results directory.\n\nNote this explicitly overrides `--diamond_output_format` to produce the SAM file, and no taxon table will be generated.\n\n> Modifies tool parameter(s):\n> - DIAMOND: `--outfmt`" + }, + "run_kaiju": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with Kaiju. Requires database to be present CSV file passed to --databases" + }, + "kaiju_expand_viruses": { + "type": "boolean", + "description": "Turn on expanding of virus hits to individual viruses rather than aggregating at a taxonomic level.", + "help_text": "Turn on the reporting by Kaiju of viruses at specific virus levels, rather than aggregating at specific taxonomic levels as specified by `-- kaiju_taxon_rank` (i.e., read counts will not be summarised at higher taxonomic levels).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-e`", + "fa_icon": "fas fa-expand-arrows-alt" + }, + "kaiju_taxon_rank": { + "type": "string", + "default": "species", + "enum": ["phylum", "class", "order", "family", "genus", "species"], + "fa_icon": "fas fa-tag", + "description": "Specify taxonomic rank to be displayed in Kaiju taxon table", + "help_text": "Specify the taxonomic level(s) to be displayed in the resulting Kaiju taxon table, as generated by the kaiju2table helper tool.\n\nThis can be only be a single level (e.g. 
`species`).\n\n> Modifies tool parameter(s):\n> - kaiju2table: `-l`" + }, + "run_kraken2": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with Kraken2. Requires database to be present CSV file passed to --databases" + }, + "kraken2_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of Kraken2-aligned reads", + "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTQ format.\n\n> Modifies tool parameter(s):\n> - kraken2: `--classified-out` and `--unclassified-out`" + }, + "kraken2_save_readclassifications": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of Kraken2 per-read taxonomic assignment file", + "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read recieved.\n\n> Modifies tool parameter(s):\n> - kraken2: `--output`" + }, + "kraken2_save_minimizers": { + "type": "boolean", + "description": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.", + "fa_icon": "fas fa-save", + "help_text": "Turn on saving minimizer information in the kraken2 report thus increasing to an eight column layout.\n\nAdds `--report-minimizer-data` to the kraken2 command." + }, + "run_krakenuniq": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with KrakenUniq. Requires one or more KrakenUniq databases to be present in the CSV file passed to --databases." + }, + "krakenuniq_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of KrakenUniq (un-)classified reads as FASTA.", + "help_text": "Save reads that do and do not have a taxonomic classification in your output results directory in FASTA format. Reads from paired-end input are merged.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--classified-out` and `--unclassified-out`" + }, + "krakenuniq_ram_chunk_size": { + "type": "string", + "default": "16G", + "description": "Specify how large to chunk the database when loading into memory for KrakenUniq.", + "fa_icon": "fas fa-database", + "help_text": "nf-core/taxprofiler utilises a 'low memory' option for KrakenUniq that can reduce the amount of RAM the process requires using the `--preloaded` option.\n\nA further extension to this option is that you can specify how large each chunk of the database should be that gets loaded into memory at any one time. 
You can specify the amount of RAM to chunk the database to with this parameter, and is particularly useful for people with limited computational resources.\n\nMore information about this parameter can be seen [here](https://github.com/fbreitwieser/krakenuniq/blob/master/README.md#new-release-v07).\n\n> Modifies KrakenUniq parameter: --preload-size\n\n" + }, + "krakenuniq_save_readclassifications": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of KrakenUniq per-read taxonomic assignment file.", + "help_text": "Save a text file that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic taxonomic assignment that that read received.\n\n> Modifies tool parameter(s):\n> - krakenuniq: `--output`" + }, + "krakenuniq_batch_size": { + "type": "integer", + "default": 20, + "fa_icon": "far fa-window-restore", + "description": "Specify the number of samples for each KrakenUniq run.", + "help_text": "Specify the batch size for KrakenUniq. The reference database for KrakenUniq is loaded into memory once per nextflow process and then used to classify many samples. When you have many samples, a single KrakenUniq run can be rather slow. Alternatively, we can split up KrakenUniq runs for a 'batch' of samples, allowing a balance between shared use of database for multiple samples, but also faster parallelised KrakenUniq runs. This parameter determines for how many samples at a time." + }, + "run_bracken": { + "type": "boolean", + "description": "Turn on Bracken (and the required Kraken2 prerequisite step).", + "fa_icon": "fas fa-toggle-on" + }, + "run_malt": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with MALT. Requires database to be present CSV file passed to --databases" + }, + "malt_mode": { + "type": "string", + "default": "BlastN", + "fa_icon": "fas fa-check-square", + "description": "Specify which MALT alignment mode to use", + "help_text": "Specify which version of MALT alignment to use.\n\nBlastN is generally recommended (nucleotide-nucleotide alignment), but particularly for very short reads (such as aDNA), whereas BlastX mode is similar to DIAMOND and will translate the nucleotide to amino acid sequences. Note each type of alignment mode requires different parameters during database construction. Refer to the MALT manual for more information.\n\n> Modifies tool parameter(s):\n> - malt-run: `-mode` " + }, + "malt_save_reads": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving of MALT-aligned reads", + "help_text": "Turns on saving of MALT aligned reads in SAM format.\n\nNote that the SAM format produce by MALT is not completely valid, and may not work with downstream tools.\n\n> Modifies tool parameter(s):\n> - malt-run: `--alignments`, `-za`" + }, + "malt_generate_megansummary": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on generation of MEGAN summary file from MALT results", + "help_text": "Turns on saving of MALT output in an additional MEGAN summary file (`.megan`) that can be loaded into the MEGAN metagenomic exploration tool.\n\nNote: this file is generated not directly from MALT but rather then MEGAN utility script `rma2info`.\n\n> Modifies tool parameter(s):\n> - rma2info: `-es`" + }, + "run_metaphlan": { + "type": "boolean", + "description": "Turn on profiling with MetaPhlAn. 
Requires database to be present CSV file passed to --databases", + "fa_icon": "fas fa-toggle-on" + }, + "run_motus": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on profiling with mOTUs. Requires database to be present CSV file passed to --databases" + }, + "motus_use_relative_abundance": { + "type": "boolean", + "description": "Turn on printing relative abundance instead of counts.", + "fa_icon": "fas fa-percent", + "help_text": "This parameter specifies to use the calculated relative abundance (i.e., percentage of all hits). This is normally the default behaviour in mOTUs however we use counts by default in nf-core/taxprofiler for consistency with other classification/profiling tools.\n\n> Modifies tool parameter(s):\n> - mOTUs: `-c` (removed from the default nf-core/taxprofiler command)" + }, + "motus_save_mgc_read_counts": { + "type": "boolean", + "description": "Turn on saving the mgc reads count.", + "fa_icon": "fas fa-save", + "help_text": "Turns on the saving of the read counts against each mOTU marker-gene clusters, in addition to per-taxon count/abundance reporting (in a separate file)\n\n> Modifies tool parameter(s):\n- mOTUs: `-M`" + }, + "motus_remove_ncbi_ids": { + "type": "boolean", + "description": "Turn on removing NCBI taxonomic IDs.", + "fa_icon": "fas fa-address-card", + "help_text": "By default mOTUs will report species names rather than NCBI Taxon IDs. In nf-core/taxprofiler we prefer taxon IDs due to interoperatbility and comparability with the output of other classifiers and profilers. If you prefer to have just species names, you can specify this to remove the IDs.\n\n> Modifies tool parameter(s):\n- mOTUs: `-p` (removed from the default nf-core/taxprofiler command)" + }, + "run_kmcp": { + "type": "boolean", + "description": "Turn on classification with KMCP.", + "fa_icon": "fas fa-toggle-on" + }, + "kmcp_mode": { + "type": "integer", + "default": 3, + "description": "Specify which KMCP profiling mode to use.", + "help_text": "Available values: \n0 (for pathogen detection)\n1 (higherrecall)\n2 (high recall)\n3 (default)\n4 (high precision)\n5 (higher precision).\nFor more information about the different profiling modes, please see the [kmcp documentation](https://bioinf.shenwei.me/kmcp/usage/#profile)\n\n> Modifies tool parameter(s):\n- kmcp profile: `--mode`\n\n", + "fa_icon": "fas fa-check-square" + }, + "kmcp_save_search": { + "type": "boolean", + "fa_icon": "fas fa-save", + "description": "Turn on saving the output of KMCP search", + "help_text": "During the searching step, KMCP searches across the database with all k-mers and returns reference genome chunks sharing enough k-mers with the query. The output file is a tab-delimited file in gzip format with 15 columns and is used to generate the taxonomic profiling. More information about the columns can be found here. https://bioinf.shenwei.me/kmcp/usage/#search. " + }, + "run_ganon": { + "type": "boolean", + "description": "Turn on profiling with ganon. 
Requires database to be present CSV file passed to --databases.", + "fa_icon": "fas fa-toggle-on" + }, + "ganon_save_readclassifications": { + "type": "boolean", + "description": "Turn on saving of ganon per-read taxonomic assignment file(s).", + "fa_icon": "fas fa-save", + "help_text": "Saves `.lca`, `.all`, and `.unc` text files that contains a list of each read that had a taxonomic assignment, with information on specific taxonomic assignment that the read received.\n\n> Modifies tool parameter(s):\n- ganon classify: `--output-all --output-lca --output-unclassified`" + }, + "ganon_report_type": { + "type": "string", + "default": "reads", + "description": "Specify the type of ganon report to save.", + "help_text": "Specify the type of taxonomic report to produce from ganon report. This mainly refers to which form of 'value' to print: raw read counts, abundance estimates, genome-size normalised etc. \n\nSee the [ganon documentation](https://pirovc.github.io/ganon/outputfiles/#ganon-report) for more information of each option.\n\n> Modifies tool parameter(s):\n- ganon report: `--report-type`\n", + "enum": ["abundance", "reads", "matches", "dist", "corr"], + "fa_icon": "fas fa-file" + }, + "ganon_report_rank": { + "type": "string", + "description": "Specify the taxonomic report the ganon report file should display.", + "help_text": "Specify the taxonomic rank level to report each taxonomic hit as. `all` will specify all ranks, however you can customise this from `superkingdom` through to `species` to as specific as `assembly`. ganon has a default preset, however you can customise the specific ranks in a comma separated list, e.g. `--ganon_report_rank [phylum,genus,species]`.\n\nSee the [ganon documentation](https://pirovc.github.io/ganon/outputfiles/#ganon-report) for more information of each option.\n\n> Modifies tool parameter(s):\n- ganon report: `--ranks`", + "fa_icon": "fas fa-sort-amount-down-alt" + }, + "ganon_report_toppercentile": { + "type": "integer", + "default": 0, + "description": "Specify a percentile within which hits will be reported in ganon report output..", + "help_text": "Specify the top percentile or relative abundance\n under which all hits underneath the threshold are not reported. This can be useful to remove long tails of few-reads and thus unconfident hits.\n\n> Modifies tool parameter(s)\n> - ganon report: `--top-percentile`\n", + "fa_icon": "fas fa-percent" + }, + "ganon_report_mincount": { + "type": "integer", + "default": 0, + "description": "Specify a minimum number of reads a hit must have to be retained in the ganon report.", + "help_text": "Specify the minmum number of reads or percentage of counts a hit must have against a taxon to be retained. To specify a minimum percentage, specify between 0 and 1 (e.g. 0.1 for 10%), and more than 1 to specify a hard count cut off (e.g. 100 for minimum of 100 reads).\n\n> Modifies tool parameter(s):\n- ganon report: `--min-count`", + "fa_icon": "fas fa-filter" + }, + "ganon_report_maxcount": { + "type": "integer", + "default": 0, + "description": "Specify a maximum number of reads a hit must have to be retained in the ganon report.", + "help_text": "Specify the maximum number of reads or percentage of counts a hit must have against a taxon to be retained. To specify a maximum percentage, specify between 0 and 1 (e.g. 0.9 for 90%), and more than 1 to specify a hard count cut off (e.g. 
10000 for maximum of 10,000 reads).\n\n> Modifies tool parameter(s):\n- ganon report: `--max-count`", + "fa_icon": "fas fa-filter" + } + }, + "fa_icon": "fas fa-align-center" + }, + "postprocessing_and_visualisation_options": { + "title": "Postprocessing and visualisation options", + "type": "object", + "description": "", + "default": "", + "properties": { + "run_profile_standardisation": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on standardisation of taxon tables across profilers", + "help_text": "Turns on standardisation of output OTU tables across all tools.\n\nThis happens in two forms, firstly - if available - by a given classifiers/profilers 'native' profile merger and standardisation (for Bracken, Kaiju, Kraken, Centrifuge, MetaPhlAn3, mOTUs), and secondly for _all_ classifier/profilers in the pipeline using [`taxpasta`](https://taxpasta.readthedocs.io).\n\nIn the latter case, taxpasta generates a standardised output as follows:\n\n|TAXON | SAMPLE_A | SAMPLE_B |\n|-------------|----------------|-----------------|\n| taxon_a | 32 | 123 |\n| taxon_b | 1 | 5 |\n\nwhereas all other 'native' tools have varying format outputs. See pipeline [output](https://nf-co.re/taxprofiler) documentation for more information." + }, + "standardisation_motus_generatebiom": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on generation of BIOM output (currently only applies to mOTUs)", + "help_text": "Turn on the saving of the taxonomic output in BIOM format (`.biom`) in the results directory of your pipeline run, instead of the default TSV format.\\n\\nNote this file is from the output of the `motus merge` command.\\n\\n> Modifies tool parameter(s):\\n> - `-B -o`" + }, + "run_krona": { + "type": "boolean", + "fa_icon": "fas fa-toggle-on", + "description": "Turn on generation of Krona plots for supported profilers", + "help_text": "Turn on the generation of Krona interactive pie-chart HTMLs for a selection of profilers.\n\nThe tools currently supported are:\n\n- centrifuge\n- kraken2\n- kaiju\n- MALT" + }, + "krona_taxonomy_directory": { + "type": "string", + "fa_icon": "fas fa-folder-open", + "description": "Specify path to krona taxonomy directories (required for MALT krona plots)", + "help_text": "Specify a path to a Krona taxonomy database directory (i.e. a directory containing a krona generated `.tab` file).\n\nThis is only required for generating Krona plots of MALT output.\n\nNote this taxonomy database must be downloaded and generated with the `updateTaxonomy.sh` script from the krona-tools package." + }, + "standardisation_taxpasta_format": { + "type": "string", + "default": "tsv", + "fa_icon": "fas fa-pastafarianism", + "description": "The desired output format.", + "enum": ["tsv", "csv", "arrow", "parquet", "biom"] + }, + "taxpasta_taxonomy_dir": { + "type": "string", + "description": "The path to a directory containing taxdump files.", + "help_text": "This arguments provides the path to the directory containing taxdump files. At least nodes.dmp and names.dmp are required. A merged.dmp file is optional. \n\nModifies tool parameter(s):\n-taxpasta: `--taxpasta_taxonomy_dir`", + "fa_icon": "fas fa-tree" + }, + "taxpasta_add_name": { + "type": "boolean", + "description": "Add the taxon name to the output.", + "help_text": "The standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. 
The taxon name can be added as additional information to the output table in addition to the taxon ID.\n\nModifies tool parameter(s):\n- taxpasta: `--add-name`", + "fa_icon": "fas fa-tag" + }, + "taxpasta_add_rank": { + "type": "boolean", + "description": "Add the taxon rank to the output.", + "help_text": "The standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon rank of the given taxonomic entry can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--add-rank`", + "fa_icon": "fas fa-sort-amount-down-alt" + }, + "taxpasta_add_lineage": { + "type": "boolean", + "description": "Add the taxon's entire name lineage to the output.", + "help_text": "\nThe standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon's entire taxonomic lineage with the taxon names separated by semi-colons can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--add-lineage`\n", + "fa_icon": "fas fa-link" + }, + "taxpasta_add_idlineage": { + "type": "boolean", + "description": "Add the taxon's entire ID lineage to the output.", + "help_text": "\nThe standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxon's entire taxonomic lineage with the taxon identifiers separated by semi-colons can be added as additional information to the output table.\n\nModifies tool parameter(s):\n- taxpasta: `--add-id-lineage`\n", + "fa_icon": "fas fa-link" + }, + "taxpasta_add_ranklineage": { + "type": "boolean", + "description": "Add the taxon's entire rank lineage to the output.", + "help_text": "\nThe standard output format of taxpasta is a two-column table including the read counts and the integer taxonomic ID. The taxonomic ranks categories of the taxon's entire lineage separated by semi-colons can be added as additional information to the output table. This complements `--taxpasta_add_lineage` by telling you which taxonomic rank level each entry in the lineage refers to.\n\nModifies tool parameter(s):\n- taxpasta: `--add-rank-lineage`\n", + "fa_icon": "fas fa-link" + }, + "taxpasta_ignore_errors": { + "type": "boolean", + "description": "Ignore individual profiles that cause errors.", + "help_text": "\nIgnore any metagenomic profiles with errors when running `taxpasta merge`. At least two profiles without errors are needed to merge.\n\nModifies tool parameter(s):\n- taxpasta: `--ignore-errors`\n", + "fa_icon": "fas fa-link" + } + }, + "fa_icon": "fas fa-chart-line" }, "institutional_config_options": { "title": "Institutional config options", @@ -267,6 +854,36 @@ "help_text": "Allows string values that are parseable as numbers or booleans. For further information see [JSONSchema docs](https://github.com/everit-org/json-schema#lenient-mode)." } } + }, + "reference_genome_options": { + "title": "Reference genome options", + "type": "object", + "fa_icon": "fas fa-dna", + "description": "Reference genome related files and options required for the workflow.", + "properties": { + "genome": { + "type": "string", + "description": "Name of iGenomes reference.", + "fa_icon": "fas fa-book", + "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. 
\n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details.", + "hidden": true + }, + "igenomes_base": { + "type": "string", + "format": "directory-path", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes/", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + }, + "igenomes_ignore": { + "type": "boolean", + "description": "Do not load the iGenomes reference config.", + "fa_icon": "fas fa-ban", + "hidden": true, + "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + } + } } }, "allOf": [ @@ -274,7 +891,25 @@ "$ref": "#/definitions/input_output_options" }, { - "$ref": "#/definitions/reference_genome_options" + "$ref": "#/definitions/preprocessing_general_qc_options" + }, + { + "$ref": "#/definitions/preprocessing_short_read_qc_options" + }, + { + "$ref": "#/definitions/preprocessing_long_read_qc_options" + }, + { + "$ref": "#/definitions/preprocessing_host_removal_options" + }, + { + "$ref": "#/definitions/preprocessing_run_merging_options" + }, + { + "$ref": "#/definitions/profiling_options" + }, + { + "$ref": "#/definitions/postprocessing_and_visualisation_options" }, { "$ref": "#/definitions/institutional_config_options" @@ -284,6 +919,9 @@ }, { "$ref": "#/definitions/generic_options" + }, + { + "$ref": "#/definitions/reference_genome_options" } ] } diff --git a/subworkflows/local/longread_hostremoval.nf b/subworkflows/local/longread_hostremoval.nf new file mode 100644 index 00000000..bc146d6f --- /dev/null +++ b/subworkflows/local/longread_hostremoval.nf @@ -0,0 +1,61 @@ +// +// Remove host reads via alignment and export off-target reads +// + +include { MINIMAP2_INDEX } from '../../modules/nf-core/minimap2/index/main' +include { MINIMAP2_ALIGN } from '../../modules/nf-core/minimap2/align/main' +include { SAMTOOLS_VIEW } from '../../modules/nf-core/samtools/view/main' +include { SAMTOOLS_FASTQ } from '../../modules/nf-core/samtools/fastq/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main' + +workflow LONGREAD_HOSTREMOVAL { + take: + reads // [ [ meta ], [ reads ] ] + reference // /path/to/fasta + index // /path/to/index + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.longread_hostremoval_index ) { + ch_minimap2_index = MINIMAP2_INDEX ( [ [], reference ] ).index.map { it[1] } + ch_versions = ch_versions.mix( MINIMAP2_INDEX.out.versions ) + } else { + ch_minimap2_index = index + } + + MINIMAP2_ALIGN ( reads, ch_minimap2_index, true, false, false ) + ch_versions = ch_versions.mix( MINIMAP2_ALIGN.out.versions.first() ) + ch_minimap2_mapped = MINIMAP2_ALIGN.out.bam + .map { + meta, reads -> + [ meta, reads, [] ] + } + + // Generate unmapped reads FASTQ for downstream taxprofiling + SAMTOOLS_VIEW ( ch_minimap2_mapped , [[],[]], [] ) + ch_versions = ch_versions.mix( SAMTOOLS_VIEW.out.versions.first() ) + + SAMTOOLS_FASTQ ( SAMTOOLS_VIEW.out.bam, false ) + ch_versions = ch_versions.mix( SAMTOOLS_FASTQ.out.versions.first() ) + + // Indexing whole BAM for host removal statistics + SAMTOOLS_INDEX ( MINIMAP2_ALIGN.out.bam ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions.first() ) + + bam_bai = MINIMAP2_ALIGN.out.bam + .join(SAMTOOLS_INDEX.out.bai) + + SAMTOOLS_STATS ( bam_bai, [[],reference] ) + 
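+    // Illustrative note: after the join above, each bam_bai element is a per-sample tuple of
+    // [ meta, bam, bai ], e.g. [ [id:'sample1'], sample1.bam, sample1.bam.bai ] (hypothetical
+    // filenames), which is the shape SAMTOOLS_STATS expects alongside the reference FASTA.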
ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_STATS.out.stats ) + + emit: + stats = SAMTOOLS_STATS.out.stats //channel: [val(meta), [reads ] ] + reads = SAMTOOLS_FASTQ.out.other // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/longread_preprocessing.nf b/subworkflows/local/longread_preprocessing.nf new file mode 100644 index 00000000..30963ec6 --- /dev/null +++ b/subworkflows/local/longread_preprocessing.nf @@ -0,0 +1,63 @@ +// +// Process long raw reads with porechop +// + +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' + +include { PORECHOP_PORECHOP } from '../../modules/nf-core/porechop/porechop/main' +include { FILTLONG } from '../../modules/nf-core/filtlong/main' + +workflow LONGREAD_PREPROCESSING { + take: + reads + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.longread_qc_skipadaptertrim && params.longread_qc_skipqualityfilter) { + PORECHOP_PORECHOP ( reads ) + + ch_processed_reads = PORECHOP_PORECHOP.out.reads + .map { meta, reads -> [ meta + [single_end: 1], reads ] } + + ch_versions = ch_versions.mix(PORECHOP_PORECHOP.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( PORECHOP_PORECHOP.out.log ) + + } else if ( params.longread_qc_skipadaptertrim && !params.longread_qc_skipqualityfilter) { + + ch_processed_reads = FILTLONG ( reads.map { meta, reads -> [meta, [], reads ] } ) + ch_versions = ch_versions.mix(FILTLONG.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) + + } else { + PORECHOP_PORECHOP ( reads ) + ch_clipped_reads = PORECHOP_PORECHOP.out.reads + .map { meta, reads -> [ meta + [single_end: 1], reads ] } + + ch_processed_reads = FILTLONG ( ch_clipped_reads.map { meta, reads -> [ meta, [], reads ] } ).reads + + ch_versions = ch_versions.mix(PORECHOP_PORECHOP.out.versions.first()) + ch_versions = ch_versions.mix(FILTLONG.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( PORECHOP_PORECHOP.out.log ) + ch_multiqc_files = ch_multiqc_files.mix( FILTLONG.out.log ) + } + + if (params.preprocessing_qc_tool == 'fastqc') { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + + } else if (params.preprocessing_qc_tool == 'falco') { + FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + } + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf new file mode 100644 index 00000000..e306f1de --- /dev/null +++ b/subworkflows/local/profiling.nf @@ -0,0 +1,501 @@ +// +// Run profiling +// + +include { MALT_RUN } from '../../modules/nf-core/malt/run/main' +include { MEGAN_RMA2INFO as MEGAN_RMA2INFO_TSV } from '../../modules/nf-core/megan/rma2info/main' +include { KRAKEN2_KRAKEN2 } from '../../modules/nf-core/kraken2/kraken2/main' +include { KRAKEN2_STANDARD_REPORT } from '../../modules/local/kraken2_standard_report' +include { BRACKEN_BRACKEN } from 
'../../modules/nf-core/bracken/bracken/main' +include { CENTRIFUGE_CENTRIFUGE } from '../../modules/nf-core/centrifuge/centrifuge/main' +include { CENTRIFUGE_KREPORT } from '../../modules/nf-core/centrifuge/kreport/main' +include { METAPHLAN_METAPHLAN } from '../../modules/nf-core/metaphlan/metaphlan/main' +include { KAIJU_KAIJU } from '../../modules/nf-core/kaiju/kaiju/main' +include { KAIJU_KAIJU2TABLE as KAIJU_KAIJU2TABLE_SINGLE } from '../../modules/nf-core/kaiju/kaiju2table/main' +include { DIAMOND_BLASTX } from '../../modules/nf-core/diamond/blastx/main' +include { MOTUS_PROFILE } from '../../modules/nf-core/motus/profile/main' +include { KRAKENUNIQ_PRELOADEDKRAKENUNIQ } from '../../modules/nf-core/krakenuniq/preloadedkrakenuniq/main' +include { KMCP_SEARCH } from '../../modules/nf-core/kmcp/search/main' +include { KMCP_PROFILE } from '../../modules/nf-core/kmcp/profile/main' +include { GANON_CLASSIFY } from '../../modules/nf-core/ganon/classify/main' +include { GANON_REPORT } from '../../modules/nf-core/ganon/report/main' + + +// Custom Functions + +/** +* Combine profiles with their original database, then separate into two channels. +* +* The channel elements are assumed to be tuples of [ meta, profile ], and the +* database to be of [db_key, meta, database_file]. +* +* @param ch_profile A channel containing a meta and the profiling report of a given profiler +* @param ch_database A channel containing a key, the database meta, and the database file/folders itself +* @return A multiMap'ed output channel with two sub channels, one with the profile and the other with the db +*/ +def combineProfilesWithDatabase(ch_profile, ch_database) { + +return ch_profile + .map { meta, profile -> [meta.db_name, meta, profile] } + .combine(ch_database, by: 0) + .multiMap { + key, meta, profile, db_meta, db -> + profile: [meta, profile] + db: db + } +} + +workflow PROFILING { + take: + reads // [ [ meta ], [ reads ] ] + databases // [ [ meta ], path ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + ch_raw_classifications = Channel.empty() // These are per-read ID taxonomic assignments + ch_raw_profiles = Channel.empty() // These are count tables + +/* + COMBINE READS WITH POSSIBLE DATABASES + */ + + // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] + ch_input_for_profiling = reads + .map { + meta, reads -> + [meta + [id: "${meta.id}${meta.single_end ? '_se' : '_pe'}"], reads] + } + .combine(databases) + .branch { + centrifuge: it[2]['tool'] == 'centrifuge' + diamond: it[2]['tool'] == 'diamond' + kaiju: it[2]['tool'] == 'kaiju' + kraken2: it[2]['tool'] == 'kraken2' || it[2]['tool'] == 'bracken' // to reuse the kraken module to produce the input data for bracken + krakenuniq: it[2]['tool'] == 'krakenuniq' + malt: it[2]['tool'] == 'malt' + metaphlan: it[2]['tool'] == 'metaphlan' + motus: it[2]['tool'] == 'motus' + kmcp: it[2]['tool'] == 'kmcp' + ganon: it[2]['tool'] == 'ganon' + unknown: true + } + + /* + PREPARE PROFILER INPUT CHANNELS & RUN PROFILING + */ + + // Each tool has a slightly different input structure and generally separate + // input channels for reads vs databases. 
We restructure the channel tuple + // for each tool and make liberal use of multiMap to keep reads/databases + // channel element order in sync with each other + + if ( params.run_malt ) { + + // MALT: We groupTuple to have all samples in one channel for MALT as database + // loading takes a long time, so we only want to run it once per database + ch_input_for_malt = ch_input_for_profiling.malt + .map { + meta, reads, db_meta, db -> + + // Reset entire input meta for MALT to just database name, + // as we don't run on a per-sample basis due to huge databases, + // so all samples are in one run and sample-specific metadata is + // unnecessary. Set as database name to prevent `null` job ID and prefix. + def new_meta = db_meta + [ id: db_meta.db_name ] + + // Extend database parameters to specify whether to save alignments or not + def sam_format = params.malt_save_reads ? ' --alignments ./ -za false' : "" + new_meta.db_params = db_meta.db_params + sam_format + + [ new_meta, reads, db ] + + } + .groupTuple(by: [0,2]) + .multiMap { + meta, reads, db -> + reads: [ meta, reads.flatten() ] + db: db + } + + MALT_RUN ( ch_input_for_malt.reads, ch_input_for_malt.db ) + + ch_maltrun_for_megan = MALT_RUN.out.rma6 + .transpose() + .map{ + meta, rma -> + // re-extract meta from file names, use filename without rma to + // ensure we keep paired-end information in downstream filenames + // when no pair-merging + def meta_new = meta + [db_name: meta.id, id: rma.baseName] + + [ meta_new, rma ] + } + + MEGAN_RMA2INFO_TSV (ch_maltrun_for_megan, params.malt_generate_megansummary ) + ch_multiqc_files = ch_multiqc_files.mix( MALT_RUN.out.log ) + ch_versions = ch_versions.mix( MALT_RUN.out.versions.first(), MEGAN_RMA2INFO_TSV.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( ch_maltrun_for_megan ) + ch_raw_profiles = ch_raw_profiles.mix( MEGAN_RMA2INFO_TSV.out.txt ) + + } + + if ( params.run_kraken2 || params.run_bracken ) { + // Have to pick first element of db_params if using bracken, + // as the db sheet for bracken must have a ;-separated list to + // distinguish between kraken and bracken parameters + ch_input_for_kraken2 = ch_input_for_profiling.kraken2 + .map { + meta, reads, db_meta, db -> + + // Only take first element if one exists + def parsed_params = db_meta['db_params'].split(";") + if ( parsed_params.size() == 2 ) { + db_meta_new = db_meta + [db_params: parsed_params[0]] + } else if ( parsed_params.size() == 0 ) { + db_meta_new = db_meta + [db_params: ""] + } else { + db_meta_new = db_meta + [db_params: parsed_params[0]] + } + + [ meta, reads, db_meta_new, db ] + } + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + KRAKEN2_KRAKEN2 ( ch_input_for_kraken2.reads, ch_input_for_kraken2.db, params.kraken2_save_reads, params.kraken2_save_readclassifications ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKEN2_KRAKEN2.out.report ) + ch_versions = ch_versions.mix( KRAKEN2_KRAKEN2.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KRAKEN2_KRAKEN2.out.classified_reads_assignment ) + ch_raw_profiles = ch_raw_profiles.mix( + KRAKEN2_KRAKEN2.out.report + // Rename tool in the meta for the for-bracken files to disambiguate from only-kraken2 results in downstream steps. + // Note we may need to rename back to just bracken in those downstream steps depending on context. + .map { + meta, report -> + [meta + [tool: meta.tool == 'bracken' ? 
'kraken2-bracken' : meta.tool], report] + } + ) + + } + + if ( params.run_kraken2 && params.run_bracken ) { + // Remove files from 'pure' kraken2 runs, so only those aligned against Bracken & kraken2 database are used. + def ch_kraken2_output = KRAKEN2_KRAKEN2.out.report + .filter { + meta, report -> + if ( meta.instrument_platform == 'OXFORD_NANOPORE' ) log.warn "[nf-core/taxprofiler] Bracken has not been evaluated for Nanopore data. Skipping Bracken for sample ${meta.id}." + meta.tool == 'bracken' && meta.instrument_platform != 'OXFORD_NANOPORE' + } + + // If necessary, convert the eight column output to six column output. + if (params.kraken2_save_minimizers) { + ch_kraken2_output = KRAKEN2_STANDARD_REPORT(ch_kraken2_output).report + } + + // Extract the database name to combine by. + ch_bracken_databases = databases + .filter { meta, db -> meta.tool == 'bracken' } + .map { meta, db -> [meta.db_name, meta, db] } + + // Combine back with the reads + ch_input_for_bracken = ch_kraken2_output + .map { meta, report -> [meta.db_name, meta, report] } + .combine(ch_bracken_databases, by: 0) + .map { + + key, meta, reads, db_meta, db -> + + // // Have to make a completely fresh copy here as otherwise + // // was getting db_param loss due to upstream meta parsing at + // // kraken2 input channel manipulation step + def db_meta_keys = db_meta.keySet() + def db_meta_new = db_meta.subMap(db_meta_keys) + + // Have to pick second element if using bracken, as first element + // contains kraken parameters + if ( db_meta.tool == 'bracken' ) { + + // Only take second element if one exists + def parsed_params = db_meta['db_params'].split(";") + + if ( parsed_params.size() == 2 ) { + db_meta_new = db_meta + [ db_params: parsed_params[1] ] + } else { + db_meta_new = db_meta + [ db_params: "" ] + } + + } else { + db_meta_new['db_params'] + } + + [ key, meta, reads, db_meta_new, db ] + } + .multiMap { key, meta, report, db_meta, db -> + report: [meta + db_meta, report] + db: db + } + + BRACKEN_BRACKEN(ch_input_for_bracken.report, ch_input_for_bracken.db) + ch_versions = ch_versions.mix(BRACKEN_BRACKEN.out.versions.first()) + ch_raw_profiles = ch_raw_profiles.mix(BRACKEN_BRACKEN.out.reports) + + } + + if ( params.run_centrifuge ) { + + ch_input_for_centrifuge = ch_input_for_profiling.centrifuge + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] Centrifuge currently does not accept FASTA files as input. Skipping Centrifuge for sample ${it[0].id}." 
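+                    // NB: the log.warn above is only a side effect; the closure's final boolean
+                    // expression below is the predicate that filter() evaluates, so FASTA inputs are dropped here.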
+ !it[0].is_fasta + } + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + CENTRIFUGE_CENTRIFUGE ( ch_input_for_centrifuge.reads, ch_input_for_centrifuge.db, params.centrifuge_save_reads, params.centrifuge_save_reads ) + ch_versions = ch_versions.mix( CENTRIFUGE_CENTRIFUGE.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( CENTRIFUGE_CENTRIFUGE.out.results ) + + // Ensure the correct database goes with the generated report for KREPORT + ch_database_for_centrifugekreport = databases + .filter { meta, db -> meta.tool == 'centrifuge' } + .map { meta, db -> [meta.db_name, meta, db] } + + ch_input_for_centrifuge_kreport = combineProfilesWithDatabase(CENTRIFUGE_CENTRIFUGE.out.results, ch_database_for_centrifugekreport) + + // Generate profile + CENTRIFUGE_KREPORT (ch_input_for_centrifuge_kreport.profile, ch_input_for_centrifuge_kreport.db) + ch_versions = ch_versions.mix( CENTRIFUGE_KREPORT.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( CENTRIFUGE_KREPORT.out.kreport ) + ch_multiqc_files = ch_multiqc_files.mix( CENTRIFUGE_KREPORT.out.kreport ) + + } + + if ( params.run_metaphlan ) { + + ch_input_for_metaphlan = ch_input_for_profiling.metaphlan + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + METAPHLAN_METAPHLAN ( ch_input_for_metaphlan.reads, ch_input_for_metaphlan.db ) + ch_versions = ch_versions.mix( METAPHLAN_METAPHLAN.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( METAPHLAN_METAPHLAN.out.profile ) + + } + + if ( params.run_kaiju ) { + + ch_input_for_kaiju = ch_input_for_profiling.kaiju + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + KAIJU_KAIJU ( ch_input_for_kaiju.reads, ch_input_for_kaiju.db) + ch_versions = ch_versions.mix( KAIJU_KAIJU.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KAIJU_KAIJU.out.results ) + + // Ensure the correct database goes with the generated report for KAIJU2TABLE + ch_database_for_kaiju2table = databases + .filter { meta, db -> meta.tool == 'kaiju' } + .map { meta, db -> [meta.db_name, meta, db] } + + ch_input_for_kaiju2table = combineProfilesWithDatabase(KAIJU_KAIJU.out.results, ch_database_for_kaiju2table) + // Generate profile + KAIJU_KAIJU2TABLE_SINGLE ( ch_input_for_kaiju2table.profile, ch_input_for_kaiju2table.db, params.kaiju_taxon_rank) + ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE_SINGLE.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE_SINGLE.out.summary ) + ch_raw_profiles = ch_raw_profiles.mix( KAIJU_KAIJU2TABLE_SINGLE.out.summary ) + } + + if ( params.run_diamond ) { + + ch_input_for_diamond = ch_input_for_profiling.diamond + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + // diamond only accepts single output file specification, therefore + // this will replace output file! + ch_diamond_reads_format = params.diamond_save_reads ? 'sam' : params.diamond_output_format + + DIAMOND_BLASTX ( ch_input_for_diamond.reads, ch_input_for_diamond.db, ch_diamond_reads_format , [] ) + ch_versions = ch_versions.mix( DIAMOND_BLASTX.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( DIAMOND_BLASTX.out.tsv ) + ch_multiqc_files = ch_multiqc_files.mix( DIAMOND_BLASTX.out.log ) + + } + + if ( params.run_motus ) { + + ch_input_for_motus = ch_input_for_profiling.motus + .filter{ + if (it[0].is_fasta) log.warn "[nf-core/taxprofiler] mOTUs currently does not accept FASTA files as input. Skipping mOTUs for sample ${it[0].id}." 
+ !it[0].is_fasta + } + .multiMap { + it -> + reads: [it[0] + it[2], it[1]] + db: it[3] + } + + MOTUS_PROFILE ( ch_input_for_motus.reads, ch_input_for_motus.db ) + ch_versions = ch_versions.mix( MOTUS_PROFILE.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( MOTUS_PROFILE.out.out ) + ch_multiqc_files = ch_multiqc_files.mix( MOTUS_PROFILE.out.log ) + } + + if ( params.run_krakenuniq ) { + ch_input_for_krakenuniq = ch_input_for_profiling.krakenuniq + .map { + meta, reads, db_meta, db -> + [[id: db_meta.db_name, single_end: meta.single_end], reads, db_meta, db] + } + .groupTuple(by: [0,2,3]) + .flatMap { single_meta, reads, db_meta, db -> + def batches = reads.collate(params.krakenuniq_batch_size) + return batches.collect { batch -> [ single_meta + db_meta, batch.flatten(), db ]} + } + .multiMap { + meta, reads, db -> + reads: [ meta, reads ] + db: db + } + // Hardcode to _always_ produce the report file (which is our basic output, and goes into) + KRAKENUNIQ_PRELOADEDKRAKENUNIQ ( ch_input_for_krakenuniq.reads, ch_input_for_krakenuniq.db, params.krakenuniq_ram_chunk_size, params.krakenuniq_save_reads, true, params.krakenuniq_save_readclassifications ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + ch_versions = ch_versions.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.classified_assignment ) + ch_raw_profiles = ch_raw_profiles.mix( KRAKENUNIQ_PRELOADEDKRAKENUNIQ.out.report ) + + } + + if (params.run_kmcp) { + + ch_input_for_kmcp = ch_input_for_profiling.kmcp + .filter { + meta, reads, meta_db, db -> + if ( meta['instrument_platform'] == 'OXFORD_NANOPORE' ) log.warn "[nf-core/taxprofiler] KMCP is only suitable for short-read metagenomic profiling, with much lower sensitivity on long-read datasets. Skipping KMCP for sample ${meta.id}." + meta_db['tool'] == 'kmcp' && meta['instrument_platform'] != 'OXFORD_NANOPORE' + } + .map { + meta, reads, db_meta, db -> + def db_meta_keys = db_meta.keySet() + def db_meta_new = db_meta.subMap(db_meta_keys) + + // Split the string, the arguments before semicolon should be parsed into kmcp search + def parsed_params = db_meta_new['db_params'].split(";") + if ( parsed_params.size() == 2 ) { + db_meta_new['db_params'] = parsed_params[0] + } else if ( parsed_params.size() == 0 ) { + db_meta_new['db_params'] = "" + } else { + db_meta_new['db_params'] = parsed_params[0] + } + + [ meta, reads, db_meta_new, db ] + } + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + KMCP_SEARCH ( ch_input_for_kmcp.db, ch_input_for_kmcp.reads ) + + ch_versions = ch_versions.mix( KMCP_SEARCH.out.versions.first() ) + ch_raw_classifications = ch_raw_classifications.mix(KMCP_SEARCH.out.result) + + ch_database_for_kmcp_profile = databases + .filter { meta, db -> meta.tool == 'kmcp' } + .map { meta, db -> [meta.db_name, meta, db] } + + ch_input_for_kmcp_profile = KMCP_SEARCH.out.result + .map { meta, report -> [meta.db_name, meta, report] } + .combine(ch_database_for_kmcp_profile, by: 0) + .map { + + key, meta, reads, db_meta, db -> + + // Same as kraken2/bracken logic here. 
Arguments after semicolon are going into KMCP_PROFILE + def db_meta_keys = db_meta.keySet() + def db_meta_new = db_meta.subMap(db_meta_keys) + + def parsed_params = db_meta['db_params'].split(";") + + if ( parsed_params.size() == 2 ) { + db_meta_new = db_meta + [ db_params: parsed_params[1] ] + } else { + db_meta_new = db_meta + [ db_params: "" ] + } + + [ key, meta, reads, db_meta_new, db ] + + } + .multiMap { key, meta, report, db_meta, db -> + report: [meta + db_meta, report] + db: db + } + + //Generate kmcp profile + KMCP_PROFILE( ch_input_for_kmcp_profile.report, ch_input_for_kmcp.db, params.kmcp_mode ) + ch_versions = ch_versions.mix( KMCP_PROFILE.out.versions.first() ) + ch_raw_profiles = ch_raw_profiles.mix( KMCP_PROFILE.out.profile ) + ch_multiqc_files = ch_multiqc_files.mix( KMCP_PROFILE.out.profile ) +} + + + if ( params.run_ganon ) { + + ch_input_for_ganonclassify = ch_input_for_profiling.ganon + .filter { + meta, reads, meta_db, db -> + if ( meta.instrument_platform == 'OXFORD_NANOPORE' ) log.warn "[nf-core/taxprofiler] Ganon has not been evaluated for Nanopore data. Skipping Ganon for sample ${meta.id}." + meta_db.tool == 'ganon' && meta.instrument_platform != 'OXFORD_NANOPORE' + } + .multiMap { + it -> + reads: [ it[0] + it[2], it[1] ] + db: it[3] + } + + ch_input_for_ganonclassify.reads + + GANON_CLASSIFY( ch_input_for_ganonclassify.reads, ch_input_for_ganonclassify.db ) + ch_versions = ch_versions.mix( GANON_CLASSIFY.out.versions.first() ) + + ch_database_for_ganonreport = databases + .filter { meta, db -> meta.tool == "ganon" } + .map { meta, db -> [meta.db_name, meta, db] } + + ch_report_for_ganonreport = combineProfilesWithDatabase(GANON_CLASSIFY.out.report, ch_database_for_ganonreport) + + GANON_REPORT(ch_report_for_ganonreport.profile, ch_report_for_ganonreport.db) + ch_versions = ch_versions.mix( GANON_REPORT.out.versions.first() ) + + // Might be flipped - check/define what is a profile vs raw classification + ch_raw_profiles = ch_raw_profiles.mix( GANON_REPORT.out.tre ) + ch_raw_classifications = ch_raw_classifications.mix( GANON_CLASSIFY.out.all ) + + } + + emit: + classifications = ch_raw_classifications + profiles = ch_raw_profiles // channel: [ val(meta), [ reads ] ] - should be text files or biom + versions = ch_versions // channel: [ versions.yml ] + motus_version = params.run_motus ? 
MOTUS_PROFILE.out.versions.first() : Channel.empty() + mqc = ch_multiqc_files +} diff --git a/subworkflows/local/shortread_adapterremoval.nf b/subworkflows/local/shortread_adapterremoval.nf new file mode 100644 index 00000000..7f5a0fbb --- /dev/null +++ b/subworkflows/local/shortread_adapterremoval.nf @@ -0,0 +1,95 @@ +// +// Process short raw reads with AdapterRemoval +// + +include { ADAPTERREMOVAL as ADAPTERREMOVAL_SINGLE } from '../../modules/nf-core/adapterremoval/main' +include { ADAPTERREMOVAL as ADAPTERREMOVAL_PAIRED } from '../../modules/nf-core/adapterremoval/main' +include { CAT_FASTQ } from '../../modules/nf-core/cat/fastq/main' + +workflow SHORTREAD_ADAPTERREMOVAL { + + take: + reads // [[meta], [reads]] + adapterlist // file + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + ch_input_for_adapterremoval = reads + .branch{ + single: it[0].single_end + paired: !it[0].single_end + } + + ADAPTERREMOVAL_SINGLE ( ch_input_for_adapterremoval.single, adapterlist ) + ADAPTERREMOVAL_PAIRED ( ch_input_for_adapterremoval.paired, adapterlist ) + + /* + * Due to the ~slightly~ very ugly output implementation of the current AdapterRemoval2 version, each file + * has to be exported in a separate channel and we must manually recombine when necessary. + */ + + if ( params.shortread_qc_mergepairs && params.shortread_qc_includeunmerged ) { + + ch_concat_fastq = Channel.empty() + .mix( + ADAPTERREMOVAL_PAIRED.out.collapsed, + ADAPTERREMOVAL_PAIRED.out.collapsed_truncated, + ADAPTERREMOVAL_PAIRED.out.singles_truncated, + ADAPTERREMOVAL_PAIRED.out.paired_truncated + ) + .map { meta, reads -> + [meta + [single_end: true], reads] + } + .groupTuple() + // Paired-end reads cause a nested tuple during grouping. + // We want to present a flat list of files to `CAT_FASTQ`. 
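+            // e.g. (hypothetical filenames) [ meta, [ [s1.collapsed.fq.gz], [s1.singleton.fq.gz, s1.pair1.truncated.fq.gz] ] ]
+            // flattens to [ meta, [s1.collapsed.fq.gz, s1.singleton.fq.gz, s1.pair1.truncated.fq.gz] ]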
+ .map { meta, fastq -> [meta, fastq.flatten()] } + + + CAT_FASTQ(ch_concat_fastq) + + ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + + } else if ( params.shortread_qc_mergepairs && !params.shortread_qc_includeunmerged ) { + + ch_concat_fastq = Channel.empty() + .mix( + ADAPTERREMOVAL_PAIRED.out.collapsed, + ADAPTERREMOVAL_PAIRED.out.collapsed_truncated + ) + .map { meta, reads -> + [meta + [single_end: true], reads] + } + .groupTuple() + .map { meta, fastq -> [meta, fastq.flatten()] } + + + CAT_FASTQ(ch_concat_fastq) + + ch_adapterremoval_reads_prepped = CAT_FASTQ.out.reads + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + + } else { + + ch_adapterremoval_reads_prepped = ADAPTERREMOVAL_PAIRED.out.paired_truncated + .mix(ADAPTERREMOVAL_SINGLE.out.singles_truncated) + + } + + ch_versions = ch_versions.mix( ADAPTERREMOVAL_SINGLE.out.versions.first() ) + ch_versions = ch_versions.mix( ADAPTERREMOVAL_PAIRED.out.versions.first() ) + + ch_multiqc_files = ch_multiqc_files.mix( + ADAPTERREMOVAL_PAIRED.out.settings, + ADAPTERREMOVAL_SINGLE.out.settings + ) + + emit: + reads = ch_adapterremoval_reads_prepped // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_complexityfiltering.nf b/subworkflows/local/shortread_complexityfiltering.nf new file mode 100644 index 00000000..844cd15f --- /dev/null +++ b/subworkflows/local/shortread_complexityfiltering.nf @@ -0,0 +1,33 @@ +// +// Check input samplesheet and get read channels +// + +include { BBMAP_BBDUK } from '../../modules/nf-core/bbmap/bbduk/main' +include { PRINSEQPLUSPLUS } from '../../modules/nf-core/prinseqplusplus/main' + +workflow SHORTREAD_COMPLEXITYFILTERING { + take: + reads // [ [ meta ], [ reads ] ] + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + // fastp complexity filtering is activated via modules.conf in shortread_preprocessing + if ( params.shortread_complexityfilter_tool == 'bbduk' ) { + ch_filtered_reads = BBMAP_BBDUK ( reads, [] ).reads + ch_versions = ch_versions.mix( BBMAP_BBDUK.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( BBMAP_BBDUK.out.log ) + } else if ( params.shortread_complexityfilter_tool == 'prinseqplusplus' ) { + ch_filtered_reads = PRINSEQPLUSPLUS ( reads ).good_reads + ch_versions = ch_versions.mix( PRINSEQPLUSPLUS.out.versions.first() ) + } else { + ch_filtered_reads = reads + } + + emit: + reads = ch_filtered_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_fastp.nf b/subworkflows/local/shortread_fastp.nf new file mode 100644 index 00000000..ac421854 --- /dev/null +++ b/subworkflows/local/shortread_fastp.nf @@ -0,0 +1,55 @@ +// +// Process short raw reads with FastP +// + +include { FASTP as FASTP_SINGLE } from '../../modules/nf-core/fastp/main' +include { FASTP as FASTP_PAIRED } from '../../modules/nf-core/fastp/main' + +workflow SHORTREAD_FASTP { + take: + reads // [[meta], [reads]] + adapterlist + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + ch_input_for_fastp = reads + .branch{ + single: it[0]['single_end'] == true + paired: it[0]['single_end'] == false + } + + FASTP_SINGLE ( ch_input_for_fastp.single, adapterlist, false, false ) + // Last parameter here turns on merging of PE data + FASTP_PAIRED ( ch_input_for_fastp.paired, 
adapterlist, false, params.shortread_qc_mergepairs ) + + if ( params.shortread_qc_mergepairs ) { + ch_fastp_reads_prepped_pe = FASTP_PAIRED.out.reads_merged + .map { + meta, reads -> + def meta_new = meta + [single_end: true] + [ meta + [single_end:true], [ reads ].flatten() ] + } + + ch_fastp_reads_prepped = ch_fastp_reads_prepped_pe.mix( FASTP_SINGLE.out.reads ) + + } else { + ch_fastp_reads_prepped = FASTP_PAIRED.out.reads + .mix( FASTP_SINGLE.out.reads ) + } + + ch_versions = ch_versions.mix(FASTP_SINGLE.out.versions.first()) + ch_versions = ch_versions.mix(FASTP_PAIRED.out.versions.first()) + + ch_processed_reads = ch_fastp_reads_prepped + + ch_multiqc_files = ch_multiqc_files.mix( FASTP_SINGLE.out.json ) + ch_multiqc_files = ch_multiqc_files.mix( FASTP_PAIRED.out.json ) + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_hostremoval.nf b/subworkflows/local/shortread_hostremoval.nf new file mode 100644 index 00000000..32d64749 --- /dev/null +++ b/subworkflows/local/shortread_hostremoval.nf @@ -0,0 +1,49 @@ +// +// Remove host reads via alignment and export off-target reads +// + +include { BOWTIE2_BUILD } from '../../modules/nf-core/bowtie2/build/main' +include { BOWTIE2_ALIGN } from '../../modules/nf-core/bowtie2/align/main' +include { SAMTOOLS_INDEX } from '../../modules/nf-core/samtools/index/main' +include { SAMTOOLS_STATS } from '../../modules/nf-core/samtools/stats/main' + +workflow SHORTREAD_HOSTREMOVAL { + take: + reads // [ [ meta ], [ reads ] ] + reference // /path/to/fasta + index // /path/to/index + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( !params.shortread_hostremoval_index ) { + ch_bowtie2_index = BOWTIE2_BUILD ( [ [], reference ] ).index + ch_versions = ch_versions.mix( BOWTIE2_BUILD.out.versions ) + } else { + ch_bowtie2_index = index.first() + } + + // Map, generate BAM with all reads and unmapped reads in FASTQ for downstream + BOWTIE2_ALIGN ( reads, ch_bowtie2_index, true, true) + ch_versions = ch_versions.mix( BOWTIE2_ALIGN.out.versions.first() ) + ch_multiqc_files = ch_multiqc_files.mix( BOWTIE2_ALIGN.out.log ) + + // Indexing whole BAM for host removal statistics + SAMTOOLS_INDEX ( BOWTIE2_ALIGN.out.aligned ) + ch_versions = ch_versions.mix( SAMTOOLS_INDEX.out.versions.first() ) + + bam_bai = BOWTIE2_ALIGN.out.aligned + .join(SAMTOOLS_INDEX.out.bai, remainder: true) + + SAMTOOLS_STATS ( bam_bai, [[],reference] ) + ch_versions = ch_versions.mix(SAMTOOLS_STATS.out.versions.first()) + ch_multiqc_files = ch_multiqc_files.mix( SAMTOOLS_STATS.out.stats ) + + emit: + stats = SAMTOOLS_STATS.out.stats + reads = BOWTIE2_ALIGN.out.fastq // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/shortread_preprocessing.nf b/subworkflows/local/shortread_preprocessing.nf new file mode 100644 index 00000000..c823e3df --- /dev/null +++ b/subworkflows/local/shortread_preprocessing.nf @@ -0,0 +1,47 @@ +// +// Perform read trimming and merging +// + + +include { SHORTREAD_FASTP } from './shortread_fastp' +include { SHORTREAD_ADAPTERREMOVAL } from './shortread_adapterremoval' +include { FASTQC as FASTQC_PROCESSED } from '../../modules/nf-core/fastqc/main' +include { FALCO as FALCO_PROCESSED } from '../../modules/nf-core/falco/main' + +workflow SHORTREAD_PREPROCESSING { + take: + reads // [ [ meta ], 
[ reads ] ] + adapterlist // file + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + if ( params.shortread_qc_tool == "fastp" ) { + ch_processed_reads = SHORTREAD_FASTP ( reads, adapterlist ).reads + ch_versions = ch_versions.mix( SHORTREAD_FASTP.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_FASTP.out.mqc ) + } else if ( params.shortread_qc_tool == "adapterremoval" ) { + ch_processed_reads = SHORTREAD_ADAPTERREMOVAL ( reads, adapterlist ).reads + ch_versions = ch_versions.mix( SHORTREAD_ADAPTERREMOVAL.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_ADAPTERREMOVAL.out.mqc ) + } else { + ch_processed_reads = reads + } + + if (params.preprocessing_qc_tool == 'fastqc') { + FASTQC_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FASTQC_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FASTQC_PROCESSED.out.zip ) + } else if (params.preprocessing_qc_tool == 'falco') { + FALCO_PROCESSED ( ch_processed_reads ) + ch_versions = ch_versions.mix( FALCO_PROCESSED.out.versions ) + ch_multiqc_files = ch_multiqc_files.mix( FALCO_PROCESSED.out.txt ) + } + + emit: + reads = ch_processed_reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] + mqc = ch_multiqc_files +} + diff --git a/subworkflows/local/standardisation_profiles.nf b/subworkflows/local/standardisation_profiles.nf new file mode 100644 index 00000000..4592e9de --- /dev/null +++ b/subworkflows/local/standardisation_profiles.nf @@ -0,0 +1,230 @@ +// +// Standardise output files e.g. aggregation +// + +include { TAXPASTA_MERGE } from '../../modules/nf-core/taxpasta/merge/main' +include { TAXPASTA_STANDARDISE } from '../../modules/nf-core/taxpasta/standardise/main' +include { BRACKEN_COMBINEBRACKENOUTPUTS } from '../../modules/nf-core/bracken/combinebrackenoutputs/main' +include { KAIJU_KAIJU2TABLE as KAIJU_KAIJU2TABLE_COMBINED } from '../../modules/nf-core/kaiju/kaiju2table/main' +include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_KRAKEN } from '../../modules/nf-core/krakentools/combinekreports/main' +include { KRAKENTOOLS_COMBINEKREPORTS as KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE } from '../../modules/nf-core/krakentools/combinekreports/main' +include { METAPHLAN_MERGEMETAPHLANTABLES } from '../../modules/nf-core/metaphlan/mergemetaphlantables/main' +include { MOTUS_MERGE } from '../../modules/nf-core/motus/merge/main' +include { GANON_TABLE } from '../../modules/nf-core/ganon/table/main' + +// Custom Functions + +/** +* Combine profiles with their original database, then separate into two channels. +* +* The channel elements are assumed to be tuples one of [ meta, profile ], and the +* database to be of [db_key, meta, database_file]. 
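+* For example (hypothetical values), [ [id:'s1', db_name:'db1'], s1.tsv ] combined with
+* [ 'db1', [tool:'kraken2', db_name:'db1'], /path/to/db1 ] is separated into the sub channels
+* profile: [ [id:'s1', db_name:'db1'], s1.tsv ] and db: /path/to/db1.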
+* +* @param ch_profile A channel containing a meta and the profiling report of a given profiler +* @param ch_database A channel containing a key, the database meta, and the database file/folders itself +* @return A multiMap'ed output channel with two sub channels, one with the profile and the other with the db +*/ +def combineProfilesWithDatabase(ch_profile, ch_database) { + +return ch_profile + .map { meta, profile -> [meta.db_name, meta, profile] } + .combine(ch_database, by: 0) + .multiMap { + key, meta, profile, db_meta, db -> + profile: [meta, profile] + db: db + } +} + +workflow STANDARDISATION_PROFILES { + take: + classifications + profiles + databases + motu_version + + main: + ch_versions = Channel.empty() + ch_multiqc_files = Channel.empty() + + //Taxpasta standardisation + ch_prepare_for_taxpasta = profiles + .map { + meta, profile -> + def meta_new = [:] + meta_new.tool = meta.tool == 'malt' ? 'megan6' : meta.tool + meta_new.db_name = meta.db_name + [meta_new, profile] + } + .groupTuple () + .map { + meta, profiles -> + meta = meta + [ + tool: meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool, // replace to get the right output-format description + id: meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken + ] + [ meta, profiles.flatten() ] + } + + ch_taxpasta_tax_dir = params.taxpasta_taxonomy_dir ? Channel.fromPath(params.taxpasta_taxonomy_dir, checkIfExists: true).collect() : [] + + ch_input_for_taxpasta = ch_prepare_for_taxpasta + .branch { + meta, profile -> + merge: profile.size() > 1 + standardise: true + } + + + TAXPASTA_MERGE (ch_input_for_taxpasta.merge , ch_taxpasta_tax_dir, []) + ch_versions = ch_versions.mix( TAXPASTA_MERGE.out.versions.first() ) + TAXPASTA_STANDARDISE (ch_input_for_taxpasta.standardise, ch_taxpasta_tax_dir ) + ch_versions = ch_versions.mix( TAXPASTA_STANDARDISE.out.versions.first() ) + + + + /* + Split profile results based on tool they come from + */ + ch_input_profiles = profiles + .branch { + bracken: it[0]['tool'] == 'bracken' + centrifuge: it[0]['tool'] == 'centrifuge' + ganon: it[0]['tool'] == 'ganon' + kmcp: it[0]['tool'] == 'kmcp' + kraken2: it[0]['tool'] == 'kraken2' || it[0]['tool'] == 'kraken2-bracken' + metaphlan: it[0]['tool'] == 'metaphlan' + motus: it[0]['tool'] == 'motus' + unknown: true + } + + ch_input_classifications = classifications + .branch { + kaiju: it[0]['tool'] == 'kaiju' + unknown: true + } + + ch_input_databases = databases + .branch { + motus: it[0]['tool'] == 'motus' + kaiju: it[0]['tool'] == 'kaiju' + unknown: true + } + + /* + Standardise and aggregate + */ + + // Bracken + + ch_profiles_for_bracken = ch_input_profiles.bracken + .map { [it[0]['db_name'], it[1]] } + .groupTuple() + .map { + [[id:it[0]], it[1]] + } + + BRACKEN_COMBINEBRACKENOUTPUTS ( ch_profiles_for_bracken ) + + // CENTRIFUGE + + // Collect and replace id for db_name for prefix + // Have to sort by size to ensure first file actually has hits otherwise + // the script fails + ch_profiles_for_centrifuge = ch_input_profiles.centrifuge + .map { [it[0]['db_name'], it[1]] } + .groupTuple(sort: {-it.size()} ) + .map { + [[id:it[0]], it[1]] + } + + + KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE ( ch_profiles_for_centrifuge ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.txt ) + ch_versions = ch_versions.mix( KRAKENTOOLS_COMBINEKREPORTS_CENTRIFUGE.out.versions ) + + // Kaiju + + // Collect 
and replace id for db_name for prefix + ch_profiles_for_kaiju = ch_input_classifications.kaiju + .map { [it[0]['db_name'], it[1]] } + .groupTuple() + .map { + [[id:it[0]], it[1]] + } + + ch_input_for_kaiju2tablecombine = combineProfilesWithDatabase(ch_profiles_for_kaiju, ch_input_databases.kaiju) + + KAIJU_KAIJU2TABLE_COMBINED ( ch_input_for_kaiju2tablecombine.profile, ch_input_for_kaiju2tablecombine.db, params.kaiju_taxon_rank) + ch_multiqc_files = ch_multiqc_files.mix( KAIJU_KAIJU2TABLE_COMBINED.out.summary ) + ch_versions = ch_versions.mix( KAIJU_KAIJU2TABLE_COMBINED.out.versions ) + + // Kraken2 + + // Collect and replace id for db_name for prefix + // Have to sort by size to ensure first file actually has hits otherwise + // the script fails + ch_profiles_for_kraken2 = ch_input_profiles.kraken2 + .map { + meta, profiles -> + def new_meta = [:] + new_meta.tool = meta.tool == 'kraken2-bracken' ? 'kraken2' : meta.tool // replace to get the right output-format description + new_meta.id = meta.tool // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken + new_meta.db_name = meta.tool == 'kraken2-bracken' ? "${meta.db_name}-bracken" : "${meta.db_name}" // append so to disambiguate when we have same databases for kraken2 step of bracken, with normal bracken + [ new_meta, profiles ] + } + .groupTuple(sort: {-it.size()}) + + KRAKENTOOLS_COMBINEKREPORTS_KRAKEN ( ch_profiles_for_kraken2 ) + ch_multiqc_files = ch_multiqc_files.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.txt ) + ch_versions = ch_versions.mix( KRAKENTOOLS_COMBINEKREPORTS_KRAKEN.out.versions ) + + // MetaPhlAn + + ch_profiles_for_metaphlan = ch_input_profiles.metaphlan + .map { [it[0]['db_name'], it[1]] } + .groupTuple() + .map { + [[id:it[0]], it[1]] + } + + METAPHLAN_MERGEMETAPHLANTABLES ( ch_profiles_for_metaphlan ) + ch_multiqc_files = ch_multiqc_files.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.txt ) + ch_versions = ch_versions.mix( METAPHLAN_MERGEMETAPHLANTABLES.out.versions ) + + // mOTUs + + // mOTUs has a 'single' database, and cannot create custom ones. 
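+    // (mOTUs ships a single official marker-gene database, fetched with its downloadDB.py script,
+    // so every sample is profiled against the same reference.)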
+ // Therefore removing db info here, and publish merged at root mOTUs results + // directory + + ch_profiles_for_motus = ch_input_profiles.motus + .map { [it[0]['db_name'], it[1]] } + .groupTuple() + .map { + [[id:it[0]], it[1]] + } + + ch_input_for_motusmerge = combineProfilesWithDatabase(ch_profiles_for_motus, ch_input_databases.motus) + + MOTUS_MERGE ( ch_input_for_motusmerge.profile, ch_input_for_motusmerge.db, motu_version ) + ch_versions = ch_versions.mix( MOTUS_MERGE.out.versions ) + + // Ganon + + ch_profiles_for_ganon = ch_input_profiles.ganon + .map { [it[0]['db_name'], it[1]] } + .groupTuple() + .map { + [[id:it[0]], it[1]] + } + + GANON_TABLE ( ch_profiles_for_ganon ) + ch_multiqc_files = ch_multiqc_files.mix( GANON_TABLE.out.txt ) + ch_versions = ch_versions.mix( GANON_TABLE.out.versions ) + + emit: + taxpasta = TAXPASTA_MERGE.out.merged_profiles + versions = ch_versions + mqc = ch_multiqc_files +} diff --git a/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf b/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf index dc90ad90..d1fc747d 100644 --- a/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_taxprofiler_pipeline/main.nf @@ -36,6 +36,7 @@ workflow PIPELINE_INITIALISATION { nextflow_cli_args // array: List of positional nextflow CLI args outdir // string: The output directory where the results will be saved input // string: Path to input samplesheet + databases // string: Path to databases main: @@ -56,7 +57,7 @@ workflow PIPELINE_INITIALISATION { // pre_help_text = nfCoreLogo(monochrome_logs) post_help_text = '\n' + workflowCitation() + '\n' + dashedLine(monochrome_logs) - def String workflow_command = "nextflow run ${workflow.manifest.name} -profile --input samplesheet.csv --outdir " + def String workflow_command = "nextflow run ${workflow.manifest.name} -profile --input samplesheet.csv --databases databases.csv --outdir --run_kraken2" UTILS_NFVALIDATION_PLUGIN ( help, workflow_command, @@ -82,27 +83,20 @@ workflow PIPELINE_INITIALISATION { // Channel .fromSamplesheet("input") - .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { - return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] - } - } - .groupTuple() - .map { - validateInputSamplesheet(it) - } - .map { - meta, fastqs -> - return [ meta, fastqs.flatten() ] - } .set { ch_samplesheet } emit: samplesheet = ch_samplesheet versions = ch_versions + +// Create channel from databases file provided through params.databases + Channel + .fromSamplesheet("databases") + .set {ch_databases} + + emit: + databases = ch_databases + versions = ch_versions } /* @@ -156,7 +150,7 @@ def validateInputParameters() { // Validate channels from input samplesheet // def validateInputSamplesheet(input) { - def (metas, fastqs) = input[1..2] + def (metas, fastqs, fasta) = input[1..3] // Check that multiple runs of the same sample are of the same datatype i.e. single-end / paired-end def endedness_ok = metas.collect{ it.single_end }.unique().size == 1 @@ -164,7 +158,7 @@ def validateInputSamplesheet(input) { error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}") } - return [ metas[0], fastqs ] + return [ metas[0], fastqs, fasta ] } // // Get attribute from genome config file e.g. 
fasta @@ -194,53 +188,178 @@ def genomeExistsError() { // Generate methods description for MultiQC // def toolCitationText() { - // TODO nf-core: Optionally add in-text citation tools to this list. - // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", - // Uncomment function in methodsDescriptionText to render in MultiQC report + def text_seq_qc = [ + "Sequencing quality control was carried out with:", + params.preprocessing_qc_tool == "falco" ? "Falco (de Sena Brandine and Smith 2021)." : "FastQC (Andrews 2010)." + ].join(' ').trim() + + def text_shortread_qc = [ + "Short read preprocessing was performed with:", + params.shortread_qc_tool == "adapterremoval" ? "AdapterRemoval (Schubert et al. 2016)." : "", + params.shortread_qc_tool == "fastp" ? "fastp (Chen et al. 2018)." : "", + ].join(' ').trim() + + def text_longread_qc = [ + "Long read preprocessing was performed with:", + !params.longread_qc_skipadaptertrim ? "Porechop (Wick et al. 2017)," : "", + !params.longread_qc_skipqualityfilter ? "Filtlong (Wick 2021)," : "", + "." + ].join(' ').trim() + + def text_shortreadcomplexity = [ + "Low-complexity sequence filtering was carried out with:", + params.shortread_complexityfilter_tool == "bbduk" ? "BBDuk (Bushnell 2022)." : "", + params.shortread_complexityfilter_tool == "prinseqplusplus" ? "PRINSEQ++ (Cantu et al. 2019)." : "", + params.shortread_complexityfilter_tool == "fastp" ? "fastp (Chen et al. 2018)." : "", + ].join(' ').trim() + + def text_shortreadhostremoval = [ + "Host read removal was performed for short reads with Bowtie2 (Langmead and Salzberg 2012) and SAMtools (Danecek et al. 2021)." + ].join(' ').trim() + + def text_longreadhostremoval = [ + "Host read removal was performed for long reads with minimap2 (Li et al. 2018) and SAMtools (Danecek et al. 2021)." + ].join(' ').trim() + + + def text_classification = [ + "Taxonomic classification or profiling was carried out with:", + params.run_bracken ? "Bracken (Lu et al. 2017)," : "", + params.run_kraken2 ? "Kraken2 (Wood et al. 2019)," : "", + params.run_krakenuniq ? "KrakenUniq (Breitwieser et al. 2018)," : "", + params.run_metaphlan ? "MetaPhlAn (Blanco-Míguez et al. 2023)," : "", + params.run_malt ? "MALT (Vågene et al. 2018) and MEGAN6 CE (Huson et al. 2016)," : "", + params.run_diamond ? "DIAMOND (Buchfink et al. 2015)," : "", + params.run_centrifuge ? "Centrifuge (Kim et al. 2016)," : "", + params.run_kaiju ? "Kaiju (Menzel et al. 2016)," : "", + params.run_motus ? "mOTUs (Ruscheweyh et al. 2022)," : "", + params.run_ganon ? "ganon (Piro et al. 2020)" : "", + params.run_kmcp ? "KMCP (Shen et al. 2023)" : "", + "." + ].join(' ').trim() + + def text_visualisation = [ + "Visualisation of results, where supported, was performed with Krona (Ondov et al. 2011)." + ].join(' ').trim() + + def text_postprocessing = [ + "Standardisation of taxonomic profiles was carried out with TAXPASTA (Beber et al. 2023).", + ].join(' ').trim() + def citation_text = [ - "Tools used in the workflow included:", - "FastQC (Andrews 2010),", - "MultiQC (Ewels et al. 2016)", - "." - ].join(' ').trim() + "Tools used in the workflow included:", + text_seq_qc, + params.perform_shortread_qc ? text_shortread_qc : "", + params.perform_longread_qc ? text_longread_qc : "", + params.perform_shortread_complexityfilter ? text_shortreadcomplexity : "", + params.perform_shortread_hostremoval ? text_shortreadhostremoval : "", + params.perform_longread_hostremoval ? 
text_longreadhostremoval : "", + text_classification, + params.run_krona ? text_visualisation : "", + params.run_profile_standardisation ? text_postprocessing : "", + "Pipeline results statistics were summarised with MultiQC (Ewels et al. 2016)." + ].join(' ').trim().replaceAll("[,|.] +\\.", ".") return citation_text } def toolBibliographyText() { - // TODO nf-core: Optionally add bibliographic entries to this list. - // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
<li>Author (2023) Pub name, Journal, DOI
</li>" : "", - // Uncomment function in methodsDescriptionText to render in MultiQC report + def text_seq_qc = [ + params.preprocessing_qc_tool == "falco" ? "
<li>de Sena Brandine, G., & Smith, A. D. (2021). Falco: high-speed FastQC emulation for quality control of sequencing data. F1000Research, 8(1874), 1874. 10.12688/f1000research.21142.2
</li>" : "", + params.preprocessing_qc_tool == "fastqc" ? "
<li>Andrews S. (2010) FastQC: A Quality Control Tool for High Throughput Sequence Data, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
</li>" : "", + ].join(' ').trim() + + + def text_shortread_qc = [ + params.shortread_qc_tool == "adapterremoval" ? "
<li>Schubert, M., Lindgreen, S., & Orlando, L. (2016). AdapterRemoval v2: rapid adapter trimming, identification, and read merging. BMC Research Notes, 9, 88. 10.1186/s13104-016-1900-2
</li>" : "", + ].join(' ').trim() + + def text_longread_qc = [ + !params.longread_qc_skipadaptertrim ? "
<li>Wick, R. R., Judd, L. M., Gorrie, C. L., & Holt, K. E. (2017). Completing bacterial genome assemblies with multiplex MinION sequencing. Microbial Genomics, 3(10), e000132. 10.1099/mgen.0.000132
</li>" : "", + !params.longread_qc_skipqualityfilter ? "
<li>Wick R. (2021) Filtlong, URL: https://github.com/rrwick/Filtlong
</li>" : "" + ].join(' ').trim() + + def text_shortreadcomplexity = [ + params.shortread_complexityfilter_tool == "bbduk" ? "
<li>Bushnell B. (2022) BBMap, URL: http://sourceforge.net/projects/bbmap/
</li>" : "", + params.shortread_complexityfilter_tool == "prinseqplusplus" ? "
<li>Cantu, V. A., Sadural, J., & Edwards, R. (2019). PRINSEQ++, a multi-threaded tool for fast and efficient quality control and preprocessing of sequencing datasets (e27553v1). PeerJ Preprints. 10.7287/peerj.preprints.27553v1
</li>" : "", + ].join(' ').trim() + + def text_shortreadhostremoval = [ + "
<li>Langmead, B., & Salzberg, S. L. (2012). Fast gapped-read alignment with Bowtie 2. Nature Methods, 9(4), 357–359. 10.1038/nmeth.1923
</li>", + ].join(' ').trim() + + def text_longreadhostremoval = [ + "
<li>Li, H. (2018). Minimap2: pairwise alignment for nucleotide sequences. Bioinformatics , 34(18), 3094–3100. 10.1093/bioinformatics/bty191
</li>", + ].join(' ').trim() + + + def text_classification = [ + params.run_bracken ? "
<li>Lu, J., Breitwieser, F. P., Thielen, P., & Salzberg, S. L. (2017). Bracken: estimating species abundance in metagenomics data. PeerJ. Computer Science, 3(e104), e104. 10.7717/peerj-cs.104
</li>" : "", + params.run_kraken2 ? "
<li>Wood, D. E., Lu, J., & Langmead, B. (2019). Improved metagenomic analysis with Kraken 2. Genome Biology, 20(1), 257. 10.1186/s13059-019-1891-0
</li>" : "", + params.run_krakenuniq ? "
<li>Breitwieser, F. P., Baker, D. N., & Salzberg, S. L. (2018). KrakenUniq: confident and fast metagenomics classification using unique k-mer counts. Genome Biology, 19(1), 198. 10.1186/s13059-018-1568-0
</li>" : "", + params.run_metaphlan ? "
<li>Blanco-Míguez, A., Beghini, F., Cumbo, F., McIver, L. J., Thompson, K. N., Zolfo, M., Manghi, P., Dubois, L., Huang, K. D., Thomas, A. M., Nickols, W. A., Piccinno, G., Piperni, E., Punčochář, M., Valles-Colomer, M., Tett, A., Giordano, F., Davies, R., Wolf, J., … Segata, N. (2023). Extending and improving metagenomic taxonomic profiling with uncharacterized species using MetaPhlAn 4. Nature Biotechnology, 1–12. 10.1038/s41587-023-01688-w
</li>" : "", + params.run_malt ? "
<li>Vågene, Å. J., Herbig, A., Campana, M. G., Robles García, N. M., Warinner, C., Sabin, S., Spyrou, M. A., Andrades Valtueña, A., Huson, D., Tuross, N., Bos, K. I., & Krause, J. (2018). Salmonella enterica genomes from victims of a major sixteenth-century epidemic in Mexico. Nature Ecology & Evolution, 2(3), 520–528. 10.1038/s41559-017-0446-6
</li>" : "", + params.run_malt ? "
<li>Huson, D. H., Beier, S., Flade, I., Górska, A., El-Hadidi, M., Mitra, S., Ruscheweyh, H.-J., & Tappu, R. (2016). MEGAN Community Edition - Interactive Exploration and Analysis of Large-Scale Microbiome Sequencing Data. PLoS Computational Biology, 12(6), e1004957. 10.1371/journal.pcbi.1004957
</li>" : "", + params.run_diamond ? "
<li>Buchfink, B., Xie, C., & Huson, D. H. (2015). Fast and sensitive protein alignment using DIAMOND. Nature Methods, 12(1), 59–60. 10.1038/nmeth.3176
</li>" : "", + params.run_centrifuge ? "
<li>Kim, D., Song, L., Breitwieser, F. P., & Salzberg, S. L. (2016). Centrifuge: rapid and sensitive classification of metagenomic sequences. Genome Research, 26(12), 1721–1729. 10.1101/gr.210641.116
</li>" : "", + params.run_kaiju ? "
<li>Menzel, P., Ng, K. L., & Krogh, A. (2016). Fast and sensitive taxonomic classification for metagenomics with Kaiju. Nature Communications, 7, 11257. 10.1038/ncomms11257
</li>" : "", + params.run_motus ? "
<li>Ruscheweyh, H.-J., Milanese, A., Paoli, L., Karcher, N., Clayssen, Q., Keller, M. I., Wirbel, J., Bork, P., Mende, D. R., Zeller, G., & Sunagawa, S. (2022). Cultivation-independent genomes greatly expand taxonomic-profiling capabilities of mOTUs across various environments. Microbiome, 10(1), 212. 10.1186/s40168-022-01410-z
</li>" : "", + params.run_ganon ? "
<li>Piro, V. C., Dadi, T. H., Seiler, E., Reinert, K., & Renard, B. Y. (2020). Ganon: Precise metagenomics classification against large and up-to-date sets of reference sequences. Bioinformatics (Oxford, England), 36(Suppl_1), i12–i20. 10.1093/bioinformatics/btaa458
</li>" : "", + params.run_kmcp ? "
<li>Shen, W., Xiang, H., Huang, T., Tang, H., Peng, M., Cai, D., Hu, P., & Ren, H. (2023). KMCP: accurate metagenomic profiling of both prokaryotic and viral populations by pseudo-mapping. Bioinformatics (Oxford, England), 39(1). 10.1093/bioinformatics/btac845
</li>" : "", + ].join(' ').trim() + + def text_visualisation = [ + "
<li>Ondov, B. D., Bergman, N. H., & Phillippy, A. M. (2011). Interactive metagenomic visualization in a Web browser. BMC Bioinformatics, 12(1), 385. 10.1186/1471-2105-12-385
</li>" + ].join(' ').trim() + + def text_postprocessing = [ + "
<li>Beber, M. E., Borry, M., Stamouli, S., & Fellows Yates, J. A. (2023). TAXPASTA: TAXonomic Profile Aggregation and STAndardisation. Journal of Open Source Software, 8(87), 5627. 10.21105/joss.05627
</li>", + ].join(' ').trim() + + def text_extras = [ + // fastp shortread qc / complexity filtering + ( params.perform_shortread_qc && params.shortread_qc_tool == "fastp" ) || ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool == "fastp" ) ? "
<li>Chen, S., Zhou, Y., Chen, Y., & Gu, J. (2018). fastp: an ultra-fast all-in-one FASTQ preprocessor. Bioinformatics , 34(17), i884–i890. 10.1093/bioinformatics/bty560
</li>" : "", + // samtools long / short hostremoval + params.perform_shortread_hostremoval || params.perform_longread_hostremoval ? "
<li>Danecek, P., Bonfield, J. K., Liddle, J., Marshall, J., Ohan, V., Pollard, M. O., Whitwham, A., Keane, T., McCarthy, S. A., Davies, R. M., & Li, H. (2021). Twelve years of SAMtools and BCFtools. GigaScience, 10(2). 10.1093/gigascience/giab008
</li>" : "", + ].join(' ').trim() + def reference_text = [ - "
<li>Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
</li>", - "
<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354
</li>" - ].join(' ').trim() + text_seq_qc, + params.perform_shortread_qc ? text_shortread_qc : "", + params.perform_longread_qc ? text_longread_qc : "", + params.perform_shortread_complexityfilter ? text_shortreadcomplexity : "", + params.perform_shortread_hostremoval ? text_shortreadhostremoval : "", + params.perform_longread_hostremoval ? text_longreadhostremoval : "", + text_extras, + text_classification, + params.run_krona ? text_visualisation : "", + params.run_profile_standardisation ? text_postprocessing : "", + "
<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. 10.1093/bioinformatics/btw354.
</li>" + ].join(' ').trim().replaceAll("[,|.] +\\.", ".") return reference_text } def methodsDescriptionText(mqc_methods_yaml) { - // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file + def meta = [:] - meta.workflow = workflow.toMap() - meta["manifest_map"] = workflow.manifest.toMap() + + meta.workflow = run_workflow.toMap() + meta["manifest_map"] = run_workflow.manifest.toMap() // Pipeline DOI - meta["doi_text"] = meta.manifest_map.doi ? "(doi: ${meta.manifest_map.doi})" : "" + meta["doi_text"] = meta.manifest_map.doi ? "(doi: Stamouli et al. 2023)" : "" meta["nodoi_text"] = meta.manifest_map.doi ? "": "
<li>If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
</li>" - // Tool references meta["tool_citations"] = "" meta["tool_bibliography"] = "" - // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! - // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") - // meta["tool_bibliography"] = toolBibliographyText() - + meta["tool_citations"] = toolCitationText(params) + meta["tool_bibliography"] = toolBibliographyText(params) def methods_text = mqc_methods_yaml.text - def engine = new groovy.text.SimpleTemplateEngine() + def engine = new SimpleTemplateEngine() def description_html = engine.createTemplate(methods_text).make(meta) return description_html.toString() diff --git a/subworkflows/local/visualization_krona.nf b/subworkflows/local/visualization_krona.nf new file mode 100644 index 00000000..77e26a22 --- /dev/null +++ b/subworkflows/local/visualization_krona.nf @@ -0,0 +1,109 @@ +// +// Create Krona visualizations +// + +include { MEGAN_RMA2INFO as MEGAN_RMA2INFO_KRONA } from '../../modules/nf-core/megan/rma2info/main' +include { KAIJU_KAIJU2KRONA } from '../../modules/nf-core/kaiju/kaiju2krona/main' +include { KRAKENTOOLS_KREPORT2KRONA } from '../../modules/nf-core/krakentools/kreport2krona/main' +include { KRONA_CLEANUP } from '../../modules/local/krona_cleanup' +include { KRONA_KTIMPORTTEXT } from '../../modules/nf-core/krona/ktimporttext/main' +include { KRONA_KTIMPORTTAXONOMY } from '../../modules/nf-core/krona/ktimporttaxonomy/main' +include { GUNZIP } from '../../modules/nf-core/gunzip/main' + +workflow VISUALIZATION_KRONA { + take: + classifications + profiles + databases + + main: + ch_krona_text = Channel.empty() + ch_krona_html = Channel.empty() + ch_versions = Channel.empty() + + /* + Split profile results based on tool they come from + */ + ch_input_profiles = profiles + .branch { + centrifuge: it[0]['tool'] == 'centrifuge' + kraken2: it[0]['tool'] == 'kraken2' || it[0]['tool'] == 'kraken2-bracken' + unknown: true + } + ch_input_classifications = classifications + .branch { + kaiju: it[0]['tool'] == 'kaiju' + malt: it[0]['tool'] == 'malt' + unknown: true + } + + /* + Convert Kraken2 formatted reports into Krona text files + */ + ch_kraken_reports = ch_input_profiles.kraken2 + .map { + meta, report -> + [meta + [tool: meta.tool == 'bracken' ? 
'kraken2-bracken' : meta.tool], report] + } + .mix( ch_input_profiles.centrifuge ) + KRAKENTOOLS_KREPORT2KRONA ( ch_kraken_reports ) + ch_krona_text = ch_krona_text.mix( KRAKENTOOLS_KREPORT2KRONA.out.txt ) + ch_versions = ch_versions.mix( KRAKENTOOLS_KREPORT2KRONA.out.versions.first() ) + + /* + Combine Kaiju profiles with their databases + */ + ch_input_for_kaiju2krona = ch_input_classifications.kaiju + .map{ meta, profiles -> [[meta['tool'], meta['db_name']], meta, profiles] } + .combine( databases.map{ meta, db -> [[meta['tool'], meta['db_name']], db] }, by: 0 ) + .multiMap{ + it -> + profiles: [it[1], it[2]] + db: it[3] + } + + /* + Convert Kaiju formatted reports into Krona text files + */ + KAIJU_KAIJU2KRONA( ch_input_for_kaiju2krona.profiles, ch_input_for_kaiju2krona.db ) + ch_krona_text = ch_krona_text.mix( KAIJU_KAIJU2KRONA.out.txt ) + ch_versions = ch_versions.mix( KAIJU_KAIJU2KRONA.out.versions.first() ) + + /* + Remove taxonomy level annotations from the Krona text files + */ + KRONA_CLEANUP( ch_krona_text ) + ch_cleaned_krona_text = KRONA_CLEANUP.out.txt + ch_versions = ch_versions.mix( KRONA_CLEANUP.out.versions.first() ) + + /* + Convert Krona text files into html Krona visualizations + */ + ch_krona_text_for_import = ch_cleaned_krona_text + .map{[[id: it[0]['db_name'], tool: it[0]['tool']], it[1]]} + .groupTuple() + + KRONA_KTIMPORTTEXT( ch_krona_text_for_import ) + ch_krona_html = ch_krona_html.mix( KRONA_KTIMPORTTEXT.out.html ) + ch_versions = ch_versions.mix( KRONA_KTIMPORTTEXT.out.versions.first() ) + + /* + Convert MALT/MEGAN RMA2INFO files into html Krona visualisations + */ + if ( params.krona_taxonomy_directory ) { + MEGAN_RMA2INFO_KRONA ( ch_input_classifications.malt, false ) + GUNZIP ( MEGAN_RMA2INFO_KRONA.out.txt ) + ch_krona_taxonomy_for_input = GUNZIP.out.gunzip + .map{[[id: it[0]['db_name'], tool: it[0]['tool']], it[1]]} + .groupTuple() + + KRONA_KTIMPORTTAXONOMY ( ch_krona_taxonomy_for_input, file(params.krona_taxonomy_directory, checkExists: true) ) + ch_krona_html.mix( KRONA_KTIMPORTTAXONOMY.out.html ) + ch_versions = ch_versions.mix( MEGAN_RMA2INFO_KRONA.out.versions.first() ) + ch_versions = ch_versions.mix( KRONA_KTIMPORTTAXONOMY.out.versions.first() ) + } + + emit: + html = ch_krona_html + versions = ch_versions +} diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index e722ebaa..ce771382 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -10,6 +10,102 @@ include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_taxprofiler_pipeline' +include { validateParameters; paramsHelp; paramsSummaryLog; paramsSummaryMap; fromSamplesheet } from 'plugin/nf-validation' + +def logo = NfcoreTemplate.logo(workflow, params.monochrome_logs) +def citation = '\n' + WorkflowMain.citation(workflow) + '\n' +def summary_params = paramsSummaryMap(workflow) + +// Print parameter summary log to screen +log.info logo + paramsSummaryLog(workflow) + citation + +WorkflowTaxprofiler.initialise(params, log) + +// Check input path parameters to see if they exist +def checkPathParamList = [ params.input, params.genome, params.databases, + params.longread_hostremoval_index, + params.hostremoval_reference, params.shortread_hostremoval_index, + params.multiqc_config, params.shortread_qc_adapterlist, + 
params.krona_taxonomy_directory, + params.taxpasta_taxonomy_dir, + params.multiqc_logo, params.multiqc_methods_description + ] +for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } + +// Check mandatory parameters +if ( params.input ) { + ch_input = file(params.input, checkIfExists: true) +} else { + error("Input samplesheet not specified") +} + +if (params.databases) { ch_databases = file(params.databases, checkIfExists: true) } else { error('Input database sheet not specified!') } + +if (!params.shortread_qc_mergepairs && params.run_malt ) log.warn "[nf-core/taxprofiler] MALT does not accept uncollapsed paired-reads. Pairs will be profiled as separate files." +if (params.shortread_qc_includeunmerged && !params.shortread_qc_mergepairs) error("ERROR: [nf-core/taxprofiler] cannot include unmerged reads when merging is not turned on. Please specify --shortread_qc_mergepairs") + +if (params.shortread_complexityfilter_tool == 'fastp' && ( params.perform_shortread_qc == false || params.shortread_qc_tool != 'fastp' )) error("ERROR: [nf-core/taxprofiler] cannot use fastp complexity filtering if preprocessing not turned on and/or tool is not fastp. Please specify --perform_shortread_qc and/or --shortread_qc_tool 'fastp'") + +if (params.perform_shortread_hostremoval && !params.hostremoval_reference) { error("ERROR: [nf-core/taxprofiler] --shortread_hostremoval requested but no --hostremoval_reference FASTA supplied. Check input.") } +if (params.perform_shortread_hostremoval && !params.hostremoval_reference && params.shortread_hostremoval_index) { error("ERROR: [nf-core/taxprofiler] --shortread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input.") } +if (params.perform_longread_hostremoval && !params.hostremoval_reference && params.longread_hostremoval_index) { error("ERROR: [nf-core/taxprofiler] --longread_hostremoval_index provided but no --hostremoval_reference FASTA supplied. Check input.") } + +if (params.hostremoval_reference ) { ch_reference = file(params.hostremoval_reference) } +if (params.shortread_hostremoval_index ) { ch_shortread_reference_index = Channel.fromPath(params.shortread_hostremoval_index).map{[[], it]} } else { ch_shortread_reference_index = [] } +if (params.longread_hostremoval_index ) { ch_longread_reference_index = file(params.longread_hostremoval_index ) } else { ch_longread_reference_index = [] } + +if (params.diamond_save_reads ) log.warn "[nf-core/taxprofiler] DIAMOND only allows output of a single format. As --diamond_save_reads supplied, only aligned reads in SAM format will be produced, no taxonomic profiles will be available." + +if (params.run_malt && params.run_krona && !params.krona_taxonomy_directory) log.warn "[nf-core/taxprofiler] Krona can only be run on MALT output if path to Krona taxonomy database supplied to --krona_taxonomy_directory. Krona will not be executed in this run for MALT." +if (params.run_bracken && !params.run_kraken2) error('ERROR: [nf-core/taxprofiler] You are attempting to run Bracken without running kraken2. This is not possible! Please set --run_kraken2 as well.') + +if ( [params.taxpasta_add_name, params.taxpasta_add_rank, params.taxpasta_add_lineage, params.taxpasta_add_lineage, params.taxpasta_add_idlineage, params.taxpasta_add_ranklineage].any() && !params.taxpasta_taxonomy_dir ) error('ERROR: [nf-core/taxprofiler] All --taxpasta_add_* parameters require a taxonomy supplied to --taxpasta_taxonomy_dir. However the latter parameter was not detected. 
Please check input.') + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CONFIG FILES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// + +include { SHORTREAD_PREPROCESSING } from '../subworkflows/local/shortread_preprocessing' +include { LONGREAD_PREPROCESSING } from '../subworkflows/local/longread_preprocessing' +include { SHORTREAD_HOSTREMOVAL } from '../subworkflows/local/shortread_hostremoval' +include { LONGREAD_HOSTREMOVAL } from '../subworkflows/local/longread_hostremoval' +include { SHORTREAD_COMPLEXITYFILTERING } from '../subworkflows/local/shortread_complexityfiltering' +include { PROFILING } from '../subworkflows/local/profiling' +include { VISUALIZATION_KRONA } from '../subworkflows/local/visualization_krona' +include { STANDARDISATION_PROFILES } from '../subworkflows/local/standardisation_profiles' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: Installed directly from nf-core/modules +// +include { UNTAR } from '../modules/nf-core/untar/main' +include { FASTQC } from '../modules/nf-core/fastqc/main' +include { FALCO } from '../modules/nf-core/falco/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { CAT_FASTQ as MERGE_RUNS } from '../modules/nf-core/cat/fastq/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -20,21 +116,229 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_taxp workflow TAXPROFILER { take: - ch_samplesheet // channel: samplesheet read in from --input + samplesheet // channel: samplesheet read in from --input + databases // channel: databases from --databases main: ch_versions = Channel.empty() ch_multiqc_files = Channel.empty() - // - // MODULE: Run FastQC - // - FASTQC ( - ch_samplesheet - ) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + // Validate input files and create separate channels for FASTQ, FASTA, and Nanopore data + samplesheet + .branch { meta, run_accession, instrument_platform, fastq_1, fastq_2, fasta -> + //println "Mapping: meta=$meta, run_accession=$run_accession, instrument_platform=$instrument_platform, fastq_1=$fastq_1, fastq_2=$fastq_2, fasta=$fasta" + + meta.run_accession = run_accession + 
meta.instrument_platform = instrument_platform + + // Define single_end based on the conditions + meta.single_end = (fastq_1 && !fastq_2 && instrument_platform != 'OXFORD_NANOPORE') + + // Define is_fasta based on the presence of fasta + meta.is_fasta = fasta ? true : false + + if (!meta.is_fasta && !fastq_1) { + error("ERROR: Please check input samplesheet: entry `fastq_1` doesn't exist!") + } + if (meta.instrument_platform == 'OXFORD_NANOPORE' && fastq_2) { + error("Error: Please check input samplesheet: for Oxford Nanopore reads entry `fastq_2` should be empty!") + } + if (meta.single_end && fastq_2) { + error("Error: Please check input samplesheet: for single-end reads entry `fastq_2` should be empty") + } + // create fastq_se channel if single_end + fastq_se: meta.single_end + return [meta, [fastq_1]] + // + nanopore: instrument_platform == 'OXFORD_NANOPORE' + return [meta, [fastq_1]] + fastq_pe: fastq_2 + return [meta, [fastq_1, fastq_2]] + ch_fasta: meta.is_fasta && meta.single_end + return [meta, [fasta]] + } + .set { ch_input } + + // Merge ch_input.fastq_pe and ch_input.fastq_se into a single channel + def ch_fastq = ch_input.fastq_pe.mix(ch_input.fastq_se) + // Merge ch_fastq and ch_input.nanopore into a single channel + def ch_input_for_fastqc = ch_fastq.mix(ch_input.nanopore) + + // Validate databases + databases + .map { db_meta, db -> [db_meta.db_params] + def corrected_db_params = db_meta.db_params == null ? '' : db_meta.db_params + db_meta.db_params = corrected_db_params + } + .set { ch_databases } + + // Equivenent to "uniqueEntries" in schema_databases.json, but with better error message. Should we remove this part? + // ch_databases + // .map {db_meta, db -> [db_meta.tool, db_meta.db_name] } + // .groupTuple() + // .map { tool, db_name -> + // def unique_names = db_name.unique(false) + // if (unique_names.size() < db_name.size()) { + // error("[nf-core/taxprofiler] ERROR: Each database for a tool must have a unique name, duplicates detected. Tool: ${tool}, Database names: ${unique_names}") + // } + // } + + // Decompress + ch_dbs_for_untar = ch_databases + .branch { db_meta, db_path -> + //println "Branching: Database Meta: $db_meta, Database: $db_path" + untar: db_path.name.endsWith(".tar.gz") + skip: true + } + // Filter the channel to untar only those databases for tools that are selected to be run by the user. 
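A minimal sketch of what this filter does, assuming one kraken2 and one malt database archive and a run where only --run_kraken2 is enabled (database names here are illustrative, not from the diff):
// [[tool: 'kraken2', db_name: 'k2_db'], k2_db.tar.gz]     -> params.run_kraken2 is true  -> kept, passed on to UNTAR
// [[tool: 'malt',    db_name: 'malt_db'], malt_db.tar.gz] -> params.run_malt is false    -> filtered out, never decompressed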
+ ch_input_untar = ch_dbs_for_untar.untar + .filter { db_meta, db_path -> + //println "db_meta; $db_meta, Database Path: $db_path" + params["run_${db_meta.tool}"] + } + UNTAR (ch_input_untar) + + ch_final_dbs = ch_dbs_for_untar.skip.mix( UNTAR.out.untar ) + ch_versions = ch_versions.mix(UNTAR.out.versions.first()) + + /* + MODULE: Run FastQC + */ + + + if ( !params.skip_preprocessing_qc ) { + if ( params.preprocessing_qc_tool == 'falco' ) { + FALCO ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FALCO.out.versions.first()) + } else { + FASTQC ( ch_input_for_fastqc ) + ch_versions = ch_versions.mix(FASTQC.out.versions.first()) + } + } + + /* + SUBWORKFLOW: PERFORM PREPROCESSING + */ + + if ( params.perform_shortread_qc ) { + ch_shortreads_preprocessed = SHORTREAD_PREPROCESSING ( ch_fastq, adapterlist ).reads + ch_versions = ch_versions.mix( SHORTREAD_PREPROCESSING.out.versions ) + } else { + ch_shortreads_preprocessed = ch_fastq + } + + if ( params.perform_longread_qc ) { + ch_longreads_preprocessed = LONGREAD_PREPROCESSING ( ch_input.nanopore ).reads + .map { it -> [ it[0], [it[1]] ] } + ch_versions = ch_versions.mix( LONGREAD_PREPROCESSING.out.versions ) + } else { + ch_longreads_preprocessed = ch_input.nanopore + } + + /* + MODULE: REDUNDANCY ESTIMATION + */ + + if ( params.perform_shortread_redundancyestimation ) { + NONPAREIL ( ch_shortreads_preprocessed ) + ch_versions = ch_versions.mix( NONPAREIL.out.versions ) + } + + + /* + SUBWORKFLOW: COMPLEXITY FILTERING + */ + + // fastp complexity filtering is activated via modules.conf in shortread_preprocessing + if ( params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp' ) { + ch_shortreads_filtered = SHORTREAD_COMPLEXITYFILTERING ( ch_shortreads_preprocessed ).reads + ch_versions = ch_versions.mix( SHORTREAD_COMPLEXITYFILTERING.out.versions ) + } else { + ch_shortreads_filtered = ch_shortreads_preprocessed + } + + /* + SUBWORKFLOW: HOST REMOVAL + */ + + if ( params.perform_shortread_hostremoval ) { + ch_shortreads_hostremoved = SHORTREAD_HOSTREMOVAL ( ch_shortreads_filtered, ch_reference, ch_shortread_reference_index ).reads + ch_versions = ch_versions.mix(SHORTREAD_HOSTREMOVAL.out.versions) + } else { + ch_shortreads_hostremoved = ch_shortreads_filtered + } + + if ( params.perform_longread_hostremoval ) { + ch_longreads_hostremoved = LONGREAD_HOSTREMOVAL ( ch_longreads_preprocessed, ch_reference, ch_longread_reference_index ).reads + ch_versions = ch_versions.mix(LONGREAD_HOSTREMOVAL.out.versions) + } else { + ch_longreads_hostremoved = ch_longreads_preprocessed + } + + if ( params.perform_runmerging ) { + + ch_reads_for_cat_branch = ch_shortreads_hostremoved + .mix( ch_longreads_hostremoved ) + .map { + meta, reads -> + def meta_new = meta - meta.subMap('run_accession') + [ meta_new, reads ] + } + .groupTuple() + .map { + meta, reads -> + [ meta, reads.flatten() ] + } + .branch { + meta, reads -> + // we can't concatenate files if there is not a second run, we branch + // here to separate them out, and mix back in after for efficiency + cat: ( meta.single_end && reads.size() > 1 ) || ( !meta.single_end && reads.size() > 2 ) + skip: true + } + + ch_reads_runmerged = MERGE_RUNS ( ch_reads_for_cat_branch.cat ).reads + .mix( ch_reads_for_cat_branch.skip ) + .map { + meta, reads -> + [ meta, [ reads ].flatten() ] + } + .mix( ch_input.ch_fasta ) + + ch_versions = ch_versions.mix(MERGE_RUNS.out.versions) + + } else { + ch_reads_runmerged = ch_shortreads_hostremoved + .mix( 
ch_longreads_hostremoved, ch_input.ch_fasta ) + } + + /* + SUBWORKFLOW: PROFILING + */ + + PROFILING ( ch_reads_runmerged, ch_final_dbs ) + ch_versions = ch_versions.mix( PROFILING.out.versions ) + + /* + SUBWORKFLOW: VISUALIZATION_KRONA + */ + if ( params.run_krona ) { + VISUALIZATION_KRONA ( PROFILING.out.classifications, PROFILING.out.profiles, ch_final_dbs ) + ch_versions = ch_versions.mix( VISUALIZATION_KRONA.out.versions ) + } + + /* + SUBWORKFLOW: PROFILING STANDARDISATION + */ + if ( params.run_profile_standardisation ) { + STANDARDISATION_PROFILES ( PROFILING.out.classifications, PROFILING.out.profiles, ch_final_dbs, PROFILING.out.motus_version ) + ch_versions = ch_versions.mix( STANDARDISATION_PROFILES.out.versions ) + } + + /* + MODULE: MultiQC + */ // // Collate and save software versions @@ -43,6 +347,7 @@ workflow TAXPROFILER { .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_pipeline_software_mqc_versions.yml', sort: true, newLine: true) .set { ch_collated_versions } +<<<<<<< HEAD // // MODULE: MultiQC // @@ -56,6 +361,58 @@ workflow TAXPROFILER { ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions) ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml', sort: false)) +======= + + workflow_summary = WorkflowTaxprofiler.paramsSummaryMultiqc(workflow, summary_params) + ch_workflow_summary = Channel.value(workflow_summary) + + methods_description = WorkflowTaxprofiler.methodsDescriptionText(workflow, ch_multiqc_custom_methods_description, params) + ch_methods_description = Channel.value(methods_description) + + ch_multiqc_files = Channel.empty() + ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml')) + ch_multiqc_files = ch_multiqc_files.mix(CUSTOM_DUMPSOFTWAREVERSIONS.out.mqc_yml.collect()) + + if ( !params.skip_preprocessing_qc ) { + if ( params.preprocessing_qc_tool == 'falco' ) { + // only mix in files actually used by MultiQC + ch_multiqc_files = ch_multiqc_files.mix(FALCO.out.txt + .map { meta, reports -> reports } + .flatten() + .filter { path -> path.name.endsWith('_data.txt')} + .ifEmpty([])) + } else { + ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}.ifEmpty([])) + } + } + + if (params.perform_shortread_qc) { + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + } + + if (params.perform_longread_qc) { + ch_multiqc_files = ch_multiqc_files.mix( LONGREAD_PREPROCESSING.out.mqc.collect{it[1]}.ifEmpty([]) ) + } + + if (params.perform_shortread_complexityfilter && params.shortread_complexityfilter_tool != 'fastp'){ + ch_multiqc_files = ch_multiqc_files.mix( SHORTREAD_COMPLEXITYFILTERING.out.mqc.collect{it[1]}.ifEmpty([]) ) + } + + if (params.perform_shortread_hostremoval) { + ch_multiqc_files = ch_multiqc_files.mix(SHORTREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) + } + + if (params.perform_longread_hostremoval) { + ch_multiqc_files = ch_multiqc_files.mix(LONGREAD_HOSTREMOVAL.out.mqc.collect{it[1]}.ifEmpty([])) + } + + ch_multiqc_files = ch_multiqc_files.mix( PROFILING.out.mqc.collect{it[1]}.ifEmpty([]) ) + + if ( params.run_profile_standardisation ) { + ch_multiqc_files = ch_multiqc_files.mix( 
STANDARDISATION_PROFILES.out.mqc.collect{it[1]}.ifEmpty([]) ) + } +>>>>>>> dev MULTIQC ( ch_multiqc_files.collect(),