From 5e8cf1006c6055372a720464f0d81fefeb5516d1 Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Mon, 8 Apr 2024 14:59:22 +0200 Subject: [PATCH 01/16] add db_type feature to the database sheet --- .github/workflows/ci.yml | 6 ++++-- assets/schema_database.json | 7 ++++++- conf/test.config | 2 +- conf/test_adapterremoval.config | 2 +- conf/test_bbduk.config | 2 +- conf/test_falco.config | 2 +- conf/test_fastp.config | 2 +- conf/test_full.config | 2 +- conf/test_krakenuniq.config | 2 +- conf/test_malt.config | 2 +- conf/test_nopreprocessing.config | 2 +- conf/test_noprofiling.config | 2 +- conf/test_nothing.config | 2 +- conf/test_prinseqplusplus.config | 2 +- subworkflows/local/profiling.nf | 8 ++++++++ 15 files changed, 30 insertions(+), 15 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e6d5d4df..a5eb375f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -65,8 +65,10 @@ jobs: if [[ "${{ matrix.tags }}" == "test_motus" ]]; then wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py python downloadDB.py --no-download-progress - echo 'tool,db_name,db_params,db_path' > 'database_motus.csv' - echo "motus,db_mOTU,,db_mOTU" >> 'database_motus.csv' + echo 'tool,db_name,db_params,db_type,db_path' > 'database_motus.csv' + echo "motus,db1_mOTU,,short,db_mOTU" >> 'database_motus.csv' + echo "motus,db2_mOTU,prep_long,long,db_mOTU" >> 'database_motus.csv' + echo "motus,db2_mOTU,,both,db_mOTU" >> 'database_motus.csv' nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }}; else nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }}; diff --git a/assets/schema_database.json b/assets/schema_database.json index 1f52a25c..fcf52fb6 100644 --- a/assets/schema_database.json +++ b/assets/schema_database.json @@ -57,6 +57,11 @@ "errorMessage": "Invalid database db_params entry. No quotes allowed.", "meta": ["db_params"] }, + "db_type": { + "type": "string", + "enum": ["short", "long", "both"], + "meta": ["db_type"] + }, "db_path": { "type": "string", "exists": true, @@ -64,7 +69,7 @@ "errorMessage": "db_path should be either a file path or a directory."
} }, - "required": ["tool", "db_name", "db_path"], + "required": ["tool", "db_name", "db_type", "db_path"], "uniqueEntries": ["tool", "db_name"] } } diff --git a/conf/test.config b/conf/test.config index c11f27b6..d3f5969d 100644 --- a/conf/test.config +++ b/conf/test.config @@ -21,7 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true perform_longread_qc = true shortread_qc_mergepairs = true diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index c3422d02..5419a276 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -21,7 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true perform_longread_qc = true shortread_qc_tool = 'adapterremoval' diff --git a/conf/test_bbduk.config b/conf/test_bbduk.config index 623fe191..c73823c4 100644 --- a/conf/test_bbduk.config +++ b/conf/test_bbduk.config @@ -21,7 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true perform_longread_qc = true perform_shortread_complexityfilter = true diff --git a/conf/test_falco.config b/conf/test_falco.config index 3fb77c03..ff1e9ded 100644 --- a/conf/test_falco.config +++ b/conf/test_falco.config @@ -21,7 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' preprocessing_qc_tool = 'falco' perform_shortread_qc = true perform_longread_qc = true diff --git a/conf/test_fastp.config b/conf/test_fastp.config index 3feeae7a..7a896e42 100644 --- a/conf/test_fastp.config +++ b/conf/test_fastp.config @@ -21,7 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true perform_longread_qc = true shortread_qc_tool = 'fastp' diff --git a/conf/test_full.config b/conf/test_full.config index 2a74a80b..8dfa432b 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -14,7 +14,7 @@ params { // Input data for full size test input = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/samplesheet_full.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_full_v1.1.csv' + databases = 
'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_full_v1.2.csv' // Genome references hostremoval_reference = 'ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/819/615/GCA_000819615.1_ViralProj14015/GCA_000819615.1_ViralProj14015_genomic.fna.gz' diff --git a/conf/test_krakenuniq.config b/conf/test_krakenuniq.config index e93de158..61827b83 100644 --- a/conf/test_krakenuniq.config +++ b/conf/test_krakenuniq.config @@ -25,7 +25,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_krakenuniq.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_krakenuniq_v1.2.csv' perform_shortread_qc = true perform_longread_qc = true shortread_qc_mergepairs = true diff --git a/conf/test_malt.config b/conf/test_malt.config index 7e5f2df3..b5390972 100644 --- a/conf/test_malt.config +++ b/conf/test_malt.config @@ -25,7 +25,7 @@ params { // Input data input = 'https://github.com/nf-core/test-datasets/raw/taxprofiler/samplesheet_malt.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = false perform_longread_qc = false perform_shortread_complexityfilter = false diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config index 004a49e8..bd4f68e5 100644 --- a/conf/test_nopreprocessing.config +++ b/conf/test_nopreprocessing.config @@ -21,7 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = false perform_longread_qc = false perform_shortread_complexityfilter = false diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config index 7cf2317d..9ad84acf 100644 --- a/conf/test_noprofiling.config +++ b/conf/test_noprofiling.config @@ -21,7 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true perform_longread_qc = true shortread_qc_mergepairs = true diff --git a/conf/test_nothing.config b/conf/test_nothing.config index ed247ef4..577eb3d0 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -21,7 +21,7 @@ params { // Input data input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = false perform_longread_qc = false perform_shortread_complexityfilter = false diff --git a/conf/test_prinseqplusplus.config b/conf/test_prinseqplusplus.config index acc23aa8..b4beea92 100644 --- a/conf/test_prinseqplusplus.config +++ b/conf/test_prinseqplusplus.config @@ -21,7 +21,7 @@ params { // Input data input = 
'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true perform_longread_qc = true perform_shortread_complexityfilter = true diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index e306f1de..2241d1eb 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -67,6 +67,14 @@ workflow PROFILING { [meta + [id: "${meta.id}${meta.single_end ? '_se' : '_pe'}"], reads] } .combine(databases) + .filter { it -> + def platform = it[0]['instrument_platform'] + def db_type = it[2]['db_type'] + def is_long_read = platform == 'OXFORD_NANOPORE' + def is_long_db = db_type == 'long' || db_type == 'both' + def is_short_db = db_type == 'short' || db_type == 'both' + (is_long_read && is_long_db) || (!is_long_read && is_short_db) + } .branch { centrifuge: it[2]['tool'] == 'centrifuge' diamond: it[2]['tool'] == 'diamond' From 226eb7b7b9a84ce576163e56091ac530e064583d Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Thu, 11 Apr 2024 09:41:26 +0200 Subject: [PATCH 02/16] fix the merge conflicts --- conf/test.config | 32 -------------------------------- conf/test_adapterremoval.config | 26 -------------------------- conf/test_fastp.config | 27 --------------------------- conf/test_noprofiling.config | 26 -------------------------- conf/test_nothing.config | 25 ------------------------- 5 files changed, 136 deletions(-) diff --git a/conf/test.config b/conf/test.config index fc96f44a..47e2c57a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,7 +20,6 @@ params { max_time = '6.h' // Input data -<<<<<<< HEAD input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true @@ -49,37 +48,6 @@ params { kraken2_save_reads = true centrifuge_save_reads = true run_profile_standardisation = true -======= - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' - perform_shortread_qc = true - perform_longread_qc = true - shortread_qc_mergepairs = true - perform_shortread_redundancyestimation = true - perform_shortread_complexityfilter = true - perform_shortread_hostremoval = true - perform_longread_hostremoval = true - perform_runmerging = true - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = true - run_kraken2 = true - run_bracken = true - run_malt = false - run_metaphlan = true - run_centrifuge = true - run_diamond = true - run_krakenuniq = true - run_motus = false - run_ganon = true - run_krona = true - run_kmcp = true - kmcp_mode = 0 - krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab' - malt_save_reads = true - kraken2_save_reads = true - centrifuge_save_reads = true - run_profile_standardisation = true ->>>>>>> bouncy-basenji } process { diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index bfddd117..d6582373 100644 --- a/conf/test_adapterremoval.config +++ 
b/conf/test_adapterremoval.config @@ -20,7 +20,6 @@ params { max_time = '6.h' // Input data -<<<<<<< HEAD input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true @@ -43,31 +42,6 @@ params { run_ganon = false run_kmcp = false kmcp_mode = 0 -======= - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' - perform_shortread_qc = true - perform_longread_qc = true - shortread_qc_tool = 'adapterremoval' - perform_shortread_redundancyestimation = true - perform_shortread_complexityfilter = true - perform_shortread_hostremoval = true - perform_longread_hostremoval = true - perform_runmerging = true - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = true - run_kraken2 = true - run_bracken = false - run_malt = false - run_metaphlan = false - run_centrifuge = false - run_diamond = false - run_krakenuniq = false - run_motus = false - run_ganon = false - run_kmcp = false - kmcp_mode = 0 ->>>>>>> bouncy-basenji } process { diff --git a/conf/test_fastp.config b/conf/test_fastp.config index 836bef09..57284db6 100644 --- a/conf/test_fastp.config +++ b/conf/test_fastp.config @@ -20,7 +20,6 @@ params { max_time = '6.h' // Input data -<<<<<<< HEAD input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true @@ -44,32 +43,6 @@ params { run_ganon = false run_kmcp = false kmcp_mode = 0 -======= - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' - perform_shortread_qc = true - perform_longread_qc = true - shortread_qc_tool = 'fastp' - perform_shortread_redundancyestimation = true - perform_shortread_complexityfilter = true - shortread_complexityfilter_tool = 'fastp' - perform_shortread_hostremoval = true - perform_longread_hostremoval = true - perform_runmerging = true - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = true - run_kraken2 = true - run_bracken = false - run_malt = false - run_metaphlan = false - run_centrifuge = false - run_diamond = false - run_krakenuniq = false - run_motus = false - run_ganon = false - run_kmcp = false - kmcp_mode = 0 ->>>>>>> bouncy-basenji } process { diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config index 73667bd2..6c169408 100644 --- a/conf/test_noprofiling.config +++ b/conf/test_noprofiling.config @@ -20,7 +20,6 @@ params { max_time = '6.h' // Input data -<<<<<<< HEAD input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = true @@ -43,31 +42,6 @@ params { run_kmcp = false kmcp_mode = 0 run_ganon = false -======= - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 
'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' - perform_shortread_qc = true - perform_longread_qc = true - shortread_qc_mergepairs = true - perform_shortread_redundancyestimation = true - perform_shortread_complexityfilter = true - perform_shortread_hostremoval = true - perform_longread_hostremoval = true - perform_runmerging = true - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = false - run_kraken2 = false - run_bracken = false - run_malt = false - run_metaphlan = false - run_centrifuge = false - run_diamond = false - run_krakenuniq = false - run_motus = false - run_kmcp = false - kmcp_mode = 0 - run_ganon = false ->>>>>>> bouncy-basenji } process { diff --git a/conf/test_nothing.config b/conf/test_nothing.config index 91bdbaf7..93d126bf 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -20,7 +20,6 @@ params { max_time = '6.h' // Input data -<<<<<<< HEAD input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = false @@ -42,30 +41,6 @@ params { run_kmcp = false kmcp_mode = 0 run_ganon = false -======= - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' - perform_shortread_qc = false - perform_longread_qc = false - perform_shortread_complexityfilter = false - perform_shortread_redundancyestimation = false - perform_shortread_hostremoval = false - perform_longread_hostremoval = false - perform_runmerging = false - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = false - run_kraken2 = false - run_bracken = false - run_malt = false - run_metaphlan = false - run_centrifuge = false - run_diamond = false - run_krakenuniq = false - run_motus = false - run_kmcp = false - kmcp_mode = 0 - run_ganon = false ->>>>>>> bouncy-basenji } process { From 921130810434f3d4f77f257efd281520abc10f08 Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Thu, 11 Apr 2024 09:54:25 +0200 Subject: [PATCH 03/16] Add missing information after fixing the merge conflicts. 
--- conf/test.config | 1 + conf/test_adapterremoval.config | 1 + conf/test_fastp.config | 1 + conf/test_nopreprocessing.config | 26 -------------------------- conf/test_noprofiling.config | 1 + conf/test_nothing.config | 1 + 6 files changed, 5 insertions(+), 26 deletions(-) diff --git a/conf/test.config b/conf/test.config index 47e2c57a..d6395c94 100644 --- a/conf/test.config +++ b/conf/test.config @@ -25,6 +25,7 @@ params { perform_shortread_qc = true perform_longread_qc = true shortread_qc_mergepairs = true + perform_shortread_redundancyestimation = true perform_shortread_complexityfilter = true perform_shortread_hostremoval = true perform_longread_hostremoval = true diff --git a/conf/test_adapterremoval.config b/conf/test_adapterremoval.config index d6582373..be77ded0 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -25,6 +25,7 @@ params { perform_shortread_qc = true perform_longread_qc = true shortread_qc_tool = 'adapterremoval' + perform_shortread_redundancyestimation = true perform_shortread_complexityfilter = true perform_shortread_hostremoval = true perform_longread_hostremoval = true diff --git a/conf/test_fastp.config b/conf/test_fastp.config index 57284db6..ebd8f618 100644 --- a/conf/test_fastp.config +++ b/conf/test_fastp.config @@ -25,6 +25,7 @@ params { perform_shortread_qc = true perform_longread_qc = true shortread_qc_tool = 'fastp' + perform_shortread_redundancyestimation = true perform_shortread_complexityfilter = true shortread_complexityfilter_tool = 'fastp' perform_shortread_hostremoval = true diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config index 7f870617..441600b4 100644 --- a/conf/test_nopreprocessing.config +++ b/conf/test_nopreprocessing.config @@ -20,7 +20,6 @@ params { max_time = '6.h' // Input data -<<<<<<< HEAD input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = false @@ -43,31 +42,6 @@ params { kmcp_mode = 0 run_ganon = true run_krona = true -======= - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.1.csv' - perform_shortread_qc = false - perform_longread_qc = false - perform_shortread_redundancyestimation = false - perform_shortread_complexityfilter = false - perform_shortread_hostremoval = false - perform_longread_hostremoval = false - perform_runmerging = false - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = true - run_kraken2 = true - run_bracken = true - run_malt = false // too big with other profiles on GHA - run_metaphlan = true - run_centrifuge = true - run_diamond = true - run_krakenuniq = true - run_motus = false - run_kmcp = true - kmcp_mode = 0 - run_ganon = true - run_krona = true ->>>>>>> bouncy-basenji } process { diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config index 6c169408..9380980f 100644 --- a/conf/test_noprofiling.config +++ b/conf/test_noprofiling.config @@ -25,6 +25,7 @@ params { perform_shortread_qc = true perform_longread_qc = true shortread_qc_mergepairs = true + perform_shortread_redundancyestimation = true perform_shortread_complexityfilter = true perform_shortread_hostremoval = true perform_longread_hostremoval = true diff --git 
a/conf/test_nothing.config b/conf/test_nothing.config index 93d126bf..d5a52c81 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -24,6 +24,7 @@ params { databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' perform_shortread_qc = false perform_longread_qc = false + perform_shortread_redundancyestimation = true perform_shortread_complexityfilter = false perform_shortread_hostremoval = false perform_longread_hostremoval = false From ba2a2ed3689345cac76b2de51353bce01c3392e3 Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Thu, 11 Apr 2024 10:08:01 +0200 Subject: [PATCH 04/16] format config files --- conf/test.config | 56 ++++++++++++++++----------------- conf/test_adapterremoval.config | 44 +++++++++++++------------- conf/test_fastp.config | 46 +++++++++++++-------------- conf/test_noprofiling.config | 44 +++++++++++++------------- conf/test_nothing.config | 42 ++++++++++++------------- 5 files changed, 116 insertions(+), 116 deletions(-) diff --git a/conf/test.config b/conf/test.config index d6395c94..1e59686a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,35 +20,35 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' - perform_shortread_qc = true - perform_longread_qc = true - shortread_qc_mergepairs = true + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_mergepairs = true perform_shortread_redundancyestimation = true - perform_shortread_complexityfilter = true - perform_shortread_hostremoval = true - perform_longread_hostremoval = true - perform_runmerging = true - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = true - run_kraken2 = true - run_bracken = true - run_malt = false - run_metaphlan = true - run_centrifuge = true - run_diamond = true - run_krakenuniq = true - run_motus = false - run_ganon = true - run_krona = true - run_kmcp = true - kmcp_mode = 0 - krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab' - malt_save_reads = true - kraken2_save_reads = true - centrifuge_save_reads = true - run_profile_standardisation = true + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = true + run_malt = false + run_metaphlan = true + run_centrifuge = true + run_diamond = true + run_krakenuniq = true + run_motus = false + run_ganon = true + run_krona = true + run_kmcp = true + kmcp_mode = 0 + krona_taxonomy_directory = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/sarscov2/metagenome/krona_taxonomy.tab' + malt_save_reads = true + kraken2_save_reads = true + centrifuge_save_reads = true + run_profile_standardisation = true } process { diff --git a/conf/test_adapterremoval.config 
b/conf/test_adapterremoval.config index be77ded0..73c5ae9f 100644 --- a/conf/test_adapterremoval.config +++ b/conf/test_adapterremoval.config @@ -20,29 +20,29 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' - perform_shortread_qc = true - perform_longread_qc = true - shortread_qc_tool = 'adapterremoval' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_tool = 'adapterremoval' perform_shortread_redundancyestimation = true - perform_shortread_complexityfilter = true - perform_shortread_hostremoval = true - perform_longread_hostremoval = true - perform_runmerging = true - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = true - run_kraken2 = true - run_bracken = false - run_malt = false - run_metaphlan = false - run_centrifuge = false - run_diamond = false - run_krakenuniq = false - run_motus = false - run_ganon = false - run_kmcp = false - kmcp_mode = 0 + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_ganon = false + run_kmcp = false + kmcp_mode = 0 } process { diff --git a/conf/test_fastp.config b/conf/test_fastp.config index ebd8f618..dcfbbfbf 100644 --- a/conf/test_fastp.config +++ b/conf/test_fastp.config @@ -20,30 +20,30 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' - perform_shortread_qc = true - perform_longread_qc = true - shortread_qc_tool = 'fastp' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_tool = 'fastp' perform_shortread_redundancyestimation = true - perform_shortread_complexityfilter = true - shortread_complexityfilter_tool = 'fastp' - perform_shortread_hostremoval = true - perform_longread_hostremoval = true - perform_runmerging = true - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = true - run_kraken2 = true - run_bracken = false - run_malt = false - run_metaphlan = false - run_centrifuge = false - run_diamond = false - run_krakenuniq = false - run_motus = false - run_ganon = false - run_kmcp = false - kmcp_mode = 0 + perform_shortread_complexityfilter = true + shortread_complexityfilter_tool = 'fastp' + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + 
hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_ganon = false + run_kmcp = false + kmcp_mode = 0 } process { diff --git a/conf/test_noprofiling.config b/conf/test_noprofiling.config index 9380980f..6b9182c3 100644 --- a/conf/test_noprofiling.config +++ b/conf/test_noprofiling.config @@ -20,29 +20,29 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' - perform_shortread_qc = true - perform_longread_qc = true - shortread_qc_mergepairs = true + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' + perform_shortread_qc = true + perform_longread_qc = true + shortread_qc_mergepairs = true perform_shortread_redundancyestimation = true - perform_shortread_complexityfilter = true - perform_shortread_hostremoval = true - perform_longread_hostremoval = true - perform_runmerging = true - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = false - run_kraken2 = false - run_bracken = false - run_malt = false - run_metaphlan = false - run_centrifuge = false - run_diamond = false - run_krakenuniq = false - run_motus = false - run_kmcp = false - kmcp_mode = 0 - run_ganon = false + perform_shortread_complexityfilter = true + perform_shortread_hostremoval = true + perform_longread_hostremoval = true + perform_runmerging = true + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_kmcp = false + kmcp_mode = 0 + run_ganon = false } process { diff --git a/conf/test_nothing.config b/conf/test_nothing.config index d5a52c81..bdb48364 100644 --- a/conf/test_nothing.config +++ b/conf/test_nothing.config @@ -20,28 +20,28 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' - perform_shortread_qc = false - perform_longread_qc = false + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' + perform_shortread_qc = false + perform_longread_qc = false perform_shortread_redundancyestimation = true - perform_shortread_complexityfilter = false - perform_shortread_hostremoval = false - perform_longread_hostremoval = false - perform_runmerging = false - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = false - run_kraken2 = false - run_bracken = false - run_malt = false - run_metaphlan = false - 
run_centrifuge = false - run_diamond = false - run_krakenuniq = false - run_motus = false - run_kmcp = false - kmcp_mode = 0 - run_ganon = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = false + run_kraken2 = false + run_bracken = false + run_malt = false + run_metaphlan = false + run_centrifuge = false + run_diamond = false + run_krakenuniq = false + run_motus = false + run_kmcp = false + kmcp_mode = 0 + run_ganon = false } process { From a5f7c785034798f6cfbc899a727f877c184a7f3c Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Thu, 11 Apr 2024 10:17:44 +0200 Subject: [PATCH 05/16] update CHANGELOG.md --- CHANGELOG.md | 1 + conf/test_nopreprocessing.config | 45 ++++++++++++++++---------------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ea9134c4..4de8db60 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` - [#417](https://github.com/nf-core/taxprofiler/pull/417) - Added reference-free metagenome estimation with Nonpareil (added by @jfy133) +- [#466](https://github.com/nf-core/taxprofiler/pull/466) The new column `db_type` has been added to the database sheet to differentiate between long-read and short-read parameters in databases. ## v1.1.6dev - [unreleased] diff --git a/conf/test_nopreprocessing.config b/conf/test_nopreprocessing.config index 441600b4..49d544e3 100644 --- a/conf/test_nopreprocessing.config +++ b/conf/test_nopreprocessing.config @@ -20,28 +20,29 @@ params { max_time = '6.h' // Input data - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' - databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' - perform_shortread_qc = false - perform_longread_qc = false - perform_shortread_complexityfilter = false - perform_shortread_hostremoval = false - perform_longread_hostremoval = false - perform_runmerging = false - hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' - run_kaiju = true - run_kraken2 = true - run_bracken = true - run_malt = false // too big with other profiles on GHA - run_metaphlan = true - run_centrifuge = true - run_diamond = true - run_krakenuniq = true - run_motus = false - run_kmcp = true - kmcp_mode = 0 - run_ganon = true - run_krona = true + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/samplesheet.csv' + databases = 'https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/database_v1.2.csv' + perform_shortread_qc = false + perform_longread_qc = false + perform_shortread_redundancyestimation = false + perform_shortread_complexityfilter = false + perform_shortread_hostremoval = false + perform_longread_hostremoval = false + perform_runmerging = false + hostremoval_reference = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/genomics/homo_sapiens/genome/genome.fasta' + run_kaiju = true + run_kraken2 = true + run_bracken = true + run_malt = false // too big with other profiles on GHA + run_metaphlan = true + run_centrifuge = true + run_diamond = true + run_krakenuniq = true + run_motus = false + run_kmcp = 
true + kmcp_mode = 0 + run_ganon = true + run_krona = true } process { From b5c76f7a350ceac8dd4ca662c46e2330be985a95 Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Thu, 11 Apr 2024 13:10:03 +0200 Subject: [PATCH 06/16] Enable downloading the results of failed tests for debugging --- .github/workflows/ci.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5eb375f..4f442bcb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -73,3 +73,12 @@ jobs: else nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }}; fi + + - name: Upload results and logs on failure + if: failure() + uses: actions/upload-artifact@v4 + with: + name: logs-${{ matrix.profile }} + path: | + ./results_${{ matrix.tags }} + overwrite: true From abe18e51f0595b2a2771e7e106943329cda50cc4 Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Thu, 11 Apr 2024 13:53:40 +0200 Subject: [PATCH 07/16] Update ci.yml Update the mOTU database name in ci.yml --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4f442bcb..d6bf9264 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,7 +68,7 @@ jobs: echo 'tool,db_name,db_params,db_type,db_path' > 'database_motus.csv' echo "motus,db1_mOTU,,short,db_mOTU" >> 'database_motus.csv' echo "motus,db2_mOTU,prep_long,long,db_mOTU" >> 'database_motus.csv' - echo "motus,db2_mOTU,,both,db_mOTU" >> 'database_motus.csv' + echo "motus,db3_mOTU,,both,db_mOTU" >> 'database_motus.csv' nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }}; else nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }}; From 60087e7250ec6c13a9dbfbc49d6866d696ed5fa6 Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Thu, 11 Apr 2024 15:26:11 +0200 Subject: [PATCH 08/16] Update ci.yml --- .github/workflows/ci.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d6bf9264..fd0bed28 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -73,12 +73,3 @@ jobs: else nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }}; fi - - - name: Upload results and logs on failure - if: failure() - uses: actions/upload-artifact@v4 - with: - name: logs-${{ matrix.profile }} - path: | - ./results_${{ matrix.tags }} - overwrite: true From f8bd172937d8961eee4b79375f534c79dcc238ac Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Wed, 17 Apr 2024 08:04:54 +0200 Subject: [PATCH 09/16] Update CHANGELOG.md Co-authored-by: James A. 
Fellows Yates --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4de8db60..f8ed19d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### `Added` - [#417](https://github.com/nf-core/taxprofiler/pull/417) - Added reference-free metagenome estimation with Nonpareil (added by @jfy133) -- [#466](https://github.com/nf-core/taxprofiler/pull/466) The new column `db_type` has been added to the database sheet to differentiate between long-read and short-read parameters in databases. +- [#466](https://github.com/nf-core/taxprofiler/pull/466) - Input database sheets now require a `db_type` column to distinguish between short- and long-read databases ## v1.1.6dev - [unreleased] From 1f5d5eace0d152cc4a031c8789f8c36ffacda327 Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:01:14 +0200 Subject: [PATCH 10/16] combine reads and db by db_type --- assets/schema_database.json | 23 ++----------- subworkflows/local/profiling.nf | 61 +++++++++++++++++++-------------- workflows/taxprofiler.nf | 9 +++-- 3 files changed, 44 insertions(+), 49 deletions(-) diff --git a/assets/schema_database.json b/assets/schema_database.json index fcf52fb6..e5611894 100644 --- a/assets/schema_database.json +++ b/assets/schema_database.json @@ -36,30 +36,13 @@ "db_params": { "type": "string", "pattern": "^[^\"']*$", - "anyOf": [ - { - "properties": { - "tool": { "const": "bracken" } - }, - "not": { - "pattern": ".*;" - }, - "errorMessage": "Invalid database db_params entry. Bracken requires a semi-colon for passing one or more parameters." - }, - { - "properties": { - "tool": { "const": "kmcp" } - }, - "pattern": ".*;$", - "errorMessage": "Invalid database `db_params` entry. KMCP only requires a semi-colon if passing arguments to KMCP profile, in cases of which the arguments should go after the semi-colon." - } - ], "errorMessage": "Invalid database db_params entry. No quotes allowed.", "meta": ["db_params"] }, "db_type": { "type": "string", - "enum": ["short", "long", "both"], + "enum": ["short", "long", "short,long"], + "default": ["short,long"], "meta": ["db_type"] }, "db_path": { @@ -69,7 +52,7 @@ "errorMessage": "db_path should be either a file path or a directory." } }, - "required": ["tool", "db_name", "db_type", "db_path"], + "required": ["tool", "db_name", "db_path"], "uniqueEntries": ["tool", "db_name"] } } diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 2241d1eb..de80b9f6 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -61,33 +61,42 @@ workflow PROFILING { */ // e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - ch_input_for_profiling = reads - .map { - meta, reads -> - [meta + [id: "${meta.id}${meta.single_end ? 
'_se' : '_pe'}"], reads] - } - .combine(databases) - .filter { it -> - def platform = it[0]['instrument_platform'] - def db_type = it[2]['db_type'] - def is_long_read = platform == 'OXFORD_NANOPORE' - def is_long_db = db_type == 'long' || db_type == 'both' - def is_short_db = db_type == 'short' || db_type == 'both' - (is_long_read && is_long_db) || (!is_long_read && is_short_db) - } - .branch { - centrifuge: it[2]['tool'] == 'centrifuge' - diamond: it[2]['tool'] == 'diamond' - kaiju: it[2]['tool'] == 'kaiju' - kraken2: it[2]['tool'] == 'kraken2' || it[2]['tool'] == 'bracken' // to reuse the kraken module to produce the input data for bracken - krakenuniq: it[2]['tool'] == 'krakenuniq' - malt: it[2]['tool'] == 'malt' - metaphlan: it[2]['tool'] == 'metaphlan' - motus: it[2]['tool'] == 'motus' - kmcp: it[2]['tool'] == 'kmcp' - ganon: it[2]['tool'] == 'ganon' - unknown: true + ch_reads = reads + .map { meta, reads -> [ meta.type, meta.subMap( meta.keySet() - 'type' ), reads ] } + + ch_dbs = databases + .flatMap { db -> + def ( db_meta, db_path ) = db + def db_types = db_meta.db_type.replaceAll(/\[|\]/, '').split(',') //removes the square brackets and splits the string into a list ["short", "long"] + if ( db_types.size() > 1 ) { + return db_types.collect { it -> + def new_db_meta = db_meta.clone() + [new_db_meta,db_path] + } + } else { + return [ db ] } + } + .map{ meta, db -> [ meta.db_type, meta.subMap( meta.keySet() - 'db_type' ), db ] } + + ch_input_for_profiling = reads + .map { meta, reads -> [ meta.type, meta.subMap( meta.keySet() - 'type' ), reads ] } + .combine(ch_dbs, by: 0) + .map{ db_type, meta, reads, db_meta, db -> + [ meta, reads, db_meta, db ] } + .branch { meta, reads, db_meta, db -> + centrifuge: db_meta.tool == 'centrifuge' + diamond: db_meta.tool == 'diamond' + kaiju: db_meta.tool == 'kaiju' + kraken2: db_meta.tool == 'kraken2' || db_meta.tool == 'bracken' // to reuse the kraken module to produce the input data for bracken + krakenuniq: db_meta.tool == 'krakenuniq' + malt: db_meta.tool == 'malt' + metaphlan: db_meta.tool == 'metaphlan' + motus: db_meta.tool == 'motus' + kmcp: db_meta.tool == 'kmcp' + ganon: db_meta.tool == 'ganon' + unknown: true + } /* PREPARE PROFILER INPUT CHANNELS & RUN PROFILING diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 2399797f..fad397b8 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -135,13 +135,13 @@ workflow TAXPROFILER { } .branch { meta, run_accession, instrument_platform, fastq_1, fastq_2, fasta -> fastq: meta.single_end || fastq_2 - return [ meta, fastq_2 ? [ fastq_1, fastq_2 ] : [ fastq_1 ] ] + return [ meta + [ type: "short" ], fastq_2 ? 
[ fastq_1, fastq_2 ] : [ fastq_1 ] ] nanopore: instrument_platform == 'OXFORD_NANOPORE' meta.single_end = true - return [ meta, [ fastq_1 ] ] + return [ meta + [ type: "long" ], [ fastq_1 ] ] fasta: meta.is_fasta meta.single_end = true - return [ meta, [ fasta ] ] + return [ meta + [ type: "short" ], [ fasta ] ] } // Merge ch_input.fastq and ch_input.nanopore into a single channel @@ -150,6 +150,9 @@ workflow TAXPROFILER { // Validate and decompress databases ch_dbs_for_untar = databases .branch { db_meta, db_path -> + if ( !db_meta.db_type ) { + db_meta = db_meta + [ db_type: "short,long" ] + } untar: db_path.name.endsWith( ".tar.gz" ) skip: true } From 301119ac9e5d3eb3d9f1098761aad692236b2642 Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Thu, 13 Jun 2024 16:39:03 +0200 Subject: [PATCH 11/16] Update ci.yml correct the definition of db_type for both short and long reads --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 21b49ed3..3fd66013 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -67,8 +67,8 @@ jobs: python downloadDB.py --no-download-progress echo 'tool,db_name,db_params,db_type,db_path' > 'database_motus.csv' echo "motus,db1_mOTU,,short,db_mOTU" >> 'database_motus.csv' - echo "motus,db2_mOTU,prep_long,long,db_mOTU" >> 'database_motus.csv' - echo "motus,db3_mOTU,,both,db_mOTU" >> 'database_motus.csv' + echo "motus,db2_mOTU,,long,db_mOTU" >> 'database_motus.csv' + echo "motus,db3_mOTU,,\"short,long\",db_mOTU" >> 'database_motus.csv' nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }}; else nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }}; fi From 749cd28cb52f1a0b6c2041df3b5a7cc975c9aca0 Mon Sep 17 00:00:00 2001 From: Lili Andersson-Li <64467552+LilyAnderssonLee@users.noreply.github.com> Date: Fri, 14 Jun 2024 08:53:54 +0200 Subject: [PATCH 12/16] Update profiling.nf --- subworkflows/local/profiling.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 55520c82..8028ef69 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -69,6 +69,7 @@ workflow PROFILING { if ( db_types.size() > 1 ) { return db_types.collect { it -> def new_db_meta = db_meta.clone() + new_db_meta.db_type = it [new_db_meta,db_path] } } else { From 862937b8ddb2881115e1715f562aaa4b58d4f9ae Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Tue, 18 Jun 2024 13:41:22 +0200 Subject: [PATCH 13/16] separate combined db_type values (short;long) with a semicolon --- assets/schema_database.json | 4 ++-- subworkflows/local/profiling.nf | 2 +- workflows/taxprofiler.nf | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/assets/schema_database.json b/assets/schema_database.json index e5611894..be401809 100644 --- a/assets/schema_database.json +++ b/assets/schema_database.json @@ -41,8 +41,8 @@ }, "db_type": { "type": "string", - "enum": ["short", "long", "short,long"], - "default": ["short,long"], + "enum": ["short", "long", "short;long"], + "default": ["short;long"], "meta": ["db_type"] }, "db_path": { diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 8028ef69..b4f23c22 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -65,7
+65,7 @@ workflow PROFILING { ch_dbs = databases .flatMap { db -> def ( db_meta, db_path ) = db - def db_types = db_meta.db_type.replaceAll(/\[|\]/, '').split(',') //removes the square brackets and splits the string into a list ["short", "long"] + def db_types = db_meta.db_type.replaceAll(/\[|\]/, '').split(';') //removes the square brackets and splits the string into a list ["short", "long"] if ( db_types.size() > 1 ) { return db_types.collect { it -> def new_db_meta = db_meta.clone() diff --git a/workflows/taxprofiler.nf b/workflows/taxprofiler.nf index 5a36c892..be051ff2 100644 --- a/workflows/taxprofiler.nf +++ b/workflows/taxprofiler.nf @@ -151,7 +151,7 @@ workflow TAXPROFILER { ch_dbs_for_untar = databases .branch { db_meta, db_path -> if ( !db_meta.db_type ) { - db_meta = db_meta + [ db_type: "short,long" ] + db_meta = db_meta + [ db_type: "short;long" ] } untar: db_path.name.endsWith( ".tar.gz" ) skip: true } From f8e1e4f38a11c7e5cc8b9a6abeefb78958a7fecf Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Tue, 18 Jun 2024 13:48:25 +0200 Subject: [PATCH 14/16] replace comma with semicolon in db_type --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3fd66013..c47e8cdc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -68,7 +68,7 @@ jobs: echo 'tool,db_name,db_params,db_type,db_path' > 'database_motus.csv' echo "motus,db1_mOTU,,short,db_mOTU" >> 'database_motus.csv' echo "motus,db2_mOTU,,long,db_mOTU" >> 'database_motus.csv' - echo "motus,db3_mOTU,,\"short,long\",db_mOTU" >> 'database_motus.csv' + echo "motus,db3_mOTU,,short;long,db_mOTU" >> 'database_motus.csv' nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }}; else nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }}; fi From 58ef246a90f1164d24e95494a7e4443282b3c554 Mon Sep 17 00:00:00 2001 From: "James A. Fellows Yates" Date: Thu, 20 Jun 2024 08:56:42 +0000 Subject: [PATCH 15/16] Make database splitting and merging more nextflow-y --- assets/schema_database.json | 2 +- subworkflows/local/profiling.nf | 43 +++++++++++++++++++-------------- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/assets/schema_database.json b/assets/schema_database.json index be401809..ec04e326 100644 --- a/assets/schema_database.json +++ b/assets/schema_database.json @@ -42,7 +42,7 @@ "db_type": { "type": "string", "enum": ["short", "long", "short;long"], - "default": ["short;long"], + "default": "short;long", "meta": ["db_type"] }, "db_path": { diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index b4f23c22..95a1b74f 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -60,29 +60,36 @@ workflow PROFILING { COMBINE READS WITH POSSIBLE DATABASES */ - // e.g.
output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] - + // Split each database entry by its db_type, so the combined 'short;long' value (the default) yields one entry per read type ch_dbs = databases - .flatMap { db -> - def ( db_meta, db_path ) = db - def db_types = db_meta.db_type.replaceAll(/\[|\]/, '').split(';') //removes the square brackets and splits the string into a list ["short", "long"] - if ( db_types.size() > 1 ) { - return db_types.collect { it -> - def new_db_meta = db_meta.clone() - new_db_meta.db_type = it - [new_db_meta,db_path] - } - } else { - return [ db ] - } + .map{ + meta_db, db -> + [ [meta_db.db_type.split(";")].flatten(), meta_db, db] + } + .transpose(by: 0) + .map{ + type, meta_db, db -> + [[type: type], meta_db.subMap(meta_db.keySet() - 'db_type') + [type: type], db] } - .map{ meta, db -> [ meta.db_type, meta.subMap( meta.keySet() - 'db_type' ), db ] } + .dump(tag: 'databases') + + // Join short and long reads with their corresponding short/long database + // Note that databases without an explicit db_type keep the default `short;long` and so match both read types. + // E.g. if there are no 'long' reads, the 'long' database channel element generated above + // will have nothing to join to and will be discarded + // Final output: [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], /2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], /malt90] ch_input_for_profiling = reads - .map{ meta, reads -> [ meta.type, meta.subMap( meta.keySet() - 'type' ), reads ] } + .map{ + meta, reads -> + [[type: meta.type], meta, reads] + } .combine(ch_dbs, by: 0) - .map{ db_type, meta, reads, db_meta, db -> - [ meta, reads, db_meta, db ] } + .map{ + db_type, meta, reads, db_meta, db -> + [ meta, reads, db_meta, db ] + } + .dump(tag: 'input to profiling') .branch { meta, reads, db_meta, db -> centrifuge: db_meta.tool == 'centrifuge' diamond: db_meta.tool == 'diamond' kaiju: db_meta.tool == 'kaiju' kraken2: db_meta.tool == 'kraken2' || db_meta.tool == 'bracken' // to reuse the kraken module to produce the input data for bracken krakenuniq: db_meta.tool == 'krakenuniq' malt: db_meta.tool == 'malt' metaphlan: db_meta.tool == 'metaphlan' motus: db_meta.tool == 'motus' kmcp: db_meta.tool == 'kmcp' ganon: db_meta.tool == 'ganon' unknown: true } From 2534c088de68bb068eda75f8450b539b62bb3467 Mon Sep 17 00:00:00 2001 From: LilyAnderssonLee Date: Tue, 25 Jun 2024 08:43:18 +0200 Subject: [PATCH 16/16] remove dump from profiling.nf --- subworkflows/local/profiling.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/subworkflows/local/profiling.nf b/subworkflows/local/profiling.nf index 95a1b74f..55ea8e47 100644 --- a/subworkflows/local/profiling.nf +++ b/subworkflows/local/profiling.nf @@ -71,7 +71,6 @@ workflow PROFILING { type, meta_db, db -> [[type: type], meta_db.subMap(meta_db.keySet() - 'db_type') + [type: type], db] } - .dump(tag: 'databases') // Join short and long reads with their corresponding short/long database @@ -89,7 +88,6 @@ [ meta, reads, db_meta, db ] } - .dump(tag: 'input to profiling') .branch { meta, reads, db_meta, db -> centrifuge: db_meta.tool == 'centrifuge' diamond: db_meta.tool == 'diamond'
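For reference, the channel logic that PATCH 15 and PATCH 16 converge on can be exercised in isolation. The following is a minimal, self-contained Nextflow sketch of the same split-and-join idea; it is illustrative only, and the database entries, metadata, read paths, and names in it are invented rather than taken from the pipeline:

workflow {

    // Two example databases: one long-read only, one left at the default
    // combined type (all names and paths here are made up)
    ch_databases = Channel.of(
        [ [tool: 'kraken2', db_name: 'db1', db_type: 'short;long'], '/path/to/db1' ],
        [ [tool: 'kaiju', db_name: 'db2', db_type: 'long'], '/path/to/db2' ]
    )

    // As in PATCH 15: split db_type on ';' and transpose, so 'short;long'
    // becomes two channel elements keyed [type: short] and [type: long]
    ch_dbs = ch_databases
        .map { meta_db, db -> [ [meta_db.db_type.split(";")].flatten(), meta_db, db ] }
        .transpose(by: 0)
        .map { type, meta_db, db ->
            [ [type: type], meta_db.subMap(meta_db.keySet() - 'db_type') + [type: type], db ]
        }

    // A single short-read sample; long-only databases will find no partner
    ch_reads = Channel.of(
        [ [id: 'sample1', type: 'short'], '/path/to/sample1.fastq.gz' ]
    )

    // combine(by: 0) acts as the join: read/database pairs whose type keys
    // do not match are simply never emitted
    ch_reads
        .map { meta, reads -> [ [type: meta.type], meta, reads ] }
        .combine(ch_dbs, by: 0)
        .map { key, meta, reads, db_meta, db -> [ meta, reads, db_meta, db ] }
        .view()
}

Compared with the PATCH 01 approach of building the full reads-by-databases cross product and then filtering on instrument_platform, keying both channels on a shared [type: ...] map means invalid pairings are never materialised, which is what the PATCH 15 subject means by "more nextflow-y".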