Skip to content

Commit

Permalink
Merge pull request #466 from LilyAnderssonLee/add_db_type
Browse files Browse the repository at this point in the history
Add the column db_type to database sheet
  • Loading branch information
LilyAnderssonLee authored Jun 25, 2024
2 parents 6b47739 + 2534c08 commit 2a679b6
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 24 deletions.
6 changes: 4 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,10 @@ jobs:
if [[ "${{ matrix.tags }}" == "test_motus" ]]; then
wget https://raw.githubusercontent.com/motu-tool/mOTUs/master/motus/downloadDB.py
python downloadDB.py --no-download-progress
echo 'tool,db_name,db_params,db_path' > 'database_motus.csv'
echo "motus,db_mOTU,,db_mOTU" >> 'database_motus.csv'
echo 'tool,db_name,db_params,db_type,db_path' > 'database_motus.csv'
echo "motus,db1_mOTU,,short,db_mOTU" >> 'database_motus.csv'
echo "motus,db2_mOTU,,long,db_mOTU" >> 'database_motus.csv'
echo "motus,db3_mOTU,,short;long,db_mOTU" >> 'database_motus.csv'
nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --databases ./database_motus.csv --outdir ./results_${{ matrix.tags }};
else
nextflow run ${GITHUB_WORKSPACE} -profile docker,${{ matrix.tags }} --outdir ./results_${{ matrix.tags }};
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### `Added`

- [#417](https://github.com/nf-core/taxprofiler/pull/417) - Added reference-free metagenome estimation with Nonpareil (added by @jfy133)
- [#466](https://github.com/nf-core/taxprofiler/pull/466) - Input database sheets now require a `db_type` column to distinguish between short- and long-read databases

## v1.1.8dev - Augmented Akita Patch []

Expand Down
6 changes: 6 additions & 0 deletions assets/schema_database.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@
"errorMessage": "Invalid database db_params entry. No quotes allowed.",
"meta": ["db_params"]
},
"db_type": {
"type": "string",
"enum": ["short", "long", "short;long"],
"default": "short;long",
"meta": ["db_type"]
},
"db_path": {
"type": "string",
"exists": true,
Expand Down
59 changes: 40 additions & 19 deletions subworkflows/local/profiling.nf
Original file line number Diff line number Diff line change
Expand Up @@ -60,26 +60,47 @@ workflow PROFILING {
COMBINE READS WITH POSSIBLE DATABASES
*/

// e.g. output [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':true], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]
// Separate default 'short;long' (when necessary) databases when short/long specified in database sheet
ch_dbs = databases
.map{
meta_db, db ->
[ [meta_db.db_type.split(";")].flatten(), meta_db, db]
}
.transpose(by: 0)
.map{
type, meta_db, db ->
[[type: type], meta_db.subMap(meta_db.keySet() - 'db_type') + [type: type], db]
}

// Join short and long reads with their corresponding short/long database
// Note that databases left at the default `short;long` type will match both read types.
// E.g. if there are no 'long' reads, the 'long' database channel element generated above
// will have nothing to join to and will be discarded
// Final output: [DUMP: reads_plus_db] [['id':'2612', 'run_accession':'combined', 'instrument_platform':'ILLUMINA', 'single_end':1], <reads_path>/2612.merged.fastq.gz, ['tool':'malt', 'db_name':'mal95', 'db_params':'"-id 90"'], <db_path>/malt90]

ch_input_for_profiling = reads
.map {
meta, reads ->
[meta + [id: "${meta.id}${meta.single_end ? '_se' : '_pe'}"], reads]
}
.combine(databases)
.branch {
centrifuge: it[2]['tool'] == 'centrifuge'
diamond: it[2]['tool'] == 'diamond'
kaiju: it[2]['tool'] == 'kaiju'
kraken2: it[2]['tool'] == 'kraken2' || it[2]['tool'] == 'bracken' // to reuse the kraken module to produce the input data for bracken
krakenuniq: it[2]['tool'] == 'krakenuniq'
malt: it[2]['tool'] == 'malt'
metaphlan: it[2]['tool'] == 'metaphlan'
motus: it[2]['tool'] == 'motus'
kmcp: it[2]['tool'] == 'kmcp'
ganon: it[2]['tool'] == 'ganon'
unknown: true
}
.map{
meta, reads ->
[[type: meta.type], meta, reads]
}
.combine(ch_dbs, by: 0)
.map{
db_type, meta, reads, db_meta, db ->
[ meta, reads, db_meta, db ]
}
.branch { meta, reads, db_meta, db ->
centrifuge: db_meta.tool == 'centrifuge'
diamond: db_meta.tool == 'diamond'
kaiju: db_meta.tool == 'kaiju'
kraken2: db_meta.tool == 'kraken2' || db_meta.tool == 'bracken' // to reuse the kraken module to produce the input data for bracken
krakenuniq: db_meta.tool == 'krakenuniq'
malt: db_meta.tool == 'malt'
metaphlan: db_meta.tool == 'metaphlan'
motus: db_meta.tool == 'motus'
kmcp: db_meta.tool == 'kmcp'
ganon: db_meta.tool == 'ganon'
unknown: true
}

/*
PREPARE PROFILER INPUT CHANNELS & RUN PROFILING
Expand Down
9 changes: 6 additions & 3 deletions workflows/taxprofiler.nf
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,13 @@ workflow TAXPROFILER {
}
.branch { meta, run_accession, instrument_platform, fastq_1, fastq_2, fasta ->
fastq: meta.single_end || fastq_2
return [ meta, fastq_2 ? [ fastq_1, fastq_2 ] : [ fastq_1 ] ]
return [ meta + [ type: "short" ], fastq_2 ? [ fastq_1, fastq_2 ] : [ fastq_1 ] ]
nanopore: instrument_platform == 'OXFORD_NANOPORE'
meta.single_end = true
return [ meta, [ fastq_1 ] ]
return [ meta + [ type: "long" ], [ fastq_1 ] ]
fasta: meta.is_fasta
meta.single_end = true
return [ meta, [ fasta ] ]
return [ meta + [ type: "short" ], [ fasta ] ]
}

// Merge ch_input.fastq and ch_input.nanopore into a single channel
Expand All @@ -150,6 +150,9 @@ workflow TAXPROFILER {
// Validate and decompress databases
ch_dbs_for_untar = databases
.branch { db_meta, db_path ->
if ( !db_meta.db_type ) {
db_meta = db_meta + [ db_type: "short;long" ]
}
untar: db_path.name.endsWith( ".tar.gz" )
skip: true
}
Expand Down

0 comments on commit 2a679b6

Please sign in to comment.