From 486e38ab0d4a96e76a54f538ba21c04c7bf6a070 Mon Sep 17 00:00:00 2001 From: Furentsu Date: Mon, 28 Oct 2024 11:52:11 +0100 Subject: [PATCH 1/5] added applybqsr module --- modules.json | 10 +++ .../parabricks/applybqsr/environment.yml | 3 + modules/nf-core/parabricks/applybqsr/main.nf | 63 ++++++++++++++++ modules/nf-core/parabricks/applybqsr/meta.yml | 75 +++++++++++++++++++ .../local/fastq_align_parabricks/main.nf | 12 ++- 5 files changed, 156 insertions(+), 7 deletions(-) create mode 100644 modules/nf-core/parabricks/applybqsr/environment.yml create mode 100644 modules/nf-core/parabricks/applybqsr/main.nf create mode 100644 modules/nf-core/parabricks/applybqsr/meta.yml diff --git a/modules.json b/modules.json index f15b1f14c..74c68cb5d 100644 --- a/modules.json +++ b/modules.json @@ -354,6 +354,16 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["bam_ngscheckmate"] }, + "parabricks/applybqsr": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, + "parabricks/fq2bam": { + "branch": "master", + "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", + "installed_by": ["modules"] + }, "samblaster": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/nf-core/parabricks/applybqsr/environment.yml b/modules/nf-core/parabricks/applybqsr/environment.yml new file mode 100644 index 000000000..3cebeff05 --- /dev/null +++ b/modules/nf-core/parabricks/applybqsr/environment.yml @@ -0,0 +1,3 @@ +channels: + - conda-forge + - bioconda diff --git a/modules/nf-core/parabricks/applybqsr/main.nf b/modules/nf-core/parabricks/applybqsr/main.nf new file mode 100644 index 000000000..b545297d8 --- /dev/null +++ b/modules/nf-core/parabricks/applybqsr/main.nf @@ -0,0 +1,63 @@ +process PARABRICKS_APPLYBQSR { + tag "$meta.id" + label 'process_high' + + container "nvcr.io/nvidia/clara/clara-parabricks:4.3.0-1" + + input: + tuple val(meta), path(bam), 
path(bam_index), path(bqsr_table), path(intervals) + tuple val(meta2), path(fasta) + + output: + tuple val(meta), path("*.bam"), emit: bam + tuple val(meta), path("*.bai"), emit: bai + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "Parabricks module does not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def interval_command = intervals ? intervals.collect{"--interval-file $it"}.join(' ') : "" + def copy_index_command = bam_index ? "cp -L $bam_index `readlink -f $bam`.bai" : "" + """ + # parabricks complains when index is not a regular file in the same directory as the bam + # copy the index to this path. + $copy_index_command + + pbrun \\ + applybqsr \\ + --ref $fasta \\ + --in-bam $bam \\ + --in-recal-file $bqsr_table \\ + $interval_command \\ + --out-bam ${prefix}.bam \\ + --num-threads $task.cpus \\ + --num-gpus $task.accelerator.request \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pbrun: \$(echo \$(pbrun version 2>&1) | sed 's/^Please.* //' ) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def interval_command = intervals ? 
intervals.collect{"--interval-file $it"}.join(' ') : "" + """ + touch ${prefix}.bam + touch ${prefix}.bam.bai + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + pbrun: \$(echo \$(pbrun version 2>&1) | sed 's/^Please.* //' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/parabricks/applybqsr/meta.yml b/modules/nf-core/parabricks/applybqsr/meta.yml new file mode 100644 index 000000000..09fdbacab --- /dev/null +++ b/modules/nf-core/parabricks/applybqsr/meta.yml @@ -0,0 +1,75 @@ +name: "parabricks_applybqsr" +description: NVIDIA Clara Parabricks GPU-accelerated apply Base Quality Score Recalibration + (BQSR). +keywords: + - bqsr + - bam + - GPU-accelerated + - base quality score recalibration +tools: + - "parabricks": + description: "NVIDIA Clara Parabricks GPU-accelerated genomics tools" + homepage: "https://www.nvidia.com/en-us/clara/genomics/" + documentation: "https://docs.nvidia.com/clara/parabricks/" + licence: ["https://docs.nvidia.com/clara/parabricks/3.8.0/gettingstarted.html#licensing"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test' ] + - bam: + type: file + description: BAM file + pattern: "*.bam" + - bam_index: + type: file + description: BAM index file + pattern: "*.bai" + - bqsr_table: + type: file + description: Table from calculating BQSR. Output from parabricks/fq2bam or gatk4/baserecalibrator. + pattern: "*.table" + - intervals: + type: file + description: intervals + - - meta2: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - fasta: + type: file + description: Reference fasta - must be unzipped. + pattern: "*.fasta" +output: + - bam: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test' ] + - "*.bam": + type: file + description: BAM file after applying BQSR. 
+ pattern: "*.bam" + - bai: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. [ id:'test' ] + - "*.bai": + type: file + description: bai index corresponding to output bam file. + pattern: "*.bai" + - versions: + - versions.yml: + type: file + description: File containing software versions. + pattern: "versions.yml" +authors: + - "@bsiranosian" +maintainers: + - "@bsiranosian" diff --git a/subworkflows/local/fastq_align_parabricks/main.nf b/subworkflows/local/fastq_align_parabricks/main.nf index b9009cab9..7dd5c0f7b 100644 --- a/subworkflows/local/fastq_align_parabricks/main.nf +++ b/subworkflows/local/fastq_align_parabricks/main.nf @@ -1,16 +1,14 @@ -// TODO nf-core: If in doubt look at other nf-core/subworkflows to see how we are doing things! :) -// https://github.com/nf-core/modules/tree/master/subworkflows -// You can also ask for help via your pull request or on the #subworkflows channel on the nf-core Slack workspace: -// https://nf-co.re/join -// TODO nf-core: A subworkflow SHOULD import at least two modules +// +// Alignment and BQSR with Nvidia CLARA Parabricks +// include { PARABRICKS_FQ2BAM } from '../../../modules/nf-core/parabricks/fq2bam/main' +include { APPLYBQSR } from '../../../modules/nf-core/gatk/applybqsr/main' workflow FASTQ_ALIGN_PARABRICKS { take: - // TODO nf-core: edit input (take) channels - ch_bam // channel: [ val(meta), [ bam ] ] + main: From f42a7cdd1047a53b0a02e2c7ba4458430eceaeb7 Mon Sep 17 00:00:00 2001 From: Furentsu Date: Mon, 28 Oct 2024 12:18:42 +0100 Subject: [PATCH 2/5] wip parabricks subworkflow --- subworkflows/local/fastq_align_parabricks/main.nf | 3 ++- workflows/sarek/main.nf | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/subworkflows/local/fastq_align_parabricks/main.nf b/subworkflows/local/fastq_align_parabricks/main.nf index 7dd5c0f7b..89b997fe1 100644 --- a/subworkflows/local/fastq_align_parabricks/main.nf +++ 
b/subworkflows/local/fastq_align_parabricks/main.nf @@ -5,10 +5,11 @@ include { PARABRICKS_FQ2BAM } from '../../../modules/nf-core/parabricks/fq2bam/main' include { APPLYBQSR } from '../../../modules/nf-core/gatk/applybqsr/main' + workflow FASTQ_ALIGN_PARABRICKS { take: - + main: diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index f60bc3d93..4995b7483 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -257,6 +257,12 @@ workflow SAREK { reports = reports.mix(FASTP.out.json.collect{ meta, json -> json }) reports = reports.mix(FASTP.out.html.collect{ meta, html -> html }) + if (params.aligner == 'parabricks') { + + params.split_fastq = 0 + + } + if (params.split_fastq) { reads_for_alignment = FASTP.out.reads.map{ meta, reads -> read_files = reads.sort(false) { a,b -> a.getName().tokenize('.')[0] <=> b.getName().tokenize('.')[0] }.collate(2) @@ -289,6 +295,8 @@ workflow SAREK { else [ meta, reads ] } + // TODO Move grouping of reads to separate parabricks subworkflow + // reads will be sorted sort_bam = true FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON(reads_for_alignment, index_alignment, sort_bam, fasta, fasta_fai) From 3c9241913b4884e6be68e28db34d74c13a8f2419 Mon Sep 17 00:00:00 2001 From: Furentsu Date: Mon, 28 Oct 2024 14:46:42 +0100 Subject: [PATCH 3/5] implementation of alignment subworkflow --- .../local/fastq_align_parabricks/main.nf | 45 ++++++++++++------- workflows/sarek/main.nf | 29 ++++++++++-- 2 files changed, 55 insertions(+), 19 deletions(-) diff --git a/subworkflows/local/fastq_align_parabricks/main.nf b/subworkflows/local/fastq_align_parabricks/main.nf index 89b997fe1..1f653801d 100644 --- a/subworkflows/local/fastq_align_parabricks/main.nf +++ b/subworkflows/local/fastq_align_parabricks/main.nf @@ -3,32 +3,45 @@ // include { PARABRICKS_FQ2BAM } from '../../../modules/nf-core/parabricks/fq2bam/main' -include { APPLYBQSR } from '../../../modules/nf-core/gatk/applybqsr/main' - +include { PARABRICKS_APPLYBQSR } from 
'../../../modules/nf-core/parabricks/applybqsr/main' workflow FASTQ_ALIGN_PARABRICKS { take: - + ch_reads // channel: [mandatory] meta, reads + ch_interval_file // channel: [optional for parabricks] intervals_bed_combined + val_sort // boolean: [mandatory] true -> sort, false -> don't sort + ch_fasta + ch_fasta_fai + ch_known_sites // channel [optional for parabricks] known_sites_indels main: - + ch_reports = Channel.empty() ch_versions = Channel.empty() + ch_bam = Channel.empty() + ch_bai = Channel.empty() + ch_bqsr_table = Channel.empty() + ch_qc_metrics = Channel.empty() + ch_duplicate_metrics = Channel.empty() - // TODO nf-core: substitute modules here for the modules of your subworkflow + PARABRICKS_FQ2BAM(ch_reads.map{meta, reads -> [ meta, reads, ch_interval_file ]}, ch_fasta, ch_fasta_fai, ch_known_sites) - SAMTOOLS_SORT ( ch_bam ) - ch_versions = ch_versions.mix(SAMTOOLS_SORT.out.versions.first()) + // Collecting FQ2BAM outputs + ch_bam = ch_bam.mix(PARABRICKS_FQ2BAM.out.bam) + ch_bai = ch_bai.mix(PARABRICKS_FQ2BAM.out.bai) + ch_bqsr_table = ch_bqsr_table.mix(PARABRICKS_FQ2BAM.out.bqsr_table) + ch_qc_metrics = ch_qc_metrics.mix(PARABRICKS_FQ2BAM.out.qc_metrics) + ch_duplicate_metrics = ch_duplicate_metrics.mix(PARABRICKS_FQ2BAM.out.duplicate_metrics) - SAMTOOLS_INDEX ( SAMTOOLS_SORT.out.bam ) - ch_versions = ch_versions.mix(SAMTOOLS_INDEX.out.versions.first()) + // Apply BQSR + PARABRICKS_APPLYBQSR(ch_bam.join(ch_bai).join(ch_bqsr_table).map{meta, bam, bai, bqsr_table -> [ meta, bam, bai, bqsr_table, ch_interval_file ] }, ch_fasta) - emit: - // TODO nf-core: edit emitted channels - bam = SAMTOOLS_SORT.out.bam // channel: [ val(meta), [ bam ] ] - bai = SAMTOOLS_INDEX.out.bai // channel: [ val(meta), [ bai ] ] - csi = SAMTOOLS_INDEX.out.csi // channel: [ val(meta), [ csi ] ] + ch_versions = ch_versions.mix(PARABRICKS_FQ2BAM.out.versions) + ch_versions = ch_versions.mix(PARABRICKS_APPLYBQSR.out.versions) - versions = ch_versions // channel: [ versions.yml ] -} + emit: + bam 
= PARABRICKS_APPLYBQSR.out.bam // channel: [ [meta], bam ] + bai 
= PARABRICKS_APPLYBQSR.out.bai // channel: [ [meta], bai ] + versions = ch_versions // channel: [ versions.yml ] +} \ No newline at end of file diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf index 4995b7483..c7677433b 100644 --- a/workflows/sarek/main.nf +++ b/workflows/sarek/main.nf @@ -280,6 +280,30 @@ workflow SAREK { // First, we must calculate number of lanes for each sample (meta.n_fastq) // This is needed to group reads from the same sample together using groupKey to avoid stalling the workflow // when reads from different samples are mixed together + + if (params.aligner == 'parabricks') { + + fastq_mapped = reads_for_alignment + .combine(reads_grouping_key) // Creates a tuple of [ meta, files, reads_grouping_key ] + .filter { meta1, files, meta2 -> meta1.sample == meta2.sample } + // Add n_fastq and other variables to meta + .map { meta1, files, meta2 -> + [ meta1 + meta2, files ] + } + // Manipulate meta map to remove old fields and add new ones + .map { meta, files -> + [ meta - meta.subMap('id', 'read_group', 'data_type', 'num_lanes', 'read_group', 'size') + [ data_type: 'fastq_gz', id: meta.sample ], files ] + } + // Create groupKey from meta map + .map { meta, files -> + [ groupKey( meta, meta.n_fastq), files ] + } + // Group + .groupTuple() + + + } + reads_for_alignment.map { meta, reads -> [ meta.subMap('patient', 'sample', 'sex', 'status'), reads ] } @@ -295,12 +319,11 @@ workflow SAREK { else [ meta, reads ] } - // TODO Move grouping of reads to separate parabricks subworkflow - // reads will be sorted sort_bam = true FASTQ_ALIGN_BWAMEM_MEM2_DRAGMAP_SENTIEON(reads_for_alignment, index_alignment, sort_bam, fasta, fasta_fai) - + FASTQ_ALIGN_PARABRICKS(reads_for_alignment...) 
+ // Grouping the bams from the same samples not to stall the workflow // Use groupKey to make sure that the correct group can advance as soon as it is complete // and not stall the workflow until all reads from all channels are mapped From dc631851be4fcaea5bf4c47416e82f6680b14ed2 Mon Sep 17 00:00:00 2001 From: famosab Date: Mon, 28 Oct 2024 15:04:51 +0100 Subject: [PATCH 4/5] add test --- .../local/fastq_align_parabricks/main.nf | 2 +- .../fastq_align_parabricks/tests/main.nf.test | 69 +++++++++++++++++++ 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 subworkflows/local/fastq_align_parabricks/tests/main.nf.test diff --git a/subworkflows/local/fastq_align_parabricks/main.nf b/subworkflows/local/fastq_align_parabricks/main.nf index 1f653801d..947bdca40 100644 --- a/subworkflows/local/fastq_align_parabricks/main.nf +++ b/subworkflows/local/fastq_align_parabricks/main.nf @@ -10,7 +10,7 @@ workflow FASTQ_ALIGN_PARABRICKS { take: ch_reads // channel: [mandatory] meta, reads ch_interval_file // channel: [optional for parabricks] intervals_bed_combined - val_sort // boolean: [mandatory] true -> sort, false -> don't sort + // val_sort // boolean: [mandatory] true -> sort, false -> don't sort ch_fasta ch_fasta_fai ch_known_sites // channel [optional for parabricks] known_sites_indels diff --git a/subworkflows/local/fastq_align_parabricks/tests/main.nf.test b/subworkflows/local/fastq_align_parabricks/tests/main.nf.test new file mode 100644 index 000000000..33c590aec --- /dev/null +++ b/subworkflows/local/fastq_align_parabricks/tests/main.nf.test @@ -0,0 +1,69 @@ +nextflow_workflow { + + name "Test Subworkflow FASTQ_ALIGN_BWA" + script "../main.nf" + config "./nextflow.config" + workflow "FASTQ_ALIGN_BWA" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows/fastq_align_parabricks" + tag "parabricks" + tag "parabricks/fq2bam" + tag "parabricks/applybqsr" + + test("fastq_align_parabricks_single_end") { + + when { + workflow { + """ + 
input[0] = Channel.of([ + [ id:'test', single_end:true ], + [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ]]) + input[1] = [] // interval file + input[2] = Channel.value([ + [id: 'reference'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[3] = Channel.value([ + [id: 'reference_index'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta.fai', checkIfExists: true) + ]) + input[4] = Channel.value([ + [id: 'known_sites'], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf.gz', checkIfExists: true) + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot(workflow.out).match()} + ) + } + } + + // test("fastq_align_parabricks_paired_end") { + + // when { + // workflow { + // """ + // input[0] = Channel.of([[ id:'test', single_end:false ], [file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)] + // ] ) + // input[1] = BWA_INDEX.out.index + // input[2] = false + // input[3] = Channel.value([[id: 'genome'], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true)]) + // """ + // } + // } + + // then { + // assertAll( + // { assert workflow.success}, + // { assert snapshot(workflow.out).match()} + // ) + // } + // } +} \ No newline at end of file From 4d2b0e218ceb51c16b9f5928bc9c3a394b25514c Mon Sep 17 00:00:00 2001 From: famosab Date: Mon, 28 Oct 2024 15:11:16 +0100 Subject: [PATCH 5/5] fix name --- subworkflows/local/fastq_align_parabricks/tests/main.nf.test 
b/subworkflows/local/fastq_align_parabricks/tests/main.nf.test index 33c590aec..4ab322853 100644 --- a/subworkflows/local/fastq_align_parabricks/tests/main.nf.test +++ b/subworkflows/local/fastq_align_parabricks/tests/main.nf.test @@ -1,9 +1,9 @@ nextflow_workflow { - name "Test Subworkflow FASTQ_ALIGN_BWA" + name "Test Subworkflow FASTQ_ALIGN_PARABRICKS" script "../main.nf" config "./nextflow.config" - workflow "FASTQ_ALIGN_BWA" + workflow "FASTQ_ALIGN_PARABRICKS" tag "subworkflows" tag "subworkflows_nfcore"