Normalize germline VCFs with bcftools norm before concatenating them.

Review notes for this revision of the patch (text before the first
"diff --git" header is ignored by patch tools):

  * GERMLINE_VCFS_NORM config: '--multiallelics - both' -> '--multiallelics -both'
    (the stray space made 'both' a separate positional argument).
  * GERMLINE_VCFS_NORM config: '--output-type z' is stated explicitly, because
    setting ext.args replaces the module default, and a per-input ext.prefix is
    set so that VCFs sharing a sample id do not all become '<id>.vcf.gz'.
  * CONCATENATE_GERMLINE_VCFS: processes are invoked positionally and only once
    per alias; the normalized VCFs get their own TABIX_TABIX alias
    (TABIX_GERMLINE_VCFS_NORM) instead of re-invoking existing aliases.
  * CONCATENATE_GERMLINE_VCFS: the emitted 'vcfs' channel is built from the
    sorted VCF joined with its index; TABIX_TABIX has no 'gz_tbi' output.
  * NOTE(review): 'fasta' is assumed to be a value channel shaped
    [ meta2, fasta ], as BCFTOOLS_NORM declares -- confirm in workflows/sarek.
  * The "index" blob ids below are informational and were not regenerated.

diff --git a/conf/modules/post_variant_calling.config b/conf/modules/post_variant_calling.config
index 3354d4671f..509cf1075a 100644
--- a/conf/modules/post_variant_calling.config
+++ b/conf/modules/post_variant_calling.config
@@ -16,6 +16,20 @@
 
 process {
 
+    withName: 'GERMLINE_VCFS_NORM'{
+        ext.args   = { [
+            '--multiallelics -both', //split multiallelic sites into biallelic records and both SNPs and indels should be merged separately into two records
+            '--rm-dup all',          //output only the first instance of a record which is present multiple times
+            '--output-type z'        //setting ext.args drops the module default, so request bgzipped VCF explicitly
+        ].join(' ') }
+        ext.prefix = { "${vcf.baseName.minus('.vcf')}.norm" } //derive the name from the input so files of one sample do not collide
+        ext.when   = { params.concatenate_vcfs }
+        publishDir = [
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/variant_calling/concat/${meta.id}/" }
+        ]
+    }
+
     withName: 'GERMLINE_VCFS_CONCAT'{
         ext.args   = { "-a" }
         ext.when   = { params.concatenate_vcfs }
diff --git a/modules/nf-core/bcftools/norm/environment.yml b/modules/nf-core/bcftools/norm/environment.yml
new file mode 100644
index 0000000000..fe80e4e7c3
--- /dev/null
+++ b/modules/nf-core/bcftools/norm/environment.yml
@@ -0,0 +1,7 @@
+name: bcftools_norm
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - bioconda::bcftools=1.18
diff --git a/modules/nf-core/bcftools/norm/main.nf b/modules/nf-core/bcftools/norm/main.nf
new file mode 100644
index 0000000000..47d3dab1ee
--- /dev/null
+++ b/modules/nf-core/bcftools/norm/main.nf
@@ -0,0 +1,60 @@
+process BCFTOOLS_NORM {
+    tag "$meta.id"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/bcftools:1.18--h8b25389_0':
+        'biocontainers/bcftools:1.18--h8b25389_0' }"
+
+    input:
+    tuple val(meta), path(vcf), path(tbi)
+    tuple val(meta2), path(fasta)
+
+    output:
+    tuple val(meta), path("*.{vcf,vcf.gz,bcf,bcf.gz}")  , emit: vcf
+    path "versions.yml"                                 , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: '--output-type z'
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" :
+                    args.contains("--output-type u") || args.contains("-Ou") ? "bcf" :
+                    args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" :
+                    args.contains("--output-type v") || args.contains("-Ov") ? "vcf" :
+                    "vcf.gz"
+
+    """
+    bcftools norm \\
+        --fasta-ref ${fasta} \\
+        --output ${prefix}.${extension}\\
+        $args \\
+        --threads $task.cpus \\
+        ${vcf}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: '--output-type z'
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def extension = args.contains("--output-type b") || args.contains("-Ob") ? "bcf.gz" :
+                    args.contains("--output-type u") || args.contains("-Ou") ? "bcf" :
+                    args.contains("--output-type z") || args.contains("-Oz") ? "vcf.gz" :
+                    args.contains("--output-type v") || args.contains("-Ov") ? "vcf" :
+                    "vcf.gz"
+    """
+    touch ${prefix}.${extension}
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/bcftools/norm/meta.yml b/modules/nf-core/bcftools/norm/meta.yml
new file mode 100644
index 0000000000..1f3e1b6265
--- /dev/null
+++ b/modules/nf-core/bcftools/norm/meta.yml
@@ -0,0 +1,61 @@
+name: bcftools_norm
+description: Normalize VCF file
+keywords:
+  - normalize
+  - norm
+  - variant calling
+  - VCF
+tools:
+  - norm:
+      description: |
+        Normalize VCF files.
+      homepage: http://samtools.github.io/bcftools/bcftools.html
+      documentation: http://www.htslib.org/doc/bcftools.html
+      doi: 10.1093/bioinformatics/btp352
+      licence: ["MIT"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - vcf:
+      type: file
+      description: |
+        The vcf file to be normalized
+        e.g. 'file1.vcf'
+      pattern: "*.{vcf,vcf.gz}"
+  - tbi:
+      type: file
+      description: |
+        An optional index of the VCF file (for when the VCF is compressed)
+      pattern: "*.vcf.gz.tbi"
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'genome' ]
+  - fasta:
+      type: file
+      description: FASTA reference file
+      pattern: "*.{fasta,fa}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - vcf:
+      type: file
+      description: One of uncompressed VCF (.vcf), compressed VCF (.vcf.gz), compressed BCF (.bcf.gz) or uncompressed BCF (.bcf) normalized output file
+      pattern: "*.{vcf,vcf.gz,bcf,bcf.gz}"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@abhi18av"
+  - "@ramprasadn"
+maintainers:
+  - "@abhi18av"
+  - "@ramprasadn"
diff --git a/subworkflows/local/post_variantcalling/main.nf b/subworkflows/local/post_variantcalling/main.nf
index 6b75d2c6b8..b0553ece43 100644
--- a/subworkflows/local/post_variantcalling/main.nf
+++ b/subworkflows/local/post_variantcalling/main.nf
@@ -8,13 +8,14 @@ workflow POST_VARIANTCALLING {
 
     take:
     vcfs
+    fasta
     concatenate_vcfs
 
     main:
     versions = Channel.empty()
 
     if (concatenate_vcfs){
-        CONCATENATE_GERMLINE_VCFS(vcfs)
+        CONCATENATE_GERMLINE_VCFS(vcfs, fasta)
 
         vcfs = vcfs.mix(CONCATENATE_GERMLINE_VCFS.out.vcfs)
         versions = versions.mix(CONCATENATE_GERMLINE_VCFS.out.versions)
diff --git a/subworkflows/local/vcf_concatenate_germline/main.nf b/subworkflows/local/vcf_concatenate_germline/main.nf
index 87f46b22e1..65c5c95a3f 100644
--- a/subworkflows/local/vcf_concatenate_germline/main.nf
+++ b/subworkflows/local/vcf_concatenate_germline/main.nf
@@ -1,42 +1,60 @@
-//
 // CONCATENATE Germline VCFs
-//
 
 // Concatenation of germline vcf-files
-include { ADD_INFO_TO_VCF                                } from '../../../modules/local/add_info_to_vcf/main'
-include { TABIX_BGZIPTABIX as TABIX_EXT_VCF              } from '../../../modules/nf-core/tabix/bgziptabix/main'
-include { BCFTOOLS_CONCAT  as GERMLINE_VCFS_CONCAT       } from '../../../modules/nf-core/bcftools/concat/main'
-include { BCFTOOLS_SORT    as GERMLINE_VCFS_CONCAT_SORT  } from '../../../modules/nf-core/bcftools/sort/main'
-include { TABIX_TABIX      as TABIX_GERMLINE_VCFS_CONCAT_SORT } from '../../../modules/nf-core/tabix/tabix/main'
+include { ADD_INFO_TO_VCF                                     } from '../../../modules/local/add_info_to_vcf/main'
+include { TABIX_BGZIPTABIX as TABIX_EXT_VCF                   } from '../../../modules/nf-core/tabix/bgziptabix/main'
+include { BCFTOOLS_NORM    as GERMLINE_VCFS_NORM              } from '../../../modules/nf-core/bcftools/norm/main'
+include { TABIX_TABIX      as TABIX_GERMLINE_VCFS_NORM        } from '../../../modules/nf-core/tabix/tabix/main'
+include { BCFTOOLS_CONCAT  as GERMLINE_VCFS_CONCAT            } from '../../../modules/nf-core/bcftools/concat/main'
+include { BCFTOOLS_SORT    as GERMLINE_VCFS_CONCAT_SORT       } from '../../../modules/nf-core/bcftools/sort/main'
+include { TABIX_TABIX      as TABIX_GERMLINE_VCFS_CONCAT_SORT } from '../../../modules/nf-core/tabix/tabix/main'
 
 workflow CONCATENATE_GERMLINE_VCFS {
 
     take:
     vcfs
+    fasta // NOTE(review): BCFTOOLS_NORM declares this input as [ meta2, fasta ]; should be a value channel - confirm with caller
 
     main:
     versions = Channel.empty()
 
-    // Concatenate vcf-files
+    // Add additional information to VCF files
     ADD_INFO_TO_VCF(vcfs)
+
+    // Compress the VCF files with bgzip and index them with tabix
     TABIX_EXT_VCF(ADD_INFO_TO_VCF.out.vcf)
 
+    // Normalize the compressed, indexed VCF files (process inputs are positional: [ meta, vcf, tbi ], then fasta)
+    GERMLINE_VCFS_NORM(TABIX_EXT_VCF.out.gz_tbi, fasta)
+
+    // Index the normalized VCF files (BCFTOOLS_NORM emits the VCF without an index)
+    TABIX_GERMLINE_VCFS_NORM(GERMLINE_VCFS_NORM.out.vcf)
+
     // Gather vcfs and vcf-tbis for concatenating germline-vcfs
-    germline_vcfs_with_tbis = TABIX_EXT_VCF.out.gz_tbi.map{ meta, vcf, tbi -> [ meta.subMap('id'), vcf, tbi ] }.groupTuple()
+    germline_vcfs_with_tbis = GERMLINE_VCFS_NORM.out.vcf
+        .join(TABIX_GERMLINE_VCFS_NORM.out.tbi) // pair each VCF with its index by meta; presumably meta is unique per VCF - verify
+        .map{ meta, vcf, tbi -> [ meta.subMap('id'), vcf, tbi ] }
+        .groupTuple()
 
+    // Concatenate the VCF files
     GERMLINE_VCFS_CONCAT(germline_vcfs_with_tbis)
+
+    // Sort the concatenated VCF files
     GERMLINE_VCFS_CONCAT_SORT(GERMLINE_VCFS_CONCAT.out.vcf)
+
+    // Index the sorted concatenated VCF files
     TABIX_GERMLINE_VCFS_CONCAT_SORT(GERMLINE_VCFS_CONCAT_SORT.out.vcf)
 
     // Gather versions of all tools used
     versions = versions.mix(ADD_INFO_TO_VCF.out.versions)
     versions = versions.mix(TABIX_EXT_VCF.out.versions)
+    versions = versions.mix(GERMLINE_VCFS_NORM.out.versions)
+    versions = versions.mix(TABIX_GERMLINE_VCFS_NORM.out.versions)
     versions = versions.mix(GERMLINE_VCFS_CONCAT.out.versions)
-    versions = versions.mix(GERMLINE_VCFS_CONCAT.out.versions)
+    versions = versions.mix(GERMLINE_VCFS_CONCAT_SORT.out.versions)
     versions = versions.mix(TABIX_GERMLINE_VCFS_CONCAT_SORT.out.versions)
 
     emit:
-    vcfs = germline_vcfs_with_tbis // post processed vcfs
-
+    vcfs = GERMLINE_VCFS_CONCAT_SORT.out.vcf.join(TABIX_GERMLINE_VCFS_CONCAT_SORT.out.tbi) // post-processed VCFs: [ meta, vcf, tbi ]
     versions // channel: [ versions.yml ]
 }
diff --git a/workflows/sarek/main.nf b/workflows/sarek/main.nf
index 5062470373..30fcb1985c 100644
--- a/workflows/sarek/main.nf
+++ b/workflows/sarek/main.nf
@@ -794,7 +794,8 @@ workflow SAREK {
 
         // POST VARIANTCALLING
         POST_VARIANTCALLING(BAM_VARIANT_CALLING_GERMLINE_ALL.out.vcf_all,
-                            params.concatenate_vcfs)
+                            fasta,
+                            params.concatenate_vcfs)
 
         // Gather vcf files for annotation and QC
         vcf_to_annotate = Channel.empty()