From e9ff17c2cea26896b49759e9799d2df0e180a22a Mon Sep 17 00:00:00 2001 From: Felix Lenner <52530259+fellen31@users.noreply.github.com> Date: Tue, 19 Nov 2024 08:27:36 +0100 Subject: [PATCH] Filter SNVs, INDELs, CNVs and SVs (#496) * Filter variants * CHANGELOG * Update CHANGELOG.md * Update assets/schema_hgnc_ids.json * Update subworkflows/local/filter_variants/main.nf Co-authored-by: Daniel Schmitz * merge and review suggestions * review suggestions * Update subworkflows/local/filter_variants/main.nf Co-authored-by: Anders Jemt * Review suggestions --------- Co-authored-by: Daniel Schmitz Co-authored-by: Anders Jemt --- CHANGELOG.md | 35 ++-- README.md | 6 +- assets/schema_hgnc_ids.json | 26 +++ conf/modules/filter_variants.config | 77 +++++++++ conf/test.config | 3 +- docs/index.md | 6 +- docs/output.md | 34 +++- docs/parameters.md | 7 +- docs/usage.md | 20 +++ modules.json | 5 + .../ensemblvep/filtervep/environment.yml | 5 + modules/nf-core/ensemblvep/filtervep/main.nf | 49 ++++++ modules/nf-core/ensemblvep/filtervep/meta.yml | 50 ++++++ .../ensemblvep/filtervep/tests/main.nf.test | 136 ++++++++++++++++ .../filtervep/tests/main.nf.test.snap | 26 +++ .../filtervep/tests/nextflow.config | 10 ++ .../ensemblvep/filtervep/tests/tab.gz.config | 24 +++ .../ensemblvep/filtervep/tests/tags.yml | 2 + .../ensemblvep/filtervep/tests/vcf.config | 23 +++ nextflow.config | 6 +- nextflow_schema.json | 20 ++- subworkflows/local/filter_variants/main.nf | 39 +++++ .../local/filter_variants/tests/main.nf.test | 150 ++++++++++++++++++ .../filter_variants/tests/main.nf.test.snap | 33 ++++ .../filter_variants/tests/nextflow.config | 80 ++++++++++ tests/samplesheet.nf.test | 2 +- tests/samplesheet.nf.test.snap | 24 ++- tests/samplesheet_multisample_bam.nf.test | 2 +- .../samplesheet_multisample_bam.nf.test.snap | 24 ++- tests/samplesheet_multisample_ont_bam.nf.test | 2 +- ...mplesheet_multisample_ont_bam.nf.test.snap | 24 ++- workflows/nallo.nf | 46 +++++- 32 files changed, 958 
insertions(+), 38 deletions(-) create mode 100644 assets/schema_hgnc_ids.json create mode 100644 conf/modules/filter_variants.config create mode 100644 modules/nf-core/ensemblvep/filtervep/environment.yml create mode 100644 modules/nf-core/ensemblvep/filtervep/main.nf create mode 100644 modules/nf-core/ensemblvep/filtervep/meta.yml create mode 100644 modules/nf-core/ensemblvep/filtervep/tests/main.nf.test create mode 100644 modules/nf-core/ensemblvep/filtervep/tests/main.nf.test.snap create mode 100644 modules/nf-core/ensemblvep/filtervep/tests/nextflow.config create mode 100644 modules/nf-core/ensemblvep/filtervep/tests/tab.gz.config create mode 100644 modules/nf-core/ensemblvep/filtervep/tests/tags.yml create mode 100644 modules/nf-core/ensemblvep/filtervep/tests/vcf.config create mode 100644 subworkflows/local/filter_variants/main.nf create mode 100644 subworkflows/local/filter_variants/tests/main.nf.test create mode 100644 subworkflows/local/filter_variants/tests/main.nf.test.snap create mode 100644 subworkflows/local/filter_variants/tests/nextflow.config diff --git a/CHANGELOG.md b/CHANGELOG.md index 63c0d1e8..6cfad570 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -35,6 +35,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#451](https://github.com/genomic-medicine-sweden/nallo/pull/451) - Added support for running methylation subworkflow without phasing - [#451](https://github.com/genomic-medicine-sweden/nallo/pull/451) - Added nf-test to methylation - [#491](https://github.com/genomic-medicine-sweden/nallo/pull/491) - Added a changelog reminder action +- [#496](https://github.com/genomic-medicine-sweden/nallo/pull/496) - Added a subworkflow to filter variants ### `Changed` @@ -132,6 +133,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | `--validationSkipDuplicateCheck` | | | `--validationS3PathCheck` | | | `--monochromeLogs` | `--monochrome_logs` | +| | `--filter_variants_hgnc_ids` 
| +| | `--filter_snvs_expression` | +| | `--filter_svs_expression` | | `--skip_short_variant_calling` | `--skip_snv_calling` | | `--skip_assembly_wf` | `--skip_genome_assembly` | | `--skip_mapping_wf` | `--skip_alignment` | @@ -159,21 +163,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Module updates -| Tool | Old version | New version | -| -------------- | ----------- | ----------- | -| fqcrs | 0.1.0 | -| severus | | 1.1 | -| longphase  |   | 1.7.3   | -| genmod | 3.8.2 | 3.9 | -| WhatsHap | 2.2 | 2.3 | -| SVDB | | 2.8.1 | -| hifiasm | 0.19.8 | 0.20.0 | -| HiFiCNV | 0.1.7 | 1.0.0 | -| samtools/faidx | 1.2 | 1.21 | -| samtools/index | 1.2 | 1.21 | -| samtools/merge | 1.2 | 1.21 | -| stranger | 0.9.1 | 0.9.2 | -| multiqc | 1.21 | 1.25.1 | +| Tool | Old version | New version | +| --------------------- | ----------- | ----------- | +| fqcrs | 0.1.0 | +| severus | | 1.1 | +| longphase  |   | 1.7.3   | +| genmod | 3.8.2 | 3.9 | +| WhatsHap | 2.2 | 2.3 | +| SVDB | | 2.8.2 | +| hifiasm | 0.19.8 | 0.20.0 | +| HiFiCNV | 0.1.7 | 1.0.0 | +| samtools/faidx | 1.2 | 1.21 | +| samtools/index | 1.2 | 1.21 | +| samtools/merge | 1.2 | 1.21 | +| stranger | 0.9.1 | 0.9.2 | +| multiqc | 1.21 | 1.25.1 | +| ensemblvep/filter_vep | | 113 | > [!NOTE] > Version has been updated if both old and new version information is present. diff --git a/README.md b/README.md index ef9224ed..42e1f721 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,11 @@ ##### Ranking -- Rank SNVs, INDELs and SVs with [GENMOD](https://github.com/Clinical-Genomics/genmod) +- Rank SNVs, INDELs, SVs and CNVs with [GENMOD](https://github.com/Clinical-Genomics/genmod) + +##### Filtering + +- Filter SNVs, INDELs, SVs and CNVs with [filter_vep](https://www.ensembl.org/vep) and [bcftools view](https://samtools.github.io/bcftools/bcftools.html). 
## Usage diff --git a/assets/schema_hgnc_ids.json b/assets/schema_hgnc_ids.json new file mode 100644 index 00000000..2f3e64e1 --- /dev/null +++ b/assets/schema_hgnc_ids.json @@ -0,0 +1,26 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://raw.githubusercontent.com/genomic-medicine-sweden/nallo/master/assets/schema_hgnc_ids.json", + "title": "genomic-medicine-sweden/nallo pipeline - params.filter_variants_hgnc_ids schema", + "description": "Schema for the file provided with params.filter_variants_hgnc_ids", + "type": "array", + "items": { + "type": "object", + "properties": { + "hgnc_id": { + "oneOf": [ + { + "type": "string", + "pattern": "^\\S+$" + }, + { + "type": "integer" + } + ], + "exists": true, + "errorMessage": "HGNC IDs must exist with a header line `hgnc_id`, then one HGNC ID per line, either as e.g. `4826` or `HGNC:4826`." + } + }, + "required": ["hgnc_id"] + } +} diff --git a/conf/modules/filter_variants.config b/conf/modules/filter_variants.config new file mode 100644 index 00000000..b709aa59 --- /dev/null +++ b/conf/modules/filter_variants.config @@ -0,0 +1,77 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. 
+---------------------------------------------------------------------------------------- +*/ + +process { + + /* + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Filter variants + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ + + withName: '.*:FILTER_VARIANTS_SNV:.*' { + publishDir = [ + enabled: false, + ] + } + + withName: '.*:FILTER_VARIANTS_SNVS:ENSEMBLVEP_FILTERVEP' { + ext.args = { "--filter \"HGNC_ID in ${feature_file}\"" } + publishDir = [ + enabled: false, + ] + } + + withName: '.*:FILTER_VARIANTS_SVS:ENSEMBLVEP_FILTERVEP' { + ext.args = { "--filter \"HGNC_ID in ${feature_file}\"" } + publishDir = [ + enabled: false, + ] + } + + withName: '.*:FILTER_VARIANTS_SNVS:BCFTOOLS_VIEW' { + ext.prefix = { params.skip_snv_annotation ? "${meta.id}_snvs_filtered" : (params.skip_rank_variants ? "${meta.id}_snvs_annotated_filtered" : "${meta.id}_snvs_annotated_ranked_filtered") } + ext.args = { [ + '--output-type z', + '--write-index=tbi', + "${params.filter_snvs_expression}" + ].join(" ") } + publishDir = [ + path: { "${params.outdir}/snvs/multi_sample/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: '.*:FILTER_VARIANTS_SVS:BCFTOOLS_VIEW' { + ext.prefix = { + def parts = [] + parts << "${meta.id}" + parts << (params.skip_cnv_calling ? 'svs_merged' : 'svs_cnvs_merged') + if (!params.skip_sv_annotation) parts << 'annotated' + if (!params.skip_rank_variants) parts << 'ranked' + parts << 'filtered' + return parts.join('_') + } + ext.args = { [ + '--output-type z', + '--write-index=tbi', + "${params.filter_svs_expression}" + ].join(" ") } + publishDir = [ + path: { "${params.outdir}/svs/family/${meta.id}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + +} diff --git a/conf/test.config b/conf/test.config index 86a53776..ec51373c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -18,12 +18,13 @@ params { modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' // Base directory for genomic-medicine-sweden/nallo test data - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/ba720cd29322036d966ab3e4bc4c3d03e1731af5/' // References fasta = params.pipelines_testdata_base_path + 'reference/hg38.test.fa.gz' input = params.pipelines_testdata_base_path + 'testdata/samplesheet.csv' target_regions = params.pipelines_testdata_base_path + 'reference/test_data.bed' + filter_variants_hgnc_ids = params.pipelines_testdata_base_path + 'testdata/hgnc_ids.tsv' hificnv_expected_xy_cn = params.pipelines_testdata_base_path + 'reference/expected_cn.hg38.XY.bed' hificnv_expected_xx_cn = params.pipelines_testdata_base_path + 'reference/expected_cn.hg38.XX.bed' hificnv_excluded_regions = params.pipelines_testdata_base_path + 'reference/empty.bed' diff --git a/docs/index.md b/docs/index.md index 0b06c33e..ecce661c 100644 --- a/docs/index.md +++ b/docs/index.md @@ -44,7 +44,11 @@ description: A bioinformatics analysis pipeline for long-reads from both PacBio ### Ranking -- Rank SNVs with [GENMOD](https://github.com/Clinical-Genomics/genmod) +- Rank SNVs, INDELs, SVs and CNVs with [GENMOD](https://github.com/Clinical-Genomics/genmod) + +### Filtering + +- Filter SNVs, INDELs, SVs and CNVs with [filter_vep](https://www.ensembl.org/vep) and [bcftools view](https://samtools.github.io/bcftools/bcftools.html). 
## Usage diff --git a/docs/output.md b/docs/output.md index 0cf4f471..cc2f4b14 100644 --- a/docs/output.md +++ b/docs/output.md @@ -215,6 +215,21 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin | `snvs/family/{family}/{family}_snv_annotated_ranked.vcf.gz` | VCF file with annotated and ranked variants per family | | `snvs/family/{family}/{family}_snv_annotated_ranked.vcf.gz.tbi` | Index of the ranked VCF file | +[filter_vep](https://www.ensembl.org/vep) and [bcftools view](https://samtools.github.io/bcftools/bcftools.html) can be used to filter variants. + +!!!note + + Variants are only output if either `--filter_variants_hgnc_ids` or `--filter_snvs_expression` has been used, and only family VCFs are output. + +!!!tip + + Filtered variants are output alongside unfiltered variants as additional files. + +| Path | Description | +| ---------------------------------------------- | -------------------------------------------- | +| `snvs/{family}/{family}_*_filtered.vcf.gz` | VCF file with filtered variants for a family | +| `snvs/{family}/{family}_*_filtered.vcf.gz.tbi` | Index of the filtered VCF file | + ### SVs (and CNVs) [Severus](https://github.com/KolmogorovLab/Severus) or [Sniffles](https://github.com/fritzsedlazeck/Sniffles) is used to call structural variants. @@ -228,9 +243,7 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin !!!note - Due to the complexity of SV merging strategies, SVs and CNVs are reported per family rather than per project. - SV and CNV calls are output unmerged per sample, while the family files are first merged between samples for SVs and CNVs separately, - then the merged SV and CNV files are merged again, with priority given to coordinates from the SV calls.
+ SV and CNV calls are output unmerged per sample, while the family files are first merged between samples for SVs and CNVs separately, then the merged SV and CNV files are merged again, with priority given to coordinates from the SV calls. | Path | Description | | --------------------------------------------------------------- | ------------------------------------------------------------------ | @@ -261,6 +274,21 @@ If the pipeline is run with phasing, the aligned reads will be happlotagged usin | `svs/family/{family_id}/{family_id}_svs_merged_annotated_ranked.vcf.gz` | VCF file with merged, annotated and ranked SVs per family (output if CNV-calling is off) | | `svs/family/{family_id}/{family_id}_svs_merged_annotated_ranked.vcf.gz.tbi` | Index of the merged VCF file | +[filter_vep](https://www.ensembl.org/vep) and [bcftools view](https://samtools.github.io/bcftools/bcftools.html) can be used to filter variants. + +!!!note + + Variants are only output if either `--filter_variants_hgnc_ids` or `--filter_svs_expression` has been used, and only family variants are output. + +!!!tip + + Filtered variants are output alongside unfiltered variants as additional files. + +| Path | Description | +| ---------------------------------------------------- | -------------------------------------------- | +| `svs/family/{family}/{family}_*_filtered.vcf.gz` | VCF file with filtered variants for a family | +| `svs/family/{family}/{family}_*_filtered.vcf.gz.tbi` | Index of the filtered VCF file | + ## Visualization Tracks [HiFiCNV](https://github.com/PacificBiosciences/HiFiCNV) is used to call CNVs, but it also produces copy number, depth, and MAF tracks that can be visualized in for example IGV. diff --git a/docs/parameters.md b/docs/parameters.md index 1956fe46..c843fa98 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -51,7 +51,7 @@ Define where the pipeline should find input data and save output data.
| `genmod_score_config_snvs` | A SNV rank model config file for genmod. | `string` | | | | | `genmod_score_config_svs` | A SV rank model config file for genmod. | `string` | | | | | `somalier_sites` | A VCF of known polymorphic sites for somalier | `string` | | | | -| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/ | | True | +| `pipelines_testdata_base_path` | Base URL or local path to location of pipeline test dataset files | `string` | https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/ba720cd29322036d966ab3e4bc4c3d03e1731af5/ | | True | ## Reference genome options @@ -106,7 +106,10 @@ Workflow options specific to genomic-medicine-sweden/nallo | `alignment_processes` | If alignment_processes is bigger than 1, input files will be split and aligned in parallel to reduce processing time. | `integer` | 8 | | | | `snv_calling_processes` | If snv_calling_processes is bigger than 1, short variant calling will be done in parallel to reduce processing time. | `integer` | 13 | | | | `vep_cache_version` | VEP cache version | `integer` | 110 | | | -| `vep_plugin_files` | A csv file with vep_plugin_files as header, and then paths to vep plugin files. Paths to pLI_values.txt and LoFtool_scores.txt are required. | `string` | | | | +| `vep_plugin_files` | A csv file with vep_files as header, and then paths to vep plugin files. Paths to pLI_values.txt and LoFtool_scores.txt are required. | `string` | | | | +| `filter_variants_hgnc_ids` | A tsv/csv file with a `#hgnc_ids` column header, and then one numerical HGNC ID per row. E.g. `4281`, not `HGNC:4281`. | `string` | | | | +| `filter_snvs_expression` | An expression that is passed to bcftools view to filter SNVs, e.g. 
--filter_snvs_expression "-e 'INFO/AQ>60'" | `string` | | | | +| `filter_svs_expression` | An expression that is passed to bcftools view to filter SVs, e.g. --filter_svs_expression "-e 'INFO/AQ>60'" | `string` | | | | | `deepvariant_model_type` | Sets the model type used for DeepVariant. This is set automatically using `--preset` by default. | `string` | PACBIO | | True | | `minimap2_read_mapping_preset` | Sets the minimap2-preset (-x) for read alignment. This is set automatically using the pipeline `--preset` by default. | `string` | map-hifi | | True | | `extra_modkit_options` | Extra options to modkit, used for test profile. | `string` | | | True | diff --git a/docs/usage.md b/docs/usage.md index 53def5eb..84786bb7 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -309,6 +309,26 @@ This subworkflow ranks SVs, and relies on the mapping, SV calling and SV annotat `--skip_rank_variants`. +#### Filter variants + +SNVs and INDELs, and SVs and CNVs can be filtered using [filter_vep](https://www.ensembl.org/vep) and [bcftools view](https://samtools.github.io/bcftools/bcftools.html). + +| Parameter | Description | +| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `filter_variants_hgnc_ids` 1 |  Used by filter_vep to filter variants on HGNC IDs. Requires a tsv/bed file with a `#hgnc_ids` column with one numerical HGNC ID per row. E.g. `4281`, not `HGNC:4281`. | + +1 Example file for input with `--filter_variants_hgnc_ids`: + +``` +#hgnc_id +4865 +14150 +``` + +To pass filters to bcftools view, use `--filter_snvs_expression` and `--filter_svs_expression`. E.g `--filter_snvs_expression "-e 'INFO/AQ>60'"`. + +Filtering of variants only happens if any of these three parameters is active. + ## Other highlighted parameters - Limit SNV calling to regions in BED file (`--target_bed`). 
diff --git a/modules.json b/modules.json index 47a9ac33..40ce3646 100644 --- a/modules.json +++ b/modules.json @@ -87,6 +87,11 @@ "git_sha": "2f9a5431355897e299cb41928c45f51ea8410c42", "installed_by": ["modules"] }, + "ensemblvep/filtervep": { + "branch": "master", + "git_sha": "6e3585d9ad20b41adc7d271009f8cb5e191ecab4", + "installed_by": ["modules"] + }, "ensemblvep/vep": { "branch": "master", "git_sha": "6e3585d9ad20b41adc7d271009f8cb5e191ecab4", diff --git a/modules/nf-core/ensemblvep/filtervep/environment.yml b/modules/nf-core/ensemblvep/filtervep/environment.yml new file mode 100644 index 00000000..3d36eb17 --- /dev/null +++ b/modules/nf-core/ensemblvep/filtervep/environment.yml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::ensembl-vep=113.0 diff --git a/modules/nf-core/ensemblvep/filtervep/main.nf b/modules/nf-core/ensemblvep/filtervep/main.nf new file mode 100644 index 00000000..69245df0 --- /dev/null +++ b/modules/nf-core/ensemblvep/filtervep/main.nf @@ -0,0 +1,49 @@ +process ENSEMBLVEP_FILTERVEP { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/ensembl-vep:113.0--pl5321h2a3209d_0' : + 'biocontainers/ensembl-vep:113.0--pl5321h2a3209d_0' }" + + input: + tuple val(meta), path(input) + path (feature_file) + + output: + tuple val(meta), path("*.${extension}"), emit: output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + extension = task.ext.suffix ?: "vcf" + """ + filter_vep \\ + $args \\ + --input_file $input \\ + --output_file ${prefix}.${extension} \\ + --only_matched + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + extension = task.ext.suffix ?: "vcf" + """ + touch ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + ensemblvep: \$( echo \$(vep --help 2>&1) | sed 's/^.*Versions:.*ensembl-vep : //;s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/ensemblvep/filtervep/meta.yml b/modules/nf-core/ensemblvep/filtervep/meta.yml new file mode 100644 index 00000000..a73e3b7a --- /dev/null +++ b/modules/nf-core/ensemblvep/filtervep/meta.yml @@ -0,0 +1,50 @@ +name: ensemblvep_filtervep +description: Filter variants based on Ensembl Variant Effect Predictor (VEP) annotations. +keywords: + - annotation + - vcf + - tab + - filter +tools: + - ensemblvep: + description: | + VEP determines the effect of your variants (SNPs, insertions, deletions, CNVs + or structural variants) on genes, transcripts, and protein sequence, as well as regulatory regions. 
+ homepage: https://www.ensembl.org/info/docs/tools/vep/index.html + documentation: https://www.ensembl.org/info/docs/tools/vep/script/index.html + licence: ["Apache-2.0"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - input: + type: file + description: VCF/TAB file annotated with vep + pattern: "*.{vcf,tab,tsv,txt}" + - - feature_file: + type: file + description: File containing features on separate lines. To be used with --filter + option. +output: + - output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'test', single_end:false ]` + - "*.${extension}": + type: file + description: VCF/TAB file + pattern: "*.{vcf,tab,txt,tsv}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@ramprasadn" +maintainers: + - "@ramprasadn" diff --git a/modules/nf-core/ensemblvep/filtervep/tests/main.nf.test b/modules/nf-core/ensemblvep/filtervep/tests/main.nf.test new file mode 100644 index 00000000..7147792f --- /dev/null +++ b/modules/nf-core/ensemblvep/filtervep/tests/main.nf.test @@ -0,0 +1,136 @@ +nextflow_process { + + name "Test Process ENSEMBLVEP_FILTERVEP" + script "../main.nf" + process "ENSEMBLVEP_FILTERVEP" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "ensemblvep" + tag "ensemblvep/vep" + tag "ensemblvep/filtervep" + tag "ensemblvep/download" + + // Test for filtering VCF file + test("test_ensemblvep_filtervep_vcf") { + config "./vcf.config" + + setup { + run("ENSEMBLVEP_DOWNLOAD") { + script "../../download/main.nf" + + process { + """ + input[0] = Channel.of([ + [id:"113_WBcel235"], + params.vep_genome, + params.vep_species, + params.vep_cache_version + ]) + """ + } + } + run("ENSEMBLVEP_VEP") { + script "../../vep/main.nf" + + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + 
file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true), + [] + ]) + input[1] = params.vep_genome + input[2] = params.vep_species + input[3] = params.vep_cache_version + input[4] = ENSEMBLVEP_DOWNLOAD.out.cache.map{ meta, cache -> [cache] } + input[5] = Channel.value([ + [id:"fasta"], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[6] = [] + """ + } + } + } + + when { + process { + """ + input[0] = ENSEMBLVEP_VEP.out.vcf + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert path(process.out.output.get(0).get(1)).readLines().first().contains("##fileformat=VCFv4.2") } + ) + } + } + + // Test for filtering TAB file + test("test_ensemblvep_filtervep_tab_gz") { + config "./tab.gz.config" + + setup { + run("ENSEMBLVEP_DOWNLOAD") { + script "../../download/main.nf" + + process { + """ + input[0] = Channel.of([ + [id:"113_WBcel235"], + params.vep_genome, + params.vep_species, + params.vep_cache_version + ]) + """ + } + } + run("ENSEMBLVEP_VEP") { + script "../../vep/main.nf" + + process { + """ + input[0] = Channel.of([ + [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/vcf/test.vcf', checkIfExists: true), + [] + ]) + input[1] = params.vep_genome + input[2] = params.vep_species + input[3] = params.vep_cache_version + input[4] = ENSEMBLVEP_DOWNLOAD.out.cache.map{ meta, cache -> [cache] } + input[5] = Channel.value([ + [id:"fasta"], + file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/genome.fasta', checkIfExists: true) + ]) + input[6] = [] + """ + } + } + } + + when { + process { + """ + input[0] = ENSEMBLVEP_VEP.out.tab + input[1] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match() }, + { assert 
path(process.out.output.get(0).get(1)).readLines().first().contains("## ENSEMBL VARIANT EFFECT PREDICTOR v113.0") } + ) + } + } +} diff --git a/modules/nf-core/ensemblvep/filtervep/tests/main.nf.test.snap b/modules/nf-core/ensemblvep/filtervep/tests/main.nf.test.snap new file mode 100644 index 00000000..065d747b --- /dev/null +++ b/modules/nf-core/ensemblvep/filtervep/tests/main.nf.test.snap @@ -0,0 +1,26 @@ +{ + "test_ensemblvep_filtervep_vcf": { + "content": [ + [ + "versions.yml:md5,1e8906572b04dd21d8c6973efac773c6" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-21T09:10:47.874831491" + }, + "test_ensemblvep_filtervep_tab_gz": { + "content": [ + [ + "versions.yml:md5,1e8906572b04dd21d8c6973efac773c6" + ] + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-10-21T09:11:18.765461163" + } +} \ No newline at end of file diff --git a/modules/nf-core/ensemblvep/filtervep/tests/nextflow.config b/modules/nf-core/ensemblvep/filtervep/tests/nextflow.config new file mode 100644 index 00000000..40b3a3bd --- /dev/null +++ b/modules/nf-core/ensemblvep/filtervep/tests/nextflow.config @@ -0,0 +1,10 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ +params { + vep_cache_version = "113" + vep_genome = "WBcel235" + vep_species = "caenorhabditis_elegans" +} diff --git a/modules/nf-core/ensemblvep/filtervep/tests/tab.gz.config b/modules/nf-core/ensemblvep/filtervep/tests/tab.gz.config new file mode 100644 index 00000000..cdad2d94 --- /dev/null +++ b/modules/nf-core/ensemblvep/filtervep/tests/tab.gz.config @@ -0,0 +1,24 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running tests 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: ENSEMBLVEP_DOWNLOAD { + ext.args = '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE' + } + + withName: ENSEMBLVEP_VEP { + ext.args = '--tab' + ext.prefix = { "${meta.id}_vep" } + } + + withName: ENSEMBLVEP_FILTERVEP { + ext.args = '--filter "Feature_type is Transcript"' + ext.suffix = "tab" + } +} diff --git a/modules/nf-core/ensemblvep/filtervep/tests/tags.yml b/modules/nf-core/ensemblvep/filtervep/tests/tags.yml new file mode 100644 index 00000000..b43bf40d --- /dev/null +++ b/modules/nf-core/ensemblvep/filtervep/tests/tags.yml @@ -0,0 +1,2 @@ +ensemblvep/filtervep: + - "modules/nf-core/ensemblvep/filtervep/**" diff --git a/modules/nf-core/ensemblvep/filtervep/tests/vcf.config b/modules/nf-core/ensemblvep/filtervep/tests/vcf.config new file mode 100644 index 00000000..ee2aef57 --- /dev/null +++ b/modules/nf-core/ensemblvep/filtervep/tests/vcf.config @@ -0,0 +1,23 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +process { + + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: ENSEMBLVEP_DOWNLOAD { + ext.args = '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE' + } + + withName: ENSEMBLVEP_VEP { + ext.args = '--vcf' + ext.prefix = { "${meta.id}_vep" } + } + + withName: ENSEMBLVEP_FILTERVEP { + ext.args = '--filter "Feature_type is Transcript"' + } +} diff --git a/nextflow.config b/nextflow.config index cfcf60e4..f35e534b 100644 --- a/nextflow.config +++ b/nextflow.config @@ -14,6 +14,7 @@ params { target_regions = null cadd_resources = null 
cadd_prescored_indels = null + filter_variants_hgnc_ids = null par_regions = null tandem_repeats = null trgt_repeats = null @@ -49,6 +50,8 @@ params { deepvariant_model_type = params.preset == 'ONT_R10' ? 'ONT_R104' : 'PACBIO' minimap2_read_mapping_preset = params.preset == 'ONT_R10' ? 'lr:hq' : 'map-hifi' + filter_snvs_expression = '' + filter_svs_expression = '' phaser = 'longphase' sv_caller = 'severus' preset = 'revio' @@ -88,7 +91,7 @@ params { help_full = false show_hidden = false version = false - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/ba720cd29322036d966ab3e4bc4c3d03e1731af5/' // Config options config_profile_name = null @@ -325,6 +328,7 @@ includeConfig 'conf/modules/call_paralogs.config' includeConfig 'conf/modules/call_repeat_expansions.config' includeConfig 'conf/modules/call_svs.config' includeConfig 'conf/modules/convert_input_files.config' +includeConfig 'conf/modules/filter_variants.config' includeConfig 'conf/modules/assembly_variant_calling.config' includeConfig 'conf/modules/genome_assembly.config' includeConfig 'conf/modules/methylation.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index 5a039737..f283d38f 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -257,7 +257,7 @@ "type": "string", "fa_icon": "far fa-check-circle", "description": "Base URL or local path to location of pipeline test dataset files", - "default": "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/", + "default": "https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/ba720cd29322036d966ab3e4bc4c3d03e1731af5/", "hidden": true } } @@ -468,6 +468,24 @@ "description": "A csv file with vep_files as header, and then paths to vep plugin files. 
Paths to pLI_values.txt and LoFtool_scores.txt are required.", "schema": "assets/vep_plugin_files_schema.json" }, + "filter_variants_hgnc_ids": { + "type": "string", + "pattern": "^\\S+\\.(csv|tsv)$", + "description": "A tsv/csv file with a `hgnc_id` column header, and then one HGNC ID per row. E.g. `4281` or `HGNC:4281`.", + "format": "file-path", + "exists": true, + "schema": "assets/schema_hgnc_ids.json" + }, + "filter_snvs_expression": { + "type": "string", + "default": "", + "description": "An expression that is passed to bcftools view to filter SNVs, e.g. --filter_snvs_expression \"-e 'INFO/AQ>60'\"" + }, + "filter_svs_expression": { + "type": "string", + "default": "", + "description": "An expression that is passed to bcftools view to filter SVs, e.g. --filter_svs_expression \"-e 'INFO/AQ>60'\"" + }, "deepvariant_model_type": { "type": "string", "default": "PACBIO", diff --git a/subworkflows/local/filter_variants/main.nf b/subworkflows/local/filter_variants/main.nf new file mode 100644 index 00000000..dd87ddf9 --- /dev/null +++ b/subworkflows/local/filter_variants/main.nf @@ -0,0 +1,39 @@ +include { ENSEMBLVEP_FILTERVEP } from '../../../modules/nf-core/ensemblvep/filtervep/main' +include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main' +include { ENSEMBLVEP_VEP } from '../../../modules/nf-core/ensemblvep/vep/main.nf' + +workflow FILTER_VARIANTS { + + take: + ch_vcf // channel [optional] [ val(meta), path(vcf) ] + ch_hgnc_ids // channel: [optional] [ val(meta), path(txt) ] + filter_hgnc // bool: should filter_vep be run to filter on hgnc ids + + main: + ch_versions = Channel.empty() + + if ( filter_hgnc ) { + + ENSEMBLVEP_FILTERVEP ( + ch_vcf, + ch_hgnc_ids.map { meta, file -> file } + ) + ch_versions = ch_versions.mix(ENSEMBLVEP_FILTERVEP.out.versions) + + ch_vcf = ENSEMBLVEP_FILTERVEP.out.output + } + + BCFTOOLS_VIEW ( + ch_vcf.map { meta, vcf -> [ meta, vcf, [] ] }, + [], + [], + [] + ) + ch_versions = 
ch_versions.mix(BCFTOOLS_VIEW.out.versions) + + emit: + vcf = BCFTOOLS_VIEW.out.vcf // channel: [ val(meta), path(vcf) ] + tbi = BCFTOOLS_VIEW.out.tbi // channel: [ val(meta), path(tbi) ] + versions = ch_versions // channel: [ path(versions.yml) ] +} + diff --git a/subworkflows/local/filter_variants/tests/main.nf.test b/subworkflows/local/filter_variants/tests/main.nf.test new file mode 100644 index 00000000..692cf9ec --- /dev/null +++ b/subworkflows/local/filter_variants/tests/main.nf.test @@ -0,0 +1,150 @@ +nextflow_workflow { + + name "Test Workflow FILTER_VARIANTS" + script "../main.nf" + config "./nextflow.config" + workflow "FILTER_VARIANTS" + + setup { + run("GUNZIP") { + script "../../../../modules/nf-core/gunzip/main.nf" + process { + """ + input[0] = [ + [ id:'hg38' ], + file(params.pipelines_testdata_base_path + 'reference/hg38.test.fa.gz', checkIfExists: true) + ] + """ + } + } + run("SAMTOOLS_FAIDX") { + script "../../../../modules/nf-core/samtools/faidx/main.nf" + process { + """ + input[0] = GUNZIP.out.gunzip + input[1] = [[],[]] + """ + } + } + + run("MINIMAP2_ALIGN") { + script "../../../../modules/nf-core/minimap2/align/main.nf" + process { + """ + input[0] = [ + [ id: 'test', num_intervals:1 ], + file(params.pipelines_testdata_base_path + 'testdata/HG002_PacBio_Revio.fastq.gz', checkIfExists: true) + ] + input[1] = GUNZIP.out.gunzip + input[2] = true + input[3] = 'csi' + input[4] = false + input[5] = false + """ + } + } + + run("SHORT_VARIANT_CALLING") { + script "../../short_variant_calling/main.nf" + process { + """ + input[0] = MINIMAP2_ALIGN.out.bam + .join(MINIMAP2_ALIGN.out.index) + .join(Channel.of([ + [ id: 'test', num_intervals:1 ], + file(params.pipelines_testdata_base_path + 'reference/test_data.bed', checkIfExists: true) + ])) + input[1] = GUNZIP.out.gunzip + input[2] = SAMTOOLS_FAIDX.out.fai + input[3] = [[],[]] + input[4] = [[],[]] + """ + } + } + run("UNTAR") { + script "../../../../modules/nf-core/untar/main.nf" + process { + """
+ input[0] = [ + [ id: 'vep_cache' ], + file(params.pipelines_testdata_base_path + 'reference/vep_cache_test_data.tar.gz', checkIfExists:true) + ] + """ + } + } + run("SNV_ANNOTATION") { + script "../../snv_annotation/main.nf" + process { + """ + input[0] = SHORT_VARIANT_CALLING.out.combined_bcf + input[1] = [ + file(params.pipelines_testdata_base_path + 'reference/cadd.v1.6.hg38.test_data.zip', checkIfExists: true) + ] + input[2] = GUNZIP.out.gunzip + input[3] = SAMTOOLS_FAIDX.out.fai + input[4] = UNTAR.out.untar.map { meta, cache -> cache} + input[5] = Channel.value('110') + input[6] = Channel.of([ + file(params.pipelines_testdata_base_path + 'reference/vep_plugin_files.csv', checkIfExists: true) + ]).splitCsv(header:true).map { row -> row.vep_files }.collect() + input[7] = false + input[8] = Channel.value([]) + input[9] = null + input[10] = null + """ + } + } + } + + test("vcf, [[],[]], false") { + + when { + workflow { + """ + input[0] = SNV_ANNOTATION.out.vcf + input[1] = Channel.of([[],[]]) + input[2] = false + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.versions, + file(workflow.out.tbi.get(0).get(1)).name, + path(workflow.out.vcf.get(0).get(1)).vcf.variantsMD5, + path(workflow.out.vcf.get(0).get(1)).vcf.summary, + ).match() } + ) + } + } + + test("vcf, hgnc_ids, true") { + tag "hgnc" + when { + workflow { + """ + input[0] = SNV_ANNOTATION.out.vcf + input[1] = Channel.of('HGNC:4826') + .collectFile(name: 'hgnc_ids.txt') + .map { file -> [ [ id: 'hgnc_ids' ], file ] } + input[2] = true + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.versions, + file(workflow.out.tbi.get(0).get(1)).name, + path(workflow.out.vcf.get(0).get(1)).vcf.variantsMD5, + path(workflow.out.vcf.get(0).get(1)).vcf.summary, + ).match() } + ) + } + } +} diff --git a/subworkflows/local/filter_variants/tests/main.nf.test.snap 
b/subworkflows/local/filter_variants/tests/main.nf.test.snap new file mode 100644 index 00000000..75153877 --- /dev/null +++ b/subworkflows/local/filter_variants/tests/main.nf.test.snap @@ -0,0 +1,33 @@ +{ + "vcf, [[],[]], false": { + "content": [ + [ + "versions.yml:md5,c6563ccec6867a29e00bdbb19abee900" + ], + "test_data.bed_filtered.vcf.gz.tbi", + "28024aed73c5c0fd09c14cf32008a335", + "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=80, phased=false, phasedAutodetect=false]" + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-11-06T15:09:53.145300431" + }, + "vcf, hgnc_ids, true": { + "content": [ + [ + "versions.yml:md5,24b641f905e26c567657e04d22f2e337", + "versions.yml:md5,c6563ccec6867a29e00bdbb19abee900" + ], + "test_data.bed_filtered.vcf.gz.tbi", + "77c14f11bd3af0802b1736cd87b949eb", + "VcfFile [chromosomes=[chr16], sampleCount=1, variantCount=49, phased=false, phasedAutodetect=false]" + ], + "meta": { + "nf-test": "0.9.0", + "nextflow": "24.04.4" + }, + "timestamp": "2024-11-06T16:37:39.828165612" + } +} \ No newline at end of file diff --git a/subworkflows/local/filter_variants/tests/nextflow.config b/subworkflows/local/filter_variants/tests/nextflow.config new file mode 100644 index 00000000..b9fd0da1 --- /dev/null +++ b/subworkflows/local/filter_variants/tests/nextflow.config @@ -0,0 +1,80 @@ +process { + withName: 'SHORT_VARIANT_CALLING:DEEPVARIANT' { + ext.prefix = { intervals ? 
"${meta.id}_${intervals}_deepvariant" : "${meta.id}_deepvariant" } + ext.args = { [ + '--model_type PACBIO', + "--sample_name=${meta.id}", + '-vcf_stats_report=False' + ].join(' ') } + } + + withName: 'SHORT_VARIANT_CALLING:GLNEXUS' { + ext.args = '--config DeepVariant_unfiltered' + } + + withName: 'SHORT_VARIANT_CALLING:BCFTOOLS_CONCAT' { + ext.prefix = { "${meta.id}_concat" } + ext.args = [ + '--no-version', + '--allow-overlaps' + ].join(' ') + } + + withName: 'SHORT_VARIANT_CALLING:BCFTOOLS_NORM_MULTISAMPLE' { + ext.prefix = { "${meta.id}_norm_multisample" } + ext.args = [ + '--no-version', + '-m -', + '--output-type u', + '--write-index=csi', + '-w 10000' + ].join(' ') + } + + withName: 'SHORT_VARIANT_CALLING:BCFTOOLS_NORM_SINGLESAMPLE' { + ext.prefix = { "${meta.id}_norm_singlesample" } + ext.args = [ + '--no-version', + '-m -', + '-w 10000', + '--output-type u', + ].join(' ') + } + + withName: 'SHORT_VARIANT_CALLING:BCFTOOLS_FILLTAGS' { + ext.prefix = { "${meta.id}_ac" } + ext.args = [ + '--no-version', + '--output-type u' + ].join(' ') + } + + withName: 'SNV_ANNOTATION:BCFTOOLS_FILLTAGS_ANNO' { + ext.prefix = { "${meta.id}_filltags_anno" } + ext.args = [ + '--no-version', + '--output-type z' + ].join(' ') + } + + withName: 'UNTAR' { + ext.prefix = { "${name}".contains('merged') ? 
'./vep_cache/homo_sapiens_merged/' : './vep_cache/homo_sapiens/' } + } + + withName: 'SNV_ANNOTATION:ENSEMBLVEP_SNV' { + ext.args = { [ + '--offline', + '--vcf', + '--compress_output bgzip' + ].join(' ') } + } + + withName: 'FILTER_VARIANTS:ENSEMBLVEP_FILTERVEP' { + ext.args = { "--filter \"HGNC_ID in ${feature_file}\"" } + } + + withName: 'FILTER_VARIANTS:BCFTOOLS_VIEW' { + ext.prefix = { "${meta.id}_filtered" } + ext.args = '-e "INFO/AQ>60" --no-version --output-type z --write-index=tbi' + } +} diff --git a/tests/samplesheet.nf.test b/tests/samplesheet.nf.test index 3d4c9648..ea02a959 100644 --- a/tests/samplesheet.nf.test +++ b/tests/samplesheet.nf.test @@ -9,7 +9,7 @@ nextflow_pipeline { when { params { - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/ba720cd29322036d966ab3e4bc4c3d03e1731af5/' input = params.pipelines_testdata_base_path + 'testdata/samplesheet.csv' outdir = "$outputDir" } diff --git a/tests/samplesheet.nf.test.snap b/tests/samplesheet.nf.test.snap index 0060fe6c..dfe4afe1 100644 --- a/tests/samplesheet.nf.test.snap +++ b/tests/samplesheet.nf.test.snap @@ -1,7 +1,7 @@ { "test profile": { "content": [ - 111, + 115, { "ADD_FOUND_IN_TAG": { "bcftools": 1.2, @@ -51,6 +51,9 @@ "BCFTOOLS_STATS": { "bcftools": 1.2 }, + "BCFTOOLS_VIEW": { + "bcftools": 1.2 + }, "BEDTOOLS_MERGE": { "bedtools": "2.31.1" }, @@ -81,6 +84,9 @@ "ECHTVAR_ANNO": { "echtvar": "0.2.0" }, + "ENSEMBLVEP_FILTERVEP": { + "ensemblvep": 113.0 + }, "ENSEMBLVEP_SNV": { "ensemblvep": 110.0 }, @@ -471,6 +477,10 @@ "snvs/family/FAM", "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz", "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz.tbi", + "snvs/multi_sample", + "snvs/multi_sample/FAM", + "snvs/multi_sample/FAM/FAM_snvs_annotated_ranked_filtered.vcf.gz", + 
"snvs/multi_sample/FAM/FAM_snvs_annotated_ranked_filtered.vcf.gz.tbi", "snvs/sample", "snvs/sample/HG002_Revio", "snvs/sample/HG002_Revio/HG002_Revio_snv_annotated_ranked.vcf.gz", @@ -483,6 +493,8 @@ "svs/family/FAM", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi", + "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked_filtered.vcf.gz", + "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked_filtered.vcf.gz.tbi", "svs/sample", "svs/sample/HG002_Revio", "svs/sample/HG002_Revio/HG002_Revio_svs.vcf.gz", @@ -614,6 +626,10 @@ "FAM_snv_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=103, phased=false, phasedAutodetect=false]" ], + [ + "FAM_snvs_annotated_ranked_filtered.vcf.gz", + "VcfFile [chromosomes=[chr16], sampleCount=1, variantCount=54, phased=false, phasedAutodetect=false]" + ], [ "HG002_Revio_snv_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=100, phased=false, phasedAutodetect=false]" @@ -622,6 +638,10 @@ "FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=87, phased=false, phasedAutodetect=false]" ], + [ + "FAM_svs_cnvs_merged_annotated_ranked_filtered.vcf.gz", + "VcfFile [chromosomes=[chr16], sampleCount=1, variantCount=5, phased=false, phasedAutodetect=false]" + ], [ "HG002_Revio_svs.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=55, phased=false, phasedAutodetect=false]" @@ -654,6 +674,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-11-12T16:46:53.528609548" + "timestamp": "2024-11-18T08:53:15.024737825" } } \ No newline at end of file diff --git a/tests/samplesheet_multisample_bam.nf.test b/tests/samplesheet_multisample_bam.nf.test index 4fa19978..3099f149 100644 --- a/tests/samplesheet_multisample_bam.nf.test +++ b/tests/samplesheet_multisample_bam.nf.test @@ -9,7 +9,7 @@ 
nextflow_pipeline { when { params { - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/ba720cd29322036d966ab3e4bc4c3d03e1731af5/' input = params.pipelines_testdata_base_path + 'testdata/samplesheet_multisample_bam.csv' outdir = "$outputDir" phaser = "hiphase" diff --git a/tests/samplesheet_multisample_bam.nf.test.snap b/tests/samplesheet_multisample_bam.nf.test.snap index ebbae97f..e2eb81ef 100644 --- a/tests/samplesheet_multisample_bam.nf.test.snap +++ b/tests/samplesheet_multisample_bam.nf.test.snap @@ -1,7 +1,7 @@ { "samplesheet_multisample_bam | --phaser hiphase": { "content": [ - 154, + 158, { "ADD_FOUND_IN_TAG": { "bcftools": 1.2, @@ -51,6 +51,9 @@ "BCFTOOLS_STATS": { "bcftools": 1.2 }, + "BCFTOOLS_VIEW": { + "bcftools": 1.2 + }, "BEDTOOLS_MERGE": { "bedtools": "2.31.1" }, @@ -81,6 +84,9 @@ "ECHTVAR_ANNO": { "echtvar": "0.2.0" }, + "ENSEMBLVEP_FILTERVEP": { + "ensemblvep": 113.0 + }, "ENSEMBLVEP_SNV": { "ensemblvep": 110.0 }, @@ -549,6 +555,10 @@ "snvs/family/FAM", "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz", "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz.tbi", + "snvs/multi_sample", + "snvs/multi_sample/FAM", + "snvs/multi_sample/FAM/FAM_snvs_annotated_ranked_filtered.vcf.gz", + "snvs/multi_sample/FAM/FAM_snvs_annotated_ranked_filtered.vcf.gz.tbi", "snvs/sample", "snvs/sample/HG002_Revio_A", "snvs/sample/HG002_Revio_A/HG002_Revio_A_snv_annotated_ranked.vcf.gz", @@ -565,6 +575,8 @@ "svs/family/FAM", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi", + "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked_filtered.vcf.gz", + "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked_filtered.vcf.gz.tbi", "svs/sample", "svs/sample/HG002_Revio_A", 
"svs/sample/HG002_Revio_A/HG002_Revio_A_svs.vcf.gz", @@ -790,6 +802,10 @@ "FAM_snv_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=2, variantCount=104, phased=false, phasedAutodetect=false]" ], + [ + "FAM_snvs_annotated_ranked_filtered.vcf.gz", + "VcfFile [chromosomes=[chr16], sampleCount=2, variantCount=55, phased=false, phasedAutodetect=false]" + ], [ "HG002_Revio_A_snv_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=100, phased=false, phasedAutodetect=false]" @@ -802,6 +818,10 @@ "FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=2, variantCount=87, phased=false, phasedAutodetect=false]" ], + [ + "FAM_svs_cnvs_merged_annotated_ranked_filtered.vcf.gz", + "VcfFile [chromosomes=[chr16], sampleCount=2, variantCount=5, phased=false, phasedAutodetect=false]" + ], [ "HG002_Revio_A_svs.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=55, phased=false, phasedAutodetect=false]" @@ -854,6 +874,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-11-12T16:48:50.801407496" + "timestamp": "2024-11-18T08:55:12.484845801" } } \ No newline at end of file diff --git a/tests/samplesheet_multisample_ont_bam.nf.test b/tests/samplesheet_multisample_ont_bam.nf.test index 635a31e3..826bcd65 100644 --- a/tests/samplesheet_multisample_ont_bam.nf.test +++ b/tests/samplesheet_multisample_ont_bam.nf.test @@ -9,7 +9,7 @@ nextflow_pipeline { when { params { - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/22fb5b8a1a358df96e49f8d01a9c6e18770fbd6d/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/genomic-medicine-sweden/test-datasets/ba720cd29322036d966ab3e4bc4c3d03e1731af5/' input = params.pipelines_testdata_base_path + 'testdata/samplesheet_multisample_bam_ont.csv' outdir = "$outputDir" preset = 'ONT_R10' diff --git 
a/tests/samplesheet_multisample_ont_bam.nf.test.snap b/tests/samplesheet_multisample_ont_bam.nf.test.snap index f92f434f..85446506 100644 --- a/tests/samplesheet_multisample_ont_bam.nf.test.snap +++ b/tests/samplesheet_multisample_ont_bam.nf.test.snap @@ -1,7 +1,7 @@ { "samplesheet_multisample_ont_bam | --preset ONT_R10 --phaser whatshap --alignment_processes 1 --snv_calling_processes 1": { "content": [ - 103, + 107, { "ADD_FOUND_IN_TAG": { "bcftools": 1.2, @@ -45,6 +45,9 @@ "BCFTOOLS_STATS": { "bcftools": 1.2 }, + "BCFTOOLS_VIEW": { + "bcftools": 1.2 + }, "BEDTOOLS_MERGE": { "bedtools": "2.31.1" }, @@ -69,6 +72,9 @@ "ECHTVAR_ANNO": { "echtvar": "0.2.0" }, + "ENSEMBLVEP_FILTERVEP": { + "ensemblvep": 113.0 + }, "ENSEMBLVEP_SNV": { "ensemblvep": 110.0 }, @@ -441,6 +447,10 @@ "snvs/family/FAM", "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz", "snvs/family/FAM/FAM_snv_annotated_ranked.vcf.gz.tbi", + "snvs/multi_sample", + "snvs/multi_sample/FAM", + "snvs/multi_sample/FAM/FAM_snvs_annotated_ranked_filtered.vcf.gz", + "snvs/multi_sample/FAM/FAM_snvs_annotated_ranked_filtered.vcf.gz.tbi", "snvs/sample", "snvs/sample/HG002_ONT_A", "snvs/sample/HG002_ONT_A/HG002_ONT_A_snv_annotated_ranked.vcf.gz", @@ -457,6 +467,8 @@ "svs/family/FAM", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked.vcf.gz.tbi", + "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked_filtered.vcf.gz", + "svs/family/FAM/FAM_svs_cnvs_merged_annotated_ranked_filtered.vcf.gz.tbi", "svs/sample", "svs/sample/HG002_ONT_A", "svs/sample/HG002_ONT_A/HG002_ONT_A_svs.vcf.gz", @@ -586,6 +598,10 @@ "FAM_snv_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=2, variantCount=105, phased=false, phasedAutodetect=false]" ], + [ + "FAM_snvs_annotated_ranked_filtered.vcf.gz", + "VcfFile [chromosomes=[chr16], sampleCount=2, variantCount=56, phased=false, phasedAutodetect=false]" + ], [ "HG002_ONT_A_snv_annotated_ranked.vcf.gz", "VcfFile 
[chromosomes=[chrX, chr16], sampleCount=1, variantCount=99, phased=false, phasedAutodetect=false]" @@ -598,6 +614,10 @@ "FAM_svs_cnvs_merged_annotated_ranked.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=2, variantCount=98, phased=false, phasedAutodetect=false]" ], + [ + "FAM_svs_cnvs_merged_annotated_ranked_filtered.vcf.gz", + "VcfFile [chromosomes=[chr16], sampleCount=2, variantCount=6, phased=false, phasedAutodetect=false]" + ], [ "HG002_ONT_A_svs.vcf.gz", "VcfFile [chromosomes=[chrX, chr16], sampleCount=1, variantCount=68, phased=false, phasedAutodetect=false]" @@ -623,6 +643,6 @@ "nf-test": "0.9.0", "nextflow": "24.04.4" }, - "timestamp": "2024-11-12T16:50:40.322312965" + "timestamp": "2024-11-18T08:57:05.873919937" } } \ No newline at end of file diff --git a/workflows/nallo.nf b/workflows/nallo.nf index 2d675915..cc355876 100644 --- a/workflows/nallo.nf +++ b/workflows/nallo.nf @@ -21,6 +21,8 @@ include { CALL_CNVS } from '../subworkflows/local/ include { CALL_PARALOGS } from '../subworkflows/local/call_paralogs' include { CALL_REPEAT_EXPANSIONS } from '../subworkflows/local/call_repeat_expansions' include { CALL_SVS } from '../subworkflows/local/call_svs' +include { FILTER_VARIANTS as FILTER_VARIANTS_SNVS } from '../subworkflows/local/filter_variants' +include { FILTER_VARIANTS as FILTER_VARIANTS_SVS } from '../subworkflows/local/filter_variants' include { METHYLATION } from '../subworkflows/local/methylation' include { PHASING } from '../subworkflows/local/phasing' include { PREPARE_GENOME } from '../subworkflows/local/prepare_genome' @@ -99,9 +101,10 @@ workflow NALLO { // Channels from (optional) input samplesheets validated by schema ch_databases = createReferenceChannelFromSamplesheet(params.echtvar_snv_databases, 'assets/schema_snp_db.json') ch_vep_plugin_files = createReferenceChannelFromSamplesheet(params.vep_plugin_files, 'assets/schema_vep_plugin_files.json', Channel.value([])) - - // Check parameter that doesn't conform to schema 
validation here - if (params.phaser.matches('hiphase') && params.preset == 'ONT_R10') { error "The HiPhase license only permits analysis of data from PacBio. For details see: https://github.com/PacificBiosciences/HiPhase/blob/main/LICENSE.md" } + ch_hgnc_ids = createReferenceChannelFromSamplesheet(params.filter_variants_hgnc_ids, 'assets/schema_hgnc_ids.json') + .map { it[0].toString() } // only one element per row + .collectFile(name: 'hgnc_ids.txt', newLine: true, sort: true) + .map { file -> [ [ id: 'hgnc_ids' ], file ] } // // Convert FASTQ to BAM (and vice versa if assembly workflow is active) @@ -395,7 +398,9 @@ workflow NALLO { .set { ch_bcftools_concat_in } // Concat into family VCFs per family with all regions - BCFTOOLS_CONCAT ( ch_bcftools_concat_in ) + BCFTOOLS_CONCAT ( + ch_bcftools_concat_in + ) ch_versions = ch_versions.mix(BCFTOOLS_CONCAT.out.versions) // Sort and publish @@ -414,6 +419,21 @@ workflow NALLO { BCFTOOLS_STATS ( ch_bcftools_stats_snv_in, [[],[]], [[],[]], [[],[]], [[],[]], [[],[]] ) ch_versions = ch_versions.mix(BCFTOOLS_STATS.out.versions) ch_multiqc_files = ch_multiqc_files.mix(BCFTOOLS_STATS.out.stats.collect{it[1]}.ifEmpty([])) + + } + // + // Filter SNVs + // + if(params.filter_variants_hgnc_ids || params.filter_snvs_expression != '') { + + // Publish filtered `project` SNVs from here + FILTER_VARIANTS_SNVS ( + BCFTOOLS_SORT.out.vcf, + ch_hgnc_ids, + params.filter_variants_hgnc_ids + ) + ch_versions = ch_versions.mix(FILTER_VARIANTS_SNVS.out.versions) + } // @@ -520,6 +540,25 @@ workflow NALLO { ch_versions = ch_versions.mix(RANK_VARIANTS_SVS.out.versions) } + // + // Filter SVs + // + if(params.filter_variants_hgnc_ids || params.filter_svs_expression != '') { + + if(params.skip_cnv_calling) { + ch_filter_svs_in = params.skip_sv_annotation ? CALL_SVS.out.family_vcf : params.skip_rank_variants ? ANN_CSQ_PLI_SVS.out.vcf : RANK_VARIANTS_SVS.out.vcf + } else { + ch_filter_svs_in = params.skip_sv_annotation ?
annotate_svs_in : params.skip_rank_variants ? ANN_CSQ_PLI_SVS.out.vcf : RANK_VARIANTS_SVS.out.vcf + } + + FILTER_VARIANTS_SVS ( + ch_filter_svs_in, + ch_hgnc_ids, + params.filter_variants_hgnc_ids + ) + ch_versions = ch_versions.mix(FILTER_VARIANTS_SVS.out.versions) + } + // // Phase SNVs and INDELs //