From 67c0c5b865e61d052c4cad603065c40f9ee77ef1 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Thu, 15 Aug 2024 15:50:50 +0200 Subject: [PATCH 01/11] refactor: add decompose and normalizing for t vcfs --- config/config.yaml | 1 + config/output_files.json | 16 +++--- workflow/Snakefile | 67 +++++++++++++++++++++----- workflow/rules/common.smk | 2 +- workflow/rules/export.smk | 6 +-- workflow/rules/fix_af.smk | 8 +-- workflow/rules/html_output.smk | 2 +- workflow/rules/mutectcaller_to_tsv.smk | 2 +- 8 files changed, 73 insertions(+), 31 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 2dde122..92b5f6a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -16,6 +16,7 @@ modules: prealignment: "v1.1.0" qc: "v0.3.0" reports: "7c8b8c5" + snv_indels: "v0.6.0" misc: "v0.1.0" sentieon: "b002d39" diff --git a/config/output_files.json b/config/output_files.json index e5d9ab9..e26deec 100644 --- a/config/output_files.json +++ b/config/output_files.json @@ -16,14 +16,14 @@ "Results/{sample}/Cram/{sample}_{type}.crumble.cram": {"name": "_results_cram", "file": "compression/crumble/{sample}_{type}.crumble.cram", "types": ["T", "N"]}, "Results/{sample}/Cram/{sample}_{type}.crumble.cram.crai": {"name": "_results_crai", "file": "compression/crumble/{sample}_{type}.crumble.cram.crai", "types": ["T", "N"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz": {"name": "_results_vcf_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi": {"name": "_results_tbi_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.all.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.all.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.aml.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.tm.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.tm.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz": {"name": "_results_vcf_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi": {"name": "_results_tbi_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.all.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.all.vcf.gz.tbi", "types": 
["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.aml.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.xlsx": {"name": "_results_xlsx_t", "file": "export_to_xlsx/t/{sample}_T.snvs.xlsx", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz", "types": ["TN"]}, diff --git a/workflow/Snakefile b/workflow/Snakefile index cc56c70..f3e2a21 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -115,21 +115,21 @@ use rule vep from annotation as annotation_vep with: use rule vep from annotation as annotation_vep_t with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz.tbi", cache=config["vep"]["vep_cache"], fasta=config["reference"]["fasta"], output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf"), log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.log", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.benchmark.tsv", config.get("vep", {}).get("benchmark_repeats", 1), ) message: - "{rule}: Annotate {wildcards.sample}_T.vcf" + "{rule}: Annotate {input.vcf} with vep." 
use rule simple_sv_annotation from annotation as annotation_simple_sv_annotation_tn with: @@ -324,18 +324,18 @@ use rule filter_vcf from filtering as filtering_filter_vcf use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_t with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: @@ -556,9 +556,9 @@ if aligner == "bwa_sentieon": use rule peddy from qc as qc_peddy with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz", ped="qc/peddy/{sample}.peddy.fam", - tbi="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz.tbi", + tbi="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz.tbi", output: ped=temp("qc/peddy/{sample}/peddy.peddy.ped"), ped_check=temp("qc/peddy/{sample}/peddy.ped_check.csv"), @@ -712,6 +712,47 @@ use rule cnv_html_report from reports as reports_cnv_html_report with: tc_method="pathology", +module snv_indels: + snakefile: + github( + "hydra-genetics/snv_indels", + path="workflow/Snakefile", + tag=config["modules"]["snv_indels"], + ) + config: + config + + +use rule vt_decompose from snv_indels as snv_indels_vt_decompose with: + input: + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz", + tbi="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz.tbi", + output: + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz"), + log: + "parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz.log", + benchmark: + repeat( + "parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz.benchmark.tsv", + config.get("vt_decompose", {}).get("benchmark_repeats", 1), + ) + + +use rule vt_normalize from snv_indels as snv_indels_vt_normalize with: + input: + vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.decomposed.vcf.gz", + ref=config["reference"]["fasta"], + output: + vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz", + log: + "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz.log", + benchmark: + repeat( + "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz.benchmark.tsv", + config.get("vt_normalize", {}).get("benchmark_repeats", 1), + ) + + module misc: snakefile: github("hydra-genetics/misc", path="workflow/Snakefile", tag=config["modules"]["misc"]) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e006eaf..3fc5f76 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -110,7 +110,7 @@ def get_vcf_input(wildcards): if aligner is None: 
sys.exit("aligner missing from config, valid options: bwa_gpu or bwa_sentieon") elif aligner == "bwa_gpu": - vcf_input = "parabricks/pbrun_mutectcaller_t/{}_{}.vep.filter.germline.vcf".format(wildcards.sample, wildcards.type) + vcf_input = "parabricks/pbrun_mutectcaller_t/{}_{}.normalized.vep.filter.germline.vcf".format(wildcards.sample, wildcards.type) elif aligner == "bwa_sentieon": vcf_input = "sentieon/tnscope/{}_TNscope_tn_ML.vcf".format(wildcards.sample) else: diff --git a/workflow/rules/export.smk b/workflow/rules/export.smk index 7ee9256..6581282 100644 --- a/workflow/rules/export.smk +++ b/workflow/rules/export.smk @@ -6,11 +6,11 @@ __license__ = "GPL-3" rule export_to_xlsx: input: - all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vep.include.all.vcf.gz", + all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.all.vcf.gz", all_bed=config["bcftools_SNV"]["all"], - aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vep.include.aml.vcf.gz", + aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.aml.vcf.gz", aml_bed=config["bcftools_SNV"]["aml"], - tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vep.include.tm.vcf.gz", + tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.tm.vcf.gz", tm_bed=config["bcftools_SNV"]["tm"], output: xlsx=temp("export_to_xlsx/{analysis}/{sample_type}.snvs.xlsx"), diff --git a/workflow/rules/fix_af.smk b/workflow/rules/fix_af.smk index 5d84ca1..e31cca5 100644 --- a/workflow/rules/fix_af.smk +++ b/workflow/rules/fix_af.smk @@ -1,13 +1,13 @@ rule fix_af: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.vcf", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.vcf", output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.fix_af.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.fix_af.vcf"), log: - "parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.fix_af.log", + "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.fix_af.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.fix_af.benchmark.tsv", + "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.fix_af.benchmark.tsv", config.get("fix_af", {}).get("benchmark_repeats", 1), ) threads: config.get("fix_af", {}).get("threads", config["default_resources"]["threads"]) diff --git a/workflow/rules/html_output.smk b/workflow/rules/html_output.smk index 9e5162c..1fda261 100644 --- a/workflow/rules/html_output.smk +++ b/workflow/rules/html_output.smk @@ -9,7 +9,7 @@ rule merge_cnv_json_chr: json=get_json_for_merge_cnv_json_chr, fai=config.get("reference", {}).get("fai", ""), annotation_bed=list(config.get("annotate_cnv", {}).values()), - germline_vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.fix_af.vcf", + germline_vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.fix_af.vcf", cnv_vcfs=get_unfiltered_cnv_vcfs_for_merge_json, filtered_cnv_vcfs=get_filtered_cnv_vcfs_for_merge_json, filtered_cnv_vcfs_tbi=get_filtered_cnv_vcfs_tbi_for_merge_json, diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index 547d3de..04b4bc0 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -36,7 +36,7 @@ rule 
mutectcaller_tn_to_tsv: rule mutectcaller_t_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}_T.vep.include.{bed}.vcf", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}_T.normalized.vep.include.{bed}.vcf", output: tsv=temp("tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv"), params: From 03159b772cf9c0f48d86b60a40fda3303f15fc2f Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Mon, 19 Aug 2024 11:33:09 +0200 Subject: [PATCH 02/11] refactor: add decompose and normalizing to tn-vcfs --- config/output_files.json | 16 +++++------ workflow/Snakefile | 40 +++++++++++++------------- workflow/rules/mutectcaller_to_tsv.smk | 2 +- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/config/output_files.json b/config/output_files.json index e26deec..3122e7c 100644 --- a/config/output_files.json +++ b/config/output_files.json @@ -26,14 +26,14 @@ "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.xlsx": {"name": "_results_xlsx_t", "file": "export_to_xlsx/t/{sample}_T.snvs.xlsx", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.all.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.all.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.aml.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.aml.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.tm.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.tm.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.all.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.all.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": 
"parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.aml.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.aml.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz.tbi", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_TN.xlsx": {"name": "_results_xlsx_tn", "file": "export_to_xlsx/tn/{sample}.snvs.xlsx", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_mutectcaller_t.all.tsv", "types": ["T"]}, diff --git a/workflow/Snakefile b/workflow/Snakefile index f3e2a21..80df038 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -96,21 +96,21 @@ use rule * from annotation as annotation_* use rule vep from annotation as annotation_vep with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vcf.gz.tbi", cache=config["vep"]["vep_cache"], fasta=config["reference"]["fasta"], output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf"), log: - "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.log", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.benchmark.tsv", config.get("vep", {}).get("benchmark_repeats", 1), ) message: - "{rule}: Annotate {wildcards.sample}.vcf" + "{rule}: Annotate {input.vcf} with vep" use rule vep from annotation as annotation_vep_t with: @@ -344,18 +344,18 @@ use rule bcftools_filter_include_region from filtering as filtering_bcftools_fil use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_tn with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: @@ -725,30 +725,30 @@ module snv_indels: use rule vt_decompose 
from snv_indels as snv_indels_vt_decompose with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz", - tbi="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vcf.gz", + tbi="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz"), + vcf=temp("parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.decomposed.vcf.gz"), log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz.log", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.decomposed.vcf.gz.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz.benchmark.tsv", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.decomposed.vcf.gz.benchmark.tsv", config.get("vt_decompose", {}).get("benchmark_repeats", 1), ) use rule vt_normalize from snv_indels as snv_indels_vt_normalize with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.decomposed.vcf.gz", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.decomposed.vcf.gz", ref=config["reference"]["fasta"], output: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz", log: - "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz.log", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz.benchmark.tsv", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz.benchmark.tsv", config.get("vt_normalize", {}).get("benchmark_repeats", 1), ) diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index 04b4bc0..d866c9a 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -6,7 +6,7 @@ __license__ = "GPL-3" rule mutectcaller_tn_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}.vep.include.{bed}.vcf", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}.normalized.vep.include.{bed}.vcf", output: tsv=temp("tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv"), params: From 5918aae31358bc997d38904ee014a09ba1849ff2 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Tue, 20 Aug 2024 10:54:01 +0200 Subject: [PATCH 03/11] refactor: update mutect to tsv script to include normal af and dp for tn samples --- config/config.yaml | 7 ++ config/output_files.json | 4 +- workflow/rules/mutectcaller_to_tsv.smk | 41 +------ workflow/scripts/mutectcaller_to_tsv.py | 146 ++++++++++++++++++++---- 4 files changed, 139 insertions(+), 59 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 92b5f6a..7048b56 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -338,3 +338,10 @@ vep: container: "docker://hydragenetics/vep:105" vep_cache: "" extra: "--assembly GRCh38 --check_existing --pick --sift b --polyphen b --ccds --uniprot --hgvs --symbol --numbers --domains --regulatory --canonical --protein --biotype --uniprot --tsl --appris --gene_phenotype --af --af_1kg --af_gnomad --max_af --pubmed --variant_class" + +vt_decompose: + container: "docker://hydragenetics/vt:2015.11.10" + +vt_normalize: + container: "docker://hydragenetics/vt:2015.11.10" + \ No newline at end of file diff --git a/config/output_files.json b/config/output_files.json index 3122e7c..851a054 100644 --- 
a/config/output_files.json +++ b/config/output_files.json @@ -36,8 +36,8 @@ "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz.tbi", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_TN.xlsx": {"name": "_results_xlsx_tn", "file": "export_to_xlsx/tn/{sample}.snvs.xlsx", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_mutectcaller_t.all.tsv", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.aml.tsv": {"name": "_results_tsv_aml_t", "file": "tsv_files/{sample}_mutectcaller_t.aml.tsv", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_T_mutectcaller_t.all.tsv", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.aml.tsv": {"name": "_results_tsv_aml_t", "file": "tsv_files/{sample}_T_mutectcaller_t.aml.tsv", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_TN.all.tsv": {"name": "_results_tsv_all", "file": "tsv_files/{sample}_mutectcaller_tn.all.tsv", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_TN.aml.tsv": {"name": "_results_tsv_aml", "file": "tsv_files/{sample}_mutectcaller_tn.aml.tsv", "types": ["TN"]}, diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index d866c9a..61fcaca 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -4,49 +4,18 @@ __email__ = "martin.rippin@scilifelab.uu.se" __license__ = "GPL-3" -rule mutectcaller_tn_to_tsv: +rule mutectcaller_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}.normalized.vep.include.{bed}.vcf", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.{bed}.vcf", output: - tsv=temp("tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv"), + tsv=temp("tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv"), params: analysis="{analysis}", - sample="{sample}", log: - "tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv.log", + "tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv.log", benchmark: repeat( - "tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv.benchmark.tsv", - config.get("mutectcaller_to_tsv", {}).get("benchmark_repeats", 1), - ) - threads: config.get("mutectcaller_to_tsv", {}).get("threads", config["default_resources"]["threads"]) - resources: - mem_mb=config.get("mutectcaller_to_tsv", {}).get("mem_mb", config["default_resources"]["mem_mb"]), - mem_per_cpu=config.get("mutectcaller_to_tsv", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), - partition=config.get("mutectcaller_to_tsv", {}).get("partition", config["default_resources"]["partition"]), - threads=config.get("mutectcaller_to_tsv", {}).get("threads", config["default_resources"]["threads"]), - time=config.get("mutectcaller_to_tsv", {}).get("time", config["default_resources"]["time"]), - container: - config.get("mutectcaller_to_tsv", {}).get("container", config["default_container"]) - message: - "{rule}: select relevant info from {input.vcf} and compile into tsv" - script: - "../scripts/mutectcaller_to_tsv.py" - - -rule mutectcaller_t_to_tsv: - input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}_T.normalized.vep.include.{bed}.vcf", - output: - tsv=temp("tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv"), - 
params: - analysis="{analysis}", - sample="{sample}", - log: - "tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv.log", - benchmark: - repeat( - "tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv.benchmark.tsv", + "tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv.benchmark.tsv", config.get("mutectcaller_to_tsv", {}).get("benchmark_repeats", 1), ) threads: config.get("mutectcaller_to_tsv", {}).get("threads", config["default_resources"]["threads"]) diff --git a/workflow/scripts/mutectcaller_to_tsv.py b/workflow/scripts/mutectcaller_to_tsv.py index a48538c..d7129c8 100755 --- a/workflow/scripts/mutectcaller_to_tsv.py +++ b/workflow/scripts/mutectcaller_to_tsv.py @@ -3,30 +3,134 @@ import csv import sys -import vcf +from pysam import VariantFile +import logging -input_file = snakemake.input.vcf +logging.basicConfig( + format="{asctime} - {levelname} - {message}", + style="{", + datefmt="%Y-%m-%d %H:%M", + level=logging.INFO, +) + +input_file = VariantFile(snakemake.input.vcf) analysis = snakemake.params.analysis -sample = snakemake.params.sample output_file = snakemake.output.tsv -vcf = vcf.Reader(open(input_file, "r")) +csq_index = [] +for x in input_file.header.records: + if "CSQ" in str(x): + csq_index = str(x).split("Format: ")[1].strip().strip('">').split("|") + +outlines = [] +if len(list(input_file.header.samples)) == 1: + logging.debug("One sample identified. " + f"{list(input_file.header.samples)=}") + outlines.append( + [ + "#SAMPLE", + "CHROMOSOME", + "POSITION", + "REFERENCE", + "ALTERNATIVE", + "ALLELEFREQUENCY", + "DEPTH", + "GENE", + "TRANSCRIPT", + "NUCLEOTIDESEQ", + "PROTEIN", + "PROTEINSEQ", + "CONSEQUENCE", + ] + ) + +elif len(list(input_file.header.samples)) == 2: + logging.debug("Two sample identified. " + f"{list(input_file.header.samples)=}") + outlines.append( + [ + "#SAMPLE", + "CHROMOSOME", + "POSITION", + "REFERENCE", + "ALTERNATIVE", + "ALLELEFREQUENCY", + "NORMAL_AF", + "DEPTH", + "NORMAL_DP", + "GENE", + "TRANSCRIPT", + "NUCLEOTIDESEQ", + "PROTEIN", + "PROTEINSEQ", + "CONSEQUENCE", + ] + ) +else: + logging.error(snakemake.input.vcf + " contains more than two samples in header.") + sys.exit() + + +logging.info("Starting to iterate through calls in " + snakemake.input.vcf) +for row in input_file.fetch(): + csq = row.info["CSQ"][0].split("|") + gene = csq[csq_index.index("SYMBOL")] + + transcript_name = csq[csq_index.index("HGVSc")].split(":")[0] + transcript_change = "" + if len(csq[csq_index.index("HGVSc")].split(":")) > 1: + transcript_change = csq[csq_index.index("HGVSc")].split(":")[1] + + protein_name = csq[csq_index.index("HGVSp")].split(":")[0] + protein_change = "" + if len(csq[csq_index.index("HGVSp")].split(":")) > 1: + protein_change = csq[csq_index.index("HGVSp")].split(":")[1] + + consequence = csq[csq_index.index("Consequence")] + + tumor_sample = [x for x in row.samples if x.endswith("_T")][0] + af = row.samples[tumor_sample]["AF"][0] + dp = row.samples[tumor_sample]["DP"][0] + outline = [ + tumor_sample.split("_")[0], + row.chrom, + row.pos, + row.ref, + row.alts[0], + af, + dp, + gene, + transcript_name, + transcript_change, + protein_name, + protein_change, + consequence, + ] + + if len(list(input_file.header.samples)) == 2: + normal_sample = [x for x in row.samples if x.endswith("_N")][0] + af_normal = row.samples[normal_sample]["AF"][0] + dp_normal = row.samples[normal_sample]["DP"][0] + outline = [ + tumor_sample.split("_")[0], + row.chrom, + row.pos, + row.ref, + row.alts[0], + af, + af_normal, + dp, + dp_normal, + gene, + 
transcript_name, + transcript_change, + protein_name, + protein_change, + consequence, + ] + + outlines.append(outline) + +logging.info("Writing results to " + snakmake.output.tsv) with open(output_file, "wt") as tsv: - tsv_writer = csv.writer(tsv, delimiter='\t') - tsv_writer.writerow(["#SAMPLE", "CHROMOSOME", "POSITION", "REFERENCE", "ALTERNATIVE", "ALLELEFREQUENCY", "DEPTH", "GENE", - "TRANSCRIPT", "NUCLEOTIDESEQ", "PROTEIN", "PROTEINSEQ", "CONSEQUENCE"]) - for row in vcf: - genes = row.INFO["CSQ"][0].split("|") - transcript_field = genes[10].split(":") - if len(transcript_field) == 1: - transcript_field = ["", ""] - protein_field = genes[11].split(":") - if len(protein_field) == 1: - protein_field = [genes[24], ""] - i = 0 - if analysis == "tn": - i = 1 - af = row.samples[i]["AF"] - tsv_writer.writerow([sample, row.CHROM, row.POS, row.REF, row.ALT, af, row.INFO["DP"], genes[3], transcript_field[0], - transcript_field[1], protein_field[0], protein_field[1], genes[1]]) + tsv_writer = csv.writer(tsv, delimiter="\t") + tsv_writer.writerows(outlines) From cdf9c63ac5d4e6fe0f52d49eac4c09db72f54a41 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Tue, 20 Aug 2024 14:11:41 +0200 Subject: [PATCH 04/11] feat: add preliminary soft filters --- config/config.yaml | 2 +- config/config_snv_softfilter.yaml | 31 ++++++++++++++++++++++++ config/output_files.json | 32 ++++++++++++------------- workflow/Snakefile | 20 ++++++++-------- workflow/rules/export.smk | 6 ++--- workflow/rules/mutectcaller_to_tsv.smk | 2 +- workflow/scripts/export_to_xlsx.py | 22 ++++++++++------- workflow/scripts/mutectcaller_to_tsv.py | 5 ++++ 8 files changed, 80 insertions(+), 40 deletions(-) create mode 100644 config/config_snv_softfilter.yaml diff --git a/config/config.yaml b/config/config.yaml index 7048b56..b0ec8fe 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -120,6 +120,7 @@ fastqc: filter_vcf: germline: "config/config_germline_filter.yaml" cnv_hardfilter: "config/config_cnv_hardfilter.yaml" + somatic: "config/config_snv_softfilter.yaml" fusioncatcher: container: "docker://hydragenetics/fusioncatcher:1.33" @@ -344,4 +345,3 @@ vt_decompose: vt_normalize: container: "docker://hydragenetics/vt:2015.11.10" - \ No newline at end of file diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml new file mode 100644 index 0000000..23747a2 --- /dev/null +++ b/config/config_snv_softfilter.yaml @@ -0,0 +1,31 @@ +filters: + af: + description: "Softfilter variants where T have AF lower than 5 percent" + expression: "INFO:AF < 0.05" + soft_filter: "True" + soft_filter_flag: "AF_lt_0.05" + ad: + description: "Softfilter variants where T have AD lower than 3?" + expression: "INFO:AD:1 < 3" + soft_filter: "True" + soft_filter_flag: "AD_lt_3" + dp: + description: "Softfilter variants where dp is lower than 20x?" 
+ expression: "INFO:DP < 20" + soft_filter: "True" + soft_filter_flag: "DP_lt_20" + germline: + description: "Soft filter germline if >2% in any population from 1000 genomes, ESP or gnomADe" + expression: "(VEP:MAX_AF > 0.02)" + soft_filter_flag: "PopAF_0.02" + soft_filter: "True" + intron: + description: "Soft filter intronic variants except if also splice" + expression: "(exist[intron_variant, VEP:Consequence] and !exist[splice, VEP:Consequence])" + soft_filter_flag: "Intron" + soft_filter: "True" + protein_coding: + description: "Soft filter variants not deemed protein_coding" + expression: (VEP:BIOTYPE != protein_coding) + soft_filter_flag: "Biotype" + soft_filter: "True" \ No newline at end of file diff --git a/config/output_files.json b/config/output_files.json index 851a054..3796349 100644 --- a/config/output_files.json +++ b/config/output_files.json @@ -16,24 +16,24 @@ "Results/{sample}/Cram/{sample}_{type}.crumble.cram": {"name": "_results_cram", "file": "compression/crumble/{sample}_{type}.crumble.cram", "types": ["T", "N"]}, "Results/{sample}/Cram/{sample}_{type}.crumble.cram.crai": {"name": "_results_crai", "file": "compression/crumble/{sample}_{type}.crumble.cram.crai", "types": ["T", "N"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz": {"name": "_results_vcf_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi": {"name": "_results_tbi_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.all.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.all.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.aml.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz": {"name": "_results_vcf_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi": {"name": "_results_tbi_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.all.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": 
"parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.all.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.aml.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.filter.somatic.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.tm.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.tm.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.xlsx": {"name": "_results_xlsx_t", "file": "export_to_xlsx/t/{sample}_T.snvs.xlsx", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.all.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.all.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.aml.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.aml.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.all.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.all.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": 
"_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.aml.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.aml.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.tm.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.tm.vcf.gz.tbi", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_TN.xlsx": {"name": "_results_xlsx_tn", "file": "export_to_xlsx/tn/{sample}.snvs.xlsx", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_T_mutectcaller_t.all.tsv", "types": ["T"]}, diff --git a/workflow/Snakefile b/workflow/Snakefile index 80df038..673e759 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -324,18 +324,18 @@ use rule filter_vcf from filtering as filtering_filter_vcf use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_t with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: @@ -344,18 +344,18 @@ use rule bcftools_filter_include_region from filtering as filtering_bcftools_fil use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_tn with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - 
"parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: diff --git a/workflow/rules/export.smk b/workflow/rules/export.smk index 6581282..48dcc7f 100644 --- a/workflow/rules/export.smk +++ b/workflow/rules/export.smk @@ -6,11 +6,11 @@ __license__ = "GPL-3" rule export_to_xlsx: input: - all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.all.vcf.gz", + all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.all.vcf.gz", all_bed=config["bcftools_SNV"]["all"], - aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.aml.vcf.gz", + aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.aml.vcf.gz", aml_bed=config["bcftools_SNV"]["aml"], - tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.tm.vcf.gz", + tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.tm.vcf.gz", tm_bed=config["bcftools_SNV"]["tm"], output: xlsx=temp("export_to_xlsx/{analysis}/{sample_type}.snvs.xlsx"), diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index 61fcaca..cc7ba82 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -6,7 +6,7 @@ __license__ = "GPL-3" rule mutectcaller_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.{bed}.vcf", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.{bed}.vcf", output: tsv=temp("tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv"), params: diff --git a/workflow/scripts/export_to_xlsx.py b/workflow/scripts/export_to_xlsx.py index b501184..a985129 100644 --- a/workflow/scripts/export_to_xlsx.py +++ b/workflow/scripts/export_to_xlsx.py @@ -13,8 +13,6 @@ def index_vep(variantfile): return csq_index -# No filterings as of now 231027 on any vcf. Should we add? -# are the files decomposed? 
""" Prepping input data """ bedfiles = {} bedfiles["all"] = snakemake.input.all_bed @@ -31,14 +29,15 @@ def index_vep(variantfile): vcf_tables[subsection] = [] vcf = VariantFile(vcfs[subsection]) csq_index = index_vep(vcf) + sample_tumor = [x for x in list(vcf.header.samples) if x.endswith("_T")][0] if len(list(vcf.header.samples)) > 1: - sample_normal = list(vcf.header.samples)[0] - sample_tumor = list(vcf.header.samples)[1] + sample_normal = [x for x in list(vcf.header.samples) if x.endswith("_N")][0] else: - sample_tumor = list(vcf.header.samples)[0] af_normal = "" + dp_normal = "" for record in vcf.fetch(): + filterflag = ",".join(record.filter.keys()) af = float(record.samples[sample_tumor].get("AF")[0]) if af > 0.01: csq = record.info["CSQ"][0].split("|") @@ -72,8 +71,10 @@ def index_vep(variantfile): if len(list(vcf.header.samples)) > 1: af_normal = float(record.samples[sample_normal].get("AF")[0]) + dp_normal = int(record.samples[sample_normal].get("DP")) outline = [ + filterflag, sample_tumor, gene, record.contig, @@ -83,6 +84,7 @@ def index_vep(variantfile): af, af_normal, int(record.info["DP"]), + int(dp_normal), transcript, exon, coding_name, @@ -138,13 +140,14 @@ def index_vep(variantfile): for sheet in subsections: vcf_data = vcf_tables[sheet] worksheet = workbook.add_worksheet(sheet.upper()) - worksheet.set_column("A:A", 12) - worksheet.set_column("D:D", 10) - worksheet.set_column("J:J", 15) + worksheet.set_column("B:B", 12) + worksheet.set_column("E:E", 10) + worksheet.set_column("L:L", 15) worksheet.write("A1", "Variants in " + sheet.upper() + " regions", heading_format) worksheet.write("A3", "Sample: " + str(sample_tumor)) tableheading = [ + "FilterFlag", "DNAnr", "Gene", "Chr", @@ -154,6 +157,7 @@ def index_vep(variantfile): "AF", "Normal AF", "DP", + "Normal DP", "Transcript", "Exon", "Mutation cds", @@ -168,7 +172,7 @@ def index_vep(variantfile): heading_list = [{"header": a} for a in tableheading] worksheet.add_table( - "A5:S" + str(len(vcf_data) + 6), {"data": vcf_data, "columns": heading_list, "style": "Table Style Light 1"} + "A5:U" + str(len(vcf_data) + 6), {"data": vcf_data, "columns": heading_list, "style": "Table Style Light 1"} ) # Add bedfile sheets for sheet in subsections: diff --git a/workflow/scripts/mutectcaller_to_tsv.py b/workflow/scripts/mutectcaller_to_tsv.py index d7129c8..bd5a3ae 100755 --- a/workflow/scripts/mutectcaller_to_tsv.py +++ b/workflow/scripts/mutectcaller_to_tsv.py @@ -36,6 +36,7 @@ "ALLELEFREQUENCY", "DEPTH", "GENE", + "FILTERFLAG", "TRANSCRIPT", "NUCLEOTIDESEQ", "PROTEIN", @@ -58,6 +59,7 @@ "DEPTH", "NORMAL_DP", "GENE", + "FILTERFLAG", "TRANSCRIPT", "NUCLEOTIDESEQ", "PROTEIN", @@ -90,6 +92,7 @@ tumor_sample = [x for x in row.samples if x.endswith("_T")][0] af = row.samples[tumor_sample]["AF"][0] dp = row.samples[tumor_sample]["DP"][0] + flag = ",".join(row.filter.keys()) outline = [ tumor_sample.split("_")[0], row.chrom, @@ -99,6 +102,7 @@ af, dp, gene, + flag, transcript_name, transcript_change, protein_name, @@ -121,6 +125,7 @@ dp, dp_normal, gene, + flag, transcript_name, transcript_change, protein_name, From 807b1db0fb6062eb3da26fa5413e424b2c413044 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 21 Aug 2024 13:22:22 +0200 Subject: [PATCH 05/11] chore: small typos and fixes --- config/config.yaml | 1 + config/config_snv_softfilter.yaml | 10 +++++----- workflow/scripts/export_to_xlsx.py | 2 +- workflow/scripts/mutectcaller_to_tsv.py | 6 +++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git 
a/config/config.yaml b/config/config.yaml index b0ec8fe..67fe9dc 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -118,6 +118,7 @@ fastqc: container: "docker://hydragenetics/fastqc:0.11.9" filter_vcf: + sample_name_regex: "^([A-Za-z0-9-]+_[RT])$" germline: "config/config_germline_filter.yaml" cnv_hardfilter: "config/config_cnv_hardfilter.yaml" somatic: "config/config_snv_softfilter.yaml" diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index 23747a2..a9d6a6c 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -1,17 +1,17 @@ filters: af: description: "Softfilter variants where T have AF lower than 5 percent" - expression: "INFO:AF < 0.05" + expression: "FORMAT:AF:0 < 0.05" soft_filter: "True" soft_filter_flag: "AF_lt_0.05" ad: description: "Softfilter variants where T have AD lower than 3?" - expression: "INFO:AD:1 < 3" + expression: "FORMAT:AD:1 < 3" soft_filter: "True" soft_filter_flag: "AD_lt_3" dp: description: "Softfilter variants where dp is lower than 20x?" - expression: "INFO:DP < 20" + expression: "FORMAT:DP < 20" soft_filter: "True" soft_filter_flag: "DP_lt_20" germline: @@ -26,6 +26,6 @@ filters: soft_filter: "True" protein_coding: description: "Soft filter variants not deemed protein_coding" - expression: (VEP:BIOTYPE != protein_coding) + expression: "(VEP:BIOTYPE != protein_coding)" soft_filter_flag: "Biotype" - soft_filter: "True" \ No newline at end of file + soft_filter: "True" diff --git a/workflow/scripts/export_to_xlsx.py b/workflow/scripts/export_to_xlsx.py index a985129..5eaba76 100644 --- a/workflow/scripts/export_to_xlsx.py +++ b/workflow/scripts/export_to_xlsx.py @@ -84,7 +84,7 @@ def index_vep(variantfile): af, af_normal, int(record.info["DP"]), - int(dp_normal), + dp_normal, transcript, exon, coding_name, diff --git a/workflow/scripts/mutectcaller_to_tsv.py b/workflow/scripts/mutectcaller_to_tsv.py index bd5a3ae..7d51a24 100755 --- a/workflow/scripts/mutectcaller_to_tsv.py +++ b/workflow/scripts/mutectcaller_to_tsv.py @@ -91,7 +91,7 @@ tumor_sample = [x for x in row.samples if x.endswith("_T")][0] af = row.samples[tumor_sample]["AF"][0] - dp = row.samples[tumor_sample]["DP"][0] + dp = row.samples[tumor_sample]["DP"] flag = ",".join(row.filter.keys()) outline = [ tumor_sample.split("_")[0], @@ -113,7 +113,7 @@ if len(list(input_file.header.samples)) == 2: normal_sample = [x for x in row.samples if x.endswith("_N")][0] af_normal = row.samples[normal_sample]["AF"][0] - dp_normal = row.samples[normal_sample]["DP"][0] + dp_normal = row.samples[normal_sample]["DP"] outline = [ tumor_sample.split("_")[0], row.chrom, @@ -135,7 +135,7 @@ outlines.append(outline) -logging.info("Writing results to " + snakmake.output.tsv) +logging.info("Writing results to " + output_file) with open(output_file, "wt") as tsv: tsv_writer = csv.writer(tsv, delimiter="\t") tsv_writer.writerows(outlines) From e10ec276ab3dd9b7c0c34d1e1205344d3eb95b7d Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Thu, 5 Sep 2024 15:03:33 +0200 Subject: [PATCH 06/11] refactor: change snv dp filter to 10x --- config/config_snv_softfilter.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index a9d6a6c..a138877 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -10,10 +10,10 @@ filters: soft_filter: "True" soft_filter_flag: "AD_lt_3" dp: - description: "Softfilter variants where dp is lower than 
20x?" - expression: "FORMAT:DP < 20" + description: "Softfilter variants where dp is lower than 10x" + expression: "FORMAT:DP < 10" soft_filter: "True" - soft_filter_flag: "DP_lt_20" + soft_filter_flag: "DP_lt_10" germline: description: "Soft filter germline if >2% in any population from 1000 genomes, ESP or gnomADe" expression: "(VEP:MAX_AF > 0.02)" From 6ad931291c449465b5100e456a3541582ca66339 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Fri, 6 Sep 2024 11:53:10 +0200 Subject: [PATCH 07/11] refactor: add consequnce filter to snvs --- config/config_snv_softfilter.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index a138877..1949147 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -29,3 +29,18 @@ filters: expression: "(VEP:BIOTYPE != protein_coding)" soft_filter_flag: "Biotype" soft_filter: "True" + consequences: + description: "Soft filter based on consequences" + expression: + "(exist[intergenic_variant, VEP:Consequence] or exist[NMD_transcript_variant, VEP:Consequence] or + exist[non_coding_transcript_variant, VEP:Consequence] or + exist[upstream_gene_variant, VEP:Consequence] or + exist[downstream_gene_variant, VEP:Consequence] or + exist[TFBS_ablation, VEP:Consequence] or + exist[TFBS_amplification, VEP:Consequence] or + exist[TF_binding_site_variant, VEP:Consequence] or + exist[regulatory_region_ablation, VEP:Consequence] or + exist[regulatory_region_amplification, VEP:Consequence] or + exist[regulatory_region_variant, VEP:Consequence])" + soft_filter_flag: "Consequence" + soft_filter: "True" From 82f50c5be13bd8a437435b2a685df4fac817b6b5 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 11 Sep 2024 11:50:42 +0200 Subject: [PATCH 08/11] refactor: merge vep rules for t and tn into one rule --- workflow/Snakefile | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 673e759..d664179 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -96,42 +96,23 @@ use rule * from annotation as annotation_* use rule vep from annotation as annotation_vep with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz.tbi", cache=config["vep"]["vep_cache"], fasta=config["reference"]["fasta"], output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.vcf"), log: - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.log", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.vcf.benchmark.tsv", config.get("vep", {}).get("benchmark_repeats", 1), ) message: "{rule}: Annotate {input.vcf} with vep" -use rule vep from annotation as annotation_vep_t with: - input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz.tbi", - cache=config["vep"]["vep_cache"], - fasta=config["reference"]["fasta"], - output: - 
vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf"), - log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.log", - benchmark: - repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.benchmark.tsv", - config.get("vep", {}).get("benchmark_repeats", 1), - ) - message: - "{rule}: Annotate {input.vcf} with vep." - - use rule simple_sv_annotation from annotation as annotation_simple_sv_annotation_tn with: input: vcf="cnv_sv/manta_run_workflow_tn/{sample}/results/variants/somaticSV.snpeff.vcf.gz", @@ -147,7 +128,7 @@ use rule simple_sv_annotation from annotation as annotation_simple_sv_annotation config.get("snpeff", {}).get("benchmark_repeats", 1), ) message: - "{rule}: Annotate cnv_sv/manta_run_workflow_tn/{wildcards.sample}/results/variants/somaticSV.snpeff.vcf.gz" + "{rule}: Annotate {input.vcf}" use rule simple_sv_annotation from annotation as annotation_simple_sv_annotation_t with: From a92c8fb0abaf0ef0039f02605ee5604e506a2027 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 11 Sep 2024 11:54:24 +0200 Subject: [PATCH 09/11] feat: add n_ratio annotation to tn vcfs --- config/output_files.json | 18 ++++---- docs/softwares.md | 21 ++++++++++ workflow/Snakefile | 11 ++--- workflow/rules/annotate.smk | 31 ++++++++++++++ workflow/rules/export.smk | 20 +++++++-- workflow/rules/mutectcaller_to_tsv.smk | 9 +++- workflow/schemas/config.schema.yaml | 14 +++++++ workflow/schemas/resources.schema.yaml | 20 +++++++++ workflow/schemas/rules.schema.yaml | 22 +++++++++- workflow/scripts/annotate_normal_ratio.py | 51 +++++++++++++++++++++++ workflow/scripts/export_to_xlsx.py | 6 +-- 11 files changed, 200 insertions(+), 23 deletions(-) create mode 100644 workflow/rules/annotate.smk create mode 100644 workflow/scripts/annotate_normal_ratio.py diff --git a/config/output_files.json b/config/output_files.json index 3796349..27228a6 100644 --- a/config/output_files.json +++ b/config/output_files.json @@ -21,19 +21,19 @@ "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.all.vcf.gz", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.all.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.aml.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.filter.somatic.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.aml.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.tm.vcf.gz", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.tm.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.xlsx": {"name": 
"_results_xlsx_t", "file": "export_to_xlsx/t/{sample}_T.snvs.xlsx", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.all.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.all.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.aml.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.aml.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.tm.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.tm.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.all.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.all.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.aml.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.aml.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.tm.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.tm.vcf.gz.tbi", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_TN.xlsx": {"name": "_results_xlsx_tn", "file": 
"export_to_xlsx/tn/{sample}.snvs.xlsx", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_T_mutectcaller_t.all.tsv", "types": ["T"]}, diff --git a/docs/softwares.md b/docs/softwares.md index 9dc8a30..371ed95 100644 --- a/docs/softwares.md +++ b/docs/softwares.md @@ -19,3 +19,24 @@ Introduction to export_to_xlsx #### Resources settings (`resources.yaml`) #RESOURCESSCHEMA__export_to_xlsx# + +## [annotate_normal_ratio](url_to_tool) +Introduction to annotate_normal_ratio + +### :snake: Rule + +#SNAKEMAKE_RULE_SOURCE__annotate__annotate_normal_ratio# + +#### :left_right_arrow: input / output files + +#SNAKEMAKE_RULE_TABLE__annotate__annotate_normal_ratio# + +### :wrench: Configuration + +#### Software settings (`config.yaml`) + +#CONFIGSCHEMA__annotate_normal_ratio# + +#### Resources settings (`resources.yaml`) + +#RESOURCESSCHEMA__annotate_normal_ratio# diff --git a/workflow/Snakefile b/workflow/Snakefile index d664179..9dbf58e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -5,6 +5,7 @@ __license__ = "GPL-3" include: "rules/common.smk" +include: "rules/annotate.smk" include: "rules/cnvkit.smk" include: "rules/cnvkit_table.smk" include: "rules/dux4_rearrangements.smk" @@ -325,18 +326,18 @@ use rule bcftools_filter_include_region from filtering as filtering_bcftools_fil use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_tn with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk new file mode 100644 index 0000000..f3110c0 --- /dev/null +++ b/workflow/rules/annotate.smk @@ -0,0 +1,31 @@ +__author__ = "Arielle R. Munters" +__copyright__ = "Copyright 2024, Arielle R. 
Munters" +__email__ = "arielle.munters@scilifelab.uu.se" +__license__ = "GPL-3" + + +rule annotate_normal_ratio: + input: + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf", + output: + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz", + log: + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz.log", + benchmark: + repeat( + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz.benchmark.tsv", + config.get("annotate_normal_ratio", {}).get("benchmark_repeats", 1) + ) + threads: config.get("annotate_normal_ratio", {}).get("threads", config["default_resources"]["threads"]) + resources: + mem_mb=config.get("annotate_normal_ratio", {}).get("mem_mb", config["default_resources"]["mem_mb"]), + mem_per_cpu=config.get("annotate_normal_ratio", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), + partition=config.get("annotate_normal_ratio", {}).get("partition", config["default_resources"]["partition"]), + threads=config.get("annotate_normal_ratio", {}).get("threads", config["default_resources"]["threads"]), + time=config.get("annotate_normal_ratio", {}).get("time", config["default_resources"]["time"]), + container: + config.get("annotate_normal_ratio", {}).get("container", config["default_container"]) + message: + "{rule}: Add normal ratio annotation to {input.vcf}" + script: + "../scripts/annotate_normal_ratio.py" diff --git a/workflow/rules/export.smk b/workflow/rules/export.smk index 48dcc7f..a69e40d 100644 --- a/workflow/rules/export.smk +++ b/workflow/rules/export.smk @@ -4,13 +4,25 @@ __email__ = "arielle.munters@scilifelab.uu.se" __license__ = "GPL-3" +def check_if_tn(wildcards): + if wildcards.analysis == "tn": + vcfs = expand( + "parabricks/pbrun_mutectcaller_{{analysis}}/{{sample_type}}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf.gz", + bed=["all", "aml", "tm"], + ) + else: + vcfs = expand( + "parabricks/pbrun_mutectcaller_{{analysis}}/{{sample_type}}.normalized.vep.filter.somatic.include.{bed}.vcf.gz", + bed=["all", "aml", "tm"], + ) + return vcfs + + rule export_to_xlsx: input: - all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.all.vcf.gz", + vcfs=check_if_tn, all_bed=config["bcftools_SNV"]["all"], - aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.aml.vcf.gz", aml_bed=config["bcftools_SNV"]["aml"], - tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.tm.vcf.gz", tm_bed=config["bcftools_SNV"]["tm"], output: xlsx=temp("export_to_xlsx/{analysis}/{sample_type}.snvs.xlsx"), @@ -33,6 +45,6 @@ rule export_to_xlsx: container: config.get("export_to_xlsx", {}).get("container", config["default_container"]) message: - "{rule}: merge {input.all}, {input.aml}, {input.tm} into {output.xlsx}" + "{rule}: merge {input.vcfs} into {output.xlsx}" script: "../scripts/export_to_xlsx.py" diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index cc7ba82..6a31e1d 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -3,10 +3,17 @@ __copyright__ = "Copyright 2021, Martin Rippin" __email__ = "martin.rippin@scilifelab.uu.se" __license__ = "GPL-3" +def check_if_tn(wildcards): + if wildcards.analysis == "tn": + vcf = "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf" + else: + vcf = 
"parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.{bed}.vcf" + return vcf + rule mutectcaller_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.{bed}.vcf", + vcf=check_if_tn, output: tsv=temp("tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv"), params: diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index d827352..a86cb14 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -133,6 +133,20 @@ properties: type: string description: path to CNA gene bed file + annotate_normal_ratio: + type: object + description: parameters for annotate_normal_ratio + properties: + benchmark_repeats: + type: integer + description: set number of times benchmark should be repeated + container: + type: string + description: name or path to docker/singularity container + extra: + type: string + description: parameters that should be forwarded + arriba: type: object properties: diff --git a/workflow/schemas/resources.schema.yaml b/workflow/schemas/resources.schema.yaml index d162ea6..6f67bb1 100644 --- a/workflow/schemas/resources.schema.yaml +++ b/workflow/schemas/resources.schema.yaml @@ -48,6 +48,26 @@ properties: type: string description: max execution time + annotate_normal_ratio: + type: object + description: resource definitions for annotate_normal_ratio + properties: + mem_mb: + type: integer + description: max memory in MB to be available + mem_per_cpu: + type: integer + description: memory in MB used per cpu + partition: + type: string + description: partition to use on cluster + threads: + type: integer + description: number of threads to be available + time: + type: string + description: max execution time + arriba: type: object description: resource definitions for arriba diff --git a/workflow/schemas/rules.schema.yaml b/workflow/schemas/rules.schema.yaml index 4a93b8b..8e70e7f 100644 --- a/workflow/schemas/rules.schema.yaml +++ b/workflow/schemas/rules.schema.yaml @@ -3,6 +3,26 @@ description: snakemake rule input and output files description file type: object properties: + + annotate_normal_ratio: + type: object + description: input and output parameters for tumour normal vcf with annotate_normal_ratio + properties: + input: + type: object + description: list of inputs + properties: + vcf: + type: string + description: vcf file with both normal and tumor sample af used to annotate INFO column with n_frq/t_frq + output: + type: object + description: list of outputs + properties: + vcf: + type: string + description: vcf file with N_ratio annotation in INFO field + export_to_xlsx: type: object description: input and output parameters for export_to_xlsx @@ -35,4 +55,4 @@ properties: properties: xlsx: type: string - description: name of xlsx file for summerizing snv results based on different genelists \ No newline at end of file + description: name of xlsx file for summarizing snv results based on different genelists diff --git a/workflow/scripts/annotate_normal_ratio.py b/workflow/scripts/annotate_normal_ratio.py new file mode 100644 index 0000000..e669325 --- /dev/null +++ b/workflow/scripts/annotate_normal_ratio.py @@ -0,0 +1,51 @@ +#!/bin/python3 + +from pysam import VariantFile +import logging + +logging.basicConfig( + format="{asctime} - {levelname} - {message}", + style="{", + datefmt="%Y-%m-%d %H:%M", + level=logging.INFO, +) + +logging.info("Starting to process " + snakemake.input.vcf) + +vcf_in = 
VariantFile(snakemake.input.vcf) + +new_header = vcf_in.header +new_header.info.add("N_ratio", "A", "Float", "Number of times larger normal AF is than tumour sample.") +logging.debug("Creating header for " + snakemake.output.vcf) +vcf_out = VariantFile(snakemake.output.vcf, "wb", header=new_header) + +sample_tumor = [x for x in list(vcf_in.header.samples) if x.endswith("_T")][0] +sample_normal = [x for x in list(vcf_in.header.samples) if x.endswith("_N")][0] +logging.debug(f"{sample_tumor=}, {sample_normal=}") + +logging.info("Starting to iterate through input.") +for record in vcf_in.fetch(): + try: + n_freq = float(record.samples[sample_normal].get("AF")[0]) + except TypeError: + n_freq = 0 + + try: + t_freq = float(record.samples[sample_tumor].get("AF")[0]) + except TypeError: + t_freq = 0 + + if n_freq == 0: + logging.warning("Normal freq 0 or missing, setting ratio to 0") + ratio = 0 + elif t_freq == 0: + logging.warning("Tumor freq 0 or missing, setting ratio to 100") + ratio = 100 + else: + ratio = float(record.samples[sample_normal].get("AF")[0]) / float(record.samples[sample_tumor].get("AF")[0]) + + logging.debug(f"{record.samples[sample_normal].get('AF')[0]=}, {record.samples[sample_tumor].get('AF')[0]=}") + logging.debug(f"{ratio=}") + record.info["N_ratio"] = ratio + + vcf_out.write(record) diff --git a/workflow/scripts/export_to_xlsx.py b/workflow/scripts/export_to_xlsx.py index 5eaba76..2fc5d7c 100644 --- a/workflow/scripts/export_to_xlsx.py +++ b/workflow/scripts/export_to_xlsx.py @@ -20,9 +20,9 @@ def index_vep(variantfile): bedfiles["tm"] = snakemake.input.tm_bed vcfs = {} -vcfs["all"] = snakemake.input.all -vcfs["aml"] = snakemake.input.aml -vcfs["tm"] = snakemake.input.tm +vcfs["all"] = [x for x in snakemake.input.vcfs if "include.all" in x][0] +vcfs["aml"] = [x for x in snakemake.input.vcfs if "include.aml" in x][0] +vcfs["tm"] = [x for x in snakemake.input.vcfs if "include.tm" in x][0] subsections = ["all", "aml", "tm"] vcf_tables = {} for subsection in subsections: From ada2826d6aa45790e82f6c975f23ad33cc1e3f3e Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 11 Sep 2024 11:55:06 +0200 Subject: [PATCH 10/11] feat: filter out variants with n_ratio lgt 5 --- config/config_snv_softfilter.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index 1949147..7b4f942 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -44,3 +44,8 @@ filters: exist[regulatory_region_variant, VEP:Consequence])" soft_filter_flag: "Consequence" soft_filter: "True" + n_ratio: + description: "Soft filter variants where normal freq is atleast 5 times bigger than tumour freq" + expression: "INFO:NA_FALSE:N_ratio >= 5" + soft_filter_flag: "N_ratio_lgt_5" + soft_filter: "True" From 71c8871d2d11f388f13f9381e2fce6aea76c3c9c Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 11 Sep 2024 15:32:32 +0200 Subject: [PATCH 11/11] fix: update annotate_ratio output format, filter ratio to >5, set N_ratio to one value in annotation --- config/config_snv_softfilter.yaml | 2 +- workflow/rules/annotate.smk | 2 +- workflow/scripts/annotate_normal_ratio.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index 7b4f942..2655f53 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -46,6 +46,6 @@ filters: soft_filter: "True" n_ratio: description: "Soft filter 
variants where normal freq is atleast 5 times bigger than tumour freq" - expression: "INFO:NA_FALSE:N_ratio >= 5" + expression: "INFO:NA_FALSE:N_ratio > 5" soft_filter_flag: "N_ratio_lgt_5" soft_filter: "True" diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index f3110c0..79f9751 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -8,7 +8,7 @@ rule annotate_normal_ratio: input: vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf", output: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf", log: "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz.log", benchmark: diff --git a/workflow/scripts/annotate_normal_ratio.py b/workflow/scripts/annotate_normal_ratio.py index e669325..594b48c 100644 --- a/workflow/scripts/annotate_normal_ratio.py +++ b/workflow/scripts/annotate_normal_ratio.py @@ -15,9 +15,9 @@ vcf_in = VariantFile(snakemake.input.vcf) new_header = vcf_in.header -new_header.info.add("N_ratio", "A", "Float", "Number of times larger normal AF is than tumour sample.") +new_header.info.add("N_ratio", "1", "Float", "Number of times larger normal AF is than tumour sample.") logging.debug("Creating header for " + snakemake.output.vcf) -vcf_out = VariantFile(snakemake.output.vcf, "wb", header=new_header) +vcf_out = VariantFile(snakemake.output.vcf, "w", header=new_header) sample_tumor = [x for x in list(vcf_in.header.samples) if x.endswith("_T")][0] sample_normal = [x for x in list(vcf_in.header.samples) if x.endswith("_N")][0]
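
For reference, a minimal stand-alone sketch of the N_ratio logic these patches introduce: the normal-sample AF divided by the tumor-sample AF, written as a single Float INFO field and later soft-filtered by config_snv_softfilter.yaml when it exceeds 5. This is only an illustration, not the pipeline code itself; the file names below are hypothetical placeholders, whereas the real rule drives workflow/scripts/annotate_normal_ratio.py through snakemake.input/snakemake.output.

    #!/bin/python3
    # Sketch of the N_ratio annotation (assumed example paths, not pipeline outputs).
    from pysam import VariantFile

    vcf_in = VariantFile("example.normalized.vep.vcf")  # hypothetical input path
    header = vcf_in.header
    header.info.add("N_ratio", "1", "Float", "How many times larger the normal AF is than the tumor AF.")
    vcf_out = VariantFile("example.normalized.vep.ratio.vcf", "w", header=header)  # hypothetical output path

    # Sample columns follow the {sample}_T / {sample}_N naming used by the pipeline.
    sample_tumor = next(s for s in vcf_in.header.samples if s.endswith("_T"))
    sample_normal = next(s for s in vcf_in.header.samples if s.endswith("_N"))

    for record in vcf_in:
        af_n = record.samples[sample_normal].get("AF")
        af_t = record.samples[sample_tumor].get("AF")
        n_freq = float(af_n[0]) if af_n and af_n[0] is not None else 0.0
        t_freq = float(af_t[0]) if af_t and af_t[0] is not None else 0.0
        if n_freq == 0:
            ratio = 0.0    # no support in the normal: variant stays unflagged
        elif t_freq == 0:
            ratio = 100.0  # normal-only signal: caught by the N_ratio > 5 soft filter
        else:
            ratio = n_freq / t_freq
        record.info["N_ratio"] = ratio
        vcf_out.write(record)

    vcf_out.close()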