From 67c0c5b865e61d052c4cad603065c40f9ee77ef1 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Thu, 15 Aug 2024 15:50:50 +0200 Subject: [PATCH 01/11] refactor: add decompose and normalizing for t vcfs --- config/config.yaml | 1 + config/output_files.json | 16 +++--- workflow/Snakefile | 67 +++++++++++++++++++++----- workflow/rules/common.smk | 2 +- workflow/rules/export.smk | 6 +-- workflow/rules/fix_af.smk | 8 +-- workflow/rules/html_output.smk | 2 +- workflow/rules/mutectcaller_to_tsv.smk | 2 +- 8 files changed, 73 insertions(+), 31 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 2dde122..92b5f6a 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -16,6 +16,7 @@ modules: prealignment: "v1.1.0" qc: "v0.3.0" reports: "7c8b8c5" + snv_indels: "v0.6.0" misc: "v0.1.0" sentieon: "b002d39" diff --git a/config/output_files.json b/config/output_files.json index e5d9ab9..e26deec 100644 --- a/config/output_files.json +++ b/config/output_files.json @@ -16,14 +16,14 @@ "Results/{sample}/Cram/{sample}_{type}.crumble.cram": {"name": "_results_cram", "file": "compression/crumble/{sample}_{type}.crumble.cram", "types": ["T", "N"]}, "Results/{sample}/Cram/{sample}_{type}.crumble.cram.crai": {"name": "_results_crai", "file": "compression/crumble/{sample}_{type}.crumble.cram.crai", "types": ["T", "N"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz": {"name": "_results_vcf_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi": {"name": "_results_tbi_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.all.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.all.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.aml.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.tm.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.tm.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz": {"name": "_results_vcf_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi": {"name": "_results_tbi_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.all.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.all.vcf.gz.tbi", "types": 
["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.aml.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.xlsx": {"name": "_results_xlsx_t", "file": "export_to_xlsx/t/{sample}_T.snvs.xlsx", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz", "types": ["TN"]}, diff --git a/workflow/Snakefile b/workflow/Snakefile index cc56c70..f3e2a21 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -115,21 +115,21 @@ use rule vep from annotation as annotation_vep with: use rule vep from annotation as annotation_vep_t with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz.tbi", cache=config["vep"]["vep_cache"], fasta=config["reference"]["fasta"], output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf"), log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.log", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.benchmark.tsv", config.get("vep", {}).get("benchmark_repeats", 1), ) message: - "{rule}: Annotate {wildcards.sample}_T.vcf" + "{rule}: Annotate {input.vcf} with vep." 
use rule simple_sv_annotation from annotation as annotation_simple_sv_annotation_tn with: @@ -324,18 +324,18 @@ use rule filter_vcf from filtering as filtering_filter_vcf use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_t with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.vep.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.vep.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: @@ -556,9 +556,9 @@ if aligner == "bwa_sentieon": use rule peddy from qc as qc_peddy with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz", ped="qc/peddy/{sample}.peddy.fam", - tbi="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz.tbi", + tbi="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz.tbi", output: ped=temp("qc/peddy/{sample}/peddy.peddy.ped"), ped_check=temp("qc/peddy/{sample}/peddy.ped_check.csv"), @@ -712,6 +712,47 @@ use rule cnv_html_report from reports as reports_cnv_html_report with: tc_method="pathology", +module snv_indels: + snakefile: + github( + "hydra-genetics/snv_indels", + path="workflow/Snakefile", + tag=config["modules"]["snv_indels"], + ) + config: + config + + +use rule vt_decompose from snv_indels as snv_indels_vt_decompose with: + input: + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz", + tbi="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz.tbi", + output: + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz"), + log: + "parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz.log", + benchmark: + repeat( + "parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz.benchmark.tsv", + config.get("vt_decompose", {}).get("benchmark_repeats", 1), + ) + + +use rule vt_normalize from snv_indels as snv_indels_vt_normalize with: + input: + vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.decomposed.vcf.gz", + ref=config["reference"]["fasta"], + output: + vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz", + log: + "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz.log", + benchmark: + repeat( + "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz.benchmark.tsv", + config.get("vt_normalize", {}).get("benchmark_repeats", 1), + ) + + module misc: snakefile: github("hydra-genetics/misc", path="workflow/Snakefile", tag=config["modules"]["misc"]) diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index e006eaf..3fc5f76 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -110,7 +110,7 @@ def get_vcf_input(wildcards): if aligner is None: 
sys.exit("aligner missing from config, valid options: bwa_gpu or bwa_sentieon") elif aligner == "bwa_gpu": - vcf_input = "parabricks/pbrun_mutectcaller_t/{}_{}.vep.filter.germline.vcf".format(wildcards.sample, wildcards.type) + vcf_input = "parabricks/pbrun_mutectcaller_t/{}_{}.normalized.vep.filter.germline.vcf".format(wildcards.sample, wildcards.type) elif aligner == "bwa_sentieon": vcf_input = "sentieon/tnscope/{}_TNscope_tn_ML.vcf".format(wildcards.sample) else: diff --git a/workflow/rules/export.smk b/workflow/rules/export.smk index 7ee9256..6581282 100644 --- a/workflow/rules/export.smk +++ b/workflow/rules/export.smk @@ -6,11 +6,11 @@ __license__ = "GPL-3" rule export_to_xlsx: input: - all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vep.include.all.vcf.gz", + all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.all.vcf.gz", all_bed=config["bcftools_SNV"]["all"], - aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vep.include.aml.vcf.gz", + aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.aml.vcf.gz", aml_bed=config["bcftools_SNV"]["aml"], - tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vep.include.tm.vcf.gz", + tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.tm.vcf.gz", tm_bed=config["bcftools_SNV"]["tm"], output: xlsx=temp("export_to_xlsx/{analysis}/{sample_type}.snvs.xlsx"), diff --git a/workflow/rules/fix_af.smk b/workflow/rules/fix_af.smk index 5d84ca1..e31cca5 100644 --- a/workflow/rules/fix_af.smk +++ b/workflow/rules/fix_af.smk @@ -1,13 +1,13 @@ rule fix_af: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.vcf", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.vcf", output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.fix_af.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.fix_af.vcf"), log: - "parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.fix_af.log", + "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.fix_af.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.fix_af.benchmark.tsv", + "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.fix_af.benchmark.tsv", config.get("fix_af", {}).get("benchmark_repeats", 1), ) threads: config.get("fix_af", {}).get("threads", config["default_resources"]["threads"]) diff --git a/workflow/rules/html_output.smk b/workflow/rules/html_output.smk index 9e5162c..1fda261 100644 --- a/workflow/rules/html_output.smk +++ b/workflow/rules/html_output.smk @@ -9,7 +9,7 @@ rule merge_cnv_json_chr: json=get_json_for_merge_cnv_json_chr, fai=config.get("reference", {}).get("fai", ""), annotation_bed=list(config.get("annotate_cnv", {}).values()), - germline_vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.vep.filter.germline.fix_af.vcf", + germline_vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vep.filter.germline.fix_af.vcf", cnv_vcfs=get_unfiltered_cnv_vcfs_for_merge_json, filtered_cnv_vcfs=get_filtered_cnv_vcfs_for_merge_json, filtered_cnv_vcfs_tbi=get_filtered_cnv_vcfs_tbi_for_merge_json, diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index 547d3de..04b4bc0 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -36,7 +36,7 @@ rule 
mutectcaller_tn_to_tsv: rule mutectcaller_t_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}_T.vep.include.{bed}.vcf", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}_T.normalized.vep.include.{bed}.vcf", output: tsv=temp("tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv"), params: From 03159b772cf9c0f48d86b60a40fda3303f15fc2f Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Mon, 19 Aug 2024 11:33:09 +0200 Subject: [PATCH 02/11] refactor: add decompose and normalizing to tn-vcfs --- config/output_files.json | 16 +++++------ workflow/Snakefile | 40 +++++++++++++------------- workflow/rules/mutectcaller_to_tsv.smk | 2 +- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/config/output_files.json b/config/output_files.json index e26deec..3122e7c 100644 --- a/config/output_files.json +++ b/config/output_files.json @@ -26,14 +26,14 @@ "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.xlsx": {"name": "_results_xlsx_t", "file": "export_to_xlsx/t/{sample}_T.snvs.xlsx", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.all.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.all.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.aml.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.aml.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.tm.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.tm.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.all.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.all.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": 
"parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.aml.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.aml.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz.tbi", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_TN.xlsx": {"name": "_results_xlsx_tn", "file": "export_to_xlsx/tn/{sample}.snvs.xlsx", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_mutectcaller_t.all.tsv", "types": ["T"]}, diff --git a/workflow/Snakefile b/workflow/Snakefile index f3e2a21..80df038 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -96,21 +96,21 @@ use rule * from annotation as annotation_* use rule vep from annotation as annotation_vep with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vcf.gz.tbi", cache=config["vep"]["vep_cache"], fasta=config["reference"]["fasta"], output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf"), log: - "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.log", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.benchmark.tsv", config.get("vep", {}).get("benchmark_repeats", 1), ) message: - "{rule}: Annotate {wildcards.sample}.vcf" + "{rule}: Annotate {input.vcf} with vep" use rule vep from annotation as annotation_vep_t with: @@ -344,18 +344,18 @@ use rule bcftools_filter_include_region from filtering as filtering_bcftools_fil use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_tn with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.vep.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.vep.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: @@ -725,30 +725,30 @@ module snv_indels: use rule vt_decompose 
from snv_indels as snv_indels_vt_decompose with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz", - tbi="parabricks/pbrun_mutectcaller_t/{sample}_T.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vcf.gz", + tbi="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz"), + vcf=temp("parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.decomposed.vcf.gz"), log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz.log", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.decomposed.vcf.gz.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.decomposed.vcf.gz.benchmark.tsv", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.decomposed.vcf.gz.benchmark.tsv", config.get("vt_decompose", {}).get("benchmark_repeats", 1), ) use rule vt_normalize from snv_indels as snv_indels_vt_normalize with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.decomposed.vcf.gz", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.decomposed.vcf.gz", ref=config["reference"]["fasta"], output: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz", log: - "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz.log", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_{type}.normalized.vcf.gz.benchmark.tsv", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz.benchmark.tsv", config.get("vt_normalize", {}).get("benchmark_repeats", 1), ) diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index 04b4bc0..d866c9a 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -6,7 +6,7 @@ __license__ = "GPL-3" rule mutectcaller_tn_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}.vep.include.{bed}.vcf", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}.normalized.vep.include.{bed}.vcf", output: tsv=temp("tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv"), params: From 5918aae31358bc997d38904ee014a09ba1849ff2 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Tue, 20 Aug 2024 10:54:01 +0200 Subject: [PATCH 03/11] refactor: update mutect to tsv script to include normal af and dp for tn samples --- config/config.yaml | 7 ++ config/output_files.json | 4 +- workflow/rules/mutectcaller_to_tsv.smk | 41 +------ workflow/scripts/mutectcaller_to_tsv.py | 146 ++++++++++++++++++++---- 4 files changed, 139 insertions(+), 59 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 92b5f6a..7048b56 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -338,3 +338,10 @@ vep: container: "docker://hydragenetics/vep:105" vep_cache: "" extra: "--assembly GRCh38 --check_existing --pick --sift b --polyphen b --ccds --uniprot --hgvs --symbol --numbers --domains --regulatory --canonical --protein --biotype --uniprot --tsl --appris --gene_phenotype --af --af_1kg --af_gnomad --max_af --pubmed --variant_class" + +vt_decompose: + container: "docker://hydragenetics/vt:2015.11.10" + +vt_normalize: + container: "docker://hydragenetics/vt:2015.11.10" + \ No newline at end of file diff --git a/config/output_files.json b/config/output_files.json index 3122e7c..851a054 100644 --- 
a/config/output_files.json +++ b/config/output_files.json @@ -36,8 +36,8 @@ "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz.tbi", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_TN.xlsx": {"name": "_results_xlsx_tn", "file": "export_to_xlsx/tn/{sample}.snvs.xlsx", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_mutectcaller_t.all.tsv", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.aml.tsv": {"name": "_results_tsv_aml_t", "file": "tsv_files/{sample}_mutectcaller_t.aml.tsv", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_T_mutectcaller_t.all.tsv", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.aml.tsv": {"name": "_results_tsv_aml_t", "file": "tsv_files/{sample}_T_mutectcaller_t.aml.tsv", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_TN.all.tsv": {"name": "_results_tsv_all", "file": "tsv_files/{sample}_mutectcaller_tn.all.tsv", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_TN.aml.tsv": {"name": "_results_tsv_aml", "file": "tsv_files/{sample}_mutectcaller_tn.aml.tsv", "types": ["TN"]}, diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index d866c9a..61fcaca 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -4,49 +4,18 @@ __email__ = "martin.rippin@scilifelab.uu.se" __license__ = "GPL-3" -rule mutectcaller_tn_to_tsv: +rule mutectcaller_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}.normalized.vep.include.{bed}.vcf", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.{bed}.vcf", output: - tsv=temp("tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv"), + tsv=temp("tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv"), params: analysis="{analysis}", - sample="{sample}", log: - "tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv.log", + "tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv.log", benchmark: repeat( - "tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv.benchmark.tsv", - config.get("mutectcaller_to_tsv", {}).get("benchmark_repeats", 1), - ) - threads: config.get("mutectcaller_to_tsv", {}).get("threads", config["default_resources"]["threads"]) - resources: - mem_mb=config.get("mutectcaller_to_tsv", {}).get("mem_mb", config["default_resources"]["mem_mb"]), - mem_per_cpu=config.get("mutectcaller_to_tsv", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), - partition=config.get("mutectcaller_to_tsv", {}).get("partition", config["default_resources"]["partition"]), - threads=config.get("mutectcaller_to_tsv", {}).get("threads", config["default_resources"]["threads"]), - time=config.get("mutectcaller_to_tsv", {}).get("time", config["default_resources"]["time"]), - container: - config.get("mutectcaller_to_tsv", {}).get("container", config["default_container"]) - message: - "{rule}: select relevant info from {input.vcf} and compile into tsv" - script: - "../scripts/mutectcaller_to_tsv.py" - - -rule mutectcaller_t_to_tsv: - input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample}_T.normalized.vep.include.{bed}.vcf", - output: - tsv=temp("tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv"), - 
params: - analysis="{analysis}", - sample="{sample}", - log: - "tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv.log", - benchmark: - repeat( - "tsv_files/{sample}_mutectcaller_{analysis}.{bed}.tsv.benchmark.tsv", + "tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv.benchmark.tsv", config.get("mutectcaller_to_tsv", {}).get("benchmark_repeats", 1), ) threads: config.get("mutectcaller_to_tsv", {}).get("threads", config["default_resources"]["threads"]) diff --git a/workflow/scripts/mutectcaller_to_tsv.py b/workflow/scripts/mutectcaller_to_tsv.py index a48538c..d7129c8 100755 --- a/workflow/scripts/mutectcaller_to_tsv.py +++ b/workflow/scripts/mutectcaller_to_tsv.py @@ -3,30 +3,134 @@ import csv import sys -import vcf +from pysam import VariantFile +import logging -input_file = snakemake.input.vcf +logging.basicConfig( + format="{asctime} - {levelname} - {message}", + style="{", + datefmt="%Y-%m-%d %H:%M", + level=logging.INFO, +) + +input_file = VariantFile(snakemake.input.vcf) analysis = snakemake.params.analysis -sample = snakemake.params.sample output_file = snakemake.output.tsv -vcf = vcf.Reader(open(input_file, "r")) +csq_index = [] +for x in input_file.header.records: + if "CSQ" in str(x): + csq_index = str(x).split("Format: ")[1].strip().strip('">').split("|") + +outlines = [] +if len(list(input_file.header.samples)) == 1: + logging.debug("One sample identified. " + f"{list(input_file.header.samples)=}") + outlines.append( + [ + "#SAMPLE", + "CHROMOSOME", + "POSITION", + "REFERENCE", + "ALTERNATIVE", + "ALLELEFREQUENCY", + "DEPTH", + "GENE", + "TRANSCRIPT", + "NUCLEOTIDESEQ", + "PROTEIN", + "PROTEINSEQ", + "CONSEQUENCE", + ] + ) + +elif len(list(input_file.header.samples)) == 2: + logging.debug("Two sample identified. " + f"{list(input_file.header.samples)=}") + outlines.append( + [ + "#SAMPLE", + "CHROMOSOME", + "POSITION", + "REFERENCE", + "ALTERNATIVE", + "ALLELEFREQUENCY", + "NORMAL_AF", + "DEPTH", + "NORMAL_DP", + "GENE", + "TRANSCRIPT", + "NUCLEOTIDESEQ", + "PROTEIN", + "PROTEINSEQ", + "CONSEQUENCE", + ] + ) +else: + logging.error(snakemake.input.vcf + " contains more than two samples in header.") + sys.exit() + + +logging.info("Starting to iterate through calls in " + snakemake.input.vcf) +for row in input_file.fetch(): + csq = row.info["CSQ"][0].split("|") + gene = csq[csq_index.index("SYMBOL")] + + transcript_name = csq[csq_index.index("HGVSc")].split(":")[0] + transcript_change = "" + if len(csq[csq_index.index("HGVSc")].split(":")) > 1: + transcript_change = csq[csq_index.index("HGVSc")].split(":")[1] + + protein_name = csq[csq_index.index("HGVSp")].split(":")[0] + protein_change = "" + if len(csq[csq_index.index("HGVSp")].split(":")) > 1: + protein_change = csq[csq_index.index("HGVSp")].split(":")[1] + + consequence = csq[csq_index.index("Consequence")] + + tumor_sample = [x for x in row.samples if x.endswith("_T")][0] + af = row.samples[tumor_sample]["AF"][0] + dp = row.samples[tumor_sample]["DP"][0] + outline = [ + tumor_sample.split("_")[0], + row.chrom, + row.pos, + row.ref, + row.alts[0], + af, + dp, + gene, + transcript_name, + transcript_change, + protein_name, + protein_change, + consequence, + ] + + if len(list(input_file.header.samples)) == 2: + normal_sample = [x for x in row.samples if x.endswith("_N")][0] + af_normal = row.samples[normal_sample]["AF"][0] + dp_normal = row.samples[normal_sample]["DP"][0] + outline = [ + tumor_sample.split("_")[0], + row.chrom, + row.pos, + row.ref, + row.alts[0], + af, + af_normal, + dp, + dp_normal, + gene, + 
transcript_name, + transcript_change, + protein_name, + protein_change, + consequence, + ] + + outlines.append(outline) + +logging.info("Writing results to " + snakmake.output.tsv) with open(output_file, "wt") as tsv: - tsv_writer = csv.writer(tsv, delimiter='\t') - tsv_writer.writerow(["#SAMPLE", "CHROMOSOME", "POSITION", "REFERENCE", "ALTERNATIVE", "ALLELEFREQUENCY", "DEPTH", "GENE", - "TRANSCRIPT", "NUCLEOTIDESEQ", "PROTEIN", "PROTEINSEQ", "CONSEQUENCE"]) - for row in vcf: - genes = row.INFO["CSQ"][0].split("|") - transcript_field = genes[10].split(":") - if len(transcript_field) == 1: - transcript_field = ["", ""] - protein_field = genes[11].split(":") - if len(protein_field) == 1: - protein_field = [genes[24], ""] - i = 0 - if analysis == "tn": - i = 1 - af = row.samples[i]["AF"] - tsv_writer.writerow([sample, row.CHROM, row.POS, row.REF, row.ALT, af, row.INFO["DP"], genes[3], transcript_field[0], - transcript_field[1], protein_field[0], protein_field[1], genes[1]]) + tsv_writer = csv.writer(tsv, delimiter="\t") + tsv_writer.writerows(outlines) From cdf9c63ac5d4e6fe0f52d49eac4c09db72f54a41 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Tue, 20 Aug 2024 14:11:41 +0200 Subject: [PATCH 04/11] feat: add preliminary soft filters --- config/config.yaml | 2 +- config/config_snv_softfilter.yaml | 31 ++++++++++++++++++++++++ config/output_files.json | 32 ++++++++++++------------- workflow/Snakefile | 20 ++++++++-------- workflow/rules/export.smk | 6 ++--- workflow/rules/mutectcaller_to_tsv.smk | 2 +- workflow/scripts/export_to_xlsx.py | 22 ++++++++++------- workflow/scripts/mutectcaller_to_tsv.py | 5 ++++ 8 files changed, 80 insertions(+), 40 deletions(-) create mode 100644 config/config_snv_softfilter.yaml diff --git a/config/config.yaml b/config/config.yaml index 7048b56..b0ec8fe 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -120,6 +120,7 @@ fastqc: filter_vcf: germline: "config/config_germline_filter.yaml" cnv_hardfilter: "config/config_cnv_hardfilter.yaml" + somatic: "config/config_snv_softfilter.yaml" fusioncatcher: container: "docker://hydragenetics/fusioncatcher:1.33" @@ -344,4 +345,3 @@ vt_decompose: vt_normalize: container: "docker://hydragenetics/vt:2015.11.10" - \ No newline at end of file diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml new file mode 100644 index 0000000..23747a2 --- /dev/null +++ b/config/config_snv_softfilter.yaml @@ -0,0 +1,31 @@ +filters: + af: + description: "Softfilter variants where T have AF lower than 5 percent" + expression: "INFO:AF < 0.05" + soft_filter: "True" + soft_filter_flag: "AF_lt_0.05" + ad: + description: "Softfilter variants where T have AD lower than 3?" + expression: "INFO:AD:1 < 3" + soft_filter: "True" + soft_filter_flag: "AD_lt_3" + dp: + description: "Softfilter variants where dp is lower than 20x?" 
+ expression: "INFO:DP < 20" + soft_filter: "True" + soft_filter_flag: "DP_lt_20" + germline: + description: "Soft filter germline if >2% in any population from 1000 genomes, ESP or gnomADe" + expression: "(VEP:MAX_AF > 0.02)" + soft_filter_flag: "PopAF_0.02" + soft_filter: "True" + intron: + description: "Soft filter intronic variants except if also splice" + expression: "(exist[intron_variant, VEP:Consequence] and !exist[splice, VEP:Consequence])" + soft_filter_flag: "Intron" + soft_filter: "True" + protein_coding: + description: "Soft filter variants not deemed protein_coding" + expression: (VEP:BIOTYPE != protein_coding) + soft_filter_flag: "Biotype" + soft_filter: "True" \ No newline at end of file diff --git a/config/output_files.json b/config/output_files.json index 851a054..3796349 100644 --- a/config/output_files.json +++ b/config/output_files.json @@ -16,24 +16,24 @@ "Results/{sample}/Cram/{sample}_{type}.crumble.cram": {"name": "_results_cram", "file": "compression/crumble/{sample}_{type}.crumble.cram", "types": ["T", "N"]}, "Results/{sample}/Cram/{sample}_{type}.crumble.cram.crai": {"name": "_results_crai", "file": "compression/crumble/{sample}_{type}.crumble.cram.crai", "types": ["T", "N"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz": {"name": "_results_vcf_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi": {"name": "_results_tbi_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.all.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.all.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.aml.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.tm.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz": {"name": "_results_vcf_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.vcf.gz.tbi": {"name": "_results_tbi_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.all.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": 
"parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.all.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.aml.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.filter.somatic.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.tm.vcf.gz", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.tm.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.xlsx": {"name": "_results_xlsx_t", "file": "export_to_xlsx/t/{sample}_T.snvs.xlsx", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.all.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.all.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.aml.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.aml.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.tm.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.all.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.all.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": 
"_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.aml.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.aml.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.tm.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.tm.vcf.gz.tbi", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_TN.xlsx": {"name": "_results_xlsx_tn", "file": "export_to_xlsx/tn/{sample}.snvs.xlsx", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_T_mutectcaller_t.all.tsv", "types": ["T"]}, diff --git a/workflow/Snakefile b/workflow/Snakefile index 80df038..673e759 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -324,18 +324,18 @@ use rule filter_vcf from filtering as filtering_filter_vcf use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_t with: input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: @@ -344,18 +344,18 @@ use rule bcftools_filter_include_region from filtering as filtering_bcftools_fil use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_tn with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - 
"parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: diff --git a/workflow/rules/export.smk b/workflow/rules/export.smk index 6581282..48dcc7f 100644 --- a/workflow/rules/export.smk +++ b/workflow/rules/export.smk @@ -6,11 +6,11 @@ __license__ = "GPL-3" rule export_to_xlsx: input: - all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.all.vcf.gz", + all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.all.vcf.gz", all_bed=config["bcftools_SNV"]["all"], - aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.aml.vcf.gz", + aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.aml.vcf.gz", aml_bed=config["bcftools_SNV"]["aml"], - tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.tm.vcf.gz", + tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.tm.vcf.gz", tm_bed=config["bcftools_SNV"]["tm"], output: xlsx=temp("export_to_xlsx/{analysis}/{sample_type}.snvs.xlsx"), diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index 61fcaca..cc7ba82 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -6,7 +6,7 @@ __license__ = "GPL-3" rule mutectcaller_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.include.{bed}.vcf", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.{bed}.vcf", output: tsv=temp("tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv"), params: diff --git a/workflow/scripts/export_to_xlsx.py b/workflow/scripts/export_to_xlsx.py index b501184..a985129 100644 --- a/workflow/scripts/export_to_xlsx.py +++ b/workflow/scripts/export_to_xlsx.py @@ -13,8 +13,6 @@ def index_vep(variantfile): return csq_index -# No filterings as of now 231027 on any vcf. Should we add? -# are the files decomposed? 
""" Prepping input data """ bedfiles = {} bedfiles["all"] = snakemake.input.all_bed @@ -31,14 +29,15 @@ def index_vep(variantfile): vcf_tables[subsection] = [] vcf = VariantFile(vcfs[subsection]) csq_index = index_vep(vcf) + sample_tumor = [x for x in list(vcf.header.samples) if x.endswith("_T")][0] if len(list(vcf.header.samples)) > 1: - sample_normal = list(vcf.header.samples)[0] - sample_tumor = list(vcf.header.samples)[1] + sample_normal = [x for x in list(vcf.header.samples) if x.endswith("_N")][0] else: - sample_tumor = list(vcf.header.samples)[0] af_normal = "" + dp_normal = "" for record in vcf.fetch(): + filterflag = ",".join(record.filter.keys()) af = float(record.samples[sample_tumor].get("AF")[0]) if af > 0.01: csq = record.info["CSQ"][0].split("|") @@ -72,8 +71,10 @@ def index_vep(variantfile): if len(list(vcf.header.samples)) > 1: af_normal = float(record.samples[sample_normal].get("AF")[0]) + dp_normal = int(record.samples[sample_normal].get("DP")) outline = [ + filterflag, sample_tumor, gene, record.contig, @@ -83,6 +84,7 @@ def index_vep(variantfile): af, af_normal, int(record.info["DP"]), + int(dp_normal), transcript, exon, coding_name, @@ -138,13 +140,14 @@ def index_vep(variantfile): for sheet in subsections: vcf_data = vcf_tables[sheet] worksheet = workbook.add_worksheet(sheet.upper()) - worksheet.set_column("A:A", 12) - worksheet.set_column("D:D", 10) - worksheet.set_column("J:J", 15) + worksheet.set_column("B:B", 12) + worksheet.set_column("E:E", 10) + worksheet.set_column("L:L", 15) worksheet.write("A1", "Variants in " + sheet.upper() + " regions", heading_format) worksheet.write("A3", "Sample: " + str(sample_tumor)) tableheading = [ + "FilterFlag", "DNAnr", "Gene", "Chr", @@ -154,6 +157,7 @@ def index_vep(variantfile): "AF", "Normal AF", "DP", + "Normal DP", "Transcript", "Exon", "Mutation cds", @@ -168,7 +172,7 @@ def index_vep(variantfile): heading_list = [{"header": a} for a in tableheading] worksheet.add_table( - "A5:S" + str(len(vcf_data) + 6), {"data": vcf_data, "columns": heading_list, "style": "Table Style Light 1"} + "A5:U" + str(len(vcf_data) + 6), {"data": vcf_data, "columns": heading_list, "style": "Table Style Light 1"} ) # Add bedfile sheets for sheet in subsections: diff --git a/workflow/scripts/mutectcaller_to_tsv.py b/workflow/scripts/mutectcaller_to_tsv.py index d7129c8..bd5a3ae 100755 --- a/workflow/scripts/mutectcaller_to_tsv.py +++ b/workflow/scripts/mutectcaller_to_tsv.py @@ -36,6 +36,7 @@ "ALLELEFREQUENCY", "DEPTH", "GENE", + "FILTERFLAG", "TRANSCRIPT", "NUCLEOTIDESEQ", "PROTEIN", @@ -58,6 +59,7 @@ "DEPTH", "NORMAL_DP", "GENE", + "FILTERFLAG", "TRANSCRIPT", "NUCLEOTIDESEQ", "PROTEIN", @@ -90,6 +92,7 @@ tumor_sample = [x for x in row.samples if x.endswith("_T")][0] af = row.samples[tumor_sample]["AF"][0] dp = row.samples[tumor_sample]["DP"][0] + flag = ",".join(row.filter.keys()) outline = [ tumor_sample.split("_")[0], row.chrom, @@ -99,6 +102,7 @@ af, dp, gene, + flag, transcript_name, transcript_change, protein_name, @@ -121,6 +125,7 @@ dp, dp_normal, gene, + flag, transcript_name, transcript_change, protein_name, From 807b1db0fb6062eb3da26fa5413e424b2c413044 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 21 Aug 2024 13:22:22 +0200 Subject: [PATCH 05/11] chore: small typos and fixes --- config/config.yaml | 1 + config/config_snv_softfilter.yaml | 10 +++++----- workflow/scripts/export_to_xlsx.py | 2 +- workflow/scripts/mutectcaller_to_tsv.py | 6 +++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git 
a/config/config.yaml b/config/config.yaml index b0ec8fe..67fe9dc 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -118,6 +118,7 @@ fastqc: container: "docker://hydragenetics/fastqc:0.11.9" filter_vcf: + sample_name_regex: "^([A-Za-z0-9-]+_[RT])$" germline: "config/config_germline_filter.yaml" cnv_hardfilter: "config/config_cnv_hardfilter.yaml" somatic: "config/config_snv_softfilter.yaml" diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index 23747a2..a9d6a6c 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -1,17 +1,17 @@ filters: af: description: "Softfilter variants where T have AF lower than 5 percent" - expression: "INFO:AF < 0.05" + expression: "FORMAT:AF:0 < 0.05" soft_filter: "True" soft_filter_flag: "AF_lt_0.05" ad: description: "Softfilter variants where T have AD lower than 3?" - expression: "INFO:AD:1 < 3" + expression: "FORMAT:AD:1 < 3" soft_filter: "True" soft_filter_flag: "AD_lt_3" dp: description: "Softfilter variants where dp is lower than 20x?" - expression: "INFO:DP < 20" + expression: "FORMAT:DP < 20" soft_filter: "True" soft_filter_flag: "DP_lt_20" germline: @@ -26,6 +26,6 @@ filters: soft_filter: "True" protein_coding: description: "Soft filter variants not deemed protein_coding" - expression: (VEP:BIOTYPE != protein_coding) + expression: "(VEP:BIOTYPE != protein_coding)" soft_filter_flag: "Biotype" - soft_filter: "True" \ No newline at end of file + soft_filter: "True" diff --git a/workflow/scripts/export_to_xlsx.py b/workflow/scripts/export_to_xlsx.py index a985129..5eaba76 100644 --- a/workflow/scripts/export_to_xlsx.py +++ b/workflow/scripts/export_to_xlsx.py @@ -84,7 +84,7 @@ def index_vep(variantfile): af, af_normal, int(record.info["DP"]), - int(dp_normal), + dp_normal, transcript, exon, coding_name, diff --git a/workflow/scripts/mutectcaller_to_tsv.py b/workflow/scripts/mutectcaller_to_tsv.py index bd5a3ae..7d51a24 100755 --- a/workflow/scripts/mutectcaller_to_tsv.py +++ b/workflow/scripts/mutectcaller_to_tsv.py @@ -91,7 +91,7 @@ tumor_sample = [x for x in row.samples if x.endswith("_T")][0] af = row.samples[tumor_sample]["AF"][0] - dp = row.samples[tumor_sample]["DP"][0] + dp = row.samples[tumor_sample]["DP"] flag = ",".join(row.filter.keys()) outline = [ tumor_sample.split("_")[0], @@ -113,7 +113,7 @@ if len(list(input_file.header.samples)) == 2: normal_sample = [x for x in row.samples if x.endswith("_N")][0] af_normal = row.samples[normal_sample]["AF"][0] - dp_normal = row.samples[normal_sample]["DP"][0] + dp_normal = row.samples[normal_sample]["DP"] outline = [ tumor_sample.split("_")[0], row.chrom, @@ -135,7 +135,7 @@ outlines.append(outline) -logging.info("Writing results to " + snakmake.output.tsv) +logging.info("Writing results to " + output_file) with open(output_file, "wt") as tsv: tsv_writer = csv.writer(tsv, delimiter="\t") tsv_writer.writerows(outlines) From e10ec276ab3dd9b7c0c34d1e1205344d3eb95b7d Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Thu, 5 Sep 2024 15:03:33 +0200 Subject: [PATCH 06/11] refactor: change snv dp filter to 10x --- config/config_snv_softfilter.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index a9d6a6c..a138877 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -10,10 +10,10 @@ filters: soft_filter: "True" soft_filter_flag: "AD_lt_3" dp: - description: "Softfilter variants where dp is lower than 
20x?" - expression: "FORMAT:DP < 20" + description: "Softfilter variants where dp is lower than 10x" + expression: "FORMAT:DP < 10" soft_filter: "True" - soft_filter_flag: "DP_lt_20" + soft_filter_flag: "DP_lt_10" germline: description: "Soft filter germline if >2% in any population from 1000 genomes, ESP or gnomADe" expression: "(VEP:MAX_AF > 0.02)" From 6ad931291c449465b5100e456a3541582ca66339 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Fri, 6 Sep 2024 11:53:10 +0200 Subject: [PATCH 07/11] refactor: add consequnce filter to snvs --- config/config_snv_softfilter.yaml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index a138877..1949147 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -29,3 +29,18 @@ filters: expression: "(VEP:BIOTYPE != protein_coding)" soft_filter_flag: "Biotype" soft_filter: "True" + consequences: + description: "Soft filter based on consequences" + expression: + "(exist[intergenic_variant, VEP:Consequence] or exist[NMD_transcript_variant, VEP:Consequence] or + exist[non_coding_transcript_variant, VEP:Consequence] or + exist[upstream_gene_variant, VEP:Consequence] or + exist[downstream_gene_variant, VEP:Consequence] or + exist[TFBS_ablation, VEP:Consequence] or + exist[TFBS_amplification, VEP:Consequence] or + exist[TF_binding_site_variant, VEP:Consequence] or + exist[regulatory_region_ablation, VEP:Consequence] or + exist[regulatory_region_amplification, VEP:Consequence] or + exist[regulatory_region_variant, VEP:Consequence])" + soft_filter_flag: "Consequence" + soft_filter: "True" From 82f50c5be13bd8a437435b2a685df4fac817b6b5 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 11 Sep 2024 11:50:42 +0200 Subject: [PATCH 08/11] refactor: merge vep rules for t and tn into one rule --- workflow/Snakefile | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/workflow/Snakefile b/workflow/Snakefile index 673e759..d664179 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -96,42 +96,23 @@ use rule * from annotation as annotation_* use rule vep from annotation as annotation_vep with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vcf.gz.tbi", cache=config["vep"]["vep_cache"], fasta=config["reference"]["fasta"], output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.vcf"), log: - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.log", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.vcf.benchmark.tsv", config.get("vep", {}).get("benchmark_repeats", 1), ) message: "{rule}: Annotate {input.vcf} with vep" -use rule vep from annotation as annotation_vep_t with: - input: - vcf="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vcf.gz.tbi", - cache=config["vep"]["vep_cache"], - fasta=config["reference"]["fasta"], - output: - 
vcf=temp("parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf"), - log: - "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.log", - benchmark: - repeat( - "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.vcf.benchmark.tsv", - config.get("vep", {}).get("benchmark_repeats", 1), - ) - message: - "{rule}: Annotate {input.vcf} with vep." - - use rule simple_sv_annotation from annotation as annotation_simple_sv_annotation_tn with: input: vcf="cnv_sv/manta_run_workflow_tn/{sample}/results/variants/somaticSV.snpeff.vcf.gz", @@ -147,7 +128,7 @@ use rule simple_sv_annotation from annotation as annotation_simple_sv_annotation config.get("snpeff", {}).get("benchmark_repeats", 1), ) message: - "{rule}: Annotate cnv_sv/manta_run_workflow_tn/{wildcards.sample}/results/variants/somaticSV.snpeff.vcf.gz" + "{rule}: Annotate {input.vcf}" use rule simple_sv_annotation from annotation as annotation_simple_sv_annotation_t with: From a92c8fb0abaf0ef0039f02605ee5604e506a2027 Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 11 Sep 2024 11:54:24 +0200 Subject: [PATCH 09/11] feat: add n_ratio annotation to tn vcfs --- config/output_files.json | 18 ++++---- docs/softwares.md | 21 ++++++++++ workflow/Snakefile | 11 ++--- workflow/rules/annotate.smk | 31 ++++++++++++++ workflow/rules/export.smk | 20 +++++++-- workflow/rules/mutectcaller_to_tsv.smk | 9 +++- workflow/schemas/config.schema.yaml | 14 +++++++ workflow/schemas/resources.schema.yaml | 20 +++++++++ workflow/schemas/rules.schema.yaml | 22 +++++++++- workflow/scripts/annotate_normal_ratio.py | 51 +++++++++++++++++++++++ workflow/scripts/export_to_xlsx.py | 6 +-- 11 files changed, 200 insertions(+), 23 deletions(-) create mode 100644 workflow/rules/annotate.smk create mode 100644 workflow/scripts/annotate_normal_ratio.py diff --git a/config/output_files.json b/config/output_files.json index 3796349..27228a6 100644 --- a/config/output_files.json +++ b/config/output_files.json @@ -21,19 +21,19 @@ "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz": {"name": "_results_vcf_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.all.vcf.gz", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.all.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz": {"name": "_results_vcf_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.aml.vcf.gz", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.filter.somatic.vep.include.aml.vcf.gz.tbi", "types": ["T"]}, + "Results/{sample}/SNV_indels/{sample}_T.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.aml.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz": {"name": "_results_vcf_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.tm.vcf.gz", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm_t", "file": "parabricks/pbrun_mutectcaller_t/{sample}_T.normalized.vep.filter.somatic.include.tm.vcf.gz.tbi", "types": ["T"]}, "Results/{sample}/SNV_indels/{sample}_T.xlsx": {"name": 
"_results_xlsx_t", "file": "export_to_xlsx/t/{sample}_T.snvs.xlsx", "types": ["T"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.all.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.all.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.aml.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.aml.vcf.gz.tbi", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.tm.vcf.gz", "types": ["TN"]}, - "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.tm.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz": {"name": "_results_vcf_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.vcf.gz.tbi": {"name": "_results_tbi_tn", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz": {"name": "_results_vcf_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.all.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.all.vcf.gz.tbi": {"name": "_results_tbi_all", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.all.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz": {"name": "_results_vcf_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.aml.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.aml.vcf.gz.tbi": {"name": "_results_tbi_aml", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.aml.vcf.gz.tbi", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz": {"name": "_results_vcf_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.tm.vcf.gz", "types": ["TN"]}, + "Results/{sample}/SNV_indels/{sample}_TN.vep.tm.vcf.gz.tbi": {"name": "_results_tbi_tm", "file": "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.tm.vcf.gz.tbi", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_TN.xlsx": {"name": "_results_xlsx_tn", "file": 
"export_to_xlsx/tn/{sample}.snvs.xlsx", "types": ["TN"]}, "Results/{sample}/SNV_indels/{sample}_mutectcaller_T.all.tsv": {"name": "_results_tsv_all_t", "file": "tsv_files/{sample}_T_mutectcaller_t.all.tsv", "types": ["T"]}, diff --git a/docs/softwares.md b/docs/softwares.md index 9dc8a30..371ed95 100644 --- a/docs/softwares.md +++ b/docs/softwares.md @@ -19,3 +19,24 @@ Introduction to export_to_xlsx #### Resources settings (`resources.yaml`) #RESOURCESSCHEMA__export_to_xlsx# + +## [annotate_normal_ratio](url_to_tool) +Introduction to annotate_normal_ratio + +### :snake: Rule + +#SNAKEMAKE_RULE_SOURCE__annotate__annotate_normal_ratio# + +#### :left_right_arrow: input / output files + +#SNAKEMAKE_RULE_TABLE__annotate__annotate_normal_ratio# + +### :wrench: Configuration + +#### Software settings (`config.yaml`) + +#CONFIGSCHEMA__annotate_normal_ratio# + +#### Resources settings (`resources.yaml`) + +#RESOURCESSCHEMA__annotate_normal_ratio# diff --git a/workflow/Snakefile b/workflow/Snakefile index d664179..9dbf58e 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -5,6 +5,7 @@ __license__ = "GPL-3" include: "rules/common.smk" +include: "rules/annotate.smk" include: "rules/cnvkit.smk" include: "rules/cnvkit_table.smk" include: "rules/dux4_rearrangements.smk" @@ -325,18 +326,18 @@ use rule bcftools_filter_include_region from filtering as filtering_bcftools_fil use rule bcftools_filter_include_region from filtering as filtering_bcftools_filter_include_region_snv_tn with: input: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz", - tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.vcf.gz.tbi", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.vcf.gz", + tabix="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.vcf.gz.tbi", output: - vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf"), + vcf=temp("parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf"), params: filter=lambda wildcards: "-R %s" % config["bcftools_SNV"][wildcards.bed], extra=config.get("bcftools_SNV", {}).get("extra", ""), log: - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf.log", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf.log", benchmark: repeat( - "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.filter.somatic.include.{bed}.vcf.benchmark.tsv", + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf.benchmark.tsv", config.get("bcftools_SNV", {}).get("benchmark_repeats", 1), ) message: diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk new file mode 100644 index 0000000..f3110c0 --- /dev/null +++ b/workflow/rules/annotate.smk @@ -0,0 +1,31 @@ +__author__ = "Arielle R. Munters" +__copyright__ = "Copyright 2024, Arielle R. 
Munters" +__email__ = "arielle.munters@scilifelab.uu.se" +__license__ = "GPL-3" + + +rule annotate_normal_ratio: + input: + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf", + output: + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz", + log: + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz.log", + benchmark: + repeat( + "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz.benchmark.tsv", + config.get("annotate_normal_ratio", {}).get("benchmark_repeats", 1) + ) + threads: config.get("annotate_normal_ratio", {}).get("threads", config["default_resources"]["threads"]) + resources: + mem_mb=config.get("annotate_normal_ratio", {}).get("mem_mb", config["default_resources"]["mem_mb"]), + mem_per_cpu=config.get("annotate_normal_ratio", {}).get("mem_per_cpu", config["default_resources"]["mem_per_cpu"]), + partition=config.get("annotate_normal_ratio", {}).get("partition", config["default_resources"]["partition"]), + threads=config.get("annotate_normal_ratio", {}).get("threads", config["default_resources"]["threads"]), + time=config.get("annotate_normal_ratio", {}).get("time", config["default_resources"]["time"]), + container: + config.get("annotate_normal_ratio", {}).get("container", config["default_container"]) + message: + "{rule}: Add normal ratio annotation to {input.vcf}" + script: + "../scripts/annotate_normal_ratio.py" diff --git a/workflow/rules/export.smk b/workflow/rules/export.smk index 48dcc7f..a69e40d 100644 --- a/workflow/rules/export.smk +++ b/workflow/rules/export.smk @@ -4,13 +4,25 @@ __email__ = "arielle.munters@scilifelab.uu.se" __license__ = "GPL-3" +def check_if_tn(wildcards): + if wildcards.analysis == "tn": + vcfs = expand( + "parabricks/pbrun_mutectcaller_{{analysis}}/{{sample_type}}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf.gz", + bed=["all", "aml", "tm"], + ) + else: + vcfs = expand( + "parabricks/pbrun_mutectcaller_{{analysis}}/{{sample_type}}.normalized.vep.filter.somatic.include.{bed}.vcf.gz", + bed=["all", "aml", "tm"], + ) + return vcfs + + rule export_to_xlsx: input: - all="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.all.vcf.gz", + vcfs=check_if_tn, all_bed=config["bcftools_SNV"]["all"], - aml="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.aml.vcf.gz", aml_bed=config["bcftools_SNV"]["aml"], - tm="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.tm.vcf.gz", tm_bed=config["bcftools_SNV"]["tm"], output: xlsx=temp("export_to_xlsx/{analysis}/{sample_type}.snvs.xlsx"), @@ -33,6 +45,6 @@ rule export_to_xlsx: container: config.get("export_to_xlsx", {}).get("container", config["default_container"]) message: - "{rule}: merge {input.all}, {input.aml}, {input.tm} into {output.xlsx}" + "{rule}: merge {input.vcfs} into {output.xlsx}" script: "../scripts/export_to_xlsx.py" diff --git a/workflow/rules/mutectcaller_to_tsv.smk b/workflow/rules/mutectcaller_to_tsv.smk index cc7ba82..6a31e1d 100644 --- a/workflow/rules/mutectcaller_to_tsv.smk +++ b/workflow/rules/mutectcaller_to_tsv.smk @@ -3,10 +3,17 @@ __copyright__ = "Copyright 2021, Martin Rippin" __email__ = "martin.rippin@scilifelab.uu.se" __license__ = "GPL-3" +def check_if_tn(wildcards): + if wildcards.analysis == "tn": + vcf = "parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.ratio.filter.somatic.include.{bed}.vcf" + else: + vcf = 
"parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.{bed}.vcf" + return vcf + rule mutectcaller_to_tsv: input: - vcf="parabricks/pbrun_mutectcaller_{analysis}/{sample_type}.normalized.vep.filter.somatic.include.{bed}.vcf", + vcf=check_if_tn, output: tsv=temp("tsv_files/{sample_type}_mutectcaller_{analysis}.{bed}.tsv"), params: diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index d827352..a86cb14 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -133,6 +133,20 @@ properties: type: string description: path to CNA gene bed file + annotate_normal_ratio: + type: object + description: parameters for annotate_normal_ratio + properties: + benchmark_repeats: + type: integer + description: set number of times benchmark should be repeated + container: + type: string + description: name or path to docker/singularity container + extra: + type: string + description: parameters that should be forwarded + arriba: type: object properties: diff --git a/workflow/schemas/resources.schema.yaml b/workflow/schemas/resources.schema.yaml index d162ea6..6f67bb1 100644 --- a/workflow/schemas/resources.schema.yaml +++ b/workflow/schemas/resources.schema.yaml @@ -48,6 +48,26 @@ properties: type: string description: max execution time + annotate_normal_ratio: + type: object + description: resource definitions for annotate_normal_ratio + properties: + mem_mb: + type: integer + description: max memory in MB to be available + mem_per_cpu: + type: integer + description: memory in MB used per cpu + partition: + type: string + description: partition to use on cluster + threads: + type: integer + description: number of threads to be available + time: + type: string + description: max execution time + arriba: type: object description: resource definitions for arriba diff --git a/workflow/schemas/rules.schema.yaml b/workflow/schemas/rules.schema.yaml index 4a93b8b..8e70e7f 100644 --- a/workflow/schemas/rules.schema.yaml +++ b/workflow/schemas/rules.schema.yaml @@ -3,6 +3,26 @@ description: snakemake rule input and output files description file type: object properties: + + annotate_normal_ratio: + type: object + description: input and output parameters for tumour normal vcf with annotate_normal_ratio + properties: + input: + type: object + description: list of inputs + properties: + vcf: + type: string + description: vcf file with both normal and tumor sample af used to annotate INFO column with n_frq/t_frq + output: + type: object + description: list of outputs + properties: + vcf: + type: string + description: vcf file with N_ratio annotation in INFO field + export_to_xlsx: type: object description: input and output parameters for export_to_xlsx @@ -35,4 +55,4 @@ properties: properties: xlsx: type: string - description: name of xlsx file for summerizing snv results based on different genelists \ No newline at end of file + description: name of xlsx file for summarizing snv results based on different genelists diff --git a/workflow/scripts/annotate_normal_ratio.py b/workflow/scripts/annotate_normal_ratio.py new file mode 100644 index 0000000..e669325 --- /dev/null +++ b/workflow/scripts/annotate_normal_ratio.py @@ -0,0 +1,51 @@ +#!/bin/python3 + +from pysam import VariantFile +import logging + +logging.basicConfig( + format="{asctime} - {levelname} - {message}", + style="{", + datefmt="%Y-%m-%d %H:%M", + level=logging.INFO, +) + +logging.info("Starting to process " + snakemake.input.vcf) + +vcf_in = 
VariantFile(snakemake.input.vcf) + +new_header = vcf_in.header +new_header.info.add("N_ratio", "A", "Float", "Number of times larger normal AF is than tumour sample.") +logging.debug("Creating header for " + snakemake.output.vcf) +vcf_out = VariantFile(snakemake.output.vcf, "wb", header=new_header) + +sample_tumor = [x for x in list(vcf_in.header.samples) if x.endswith("_T")][0] +sample_normal = [x for x in list(vcf_in.header.samples) if x.endswith("_N")][0] +logging.debug(f"{sample_tumor=}, {sample_normal=}") + +logging.info("Starting to iterate through input.") +for record in vcf_in.fetch(): + try: + n_freq = float(record.samples[sample_normal].get("AF")[0]) + except TypeError: + n_freq = 0 + + try: + t_freq = float(record.samples[sample_tumor].get("AF")[0]) + except TypeError: + t_freq = 0 + + if n_freq == 0: + logging.warning("Normal freq 0 or missing, setting ratio to 0") + ratio = 0 + elif t_freq == 0: + logging.warning("Tumor freq 0 or missing, setting ratio to 100") + ratio = 100 + else: + ratio = float(record.samples[sample_normal].get("AF")[0]) / float(record.samples[sample_tumor].get("AF")[0]) + + logging.debug(f"{record.samples[sample_normal].get('AF')[0]=}, {record.samples[sample_tumor].get('AF')[0]=}") + logging.debug(f"{ratio=}") + record.info["N_ratio"] = ratio + + vcf_out.write(record) diff --git a/workflow/scripts/export_to_xlsx.py b/workflow/scripts/export_to_xlsx.py index 5eaba76..2fc5d7c 100644 --- a/workflow/scripts/export_to_xlsx.py +++ b/workflow/scripts/export_to_xlsx.py @@ -20,9 +20,9 @@ def index_vep(variantfile): bedfiles["tm"] = snakemake.input.tm_bed vcfs = {} -vcfs["all"] = snakemake.input.all -vcfs["aml"] = snakemake.input.aml -vcfs["tm"] = snakemake.input.tm +vcfs["all"] = [x for x in snakemake.input.vcfs if "include.all" in x][0] +vcfs["aml"] = [x for x in snakemake.input.vcfs if "include.aml" in x][0] +vcfs["tm"] = [x for x in snakemake.input.vcfs if "include.tm" in x][0] subsections = ["all", "aml", "tm"] vcf_tables = {} for subsection in subsections: From ada2826d6aa45790e82f6c975f23ad33cc1e3f3e Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 11 Sep 2024 11:55:06 +0200 Subject: [PATCH 10/11] feat: filter out variants with n_ratio lgt 5 --- config/config_snv_softfilter.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index 1949147..7b4f942 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -44,3 +44,8 @@ filters: exist[regulatory_region_variant, VEP:Consequence])" soft_filter_flag: "Consequence" soft_filter: "True" + n_ratio: + description: "Soft filter variants where normal freq is atleast 5 times bigger than tumour freq" + expression: "INFO:NA_FALSE:N_ratio >= 5" + soft_filter_flag: "N_ratio_lgt_5" + soft_filter: "True" From 71c8871d2d11f388f13f9381e2fce6aea76c3c9c Mon Sep 17 00:00:00 2001 From: Arielle R Munters Date: Wed, 11 Sep 2024 15:32:32 +0200 Subject: [PATCH 11/11] fix: update annotate_ratio output format, filter ratio to >5, set N_ratio to one value in annotation --- config/config_snv_softfilter.yaml | 2 +- workflow/rules/annotate.smk | 2 +- workflow/scripts/annotate_normal_ratio.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/config/config_snv_softfilter.yaml b/config/config_snv_softfilter.yaml index 7b4f942..2655f53 100644 --- a/config/config_snv_softfilter.yaml +++ b/config/config_snv_softfilter.yaml @@ -46,6 +46,6 @@ filters: soft_filter: "True" n_ratio: description: "Soft filter 
variants where normal freq is atleast 5 times bigger than tumour freq" - expression: "INFO:NA_FALSE:N_ratio >= 5" + expression: "INFO:NA_FALSE:N_ratio > 5" soft_filter_flag: "N_ratio_lgt_5" soft_filter: "True" diff --git a/workflow/rules/annotate.smk b/workflow/rules/annotate.smk index f3110c0..79f9751 100644 --- a/workflow/rules/annotate.smk +++ b/workflow/rules/annotate.smk @@ -8,7 +8,7 @@ rule annotate_normal_ratio: input: vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.vcf", output: - vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz", + vcf="parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf", log: "parabricks/pbrun_mutectcaller_tn/{sample}.normalized.vep.ratio.vcf.gz.log", benchmark: diff --git a/workflow/scripts/annotate_normal_ratio.py b/workflow/scripts/annotate_normal_ratio.py index e669325..594b48c 100644 --- a/workflow/scripts/annotate_normal_ratio.py +++ b/workflow/scripts/annotate_normal_ratio.py @@ -15,9 +15,9 @@ vcf_in = VariantFile(snakemake.input.vcf) new_header = vcf_in.header -new_header.info.add("N_ratio", "A", "Float", "Number of times larger normal AF is than tumour sample.") +new_header.info.add("N_ratio", "1", "Float", "Number of times larger normal AF is than tumour sample.") logging.debug("Creating header for " + snakemake.output.vcf) -vcf_out = VariantFile(snakemake.output.vcf, "wb", header=new_header) +vcf_out = VariantFile(snakemake.output.vcf, "w", header=new_header) sample_tumor = [x for x in list(vcf_in.header.samples) if x.endswith("_T")][0] sample_normal = [x for x in list(vcf_in.header.samples) if x.endswith("_N")][0]
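
For reference, a minimal stand-alone sketch of the N_ratio logic these patches introduce: the normal-sample AF divided by the tumor-sample AF, written as a single Float INFO field and later soft-filtered by config_snv_softfilter.yaml when it exceeds 5. This is only an illustration, not the pipeline code itself; the file names below are hypothetical placeholders, whereas the real rule drives workflow/scripts/annotate_normal_ratio.py through snakemake.input/snakemake.output.

    #!/bin/python3
    # Sketch of the N_ratio annotation (assumed example paths, not pipeline outputs).
    from pysam import VariantFile

    vcf_in = VariantFile("example.normalized.vep.vcf")  # hypothetical input path
    header = vcf_in.header
    header.info.add("N_ratio", "1", "Float", "How many times larger the normal AF is than the tumor AF.")
    vcf_out = VariantFile("example.normalized.vep.ratio.vcf", "w", header=header)  # hypothetical output path

    # Sample columns follow the {sample}_T / {sample}_N naming used by the pipeline.
    sample_tumor = next(s for s in vcf_in.header.samples if s.endswith("_T"))
    sample_normal = next(s for s in vcf_in.header.samples if s.endswith("_N"))

    for record in vcf_in:
        af_n = record.samples[sample_normal].get("AF")
        af_t = record.samples[sample_tumor].get("AF")
        n_freq = float(af_n[0]) if af_n and af_n[0] is not None else 0.0
        t_freq = float(af_t[0]) if af_t and af_t[0] is not None else 0.0
        if n_freq == 0:
            ratio = 0.0    # no support in the normal: variant stays unflagged
        elif t_freq == 0:
            ratio = 100.0  # normal-only signal: caught by the N_ratio > 5 soft filter
        else:
            ratio = n_freq / t_freq
        record.info["N_ratio"] = ratio
        vcf_out.write(record)

    vcf_out.close()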