snakemake-workflows · johanneskoester · May 17, 2024 · Mar 26, 2024 · Mar 28, 2024 · Apr 12, 2024
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,6 @@ resources/**
 .snakemake/**
 .test/**
 logs
-logs/**
+logs/**
+data/**
+report/**
diff --git a/.test/config/config.yaml b/.test/config/config.yaml
@@ -13,6 +13,9 @@ variant-calls:
     path: "resources/variants/seqc2-somatic/all.truth.format-added.vcf.gz"
     benchmark: "seqc2-somatic-ea"
     tumor_sample_name: "truth"
+    vaf-field: 
+      - INFO
+      - TVAF
 
 custom-benchmarks:
   my-custom-benchmark: # custom benchmark name, choose freely (no whitespace allowed)

diff --git a/config/config.yaml b/config/config.yaml
@@ -17,6 +17,9 @@ variant-calls:
     # Uncomment and point to file containing contig name replacements
     # as needed for 'bcftools annotate --rename-chrs'
     # rename-contigs: path/to/rename-contigs.txt
+    vaf-field:  # [FORMAT or INFO, name of the VAF field]; verify both against the header printed by 'bcftools view -h'
+      - FORMAT
+      - AF
     # Uncomment if callset was produced using grch37 as reference, then a liftover to grch38 will be performed
     # grch37: true
 

diff --git a/workflow/resources/datavzrd/precision-recall-config.yte.yaml b/workflow/resources/datavzrd/precision-recall-config.yte.yaml
@@ -22,9 +22,9 @@ views:
   results-plot:
     dataset: results
     desc: |
-      Precision and recall are calculated by matching variants between each callset 
+      Precision and recall are calculated by matching variants between each callset
       and truth, stratified by coverage categories. The matching ignores genotype
-      differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate" 
+      differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate"
       column of the tabular result representation or in the tooltips shown when hovering
       a point.
 
@@ -69,47 +69,113 @@ views:
 
   results-table:
     dataset: results
-    desc: |
-      Precision and recall are calculated by matching variants between each callset 
-      and truth, stratified by coverage categories. The matching ignores genotype
-      differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate" 
-      column.
-    page-size: 12
-    render-table:
-      columns:
-        callset:
-          plot:
-            heatmap:
-              scale: ordinal
-              color-scheme: category20
-        precision:
-          precision: 3
-          plot:
-            ticks:
-              scale: linear
-        recall:
-          precision: 3
-          plot:
-            ticks:
-              scale: linear
-        genotype_mismatch_rate:
-          ?if params.somatic:
-            display-mode: hidden
-          ?else:
-            display-mode: normal
-          precision: 3
-          plot:
-            ticks:
-              scale: linear
-        coverage:
-          plot:
-            heatmap:
-              scale: ordinal
-              domain:
-                - low
-                - medium
-                - high
-              range:
-                - "#c6dbef"
-                - "#9ecae1"
-                - "#6baed6"
+    ?if params.vaf:
+      desc: |
+        Precision and recall are calculated by matching variants between each callset
+        and truth, stratified by coverage categories and by variant allele frequency (VAF).
+      page-size: 12
+      render-table:
+        columns:
+          callset:
+            plot:
+              heatmap:
+                scale: ordinal
+                color-scheme: category20
+          precision:
+            precision: 3
+            plot:
+              ticks:
+                scale: linear
+          recall:
+            precision: 3
+            plot:
+              ticks:
+                scale: linear
+          coverage:
+            plot:
+              heatmap:
+                scale: ordinal
+                domain:
+                  - low
+                  - medium
+                  - high
+                range:
+                  - "#c6dbef"
+                  - "#9ecae1"
+                  - "#6baed6"
+          vaf:
+            plot:
+              heatmap:
+                scale: linear
+                custom-content: |  # JS callback: renders each value as "lower..upper" with lower = value - 0.1
+                  function(value, row) {
+                    let lower = value - 0.1;
+                    return `${lower.toFixed(1)}..${parseFloat(value).toFixed(1)}`
+                  }
+                domain:
+                  - 0.1
+                  - 0.2
+                  - 0.3
+                  - 0.4
+                  - 0.5
+                  - 0.6
+                  - 0.7
+                  - 0.8
+                  - 0.9
+                  - 1.0
+                range:
+                  - "#c0e6baff"
+                  - "#abdda5ff"
+                  - "#94d391ff"
+                  - "#7bc77dff"
+                  - "#60ba6cff"
+                  - "#46ab5eff"
+                  - "#329a51ff"
+                  - "#208943ff"
+                  - "#0e7735ff"
+                  - "#036429ff"  # darkest shade last, so the ramp stays monotonic over the domain
+    ?else:
+      desc: |
+        Precision and recall are calculated by matching variants between each callset
+        and truth, stratified by coverage categories. The matching ignores genotype
+        differences. Instead, genotype mismatches are displayed in the "genotype mismatch rate"
+        column.
+      page-size: 12
+      render-table:
+        columns:
+          callset:
+            plot:
+              heatmap:
+                scale: ordinal
+                color-scheme: category20
+          precision:
+            precision: 3
+            plot:
+              ticks:
+                scale: linear
+          recall:
+            precision: 3
+            plot:
+              ticks:
+                scale: linear
+          genotype_mismatch_rate:
+            ?if params.somatic:
+              display-mode: hidden
+            ?else:
+              display-mode: normal
+            precision: 3
+            plot:
+              ticks:
+                scale: linear
+          coverage:
+            plot:
+              heatmap:
+                scale: ordinal
+                domain:
+                  - low
+                  - medium
+                  - high
+                range:
+                  - "#c6dbef"
+                  - "#9ecae1"
+                  - "#6baed6"
diff --git a/workflow/resources/presets.yaml b/workflow/resources/presets.yaml
@@ -15,6 +15,9 @@ benchmarks:
     bam-url: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WES/WES_EA_T_1.bwa.dedup.bam
     target-regions: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/technical/reference_genome/Exome_Target_bed/S07604624_Covered_human_all_v6_plus_UTR.liftover.to.hg38.bed6.gz
     grch37: false
+    vaf-field: 
+      - INFO # either FORMAT or INFO
+      - TVAF # name of tumor variant allele frequency
 
   imgag-somatic-5perc:
     genome: na12878-somatic

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -83,7 +83,7 @@ def get_plot_cov_labels():
     def label(name):
         lower, upper = get_cov_interval(name)
         if upper:
-            return f"{lower}-{upper - 1}"
+            return f"{lower}-{upper-1}"
         return f"≥{lower}"
 
     return {name: label(name) for name in coverages}
@@ -178,7 +178,7 @@ def get_callset(wildcards):
     if get_somatic_status(wildcards):
         return "results/normalized-variants/{callset}.gt-added.vcf.gz"
     elif "rename-contigs" in callset:
-        return "results/normalized-variants/{callset}.replaced-contigs.bcf"
+        return "results/normalized-variants/{callset}.replaced-contigs.vcf.gz"
     elif "grch37" in callset:
         return "results/normalized-variants/{callset}.lifted.vcf.gz"
     else:
@@ -188,7 +188,7 @@ def get_callset(wildcards):
 def get_callset_correct_contigs(wildcards):
     callset = config["variant-calls"][wildcards.callset]
     if "rename-contigs" in callset:
-        return "results/normalized-variants/{callset}.replaced-contigs.bcf"
+        return "results/normalized-variants/{callset}.replaced-contigs.vcf.gz"
     elif "grch37" in callset:
         return "results/normalized-variants/{callset}.lifted.vcf.gz"
     else:
@@ -199,6 +199,8 @@ def get_callset_correct_contigs_liftover(wildcards):
     callset = config["variant-calls"][wildcards.callset]
     if "grch37" in callset:
         return "results/normalized-variants/{callset}.lifted.vcf.gz"
+    elif "rename-contigs" in callset:
+        return "results/normalized-variants/{callset}.replaced-contigs.vcf.gz"
     else:
         return get_raw_callset(wildcards)
 
@@ -390,6 +392,31 @@ def get_somatic_flag(wildcards):
     return somatic_flag
 
 
+def get_vaf_fields(wildcards):
+    """Return the "vaf-field" settings as a (callset, benchmark) tuple.
+
+    Either element is None if the corresponding entry does not set "vaf-field".
+    """
+    callset_cfg = config["variant-calls"][wildcards.callset]
+    benchmark_vaf = benchmarks[callset_cfg["benchmark"]].get("vaf-field")
+    return (callset_cfg.get("vaf-field"), benchmark_vaf)
+
+
+def get_vaf_status(wildcards):
+    """Tell whether VAF information is configured for wildcards.benchmark.
+
+    True only if the benchmark sets "vaf-field" and at least one of its
+    callsets sets "vaf-field" as well; False otherwise.
+    """
+    if benchmarks[wildcards.benchmark].get("vaf-field") is None:
+        return False
+    callsets = get_benchmark_callsets(wildcards.benchmark)
+    return any(
+        config["variant-calls"][callset].get("vaf-field") is not None
+        for callset in callsets
+    )
+
+
 def get_collect_stratifications_input(wildcards):
     import json
 

diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk
@@ -1,25 +1,40 @@
+rule get_reference_dict:
+    input:
+        reference="resources/reference/genome.fasta",
+    output:
+        "resources/reference/genome.fasta.dict",  # listed as reference_dict input by liftover_callset
+    log:
+        "logs/get-reference-dict.log",
+    conda:
+        "../envs/picard.yaml"
+    shell:
+        "picard CreateSequenceDictionary -R {input.reference} -O {output} &> {log}"
+
+
 rule liftover_callset:
     input:
         callset=get_callset_correct_contigs,
         liftover_chain="resources/liftover/GRCh37_to_GRCh38.chain.gz",
         reference="resources/reference/genome.fasta",
+        reference_dict="resources/reference/genome.fasta.dict",
     output:
         "results/normalized-variants/{callset}.lifted.vcf.gz",
     log:
         "logs/liftover_callset/{callset}.log",
     conda:
         "../envs/picard.yaml"
+    resources:
+        mem_mb=64000,  # the shell command derives -Xmx from this value, so the heap never exceeds the request
     shell:
-        "picard CreateSequenceDictionary  -R {input.reference} -O {input.reference}.dict"
-        "picard LiftoverVcf  -I {input.callset}  -O {output} --CHAIN {input.liftover_chain} --REJECT {output}_rejected_variants.vcf -R {input.reference} &> {log}"
+        "picard LiftoverVcf -Xmx{resources.mem_mb}m --MAX_RECORDS_IN_RAM 100000 -I {input.callset} -O {output} --CHAIN {input.liftover_chain} --REJECT {output}_rejected_variants.vcf -R {input.reference} &> {log}"
 
 
 rule rename_contigs:
     input:
         calls=get_raw_callset,
         repl_file=get_rename_contig_file,
     output:
-        "results/normalized-variants/{callset}.replaced-contigs.bcf",
+        "results/normalized-variants/{callset}.replaced-contigs.vcf.gz",
     log:
         "logs/rename-contigs/{callset}.log",
     conda:
@@ -195,12 +210,16 @@ rule calc_precision_recall:
         calls="results/vcfeval/{callset}/{cov}/output.vcf.gz",
         idx="results/vcfeval/{callset}/{cov}/output.vcf.gz.tbi",
         common_src=common_src,
+        truth=get_stratified_truth(),
+        truth_idx=get_stratified_truth(".tbi"),
+        query="results/stratified-variants/{callset}/{cov}.vcf.gz",
+        query_index="results/stratified-variants/{callset}/{cov}.vcf.gz.tbi",
     output:
         snvs="results/precision-recall/callsets/{callset}/{cov}.{vartype}.tsv",
     log:
         "logs/calc-precision-recall/{callset}/{cov}/{vartype}.log",
-    # params:
-    #     vaf_fields=get_vaf_fields,
+    params:
+        vaf_fields=get_vaf_fields,
     conda:
         "../envs/pysam.yaml"
     script:
@@ -234,6 +253,7 @@ rule collect_precision_recall:
     params:
         callsets=lambda w: get_benchmark_callsets(w.benchmark),
         labels=get_collect_precision_recall_labels,
+        vaf=get_vaf_status,
     log:
         "logs/collect-precision-recall/{benchmark}/{vartype}.log",
     conda:
@@ -259,6 +279,7 @@ rule report_precision_recall:
         "logs/datavzrd/precision-recall/{benchmark}/{vartype}.log",
     params:
         somatic=get_somatic_status,
+        vaf=get_vaf_status,
     wrapper:
         "v3.10.1/utils/datavzrd"