snakemake-workflows · famosab · Mar 26, 2024 · Mar 28, 2024 · Apr 12, 2024 · Apr 12, 2024
diff --git a/config/config.yaml b/config/config.yaml
@@ -17,6 +17,7 @@ variant-calls:
     # Uncomment and point to file containing contig name replacements
     # as needed for 'bcftools annotate --rename-chrs'
     # rename-contigs: path/to/rename-contigs.txt
+    vaf-field: FORMAT/AF # needs to be checked with bcftools view -h
 
 # Optional custom benchmarks
 custom-benchmarks:

diff --git a/workflow/resources/presets.yaml b/workflow/resources/presets.yaml
@@ -15,6 +15,7 @@ benchmarks:
     bam-url: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WES/WES_EA_T_1.bwa.dedup.bam
     target-regions: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/technical/reference_genome/Exome_Target_bed/S07604624_Covered_human_all_v6_plus_UTR.liftover.to.hg38.bed6.gz
     grch37: false
+    vaf-field: FORMAT/AF # needs to be checked
 
   imgag-somatic-5perc:
     genome: na12878-somatic

diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -387,6 +387,16 @@ def get_somatic_flag(wildcards):
     return somatic_flag
 
 
+def get_vaf_fields(wildcards):
+    vaf_callset = config["variant-calls"][wildcards.callset].get("vaf_field")
+
+    benchmark = config["variant-calls"][wildcards.callset]["benchmark"]
+    vaf_benchmark = benchmarks[benchmark].get("vaf_field")
+
+    # can return (None, None) if param not set
+    return (vaf_callset, vaf_benchmark)
+
+
 def get_collect_stratifications_input(wildcards):
     import json
 

diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk
@@ -158,6 +158,33 @@ rule get_reference:
         "v1.7.2/bio/reference/ensembl-sequence"
 
 
+rule get_liftover_references:
+    output:
+        "resources/liftover/"
+    log:
+        "logs/get_liftover_references.log",
+    conda:
+        "../envs/tools.yaml"
+    shell:
+        # GRCh37
+        "wget -O- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz | "
+        "gzip -d > $HOME/GRCh37/human_g1k_v37.fasta"
+        "samtools faidx $HOME/GRCh37/human_g1k_v37.fasta"
+        "bwa index $HOME/GRCh37/human_g1k_v37.fasta"
+        "wget -P $HOME/GRCh37 http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz"
+        "wget -P $HOME/GRCh37 http://hgdownload.cse.ucsc.edu/goldenpath/hg18/liftOver/hg18ToHg19.over.chain.gz"
+        "ref='$HOME/GRCh37/human_g1k_v37.fasta'"
+        # GRCh38
+        "wget -O- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | "
+        "gzip -d > $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
+        "samtools faidx $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
+        "bwa index $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
+        "wget -P $HOME/GRCh38 http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz"
+        "wget -P $HOME/GRCh38 http://hgdownload.cse.ucsc.edu/goldenpath/hg18/liftOver/hg18ToHg38.over.chain.gz"
+        "wget -P $HOME/GRCh38 http://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz"
+        "ref='$HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna'"
+
+
 rule samtools_faidx:
     input:
         "resources/reference/genome.fasta",

diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk
@@ -1,3 +1,25 @@
+rule liftover_callset:
+    input:
+        get_callset,
+    output:
+        "results/normalized-variants/{callset}.lifted.vcf.gz",
+    log:
+        "logs/liftover-callset/{callset}.log",
+    container:
+        "docker:us.gcr.io/mccarroll-mocha/bcftools@sha256:b636b46e4db7122bc546041ad8dc9328d8b20a999fc8bdea82108eff07f6271a"
+    shell:
+        "wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz{,.tbi}
+        "bcftools +liftover --no-version -Ou ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz -- "
+        "-s $HOME/GRCh37/human_g1k_v37.fasta"
+        "-f $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" "
+        "-c $HOME/GRCh38/hg19ToHg38.over.chain.gz" "
+        "--reject ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.reject.bcf "
+        "--reject-type b "
+        "--write-src | "
+        "bcftools sort -Ob | tee ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.hg38.bcf | "
+        "bcftools index --force --output ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.hg38.bcf.csi" "
+
+
 rule rename_contigs:
     input:
         calls=get_raw_callset,
@@ -173,6 +195,8 @@ rule calc_precision_recall:
         snvs="results/precision-recall/callsets/{callset}/{cov}.{vartype}.tsv",
     log:
         "logs/calc-precision-recall/{callset}/{cov}/{vartype}.log",
+    params:
+        vaf_fields=get_vaf_fields,
     conda:
         "../envs/pysam.yaml"
     script:

diff --git a/workflow/scripts/calc-precision-recall.py b/workflow/scripts/calc-precision-recall.py
@@ -7,29 +7,56 @@
 from abc import ABC, abstractmethod
 from enum import Enum
 import pandas as pd
+import numpy as np
 import pysam
 
 from common.classification import CompareExactGenotype, CompareExistence, Class
 
 
 class Classifications:
-    def __init__(self, comparator):
-        self.tp_query = 0
-        self.tp_truth = 0
-        self.fn = 0
-        self.fp = 0
-        self.comparator = comparator
+    def __init__(self, comparator, vaf_fields):
+        if vaf_fields[0] is None or vaf_fields[1] is None:  
+            self.tp_query = 0
+            self.tp_truth = 0
+            self.fn = 0
+            self.fp = 0
+            self.comparator = comparator
+        else:
+            self.tp_query = np.array([]) # 0-100%
+            self.tp_truth = 0
+            self.fn = 0
+            self.fp = 0
+
+    def increment_counter(self, counter, vaf):
+        if vaf is None:
+            counter += 1
+        else:
+            # 10 equally sized bins
+            bin = int(vaf*10)
+            counter[bin] += 1
 
     def register(self, record):
         for c in self.comparator.classify(record):
+            # TODO: depending on case, fetch VAF from truth or query record (FP: from query record, field configurable by callset (e.g. FORMAT/AF, INFO/AF, ...)
+            # for truth record, field configurable by benchmark preset (same syntax as above)
+            # increment counters for bins, bins given to constructor as list of tuples or some numpy equivalent.
+            # Default: None. If no VAF field given for either truth or callset, don't bin at all. 
             if c.cls is Class.TP_truth:
-                self.tp_truth += 1
+                vaf = truth.samples[record.name]["AF"] # still needs to be implemented
+                self.increment_counter(self.tp_truth, vaf)
+                # self.tp_truth += 1
             elif c.cls is Class.TP_query:
-                self.tp_query += 1
+                vaf = truth.samples[record.name]["AF"] # still needs to be implemented
+                self.increment_counter(self.tp_query, vaf)
+                # self.tp_query += 1
             elif c.cls is Class.FN:
-                self.fn += 1
+                vaf = truth.samples[record.name]["AF"] # still needs to be implemented
+                self.increment_counter(self.fn, vaf)
+                # self.fn += 1
             elif c.cls is Class.FP:
-                self.fp += 1
+                vaf = record.samples[0]["AF"] # only works for FORMAT field
+                self.increment_counter(self.fp, vaf)
+                # self.fp += 1
             else:
                 assert False, "unexpected case"
 
@@ -89,6 +116,7 @@ def collect_results(vartype):
         ]
     ]
 
+snakemake.params.vaf_fields
 
 assert snakemake.wildcards.vartype in ["snvs", "indels"]
 vartype = "SNV" if snakemake.wildcards.vartype == "snvs" else "INDEL"