diff --git a/config/config.yaml b/config/config.yaml index 1131dfc..7ad42f0 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -17,6 +17,7 @@ variant-calls: # Uncomment and point to file containing contig name replacements # as needed for 'bcftools annotate --rename-chrs' # rename-contigs: path/to/rename-contigs.txt + vaf-field: FORMAT/AF # needs to be checked with bcftools view -h # Optional custom benchmarks custom-benchmarks: diff --git a/workflow/resources/presets.yaml b/workflow/resources/presets.yaml index 7109c05..4e07a9f 100644 --- a/workflow/resources/presets.yaml +++ b/workflow/resources/presets.yaml @@ -15,6 +15,7 @@ benchmarks: bam-url: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WES/WES_EA_T_1.bwa.dedup.bam target-regions: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/technical/reference_genome/Exome_Target_bed/S07604624_Covered_human_all_v6_plus_UTR.liftover.to.hg38.bed6.gz grch37: false + vaf-field: FORMAT/AF # needs to be checked imgag-somatic-5perc: genome: na12878-somatic diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 008d1b5..1d06e06 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -387,6 +387,16 @@ def get_somatic_flag(wildcards): return somatic_flag +def get_vaf_fields(wildcards): + vaf_callset = config["variant-calls"][wildcards.callset].get("vaf_field") + + benchmark = config["variant-calls"][wildcards.callset]["benchmark"] + vaf_benchmark = benchmarks[benchmark].get("vaf_field") + + # can return (None, None) if param not set + return (vaf_callset, vaf_benchmark) + + def get_collect_stratifications_input(wildcards): import json diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index e5c19f8..4ce3b01 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -158,6 +158,33 @@ rule get_reference: "v1.7.2/bio/reference/ensembl-sequence" +rule get_liftover_references: + output: + "resources/liftover/" + log: + "logs/get_liftover_references.log", + conda: + "../envs/tools.yaml" + shell: + # GRCh37 + "wget -O- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz | " + "gzip -d > $HOME/GRCh37/human_g1k_v37.fasta" + "samtools faidx $HOME/GRCh37/human_g1k_v37.fasta" + "bwa index $HOME/GRCh37/human_g1k_v37.fasta" + "wget -P $HOME/GRCh37 http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz" + "wget -P $HOME/GRCh37 http://hgdownload.cse.ucsc.edu/goldenpath/hg18/liftOver/hg18ToHg19.over.chain.gz" + "ref='$HOME/GRCh37/human_g1k_v37.fasta'" + # GRCh38 + "wget -O- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | " + "gzip -d > $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" + "samtools faidx $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" + "bwa index $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" + "wget -P $HOME/GRCh38 http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz" + "wget -P $HOME/GRCh38 http://hgdownload.cse.ucsc.edu/goldenpath/hg18/liftOver/hg18ToHg38.over.chain.gz" + "wget -P $HOME/GRCh38 http://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz" + "ref='$HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna'" + + rule samtools_faidx: input: "resources/reference/genome.fasta", diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index b7eabb7..e138ccd 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -1,3 +1,25 @@ +rule liftover_callset: + input: + get_callset, + output: + "results/normalized-variants/{callset}.lifted.vcf.gz", + log: + "logs/liftover-callset/{callset}.log", + container: + "docker:us.gcr.io/mccarroll-mocha/bcftools@sha256:b636b46e4db7122bc546041ad8dc9328d8b20a999fc8bdea82108eff07f6271a" + shell: + "wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz{,.tbi} + "bcftools +liftover --no-version -Ou ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz -- " + "-s $HOME/GRCh37/human_g1k_v37.fasta" + "-f $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" " + "-c $HOME/GRCh38/hg19ToHg38.over.chain.gz" " + "--reject ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.reject.bcf " + "--reject-type b " + "--write-src | " + "bcftools sort -Ob | tee ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.hg38.bcf | " + "bcftools index --force --output ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.hg38.bcf.csi" " + + rule rename_contigs: input: calls=get_raw_callset, @@ -173,6 +195,8 @@ rule calc_precision_recall: snvs="results/precision-recall/callsets/{callset}/{cov}.{vartype}.tsv", log: "logs/calc-precision-recall/{callset}/{cov}/{vartype}.log", + params: + vaf_fields=get_vaf_fields, conda: "../envs/pysam.yaml" script: diff --git a/workflow/scripts/calc-precision-recall.py b/workflow/scripts/calc-precision-recall.py index 3db4071..797c319 100644 --- a/workflow/scripts/calc-precision-recall.py +++ b/workflow/scripts/calc-precision-recall.py @@ -7,29 +7,56 @@ from abc import ABC, abstractmethod from enum import Enum import pandas as pd +import numpy as np import pysam from common.classification import CompareExactGenotype, CompareExistence, Class class Classifications: - def __init__(self, comparator): - self.tp_query = 0 - self.tp_truth = 0 - self.fn = 0 - self.fp = 0 - self.comparator = comparator + def __init__(self, comparator, vaf_fields): + if vaf_fields[0] is None or vaf_fields[1] is None: + self.tp_query = 0 + self.tp_truth = 0 + self.fn = 0 + self.fp = 0 + self.comparator = comparator + else: + self.tp_query = np.array([]) # 0-100% + self.tp_truth = 0 + self.fn = 0 + self.fp = 0 + + def increment_counter(self, counter, vaf): + if vaf is None: + counter += 1 + else: + # 10 equally sized bins + bin = int(vaf*10) + counter[bin] += 1 def register(self, record): for c in self.comparator.classify(record): + # TODO: depending on case, fetch VAF from truth or query record (FP: from query record, field configurable by callset (e.g. FORMAT/AF, INFO/AF, ...) + # for truth record, field configurable by benchmark preset (same syntax as above) + # increment counters for bins, bins given to constructor as list of tuples or some numpy equivalent. + # Default: None. If no VAF field given for either truth or callset, don't bin at all. if c.cls is Class.TP_truth: - self.tp_truth += 1 + vaf = truth.samples[record.name]["AF"] # still needs to be implemented + self.increment_counter(self.tp_truth, vaf) + # self.tp_truth += 1 elif c.cls is Class.TP_query: - self.tp_query += 1 + vaf = truth.samples[record.name]["AF"] # still needs to be implemented + self.increment_counter(self.tp_query, vaf) + # self.tp_query += 1 elif c.cls is Class.FN: - self.fn += 1 + vaf = truth.samples[record.name]["AF"] # still needs to be implemented + self.increment_counter(self.fn, vaf) + # self.fn += 1 elif c.cls is Class.FP: - self.fp += 1 + vaf = record.samples[0]["AF"] # only works for FORMAT field + self.increment_counter(self.fp, vaf) + # self.fp += 1 else: assert False, "unexpected case" @@ -89,6 +116,7 @@ def collect_results(vartype): ] ] +snakemake.params.vaf_fields assert snakemake.wildcards.vartype in ["snvs", "indels"] vartype = "SNV" if snakemake.wildcards.vartype == "snvs" else "INDEL"