Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add rules to liftover callsets from hg19/GRCh37 to hg38/GRCh38 #69

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ variant-calls:
# Uncomment and point to file containing contig name replacements
# as needed for 'bcftools annotate --rename-chrs'
# rename-contigs: path/to/rename-contigs.txt
vaf-field: FORMAT/AF # needs to be checked with bcftools view -h

# Optional custom benchmarks
custom-benchmarks:
Expand Down
1 change: 1 addition & 0 deletions workflow/resources/presets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ benchmarks:
bam-url: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/data/WES/WES_EA_T_1.bwa.dedup.bam
target-regions: https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/technical/reference_genome/Exome_Target_bed/S07604624_Covered_human_all_v6_plus_UTR.liftover.to.hg38.bed6.gz
grch37: false
vaf-field: FORMAT/AF # needs to be checked

imgag-somatic-5perc:
genome: na12878-somatic
Expand Down
10 changes: 10 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,16 @@ def get_somatic_flag(wildcards):
return somatic_flag


def get_vaf_fields(wildcards):
vaf_callset = config["variant-calls"][wildcards.callset].get("vaf_field")

benchmark = config["variant-calls"][wildcards.callset]["benchmark"]
vaf_benchmark = benchmarks[benchmark].get("vaf_field")

# can return (None, None) if param not set
return (vaf_callset, vaf_benchmark)


def get_collect_stratifications_input(wildcards):
import json

Expand Down
27 changes: 27 additions & 0 deletions workflow/rules/download.smk
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,33 @@ rule get_reference:
"v1.7.2/bio/reference/ensembl-sequence"


rule get_liftover_references:
output:
"resources/liftover/"
log:
"logs/get_liftover_references.log",
conda:
"../envs/tools.yaml"
shell:
# GRCh37
"wget -O- ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/human_g1k_v37.fasta.gz | "
"gzip -d > $HOME/GRCh37/human_g1k_v37.fasta"
"samtools faidx $HOME/GRCh37/human_g1k_v37.fasta"
"bwa index $HOME/GRCh37/human_g1k_v37.fasta"
"wget -P $HOME/GRCh37 http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/cytoBand.txt.gz"
"wget -P $HOME/GRCh37 http://hgdownload.cse.ucsc.edu/goldenpath/hg18/liftOver/hg18ToHg19.over.chain.gz"
"ref='$HOME/GRCh37/human_g1k_v37.fasta'"
# GRCh38
"wget -O- ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/405/GCA_000001405.15_GRCh38/seqs_for_alignment_pipelines.ucsc_ids/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz | "
"gzip -d > $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
"samtools faidx $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
"bwa index $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna"
"wget -P $HOME/GRCh38 http://hgdownload.cse.ucsc.edu/goldenPath/hg38/database/cytoBand.txt.gz"
"wget -P $HOME/GRCh38 http://hgdownload.cse.ucsc.edu/goldenpath/hg18/liftOver/hg18ToHg38.over.chain.gz"
"wget -P $HOME/GRCh38 http://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz"
"ref='$HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna'"


rule samtools_faidx:
input:
"resources/reference/genome.fasta",
Expand Down
24 changes: 24 additions & 0 deletions workflow/rules/eval.smk
Original file line number Diff line number Diff line change
@@ -1,3 +1,25 @@
rule liftover_callset:
input:
get_callset,
output:
"results/normalized-variants/{callset}.lifted.vcf.gz",
log:
"logs/liftover-callset/{callset}.log",
container:
"docker:us.gcr.io/mccarroll-mocha/bcftools@sha256:b636b46e4db7122bc546041ad8dc9328d8b20a999fc8bdea82108eff07f6271a"
shell:
"wget ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz{,.tbi}
"bcftools +liftover --no-version -Ou ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.vcf.gz -- "
"-s $HOME/GRCh37/human_g1k_v37.fasta"
"-f $HOME/GRCh38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna" "
"-c $HOME/GRCh38/hg19ToHg38.over.chain.gz" "
"--reject ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.reject.bcf "
"--reject-type b "
"--write-src | "
"bcftools sort -Ob | tee ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.hg38.bcf | "
"bcftools index --force --output ALL.wgs.phase3_shapeit2_mvncall_integrated_v5c.20130502.sites.hg38.bcf.csi" "


rule rename_contigs:
input:
calls=get_raw_callset,
Expand Down Expand Up @@ -173,6 +195,8 @@ rule calc_precision_recall:
snvs="results/precision-recall/callsets/{callset}/{cov}.{vartype}.tsv",
log:
"logs/calc-precision-recall/{callset}/{cov}/{vartype}.log",
params:
vaf_fields=get_vaf_fields,
conda:
"../envs/pysam.yaml"
script:
Expand Down
48 changes: 38 additions & 10 deletions workflow/scripts/calc-precision-recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,56 @@
from abc import ABC, abstractmethod
from enum import Enum
import pandas as pd
import numpy as np
import pysam

from common.classification import CompareExactGenotype, CompareExistence, Class


class Classifications:
def __init__(self, comparator):
self.tp_query = 0
self.tp_truth = 0
self.fn = 0
self.fp = 0
self.comparator = comparator
def __init__(self, comparator, vaf_fields):
if vaf_fields[0] is None or vaf_fields[1] is None:
self.tp_query = 0
self.tp_truth = 0
self.fn = 0
self.fp = 0
self.comparator = comparator
else:
self.tp_query = np.array([]) # 0-100%
self.tp_truth = 0
self.fn = 0
self.fp = 0

def increment_counter(self, counter, vaf):
if vaf is None:
counter += 1
else:
# 10 equally sized bins
bin = int(vaf*10)
counter[bin] += 1

def register(self, record):
for c in self.comparator.classify(record):
# TODO: depending on case, fetch VAF from truth or query record (FP: from query record, field configurable by callset (e.g. FORMAT/AF, INFO/AF, ...)
# for truth record, field configurable by benchmark preset (same syntax as above)
# increment counters for bins, bins given to constructor as list of tuples or some numpy equivalent.
# Default: None. If no VAF field given for either truth or callset, don't bin at all.
if c.cls is Class.TP_truth:
self.tp_truth += 1
vaf = truth.samples[record.name]["AF"] # still needs to be implemented
self.increment_counter(self.tp_truth, vaf)
# self.tp_truth += 1
elif c.cls is Class.TP_query:
self.tp_query += 1
vaf = truth.samples[record.name]["AF"] # still needs to be implemented
self.increment_counter(self.tp_query, vaf)
# self.tp_query += 1
elif c.cls is Class.FN:
self.fn += 1
vaf = truth.samples[record.name]["AF"] # still needs to be implemented
self.increment_counter(self.fn, vaf)
# self.fn += 1
elif c.cls is Class.FP:
self.fp += 1
vaf = record.samples[0]["AF"] # only works for FORMAT field
self.increment_counter(self.fp, vaf)
# self.fp += 1
else:
assert False, "unexpected case"

Expand Down Expand Up @@ -89,6 +116,7 @@ def collect_results(vartype):
]
]

snakemake.params.vaf_fields

assert snakemake.wildcards.vartype in ["snvs", "indels"]
vartype = "SNV" if snakemake.wildcards.vartype == "snvs" else "INDEL"
Expand Down
Loading