Skip to content

Commit

Permalink
[MRG] add VCF-calculating code from #124 (#165)
Browse files Browse the repository at this point in the history
* steal VCF code from #124

* fix n_snps calculation
  • Loading branch information
ctb authored Feb 16, 2022
1 parent cfdf4db commit 03b0d60
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 8 deletions.
4 changes: 4 additions & 0 deletions genome_grist/combine_csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@ def main():
rows.extend(list(r))
fieldnames = r.fieldnames

if not rows:
print(f"error: file {first_csv} is empty!?", file=sys.stderr)
sys.exit(-1)

if args.fields:
fieldnames = args.fields.split(',')

Expand Down
5 changes: 3 additions & 2 deletions genome_grist/conf/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -715,12 +715,13 @@ rule build_new_consensus_wc:
# summarize depth into a CSV
rule summarize_samtools_depth_wc:
input:
Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt")
depth = Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt"),
vcf = Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.vcf.gz")
output:
csv = f"{outdir}/{{dir}}/{{sample}}.summary.csv"
shell: """
python -m genome_grist.summarize_mapping {wildcards.sample} \
{input} -o {output.csv}
{input.depth} -o {output.csv}
"""

# compute sourmash signature from abundtrim reads
Expand Down
50 changes: 44 additions & 6 deletions genome_grist/summarize_mapping.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,43 @@
#! /usr/bin/env python
"""
Summarize mapping depth information (produced by samtools depth -aa {bamfile}).
Also summarize SNP counts from VCF file.
"""
import argparse
import sys
import pandas as pd
import os
import gzip
from collections import defaultdict


def summarize_vcf(vcf_gz):
    """Count distinct SNP locations in a bgzip/gzip-compressed VCF file.

    Returns a tuple ``(n_lines, n_chrom, n_snps)``:
      * n_lines - number of non-header records in the file (indels included);
      * n_chrom - number of chromosomes carrying at least one SNP;
      * n_snps  - number of distinct SNP positions across all chromosomes.

    Records whose REF or ALT allele is longer than a single base
    (indels / multi-base alleles) are counted in ``n_lines`` but are
    otherwise ignored.
    """
    snp_positions = {}          # chromosome -> set of SNP positions
    n_records = 0

    with gzip.open(vcf_gz, 'rt') as vcf_fp:
        for record in vcf_fp:
            # header / meta lines start with '#'
            if record.startswith('#'):
                continue

            n_records += 1
            chrom, pos, _ident, ref, alt, _qual, *_ = record.split('\t')

            # only keep single-base substitutions; skip indels for now
            if len(ref) == 1 and len(alt) == 1:
                snp_positions.setdefault(chrom, set()).add(int(pos))

    n_snps = sum(len(positions) for positions in snp_positions.values())

    return n_records, len(snp_positions), n_snps


def main():
Expand All @@ -19,20 +51,25 @@ def main():
sample = args.sample_name
runs = {}
for n, depth_txt in enumerate(args.depth_txts):
print(f'reading from {depth_txt}', file=sys.stderr)
assert depth_txt.endswith('.depth.txt')
vcf_gz = depth_txt[:-len('.depth.txt')] + '.vcf.gz'
assert os.path.exists(vcf_gz)
print(f"reading from '{vcf_gz}'", file=sys.stderr)
_, n_chrom, n_snps = summarize_vcf(vcf_gz)

data = pd.read_table(depth_txt, names=["contig", "pos", "coverage"])
print(f"reading from '{depth_txt}", file=sys.stderr)

if not len(data):
print("empty?")
continue
data = pd.read_table(depth_txt, names=["contig", "pos", "coverage"])

filename = os.path.basename(depth_txt)
sample_check, _, genome_id, *rest = filename.split(".")

assert sample_check == sample, (sample_check, sample)

d = {}
d['n_chrom'] = n_chrom
d['n_snps'] = n_snps

value_counts = data['coverage'].value_counts()
d['genome bp'] = int(len(data))
d['missed'] = int(value_counts.get(0, 0))
Expand All @@ -43,7 +80,8 @@ def main():
d['unique_mapped_coverage'] = uniq_cov
else:
d['unique_mapped_coverage'] = d['coverage']
d['covered_bp'] = (1 - d['percent missed']/100.0) * d['genome bp']
covered_bp = (1 - d['percent missed']/100.0) * d['genome bp']
d['covered_bp'] = round(covered_bp + 0.5)
d['genome_id'] = genome_id
d['sample_id'] = sample

Expand Down

0 comments on commit 03b0d60

Please sign in to comment.