diff --git a/genome_grist/combine_csvs.py b/genome_grist/combine_csvs.py index c685b71..388aa8f 100644 --- a/genome_grist/combine_csvs.py +++ b/genome_grist/combine_csvs.py @@ -24,6 +24,10 @@ def main(): rows.extend(list(r)) fieldnames = r.fieldnames + if not rows: + print(f"error: file {first_csv} is empty!?", file=sys.stderr) + sys.exit(-1) + if args.fields: fieldnames = args.fields.split(',') diff --git a/genome_grist/conf/Snakefile b/genome_grist/conf/Snakefile index 99c9f92..b564073 100755 --- a/genome_grist/conf/Snakefile +++ b/genome_grist/conf/Snakefile @@ -715,12 +715,13 @@ rule build_new_consensus_wc: # summarize depth into a CSV rule summarize_samtools_depth_wc: input: - Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt") + depth = Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt"), + vcf = Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.vcf.gz") output: csv = f"{outdir}/{{dir}}/{{sample}}.summary.csv" shell: """ python -m genome_grist.summarize_mapping {wildcards.sample} \ - {input} -o {output.csv} + {input.depth} -o {output.csv} """ # compute sourmash signature from abundtrim reads diff --git a/genome_grist/summarize_mapping.py b/genome_grist/summarize_mapping.py index 0f3c2c4..70b1903 100644 --- a/genome_grist/summarize_mapping.py +++ b/genome_grist/summarize_mapping.py @@ -1,11 +1,43 @@ #! /usr/bin/env python """ Summarize mapping depth information (produced by samtools depth -aa {bamfile}). +Also summarize SNP counts from VCF file. """ import argparse import sys import pandas as pd import os +import gzip +from collections import defaultdict + + +def summarize_vcf(vcf_gz): + "Count number of distinct SNP locations" + by_pos = defaultdict(dict) + n_lines = 0 + with gzip.open(vcf_gz, 'rt') as fp: + for line in fp: + # skip comments + if line.startswith('#'): + continue + + n_lines += 1 + chrom, pos, ident, ref, alt, qual, *rest = line.split('\t') + + # skip indels for now + if len(ref) > 1 or len(alt) > 1: + continue + + pos = int(pos) + by_pos[chrom][pos] = 1 + + n_chrom = len(by_pos) + + n_snps = 0 + for chrom in by_pos: + n_snps += len(by_pos[chrom]) + + return n_lines, n_chrom, n_snps def main(): @@ -19,13 +51,15 @@ def main(): sample = args.sample_name runs = {} for n, depth_txt in enumerate(args.depth_txts): - print(f'reading from {depth_txt}', file=sys.stderr) + assert depth_txt.endswith('.depth.txt') + vcf_gz = depth_txt[:-len('.depth.txt')] + '.vcf.gz' + assert os.path.exists(vcf_gz) + print(f"reading from '{vcf_gz}'", file=sys.stderr) + _, n_chrom, n_snps = summarize_vcf(vcf_gz) - data = pd.read_table(depth_txt, names=["contig", "pos", "coverage"]) + print(f"reading from '{depth_txt}", file=sys.stderr) - if not len(data): - print("empty?") - continue + data = pd.read_table(depth_txt, names=["contig", "pos", "coverage"]) filename = os.path.basename(depth_txt) sample_check, _, genome_id, *rest = filename.split(".") @@ -33,6 +67,9 @@ def main(): assert sample_check == sample, (sample_check, sample) d = {} + d['n_chrom'] = n_chrom + d['n_snps'] = n_snps + value_counts = data['coverage'].value_counts() d['genome bp'] = int(len(data)) d['missed'] = int(value_counts.get(0, 0)) @@ -43,7 +80,8 @@ def main(): d['unique_mapped_coverage'] = uniq_cov else: d['unique_mapped_coverage'] = d['coverage'] - d['covered_bp'] = (1 - d['percent missed']/100.0) * d['genome bp'] + covered_bp = (1 - d['percent missed']/100.0) * d['genome bp'] + d['covered_bp'] = round(covered_bp + 0.5) d['genome_id'] = genome_id d['sample_id'] = sample