From ec1eff93632eb16118d5719d541204ce8685edfa Mon Sep 17 00:00:00 2001 From: jonca79 Date: Thu, 4 Jul 2024 10:46:01 +0200 Subject: [PATCH] tests: corrected tests for tsv_report --- .tests/units/test_cnv_report.py | 13 +++++++------ .tests/units/vcf/test.cnv1.vcf | 2 +- .tests/units/vcf/test.cnv2.vcf | 2 +- workflow/scripts/cnv_report.py | 30 +++++++++--------------------- 4 files changed, 18 insertions(+), 29 deletions(-) diff --git a/.tests/units/test_cnv_report.py b/.tests/units/test_cnv_report.py index 1b10d17c..73ea9fee 100644 --- a/.tests/units/test_cnv_report.py +++ b/.tests/units/test_cnv_report.py @@ -22,6 +22,7 @@ def test_create_tsv_report(self): ".tests/units/vcf/test.deletions.tsv", ".tests/units/vcf/test.amplifications.tsv", ".tests/integration/reference/chromosome_arm_size.tsv", + ".tests/units/gatk_cnv/LI-VAL-42_T.clean.denoisedCR.tsv", 2.0, cnv, out_additional_only, @@ -48,27 +49,27 @@ class TestCase: testcases = [ TestCase( name="header row", - expected=("sample", "gene(s)", "chrom", "region", "caller", "freq_in_db", "copy_number") + expected=("gene(s)", "chrom", "region", "caller", "freq_in_db", "copy_number", "FP_flag") ), TestCase( name="variant 1", - expected=("testSample_T", "FGFR1", "chr8", "34370199-43930232", "cnvkit", "0.01", "8.59") + expected=("FGFR1", "chr8", "34370199-43930232", "cnvkit", "0.01", "8.59") ), TestCase( name="variant 2", - expected=("testSample_T", "FGFR1,MYC", "chr8", "35008818-146144253", "gatk_cnv", "0.01", "7.01") + expected=("FGFR1,MYC", "chr8", "35008818-146144253", "gatk", "0.01", "7.01") ), TestCase( name="variant 3", - expected=("testSample_T", "MYC", "chr8", "46689525-146144003", "cnvkit", "0.09", "5.06") + expected=("MYC", "chr8", "46689525-146144003", "cnvkit", "0.09", "5.06") ), TestCase( name="small deletion", - expected=("testSample_T", "CDKN2A,CDKN2B", "chr9", "21968207-22008972", "small_deletion", "NA", "-0.28") + expected=("CDKN2A,CDKN2B", "chr9", "21968207-22008972", "small_deletion", "NA", "-0.28") ), TestCase( name="small amplification", - expected=("testSample_T", "MYCN", "chr2", "16968207-17008972", "small_amplification", "NA", "7.29") + expected=("MYCN", "chr2", "16968207-17008972", "small_amplification", "NA", "7.29") ), ] diff --git a/.tests/units/vcf/test.cnv1.vcf b/.tests/units/vcf/test.cnv1.vcf index 371b9bda..a756b4a8 100644 --- a/.tests/units/vcf/test.cnv1.vcf +++ b/.tests/units/vcf/test.cnv1.vcf @@ -62,4 +62,4 @@ ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT testSample_T chr8 34370199 . N . . Genes=FGFR1;SVTYPE=DUP;END=43930232;SVLEN=9560034;LOG_ODDS_RATIO=2.10299;CALLER=cnvkit;CN=.;CORR_CN=8.59;PROBES=167;BAF=-1.1761;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|34370199;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:8.59|CNQ:167|DP:857.922|BAF:-1.1761;SVDB_INFO=.|SVTYPE:DUP|END:43930232|SVLEN:9560034|LOG_ODDS_RATIO:2.10299|CALLER:cnvkit|CN:NA|CORR_CN:8.59|PROBES:167|BAF:-1.1761;svdb_origin=testSample_T;Twist_OCC=3;Twist_AF=0.0103093 GT:CN:CNQ:DP:BAF 0/1:8.59:167:857.922:-1.1761 -chr8 35008818 . N . . Genes=FGFR1,MYC;SVTYPE=DUP;END=146144253;SVLEN=111135436;LOG_ODDS_RATIO=0.58604;CALLER=gatk_cnv;CN=3;CORR_CN=7.01;PROBES=233;BAF=0.366555;BAF_PROBES=41;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|35008818;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:3.0|CCN:7.01|CNQ:233|BAF:0.366555|BAFQ:41;SVDB_INFO=.|SVTYPE:DUP|END:146144253|SVLEN:111135436|LOG_ODDS_RATIO:0.586040|CALLER:gatk_cnv|CN:3.0|CORR_CN:7.01|PROBES:233|BAF:0.366555|BAF_PROBES:41;svdb_origin=testSample_T;Twist_OCC=4;Twist_AF=0.0137457 GT:CN:CCN:CNQ:BAF:BAFQ 0/1:3:7.01:233:0.366555:41 +chr8 35008818 . N . . Genes=FGFR1,MYC;SVTYPE=DUP;END=146144253;SVLEN=111135436;LOG_ODDS_RATIO=0.58604;CALLER=gatk;CN=3;CORR_CN=7.01;PROBES=233;BAF=0.366555;BAF_PROBES=41;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|35008818;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:3.0|CCN:7.01|CNQ:233|BAF:0.366555|BAFQ:41;SVDB_INFO=.|SVTYPE:DUP|END:146144253|SVLEN:111135436|LOG_ODDS_RATIO:0.586040|CALLER:gatk|CN:3.0|CORR_CN:7.01|PROBES:233|BAF:0.366555|BAF_PROBES:41;svdb_origin=testSample_T;Twist_OCC=4;Twist_AF=0.0137457 GT:CN:CCN:CNQ:BAF:BAFQ 0/1:3:7.01:233:0.366555:41 diff --git a/.tests/units/vcf/test.cnv2.vcf b/.tests/units/vcf/test.cnv2.vcf index dbdc8074..205a7d10 100644 --- a/.tests/units/vcf/test.cnv2.vcf +++ b/.tests/units/vcf/test.cnv2.vcf @@ -62,6 +62,6 @@ ##contig= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT testSample_T chr8 34370199 . N . . Genes=FGFR1;SVTYPE=DUP;END=43930232;SVLEN=9560034;LOG_ODDS_RATIO=2.10299;CALLER=cnvkit;CN=.;CORR_CN=8.59;PROBES=167;BAF=-1.1761;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|34370199;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:8.59|CNQ:167|DP:857.922|BAF:-1.1761;SVDB_INFO=.|SVTYPE:DUP|END:43930232|SVLEN:9560034|LOG_ODDS_RATIO:2.10299|CALLER:cnvkit|CN:NA|CORR_CN:8.59|PROBES:167|BAF:-1.1761;svdb_origin=testSample_T;Twist_OCC=3;Twist_AF=0.0103093 GT:CN:CNQ:DP:BAF 0/1:8.59:167:857.922:-1.1761 -chr8 35008818 . N . . Genes=FGFR1,MYC;SVTYPE=DUP;END=146144253;SVLEN=111135436;LOG_ODDS_RATIO=0.58604;CALLER=gatk_cnv;CN=3;CORR_CN=7.01;PROBES=233;BAF=0.366555;BAF_PROBES=41;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|35008818;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:3.0|CCN:7.01|CNQ:233|BAF:0.366555|BAFQ:41;SVDB_INFO=.|SVTYPE:DUP|END:146144253|SVLEN:111135436|LOG_ODDS_RATIO:0.586040|CALLER:gatk_cnv|CN:3.0|CORR_CN:7.01|PROBES:233|BAF:0.366555|BAF_PROBES:41;svdb_origin=testSample_T;Twist_OCC=4;Twist_AF=0.0137457 GT:CN:CCN:CNQ:BAF:BAFQ 0/1:3:7.01:233:0.366555:41 +chr8 35008818 . N . . Genes=FGFR1,MYC;SVTYPE=DUP;END=146144253;SVLEN=111135436;LOG_ODDS_RATIO=0.58604;CALLER=gatk;CN=3;CORR_CN=7.01;PROBES=233;BAF=0.366555;BAF_PROBES=41;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|35008818;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:3.0|CCN:7.01|CNQ:233|BAF:0.366555|BAFQ:41;SVDB_INFO=.|SVTYPE:DUP|END:146144253|SVLEN:111135436|LOG_ODDS_RATIO:0.586040|CALLER:gatk|CN:3.0|CORR_CN:7.01|PROBES:233|BAF:0.366555|BAF_PROBES:41;svdb_origin=testSample_T;Twist_OCC=4;Twist_AF=0.0137457 GT:CN:CCN:CNQ:BAF:BAFQ 0/1:3:7.01:233:0.366555:41 chr8 46689525 . N . . Genes=MYC;SVTYPE=DUP;END=146144003;SVLEN=99454479;LOG_ODDS_RATIO=1.82064;CALLER=cnvkit;CN=.;CORR_CN=5.06;PROBES=991;BAF=-0.252241;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|46689525;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:7.06|CNQ:991|DP:287.108|BAF:-0.252241;SVDB_INFO=.|SVTYPE:DUP|END:146144003|SVLEN:99454479|LOG_ODDS_RATIO:1.82064|CALLER:cnvkit|CN:NA|CORR_CN:7.06|PROBES:991|BAF:-0.252241;svdb_origin=testSample_T;Twist_OCC=25;Twist_AF=0.0859107 GT:CN:CNQ:DP:BAF 0/1:7.06:991:287.108:-0.252241 chr9 1206037 . N . . Genes=CDKN2A,CDKN2B,TSC1;SVTYPE=COPY_NORMAL;END=140509860;SVLEN=139303824;LOG_ODDS_RATIO=0.015690;CALLER=cnvkit;CN=2.02;CORR_CN=2.09;PROBES=640;BAF=0.386991;BAF_PROBES=79;set=Intersection;SVDB_CHROM=.|chr9;SVDB_POS=.|1206037;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/0|CN:2.02|CCN:2.09|CNQ:640|BAF:0.386991|BAFQ:79;SVDB_INFO=.|SVTYPE:COPY_NORMAL|END:140509860|SVLEN:139303824|LOG_ODDS_RATIO:0.015690|CALLER:gatk|CN:2.02|CORR_CN:2.09|PROBES:640|BAF:0.386991|BAF_PROBES:79;svdb_origin=testSample_T GT:CN:CCN:CNQ:BAF:BAFQ 0/0:2.02:2.09:640:0.386991:79 diff --git a/workflow/scripts/cnv_report.py b/workflow/scripts/cnv_report.py index 22c12997..99cb24f0 100644 --- a/workflow/scripts/cnv_report.py +++ b/workflow/scripts/cnv_report.py @@ -34,7 +34,6 @@ def check_fp(chrom, start, end, gatk_cnr_dict, cn): while j1 < j2: cnv_region["surrounding_region"].append(gatk_data[j1][2]) j1 += 1 - median_surrounding_region = statistics.median(cnv_region["surrounding_region"]) stdev_surrounding_region = 0.0 @@ -45,8 +44,6 @@ def check_fp(chrom, start, end, gatk_cnr_dict, cn): if len(cnv_region["region"]) >= 3: stdev_region = statistics.stdev(cnv_region["region"]) - print(chrom, start, end, cn, median_region, stdev_region, median_surrounding_region, stdev_surrounding_region) - if cn < 1.5 and median_region > -0.2 and median_region < 0.2: if median_region + stdev_region > median_surrounding_region - stdev_surrounding_region: FP_flag = "FP?" @@ -103,9 +100,8 @@ def create_tsv_report( gene_all_dict = {} nr_writes = 0 log.info(f"Opening output tsv file: {output_txt}") - sample_name = "" with open(output_txt, "w") as writer: - writer.write("sample\tgene(s)\tchrom\tregion\tcaller\tfreq_in_db\tcopy_number\tFP_flag") + writer.write("gene(s)\tchrom\tregion\tcaller\tfreq_in_db\tcopy_number\tFP_flag") out_additional_only.write("gene(s)\tchrom\tregion\tcaller\tfreq_in_db\tcopy_number") file1 = True for input_org_vcf in input_org_vcfs: @@ -118,9 +114,6 @@ def create_tsv_report( samples = list(variants.header.samples) if len(samples) > 1: raise Exception(f"Unable to process vcf with more then one sample: {samples}") - else: - samples = samples[0] - sample_name = samples for variant in variants: genes = utils.get_annotation_data_info(variant, "Genes") log.debug(f"Processing variant: {variant}") @@ -226,7 +219,7 @@ def create_tsv_report( if nr_writes < 2: avg_cn = ((del_1p19q["1p_cnvkit"][1] + del_1p19q["19q_cnvkit"][1]) / (del_1p19q["1p_cnvkit"][0] + del_1p19q["19q_cnvkit"][0])) - writer.write(f"\n{samples}\t1p19q\t1p19q\t") + writer.write(f"\n1p19q\t1p19q\t") writer.write(f"{fraction_1p_cnvkit*100:.0f}%,{fraction_19q_cnvkit*100:.0f}%") writer.write(f"\tcnvkit\tNA\t{avg_cn:.2f}\t") out_additional_only.write(f"\n1p19q\t1p19q\tNA\tcnvkit\tNA\tNA") @@ -237,8 +230,7 @@ def create_tsv_report( if nr_writes < 2: avg_cn = ((del_1p19q["1p_gatkcnv"][1] + del_1p19q["19q_gatkcnv"][1]) / (del_1p19q["1p_gatkcnv"][0] + del_1p19q["19q_gatkcnv"][0])) - writer.write(f"\n{samples}\t1p19q\tNA\tNA\tgatk_cnv\tNA\tNA\t") - writer.write(f"\n{samples}\t1p19q\t1p19q\t") + writer.write(f"\n1p19q\t1p19q\t") writer.write(f"{fraction_1p_gatkcnv*100:.0f}%,{fraction_19q_gatkcnv*100:.0f}%") writer.write(f"\tgatk_cnv\tNA\t{avg_cn:.2f}\t") out_additional_only.write(f"\nt1p19q\tNA\tNA\tgatk_cnv\tNA\tNA") @@ -253,8 +245,6 @@ def create_tsv_report( samples = list(variants.header.samples) if len(samples) > 1: raise Exception(f"Unable to process vcf with more then one sample: {samples}") - else: - samples = samples[0] counter = 0 for variant in variants: genes = utils.get_annotation_data_info(variant, "Genes") @@ -275,20 +265,18 @@ def create_tsv_report( gene_variant_dict[gene] = [] gene_variant_dict[gene].append([chr, start, end, caller, cn, AF]) if gene in gene_all_dict: - nr_callers = {"cnvkit": 0, "gatk" : 0} + nr_callers = {"cnvkit": 0, "gatk": 0} for cnv in gene_all_dict[gene]: if cnv[4] > 2.5 or cnv[4] < 1.5: nr_callers[cnv[3]] += 1 if nr_callers["cnvkit"] > 0 and nr_callers["gatk"] > 0: both_callers = True FP_flag = "" - if caller == "cnvkit": - print(both_callers, chr, start, end, cn) if caller == "cnvkit" and not both_callers: FP_flag = check_fp(chr, start, end, gatk_cnr_dict, cn) - writer.write(f"\n{samples}\t{genes}\t{chr}\t{start}-{end}\t{caller}\t{AF:.2f}\t{cn:.2f}\t{FP_flag}") + writer.write(f"\n{genes}\t{chr}\t{start}-{end}\t{caller}\t{AF:.2f}\t{cn:.2f}\t{FP_flag}") counter += 1 - + for gene in gene_variant_dict: if len(gene_variant_dict[gene]) == 1: org_caller = gene_variant_dict[gene][0][3] @@ -306,7 +294,7 @@ def create_tsv_report( (gene_variant_dict[gene][0][1] >= start and gene_variant_dict[gene][0][1] <= end) or (gene_variant_dict[gene][0][2] >= end and gene_variant_dict[gene][0][2] <= start) ): - writer.write(f"\n{samples}\t{gene}\t{chr}\t{start}-{end}\t{new_caller}\t{AF:.2f}\t{cn:.2f}\t") + writer.write(f"\n{gene}\t{chr}\t{start}-{end}\t{new_caller}\t{AF:.2f}\t{cn:.2f}\t") log.info(f"Processed {counter} variants") deletions = open(input_del) @@ -324,7 +312,7 @@ def create_tsv_report( ccn = cn if TC > 0.0: ccn = round(2 + (cn - 2) * (1/float(TC)), 2) - writer.write(f"\n{sample_name}\t{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}\t") + writer.write(f"\n{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}\t") out_additional_only.write(f"\n{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}") amplifications = open(input_amp) header_list = next(amplifications).split("\t") @@ -342,7 +330,7 @@ def create_tsv_report( if TC > 0.0: ccn = round(2 + (cn - 2) * (1/float(TC)), 2) if ccn > amp_cn_limit: - writer.write(f"\n{sample_name}\t{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}\t") + writer.write(f"\n{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}\t") out_additional_only.write(f"\n{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}") with open(out_tsv_chrom_arms, "w") as writer: