Skip to content

Commit

Permalink
tests: corrected tests for tsv_report
Browse files Browse the repository at this point in the history
  • Loading branch information
jonca79 committed Jul 4, 2024
1 parent c223d50 commit ec1eff9
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 29 deletions.
13 changes: 7 additions & 6 deletions .tests/units/test_cnv_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ def test_create_tsv_report(self):
".tests/units/vcf/test.deletions.tsv",
".tests/units/vcf/test.amplifications.tsv",
".tests/integration/reference/chromosome_arm_size.tsv",
".tests/units/gatk_cnv/LI-VAL-42_T.clean.denoisedCR.tsv",
2.0,
cnv,
out_additional_only,
Expand All @@ -48,27 +49,27 @@ class TestCase:
testcases = [
TestCase(
name="header row",
expected=("sample", "gene(s)", "chrom", "region", "caller", "freq_in_db", "copy_number")
expected=("gene(s)", "chrom", "region", "caller", "freq_in_db", "copy_number", "FP_flag")
),
TestCase(
name="variant 1",
expected=("testSample_T", "FGFR1", "chr8", "34370199-43930232", "cnvkit", "0.01", "8.59")
expected=("FGFR1", "chr8", "34370199-43930232", "cnvkit", "0.01", "8.59")
),
TestCase(
name="variant 2",
expected=("testSample_T", "FGFR1,MYC", "chr8", "35008818-146144253", "gatk_cnv", "0.01", "7.01")
expected=("FGFR1,MYC", "chr8", "35008818-146144253", "gatk", "0.01", "7.01")
),
TestCase(
name="variant 3",
expected=("testSample_T", "MYC", "chr8", "46689525-146144003", "cnvkit", "0.09", "5.06")
expected=("MYC", "chr8", "46689525-146144003", "cnvkit", "0.09", "5.06")
),
TestCase(
name="small deletion",
expected=("testSample_T", "CDKN2A,CDKN2B", "chr9", "21968207-22008972", "small_deletion", "NA", "-0.28")
expected=("CDKN2A,CDKN2B", "chr9", "21968207-22008972", "small_deletion", "NA", "-0.28")
),
TestCase(
name="small amplification",
expected=("testSample_T", "MYCN", "chr2", "16968207-17008972", "small_amplification", "NA", "7.29")
expected=("MYCN", "chr2", "16968207-17008972", "small_amplification", "NA", "7.29")
),
]

Expand Down
2 changes: 1 addition & 1 deletion .tests/units/vcf/test.cnv1.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -62,4 +62,4 @@
##contig=<ID=chrY>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT testSample_T
chr8 34370199 . N <DUP> . . Genes=FGFR1;SVTYPE=DUP;END=43930232;SVLEN=9560034;LOG_ODDS_RATIO=2.10299;CALLER=cnvkit;CN=.;CORR_CN=8.59;PROBES=167;BAF=-1.1761;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|34370199;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:8.59|CNQ:167|DP:857.922|BAF:-1.1761;SVDB_INFO=.|SVTYPE:DUP|END:43930232|SVLEN:9560034|LOG_ODDS_RATIO:2.10299|CALLER:cnvkit|CN:NA|CORR_CN:8.59|PROBES:167|BAF:-1.1761;svdb_origin=testSample_T;Twist_OCC=3;Twist_AF=0.0103093 GT:CN:CNQ:DP:BAF 0/1:8.59:167:857.922:-1.1761
chr8 35008818 . N <DUP> . . Genes=FGFR1,MYC;SVTYPE=DUP;END=146144253;SVLEN=111135436;LOG_ODDS_RATIO=0.58604;CALLER=gatk_cnv;CN=3;CORR_CN=7.01;PROBES=233;BAF=0.366555;BAF_PROBES=41;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|35008818;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:3.0|CCN:7.01|CNQ:233|BAF:0.366555|BAFQ:41;SVDB_INFO=.|SVTYPE:DUP|END:146144253|SVLEN:111135436|LOG_ODDS_RATIO:0.586040|CALLER:gatk_cnv|CN:3.0|CORR_CN:7.01|PROBES:233|BAF:0.366555|BAF_PROBES:41;svdb_origin=testSample_T;Twist_OCC=4;Twist_AF=0.0137457 GT:CN:CCN:CNQ:BAF:BAFQ 0/1:3:7.01:233:0.366555:41
chr8 35008818 . N <DUP> . . Genes=FGFR1,MYC;SVTYPE=DUP;END=146144253;SVLEN=111135436;LOG_ODDS_RATIO=0.58604;CALLER=gatk;CN=3;CORR_CN=7.01;PROBES=233;BAF=0.366555;BAF_PROBES=41;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|35008818;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:3.0|CCN:7.01|CNQ:233|BAF:0.366555|BAFQ:41;SVDB_INFO=.|SVTYPE:DUP|END:146144253|SVLEN:111135436|LOG_ODDS_RATIO:0.586040|CALLER:gatk|CN:3.0|CORR_CN:7.01|PROBES:233|BAF:0.366555|BAF_PROBES:41;svdb_origin=testSample_T;Twist_OCC=4;Twist_AF=0.0137457 GT:CN:CCN:CNQ:BAF:BAFQ 0/1:3:7.01:233:0.366555:41
2 changes: 1 addition & 1 deletion .tests/units/vcf/test.cnv2.vcf
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,6 @@
##contig=<ID=chrY>
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT testSample_T
chr8 34370199 . N <DUP> . . Genes=FGFR1;SVTYPE=DUP;END=43930232;SVLEN=9560034;LOG_ODDS_RATIO=2.10299;CALLER=cnvkit;CN=.;CORR_CN=8.59;PROBES=167;BAF=-1.1761;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|34370199;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:8.59|CNQ:167|DP:857.922|BAF:-1.1761;SVDB_INFO=.|SVTYPE:DUP|END:43930232|SVLEN:9560034|LOG_ODDS_RATIO:2.10299|CALLER:cnvkit|CN:NA|CORR_CN:8.59|PROBES:167|BAF:-1.1761;svdb_origin=testSample_T;Twist_OCC=3;Twist_AF=0.0103093 GT:CN:CNQ:DP:BAF 0/1:8.59:167:857.922:-1.1761
chr8 35008818 . N <DUP> . . Genes=FGFR1,MYC;SVTYPE=DUP;END=146144253;SVLEN=111135436;LOG_ODDS_RATIO=0.58604;CALLER=gatk_cnv;CN=3;CORR_CN=7.01;PROBES=233;BAF=0.366555;BAF_PROBES=41;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|35008818;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:3.0|CCN:7.01|CNQ:233|BAF:0.366555|BAFQ:41;SVDB_INFO=.|SVTYPE:DUP|END:146144253|SVLEN:111135436|LOG_ODDS_RATIO:0.586040|CALLER:gatk_cnv|CN:3.0|CORR_CN:7.01|PROBES:233|BAF:0.366555|BAF_PROBES:41;svdb_origin=testSample_T;Twist_OCC=4;Twist_AF=0.0137457 GT:CN:CCN:CNQ:BAF:BAFQ 0/1:3:7.01:233:0.366555:41
chr8 35008818 . N <DUP> . . Genes=FGFR1,MYC;SVTYPE=DUP;END=146144253;SVLEN=111135436;LOG_ODDS_RATIO=0.58604;CALLER=gatk;CN=3;CORR_CN=7.01;PROBES=233;BAF=0.366555;BAF_PROBES=41;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|35008818;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:3.0|CCN:7.01|CNQ:233|BAF:0.366555|BAFQ:41;SVDB_INFO=.|SVTYPE:DUP|END:146144253|SVLEN:111135436|LOG_ODDS_RATIO:0.586040|CALLER:gatk|CN:3.0|CORR_CN:7.01|PROBES:233|BAF:0.366555|BAF_PROBES:41;svdb_origin=testSample_T;Twist_OCC=4;Twist_AF=0.0137457 GT:CN:CCN:CNQ:BAF:BAFQ 0/1:3:7.01:233:0.366555:41
chr8 46689525 . N <DUP> . . Genes=MYC;SVTYPE=DUP;END=146144003;SVLEN=99454479;LOG_ODDS_RATIO=1.82064;CALLER=cnvkit;CN=.;CORR_CN=5.06;PROBES=991;BAF=-0.252241;set=Intersection;SVDB_CHROM=.|chr8;SVDB_POS=.|46689525;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/1|CN:7.06|CNQ:991|DP:287.108|BAF:-0.252241;SVDB_INFO=.|SVTYPE:DUP|END:146144003|SVLEN:99454479|LOG_ODDS_RATIO:1.82064|CALLER:cnvkit|CN:NA|CORR_CN:7.06|PROBES:991|BAF:-0.252241;svdb_origin=testSample_T;Twist_OCC=25;Twist_AF=0.0859107 GT:CN:CNQ:DP:BAF 0/1:7.06:991:287.108:-0.252241
chr9 1206037 . N <COPY_NORMAL> . . Genes=CDKN2A,CDKN2B,TSC1;SVTYPE=COPY_NORMAL;END=140509860;SVLEN=139303824;LOG_ODDS_RATIO=0.015690;CALLER=cnvkit;CN=2.02;CORR_CN=2.09;PROBES=640;BAF=0.386991;BAF_PROBES=79;set=Intersection;SVDB_CHROM=.|chr9;SVDB_POS=.|1206037;SVDB_QUAL=.|.;SVDB_FILTERS=.|.;SVDB_SAMPLES=.|testSample_T|GT:0/0|CN:2.02|CCN:2.09|CNQ:640|BAF:0.386991|BAFQ:79;SVDB_INFO=.|SVTYPE:COPY_NORMAL|END:140509860|SVLEN:139303824|LOG_ODDS_RATIO:0.015690|CALLER:gatk|CN:2.02|CORR_CN:2.09|PROBES:640|BAF:0.386991|BAF_PROBES:79;svdb_origin=testSample_T GT:CN:CCN:CNQ:BAF:BAFQ 0/0:2.02:2.09:640:0.386991:79
30 changes: 9 additions & 21 deletions workflow/scripts/cnv_report.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ def check_fp(chrom, start, end, gatk_cnr_dict, cn):
while j1 < j2:
cnv_region["surrounding_region"].append(gatk_data[j1][2])
j1 += 1


median_surrounding_region = statistics.median(cnv_region["surrounding_region"])
stdev_surrounding_region = 0.0
Expand All @@ -45,8 +44,6 @@ def check_fp(chrom, start, end, gatk_cnr_dict, cn):
if len(cnv_region["region"]) >= 3:
stdev_region = statistics.stdev(cnv_region["region"])

print(chrom, start, end, cn, median_region, stdev_region, median_surrounding_region, stdev_surrounding_region)

if cn < 1.5 and median_region > -0.2 and median_region < 0.2:
if median_region + stdev_region > median_surrounding_region - stdev_surrounding_region:
FP_flag = "FP?"
Expand Down Expand Up @@ -103,9 +100,8 @@ def create_tsv_report(
gene_all_dict = {}
nr_writes = 0
log.info(f"Opening output tsv file: {output_txt}")
sample_name = ""
with open(output_txt, "w") as writer:
writer.write("sample\tgene(s)\tchrom\tregion\tcaller\tfreq_in_db\tcopy_number\tFP_flag")
writer.write("gene(s)\tchrom\tregion\tcaller\tfreq_in_db\tcopy_number\tFP_flag")
out_additional_only.write("gene(s)\tchrom\tregion\tcaller\tfreq_in_db\tcopy_number")
file1 = True
for input_org_vcf in input_org_vcfs:
Expand All @@ -118,9 +114,6 @@ def create_tsv_report(
samples = list(variants.header.samples)
if len(samples) > 1:
raise Exception(f"Unable to process vcf with more then one sample: {samples}")
else:
samples = samples[0]
sample_name = samples
for variant in variants:
genes = utils.get_annotation_data_info(variant, "Genes")
log.debug(f"Processing variant: {variant}")
Expand Down Expand Up @@ -226,7 +219,7 @@ def create_tsv_report(
if nr_writes < 2:
avg_cn = ((del_1p19q["1p_cnvkit"][1] + del_1p19q["19q_cnvkit"][1]) /
(del_1p19q["1p_cnvkit"][0] + del_1p19q["19q_cnvkit"][0]))
writer.write(f"\n{samples}\t1p19q\t1p19q\t")
writer.write(f"\n1p19q\t1p19q\t")
writer.write(f"{fraction_1p_cnvkit*100:.0f}%,{fraction_19q_cnvkit*100:.0f}%")
writer.write(f"\tcnvkit\tNA\t{avg_cn:.2f}\t")
out_additional_only.write(f"\n1p19q\t1p19q\tNA\tcnvkit\tNA\tNA")
Expand All @@ -237,8 +230,7 @@ def create_tsv_report(
if nr_writes < 2:
avg_cn = ((del_1p19q["1p_gatkcnv"][1] + del_1p19q["19q_gatkcnv"][1]) /
(del_1p19q["1p_gatkcnv"][0] + del_1p19q["19q_gatkcnv"][0]))
writer.write(f"\n{samples}\t1p19q\tNA\tNA\tgatk_cnv\tNA\tNA\t")
writer.write(f"\n{samples}\t1p19q\t1p19q\t")
writer.write(f"\n1p19q\t1p19q\t")
writer.write(f"{fraction_1p_gatkcnv*100:.0f}%,{fraction_19q_gatkcnv*100:.0f}%")
writer.write(f"\tgatk_cnv\tNA\t{avg_cn:.2f}\t")
out_additional_only.write(f"\nt1p19q\tNA\tNA\tgatk_cnv\tNA\tNA")
Expand All @@ -253,8 +245,6 @@ def create_tsv_report(
samples = list(variants.header.samples)
if len(samples) > 1:
raise Exception(f"Unable to process vcf with more then one sample: {samples}")
else:
samples = samples[0]
counter = 0
for variant in variants:
genes = utils.get_annotation_data_info(variant, "Genes")
Expand All @@ -275,20 +265,18 @@ def create_tsv_report(
gene_variant_dict[gene] = []
gene_variant_dict[gene].append([chr, start, end, caller, cn, AF])
if gene in gene_all_dict:
nr_callers = {"cnvkit": 0, "gatk" : 0}
nr_callers = {"cnvkit": 0, "gatk": 0}
for cnv in gene_all_dict[gene]:
if cnv[4] > 2.5 or cnv[4] < 1.5:
nr_callers[cnv[3]] += 1
if nr_callers["cnvkit"] > 0 and nr_callers["gatk"] > 0:
both_callers = True
FP_flag = ""
if caller == "cnvkit":
print(both_callers, chr, start, end, cn)
if caller == "cnvkit" and not both_callers:
FP_flag = check_fp(chr, start, end, gatk_cnr_dict, cn)
writer.write(f"\n{samples}\t{genes}\t{chr}\t{start}-{end}\t{caller}\t{AF:.2f}\t{cn:.2f}\t{FP_flag}")
writer.write(f"\n{genes}\t{chr}\t{start}-{end}\t{caller}\t{AF:.2f}\t{cn:.2f}\t{FP_flag}")
counter += 1

for gene in gene_variant_dict:
if len(gene_variant_dict[gene]) == 1:
org_caller = gene_variant_dict[gene][0][3]
Expand All @@ -306,7 +294,7 @@ def create_tsv_report(
(gene_variant_dict[gene][0][1] >= start and gene_variant_dict[gene][0][1] <= end) or
(gene_variant_dict[gene][0][2] >= end and gene_variant_dict[gene][0][2] <= start)
):
writer.write(f"\n{samples}\t{gene}\t{chr}\t{start}-{end}\t{new_caller}\t{AF:.2f}\t{cn:.2f}\t")
writer.write(f"\n{gene}\t{chr}\t{start}-{end}\t{new_caller}\t{AF:.2f}\t{cn:.2f}\t")
log.info(f"Processed {counter} variants")

deletions = open(input_del)
Expand All @@ -324,7 +312,7 @@ def create_tsv_report(
ccn = cn
if TC > 0.0:
ccn = round(2 + (cn - 2) * (1/float(TC)), 2)
writer.write(f"\n{sample_name}\t{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}\t")
writer.write(f"\n{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}\t")
out_additional_only.write(f"\n{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}")
amplifications = open(input_amp)
header_list = next(amplifications).split("\t")
Expand All @@ -342,7 +330,7 @@ def create_tsv_report(
if TC > 0.0:
ccn = round(2 + (cn - 2) * (1/float(TC)), 2)
if ccn > amp_cn_limit:
writer.write(f"\n{sample_name}\t{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}\t")
writer.write(f"\n{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}\t")
out_additional_only.write(f"\n{gene}\t{chr}\t{start}-{end}\t{caller}\t{AF}\t{ccn:.2f}")

with open(out_tsv_chrom_arms, "w") as writer:
Expand Down

0 comments on commit ec1eff9

Please sign in to comment.