diff --git a/R/snv_merge_and_vaf_calc.R b/R/snv_merge_and_vaf_calc.R index cf961c8..654da64 100755 --- a/R/snv_merge_and_vaf_calc.R +++ b/R/snv_merge_and_vaf_calc.R @@ -86,6 +86,7 @@ if (length(grep("\\.gz", muts_vcf)) > 0) { } else { num_snvs = system(paste("grep -cv \"^#\" ", muts_vcf, "|| true"), intern = TRUE) } +num_snvs = as.integer(num_snvs) num_indels = 0 if (indel_vcf != '-') { @@ -96,6 +97,7 @@ if (indel_vcf != '-') { num_indels = system(paste("grep -cv \"^#\" ", indel_vcf, "|| true"), intern = TRUE) } } +num_indels = as.integer(num_indels) col_t = c("character","numeric","character","character","character","character","character") if (num_snvs == 0) { @@ -245,8 +247,8 @@ if (num_snvs > 0) { snvs_new2[new_row, "BEND"] = paste(snvs_tmp[, "BEND"], collapse = ",") snvs_new2[new_row, "TRI"] = snvs_tmp[1, "TRI"] snvs_new2[new_row, "QPOS"] = paste(snvs_tmp[, "QPOS"], collapse = ",") - snvs_new2[new_row, "DEPTH_FWD"] = median(snvs_tmp[, "DEPTH_FWD"]) - snvs_new2[new_row, "DEPTH_REV"] = median(snvs_tmp[, "DEPTH_REV"]) + snvs_new2[new_row, "DEPTH_FWD"] = median(as.numeric(snvs_tmp[, "DEPTH_FWD"])) + snvs_new2[new_row, "DEPTH_REV"] = median(as.numeric(snvs_tmp[, "DEPTH_REV"])) snvs_new2[new_row, "DEPTH_NORM_FWD"] = snvs_tmp[1, "DEPTH_NORM_FWD"] snvs_new2[new_row, "DEPTH_NORM_REV"] = snvs_tmp[1, "DEPTH_NORM_REV"] snvs_new2[new_row, "TIMES_CALLED"] = freq @@ -370,13 +372,26 @@ if (num_indels > 0) { indels_new[new_row, "MQ"] = indels_tmp[1, "MQ"] indels_new[new_row, "DP4"] = indels_tmp[1, "DP4"] indels_new[new_row, "mut_id"] = indels_tmp[1, "mut_id"] + + indels_new[new_row, "BBEG"] = paste(indels_tmp[, "BBEG"], collapse = ",") + indels_new[new_row, "BEND"] = paste(indels_tmp[, "BEND"], collapse = ",") + indels_new[new_row, "QPOS"] = paste(indels_tmp[, "QPOS"], collapse = ",") + indels_new[new_row, "DEPTH_FWD"] = median(as.numeric(indels_tmp[, "DEPTH_FWD"])) + indels_new[new_row, "DEPTH_REV"] = median(as.numeric(indels_tmp[, "DEPTH_REV"])) + indels_new[new_row, "DEPTH_NORM_FWD"] = indels_tmp[1, "DEPTH_NORM_FWD"] + indels_new[new_row, "DEPTH_NORM_REV"] = indels_tmp[1, "DEPTH_NORM_REV"] + indels_new[new_row, "DPLX_ASXS"] = paste(indels_tmp[, "DPLX_ASXS"], collapse = ",") + indels_new[new_row, "DPLX_CLIP"] = paste(indels_tmp[, "DPLX_CLIP"], collapse = ",") + indels_new[new_row, "DPLX_NM"] = paste(indels_tmp[, "DPLX_NM"], collapse = ",") + indels_new[new_row, "BULK_ASXS"] = paste(indels_tmp[, "BULK_ASXS"], collapse = ",") + indels_new[new_row, "BULK_NM"] = paste(indels_tmp[, "BULK_NM"], collapse = ",") } } indels_new = indels_new[order(indels_new$chr, indels_new$pos),] # drop some columns: - indels_new = indels_new[, c("chr", "pos", "kk", "ref", "mut", "qual", "filter", "rb_id", "TYPE", "TIMES_CALLED", "SEQ")] + indels_new = indels_new[, c("chr", "pos", "kk", "ref", "mut", "qual", "filter", "rb_id", "TYPE", "TIMES_CALLED", "SEQ","BBEG","BEND","QPOS","DEPTH_FWD","DEPTH_REV","DEPTH_NORM_FWD","DEPTH_NORM_REV","DPLX_ASXS","DPLX_CLIP","DPLX_NM","BULK_ASXS","BULK_NM")] ########################################################################################## # Calculate VAFs for indels @@ -468,6 +483,10 @@ if (num_indels > 0) { indels_final$INFO = paste(indels_final$INFO, rep("DUPLEX_VAF=", nrow(indels_final)), indels_final$DUPLEX_VAF, ";", sep = "") indels_final$INFO = paste(indels_final$INFO, rep("BAM_VAF=", nrow(indels_final)), indels_final$BAM_VAF, ";", sep = "") indels_final$INFO = paste(indels_final$INFO, rep("BAM_VAF_BQ10=", nrow(indels_final)), indels_final$BAM_VAF_BQ10, ";", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_NORM_FWD=", nrow(indels_final)), indels_final$DEPTH_NORM_FWD, ";", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_NORM_REV=", nrow(indels_final)), indels_final$DEPTH_NORM_REV, ";", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_FWD=", nrow(indels_final)), indels_final$DEPTH_FWD, ";", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_REV=", nrow(indels_final)), indels_final$DEPTH_REV, ";", sep = "") indels_final$INFO = paste(indels_final$INFO, rep("SEQ=", nrow(indels_final)), indels_final$SEQ, ";", sep = "") indels_final$INFO = paste(indels_final$INFO, rep("DUPLEX_COV=", nrow(indels_final)), indels_final$DUPLEX_COV, ";", sep = "") indels_final$INFO = paste(indels_final$INFO, rep("BAM_MUT=", nrow(indels_final)), indels_final$BAM_MUT, ";", sep = "") @@ -475,6 +494,12 @@ if (num_indels > 0) { indels_final$INFO = paste(indels_final$INFO, rep("BAM_MUT_BQ10=", nrow(indels_final)), indels_final$BAM_MUT_BQ10, ";", sep = "") indels_final$INFO = paste(indels_final$INFO, rep("BAM_COV_BQ10=", nrow(indels_final)), indels_final$BAM_COV_BQ10, ";", sep = "") indels_final$INFO = paste(indels_final$INFO, rep("RB=", nrow(indels_final)), indels_final$rb_id, "", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("QPOS=", nrow(indels_final)), indels_final$QPOS, ";", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("DPLX_ASXS=", nrow(indels_final)), indels_final$DPLX_ASXS, ";", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("DPLX_CLIP=", nrow(indels_final)), indels_final$DPLX_CLIP, ";", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("DPLX_NM=", nrow(indels_final)), indels_final$DPLX_NM, ";", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("BULK_ASXS=", nrow(indels_final)), indels_final$BULK_ASXS, ";", sep = "") + indels_final$INFO = paste(indels_final$INFO, rep("BULK_NM=", nrow(indels_final)), indels_final$BULK_NM, "", sep = "") } # Describe in header: diff --git a/perl/indelCaller_step1.pl b/perl/indelCaller_step1.pl index bff32b6..18672d9 100755 --- a/perl/indelCaller_step1.pl +++ b/perl/indelCaller_step1.pl @@ -109,8 +109,9 @@ my $bulk_rev = $bulkReverseTotal; next if($bulk_fwd + $bulk_rev < $bulk_min_cov); # Bulk minimum coverage next if($dplxCLIP > $max_clip); - next if($dplxNM > 20); # made very liberal to allow long indels. Check the impact! + next if($dplxNM > 20); # made very liberal to allow long indels. Check the impact! --> it seems is working, good next if($dplxASXS < $min_asxs || $bulkASXS < $min_asxs ); #fa8: fixed bug, we needed to check AS-XS for the bulk too + # Fixed but: from bitwise OR to logical OR (ainsss) if($r1 >= $min_size_subfam && $r2 >= $min_size_subfam) { my $bulktotal = $bulkForwardTotal+$bulkReverseTotal; @@ -152,7 +153,17 @@ $signature_trinuc = &reverse_signature($signature_trinuc); } $site_tags .= "$signature_trinuc;SW=$shearwater;cSNP=$commonSNP"; - + $site_tags .= ";BBEG=$dplxBreakpointBeg"; + $site_tags .= ";BEND=$dplxBreakpointEnd"; + $site_tags .= ";DEPTH_FWD=$r1"; + $site_tags .= ";DEPTH_REV=$r2"; + $site_tags .= ";DEPTH_NORM_FWD=$bulkForwardTotal"; + $site_tags .= ";DEPTH_NORM_REV=$bulkReverseTotal"; + $site_tags .= ";DPLX_ASXS=$dplxASXS"; + $site_tags .= ";DPLX_CLIP=$dplxCLIP"; + $site_tags .= ";DPLX_NM=$dplxNM"; + $site_tags .= ";BULK_ASXS=$bulkASXS"; + $site_tags .= ";BULK_NM=$bulkNM"; # If seen in the bulk, flag it: if($bulkForwardIndel+$bulkReverseIndel > $max_vaf * $bulktotal) { $site_tags .= ";BULK_SEEN($dplxForwardIndel+$dplxReverseIndel/$bulktotal)"; diff --git a/perl/indelCaller_step2.pl b/perl/indelCaller_step2.pl index 682dbcd..141a9d8 100755 --- a/perl/indelCaller_step2.pl +++ b/perl/indelCaller_step2.pl @@ -114,6 +114,7 @@ sub cmp_chr { sub runCmd { my $cmd = $_[0]; + print STDERR "$cmd\n"; my ($stdout, $stderr, $exit) = capture { system($cmd); }; @@ -126,11 +127,17 @@ sub runCmd { my %ignore_indels_at; while() { # input bed files are generated by indelCaller.pl [step 1] my($chr,$pos0,$pos1,$info) = (split(/\t/,$_ ))[0,1,2,3] ; - my($rb_id,$dp,$qpos,$context,$sw,$snp) = (split(/;/,$info))[0,1,2,3,4,5]; + my($rb_id,$dp,$qpos,$context,$sw,$snp, + $bbeg,$bend,$depth_fwd,$depth_rev, + $depth_norm_fwd,$depth_norm_rev, + $dplxASXS,$dplxCLIP,$dplxNM, + $bulkASXS,$bulkNM) = (split(/;/,$info))[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]; if( /BULK_SEEN/ ) { #filter if site comes from the bulk sample $ignore_indels_at{$chr}->{$pos1} = 1; next; } + + chomp; #my @tmp = split(/[:\-\|]/,$rb_id); my @tmp1 = split(/[:\-]/,$rb_id); @@ -142,11 +149,22 @@ sub runCmd { $rb_id = "@tmp"; $sw =~ s/SW=//; $snp =~ s/cSNP=//; - $indels{$rb_id}->{$pos1}->{"dp" } = $dp ; - $indels{$rb_id}->{$pos1}->{"qpos" } = $qpos ; - $indels{$rb_id}->{$pos1}->{"context"} = $context; - $indels{$rb_id}->{$pos1}->{"sw" } = $sw ; - $indels{$rb_id}->{$pos1}->{"snp" } = $snp ; + $indels{$rb_id}->{$pos1}->{"dp" } = $dp ; + $indels{$rb_id}->{$pos1}->{"qpos" } = $qpos ; + $indels{$rb_id}->{$pos1}->{"context" } = $context ; + $indels{$rb_id}->{$pos1}->{"sw" } = $sw ; + $indels{$rb_id}->{$pos1}->{"snp" } = $snp ; + $indels{$rb_id}->{$pos1}->{"bbeg" } = $bbeg ; + $indels{$rb_id}->{$pos1}->{"bend" } = $bend ; + $indels{$rb_id}->{$pos1}->{"depth_fwd" } = $depth_fwd ; + $indels{$rb_id}->{$pos1}->{"depth_rev" } = $depth_rev ; + $indels{$rb_id}->{$pos1}->{"depth_norm_fwd"} = $depth_norm_fwd ; + $indels{$rb_id}->{$pos1}->{"depth_norm_rev"} = $depth_norm_rev ; + $indels{$rb_id}->{$pos1}->{"dplxASXS" } = $dplxASXS ; + $indels{$rb_id}->{$pos1}->{"dplxCLIP" } = $dplxCLIP ; + $indels{$rb_id}->{$pos1}->{"dplxNM" } = $dplxNM ; + $indels{$rb_id}->{$pos1}->{"bulkASXS" } = $bulkASXS ; + $indels{$rb_id}->{$pos1}->{"bulkNM" } = $bulkNM ; $counts++; } print STDOUT "$counts indel sites seen\n"; @@ -212,6 +230,7 @@ sub runCmd { $get_header = 1; } print STDOUT "\nChecking result\n"; + print STDOUT "Step 7...\n"; open(IN_TMP, "<$tempdir/$out_name.tmp2.vcf") or die "Couldn't open $tempdir/$out_name.tmp2.vcf\n"; my @info = (); my $count = 0; @@ -224,6 +243,19 @@ sub runCmd { if ($once and /^##FORMAT/){ $_ = "##FILTER=\n" . $_; $_ = "##INFO=\n" . $_ ; +##INFO= + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; + $_ = "##INFO=\n" . $_ ; $once = 0; } if ( /^#CHROM/ ){ #use a simple sample name, without path @@ -249,8 +281,23 @@ sub runCmd { next; } print "RESULT: $_\n"; - my @fields = split(/\t/,$_); - $fields[7] = "$fields[7];RB=$rb_id"; + my @fields = split(/\t/,$_); + print STDERR "fields=@fields\n"; + my $anyisok = (keys(%{$indels{$rb_id}}))[0]; + my $qpos = $indels{$rb_id}->{$anyisok}->{"qpos" }; + my $bbeg = $indels{$rb_id}->{$anyisok}->{"bbeg" }; + my $bend = $indels{$rb_id}->{$anyisok}->{"bend" }; + my $depth_fwd = $indels{$rb_id}->{$anyisok}->{"depth_fwd" }; + my $depth_rev = $indels{$rb_id}->{$anyisok}->{"depth_rev" }; + my $depth_norm_fwd = $indels{$rb_id}->{$anyisok}->{"depth_norm_fwd"}; + my $depth_norm_rev = $indels{$rb_id}->{$anyisok}->{"depth_norm_rev"}; + my $dplxASXS = $indels{$rb_id}->{$anyisok}->{"dplxASXS" }; + my $dplxCLIP = $indels{$rb_id}->{$anyisok}->{"dplxCLIP" }; + my $dplxNM = $indels{$rb_id}->{$anyisok}->{"dplxNM" }; + my $bulkASXS = $indels{$rb_id}->{$anyisok}->{"bulkASXS" }; + my $bulkNM = $indels{$rb_id}->{$anyisok}->{"bulkNM" }; + #$fields[7] = "$fields[7];QPOS=$qpos;RB=$rb_id;BBEG=$bbeg;BEND=$bend;DEPTH_FWD=$depth_fwd;DEPTH_REV=$depth_rev;DEPTH_NORM_FWD=$depth_norm_fwd;DEPTH_NORM_REV=$depth_norm_rev;DPLX_ASXS=$dplxASXS;DPLX_CLIP=$dplxCLIP;DPLX_NM=$dplxNM;BULK_ASXS=$bulkASXS;BULK_NM=$bulkNM"; + $fields[7] = "$fields[7];$qpos;RB=$rb_id;$bbeg;$bend;$depth_fwd;$depth_rev;$depth_norm_fwd;$depth_norm_rev;$dplxASXS;$dplxCLIP;$dplxNM;$bulkASXS;$bulkNM"; $fields[6] = "PASS"; # If there is overlap with SW or cSNP, flag it as MASKED # fa8: Do this only when the overlap is significant? If not, very long deletions are likely to overlap @@ -265,6 +312,7 @@ sub runCmd { $fields[6] = "MASKED"; } $info[$count] = join "\t",@fields; + print STDERR "$count: @fields\n"; # fa8: check if it's in the list of ignore_indels_at{chr}->{pos} for(my $i=$fields[1]; $i<= ($fields[1] + length($fields[3])); $i++) { if(exists($ignore_indels_at{$fields[0]}->{$i})) { @@ -287,6 +335,7 @@ sub runCmd { # to: # my $best_count = 0; } +print STDOUT "Step 8...\n"; if ( $header eq "") { #minimal header for an empty vcf print OUT "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$sample_name\n"; OUT->flush();