Skip to content

Commit

Permalink
Merge pull request #66 from cancerit/test_indels
Browse files Browse the repository at this point in the history
Test indels
  • Loading branch information
fa8sanger authored Sep 14, 2023
2 parents 47fc158 + 785d49d commit 8da01e1
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 13 deletions.
31 changes: 28 additions & 3 deletions R/snv_merge_and_vaf_calc.R
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ if (length(grep("\\.gz", muts_vcf)) > 0) {
} else {
num_snvs = system(paste("grep -cv \"^#\" ", muts_vcf, "|| true"), intern = TRUE)
}
num_snvs = as.integer(num_snvs)

num_indels = 0
if (indel_vcf != '-') {
Expand All @@ -96,6 +97,7 @@ if (indel_vcf != '-') {
num_indels = system(paste("grep -cv \"^#\" ", indel_vcf, "|| true"), intern = TRUE)
}
}
num_indels = as.integer(num_indels)

col_t = c("character","numeric","character","character","character","character","character")
if (num_snvs == 0) {
Expand Down Expand Up @@ -245,8 +247,8 @@ if (num_snvs > 0) {
snvs_new2[new_row, "BEND"] = paste(snvs_tmp[, "BEND"], collapse = ",")
snvs_new2[new_row, "TRI"] = snvs_tmp[1, "TRI"]
snvs_new2[new_row, "QPOS"] = paste(snvs_tmp[, "QPOS"], collapse = ",")
snvs_new2[new_row, "DEPTH_FWD"] = median(snvs_tmp[, "DEPTH_FWD"])
snvs_new2[new_row, "DEPTH_REV"] = median(snvs_tmp[, "DEPTH_REV"])
snvs_new2[new_row, "DEPTH_FWD"] = median(as.numeric(snvs_tmp[, "DEPTH_FWD"]))
snvs_new2[new_row, "DEPTH_REV"] = median(as.numeric(snvs_tmp[, "DEPTH_REV"]))
snvs_new2[new_row, "DEPTH_NORM_FWD"] = snvs_tmp[1, "DEPTH_NORM_FWD"]
snvs_new2[new_row, "DEPTH_NORM_REV"] = snvs_tmp[1, "DEPTH_NORM_REV"]
snvs_new2[new_row, "TIMES_CALLED"] = freq
Expand Down Expand Up @@ -370,13 +372,26 @@ if (num_indels > 0) {
indels_new[new_row, "MQ"] = indels_tmp[1, "MQ"]
indels_new[new_row, "DP4"] = indels_tmp[1, "DP4"]
indels_new[new_row, "mut_id"] = indels_tmp[1, "mut_id"]

indels_new[new_row, "BBEG"] = paste(indels_tmp[, "BBEG"], collapse = ",")
indels_new[new_row, "BEND"] = paste(indels_tmp[, "BEND"], collapse = ",")
indels_new[new_row, "QPOS"] = paste(indels_tmp[, "QPOS"], collapse = ",")
indels_new[new_row, "DEPTH_FWD"] = median(as.numeric(indels_tmp[, "DEPTH_FWD"]))
indels_new[new_row, "DEPTH_REV"] = median(as.numeric(indels_tmp[, "DEPTH_REV"]))
indels_new[new_row, "DEPTH_NORM_FWD"] = indels_tmp[1, "DEPTH_NORM_FWD"]
indels_new[new_row, "DEPTH_NORM_REV"] = indels_tmp[1, "DEPTH_NORM_REV"]
indels_new[new_row, "DPLX_ASXS"] = paste(indels_tmp[, "DPLX_ASXS"], collapse = ",")
indels_new[new_row, "DPLX_CLIP"] = paste(indels_tmp[, "DPLX_CLIP"], collapse = ",")
indels_new[new_row, "DPLX_NM"] = paste(indels_tmp[, "DPLX_NM"], collapse = ",")
indels_new[new_row, "BULK_ASXS"] = paste(indels_tmp[, "BULK_ASXS"], collapse = ",")
indels_new[new_row, "BULK_NM"] = paste(indels_tmp[, "BULK_NM"], collapse = ",")
}
}

indels_new = indels_new[order(indels_new$chr, indels_new$pos),]

# drop some columns:
indels_new = indels_new[, c("chr", "pos", "kk", "ref", "mut", "qual", "filter", "rb_id", "TYPE", "TIMES_CALLED", "SEQ")]
indels_new = indels_new[, c("chr", "pos", "kk", "ref", "mut", "qual", "filter", "rb_id", "TYPE", "TIMES_CALLED", "SEQ","BBEG","BEND","QPOS","DEPTH_FWD","DEPTH_REV","DEPTH_NORM_FWD","DEPTH_NORM_REV","DPLX_ASXS","DPLX_CLIP","DPLX_NM","BULK_ASXS","BULK_NM")]

##########################################################################################
# Calculate VAFs for indels
Expand Down Expand Up @@ -468,13 +483,23 @@ if (num_indels > 0) {
indels_final$INFO = paste(indels_final$INFO, rep("DUPLEX_VAF=", nrow(indels_final)), indels_final$DUPLEX_VAF, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("BAM_VAF=", nrow(indels_final)), indels_final$BAM_VAF, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("BAM_VAF_BQ10=", nrow(indels_final)), indels_final$BAM_VAF_BQ10, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_NORM_FWD=", nrow(indels_final)), indels_final$DEPTH_NORM_FWD, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_NORM_REV=", nrow(indels_final)), indels_final$DEPTH_NORM_REV, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_FWD=", nrow(indels_final)), indels_final$DEPTH_FWD, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("DEPTH_REV=", nrow(indels_final)), indels_final$DEPTH_REV, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("SEQ=", nrow(indels_final)), indels_final$SEQ, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("DUPLEX_COV=", nrow(indels_final)), indels_final$DUPLEX_COV, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("BAM_MUT=", nrow(indels_final)), indels_final$BAM_MUT, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("BAM_COV=", nrow(indels_final)), indels_final$BAM_COV, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("BAM_MUT_BQ10=", nrow(indels_final)), indels_final$BAM_MUT_BQ10, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("BAM_COV_BQ10=", nrow(indels_final)), indels_final$BAM_COV_BQ10, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("RB=", nrow(indels_final)), indels_final$rb_id, "", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("QPOS=", nrow(indels_final)), indels_final$QPOS, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("DPLX_ASXS=", nrow(indels_final)), indels_final$DPLX_ASXS, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("DPLX_CLIP=", nrow(indels_final)), indels_final$DPLX_CLIP, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("DPLX_NM=", nrow(indels_final)), indels_final$DPLX_NM, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("BULK_ASXS=", nrow(indels_final)), indels_final$BULK_ASXS, ";", sep = "")
indels_final$INFO = paste(indels_final$INFO, rep("BULK_NM=", nrow(indels_final)), indels_final$BULK_NM, "", sep = "")
}

# Describe in header:
Expand Down
15 changes: 13 additions & 2 deletions perl/indelCaller_step1.pl
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,9 @@
my $bulk_rev = $bulkReverseTotal;
next if($bulk_fwd + $bulk_rev < $bulk_min_cov); # Bulk minimum coverage
next if($dplxCLIP > $max_clip);
next if($dplxNM > 20); # made very liberal to allow long indels. Check the impact!
next if($dplxNM > 20); # made very liberal to allow long indels. Check the impact! --> it seems is working, good
next if($dplxASXS < $min_asxs || $bulkASXS < $min_asxs ); #fa8: fixed bug, we needed to check AS-XS for the bulk too
# Fixed but: from bitwise OR to logical OR (ainsss)

if($r1 >= $min_size_subfam && $r2 >= $min_size_subfam) {
my $bulktotal = $bulkForwardTotal+$bulkReverseTotal;
Expand Down Expand Up @@ -152,7 +153,17 @@
$signature_trinuc = &reverse_signature($signature_trinuc);
}
$site_tags .= "$signature_trinuc;SW=$shearwater;cSNP=$commonSNP";

$site_tags .= ";BBEG=$dplxBreakpointBeg";
$site_tags .= ";BEND=$dplxBreakpointEnd";
$site_tags .= ";DEPTH_FWD=$r1";
$site_tags .= ";DEPTH_REV=$r2";
$site_tags .= ";DEPTH_NORM_FWD=$bulkForwardTotal";
$site_tags .= ";DEPTH_NORM_REV=$bulkReverseTotal";
$site_tags .= ";DPLX_ASXS=$dplxASXS";
$site_tags .= ";DPLX_CLIP=$dplxCLIP";
$site_tags .= ";DPLX_NM=$dplxNM";
$site_tags .= ";BULK_ASXS=$bulkASXS";
$site_tags .= ";BULK_NM=$bulkNM";
# If seen in the bulk, flag it:
if($bulkForwardIndel+$bulkReverseIndel > $max_vaf * $bulktotal) {
$site_tags .= ";BULK_SEEN($dplxForwardIndel+$dplxReverseIndel/$bulktotal)";
Expand Down
65 changes: 57 additions & 8 deletions perl/indelCaller_step2.pl
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ sub cmp_chr {

sub runCmd {
my $cmd = $_[0];
print STDERR "$cmd\n";
my ($stdout, $stderr, $exit) = capture {
system($cmd);
};
Expand All @@ -126,11 +127,17 @@ sub runCmd {
my %ignore_indels_at;
while(<IN>) { # input bed files are generated by indelCaller.pl [step 1]
my($chr,$pos0,$pos1,$info) = (split(/\t/,$_ ))[0,1,2,3] ;
my($rb_id,$dp,$qpos,$context,$sw,$snp) = (split(/;/,$info))[0,1,2,3,4,5];
my($rb_id,$dp,$qpos,$context,$sw,$snp,
$bbeg,$bend,$depth_fwd,$depth_rev,
$depth_norm_fwd,$depth_norm_rev,
$dplxASXS,$dplxCLIP,$dplxNM,
$bulkASXS,$bulkNM) = (split(/;/,$info))[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16];
if( /BULK_SEEN/ ) { #filter if site comes from the bulk sample
$ignore_indels_at{$chr}->{$pos1} = 1;
next;
}


chomp;
#my @tmp = split(/[:\-\|]/,$rb_id);
my @tmp1 = split(/[:\-]/,$rb_id);
Expand All @@ -142,11 +149,22 @@ sub runCmd {
$rb_id = "@tmp";
$sw =~ s/SW=//;
$snp =~ s/cSNP=//;
$indels{$rb_id}->{$pos1}->{"dp" } = $dp ;
$indels{$rb_id}->{$pos1}->{"qpos" } = $qpos ;
$indels{$rb_id}->{$pos1}->{"context"} = $context;
$indels{$rb_id}->{$pos1}->{"sw" } = $sw ;
$indels{$rb_id}->{$pos1}->{"snp" } = $snp ;
$indels{$rb_id}->{$pos1}->{"dp" } = $dp ;
$indels{$rb_id}->{$pos1}->{"qpos" } = $qpos ;
$indels{$rb_id}->{$pos1}->{"context" } = $context ;
$indels{$rb_id}->{$pos1}->{"sw" } = $sw ;
$indels{$rb_id}->{$pos1}->{"snp" } = $snp ;
$indels{$rb_id}->{$pos1}->{"bbeg" } = $bbeg ;
$indels{$rb_id}->{$pos1}->{"bend" } = $bend ;
$indels{$rb_id}->{$pos1}->{"depth_fwd" } = $depth_fwd ;
$indels{$rb_id}->{$pos1}->{"depth_rev" } = $depth_rev ;
$indels{$rb_id}->{$pos1}->{"depth_norm_fwd"} = $depth_norm_fwd ;
$indels{$rb_id}->{$pos1}->{"depth_norm_rev"} = $depth_norm_rev ;
$indels{$rb_id}->{$pos1}->{"dplxASXS" } = $dplxASXS ;
$indels{$rb_id}->{$pos1}->{"dplxCLIP" } = $dplxCLIP ;
$indels{$rb_id}->{$pos1}->{"dplxNM" } = $dplxNM ;
$indels{$rb_id}->{$pos1}->{"bulkASXS" } = $bulkASXS ;
$indels{$rb_id}->{$pos1}->{"bulkNM" } = $bulkNM ;
$counts++;
}
print STDOUT "$counts indel sites seen\n";
Expand Down Expand Up @@ -212,6 +230,7 @@ sub runCmd {
$get_header = 1;
}
print STDOUT "\nChecking result\n";
print STDOUT "Step 7...\n";
open(IN_TMP, "<$tempdir/$out_name.tmp2.vcf") or die "Couldn't open $tempdir/$out_name.tmp2.vcf\n";
my @info = ();
my $count = 0;
Expand All @@ -224,6 +243,19 @@ sub runCmd {
if ($once and /^##FORMAT/){
$_ = "##FILTER=<ID=MASKED,Description=\"Site overlaps with SW or SNP site\">\n" . $_;
$_ = "##INFO=<ID=RB,Number=1,Type=String,Description=\"Readbundle ID\">\n" . $_ ;
##INFO=<ID=RB,Number=1,Type=String,Description="Readbundle ID">
$_ = "##INFO=<ID=QPOS,Number=1,Type=Integer,Description=\"Read position\">\n" . $_ ;
$_ = "##INFO=<ID=BBEG,Number=1,Type=Integer,Description=\"Start breakpoint\">\n" . $_ ;
$_ = "##INFO=<ID=BEND,Number=1,Type=Integer,Description=\"End breakpoing\">\n" . $_ ;
$_ = "##INFO=<ID=DEPTH_FWD,Number=1,Type=Integer,Description=\"Depth RB forward\">\n" . $_ ;
$_ = "##INFO=<ID=DEPTH_REV,Number=1,Type=String,Description=\"Depth RB reverse\">\n" . $_ ;
$_ = "##INFO=<ID=DEPTH_NORM_FWD,Number=1,Type=Integer,Description=\"Depth normal forward\">\n" . $_ ;
$_ = "##INFO=<ID=DEPTH_NORM_REV,Number=1,Type=Integer,Description=\"Depth normal reverse\">\n" . $_ ;
$_ = "##INFO=<ID=DPLX_ASXS,Number=1,Type=Float,Description=\"RB mean AS-XS\">\n" . $_ ;
$_ = "##INFO=<ID=DPLX_CLIP,Number=1,Type=Float,Description=\"RB mean CLIP\">\n" . $_ ;
$_ = "##INFO=<ID=DPLX_NM,Number=1,Type=Float,Description=\"RB mean NM\">\n" . $_ ;
$_ = "##INFO=<ID=BULK_ASXS,Number=1,Type=Float,Description=\"Normal mean AS-XS\">\n" . $_ ;
$_ = "##INFO=<ID=BULK_NM,Number=1,Type=Float,Description=\"Normal mean NM\">\n" . $_ ;
$once = 0;
}
if ( /^#CHROM/ ){ #use a simple sample name, without path
Expand All @@ -249,8 +281,23 @@ sub runCmd {
next;
}
print "RESULT: $_\n";
my @fields = split(/\t/,$_);
$fields[7] = "$fields[7];RB=$rb_id";
my @fields = split(/\t/,$_);
print STDERR "fields=@fields\n";
my $anyisok = (keys(%{$indels{$rb_id}}))[0];
my $qpos = $indels{$rb_id}->{$anyisok}->{"qpos" };
my $bbeg = $indels{$rb_id}->{$anyisok}->{"bbeg" };
my $bend = $indels{$rb_id}->{$anyisok}->{"bend" };
my $depth_fwd = $indels{$rb_id}->{$anyisok}->{"depth_fwd" };
my $depth_rev = $indels{$rb_id}->{$anyisok}->{"depth_rev" };
my $depth_norm_fwd = $indels{$rb_id}->{$anyisok}->{"depth_norm_fwd"};
my $depth_norm_rev = $indels{$rb_id}->{$anyisok}->{"depth_norm_rev"};
my $dplxASXS = $indels{$rb_id}->{$anyisok}->{"dplxASXS" };
my $dplxCLIP = $indels{$rb_id}->{$anyisok}->{"dplxCLIP" };
my $dplxNM = $indels{$rb_id}->{$anyisok}->{"dplxNM" };
my $bulkASXS = $indels{$rb_id}->{$anyisok}->{"bulkASXS" };
my $bulkNM = $indels{$rb_id}->{$anyisok}->{"bulkNM" };
#$fields[7] = "$fields[7];QPOS=$qpos;RB=$rb_id;BBEG=$bbeg;BEND=$bend;DEPTH_FWD=$depth_fwd;DEPTH_REV=$depth_rev;DEPTH_NORM_FWD=$depth_norm_fwd;DEPTH_NORM_REV=$depth_norm_rev;DPLX_ASXS=$dplxASXS;DPLX_CLIP=$dplxCLIP;DPLX_NM=$dplxNM;BULK_ASXS=$bulkASXS;BULK_NM=$bulkNM";
$fields[7] = "$fields[7];$qpos;RB=$rb_id;$bbeg;$bend;$depth_fwd;$depth_rev;$depth_norm_fwd;$depth_norm_rev;$dplxASXS;$dplxCLIP;$dplxNM;$bulkASXS;$bulkNM";
$fields[6] = "PASS";
# If there is overlap with SW or cSNP, flag it as MASKED
# fa8: Do this only when the overlap is significant? If not, very long deletions are likely to overlap
Expand All @@ -265,6 +312,7 @@ sub runCmd {
$fields[6] = "MASKED";
}
$info[$count] = join "\t",@fields;
print STDERR "$count: @fields\n";
# fa8: check if it's in the list of ignore_indels_at{chr}->{pos}
for(my $i=$fields[1]; $i<= ($fields[1] + length($fields[3])); $i++) {
if(exists($ignore_indels_at{$fields[0]}->{$i})) {
Expand All @@ -287,6 +335,7 @@ sub runCmd {
# to:
# my $best_count = 0;
}
print STDOUT "Step 8...\n";
if ( $header eq "") { #minimal header for an empty vcf
print OUT "##fileformat=VCFv4.2\n#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t$sample_name\n";
OUT->flush();
Expand Down

0 comments on commit 8da01e1

Please sign in to comment.