From 5a59b35e346a7b79797943b39147b69d93d97890 Mon Sep 17 00:00:00 2001 From: am26 Date: Tue, 15 Sep 2015 13:44:54 +0100 Subject: [PATCH 01/40] Re-working comparison script just for star and deFuse --- perl/bin/compare_overlapping_fusions.pl | 42 +- .../CGP/CompareFusions/FusionAnnotation.pm | 7 + .../Sanger/CGP/CompareFusions/Implement.pm | 869 ++++++++---------- 3 files changed, 405 insertions(+), 513 deletions(-) diff --git a/perl/bin/compare_overlapping_fusions.pl b/perl/bin/compare_overlapping_fusions.pl index 2489984..1dac0d2 100755 --- a/perl/bin/compare_overlapping_fusions.pl +++ b/perl/bin/compare_overlapping_fusions.pl @@ -58,33 +58,30 @@ BEGIN use Data::Dumper; const my @REQUIRED_PARAMS => qw(outdir sample gtf); -const my @VALID_PROCESS => qw(createbed annotatebed selectannotation createbedpe runbedpairtopair compareoverlaps); -const my %INDEX_FACTOR => ( 'createbed' => -1, +const my @VALID_PROCESS => qw(createjunctionbed runbedpairtopair createbed annotatebed selectannotation output); +const my %INDEX_FACTOR => ( 'createjunctionbed' => -1, + 'runbedpairtopair' => -1, + 'createbed' => -1, 'annotatebed' => -1, - 'selectannotation' => -1, - 'createbedpe' => -1, - 'runbedpairtopair' => 1, - 'compareoverlaps' => 1); + 'selectannotation' => 1, + 'output' => 1); { my $options = setup(); my $threads = PCAP::Threaded->new($options->{'threads'}); &PCAP::Threaded::disable_out_err if(exists $options->{'index'}); - $threads->add_function('createbed', \&Sanger::CGP::CompareFusions::Implement::create_bed); - $threads->add_function('annotatebed', \&Sanger::CGP::CompareFusions::Implement::annotate_bed); - $threads->add_function('selectannotation', \&Sanger::CGP::CompareFusions::Implement::select_annotation); - $threads->add_function('createbedpe', \&Sanger::CGP::CompareFusions::Implement::create_bedpe); + $threads->add_function('createjunctionbed', \&Sanger::CGP::CompareFusions::Implement::create_junction_bedpe); - $threads->run($options->{'num'}, 'createbed', $options) 
if(!exists $options->{'process'} || $options->{'process'} eq 'createbed'); - $threads->run($options->{'num'}, 'annotatebed', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); - $threads->run($options->{'num'}, 'selectannotation', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); - $threads->run($options->{'num'}, 'createbedpe', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'createbedpe'); + $threads->run($options->{'num'}, 'createjunctionbed', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'createjunctionbed'); Sanger::CGP::CompareFusions::Implement::run_bed_pairtopair($options) if(!exists $options->{'process'} || $options->{'process'} eq 'runbedpairtopair'); + Sanger::CGP::CompareFusions::Implement::create_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'createbed'); + Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); + Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); - if(!exists $options->{'process'} || $options->{'process'} eq 'compareoverlaps') { - Sanger::CGP::CompareFusions::Implement::compare_overlaps($options); + if(!exists $options->{'process'} || $options->{'process'} eq 'output') { + Sanger::CGP::CompareFusions::Implement::generate_output($options); cleanup($options); } } @@ -93,8 +90,7 @@ sub cleanup { my $options = shift; my $tmpdir = $options->{'tmp'}; my $sample = $options->{'sample'}; - move(File::Spec->catfile($tmpdir, "$sample.gene-fusions.txt"), $options->{'outdir'}) || die $!; - move(File::Spec->catfile($tmpdir, "$sample.exon-fusions.txt"), $options->{'outdir'}) || die $!; + move(File::Spec->catfile($tmpdir, "$sample.star-defuse.overlapping.fusions.txt"), $options->{'outdir'}) || die $!; 
move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!; remove_tree $tmpdir if(-e $tmpdir); return 0; @@ -181,7 +177,7 @@ sub setup { =head1 compare_overlapping_fusions.pl -Produces a report of fusions that have been called by two algorithms (tophat, star or deFuse) or overlapping all three. Two output lists will be generated; one at the gene level and the other at the exon level. +Produces a report of overlapping fusions that have been called by star-fusion or deFuse. =head1 SYNOPSIS @@ -199,7 +195,7 @@ =head1 SYNOPSIS -process -p Only process this step then exit -index -i Valid for processes; createbed, annotatebed and createbedpe - 1.. - Input files should be in the format generated by cgpRna pipelines; defuse.pl, tophat_fusion.pl or star_fusion.pl + Input files should be in the format generated by cgpRna pipelines; defuse.pl or star_fusion.pl =head1 OPTIONS @@ -209,11 +205,11 @@ =head1 OPTIONS Available processes for this tool are: + createjunctionbed + runbedpairtopair createbed annotatebed selectannotation - createbedpe - runbedpairtopair - compareoverlaps + output =back diff --git a/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm b/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm index 0360923..64dc939 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm @@ -43,6 +43,7 @@ sub new { bless $self,$class; if ($args{-breakpoint}) { $self->breakpoint($args{-breakpoint}) } + if ($args{-alt_breakpoint}) { $self->alt_breakpoint($args{-alt_breakpoint}) } if ($args{-chr1}) { $self->chr1($args{-chr1}) } if ($args{-strand1}) { $self->strand1($args{-strand1}) } if ($args{-pos1_start}) { $self->pos1_start($args{-pos1_start}) } @@ -83,6 +84,12 @@ sub breakpoint { return($self->{breakpoint}); } +sub alt_breakpoint { + my $self = shift; + $self->{alt_breakpoint} = shift if @_; + return($self->{alt_breakpoint}); +} + sub chr1 { my $self = 
shift; $self->{chr1} = shift if @_; diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index a3c8d2a..6ceb949 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -47,11 +47,10 @@ use Sanger::CGP::CgpRna; use Data::Dumper; -const my $BEDTOOLS_CLOSEST => q{ closest -d -a %s -b %s | sort -k4,5 > %s}; -const my $BEDTOOLS_PAIRTOPAIR => q{ pairtopair -a %s -b %s -f 1.0 > %s}; +const my $BEDTOOLS_CLOSEST => q{ closest -s -a %s -b %s | sort -k4,4 > %s}; +const my $BEDTOOLS_PAIRTOPAIR => q{ pairtopair -a %s -b %s -slop 5 > %s}; -const my $OUTPUT_GENE_HEADER => "genes\tbreakpoints\tcalled_by\tchr1\tstrand1\tgene1\tgene1_start\tgene1_end\tchr2\tstrand2\tgene2\tgene2_start\tgene2_end\n"; -const my $OUTPUT_EXON_HEADER => "exons\tbreakpoints\tcalled_by\tchr1\tstrand1\tgene1\ttranscript1_id\texon1_num\texon1_start\texon1_end\tchr2\tstrand2\tgene2\ttranscript2_id\texon2_num\texon2_start\texon2_end\n"; +const my $OUTPUT_HEADER => "sample\tstar_breakpoint\tdefuse_breakpoint\tfusion_name\tdefuse_splitr_count\tdefuse_span_count\tstar_JunctionReads\tstar_SpanningFrags\tLeftGene\tLeftGeneId\tLeftChr\tLeftPos\tLeftStrand\tLeftDistFromRefExonSplice\tRightGene\tRightGeneId\tRightChr\tRightPos\tRightStrand\tRightDistFromRefExonSplice\tbreak1_feature\texon1_id\texon1_num\texon1_start\texon1_end\tbreak2_feature\texon2_id\texon2_num\texon2_start\texon2_end\ttranscript1_id\tgene1_biotype\ttranscript2_id\tgene2_biotype\tdefuse_cluster_id\tdefuse_splitr_sequence\n"; my %ALLOWED_BIOTYPES = ( antisense => 1, @@ -95,6 +94,7 @@ const my $DEFUSE_CHR2 => 27; const my $DEFUSE_POS2 => 40; const my $DEFUSE_STRAND2 => 37; const my $DEFUSE_BREAKREF => 1; +const my $DEFUSE_CLUSTER_ID => 2; const my $DEFUSE_HEADER_PATTERN => 'cluster_id'; # Position of the columns in the star-fusion output file used to format fusion breakpoint references. 
@@ -106,6 +106,10 @@ const my $STAR_CHR2 => 13; const my $STAR_POS2 => 14; const my $STAR_STRAND2 => 15; const my $STAR_BREAKREF => 1; +const my $STAR_GENENAME1 => 5; +const my $STAR_GENEID1 => 6; +const my $STAR_GENENAME2 => 11; +const my $STAR_GENEID2 => 12; const my $STAR_HEADER_PATTERN => 'fusion_name'; # Position of the columns in the SOAPfuse output file used to format fusion breakpoint references. @@ -120,11 +124,10 @@ const my $SOAP_BREAKREF => 1; const my $SOAP_HEADER_PATTERN => 'up_chr'; sub annotate_bed { - my ($index, $options) = @_; - return 1 if(exists $options->{'index'} && $index != $options->{'index'}); + my $options = shift; my $tmp = $options->{'tmp'}; - return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); + return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); my $sample = $options->{'sample'}; my $exon_gtf = filter_gtf($options->{'gtf'}, $tmp, 'exon'); @@ -135,8 +138,8 @@ sub annotate_bed { opendir(my $dh, $tmp); while(my $file = readdir $dh) { - $break1_file = File::Spec->catfile($tmp, $file) if($file =~ m/^$index.*1.bed/); - $break2_file = File::Spec->catfile($tmp, $file) if($file =~ m/^$index.*2.bed/); + $break1_file = File::Spec->catfile($tmp, $file) if($file =~ m/^$sample.1.bed/); + $break2_file = File::Spec->catfile($tmp, $file) if($file =~ m/^$sample.2.bed/); } closedir($dh); @@ -152,8 +155,8 @@ sub annotate_bed { my @commands = ($command1,$command2); - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, $index); - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); + PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, 0); + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return 1; } @@ -212,12 +215,12 @@ sub check_input { return $source; } -sub compare_overlaps { +sub create_bed { my $options = shift; - + my $tmp = $options->{'tmp'}; return 1 if 
PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - + my $sample = $options->{'sample'}; # There will always be a 1_2 comparison file so deal with that first and build the fusions object. @@ -226,157 +229,78 @@ sub compare_overlaps { my $source1 = $options->{'fusion_files'}->{'1'}->{'format'}; my $source2 = $options->{'fusion_files'}->{'2'}->{'format'}; - my $gene_overlap_file1_2; - my $exon_overlap_file1_2; + my $col_set = 1; + my $star_file = $options->{'fusion_files'}->{'1'}->{'name'}; + if($source2 eq 'star'){ + $col_set = 2; + $star_file = $options->{'fusion_files'}->{'2'}->{'name'}; + } + + my $overlap_file1_2; opendir(my $dh, $tmp); while(my $file = readdir $dh) { - $gene_overlap_file1_2 = File::Spec->catfile($tmp, $file) if($file =~ m/^1_2.$sample.gene.bedpe_overlap/); - $exon_overlap_file1_2 = File::Spec->catfile($tmp, $file) if($file =~ m/^1_2.$sample.exon.bedpe_overlap/); + $overlap_file1_2 = File::Spec->catfile($tmp, $file) if($file =~ m/^1_2.$sample.bedpe_overlap/); } closedir($dh); - - my %gene_list; - my %feature_source; - my %exon_list; - - process_gene_overlaps($gene_overlap_file1_2, \%gene_list, \%feature_source, $source1, $source2); - process_exon_overlaps($exon_overlap_file1_2, \%exon_list, \%feature_source, $source1, $source2); - - if($options->{'num'} == 3){ - my $gene_overlap_file1_3; - my $exon_overlap_file1_3; - my $gene_overlap_file2_3; - my $exon_overlap_file2_3; - - my $source3 = $options->{'fusion_files'}->{'3'}->{'format'}; - - opendir(my $dh2, $tmp); - while(my $file = readdir $dh2) { - $gene_overlap_file1_3 = File::Spec->catfile($tmp, $file) if($file =~ m/^1_3.$sample.gene.bedpe_overlap/); - $exon_overlap_file1_3 = File::Spec->catfile($tmp, $file) if($file =~ m/^1_3.$sample.exon.bedpe_overlap/); - $gene_overlap_file2_3 = File::Spec->catfile($tmp, $file) if($file =~ m/^2_3.$sample.gene.bedpe_overlap/); - $exon_overlap_file2_3 = File::Spec->catfile($tmp, $file) if($file =~ m/^2_3.$sample.exon.bedpe_overlap/); - } 
- closedir($dh2); - - process_gene_overlaps($gene_overlap_file1_3, \%gene_list, \%feature_source, $source1, $source3); - process_exon_overlaps($exon_overlap_file1_3, \%exon_list, \%feature_source, $source1, $source3); - process_gene_overlaps($gene_overlap_file2_3, \%gene_list, \%feature_source, $source2, $source3); - process_exon_overlaps($exon_overlap_file2_3, \%exon_list, \%feature_source, $source2, $source3); + my $output1 = File::Spec->catfile($tmp, "$sample.1.bed"); + my $output2 = File::Spec->catfile($tmp, "$sample.2.bed"); + + my %star_gene_list; + open (my $ifh1, $star_file) or die "Could not open file '$star_file' $!"; + while (<$ifh1>) { + chomp; + my $line = $_; + my @fields = split "\t", $line; + my $breakpoint = $fields[0]; + $star_gene_list{$breakpoint}{'gene1_name'} = $fields[4]; + $star_gene_list{$breakpoint}{'gene1_id'} = $fields[5]; + $star_gene_list{$breakpoint}{'gene2_name'} = $fields[10]; + $star_gene_list{$breakpoint}{'gene2_id'} = $fields[11]; } - - my $gene_output_file = File::Spec->catfile($tmp, "$sample.gene-fusions.txt"); - open(my $ofh1, '>', $gene_output_file) or die "Could not open file $gene_output_file $!"; - print $ofh1 $OUTPUT_GENE_HEADER; - for my $gene (keys %gene_list){ - my @brk_list; - for my $brk (keys %{ $gene_list{$gene}}){ - push @brk_list, $brk; - } - - my @brks; - for my $brkl (keys %{ $feature_source{$gene}{'breakpoints'}}){ - push @brks, $brkl; - } - - my $tophat = ""; - my $defuse = ""; - my $star = ""; - my $soap = ""; - - $tophat = "T" if(exists $feature_source{$gene}->{'tophat'}); - $defuse = "D" if(exists $feature_source{$gene}->{'defuse'}); - $star = "S" if(exists $feature_source{$gene}->{'star'}); - $soap = "O" if(exists $feature_source{$gene}->{'soap'}); - my $gene_source_string = $tophat.$defuse.$star.$soap; - - my $chr1 = $gene_list{$gene}{$brk_list[0]}->{'chr1'}; - my $strand1 = $gene_list{$gene}{$brk_list[0]}->{'strand1'}; - my $gene1 = $gene_list{$gene}{$brk_list[0]}->{'gene1'}; - my $gene1_start = 
$gene_list{$gene}{$brk_list[0]}->{'gene1_start'}; - my $gene1_end = $gene_list{$gene}{$brk_list[0]}->{'gene1_end'}; - my $chr2 = $gene_list{$gene}{$brk_list[0]}->{'chr2'}; - my $strand2 = $gene_list{$gene}{$brk_list[0]}->{'strand2'}; - my $gene2 = $gene_list{$gene}{$brk_list[0]}->{'gene2'}; - my $gene2_start = $gene_list{$gene}{$brk_list[0]}->{'gene2_start'}; - my $gene2_end = $gene_list{$gene}{$brk_list[0]}->{'gene2_end'}; - - my $breaks = join(",",@brks); - print $ofh1 "$gene\t$breaks\t$gene_source_string\t$chr1\t$strand1\t$gene1\t$gene1_start\t$gene1_end\t$chr2\t$strand2\t$gene2\t$gene2_start\t$gene2_end\n"; - } - close($ofh1); - - my $exon_output_file = File::Spec->catfile($tmp, "$sample.exon-fusions.txt"); - open(my $ofh2, '>', $exon_output_file) or die "Could not open file $exon_output_file $!"; - print $ofh2 $OUTPUT_EXON_HEADER; - - for my $exon (keys %exon_list){ - my @brk_list; - for my $brk (keys %{ $exon_list{$exon}}){ - push @brk_list, $brk; - } - - my @brks; - for my $brkl (keys %{ $feature_source{$exon}{'breakpoints'}}){ - push @brks, $brkl; - } - - my $tophat = ""; - my $defuse = ""; - my $star = ""; - my $soap = ""; - $tophat = "T" if(exists $feature_source{$exon}->{'tophat'}); - $defuse = "D" if(exists $feature_source{$exon}->{'defuse'}); - $star = "S" if(exists $feature_source{$exon}->{'star'}); - $soap = "O" if(exists $feature_source{$exon}->{'soap'}); - my $exon_source_string = $tophat.$defuse.$star.$soap; - - my $chr1 = $exon_list{$exon}{$brk_list[0]}->{'chr1'}; - my $strand1 = $exon_list{$exon}{$brk_list[0]}->{'strand1'}; - my $gene1 = $exon_list{$exon}{$brk_list[0]}->{'gene1'}; - my $transcript1_id = $exon_list{$exon}{$brk_list[0]}->{'transcript1_id'}; - my $exon1_num = $exon_list{$exon}{$brk_list[0]}->{'exon1_num'}; - my $exon1_start = $exon_list{$exon}{$brk_list[0]}->{'feature1_start'}; - my $exon1_end = $exon_list{$exon}{$brk_list[0]}->{'feature1_end'}; - my $chr2 = $exon_list{$exon}{$brk_list[0]}->{'chr2'}; - my $strand2 = 
$exon_list{$exon}{$brk_list[0]}->{'strand2'}; - my $gene2 = $exon_list{$exon}{$brk_list[0]}->{'gene2'}; - my $transcript2_id = $exon_list{$exon}{$brk_list[0]}->{'transcript2_id'}; - my $exon2_num = $exon_list{$exon}{$brk_list[0]}->{'exon2_num'}; - my $exon2_start = $exon_list{$exon}{$brk_list[0]}->{'feature2_start'}; - my $exon2_end = $exon_list{$exon}{$brk_list[0]}->{'feature2_end'}; - - my $breaks = join(",",@brks); - print $ofh2 "$exon\t$breaks\t$exon_source_string\t$chr1\t$strand1\t$gene1\t$transcript1_id\t$exon1_num\t$exon1_start\t$exon1_end\t$chr2\t$strand2\t$gene2\t$transcript2_id\t$exon2_num\t$exon2_start\t$exon2_end\n"; + close ($ifh1); + + open (my $ifh2, $overlap_file1_2) or die "Could not open file '$overlap_file1_2' $!"; + open(my $ofh1, '>', $output1) or die "Could not open file '$output1' $!"; + open(my $ofh2, '>', $output2) or die "Could not open file '$output2' $!"; + + while (<$ifh2>) { + chomp; + my $line = $_; + my $fusion = parse_overlap($line, $col_set); + my $gene1_name = $star_gene_list{$fusion->{'breakpoint'}}{'gene1_name'}; + my $gene1_id = $star_gene_list{$fusion->{'breakpoint'}}{'gene1_id'}; + my $gene2_name = $star_gene_list{$fusion->{'breakpoint'}}{'gene2_name'}; + my $gene2_id = $star_gene_list{$fusion->{'breakpoint'}}{'gene2_id'}; + print $ofh1 $fusion->{'chr1'}."\t".$fusion->{'pos1_start'}."\t".$fusion->{'pos1_end'}."\t".$fusion->{'breakpoint'}."_".$fusion->{'strand1'}.$fusion->{'strand2'}."\t".$fusion->{'alt_breakpoint'}."\t".$fusion->{'strand1'}."\t".$gene1_name."\t".$gene1_id."\n"; + print $ofh2 $fusion->{'chr2'}."\t".$fusion->{'pos2_start'}."\t".$fusion->{'pos2_end'}."\t".$fusion->{'breakpoint'}."_".$fusion->{'strand1'}.$fusion->{'strand2'}."\t".$fusion->{'alt_breakpoint'}."\t".$fusion->{'strand2'}."\t".$gene2_name."\t".$gene2_id."\n"; } - close($ofh2); - + + close ($ifh2); + close ($ofh1); + close ($ofh2); + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); - + return 1; } -sub create_bed { +sub 
create_junction_bedpe { my ($index, $options) = @_; - + my $tmp = $options->{'tmp'}; return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); my $sample = $options->{'sample'}; - + my $file = $options->{'fusion_files'}->{$index}->{'name'}; my $filetype = $options->{'fusion_files'}->{$index}->{'format'}; - my $output1 = File::Spec->catfile($tmp, "$index.$sample.$filetype.1.bed"); - my $output2 = File::Spec->catfile($tmp, "$index.$sample.$filetype.2.bed"); - my $output3 = File::Spec->catfile($tmp, "$index.$sample.$filetype.list"); + my $output1 = File::Spec->catfile($tmp, "$index.$sample.bedpe"); open (my $ifh, $file) or die "Could not open file '$file' $!"; open(my $ofh1, '>', $output1) or die "Could not open file '$output1' $!"; - open(my $ofh2, '>', $output2) or die "Could not open file '$output2' $!"; - open(my $ofh3, '>', $output3) or die "Could not open file '$output3' $!"; - + while (<$ifh>) { chomp; my $line = $_; @@ -390,27 +314,9 @@ sub create_bed { my $chr2; my $pos2_start; my $pos2_end; - my $strand2; + my $strand2; - if($filetype eq 'tophat'){ - next if($line =~ m/$TOPHAT_HEADER_PATTERN/); - - @fields = split $TOPHAT_SPLIT_CHAR, $line; - $name = $fields[$TOPHAT_BREAKREF - 1]; - $chr1 = $fields[$TOPHAT_CHR1 - 1]; - $pos1_start = $fields[$TOPHAT_POS1 - 1]-1; - $pos1_end = $fields[$TOPHAT_POS1 - 1]; - $strand1 = $fields[$TOPHAT_STRAND1 - 1]; - $chr2 = $fields[$TOPHAT_CHR2 - 1]; - $pos2_start = $fields[$TOPHAT_POS2 - 1]-1; - $pos2_end = $fields[$TOPHAT_POS2- 1]; - $strand2 = $fields[$TOPHAT_STRAND2 - 1]; - - print $ofh1 $chr1."\t".$pos1_start."\t".$pos1_end."\t".$name."\t".$strand1."\n"; - print $ofh2 $chr2."\t".$pos2_start."\t".$pos2_end."\t".$name."\t".$strand2."\n"; - print $ofh3 $name."\n"; - } - elsif($filetype eq 'star'){ + if($filetype eq 'star'){ next if($line =~ m/$STAR_HEADER_PATTERN/); @fields = split $STAR_SPLIT_CHAR, $line; @@ -424,34 +330,14 @@ sub create_bed { $pos2_end = $fields[$STAR_POS2- 1]; $strand2 = 
$fields[$STAR_STRAND2 - 1]; - print $ofh1 $chr1."\t".$pos1_start."\t".$pos1_end."\t".$name."\t".$strand1."\n"; - print $ofh2 $chr2."\t".$pos2_start."\t".$pos2_end."\t".$name."\t".$strand2."\n"; - print $ofh3 $name."\n"; - } - elsif($filetype eq 'soap'){ - next if($line =~ m/$SOAP_HEADER_PATTERN/); - - @fields = split $SOAP_SPLIT_CHAR, $line; - $name = $fields[$SOAP_BREAKREF - 1]; - $chr1 = $fields[$SOAP_CHR1 - 1]; - $pos1_start = $fields[$SOAP_POS1 - 1]-1; - $pos1_end = $fields[$SOAP_POS1 - 1]; - $strand1 = $fields[$SOAP_STRAND1 - 1]; - $chr2 = $fields[$SOAP_CHR2 - 1]; - $pos2_start = $fields[$SOAP_POS2 - 1]-1; - $pos2_end = $fields[$SOAP_POS2- 1]; - $strand2 = $fields[$SOAP_STRAND2 - 1]; - - print $ofh1 $chr1."\t".$pos1_start."\t".$pos1_end."\t".$name."\t".$strand1."\n"; - print $ofh2 $chr2."\t".$pos2_start."\t".$pos2_end."\t".$name."\t".$strand2."\n"; - print $ofh3 $name."\n"; + print $ofh1 $chr1."\t".$pos1_start."\t".$pos1_end."\t".$chr2."\t".$pos2_start."\t".$pos2_end."\t".$filetype."\t".$name."\t".$strand1."\t".$strand2."\n"; } # It must be defuse format else{ next if($line =~ m/$DEFUSE_HEADER_PATTERN/); @fields = split $DEFUSE_SPLIT_CHAR, $line; - $name = $fields[$DEFUSE_BREAKREF - 1]; + $name = $fields[$DEFUSE_BREAKREF - 1]."_".$fields[$DEFUSE_CLUSTER_ID - 1]; $chr1 = $fields[$DEFUSE_CHR1 - 1]; $pos1_start = $fields[$DEFUSE_POS1 - 1]-1; $pos1_end = $fields[$DEFUSE_POS1 - 1]; @@ -461,154 +347,16 @@ sub create_bed { $pos2_end = $fields[$DEFUSE_POS2- 1]; $strand2 = $fields[$DEFUSE_STRAND2 - 1]; - print $ofh1 $chr1."\t".$pos1_start."\t".$pos1_end."\t".$name."\t".$strand1."\n"; - print $ofh2 $chr2."\t".$pos2_start."\t".$pos2_end."\t".$name."\t".$strand2."\n"; - print $ofh3 $name."\n"; + print $ofh1 $chr1."\t".$pos1_start."\t".$pos1_end."\t".$chr2."\t".$pos2_start."\t".$pos2_end."\t".$filetype."\t".$name."\t".$strand1."\t".$strand2."\n"; } - } close ($ifh); close ($ofh1); - close ($ofh2); PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); 
- return 1; -} - -sub create_bedpe { - my ($index, $options) = @_; - return 1 if(exists $options->{'index'} && $index != $options->{'index'}); + return 1; - my $tmp = $options->{'tmp'}; - return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); - - my $sample = $options->{'sample'}; - - my $annot_file1; - my $annot_file2; - my $gene_bedpe_file = File::Spec->catfile($tmp, "$index.$sample.gene.bedpe"); - my $exon_bedpe_file = File::Spec->catfile($tmp, "$index.$sample.exon.bedpe"); - my $gene_gtf = File::Spec->catfile($tmp, "filtered_gene.gtf"); - - opendir(my $dh, $tmp); - while(my $file = readdir $dh) { - $annot_file1 = File::Spec->catfile($tmp, $file) if($file =~ m/^$index.$sample.*.1.ann_final$/); - $annot_file2 = File::Spec->catfile($tmp, $file) if($file =~ m/^$index.$sample.*.2.ann_final$/); - } - closedir($dh); - - my %gene_info; - - open (my $ifh1, $gene_gtf) or die "Could not open file '$gene_gtf' $!"; - while (<$ifh1>) { - chomp; - my $line = $_; - my $gene_annot = parse_gene_info($line); - if(!exists $gene_info{$gene_annot->{'gene_name'}}){ - $gene_info{$gene_annot->{'gene_name'}}{'feature_start'} = $gene_annot->{'start'}; - $gene_info{$gene_annot->{'gene_name'}}{'feature_end'} = $gene_annot->{'end'}; - } - } - close ($ifh1); - - my %break1; - open (my $ifh2, $annot_file1) or die "Could not open file '$annot_file1' $!"; - while (<$ifh2>) { - chomp; - my $line1 = $_; - my $break_annotation1 = parse_annotation($line1); - my $gene1 = $break_annotation1->{'gene_name'}; - my $gene1_start = $gene_info{$gene1}{'feature_start'}; - my $gene1_end = $gene_info{$gene1}{'feature_end'}; - - $break_annotation1->{'gene_start'} = $gene1_start; - $break_annotation1->{'gene_end'} = $gene1_end; - $break1{$break_annotation1->{'breakpoint'}} = $break_annotation1; - } - close ($ifh2); - - my %break2; - open (my $ifh3, $annot_file2) or die "Could not open file '$annot_file2' $!"; - while (<$ifh3>) { - chomp; - my $line2 = $_; - - my 
$break_annotation2 = parse_annotation($line2); - my $gene2 = $break_annotation2->{'gene_name'}; - my $gene2_start = $gene_info{$gene2}{'feature_start'}; - my $gene2_end = $gene_info{$gene2}{'feature_end'}; - - $break_annotation2->{'gene_start'} = $gene2_start; - $break_annotation2->{'gene_end'} = $gene2_end; - $break2{$break_annotation2->{'breakpoint'}} = $break_annotation2; - } - close ($ifh3); - - my $fusion; - my $formatted_gene_line; - my $formatted_exon_line; - - open(my $ofh1, '>', $gene_bedpe_file) or die "Could not open file $gene_bedpe_file $!"; - open(my $ofh2, '>', $exon_bedpe_file) or die "Could not open file $exon_bedpe_file $!"; - - for my $brk (keys %break1){ - - $fusion = new Sanger::CGP::CompareFusions::FusionAnnotation( - -breakpoint => $brk, - -chr1 => $break1{$brk}->{'chr'}, - -pos1_start => $break1{$brk}->{'pos_start'}, - -pos1_end => $break1{$brk}->{'pos_end'}, - -strand1 => $break1{$brk}->{'strand'}, - -feature1 => $break1{$brk}->{'feature'}, - -feature1_start => $break1{$brk}->{'feature_start'}, - -feature1_end => $break1{$brk}->{'feature_end'}, - -gene1 => $break1{$brk}->{'gene_name'}, - -gene1_id => $break1{$brk}->{'gene_id'}, - -gene1_start => $break1{$brk}->{'gene_start'}, - -gene1_end => $break1{$brk}->{'gene_end'}, - -chr2 => $break2{$brk}->{'chr'}, - -pos2_start => $break2{$brk}->{'pos_start'}, - -pos2_end => $break2{$brk}->{'pos_end'}, - -strand2 => $break2{$brk}->{'strand'}, - -feature2 => $break2{$brk}->{'feature'}, - -feature2_start => $break2{$brk}->{'feature_start'}, - -feature2_end => $break2{$brk}->{'feature_end'}, - -gene2 => $break2{$brk}->{'gene_name'}, - -gene2_id => $break2{$brk}->{'gene_id'}, - -gene2_start => $break2{$brk}->{'gene_start'}, - -gene2_end => $break2{$brk}->{'gene_end'}); - - if($break1{$brk}->{'feature'} eq "exon"){ - $fusion->exon1_num($break1{$brk}->{'exon_number'}); - $fusion->exon1_id($break1{$brk}->{'exon_id'}); - $fusion->transcript1_id($break1{$brk}->{'transcript_id'}); - 
$fusion->distance1($break1{$brk}->{'distance'}); - } - - if($break2{$brk}->{'feature'} eq "exon"){ - $fusion->exon2_num($break2{$brk}->{'exon_number'}); - $fusion->exon2_id($break2{$brk}->{'exon_id'}); - $fusion->transcript2_id($break2{$brk}->{'transcript_id'}); - $fusion->distance2($break2{$brk}->{'distance'}); - } - - $formatted_gene_line = $fusion->format_bedpe_line('gene'); - my $within_gene = check_gene_boundaries($formatted_gene_line); - print $ofh1 $formatted_gene_line."\n" if($within_gene); - if(defined $fusion->exon1_num && $fusion->exon2_num){ - if($fusion->distance1 <= 10 && $fusion->distance2 <= 10){ - $formatted_exon_line = $fusion->format_bedpe_line('exon'); - print $ofh2 $formatted_exon_line."\n"; - } - } - } - close ($ofh1); - close ($ofh2); - - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), $index); - - return 1; } sub filter_gtf { @@ -616,6 +364,7 @@ sub filter_gtf { my $filtered_gtf = File::Spec->catfile($tmp, "filtered_$feature.gtf"); + unless (-e $filtered_gtf){ open (my $ifh, $gtf) or die "Could not open file '$gtf' $!"; open(my $ofh, '>', $filtered_gtf) or die "Could not open file '$filtered_gtf' $!"; @@ -639,6 +388,7 @@ sub filter_gtf { } close($ifh); close($ofh); + } return $filtered_gtf; } @@ -654,6 +404,125 @@ sub find_closest_boundary { return $distance; } +sub generate_output { + my $options = shift; + + my $tmp = $options->{'tmp'}; + return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + + my $sample = $options->{'sample'}; + + my $star_file; + my $defuse_file; + + if($options->{'fusion_files'}->{'1'}->{'format'} eq 'star'){ + $star_file = $options->{'fusion_files'}->{'1'}->{'name'}; + $defuse_file = $options->{'fusion_files'}->{'2'}->{'name'}; + } + else{ + $star_file = $options->{'fusion_files'}->{'2'}->{'name'}; + $defuse_file = $options->{'fusion_files'}->{'1'}->{'name'}; + } + + my %star_data; + open (my $ifh1, $star_file) or die "Could not open file '$star_file' $!"; + while 
(<$ifh1>) { + chomp; + my $line = $_; + next if($line =~ m/^breakpoint/); + $line =~ m/^(.*:[0-9]+-.*:[0-9]+)\t([A-Za-z0-9-_:\.]+--[A-Za-z0-9-_:\.]+)\t/; + my $break_ref = $1; + my $fusion_name = $2; + $line =~ s/^.*:[0-9]+-.*:[0-9]+\t[A-Za-z0-9-_:\.]+--[A-Za-z0-9-_:\.]+\t//; + $star_data{$break_ref}{'fusion_name'} = $fusion_name; + $star_data{$break_ref}{'data'} = $line; + } + close ($ifh1); + + my %defuse_data; + open (my $ifh2, $defuse_file) or die "Could not open file '$defuse_file' $!"; + while (<$ifh2>) { + chomp; + my $line = $_; + next if($line =~ m/^breakpoint/); + my @fields = split "\t", $line; + $line =~ m/^(.*:[0-9]+-.*:[0-9]+)\t([0-9]+)\t([ACGT|]+)\t/; + my $break_ref = $fields[0]; + my $cluster_id = $fields[1]; + my $sequence = $fields[2]; + my $split_reads = $fields[3]; + my $span_reads = $fields[61]; + $defuse_data{$break_ref."_".$cluster_id}{'breakpoint'} = $break_ref; + $defuse_data{$break_ref."_".$cluster_id}{'cluster_id'} = $cluster_id; + $defuse_data{$break_ref."_".$cluster_id}{'sequence'} = $sequence; + $defuse_data{$break_ref."_".$cluster_id}{'split_reads'} = $split_reads; + $defuse_data{$break_ref."_".$cluster_id}{'span_reads'} = $span_reads; + } + close ($ifh2); + + my $annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann_final"); + my $annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann_final"); + my $output_file = File::Spec->catfile($tmp, "$sample.star-defuse.overlapping.fusions.txt"); + + my %break1; + open (my $ifh3, $annot_file1) or die "Could not open file '$annot_file1' $!"; + while (<$ifh3>) { + chomp; + my $line1 = $_; + my $break_annotation1 = parse_break_data($line1); + $line1 =~ m/^(.*:[0-9]+-.*:[0-9]+_[+-]+)/; + my $break_ref = $1; + $break1{$break_ref} = $break_annotation1; + } + close ($ifh3); + + my %break2; + open (my $ifh4, $annot_file2) or die "Could not open file '$annot_file2' $!"; + while (<$ifh4>) { + chomp; + my $line2 = $_; + my $break_annotation2 = parse_break_data($line2); + $line2 =~ 
m/^(.*:[0-9]+-.*:[0-9]+_[+-]+)/; + my $break_ref = $1; + $break2{$break_ref} = $break_annotation2; + } + close ($ifh4); + + open(my $ofh1, '>', $output_file) or die "Could not open file $output_file $!"; + print $ofh1 $OUTPUT_HEADER; + for my $brk (keys %break1){ + if(exists $break2{$brk}){ + my $breakpoint = $break1{$brk}->{'breakpoint'}; + my $alt_breakpoint = $break1{$brk}->{'alt_breakpoint'}; + my $star_fusion_name = $star_data{$breakpoint}{'fusion_name'}; + my $star_data = $star_data{$breakpoint}{'data'}; + my $feature1 = $break1{$brk}->{'feature'}; + my $feature2 = $break2{$brk}->{'feature'}; + my $exon1_id = $break1{$brk}->{'exon_id'}; + my $exon2_id = $break2{$brk}->{'exon_id'}; + my $exon1_number = $break1{$brk}->{'exon_number'}; + my $exon2_number = $break2{$brk}->{'exon_number'}; + my $exon1_start = $break1{$brk}->{'exon_start'}; + my $exon2_start = $break2{$brk}->{'exon_start'}; + my $exon1_end = $break1{$brk}->{'exon_end'}; + my $exon2_end = $break2{$brk}->{'exon_end'}; + my $transcript1_id = $break1{$brk}->{'transcript_id'}; + my $transcript2_id = $break2{$brk}->{'transcript_id'}; + my $biotype1 = $break1{$brk}->{'gene_biotype'}; + my $biotype2 = $break2{$brk}->{'gene_biotype'}; + my $defuse_breakpoint = $defuse_data{$alt_breakpoint}{'breakpoint'}; + my $defuse_cluster_id = $defuse_data{$alt_breakpoint}{'cluster_id'}; + my $defuse_split_reads = $defuse_data{$alt_breakpoint}{'split_reads'}; + my $defuse_span_reads = $defuse_data{$alt_breakpoint}{'span_reads'}; + my $defuse_sequence = $defuse_data{$alt_breakpoint}{'sequence'}; + print $ofh1 "$sample\t$breakpoint\t$defuse_breakpoint\t$star_fusion_name\t$defuse_split_reads\t$defuse_span_reads\t$star_data\t$feature1\t$exon1_id\t$exon1_number\t$exon1_start\t$exon1_end\t$feature2\t$exon2_id\t$exon2_number\t$exon2_start\t$exon2_end\t$transcript1_id\t$biotype1\t$transcript2_id\t$biotype2\t$defuse_cluster_id\t$defuse_sequence\n"; + } + } + close($ofh1); + + return 1; +} + sub parse_annotation { my $line = 
shift; $line =~ s/"//g; @@ -662,18 +531,19 @@ sub parse_annotation { my @fields = split "\t", $line; $annotation{'breakpoint'} = $fields[3]; + $annotation{'alt_breakpoint'} = $fields[4]; $annotation{'chr'} = $fields[0]; $annotation{'pos_start'} = $fields[1]; $annotation{'pos_end'} = $fields[2]; - $annotation{'strand'} = $fields[4]; - $annotation{'feature'} = $fields[7]; - $annotation{'feature_start'} = $fields[8]; - $annotation{'feature_end'} = $fields[9]; - - my $annot_column = scalar @fields; - $annotation{'distance'} = $fields[$annot_column-1]; - - my @annot_fields = split /; /, $fields[$annot_column-2]; + $annotation{'strand'} = $fields[5]; + $annotation{'feature'} = $fields[10]; + $annotation{'feature_start'} = $fields[11]; + $annotation{'feature_end'} = $fields[12]; + $annotation{'star_genename'} = $fields[6]; + $annotation{'star_geneid'} = $fields[7]; + + my $annot_column = scalar @fields; + my @annot_fields = split /; /, $fields[$annot_column-1]; foreach my $item(@annot_fields) { my ($type,$value)= split / /, $item; $annotation{$type} = $value; @@ -682,6 +552,29 @@ sub parse_annotation { return \%annotation; } +sub parse_break_data { + my $line = shift; + + my %break; + my @fields = split "\t", $line; + + my @breakpoint = split "_", $fields[0]; + $break{'breakpoint'} = $breakpoint[0]; + $break{'alt_breakpoint'} = $fields[1]; + $break{'gene_name'} = $fields[2]; + $break{'gene_id'} = $fields[3]; + $break{'strand'} = $fields[4]; + $break{'feature'} = $fields[5]; + $break{'exon_id'} = $fields[6]; + $break{'exon_number'} = $fields[7]; + $break{'exon_start'} = $fields[8]; + $break{'exon_end'} = $fields[9]; + $break{'transcript_id'} = $fields[10]; + $break{'gene_biotype'} = $fields[11]; + + return \%break; +} + sub parse_gene_info { my $line = shift; $line =~ s/"//g; @@ -703,7 +596,7 @@ sub parse_gene_info { } sub parse_overlap { - my ($line, $cols, $type) = @_; + my ($line, $cols) = @_; my @fields = split "\t", $line; @@ -714,101 +607,24 @@ sub parse_overlap { my 
$fusion = new Sanger::CGP::CompareFusions::FusionAnnotation( -breakpoint => $fields[$start + 7], -chr1 => $fields[$start], + -pos1_start => $fields[$start + 1], + -pos1_end => $fields[$start + 2], -strand1 => $fields[$start + 8], - -gene1 => $fields[$start + 10], - -gene1_id => $fields[$start + 11], -chr2 => $fields[$start + 3], - -strand2 => $fields[$start + 9], - -gene2 => $fields[$start + 12], - -gene2_id => $fields[$start + 13], - -feature1 => $fields[$start + 6]); - - if($type eq 'gene'){ - $fusion->gene1_start($fields[$start + 1]); - $fusion->gene1_end($fields[$start + 2]); - $fusion->gene2_start($fields[$start + 4]); - $fusion->gene2_end($fields[$start + 5]); + -pos2_start => $fields[$start + 4], + -pos2_end => $fields[$start + 5], + -strand2 => $fields[$start + 9]); + + if($cols == 1){ + $fusion->alt_breakpoint($fields[17]); } - - if($type eq 'exon'){ - $fusion->feature1_start($fields[$start + 1]); - $fusion->feature1_end($fields[$start + 2]); - $fusion->exon1_num($fields[$start + 14]); - $fusion->exon1_id($fields[$start + 15]); - $fusion->transcript1_id($fields[$start + 16]); - $fusion->feature2_start($fields[$start + 4]); - $fusion->feature2_end($fields[$start + 5]); - $fusion->exon2_num($fields[$start + 17]); - $fusion->exon2_id($fields[$start + 18]); - $fusion->transcript2_id($fields[$start + 19]); + else{ + $fusion->alt_breakpoint($fields[7]); } return $fusion; } -sub process_exon_overlaps { - my ($exon_overlap_file, $exon_list, $feature_source, $source1, $source2) = @_; - - # The fusion will be represented twice in the overlapping file, use the first set of columns by default - my $col_set = 1; - - # If the first set of columns come from a defuse bedpe, use the second set of columns. - # This is because star and tophat are consistent in terms of strand orientation. 
If we use - # defuse data we would need to flip some of the gene and exon data - $col_set = 2 if($source1 eq 'defuse'); - - open (my $ifh, $exon_overlap_file) or die "Could not open file '$exon_overlap_file' $!"; - while (<$ifh>) { - chomp; - my $line = $_; - my @fields = split "\t", $line; - next if(($fields[15] ne $fields[37] && $fields[15] ne $fields[40]) || ($fields[18] ne $fields[37] && $fields[18] ne $fields[40])); - my $exon_fusion = parse_overlap($line, $col_set, 'exon'); - $exon_list->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{$exon_fusion->{'breakpoint'}} = $exon_fusion if(!exists $exon_list->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{$exon_fusion->{'breakpoint'}}); - my $brk1 = $fields[7]; - my $brk2 = $fields[29]; - $feature_source->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{'breakpoints'}{$brk1} = 1 if(!exists $feature_source->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{'breakpoints'}{$brk1}); - $feature_source->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{'breakpoints'}{$brk2} = 1 if(!exists $feature_source->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{'breakpoints'}{$brk2}); - $feature_source->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{$source1} = 1 if(!exists $feature_source->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{$source1}); - $feature_source->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{$source2} = 1 if(!exists $feature_source->{$exon_fusion->{'exon1_id'}.':'.$exon_fusion->{'exon2_id'}}{$source2}); - - } - close ($ifh); - - return 1; -} - -sub process_gene_overlaps { - my ($gene_overlap_file, $gene_list, $feature_source, $source1, $source2) = @_; - - # The fusion will be represented twice in the overlapping file, use the first set of columns by default - my $col_set = 1; - - # If the first set of columns come from a defuse bedpe, use the second set of columns. 
- # This is because star and tophat are consistent in terms of strand orientation. If we use - # defuse data we would need to flip some of the gene and exon data - $col_set = 2 if($source1 eq 'defuse'); - - open (my $ifh, $gene_overlap_file) or die "Could not open file '$gene_overlap_file' $!"; - while (<$ifh>) { - chomp; - my $line = $_; - my @fields = split "\t", $line; - next if(($fields[10] ne $fields[24] && $fields[10] ne $fields[26]) || ($fields[12] ne $fields[24] && $fields[12] ne $fields[26])); - my $gene_fusion = parse_overlap($line, $col_set, 'gene'); - $gene_list->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{$gene_fusion->{'breakpoint'}} = $gene_fusion if(!exists $gene_list->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{$gene_fusion->{'breakpoint'}}); - my $brk1 = $fields[7]; - my $brk2 = $fields[21]; - $feature_source->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{'breakpoints'}{$brk1} = 1 if(!exists $feature_source->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{'breakpoints'}{$brk1}); - $feature_source->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{'breakpoints'}{$brk2} = 1 if(!exists $feature_source->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{'breakpoints'}{$brk2}); - $feature_source->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{$source1} = 1 if(!exists $feature_source->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{$source1}); - $feature_source->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{$source2} = 1 if(!exists $feature_source->{$gene_fusion->{'gene1_id'}.':'.$gene_fusion->{'gene2_id'}}{$source2}); - } - close ($ifh); - - return 1; -} - sub run_bed_pairtopair { my $options = shift; @@ -820,35 +636,11 @@ sub run_bed_pairtopair { my $prog = _which('bedtools'); # There will always be at least two input files so build the command for the first comparison - my $command1 = $prog . 
sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.gene.bedpe"), - File::Spec->catfile($tmp, "2.$sample.gene.bedpe"), File::Spec->catfile($tmp, "1_2.$sample.gene.bedpe_overlap"); - - my $command2 = $prog . sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.exon.bedpe"), - File::Spec->catfile($tmp, "2.$sample.exon.bedpe"), - File::Spec->catfile($tmp, "1_2.$sample.exon.bedpe_overlap"); - my @commands = ($command1, $command2); - - if($options->{'num'} == 3){ - my $command3 = $prog . sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.gene.bedpe"), - File::Spec->catfile($tmp, "3.$sample.gene.bedpe"), - File::Spec->catfile($tmp, "1_3.$sample.gene.bedpe_overlap"); - - my $command4 = $prog . sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.exon.bedpe"), - File::Spec->catfile($tmp, "3.$sample.exon.bedpe"), - File::Spec->catfile($tmp, "1_3.$sample.exon.bedpe_overlap"); - - my $command5 = $prog . sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "2.$sample.gene.bedpe"), - File::Spec->catfile($tmp, "3.$sample.gene.bedpe"), - File::Spec->catfile($tmp, "2_3.$sample.gene.bedpe_overlap"); - - my $command6 = $prog . sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "2.$sample.exon.bedpe"), - File::Spec->catfile($tmp, "3.$sample.exon.bedpe"), - File::Spec->catfile($tmp, "2_3.$sample.exon.bedpe_overlap"); - - push @commands, ($command3, $command4, $command5, $command6); - } + my $command = $prog . 
sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.bedpe"), + File::Spec->catfile($tmp, "2.$sample.bedpe"), + File::Spec->catfile($tmp, "1_2.$sample.bedpe_overlap"); - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, 0); + PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return 1; @@ -857,22 +649,36 @@ sub run_bed_pairtopair { sub select_annotation { # All possible exon annotations have been retrieved for each breakpoint, we need to select annotation for the nearest. - my ($index, $options) = @_; - return 1 if(exists $options->{'index'} && $index != $options->{'index'}); + my $options = shift; my $tmp = $options->{'tmp'}; - return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), $index); + return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); my $sample = $options->{'sample'}; my $annot_file1; my $annot_file2; my $final_annot_file1; my $final_annot_file2; + + my $gene_gtf = File::Spec->catfile($tmp, "filtered_gene.gtf"); + my %gene_info; + + open (my $ifh1, $gene_gtf) or die "Could not open file '$gene_gtf' $!"; + while (<$ifh1>) { + chomp; + my $line = $_; + my $gene_annot = parse_gene_info($line); + if(!exists $gene_info{$gene_annot->{'gene_name'}}){ + $gene_info{$gene_annot->{'gene_name'}}{'feature_start'} = $gene_annot->{'start'}; + $gene_info{$gene_annot->{'gene_name'}}{'feature_end'} = $gene_annot->{'end'}; + } + } + close ($ifh1); opendir(my $dh, $tmp); - while(my $file = readdir $dh) { - $annot_file1 = File::Spec->catfile($tmp, $file) if($file =~ m/^$index.$sample.*.1.ann$/); - $annot_file2 = File::Spec->catfile($tmp, $file) if($file =~ m/^$index.$sample.*.2.ann$/); + while(my $file = readdir $dh){ + $annot_file1 = File::Spec->catfile($tmp, $file) if($file eq "$sample.1.ann"); + $annot_file2 = File::Spec->catfile($tmp, $file) if($file eq 
"$sample.2.ann"); } closedir($dh); @@ -880,70 +686,153 @@ sub select_annotation { $final_annot_file2 = $annot_file2."_final"; my $curr_distance = 10000000; - my $curr_line = ""; my $curr_break = ""; + my $curr_annotation; + my $curr_pos; + my $curr_exon_start; + my $curr_exon_end; # Process the first annotation file open(my $ofh1, '>', $final_annot_file1) or die "Could not open file $final_annot_file1 $!"; - open (my $ifh1, $annot_file1) or die "Could not open file '$annot_file1' $!"; - while (<$ifh1>) { + open (my $ifh2, $annot_file1) or die "Could not open file '$annot_file1' $!"; + while (<$ifh2>){ chomp; my $line = $_; - my @fields = split "\t", $line; - my $break = $fields[3].$fields[4]; + my $annotation = parse_annotation($line); + next if($annotation->{'gene_name'} ne $annotation->{'star_genename'}); + my $break = $annotation->{'breakpoint'}; - if($break ne $curr_break){ - print $ofh1 $curr_line."\n" unless($curr_break eq ""); - $curr_break = $break; - $curr_distance = 10000000; - } - my $pos = $fields[2]; - my $exon_start = $fields[8]; - my $exon_end = $fields[9]; - + if($break ne $curr_break){ + unless($curr_break eq ""){ + $curr_pos = $curr_annotation->{'pos_end'}; + $curr_exon_start = $curr_annotation->{'feature_start'}; + $curr_exon_end = $curr_annotation->{'feature_end'}; + + if($curr_distance <= 10){ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\t".$curr_annotation->{'feature'}."\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ + print $ofh1 
$curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tmid-exon\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + else{ + # The breakpoint doesn't fall within 10bp of an exon boundary. We need to check it falls within the footprint of the star gene and, for now, print it as intronic + my $gene_start = $gene_info{$curr_annotation->{'star_genename'}}{'feature_start'}; + my $gene_end = $gene_info{$curr_annotation->{'star_genename'}}{'feature_end'}; + my $break_pos = $curr_annotation->{'pos_end'}; + if($break_pos >= $gene_start && $break_pos <= $gene_end){ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tintronic\t-\t-\t-\t-\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + } + $curr_distance = 10000000; + $curr_break = $break; + } + } + my $pos = $annotation->{'pos_end'}; + my $exon_start = $annotation->{'feature_start'}; + my $exon_end = $annotation->{'feature_end'}; my $distance = find_closest_boundary($pos, $exon_start, $exon_end); if($distance < $curr_distance){ $curr_distance = $distance; - $curr_line = $line; + $curr_annotation = $annotation; + $curr_break = $break; } } - print $ofh1 $curr_line."\n"; - close ($ifh1); + $curr_pos = $curr_annotation->{'pos_end'}; + $curr_exon_start = $curr_annotation->{'feature_start'}; + $curr_exon_end = $curr_annotation->{'feature_end'}; + + if($curr_distance <= 10){ + print $ofh1 
$curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\t".$curr_annotation->{'feature'}."\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tmid-exon\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + else{ + # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic + my $gene_start = $gene_info{$curr_annotation->{'star_genename'}}{'feature_start'}; + my $gene_end = $gene_info{$curr_annotation->{'star_genename'}}{'feature_end'}; + my $break_pos = $curr_annotation->{'pos_end'}; + if($break_pos >= $gene_start && $break_pos <= $gene_end){ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tintronic\t-\t-\t-\t-\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + } + close ($ifh2); close ($ofh1); $curr_distance = 10000000; - $curr_line = ""; $curr_break = ""; # Process the second annotation file open(my $ofh2, '>', $final_annot_file2) or die "Could not open file $final_annot_file2 $!"; - open (my $ifh2, $annot_file2) or die "Could not open file '$annot_file2' $!"; - while (<$ifh2>) { + open (my $ifh3, $annot_file2) or die "Could not open file '$annot_file2' $!"; + while (<$ifh3>) { chomp; my $line = $_; - my @fields = split "\t", $line; - my $break = $fields[3].$fields[4]; + my $annotation = parse_annotation($line); + next if($annotation->{'gene_name'} ne $annotation->{'star_genename'}); + my $break = $annotation->{'breakpoint'}; - if($break ne $curr_break){ - print $ofh2 $curr_line."\n" unless($curr_break eq ""); - $curr_break = $break; - $curr_distance = 10000000; - } - my $pos = $fields[2]; - my $exon_start = $fields[8]; - my $exon_end = $fields[9]; + if($break ne $curr_break){ + unless($curr_break eq ""){ + my $curr_pos = $curr_annotation->{'pos_end'}; + my $curr_exon_start = $curr_annotation->{'feature_start'}; + my $curr_exon_end = $curr_annotation->{'feature_end'}; + + if($curr_distance <= 10){ + print $ofh2 
$curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\t".$curr_annotation->{'feature'}."\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ + print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tmid-exon\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + else{ + # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic + my $gene_start = $gene_info{$curr_annotation->{'star_genename'}}{'feature_start'}; + my $gene_end = $gene_info{$curr_annotation->{'star_genename'}}{'feature_end'}; + my $break_pos = $curr_annotation->{'pos_end'}; + if($break_pos >= $gene_start && $break_pos <= $gene_end){ + print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tintronic\t-\t-\t-\t-\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + } + $curr_distance = 10000000; + $curr_break = $break; + } + } + + my $pos = $annotation->{'pos_end'}; + my $exon_start = $annotation->{'feature_start'}; + my $exon_end = $annotation->{'feature_end'}; my $distance = find_closest_boundary($pos, $exon_start, $exon_end); if($distance < $curr_distance){ $curr_distance = $distance; - $curr_line = $line; + $curr_annotation = $annotation; + $curr_break = $break; } - + } + $curr_pos = $curr_annotation->{'pos_end'}; + $curr_exon_start = $curr_annotation->{'feature_start'}; + $curr_exon_end = $curr_annotation->{'feature_end'}; + + if($curr_distance <= 10){ + print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\t".$curr_annotation->{'feature'}."\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ + print $ofh2 
$curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tmid-exon\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } + else{ + # The breakpoint doesn't fall within 10bp of an exon boundary. We need to check it falls within the footprint of the star gene and, for now, print it as intronic + my $gene_start = $gene_info{$curr_annotation->{'star_genename'}}{'feature_start'}; + my $gene_end = $gene_info{$curr_annotation->{'star_genename'}}{'feature_end'}; + my $break_pos = $curr_annotation->{'pos_end'}; + if($break_pos >= $gene_start && $break_pos <= $gene_end){ + print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tintronic\t-\t-\t-\t-\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + } } - print $ofh2 $curr_line."\n"; - close ($ifh2); + close ($ifh3); close ($ofh2); + + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return 1; } From ca55655f40039ff3542d6c8ab6ba3c4fd0d404cf Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 12 Oct 2015 11:37:16 +0100 Subject: [PATCH 02/40] first draft of adding longest transcript to annotation --- perl/bin/compare_overlapping_fusions.pl | 3 +- .../Sanger/CGP/CompareFusions/Implement.pm | 140 ++++++++++++++++++ 2 files changed, 142 insertions(+), 1 deletion(-) diff --git a/perl/bin/compare_overlapping_fusions.pl b/perl/bin/compare_overlapping_fusions.pl index 1dac0d2..46f2257 100755 --- a/perl/bin/compare_overlapping_fusions.pl +++ 
b/perl/bin/compare_overlapping_fusions.pl @@ -79,10 +79,11 @@ BEGIN Sanger::CGP::CompareFusions::Implement::create_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'createbed'); Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); + Sanger::CGP::CompareFusions::Implement::find_longest_transcript($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selecttranscript'); if(!exists $options->{'process'} || $options->{'process'} eq 'output') { Sanger::CGP::CompareFusions::Implement::generate_output($options); - cleanup($options); + #cleanup($options); } } diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index 6ceb949..6fec4bb 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -132,6 +132,7 @@ sub annotate_bed { my $sample = $options->{'sample'}; my $exon_gtf = filter_gtf($options->{'gtf'}, $tmp, 'exon'); my $gene_gtf = filter_gtf($options->{'gtf'}, $tmp, 'gene'); + my $transcript_gtf = filter_gtf($options->{'gtf'}, $tmp, 'transcript'); my $break1_file; my $break2_file; @@ -404,6 +405,122 @@ sub find_closest_boundary { return $distance; } +sub find_longest_transcript { + my $options = shift; + + my $tmp = $options->{'tmp'}; + + my $sample = $options->{'sample'}; + my $annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann"); + my $annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann"); + my $final_annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann_final"); + my $final_annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann_final"); + my $transcript_list1 = File::Spec->catfile($tmp, "$sample.1.transcript"); + my $transcript_list2 = File::Spec->catfile($tmp, 
"$sample.2.transcript"); + my $transcript_gtf = File::Spec->catfile($tmp, "filtered_transcript.gtf"); + + open (my $ifh1, $final_annot_file1) or die "Could not open file '$final_annot_file1' $!"; + open (my $ofh1, '>', $transcript_list1) or die "Could not open file $transcript_list1 $!"; + while (<$ifh1>){ + chomp; + my $line = $_; + my @fields = split "\t"; + my $breakpoint_id = $fields[0]; + my $exon_id = $fields[6]; + open (my $ifh2, $annot_file1) or die "Could not open file '$annot_file1' $!"; + while (<$ifh2>){ + chomp; + my $line2 = $_; + my $annotation = parse_annotation($line2); + if($exon_id eq $annotation->{'exon_id'}){ + print $ofh1 "$breakpoint_id $exon_id $annotation->{'transcript_id'}\n"; + } + } + close($ifh2); + } + close($ofh1); + close($ifh1); + + open (my $ifh3, $final_annot_file2) or die "Could not open file '$final_annot_file2' $!"; + open (my $ofh2, '>', $transcript_list2) or die "Could not open file $transcript_list2 $!"; + while (<$ifh3>){ + chomp; + my $line = $_; + my @fields = split "\t"; + my $breakpoint_id = $fields[0]; + my $exon_id = $fields[6]; + open (my $ifh4, $annot_file2) or die "Could not open file '$annot_file2' $!"; + while (<$ifh4>){ + chomp; + my $line2 = $_; + my $annotation = parse_annotation($line2); + if($exon_id eq $annotation->{'exon_id'}){ + print $ofh2 "$breakpoint_id $exon_id $annotation->{'transcript_id'}\n"; + } + } + close($ifh4); + } + close($ofh2); + close($ifh3); + + my %transcript_info; + + open (my $ifh5, $transcript_gtf) or die "Could not open file '$transcript_gtf' $!"; + while (<$ifh5>) { + chomp; + my $line = $_; + my $transcript_annot = parse_transcript($line); + if(!exists $transcript_info{$transcript_annot->{'transcript_id'}}){ + $transcript_info{$transcript_annot->{'transcript_id'}}{'feature_start'} = $transcript_annot->{'feature_start'}; + $transcript_info{$transcript_annot->{'transcript_id'}}{'feature_end'} = $transcript_annot->{'feature_end'}; + } + } + close ($ifh5); + + + open (my $ifh6, 
$transcript_list1) or die "Could not open file '$transcript_list1' $!"; + open (my $ofh3, '>', $transcript_list2) or die "Could not open file $transcript_list2 $!"; + while (<$ifh6>){ + chomp; + my $line = $_; + my @fields = split " "; + my $breakpoint_id = $fields[0]; + my $exon_id = $fields[1]; + my $transcript_id = $fields[2]; + if(exists $transcript_info{$transcript_id}){ + my $start = $transcript_info{$transcript_id}{'feature_start'}; + my $end = $transcript_info{$transcript_id}{'feature_end'}; + my $transcript_length = $end - $start; + #print "$breakpoint_id $exon_id $transcript_id $transcript_info{$transcript_id}{'feature_start'} $transcript_info{$transcript_id}{'feature_end'} $transcript_length\n"; + } + } + close($ofh3); + close($ifh6); + + open (my $ifh7, $transcript_list2) or die "Could not open file '$transcript_list2' $!"; + open (my $ofh4, '>', $transcript_list2) or die "Could not open file $transcript_list2 $!"; + while (<$ifh7>){ +print "In file 7\n"; + chomp; + my $line = $_; + my @fields = split " "; + my $breakpoint_id = $fields[0]; + my $exon_id = $fields[1]; + my $transcript_id = $fields[2]; + if(exists $transcript_info{$transcript_id}){ + + my $start = $transcript_info{$transcript_id}{'feature_start'}; + my $end = $transcript_info{$transcript_id}{'feature_end'}; + my $transcript_length = $end - $start; + print "$breakpoint_id $exon_id $transcript_id $transcript_info{$transcript_id}{'feature_start'} $transcript_info{$transcript_id}{'feature_end'} $transcript_length\n"; + } + } + close($ofh4); + close($ifh7); + + return 1; +} + sub generate_output { my $options = shift; @@ -625,6 +742,29 @@ sub parse_overlap { return $fusion; } +sub parse_transcript { + my $line = shift; + $line =~ s/"//g; + + my %transcript_annotation; + my @fields = split "\t", $line; + + $transcript_annotation{'chr'} = $fields[0]; + $transcript_annotation{'strand'} = $fields[6]; + $transcript_annotation{'feature'} = $fields[2]; + $transcript_annotation{'feature_start'} = 
$fields[3]; + $transcript_annotation{'feature_end'} = $fields[4]; + + my $annot_column = scalar @fields; + my @annot_fields = split /; /, $fields[$annot_column-1]; + foreach my $item(@annot_fields) { + my ($type,$value)= split / /, $item; + $transcript_annotation{$type} = $value; + } + + return \%transcript_annotation; +} + sub run_bed_pairtopair { my $options = shift; From f8163b73ad965407e334e213fd135cbf550877c1 Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 19 Oct 2015 11:16:30 +0100 Subject: [PATCH 03/40] Committing latest changes for selecting the largest transcript based on a selected exon --- perl/bin/compare_overlapping_fusions.pl | 3 +- .../Sanger/CGP/CompareFusions/Implement.pm | 240 ++++++++++-------- 2 files changed, 140 insertions(+), 103 deletions(-) diff --git a/perl/bin/compare_overlapping_fusions.pl b/perl/bin/compare_overlapping_fusions.pl index 46f2257..06ef72d 100755 --- a/perl/bin/compare_overlapping_fusions.pl +++ b/perl/bin/compare_overlapping_fusions.pl @@ -93,7 +93,7 @@ sub cleanup { my $sample = $options->{'sample'}; move(File::Spec->catfile($tmpdir, "$sample.star-defuse.overlapping.fusions.txt"), $options->{'outdir'}) || die $!; move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!; - remove_tree $tmpdir if(-e $tmpdir); + #remove_tree $tmpdir if(-e $tmpdir); return 0; } @@ -110,6 +110,7 @@ sub setup { 't|threads=i' => \$opts{'threads'}, 'p|process=s' => \$opts{'process'}, 'i|index=i' => \$opts{'index'}, + 'c|cache=s' => \$opts{'cache'}, ) or pod2usage(2); pod2usage(-verbose => 1) if(defined $opts{'h'}); diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index 6fec4bb..d1efedd 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -44,13 +44,16 @@ use PCAP::Cli; use PCAP::Threaded; use Sanger::CGP::CompareFusions::FusionAnnotation; use Sanger::CGP::CgpRna; +use 
Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource; +use Sanger::CGP::Vagrent::Data::Exon; +use Sanger::CGP::Vagrent::Data::Transcript; use Data::Dumper; const my $BEDTOOLS_CLOSEST => q{ closest -s -a %s -b %s | sort -k4,4 > %s}; const my $BEDTOOLS_PAIRTOPAIR => q{ pairtopair -a %s -b %s -slop 5 > %s}; -const my $OUTPUT_HEADER => "sample\tstar_breakpoint\tdefuse_breakpoint\tfusion_name\tdefuse_splitr_count\tdefuse_span_count\tstar_JunctionReads\tstar_SpanningFrags\tLeftGene\tLeftGeneId\tLeftChr\tLeftPos\tLeftStrand\tLeftDistFromRefExonSplice\tRightGene\tRightGeneId\tRightChr\tRightPos\tRightStrand\tRightDistFromRefExonSplice\tbreak1_feature\texon1_id\texon1_num\texon1_start\texon1_end\tbreak2_feature\texon2_id\texon2_num\texon2_start\texon2_end\ttranscript1_id\tgene1_biotype\ttranscript2_id\tgene2_biotype\tdefuse_cluster_id\tdefuse_splitr_sequence\n"; +const my $OUTPUT_HEADER => "sample\tstar_breakpoint\tdefuse_breakpoint\tfusion_name\tdefuse_splitr_count\tdefuse_span_count\tstar_JunctionReads\tstar_SpanningFrags\tLeftGene\tLeftGeneId\tLeftChr\tLeftPos\tLeftStrand\tLeftDistFromRefExonSplice\tRightGene\tRightGeneId\tRightChr\tRightPos\tRightStrand\tRightDistFromRefExonSplice\tbreak1_feature\texon1_id\texon1_num\texon1_start\texon1_end\tbreak2_feature\texon2_id\texon2_num\texon2_start\texon2_end\ttranscript1_id\ttranscript1_src\tgene1_biotype\ttranscript2_id\ttranscript2_src\tgene2_biotype\tdefuse_cluster_id\tdefuse_splitr_sequence\n"; my %ALLOWED_BIOTYPES = ( antisense => 1, @@ -162,6 +165,30 @@ sub annotate_bed { return 1; } +sub annotationSort { + my $a_ccds = 0; + my $b_ccds = 0; + if(defined($a->getCCDS) && $a->getCCDS ne ''){ + $a_ccds = 1; + } + if(defined($b->getCCDS) && $b->getCCDS ne ''){ + $b_ccds = 1; + } + my $ccds_cmp = $b_ccds <=> $a_ccds; + if($ccds_cmp == 0){ + my $a_cds_len = $a->getCdsLength; + my $b_cds_len = $a->getCdsLength;; + my $cds_len_cmp = $b_cds_len <=> $a_cds_len; + if($cds_len_cmp == 0){ + return $b->getmRNALength <=> 
$a->getmRNALength; + } else { + return $cds_len_cmp; + } + } else { + return $ccds_cmp; + } +} + sub check_gene_boundaries { my $gene_line = shift; @@ -411,6 +438,7 @@ sub find_longest_transcript { my $tmp = $options->{'tmp'}; my $sample = $options->{'sample'}; + my $annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann"); my $annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann"); my $final_annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann_final"); @@ -419,105 +447,112 @@ sub find_longest_transcript { my $transcript_list2 = File::Spec->catfile($tmp, "$sample.2.transcript"); my $transcript_gtf = File::Spec->catfile($tmp, "filtered_transcript.gtf"); + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => $options->{'cache'}); + + # Parse the annotation files so that we have all of the transcripts listed for an exon + open(my $ofh1, '>', $transcript_list1) or die "Could not open file $transcript_list1 $!"; open (my $ifh1, $final_annot_file1) or die "Could not open file '$final_annot_file1' $!"; - open (my $ofh1, '>', $transcript_list1) or die "Could not open file $transcript_list1 $!"; while (<$ifh1>){ chomp; my $line = $_; - my @fields = split "\t"; - my $breakpoint_id = $fields[0]; - my $exon_id = $fields[6]; - open (my $ifh2, $annot_file1) or die "Could not open file '$annot_file1' $!"; - while (<$ifh2>){ - chomp; - my $line2 = $_; - my $annotation = parse_annotation($line2); - if($exon_id eq $annotation->{'exon_id'}){ - print $ofh1 "$breakpoint_id $exon_id $annotation->{'transcript_id'}\n"; - } - } - close($ifh2); + my @fields = split "\t", $line; + my $break_id = $fields[0]; + $break_id =~ m/^([0-9|X|Y|MT]+):[0-9]+-[0-9|X|Y|MT]+:[0-9]+/; + my $chr = $1; + my $exon_id = $fields[6]; + my $exon_start = $fields[8]; + my $exon_end = $fields[9]; + my $exon = Sanger::CGP::Vagrent::Data::Exon->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $chr, 'minpos' => $exon_start, 'maxpos' => $exon_end, 'id' => 
$exon_id, 'rnaminpos' => $exon_start, 'rnamaxpos' => $exon_end); + my @trans = $ts->getTranscripts($exon); + + my @filteredTrans; + + foreach my $t(@trans){ + push(@filteredTrans, $t) if ($exon_start >= $t->getGenomicMinPos && $exon_end <= $t->getGenomicMaxPos); + } + my @sortedTrans = sort{&annotationSort} @filteredTrans; + + + if(defined $sortedTrans[0]){ + my $exon_number; + my $transcript_id; + my $num_transcripts = scalar @sortedTrans; + for (my $x=0;$x<$num_transcripts; $x++){ + my @exons = $sortedTrans[$x]->getExons; + my $num_exons = scalar @exons; + for (my $y=0;$y<$num_exons; $y++){ + my $e = $exons[$y]; + if($exon_start == $e->getMinPos && $exon_end == $e->getMaxPos){ + $transcript_id = $sortedTrans[$x]->getAccession; + $exon_number = $y+1; + last; + } + } + last if(defined $transcript_id); + } + $fields[7] = $exon_number; + $fields[10] = $transcript_id."\tVAGrENT"; + } + else{ + $fields[10] = $fields[10]."\tGTF"; + } + print $ofh1 join("\t",@fields)."\n"; } - close($ofh1); close($ifh1); - - open (my $ifh3, $final_annot_file2) or die "Could not open file '$final_annot_file2' $!"; - open (my $ofh2, '>', $transcript_list2) or die "Could not open file $transcript_list2 $!"; - while (<$ifh3>){ + close($ofh1); + + open(my $ofh2, '>', $transcript_list2) or die "Could not open file $transcript_list2 $!"; + open (my $ifh2, $final_annot_file2) or die "Could not open file '$final_annot_file2' $!"; + while (<$ifh2>){ chomp; my $line = $_; - my @fields = split "\t"; - my $breakpoint_id = $fields[0]; - my $exon_id = $fields[6]; - open (my $ifh4, $annot_file2) or die "Could not open file '$annot_file2' $!"; - while (<$ifh4>){ - chomp; - my $line2 = $_; - my $annotation = parse_annotation($line2); - if($exon_id eq $annotation->{'exon_id'}){ - print $ofh2 "$breakpoint_id $exon_id $annotation->{'transcript_id'}\n"; - } - } - close($ifh4); - } - close($ofh2); - close($ifh3); - - my %transcript_info; + my @fields = split "\t", $line; + my $break_id = $fields[0]; + 
$break_id =~ m/^[0-9|X|Y|MT]+:[0-9]+-([0-9|X|Y|MT]+):[0-9]+/; + my $chr = $1; + my $exon_id = $fields[6]; + my $exon_start = $fields[8]; + my $exon_end = $fields[9]; + my $exon = Sanger::CGP::Vagrent::Data::Exon->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $chr, 'minpos' => $exon_start, 'maxpos' => $exon_end, 'id' => $exon_id, 'rnaminpos' => $exon_start, 'rnamaxpos' => $exon_end); + my @trans = $ts->getTranscripts($exon); + + my @filteredTrans; - open (my $ifh5, $transcript_gtf) or die "Could not open file '$transcript_gtf' $!"; - while (<$ifh5>) { - chomp; - my $line = $_; - my $transcript_annot = parse_transcript($line); - if(!exists $transcript_info{$transcript_annot->{'transcript_id'}}){ - $transcript_info{$transcript_annot->{'transcript_id'}}{'feature_start'} = $transcript_annot->{'feature_start'}; - $transcript_info{$transcript_annot->{'transcript_id'}}{'feature_end'} = $transcript_annot->{'feature_end'}; - } - } - close ($ifh5); - - - open (my $ifh6, $transcript_list1) or die "Could not open file '$transcript_list1' $!"; - open (my $ofh3, '>', $transcript_list2) or die "Could not open file $transcript_list2 $!"; - while (<$ifh6>){ - chomp; - my $line = $_; - my @fields = split " "; - my $breakpoint_id = $fields[0]; - my $exon_id = $fields[1]; - my $transcript_id = $fields[2]; - if(exists $transcript_info{$transcript_id}){ - my $start = $transcript_info{$transcript_id}{'feature_start'}; - my $end = $transcript_info{$transcript_id}{'feature_end'}; - my $transcript_length = $end - $start; - #print "$breakpoint_id $exon_id $transcript_id $transcript_info{$transcript_id}{'feature_start'} $transcript_info{$transcript_id}{'feature_end'} $transcript_length\n"; + foreach my $t(@trans){ + push(@filteredTrans, $t) if ($exon_start >= $t->getGenomicMinPos && $exon_end <= $t->getGenomicMaxPos); } - } - close($ofh3); - close($ifh6); - - open (my $ifh7, $transcript_list2) or die "Could not open file '$transcript_list2' $!"; - open (my $ofh4, '>', 
$transcript_list2) or die "Could not open file $transcript_list2 $!"; - while (<$ifh7>){ -print "In file 7\n"; - chomp; - my $line = $_; - my @fields = split " "; - my $breakpoint_id = $fields[0]; - my $exon_id = $fields[1]; - my $transcript_id = $fields[2]; - if(exists $transcript_info{$transcript_id}){ - - my $start = $transcript_info{$transcript_id}{'feature_start'}; - my $end = $transcript_info{$transcript_id}{'feature_end'}; - my $transcript_length = $end - $start; - print "$breakpoint_id $exon_id $transcript_id $transcript_info{$transcript_id}{'feature_start'} $transcript_info{$transcript_id}{'feature_end'} $transcript_length\n"; + my @sortedTrans = sort{&annotationSort} @filteredTrans; + + + if(defined $sortedTrans[0]){ + my $exon_number; + my $transcript_id; + my $num_transcripts = scalar @sortedTrans; + for (my $x=0;$x<$num_transcripts; $x++){ + my @exons = $sortedTrans[$x]->getExons; + my $num_exons = scalar @exons; + for (my $y=0;$y<$num_exons; $y++){ + my $e = $exons[$y]; + if($exon_start == $e->getMinPos && $exon_end == $e->getMaxPos){ + $transcript_id = $sortedTrans[$x]->getAccession; + $exon_number = $y+1; + last; + } + } + last if(defined $transcript_id); + } + $fields[7] = $exon_number; + $fields[10] = $transcript_id."\tVAGrENT"; } + else{ + $fields[10] = $fields[10]."\tGTF"; + } + print $ofh2 join("\t",@fields)."\n"; } - close($ofh4); - close($ifh7); - + close($ifh2); + close($ofh2); + + return 1; } @@ -577,8 +612,8 @@ sub generate_output { } close ($ifh2); - my $annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann_final"); - my $annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann_final"); + my $annot_file1 = File::Spec->catfile($tmp, "$sample.1.transcript"); + my $annot_file2 = File::Spec->catfile($tmp, "$sample.2.transcript"); my $output_file = File::Spec->catfile($tmp, "$sample.star-defuse.overlapping.fusions.txt"); my %break1; @@ -625,6 +660,8 @@ sub generate_output { my $exon2_end = $break2{$brk}->{'exon_end'}; my $transcript1_id = 
$break1{$brk}->{'transcript_id'}; my $transcript2_id = $break2{$brk}->{'transcript_id'}; + my $transcript1_src = $break1{$brk}->{'transcript_src'}; + my $transcript2_src = $break2{$brk}->{'transcript_src'}; my $biotype1 = $break1{$brk}->{'gene_biotype'}; my $biotype2 = $break2{$brk}->{'gene_biotype'}; my $defuse_breakpoint = $defuse_data{$alt_breakpoint}{'breakpoint'}; @@ -632,7 +669,7 @@ sub generate_output { my $defuse_split_reads = $defuse_data{$alt_breakpoint}{'split_reads'}; my $defuse_span_reads = $defuse_data{$alt_breakpoint}{'span_reads'}; my $defuse_sequence = $defuse_data{$alt_breakpoint}{'sequence'}; - print $ofh1 "$sample\t$breakpoint\t$defuse_breakpoint\t$star_fusion_name\t$defuse_split_reads\t$defuse_span_reads\t$star_data\t$feature1\t$exon1_id\t$exon1_number\t$exon1_start\t$exon1_end\t$feature2\t$exon2_id\t$exon2_number\t$exon2_start\t$exon2_end\t$transcript1_id\t$biotype1\t$transcript2_id\t$biotype2\t$defuse_cluster_id\t$defuse_sequence\n"; + print $ofh1 "$sample\t$breakpoint\t$defuse_breakpoint\t$star_fusion_name\t$defuse_split_reads\t$defuse_span_reads\t$star_data\t$feature1\t$exon1_id\t$exon1_number\t$exon1_start\t$exon1_end\t$feature2\t$exon2_id\t$exon2_number\t$exon2_start\t$exon2_end\t$transcript1_id\t$transcript1_src\t$biotype1\t$transcript2_id\t$transcript2_src\t$biotype2\t$defuse_cluster_id\t$defuse_sequence\n"; } } close($ofh1); @@ -687,7 +724,8 @@ sub parse_break_data { $break{'exon_start'} = $fields[8]; $break{'exon_end'} = $fields[9]; $break{'transcript_id'} = $fields[10]; - $break{'gene_biotype'} = $fields[11]; + $break{'transcript_src'} = $fields[11]; + $break{'gene_biotype'} = $fields[12]; return \%break; } @@ -744,25 +782,23 @@ sub parse_overlap { sub parse_transcript { my $line = shift; - $line =~ s/"//g; - my %transcript_annotation; + my %transcript; my @fields = split "\t", $line; - $transcript_annotation{'chr'} = $fields[0]; - $transcript_annotation{'strand'} = $fields[6]; - $transcript_annotation{'feature'} = $fields[2]; - 
$transcript_annotation{'feature_start'} = $fields[3]; - $transcript_annotation{'feature_end'} = $fields[4]; + $transcript{'chr'} = $fields[0]; + $transcript{'start'} = $fields[3]; + $transcript{'end'} = $fields[4]; + $transcript{'strand'} = $fields[6]; my $annot_column = scalar @fields; my @annot_fields = split /; /, $fields[$annot_column-1]; + foreach my $item(@annot_fields) { my ($type,$value)= split / /, $item; - $transcript_annotation{$type} = $value; + $transcript{$type} = $value; } - - return \%transcript_annotation; + return \%transcript; } sub run_bed_pairtopair { From 0ca86f7edebea4b0d743181d00c910ad62dd69a2 Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 23 Oct 2015 10:50:22 +0100 Subject: [PATCH 04/40] Latest version of the comparison code to annotate by longest transcript first then exon number --- perl/bin/compare_overlapping_fusions.pl | 20 +- .../CGP/CompareFusions/FusionAnnotation.pm | 43 +- .../Sanger/CGP/CompareFusions/Implement.pm | 735 ++++++++---------- 3 files changed, 359 insertions(+), 439 deletions(-) diff --git a/perl/bin/compare_overlapping_fusions.pl b/perl/bin/compare_overlapping_fusions.pl index 06ef72d..0ebdb44 100755 --- a/perl/bin/compare_overlapping_fusions.pl +++ b/perl/bin/compare_overlapping_fusions.pl @@ -60,10 +60,11 @@ BEGIN const my @REQUIRED_PARAMS => qw(outdir sample gtf); const my @VALID_PROCESS => qw(createjunctionbed runbedpairtopair createbed annotatebed selectannotation output); const my %INDEX_FACTOR => ( 'createjunctionbed' => -1, - 'runbedpairtopair' => -1, - 'createbed' => -1, + 'runbedpairtopair' => 1, + 'queryvagrent' => -1, 'annotatebed' => -1, 'selectannotation' => 1, + 'collateannotation' => 1, 'output' => 1); { my $options = setup(); @@ -76,14 +77,15 @@ BEGIN $threads->run($options->{'num'}, 'createjunctionbed', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'createjunctionbed'); Sanger::CGP::CompareFusions::Implement::run_bed_pairtopair($options) if(!exists $options->{'process'} || 
$options->{'process'} eq 'runbedpairtopair'); - Sanger::CGP::CompareFusions::Implement::create_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'createbed'); - Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); - Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); - Sanger::CGP::CompareFusions::Implement::find_longest_transcript($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selecttranscript'); - + Sanger::CGP::CompareFusions::Implement::query_vagrent($options) if(!exists $options->{'process'} || $options->{'process'} eq 'queryvagrent'); + if(!-s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}."1.bed") || !-s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}."2.bed")){ + Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); + Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); + Sanger::CGP::CompareFusions::Implement::collate_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'collateannotation'); + } if(!exists $options->{'process'} || $options->{'process'} eq 'output') { Sanger::CGP::CompareFusions::Implement::generate_output($options); - #cleanup($options); + cleanup($options); } } @@ -93,7 +95,7 @@ sub cleanup { my $sample = $options->{'sample'}; move(File::Spec->catfile($tmpdir, "$sample.star-defuse.overlapping.fusions.txt"), $options->{'outdir'}) || die $!; move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!; - #remove_tree $tmpdir if(-e $tmpdir); + remove_tree $tmpdir if(-e $tmpdir); return 0; } diff --git 
a/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm b/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm index 64dc939..f311a2a 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm @@ -57,6 +57,7 @@ sub new { if ($args{-gene1_id}) { $self->gene1_id($args{-gene1_id}) } if ($args{-gene1_start}) { $self->gene1_start($args{-gene1_start}) } if ($args{-gene1_end}) { $self->gene1_end($args{-gene1_end}) } + if ($args{-gene1_biotype}) { $self->gene1_biotype($args{-gene1_biotype}) } if ($args{-transcript1_id}) { $self->transcript1_id($args{-transcript1_id}) } if ($args{-distance1}) { $self->distance1($args{-distance1}) } if ($args{-chr2}) { $self->chr2($args{-chr2}) } @@ -72,6 +73,7 @@ sub new { if ($args{-gene2_id}) { $self->gene2_id($args{-gene2_id}) } if ($args{-gene2_start}) { $self->gene2_start($args{-gene2_start}) } if ($args{-gene2_end}) { $self->gene2_end($args{-gene2_end}) } + if ($args{-gene2_biotype}) { $self->gene2_biotype($args{-gene2_biotype}) } if ($args{-transcript2_id}) { $self->transcript2_id($args{-transcript2_id}) } if ($args{-distance2}) { $self->distance2($args{-distance2}) } @@ -222,6 +224,18 @@ sub gene2_end { return($self->{gene2_end}); } +sub gene1_biotype { + my $self = shift; + $self->{gene1_biotype} = shift if @_; + return($self->{gene1_biotype}); +} + +sub gene2_biotype { + my $self = shift; + $self->{gene2_biotype} = shift if @_; + return($self->{gene2_biotype}); +} + sub exon1_num { my $self = shift; $self->{exon1_num} = shift if @_; @@ -270,6 +284,15 @@ sub distance2 { return($self->{distance2}); } +sub format_bed_line { + my ($self, $breaknum) = @_; + + my @fields = ($self->{'chr'.$breaknum},$self->{'pos'.$breaknum.'_start'},$self->{'pos'.$breaknum.'_end'},$self->breakpoint,$self->alt_breakpoint,$self->{'strand'.$breaknum},$self->{'gene'.$breaknum},$self->{'gene'.$breaknum.'_id'}); + + my $formatted_line = join("\t", @fields); + return $formatted_line; +} + 
sub format_bedpe_line { my ($self, $type) = @_; @@ -289,4 +312,22 @@ sub format_bedpe_line { my $bedpe_line = join("\t", @pe_fields); return $bedpe_line; -} \ No newline at end of file +} + +sub format_annotation_line { + my ($self,$annot_source) = @_; + my @fields = ($self->{'breakpoint'},$self->{'alt_breakpoint'},$self->{'gene1'},$self->{'gene1_id'},$self->{'transcript1_id'},$annot_source,$self->{'exon1_num'},$self->{'feature1_start'},$self->{'feature1_end'},$self->{'gene2'},$self->{'gene2_id'},$self->{'transcript2_id'},$annot_source,$self->{'exon2_num'},$self->{'feature2_start'},$self->{'feature2_end'}); + + my $formatted_line = join("\t", @fields); + return $formatted_line; +} + +sub format_break_line { + my ($self, $breaknum, $annot_source) = @_; + + my @fields = ($self->{'breakpoint'},$self->{'alt_breakpoint'},$self->{'gene'.$breaknum},$self->{'gene'.$breaknum.'_id'},$self->{'transcript'.$breaknum.'_id'},$annot_source,$self->{'exon'.$breaknum.'_num'},$self->{'feature'.$breaknum.'_start'},$self->{'feature'.$breaknum.'_end'}); + + my $formatted_line = join("\t", @fields); + return $formatted_line; +} + diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index d1efedd..e57491f 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -45,7 +45,7 @@ use PCAP::Threaded; use Sanger::CGP::CompareFusions::FusionAnnotation; use Sanger::CGP::CgpRna; use Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource; -use Sanger::CGP::Vagrent::Data::Exon; +use Sanger::CGP::Vagrent::Data::GenomicRegion; use Sanger::CGP::Vagrent::Data::Transcript; use Data::Dumper; @@ -53,7 +53,7 @@ use Data::Dumper; const my $BEDTOOLS_CLOSEST => q{ closest -s -a %s -b %s | sort -k4,4 > %s}; const my $BEDTOOLS_PAIRTOPAIR => q{ pairtopair -a %s -b %s -slop 5 > %s}; -const my $OUTPUT_HEADER => 
"sample\tstar_breakpoint\tdefuse_breakpoint\tfusion_name\tdefuse_splitr_count\tdefuse_span_count\tstar_JunctionReads\tstar_SpanningFrags\tLeftGene\tLeftGeneId\tLeftChr\tLeftPos\tLeftStrand\tLeftDistFromRefExonSplice\tRightGene\tRightGeneId\tRightChr\tRightPos\tRightStrand\tRightDistFromRefExonSplice\tbreak1_feature\texon1_id\texon1_num\texon1_start\texon1_end\tbreak2_feature\texon2_id\texon2_num\texon2_start\texon2_end\ttranscript1_id\ttranscript1_src\tgene1_biotype\ttranscript2_id\ttranscript2_src\tgene2_biotype\tdefuse_cluster_id\tdefuse_splitr_sequence\n"; +const my $OUTPUT_HEADER => "sample\tfusion_name\tstar_junction\tdefuse_junction\tdefuse_splitr_count\tdefuse_span_count\tstar_JunctionReads\tstar_SpanningFrags\tLeftGene\tLeftGeneId\tLeftChr\tLeftPos\tLeftStrand\tLeftDistFromRefExonSplice\tRightGene\tRightGeneId\tRightChr\tRightPos\tRightStrand\tRightDistFromRefExonSplice\ttranscript1_id\ttranscript1_src\texon1_num\texon1_start\texon1_end\ttranscript2_id\ttranscript2_src\texon2_num\texon2_start\texon2_end\tdefuse_cluster_id\tdefuse_splitr_sequence\n"; my %ALLOWED_BIOTYPES = ( antisense => 1, @@ -115,17 +115,6 @@ const my $STAR_GENENAME2 => 11; const my $STAR_GENEID2 => 12; const my $STAR_HEADER_PATTERN => 'fusion_name'; -# Position of the columns in the SOAPfuse output file used to format fusion breakpoint references. 
-const my $SOAP_SPLIT_CHAR => '\t'; -const my $SOAP_CHR1 => 3; -const my $SOAP_POS1 => 5; -const my $SOAP_STRAND1 => 4; -const my $SOAP_CHR2 => 8; -const my $SOAP_POS2 => 10; -const my $SOAP_STRAND2 => 9; -const my $SOAP_BREAKREF => 1; -const my $SOAP_HEADER_PATTERN => 'up_chr'; - sub annotate_bed { my $options = shift; @@ -133,9 +122,8 @@ sub annotate_bed { return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); my $sample = $options->{'sample'}; - my $exon_gtf = filter_gtf($options->{'gtf'}, $tmp, 'exon'); - my $gene_gtf = filter_gtf($options->{'gtf'}, $tmp, 'gene'); - my $transcript_gtf = filter_gtf($options->{'gtf'}, $tmp, 'transcript'); + my $exon_gtf = filter_gtf($options, 'exon'); + my $gene_gtf = filter_gtf($options, 'gene'); my $break1_file; my $break2_file; @@ -165,7 +153,7 @@ sub annotate_bed { return 1; } -sub annotationSort { +sub annotation_sort { my $a_ccds = 0; my $b_ccds = 0; if(defined($a->getCCDS) && $a->getCCDS ne ''){ @@ -177,7 +165,7 @@ sub annotationSort { my $ccds_cmp = $b_ccds <=> $a_ccds; if($ccds_cmp == 0){ my $a_cds_len = $a->getCdsLength; - my $b_cds_len = $a->getCdsLength;; + my $b_cds_len = $b->getCdsLength; my $cds_len_cmp = $b_cds_len <=> $a_cds_len; if($cds_len_cmp == 0){ return $b->getmRNALength <=> $a->getmRNALength; @@ -189,25 +177,6 @@ sub annotationSort { } } -sub check_gene_boundaries { - my $gene_line = shift; - - my @gene_fields = split "\t", $gene_line; - my $return_value = 1; - - $gene_fields[7] =~ m/^.*:([0-9]+)-.*:([0-9]+)/; - my $pos1 = $1; - my $pos2 = $2; - my $gene1_start = $gene_fields[1]; - my $gene1_end = $gene_fields[2]; - my $gene2_start = $gene_fields[4]; - my $gene2_end = $gene_fields[5]; - - $return_value = 0 if(($pos1 < $gene1_start || $pos1 > $gene1_end) || ($pos2 < $gene2_start || $pos2 > $gene2_end)); - - return $return_value; -} - sub check_input { my $fusion_file = shift; @@ -232,10 +201,6 @@ sub check_input { elsif($firstLine =~ m/$STAR_HEADER_PATTERN/){ $source = "star"; 
} - # SOAPfuse check - elsif($firstLine =~ m/$SOAP_HEADER_PATTERN/){ - $source = "soap"; - } else{ die "Unrecognised file type or the file is missing the header record\n"; } @@ -243,74 +208,44 @@ sub check_input { return $source; } -sub create_bed { +sub collate_annotation { my $options = shift; - + my $tmp = $options->{'tmp'}; return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - - my $sample = $options->{'sample'}; - - # There will always be a 1_2 comparison file so deal with that first and build the fusions object. - - # Establish the source of 1 and 2 respectively - my $source1 = $options->{'fusion_files'}->{'1'}->{'format'}; - my $source2 = $options->{'fusion_files'}->{'2'}->{'format'}; - - my $col_set = 1; - my $star_file = $options->{'fusion_files'}->{'1'}->{'name'}; - if($source2 eq 'star'){ - $col_set = 2; - $star_file = $options->{'fusion_files'}->{'2'}->{'name'}; - } - - my $overlap_file1_2; - - opendir(my $dh, $tmp); - while(my $file = readdir $dh) { - $overlap_file1_2 = File::Spec->catfile($tmp, $file) if($file =~ m/^1_2.$sample.bedpe_overlap/); - } - closedir($dh); - my $output1 = File::Spec->catfile($tmp, "$sample.1.bed"); - my $output2 = File::Spec->catfile($tmp, "$sample.2.bed"); + my $sample = $options->{'sample'}; + my $annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann_final"); + my $annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann_final"); + my $annot_file_full = File::Spec->catfile($tmp, "$sample.final"); - my %star_gene_list; - open (my $ifh1, $star_file) or die "Could not open file '$star_file' $!"; + open (my $ifh1, $annot_file1) or die "Could not open file '$annot_file1' $!"; + open(my $ofh1, '>>', $annot_file_full) or die "Could not open file $annot_file_full $!"; while (<$ifh1>) { chomp; my $line = $_; my @fields = split "\t", $line; - my $breakpoint = $fields[0]; - $star_gene_list{$breakpoint}{'gene1_name'} = $fields[4]; - $star_gene_list{$breakpoint}{'gene1_id'} = $fields[5]; - 
$star_gene_list{$breakpoint}{'gene2_name'} = $fields[10]; - $star_gene_list{$breakpoint}{'gene2_id'} = $fields[11]; + my $breakpoint_id = $fields[0]; + open (my $ifh2, $annot_file2) or die "Could not open file '$annot_file2' $!"; + while (<$ifh2>) { + chomp; + my $line2 = $_; + if($line2 =~ m/^$breakpoint_id/){ + my @fields2 = split "\t", $line2; + shift(@fields2); + shift(@fields2); + push(@fields, @fields2); + last; + } + } + close($ifh2); + print $ofh1 join("\t", @fields)."\n"; } - close ($ifh1); - - open (my $ifh2, $overlap_file1_2) or die "Could not open file '$overlap_file1_2' $!"; - open(my $ofh1, '>', $output1) or die "Could not open file '$output1' $!"; - open(my $ofh2, '>', $output2) or die "Could not open file '$output2' $!"; + close($ofh1); + close($ifh1); - while (<$ifh2>) { - chomp; - my $line = $_; - my $fusion = parse_overlap($line, $col_set); - my $gene1_name = $star_gene_list{$fusion->{'breakpoint'}}{'gene1_name'}; - my $gene1_id = $star_gene_list{$fusion->{'breakpoint'}}{'gene1_id'}; - my $gene2_name = $star_gene_list{$fusion->{'breakpoint'}}{'gene2_name'}; - my $gene2_id = $star_gene_list{$fusion->{'breakpoint'}}{'gene2_id'}; - print $ofh1 $fusion->{'chr1'}."\t".$fusion->{'pos1_start'}."\t".$fusion->{'pos1_end'}."\t".$fusion->{'breakpoint'}."_".$fusion->{'strand1'}.$fusion->{'strand2'}."\t".$fusion->{'alt_breakpoint'}."\t".$fusion->{'strand1'}."\t".$gene1_name."\t".$gene1_id."\n"; - print $ofh2 $fusion->{'chr2'}."\t".$fusion->{'pos2_start'}."\t".$fusion->{'pos2_end'}."\t".$fusion->{'breakpoint'}."_".$fusion->{'strand1'}.$fusion->{'strand2'}."\t".$fusion->{'alt_breakpoint'}."\t".$fusion->{'strand2'}."\t".$gene2_name."\t".$gene2_id."\n"; - } - - close ($ifh2); - close ($ofh1); - close ($ofh2); - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); - + return 1; } @@ -388,8 +323,12 @@ sub create_junction_bedpe { } sub filter_gtf { - my ($gtf, $tmp, $feature) = @_; - + my ($options, $feature) = @_; + + my $tmp = $options->{'tmp'}; 
+ + my $gtf = $options->{'gtf'}; + my $filtered_gtf = File::Spec->catfile($tmp, "filtered_$feature.gtf"); unless (-e $filtered_gtf){ @@ -417,7 +356,6 @@ sub filter_gtf { close($ifh); close($ofh); } - return $filtered_gtf; } @@ -432,130 +370,6 @@ sub find_closest_boundary { return $distance; } -sub find_longest_transcript { - my $options = shift; - - my $tmp = $options->{'tmp'}; - - my $sample = $options->{'sample'}; - - my $annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann"); - my $annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann"); - my $final_annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann_final"); - my $final_annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann_final"); - my $transcript_list1 = File::Spec->catfile($tmp, "$sample.1.transcript"); - my $transcript_list2 = File::Spec->catfile($tmp, "$sample.2.transcript"); - my $transcript_gtf = File::Spec->catfile($tmp, "filtered_transcript.gtf"); - - my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => $options->{'cache'}); - - # Parse the annotation files so that we have all of the transcripts listed for an exon - open(my $ofh1, '>', $transcript_list1) or die "Could not open file $transcript_list1 $!"; - open (my $ifh1, $final_annot_file1) or die "Could not open file '$final_annot_file1' $!"; - while (<$ifh1>){ - chomp; - my $line = $_; - my @fields = split "\t", $line; - my $break_id = $fields[0]; - $break_id =~ m/^([0-9|X|Y|MT]+):[0-9]+-[0-9|X|Y|MT]+:[0-9]+/; - my $chr = $1; - my $exon_id = $fields[6]; - my $exon_start = $fields[8]; - my $exon_end = $fields[9]; - my $exon = Sanger::CGP::Vagrent::Data::Exon->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $chr, 'minpos' => $exon_start, 'maxpos' => $exon_end, 'id' => $exon_id, 'rnaminpos' => $exon_start, 'rnamaxpos' => $exon_end); - my @trans = $ts->getTranscripts($exon); - - my @filteredTrans; - - foreach my $t(@trans){ - push(@filteredTrans, $t) if ($exon_start >= $t->getGenomicMinPos && 
$exon_end <= $t->getGenomicMaxPos); - } - my @sortedTrans = sort{&annotationSort} @filteredTrans; - - - if(defined $sortedTrans[0]){ - my $exon_number; - my $transcript_id; - my $num_transcripts = scalar @sortedTrans; - for (my $x=0;$x<$num_transcripts; $x++){ - my @exons = $sortedTrans[$x]->getExons; - my $num_exons = scalar @exons; - for (my $y=0;$y<$num_exons; $y++){ - my $e = $exons[$y]; - if($exon_start == $e->getMinPos && $exon_end == $e->getMaxPos){ - $transcript_id = $sortedTrans[$x]->getAccession; - $exon_number = $y+1; - last; - } - } - last if(defined $transcript_id); - } - $fields[7] = $exon_number; - $fields[10] = $transcript_id."\tVAGrENT"; - } - else{ - $fields[10] = $fields[10]."\tGTF"; - } - print $ofh1 join("\t",@fields)."\n"; - } - close($ifh1); - close($ofh1); - - open(my $ofh2, '>', $transcript_list2) or die "Could not open file $transcript_list2 $!"; - open (my $ifh2, $final_annot_file2) or die "Could not open file '$final_annot_file2' $!"; - while (<$ifh2>){ - chomp; - my $line = $_; - my @fields = split "\t", $line; - my $break_id = $fields[0]; - $break_id =~ m/^[0-9|X|Y|MT]+:[0-9]+-([0-9|X|Y|MT]+):[0-9]+/; - my $chr = $1; - my $exon_id = $fields[6]; - my $exon_start = $fields[8]; - my $exon_end = $fields[9]; - my $exon = Sanger::CGP::Vagrent::Data::Exon->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $chr, 'minpos' => $exon_start, 'maxpos' => $exon_end, 'id' => $exon_id, 'rnaminpos' => $exon_start, 'rnamaxpos' => $exon_end); - my @trans = $ts->getTranscripts($exon); - - my @filteredTrans; - - foreach my $t(@trans){ - push(@filteredTrans, $t) if ($exon_start >= $t->getGenomicMinPos && $exon_end <= $t->getGenomicMaxPos); - } - my @sortedTrans = sort{&annotationSort} @filteredTrans; - - - if(defined $sortedTrans[0]){ - my $exon_number; - my $transcript_id; - my $num_transcripts = scalar @sortedTrans; - for (my $x=0;$x<$num_transcripts; $x++){ - my @exons = $sortedTrans[$x]->getExons; - my $num_exons = scalar @exons; - for (my 
$y=0;$y<$num_exons; $y++){ - my $e = $exons[$y]; - if($exon_start == $e->getMinPos && $exon_end == $e->getMaxPos){ - $transcript_id = $sortedTrans[$x]->getAccession; - $exon_number = $y+1; - last; - } - } - last if(defined $transcript_id); - } - $fields[7] = $exon_number; - $fields[10] = $transcript_id."\tVAGrENT"; - } - else{ - $fields[10] = $fields[10]."\tGTF"; - } - print $ofh2 join("\t",@fields)."\n"; - } - close($ifh2); - close($ofh2); - - - return 1; -} - sub generate_output { my $options = shift; @@ -612,66 +426,50 @@ sub generate_output { } close ($ifh2); - my $annot_file1 = File::Spec->catfile($tmp, "$sample.1.transcript"); - my $annot_file2 = File::Spec->catfile($tmp, "$sample.2.transcript"); + my $annot_file = File::Spec->catfile($tmp, "$sample.final"); my $output_file = File::Spec->catfile($tmp, "$sample.star-defuse.overlapping.fusions.txt"); - - my %break1; - open (my $ifh3, $annot_file1) or die "Could not open file '$annot_file1' $!"; - while (<$ifh3>) { - chomp; - my $line1 = $_; - my $break_annotation1 = parse_break_data($line1); - $line1 =~ m/^(.*:[0-9]+-.*:[0-9]+_[+-]+)/; - my $break_ref = $1; - $break1{$break_ref} = $break_annotation1; - } - close ($ifh3); - - my %break2; - open (my $ifh4, $annot_file2) or die "Could not open file '$annot_file2' $!"; - while (<$ifh4>) { - chomp; - my $line2 = $_; - my $break_annotation2 = parse_break_data($line2); - $line2 =~ m/^(.*:[0-9]+-.*:[0-9]+_[+-]+)/; - my $break_ref = $1; - $break2{$break_ref} = $break_annotation2; - } - close ($ifh4); - open(my $ofh1, '>', $output_file) or die "Could not open file $output_file $!"; + + if(-s $annot_file){ + open(my $ifh3, $annot_file) or die "Could not open file $annot_file $!"; print $ofh1 $OUTPUT_HEADER; - for my $brk (keys %break1){ - if(exists $break2{$brk}){ - my $breakpoint = $break1{$brk}->{'breakpoint'}; - my $alt_breakpoint = $break1{$brk}->{'alt_breakpoint'}; + while(<$ifh3>){ + chomp; + my $line = $_; + my @fields = split "\t", $line; + my $breakpoint = 
$fields[0]; + my $alt_breakpoint = $fields[1]; + my $defuse_breakpoint = $defuse_data{$alt_breakpoint}{'breakpoint'}; + my $length = scalar @fields; + if($length > 10){ my $star_fusion_name = $star_data{$breakpoint}{'fusion_name'}; my $star_data = $star_data{$breakpoint}{'data'}; - my $feature1 = $break1{$brk}->{'feature'}; - my $feature2 = $break2{$brk}->{'feature'}; - my $exon1_id = $break1{$brk}->{'exon_id'}; - my $exon2_id = $break2{$brk}->{'exon_id'}; - my $exon1_number = $break1{$brk}->{'exon_number'}; - my $exon2_number = $break2{$brk}->{'exon_number'}; - my $exon1_start = $break1{$brk}->{'exon_start'}; - my $exon2_start = $break2{$brk}->{'exon_start'}; - my $exon1_end = $break1{$brk}->{'exon_end'}; - my $exon2_end = $break2{$brk}->{'exon_end'}; - my $transcript1_id = $break1{$brk}->{'transcript_id'}; - my $transcript2_id = $break2{$brk}->{'transcript_id'}; - my $transcript1_src = $break1{$brk}->{'transcript_src'}; - my $transcript2_src = $break2{$brk}->{'transcript_src'}; - my $biotype1 = $break1{$brk}->{'gene_biotype'}; - my $biotype2 = $break2{$brk}->{'gene_biotype'}; - my $defuse_breakpoint = $defuse_data{$alt_breakpoint}{'breakpoint'}; + my $exon1_number = $fields[6]; + my $exon2_number = $fields[13]; + my $exon1_start = $fields[7]; + my $exon2_start = $fields[14]; + my $exon1_end = $fields[8]; + my $exon2_end = $fields[15]; + my $transcript1_id = $fields[4]; + my $transcript2_id = $fields[11]; + my $transcript1_src = $fields[5]; + my $transcript2_src = $fields[12]; my $defuse_cluster_id = $defuse_data{$alt_breakpoint}{'cluster_id'}; my $defuse_split_reads = $defuse_data{$alt_breakpoint}{'split_reads'}; my $defuse_span_reads = $defuse_data{$alt_breakpoint}{'span_reads'}; my $defuse_sequence = $defuse_data{$alt_breakpoint}{'sequence'}; - print $ofh1 
"$sample\t$breakpoint\t$defuse_breakpoint\t$star_fusion_name\t$defuse_split_reads\t$defuse_span_reads\t$star_data\t$feature1\t$exon1_id\t$exon1_number\t$exon1_start\t$exon1_end\t$feature2\t$exon2_id\t$exon2_number\t$exon2_start\t$exon2_end\t$transcript1_id\t$transcript1_src\t$biotype1\t$transcript2_id\t$transcript2_src\t$biotype2\t$defuse_cluster_id\t$defuse_sequence\n"; + print $ofh1 "$sample\t$star_fusion_name\t$breakpoint\t$defuse_breakpoint\t$defuse_split_reads\t$defuse_span_reads\t$star_data\t$transcript1_id\t$transcript1_src\t$exon1_number\t$exon1_start\t$exon1_end\t$transcript2_id\t$transcript2_src\t$exon2_number\t$exon2_start\t$exon2_end\t$defuse_cluster_id\t$defuse_sequence\n"; } + else{ + print $ofh1 "$sample\t-\t$breakpoint\t$defuse_breakpoint\tFUSION COULD NOT BE ANNOTATED\n"; + } + } + close($ifh3); } + else{ + print $ofh1 "$sample\tNO OVERLAPPING FUSIONS FOUND\n"; + } + close($ofh1); return 1; @@ -780,104 +578,75 @@ sub parse_overlap { return $fusion; } -sub parse_transcript { - my $line = shift; - - my %transcript; - my @fields = split "\t", $line; - - $transcript{'chr'} = $fields[0]; - $transcript{'start'} = $fields[3]; - $transcript{'end'} = $fields[4]; - $transcript{'strand'} = $fields[6]; - - my $annot_column = scalar @fields; - my @annot_fields = split /; /, $fields[$annot_column-1]; - - foreach my $item(@annot_fields) { - my ($type,$value)= split / /, $item; - $transcript{$type} = $value; - } - return \%transcript; -} - -sub run_bed_pairtopair { - my $options = shift; - - my $tmp = $options->{'tmp'}; - return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - - my $sample = $options->{'sample'}; +sub parse_transcript_data { + my ($fusion, $breaknum, $transcripts) = @_; - my $prog = _which('bedtools'); - - # There will always be at least two input files so build the command for the first comparison - my $command = $prog . 
sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.bedpe"), - File::Spec->catfile($tmp, "2.$sample.bedpe"), - File::Spec->catfile($tmp, "1_2.$sample.bedpe_overlap"); - - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); - - return 1; + my @filteredTrans; + foreach my $t(@{$transcripts}){ + push(@filteredTrans, $t) if ($fusion->{'pos'.$breaknum.'_end'} >= $t->getGenomicMinPos && $fusion->{'pos'.$breaknum.'_end'} <= $t->getGenomicMaxPos); + } + my @sortedTrans = sort{&annotation_sort} @filteredTrans; + + if(defined $sortedTrans[0]){ + + my $exon_number; + my $exon_start; + my $exon_end; + my $transcript_id; + my $gene_biotype; + my $num_transcripts = scalar @sortedTrans; + for (my $x=0;$x<$num_transcripts; $x++){ + my @exons = $sortedTrans[$x]->getExons; + my $num_exons = scalar @exons; + for (my $y=0;$y<$num_exons; $y++){ + my $e = $exons[$y]; + if($fusion->{'pos'.$breaknum.'_end'} == $e->getMinPos || $fusion->{'pos'.$breaknum.'_end'} == $e->getMaxPos){ + $transcript_id = $sortedTrans[$x]->getAccession; + $gene_biotype = $sortedTrans[$x]->{'_genetype'}; + $exon_start = $e->getMinPos; + $exon_end = $e->getMaxPos; + $exon_number = $y+1; + last; + } + } + last if(defined $transcript_id); + } + if($breaknum == 1){ + $fusion->transcript1_id($transcript_id); + $fusion->gene1_biotype($gene_biotype); + $fusion->exon1_num($exon_number); + $fusion->feature1_start($exon_start); + $fusion->feature1_end($exon_end); + } + else{ + $fusion->transcript2_id($transcript_id); + $fusion->gene2_biotype($gene_biotype); + $fusion->exon2_num($exon_number); + $fusion->feature2_start($exon_start); + $fusion->feature2_end($exon_end); + } + } + return $fusion; } -sub select_annotation { - - # All possible exon annotations have been retrieved for each breakpoint, we need to select annotation for the nearest. 
- my $options = shift; - - my $tmp = $options->{'tmp'}; - return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - - my $sample = $options->{'sample'}; - my $annot_file1; - my $annot_file2; - my $final_annot_file1; - my $final_annot_file2; - - my $gene_gtf = File::Spec->catfile($tmp, "filtered_gene.gtf"); - my %gene_info; +sub process_annotation_file { + my ($options, $input, $output, $gene_info) = @_; - open (my $ifh1, $gene_gtf) or die "Could not open file '$gene_gtf' $!"; - while (<$ifh1>) { - chomp; - my $line = $_; - my $gene_annot = parse_gene_info($line); - if(!exists $gene_info{$gene_annot->{'gene_name'}}){ - $gene_info{$gene_annot->{'gene_name'}}{'feature_start'} = $gene_annot->{'start'}; - $gene_info{$gene_annot->{'gene_name'}}{'feature_end'} = $gene_annot->{'end'}; - } - } - close ($ifh1); - - opendir(my $dh, $tmp); - while(my $file = readdir $dh){ - $annot_file1 = File::Spec->catfile($tmp, $file) if($file eq "$sample.1.ann"); - $annot_file2 = File::Spec->catfile($tmp, $file) if($file eq "$sample.2.ann"); - } - closedir($dh); - - $final_annot_file1 = $annot_file1."_final"; - $final_annot_file2 = $annot_file2."_final"; - my $curr_distance = 10000000; my $curr_break = ""; my $curr_annotation; my $curr_pos; my $curr_exon_start; my $curr_exon_end; - - # Process the first annotation file - open(my $ofh1, '>', $final_annot_file1) or die "Could not open file $final_annot_file1 $!"; - open (my $ifh2, $annot_file1) or die "Could not open file '$annot_file1' $!"; - while (<$ifh2>){ + + open(my $ofh1, '>>', $output) or die "Could not open file $output $!"; + open (my $ifh1, $input) or die "Could not open file '$input' $!"; + while (<$ifh1>){ chomp; my $line = $_; my $annotation = parse_annotation($line); next if($annotation->{'gene_name'} ne $annotation->{'star_genename'}); my $break = $annotation->{'breakpoint'}; - if($break ne $curr_break){ unless($curr_break eq ""){ $curr_pos = $curr_annotation->{'pos_end'}; @@ -885,18 +654,18 @@ 
sub select_annotation { $curr_exon_end = $curr_annotation->{'feature_end'}; if($curr_distance <= 10){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\t".$curr_annotation->{'feature'}."\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$options->{'gtf'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; } elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tmid-exon\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$options->{'gtf'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; } else{ # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic - my $gene_start = $gene_info{$curr_annotation->{'star_genename'}}{'feature_start'}; - my $gene_end = $gene_info{$curr_annotation->{'star_genename'}}{'feature_end'}; + my $gene_start = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_start'}; + my $gene_end = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_end'}; my $break_pos = $curr_annotation->{'pos_end'}; if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tintronic\t-\t-\t-\t-\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\tIntronic\t".$options->{'gtf'}."\t-\t-\t-\n"; } } $curr_distance = 10000000; @@ -919,95 +688,203 @@ sub select_annotation { $curr_exon_end = $curr_annotation->{'feature_end'}; if($curr_distance <= 10){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\t".$curr_annotation->{'feature'}."\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + print $ofh1 
$curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$options->{'gtf'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; } elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tmid-exon\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$options->{'gtf'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; } else{ # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic - my $gene_start = $gene_info{$curr_annotation->{'star_genename'}}{'feature_start'}; - my $gene_end = $gene_info{$curr_annotation->{'star_genename'}}{'feature_end'}; + my $gene_start = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_start'}; + my $gene_end = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_end'}; my $break_pos = $curr_annotation->{'pos_end'}; if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tintronic\t-\t-\t-\t-\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\tIntronic\t".$options->{'gtf'}."\t-\t-\t-\n"; } } - close ($ifh2); + close ($ifh1); close ($ofh1); + + return 1; +} + +sub query_vagrent { + my $options = shift; + + my $tmp = $options->{'tmp'}; + return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + + my $sample = $options->{'sample'}; + + # There will always be a 1_2 comparison file so deal with that first and build the fusions object. 
+ + # Establish the source of 1 and 2 respectively + my $source1 = $options->{'fusion_files'}->{'1'}->{'format'}; + my $source2 = $options->{'fusion_files'}->{'2'}->{'format'}; + + my $col_set = 1; + my $star_file = $options->{'fusion_files'}->{'1'}->{'name'}; + if($source2 eq 'star'){ + $col_set = 2; + $star_file = $options->{'fusion_files'}->{'2'}->{'name'}; + } + + my $overlap_file1_2 = File::Spec->catfile($tmp, "1_2.$sample.bedpe_overlap"); + + my %star_gene_list; + open (my $ifh1, $star_file) or die "Could not open file '$star_file' $!"; + while (<$ifh1>) { + chomp; + my $line = $_; + my @fields = split "\t", $line; + my $breakpoint = $fields[0]; + $star_gene_list{$breakpoint}{'gene1_name'} = $fields[4]; + $star_gene_list{$breakpoint}{'gene1_id'} = $fields[5]; + $star_gene_list{$breakpoint}{'gene2_name'} = $fields[10]; + $star_gene_list{$breakpoint}{'gene2_id'} = $fields[11]; + } + close ($ifh1); + + my $vagrent_version = "VAGrENT_".Sanger::CGP::Vagrent->VERSION; - $curr_distance = 10000000; - $curr_break = ""; + my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => $options->{'cache'}); + my %breaklist; - # Process the second annotation file - open(my $ofh2, '>', $final_annot_file2) or die "Could not open file $final_annot_file2 $!"; - open (my $ifh3, $annot_file2) or die "Could not open file '$annot_file2' $!"; + open (my $ifh3, $overlap_file1_2) or die "Could not open file '$overlap_file1_2' $!"; while (<$ifh3>) { chomp; my $line = $_; - my $annotation = parse_annotation($line); - next if($annotation->{'gene_name'} ne $annotation->{'star_genename'}); - my $break = $annotation->{'breakpoint'}; + my $fusion = parse_overlap($line, $col_set); + $fusion->gene1($star_gene_list{$fusion->{'breakpoint'}}{'gene1_name'}); + $fusion->gene2($star_gene_list{$fusion->{'breakpoint'}}{'gene2_name'}); + $fusion->gene1_id($star_gene_list{$fusion->{'breakpoint'}}{'gene1_id'}); + 
$fusion->gene2_id($star_gene_list{$fusion->{'breakpoint'}}{'gene2_id'}); + + my $genomic_pos1 = Sanger::CGP::Vagrent::Data::GenomicRegion->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $fusion->{'chr1'}, 'minpos' => $fusion->{'pos1_start'}, 'maxpos' => $fusion->{'pos1_end'}, 'id' => $fusion->{'breakpoint'}); + my $genomic_pos2 = Sanger::CGP::Vagrent::Data::GenomicRegion->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $fusion->{'chr2'}, 'minpos' => $fusion->{'pos2_start'}, 'maxpos' => $fusion->{'pos2_end'}, 'id' => $fusion->{'breakpoint'}); - if($break ne $curr_break){ - unless($curr_break eq ""){ - my $curr_pos = $curr_annotation->{'pos_end'}; - my $curr_exon_start = $curr_annotation->{'feature_start'}; - my $curr_exon_end = $curr_annotation->{'feature_end'}; - - if($curr_distance <= 10){ - print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\t".$curr_annotation->{'feature'}."\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; - } - elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ - print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tmid-exon\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; - } - else{ - # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic - my $gene_start = $gene_info{$curr_annotation->{'star_genename'}}{'feature_start'}; - my $gene_end = $gene_info{$curr_annotation->{'star_genename'}}{'feature_end'}; - my $break_pos = $curr_annotation->{'pos_end'}; - if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tintronic\t-\t-\t-\t-\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; - } - } - $curr_distance = 10000000; - $curr_break = $break; + my @trans1 = $ts->getTranscripts($genomic_pos1); + my @trans2 = $ts->getTranscripts($genomic_pos2); + + parse_transcript_data($fusion, 1, \@trans1); + parse_transcript_data($fusion, 2, \@trans2); + + $breaklist{$fusion->{'breakpoint'}} = $fusion if(!exists $breaklist{$fusion->{'breakpoint'}}); + } + close ($ifh3); + + my $final_annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann_final"); + my $final_annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann_final"); + my $bed_file1 = File::Spec->catfile($tmp, "$sample.1.bed"); + my $bed_file2 = File::Spec->catfile($tmp, "$sample.2.bed"); + my $final_annotation_file = File::Spec->catfile($tmp, "$sample.final"); + open(my $ofh1, '>', $final_annotation_file) or die "Could not open file '$final_annotation_file' $!"; + open(my $ofh2, '>', $final_annot_file1) or die "Could not open file '$final_annot_file1' $!"; + open(my $ofh3, '>', $final_annot_file2) or die "Could not open file '$final_annot_file2' $!"; + open(my $ofh4, '>', $bed_file1) or die "Could not open file '$bed_file1' $!"; + open(my $ofh5, '>', $bed_file2) or die "Could not open file '$bed_file2' $!"; + for my $brk (keys %breaklist){ + if(defined $breaklist{$brk}->{'transcript1_id'} && defined 
$breaklist{$brk}->{'transcript2_id'}){ + my $output_line = $breaklist{$brk}->format_annotation_line($vagrent_version); + print $ofh1 $output_line."\n"; + } + else{ + if(!defined $breaklist{$brk}->{'transcript1_id'}){ + my $output_line = $breaklist{$brk}->format_bed_line(1); + print $ofh4 $output_line."\n"; + } + else{ + my $output_line = $breaklist{$brk}->format_break_line(1, $vagrent_version); + print $ofh2 $output_line."\n"; + } + if(!defined $breaklist{$brk}->{'transcript2_id'}){ + my $output_line = $breaklist{$brk}->format_bed_line(2); + print $ofh5 $output_line."\n"; + } + else{ + my $output_line = $breaklist{$brk}->format_break_line(2, $vagrent_version); + print $ofh3 $output_line."\n"; } } - - my $pos = $annotation->{'pos_end'}; - my $exon_start = $annotation->{'feature_start'}; - my $exon_end = $annotation->{'feature_end'}; - my $distance = find_closest_boundary($pos, $exon_start, $exon_end); + } + close($ofh5); + close($ofh4); + close($ofh3); + close($ofh2); + close($ofh1); - if($distance < $curr_distance){ - $curr_distance = $distance; - $curr_annotation = $annotation; - $curr_break = $break; - } - } - $curr_pos = $curr_annotation->{'pos_end'}; - $curr_exon_start = $curr_annotation->{'feature_start'}; - $curr_exon_end = $curr_annotation->{'feature_end'}; + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); + + return 1; +} + +sub run_bed_pairtopair { + my $options = shift; - if($curr_distance <= 10){ - print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\t".$curr_annotation->{'feature'}."\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; - } - elsif($curr_pos > $curr_exon_start && 
$curr_pos < $curr_exon_end){ - print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tmid-exon\t".$curr_annotation->{'exon_id'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; - } - else{ - # The breakpoint doesn't fall within 10bp of an exon boundary. We need to check it falls within the footprint of the star gene and, for now, print it as intronic - my $gene_start = $gene_info{$curr_annotation->{'star_genename'}}{'feature_start'}; - my $gene_end = $gene_info{$curr_annotation->{'star_genename'}}{'feature_end'}; - my $break_pos = $curr_annotation->{'pos_end'}; - if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh2 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'strand'}."\tintronic\t-\t-\t-\t-\t".$curr_annotation->{'transcript_id'}."\t".$curr_annotation->{'gene_biotype'}."\t".$curr_distance."\n"; - } - } - close ($ifh3); - close ($ofh2); + my $tmp = $options->{'tmp'}; + return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + + my $sample = $options->{'sample'}; + my $prog = _which('bedtools'); + + # There will always be at least two input files so build the command for the first comparison + my $command = $prog . 
sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.bedpe"), + File::Spec->catfile($tmp, "2.$sample.bedpe"), + File::Spec->catfile($tmp, "1_2.$sample.bedpe_overlap"); + + PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); + + return 1; +} + +sub select_annotation { + + # All possible exon annotations have been retrieved for each breakpoint, we need to select annotation for the nearest. + my $options = shift; + + my $tmp = $options->{'tmp'}; + return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + + my $sample = $options->{'sample'}; + my $annot_file1; + my $annot_file2; + my $final_annot_file1; + my $final_annot_file2; + + my $gene_gtf = File::Spec->catfile($tmp, "filtered_gene.gtf"); + my %gene_info; + + open (my $ifh1, $gene_gtf) or die "Could not open file '$gene_gtf' $!"; + while (<$ifh1>) { + chomp; + my $line = $_; + my $gene_annot = parse_gene_info($line); + if(!exists $gene_info{$gene_annot->{'gene_name'}}){ + $gene_info{$gene_annot->{'gene_name'}}{'feature_start'} = $gene_annot->{'start'}; + $gene_info{$gene_annot->{'gene_name'}}{'feature_end'} = $gene_annot->{'end'}; + } + } + close ($ifh1); + + opendir(my $dh, $tmp); + while(my $file = readdir $dh){ + $annot_file1 = File::Spec->catfile($tmp, $file) if($file eq "$sample.1.ann"); + $annot_file2 = File::Spec->catfile($tmp, $file) if($file eq "$sample.2.ann"); + } + closedir($dh); + + $final_annot_file1 = $annot_file1."_final"; + $final_annot_file2 = $annot_file2."_final"; + + if(-s $annot_file1){ + process_annotation_file($options, $annot_file1, $final_annot_file1, \%gene_info); + } + if(-s $annot_file2){ + process_annotation_file($options, $annot_file2, $final_annot_file2, \%gene_info); + } + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return 1; From 54a3ef88fd544f19f0effe1cb08aa19863e4806e Mon Sep 17 00:00:00 2001 From: 
am26 Date: Fri, 23 Oct 2015 16:59:49 +0100 Subject: [PATCH 05/40] Tidying up usage information --- perl/bin/compare_overlapping_fusions.pl | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/perl/bin/compare_overlapping_fusions.pl b/perl/bin/compare_overlapping_fusions.pl index 0ebdb44..837f78f 100755 --- a/perl/bin/compare_overlapping_fusions.pl +++ b/perl/bin/compare_overlapping_fusions.pl @@ -58,11 +58,11 @@ BEGIN use Data::Dumper; const my @REQUIRED_PARAMS => qw(outdir sample gtf); -const my @VALID_PROCESS => qw(createjunctionbed runbedpairtopair createbed annotatebed selectannotation output); +const my @VALID_PROCESS => qw(createjunctionbed runbedpairtopair queryvagrent annotatebed selectannotation collateannotation output); const my %INDEX_FACTOR => ( 'createjunctionbed' => -1, 'runbedpairtopair' => 1, - 'queryvagrent' => -1, - 'annotatebed' => -1, + 'queryvagrent' => 1, + 'annotatebed' => 1, 'selectannotation' => 1, 'collateannotation' => 1, 'output' => 1); @@ -77,11 +77,13 @@ BEGIN $threads->run($options->{'num'}, 'createjunctionbed', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'createjunctionbed'); Sanger::CGP::CompareFusions::Implement::run_bed_pairtopair($options) if(!exists $options->{'process'} || $options->{'process'} eq 'runbedpairtopair'); - Sanger::CGP::CompareFusions::Implement::query_vagrent($options) if(!exists $options->{'process'} || $options->{'process'} eq 'queryvagrent'); - if(!-s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}."1.bed") || !-s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}."2.bed")){ - Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); - Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); - 
Sanger::CGP::CompareFusions::Implement::collate_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'collateannotation'); + if(-s File::Spec->catfile($options->{'tmp'}, "1_2.".$options->{'sample'}.".bedpe_overlap")){ + Sanger::CGP::CompareFusions::Implement::query_vagrent($options) if(!exists $options->{'process'} || $options->{'process'} eq 'queryvagrent'); + if(-s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".1.bed") || -s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".2.bed")){ + Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); + Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); + Sanger::CGP::CompareFusions::Implement::collate_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'collateannotation'); + } } if(!exists $options->{'process'} || $options->{'process'} eq 'output') { Sanger::CGP::CompareFusions::Implement::generate_output($options); @@ -181,7 +183,7 @@ sub setup { =head1 compare_overlapping_fusions.pl -Produces a report of overlapping fusions that have been called by star-fusion or deFuse. +Produces a report of overlapping fusions that have been called by star-fusion and deFuse. =head1 SYNOPSIS @@ -191,6 +193,7 @@ =head1 SYNOPSIS -outdir -o Folder to output result to. -sample -s Sample name -gtf -g GTF file to use with bedtools to annotate each fusion breakpoint position. + -cache -c VAGrENT cache file that should be the same reference and gene build as the GTF file being used e.g. GRCh38 e77. Optional: -threads -t Number of threads (cpus) to use [1]. 
@@ -211,9 +214,10 @@ =head1 OPTIONS createjunctionbed runbedpairtopair - createbed + queryvagrent annotatebed selectannotation + collateannotation output =back From 27a44299d0c52d387f0b187900d3873e27ea612f Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 23 Oct 2015 17:00:18 +0100 Subject: [PATCH 06/40] Updating gtf to only show the file name rather than the full path --- .../Sanger/CGP/CompareFusions/Implement.pm | 55 +++++++++++-------- 1 file changed, 33 insertions(+), 22 deletions(-) diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index e57491f..b254bca 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -38,6 +38,7 @@ use autodie qw(:all); use English qw( -no_match_vars ); use Const::Fast qw(const); use File::Which qw(which); +use File::Basename; use FindBin qw($Bin); use File::Spec; use PCAP::Cli; @@ -461,7 +462,7 @@ sub generate_output { print $ofh1 "$sample\t$star_fusion_name\t$breakpoint\t$defuse_breakpoint\t$defuse_split_reads\t$defuse_span_reads\t$star_data\t$transcript1_id\t$transcript1_src\t$exon1_number\t$exon1_start\t$exon1_end\t$transcript2_id\t$transcript2_src\t$exon2_number\t$exon2_start\t$exon2_end\t$defuse_cluster_id\t$defuse_sequence\n"; } else{ - print $ofh1 "$sample\t-\t$breakpoint\t$defuse_breakpoint\tFUSION COULD NOT BE ANNOTATED\n"; + print $ofh1 "$sample\tFUSION COULD NOT BE ANNOTATED\t$breakpoint\t$defuse_breakpoint\n"; } } close($ifh3); @@ -638,6 +639,10 @@ sub process_annotation_file { my $curr_pos; my $curr_exon_start; my $curr_exon_end; + my $break; + my $alt_break; + + my ($gtf, $path) = fileparse($options->{'gtf'}); open(my $ofh1, '>>', $output) or die "Could not open file $output $!"; open (my $ifh1, $input) or die "Could not open file '$input' $!"; @@ -645,8 +650,9 @@ sub process_annotation_file { chomp; my $line = $_; my $annotation = parse_annotation($line); + $break = $annotation->{'breakpoint'}; + 
$alt_break = $annotation->{'alt_breakpoint'}; next if($annotation->{'gene_name'} ne $annotation->{'star_genename'}); - my $break = $annotation->{'breakpoint'}; if($break ne $curr_break){ unless($curr_break eq ""){ $curr_pos = $curr_annotation->{'pos_end'}; @@ -654,10 +660,10 @@ sub process_annotation_file { $curr_exon_end = $curr_annotation->{'feature_end'}; if($curr_distance <= 10){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$options->{'gtf'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; } elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$options->{'gtf'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; } else{ # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic @@ -665,7 +671,7 @@ sub process_annotation_file { my $gene_end = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_end'}; my $break_pos = $curr_annotation->{'pos_end'}; if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\tIntronic\t".$options->{'gtf'}."\t-\t-\t-\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\tIntronic\t".$gtf."\t-\t-\t-\n"; } } $curr_distance = 10000000; @@ -683,24 +689,29 @@ sub process_annotation_file { $curr_break = $break; } } - $curr_pos = $curr_annotation->{'pos_end'}; - $curr_exon_start = $curr_annotation->{'feature_start'}; - $curr_exon_end = $curr_annotation->{'feature_end'}; - - if($curr_distance <= 10){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$options->{'gtf'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; - } - elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$options->{'gtf'}."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; + if (defined $curr_annotation){ + $curr_pos = $curr_annotation->{'pos_end'}; + $curr_exon_start = $curr_annotation->{'feature_start'}; + $curr_exon_end = 
$curr_annotation->{'feature_end'}; + + if($curr_distance <= 10){ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; + } + elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; + } + else{ + # The breakpoint doesn't fall within 10bp of an exon boundary. We need to check it falls within the footprint of the star gene and, for now, print it as intronic + my $gene_start = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_start'}; + my $gene_end = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_end'}; + my $break_pos = $curr_annotation->{'pos_end'}; + if($break_pos >= $gene_start && $break_pos <= $gene_end){ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\tIntronic\t".$gtf."\t-\t-\t-\n"; + } + } } - else{ - # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic - my $gene_start = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_start'}; - my $gene_end = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_end'}; - my $break_pos = $curr_annotation->{'pos_end'}; - if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\tIntronic\t".$options->{'gtf'}."\t-\t-\t-\n"; - } + else{ + print $ofh1 $break."\t".$alt_break."\n"; } close ($ifh1); close ($ofh1); From 9db7ea42055f8f8613f6321dd343260f07234329 Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 30 Nov 2015 10:29:31 +0000 Subject: [PATCH 07/40] Latest rewrite of comparison code to include all combinations of overlaps as well as singleton calls with a confidence score for each fusion --- perl/bin/compare_overlapping_fusions.pl | 24 +- .../CGP/CompareFusions/FusionAnnotation.pm | 36 +- .../Sanger/CGP/CompareFusions/Implement.pm | 897 +++++++++++++++--- 3 files changed, 794 insertions(+), 163 deletions(-) diff --git a/perl/bin/compare_overlapping_fusions.pl b/perl/bin/compare_overlapping_fusions.pl index 837f78f..33ec3a8 100755 --- a/perl/bin/compare_overlapping_fusions.pl +++ b/perl/bin/compare_overlapping_fusions.pl @@ -58,13 +58,16 @@ BEGIN use Data::Dumper; const my @REQUIRED_PARAMS => qw(outdir sample gtf); -const my @VALID_PROCESS => qw(createjunctionbed runbedpairtopair queryvagrent annotatebed selectannotation collateannotation output); +const my @VALID_PROCESS => qw(createjunctionbed runbedpairtopair processoverlaps singletons queryvagrent annotatebed selectannotation collateannotation deduplicate output); const my %INDEX_FACTOR => ( 'createjunctionbed' => -1, 'runbedpairtopair' => 1, + 'processoverlaps' => 1, + 'singletons' => 1, 'queryvagrent' => 1, 'annotatebed' => 1, 'selectannotation' 
=> 1, 'collateannotation' => 1, + 'deduplicate' => 1, 'output' => 1); { my $options = setup(); @@ -78,16 +81,19 @@ BEGIN Sanger::CGP::CompareFusions::Implement::run_bed_pairtopair($options) if(!exists $options->{'process'} || $options->{'process'} eq 'runbedpairtopair'); if(-s File::Spec->catfile($options->{'tmp'}, "1_2.".$options->{'sample'}.".bedpe_overlap")){ + Sanger::CGP::CompareFusions::Implement::process_overlap_files($options) if(!exists $options->{'process'} || $options->{'process'} eq 'processoverlaps'); + Sanger::CGP::CompareFusions::Implement::process_singletons($options) if(!exists $options->{'process'} || $options->{'process'} eq 'singletons'); Sanger::CGP::CompareFusions::Implement::query_vagrent($options) if(!exists $options->{'process'} || $options->{'process'} eq 'queryvagrent'); if(-s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".1.bed") || -s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".2.bed")){ Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); Sanger::CGP::CompareFusions::Implement::collate_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'collateannotation'); + Sanger::CGP::CompareFusions::Implement::deduplicate_fusions($options) if(!exists $options->{'process'} || $options->{'process'} eq 'deduplicate'); } } if(!exists $options->{'process'} || $options->{'process'} eq 'output') { Sanger::CGP::CompareFusions::Implement::generate_output($options); - cleanup($options); + #cleanup($options); } } @@ -145,13 +151,23 @@ sub setup { $opts{'input_files'} = \@ARGV; my $format; + my $format_num; my %fusion_files; my $input; for (my $iter=1; $iter <= $file_count; $iter++) { $input = $ARGV[$iter-1]; $format = 
Sanger::CGP::CompareFusions::Implement::check_input($input); - $fusion_files{$iter}{'format'} = $format; - $fusion_files{$iter}{'name'} = $input; + if($format eq 'star'){ + $format_num = 1; + } + elsif($format eq 'tophat'){ + $format_num = 2; + } + else{ + $format_num = 3; + } + $fusion_files{$format_num}{'format'} = $format; + $fusion_files{$format_num}{'name'} = $input; } $opts{'fusion_files'} = \%fusion_files; diff --git a/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm b/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm index f311a2a..3dadcc2 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/FusionAnnotation.pm @@ -44,6 +44,7 @@ sub new { if ($args{-breakpoint}) { $self->breakpoint($args{-breakpoint}) } if ($args{-alt_breakpoint}) { $self->alt_breakpoint($args{-alt_breakpoint}) } + if ($args{-alt_breakpoint2}) { $self->alt_breakpoint2($args{-alt_breakpoint2}) } if ($args{-chr1}) { $self->chr1($args{-chr1}) } if ($args{-strand1}) { $self->strand1($args{-strand1}) } if ($args{-pos1_start}) { $self->pos1_start($args{-pos1_start}) } @@ -92,6 +93,12 @@ sub alt_breakpoint { return($self->{alt_breakpoint}); } +sub alt_breakpoint2 { + my $self = shift; + $self->{alt_breakpoint2} = shift if @_; + return($self->{alt_breakpoint2}); +} + sub chr1 { my $self = shift; $self->{chr1} = shift if @_; @@ -284,10 +291,18 @@ sub distance2 { return($self->{distance2}); } +sub format_annotation_line { + my ($self,$annot_source) = @_; + my @fields = ($self->{'breakpoint'},$self->{'alt_breakpoint'},$self->{'alt_breakpoint2'},$self->{'gene1'},$self->{'gene1_id'},$self->{'transcript1_id'},$annot_source,$self->{'exon1_num'},$self->{'feature1_start'},$self->{'feature1_end'},$self->{'gene2'},$self->{'gene2_id'},$self->{'transcript2_id'},$annot_source,$self->{'exon2_num'},$self->{'feature2_start'},$self->{'feature2_end'},$self->{'feature1'}); + + my $formatted_line = join("\t", @fields); + return $formatted_line; +} + sub 
format_bed_line { my ($self, $breaknum) = @_; - my @fields = ($self->{'chr'.$breaknum},$self->{'pos'.$breaknum.'_start'},$self->{'pos'.$breaknum.'_end'},$self->breakpoint,$self->alt_breakpoint,$self->{'strand'.$breaknum},$self->{'gene'.$breaknum},$self->{'gene'.$breaknum.'_id'}); + my @fields = ($self->{'chr'.$breaknum},$self->{'pos'.$breaknum.'_start'},$self->{'pos'.$breaknum.'_end'},$self->breakpoint,$self->alt_breakpoint,$self->{'strand'.$breaknum},$self->{'gene'.$breaknum},$self->{'gene'.$breaknum.'_id'},$self->alt_breakpoint2,$self->{'feature1'}); my $formatted_line = join("\t", @fields); return $formatted_line; @@ -314,20 +329,25 @@ sub format_bedpe_line { return $bedpe_line; } -sub format_annotation_line { - my ($self,$annot_source) = @_; - my @fields = ($self->{'breakpoint'},$self->{'alt_breakpoint'},$self->{'gene1'},$self->{'gene1_id'},$self->{'transcript1_id'},$annot_source,$self->{'exon1_num'},$self->{'feature1_start'},$self->{'feature1_end'},$self->{'gene2'},$self->{'gene2_id'},$self->{'transcript2_id'},$annot_source,$self->{'exon2_num'},$self->{'feature2_start'},$self->{'feature2_end'}); +sub format_break_line { + my ($self, $breaknum, $annot_source) = @_; + + my @fields = ($self->{'breakpoint'},$self->{'alt_breakpoint'},$self->{'alt_breakpoint2'},$self->{'gene'.$breaknum},$self->{'gene'.$breaknum.'_id'},$self->{'transcript'.$breaknum.'_id'},$annot_source,$self->{'exon'.$breaknum.'_num'},$self->{'feature'.$breaknum.'_start'},$self->{'feature'.$breaknum.'_end'},$self->{'feature1'}); my $formatted_line = join("\t", @fields); return $formatted_line; } -sub format_break_line { - my ($self, $breaknum, $annot_source) = @_; - - my @fields = ($self->{'breakpoint'},$self->{'alt_breakpoint'},$self->{'gene'.$breaknum},$self->{'gene'.$breaknum.'_id'},$self->{'transcript'.$breaknum.'_id'},$annot_source,$self->{'exon'.$breaknum.'_num'},$self->{'feature'.$breaknum.'_start'},$self->{'feature'.$breaknum.'_end'}); +sub format_fusion_line { + my ($self,$annot_source) = 
@_; + my @fields = ($self->{'breakpoint'},$self->{'alt_breakpoint'},$self->{'alt_breakpoint2'},$self->{'chr1'},$self->{'pos1_start'},$self->{'pos1_end'},$self->{'strand1'},$self->{'chr2'},$self->{'pos2_start'},$self->{'pos2_end'},$self->{'strand2'},$annot_source); my $formatted_line = join("\t", @fields); return $formatted_line; } + + + + + diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index b254bca..ab186c4 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -53,9 +53,12 @@ use Data::Dumper; const my $BEDTOOLS_CLOSEST => q{ closest -s -a %s -b %s | sort -k4,4 > %s}; const my $BEDTOOLS_PAIRTOPAIR => q{ pairtopair -a %s -b %s -slop 5 > %s}; +const my $SORT => q{ sort -k%d,%d %s > %s}; +const my $TRI_SORT => q{ sort -k%d,%d -k%d,%dr -k%d,%dr %s > %s}; +const my $JOIN => q{ join -1 8 -2 8 %s %s > %s }; +const my $OUTPUT_HEADER => "sample\tfusion_name\talgorithm\tconfidence_score\tstar_junction\ttophat_junction\tdefuse_junction\tdefuse_cluster_id\tstar_junction_reads\tstar_spanning_frags\ttophat_junction_reads\ttophat_spanning_frags\tdefuse_splitr_count\tdefuse_span_count\t5'_gene\t5'_gene_id\t5'_chr\t5'_pos\t5'_strand\t3'_gene\t3'_gene_id\t3'_chr\t3'_pos\t3'_strand\t5'_transcript_id\t5'_transcript_src\t5'_exon_num\t5'_exon_start\t5'_exon_end\t3'_transcript_id\t3'_transcript_src\t3'_exon_num\t3'_exon_start\t3'_exon_end\tdefuse_splitr_sequence\ttophat_splitr_sequence\n"; -const my $OUTPUT_HEADER => 
"sample\tfusion_name\tstar_junction\tdefuse_junction\tdefuse_splitr_count\tdefuse_span_count\tstar_JunctionReads\tstar_SpanningFrags\tLeftGene\tLeftGeneId\tLeftChr\tLeftPos\tLeftStrand\tLeftDistFromRefExonSplice\tRightGene\tRightGeneId\tRightChr\tRightPos\tRightStrand\tRightDistFromRefExonSplice\ttranscript1_id\ttranscript1_src\texon1_num\texon1_start\texon1_end\ttranscript2_id\ttranscript2_src\texon2_num\texon2_start\texon2_end\tdefuse_cluster_id\tdefuse_splitr_sequence\n"; - +# This filter on biotypes is currently not used in subroutine filter_gtf (uncomment the line to switch on). my %ALLOWED_BIOTYPES = ( antisense => 1, IG_C_gene => 1, @@ -78,14 +81,30 @@ my %ALLOWED_BIOTYPES = ( TR_V_gene => 1, ); +my %CONFIDENCE_SCORES = ( + STD => '76%', + SD => '79%', + ST => '48%', + TD => '25%', + D => '59%', + T => '51%', + S => '24%', +); + # Position of the columns in the tophat-fusion filtered file used to format the bed file. const my $TOPHAT_SPLIT_CHAR => '\t'; +const my $TOPHAT_GENE1 => 3; const my $TOPHAT_CHR1 => 4; const my $TOPHAT_POS1 => 5; const my $TOPHAT_STRAND1 => 13; +const my $TOPHAT_GENE2 => 6; const my $TOPHAT_CHR2 => 7; const my $TOPHAT_POS2 => 8; const my $TOPHAT_STRAND2 => 14; +const my $TOPHAT_SPAN_READS => 9; +const my $TOPHAT_SPAN_MATE_PAIRS => 10; +const my $TOPHAT_SPAN_MATE_PAIRS2 => 11; +const my $TOPHAT_SCORE => 12; const my $TOPHAT_BREAKREF => 1; const my $TOPHAT_HEADER_PATTERN => 'num_spanning_reads'; @@ -94,15 +113,25 @@ const my $DEFUSE_SPLIT_CHAR => '\t'; const my $DEFUSE_CHR1 => 26; const my $DEFUSE_POS1 => 39; const my $DEFUSE_STRAND1 => 36; +const my $DEFUSE_GENENAME1 => 32; +const my $DEFUSE_GENEID1 => 22; const my $DEFUSE_CHR2 => 27; const my $DEFUSE_POS2 => 40; const my $DEFUSE_STRAND2 => 37; +const my $DEFUSE_GENENAME2 => 33; +const my $DEFUSE_GENEID2 => 23; const my $DEFUSE_BREAKREF => 1; const my $DEFUSE_CLUSTER_ID => 2; +const my $DEFUSE_SEQUENCE => 3; +const my $DEFUSE_SPLIT_READS => 4; +const my $DEFUSE_SPAN_READS => 62; const 
my $DEFUSE_HEADER_PATTERN => 'cluster_id'; # Position of the columns in the star-fusion output file used to format fusion breakpoint references. const my $STAR_SPLIT_CHAR => '\t'; +const my $STAR_FUSION_NAME => 2; +const my $STAR_JUNCTION_READS => 3; +const my $STAR_SPANNING_FRAGS => 4; const my $STAR_CHR1 => 7; const my $STAR_POS1 => 8; const my $STAR_STRAND1 => 9; @@ -114,6 +143,8 @@ const my $STAR_GENENAME1 => 5; const my $STAR_GENEID1 => 6; const my $STAR_GENENAME2 => 11; const my $STAR_GENEID2 => 12; +const my $STAR_DIS_EXON1 => 10; +const my $STAR_DIS_EXON2 => 16; const my $STAR_HEADER_PATTERN => 'fusion_name'; sub annotate_bed { @@ -124,7 +155,6 @@ sub annotate_bed { my $sample = $options->{'sample'}; my $exon_gtf = filter_gtf($options, 'exon'); - my $gene_gtf = filter_gtf($options, 'gene'); my $break1_file; my $break2_file; @@ -226,19 +256,26 @@ sub collate_annotation { chomp; my $line = $_; my @fields = split "\t", $line; + my @fields2; + #pop @fields; my $breakpoint_id = $fields[0]; open (my $ifh2, $annot_file2) or die "Could not open file '$annot_file2' $!"; while (<$ifh2>) { chomp; my $line2 = $_; - if($line2 =~ m/^$breakpoint_id/){ - my @fields2 = split "\t", $line2; + if($line2 =~ m/^$breakpoint_id\s/){ + @fields2 = split "\t", $line2; shift(@fields2); shift(@fields2); - push(@fields, @fields2); + shift(@fields2); + #push(@fields, @fields2); last; } } + if(scalar @fields2 > 0){ + pop @fields; + } + push(@fields, @fields2); close($ifh2); print $ofh1 join("\t", @fields)."\n"; } @@ -296,6 +333,23 @@ sub create_junction_bedpe { print $ofh1 $chr1."\t".$pos1_start."\t".$pos1_end."\t".$chr2."\t".$pos2_start."\t".$pos2_end."\t".$filetype."\t".$name."\t".$strand1."\t".$strand2."\n"; } + elsif($filetype eq 'tophat'){ + next if($line =~ m/$TOPHAT_HEADER_PATTERN/); + + @fields = split $TOPHAT_SPLIT_CHAR, $line; + $name = $fields[$TOPHAT_BREAKREF - 1]; + $chr1 = $fields[$TOPHAT_CHR1 - 1]; + $pos1_start = $fields[$TOPHAT_POS1 - 1]-1; + $pos1_end = 
$fields[$TOPHAT_POS1 - 1]; + $strand1 = $fields[$TOPHAT_STRAND1 - 1]; + $chr2 = $fields[$TOPHAT_CHR2 - 1]; + $pos2_start = $fields[$TOPHAT_POS2 - 1]-1; + $pos2_end = $fields[$TOPHAT_POS2- 1]; + $strand2 = $fields[$TOPHAT_STRAND2 - 1]; + + print $ofh1 $chr1."\t".$pos1_start."\t".$pos1_end."\t".$chr2."\t".$pos2_start."\t".$pos2_end."\t".$filetype."\t".$name."\t".$strand1."\t".$strand2."\n"; + } + # It must be defuse format else{ next if($line =~ m/$DEFUSE_HEADER_PATTERN/); @@ -323,6 +377,76 @@ sub create_junction_bedpe { } +sub deduplicate_fusions { + my $options = shift; + + my $tmp = $options->{'tmp'}; + my $sample = $options->{'sample'}; + + my $input_file = File::Spec->catfile($tmp, "$sample.final"); + my $temp_file = File::Spec->catfile($tmp, "$sample.final.2"); + + # Open the file, read through and add a sort key based on the source algorithm. STD should be 1 and everything else 2. + open(my $ifh1, $input_file) or die "Could not open file $input_file $!"; + open(my $ofh1, '>', $temp_file) or die "Could not open file $temp_file $!"; + while(<$ifh1>){ + chomp; + my $line = $_; + if($line =~ m/STD$/){ + print $ofh1 "1\t".$line."\n"; + } + else{ + print $ofh1 "2\t".$line."\n"; + } + } + close($ofh1); + close($ifh1); + + my $sorted_file = File::Spec->catfile($tmp, "$sample.final.2.sorted"); + my $sort_command = sprintf $TRI_SORT, 1, 1,8,8,15,15, $temp_file, $sorted_file; + system($sort_command); + + my %seen; + my $output_file = File::Spec->catfile($tmp, "$sample.final.deduped"); + + # Read through the sorted file and check there are no duplicate tophat breakpoints (sometimes star and tophat breakpoints differ). 
+ open(my $ifh2, $sorted_file) or die "Could not open file $sorted_file $!"; + open(my $ofh2, '>', $output_file) or die "Could not open file $output_file $!"; + while(<$ifh2>){ + chomp; + my $line = $_; + my @fields = split "\t", $line; + my $length = scalar @fields; + my $sort_key = $fields[0]; + my $source = $fields[$length-1]; + if(defined $sort_key){ + if($sort_key == 1 ){ + $seen{$fields[2]} = $fields[3]; + shift @fields; + print $ofh2 join("\t", @fields)."\n"; + } + elsif($source eq 'ST'){ + if(!exists $seen{$fields[2]}){ + $seen{$fields[2]} = $fields[3]; + shift @fields; + print $ofh2 join("\t", @fields)."\n"; + } + } + else{ + if(!exists $seen{$fields[1]}){ + $seen{$fields[1]} = $fields[2]; + shift @fields; + print $ofh2 join("\t", @fields)."\n"; + } + } + } + } + close($ofh2); + close($ifh2); + + return 1; +} + sub filter_gtf { my ($options, $feature) = @_; @@ -352,7 +476,8 @@ sub filter_gtf { my ($type,$value)= split / /, $item; $annotation{$type} = $value; } - print $ofh $line."\n" if(exists $ALLOWED_BIOTYPES{$annotation{'gene_biotype'}}); + #print $ofh $line."\n" if(exists $ALLOWED_BIOTYPES{$annotation{'gene_biotype'}}); # UNCOMMENT THIS LINE TO FILTER ON BIOTYPE AND COMMENT OUT LINE BELOE + print $ofh $line."\n" } close($ifh); close($ofh); @@ -378,97 +503,120 @@ sub generate_output { return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); my $sample = $options->{'sample'}; - - my $star_file; - my $defuse_file; - - if($options->{'fusion_files'}->{'1'}->{'format'} eq 'star'){ - $star_file = $options->{'fusion_files'}->{'1'}->{'name'}; - $defuse_file = $options->{'fusion_files'}->{'2'}->{'name'}; - } - else{ - $star_file = $options->{'fusion_files'}->{'2'}->{'name'}; - $defuse_file = $options->{'fusion_files'}->{'1'}->{'name'}; - } - my %star_data; - open (my $ifh1, $star_file) or die "Could not open file '$star_file' $!"; - while (<$ifh1>) { - chomp; - my $line = $_; - next if($line =~ m/^breakpoint/); - $line =~ 
m/^(.*:[0-9]+-.*:[0-9]+)\t([A-Za-z0-9-_:\.]+--[A-Za-z0-9-_:\.]+)\t/; - my $break_ref = $1; - my $fusion_name = $2; - $line =~ s/^.*:[0-9]+-.*:[0-9]+\t[A-Za-z0-9-_:\.]+--[A-Za-z0-9-_:\.]+\t//; - $star_data{$break_ref}{'fusion_name'} = $fusion_name; - $star_data{$break_ref}{'data'} = $line; - } - close ($ifh1); - - my %defuse_data; - open (my $ifh2, $defuse_file) or die "Could not open file '$defuse_file' $!"; - while (<$ifh2>) { - chomp; - my $line = $_; - next if($line =~ m/^breakpoint/); - my @fields = split "\t", $line; - $line =~ m/^(.*:[0-9]+-.*:[0-9]+)\t([0-9]+)\t([ACGT|]+)\t/; - my $break_ref = $fields[0]; - my $cluster_id = $fields[1]; - my $sequence = $fields[2]; - my $split_reads = $fields[3]; - my $span_reads = $fields[61]; - $defuse_data{$break_ref."_".$cluster_id}{'breakpoint'} = $break_ref; - $defuse_data{$break_ref."_".$cluster_id}{'cluster_id'} = $cluster_id; - $defuse_data{$break_ref."_".$cluster_id}{'sequence'} = $sequence; - $defuse_data{$break_ref."_".$cluster_id}{'split_reads'} = $split_reads; - $defuse_data{$break_ref."_".$cluster_id}{'span_reads'} = $span_reads; - } - close ($ifh2); + my $star_file = $options->{'fusion_files'}->{'1'}->{'name'}; + my $tophat_file = $options->{'fusion_files'}->{'2'}->{'name'}; + my $defuse_file = $options->{'fusion_files'}->{'3'}->{'name'}; - my $annot_file = File::Spec->catfile($tmp, "$sample.final"); - my $output_file = File::Spec->catfile($tmp, "$sample.star-defuse.overlapping.fusions.txt"); + my $star_data = parse_star_file($star_file); + my $tophat_data = parse_tophat_file($tophat_file); + my $defuse_data = parse_defuse_file($defuse_file); + + my $annot_file = File::Spec->catfile($tmp, "$sample.final.deduped"); + my $output_file = File::Spec->catfile($tmp, "$sample.detected.fusions.txt"); open(my $ofh1, '>', $output_file) or die "Could not open file $output_file $!"; if(-s $annot_file){ - open(my $ifh3, $annot_file) or die "Could not open file $annot_file $!"; + + open(my $ifh1, $annot_file) or die "Could 
not open file $annot_file $!"; print $ofh1 $OUTPUT_HEADER; - while(<$ifh3>){ + while(<$ifh1>){ chomp; my $line = $_; my @fields = split "\t", $line; - my $breakpoint = $fields[0]; - my $alt_breakpoint = $fields[1]; - my $defuse_breakpoint = $defuse_data{$alt_breakpoint}{'breakpoint'}; + my $length = scalar @fields; - if($length > 10){ - my $star_fusion_name = $star_data{$breakpoint}{'fusion_name'}; - my $star_data = $star_data{$breakpoint}{'data'}; - my $exon1_number = $fields[6]; - my $exon2_number = $fields[13]; - my $exon1_start = $fields[7]; - my $exon2_start = $fields[14]; - my $exon1_end = $fields[8]; - my $exon2_end = $fields[15]; - my $transcript1_id = $fields[4]; - my $transcript2_id = $fields[11]; - my $transcript1_src = $fields[5]; - my $transcript2_src = $fields[12]; - my $defuse_cluster_id = $defuse_data{$alt_breakpoint}{'cluster_id'}; - my $defuse_split_reads = $defuse_data{$alt_breakpoint}{'split_reads'}; - my $defuse_span_reads = $defuse_data{$alt_breakpoint}{'span_reads'}; - my $defuse_sequence = $defuse_data{$alt_breakpoint}{'sequence'}; - print $ofh1 "$sample\t$star_fusion_name\t$breakpoint\t$defuse_breakpoint\t$defuse_split_reads\t$defuse_span_reads\t$star_data\t$transcript1_id\t$transcript1_src\t$exon1_number\t$exon1_start\t$exon1_end\t$transcript2_id\t$transcript2_src\t$exon2_number\t$exon2_start\t$exon2_end\t$defuse_cluster_id\t$defuse_sequence\n"; + my $source = $fields[$length-1]; + my $confidence = $CONFIDENCE_SCORES{$source}; + + my $star_pos = index($source,'S') if($source =~ m/S/); + my $tophat_pos = index($source,'T') if($source =~ m/T/); + my $defuse_pos = index($source,'D') if($source =~ m/D/); + + my $star_breakpoint = 'NA'; + my $tophat_breakpoint = 'NA'; + my $defuse_breakpoint = 'NA'; + my $defuse_junction = 'NA'; + my $defuse_clusterid = 'NA'; + my $chr1 = 'NA'; + my $pos1 = 'NA'; + my $strand1 = 'NA'; + my $chr2 = 'NA'; + my $pos2 = 'NA'; + my $strand2 = 'NA'; + my $star_junction_reads = 'NA'; + my $star_spanning_frags = 'NA'; 
+ my $tophat_junction_reads = 'NA'; + my $tophat_spanning_frags = 'NA'; + my $tophat_splitr_seq = 'NA'; + my $defuse_splitr_count = 'NA'; + my $defuse_span_count = 'NA'; + my $defuse_splitr_seq = 'NA'; + + if(defined $tophat_pos){ + $tophat_breakpoint = $fields[$tophat_pos]; + $tophat_junction_reads = $tophat_data->{$tophat_breakpoint}{'num_spanning_reads'}; + $tophat_spanning_frags = $tophat_data->{$tophat_breakpoint}{'num_spanning_mate_pairs'}; + $chr1 = $tophat_data->{$tophat_breakpoint}{'chr1'}; + $pos1 = $tophat_data->{$tophat_breakpoint}{'pos1'}; + $strand1 = $tophat_data->{$tophat_breakpoint}{'strand1'}; + $chr2 = $tophat_data->{$tophat_breakpoint}{'chr2'}; + $pos2 = $tophat_data->{$tophat_breakpoint}{'pos2'}; + $strand2 = $tophat_data->{$tophat_breakpoint}{'strand2'}; + } + if(defined $star_pos){ + $star_breakpoint = $fields[$star_pos]; + $star_junction_reads = $star_data->{$star_breakpoint}{'junction_reads'}; + $star_spanning_frags = $star_data->{$star_breakpoint}{'spanning_frags'}; + $chr1 = $star_data->{$star_breakpoint}{'chr1'}; + $pos1 = $star_data->{$star_breakpoint}{'pos1'}; + $strand1 = $star_data->{$star_breakpoint}{'strand1'}; + $chr2 = $star_data->{$star_breakpoint}{'chr2'}; + $pos2 = $star_data->{$star_breakpoint}{'pos2'}; + $strand2 = $star_data->{$star_breakpoint}{'strand2'}; + } + if(defined $defuse_pos){ + $defuse_breakpoint = $fields[$defuse_pos]; + $defuse_splitr_count = $defuse_data->{$defuse_breakpoint}{'split_reads'}; + $defuse_span_count = $defuse_data->{$defuse_breakpoint}{'span_reads'}; + $chr1 = $defuse_data->{$defuse_breakpoint}{'chr1'}; + $pos1 = $defuse_data->{$defuse_breakpoint}{'pos1'}; + $strand1 = $defuse_data->{$defuse_breakpoint}{'strand1'}; + $chr2 = $defuse_data->{$defuse_breakpoint}{'chr2'}; + $pos2 = $defuse_data->{$defuse_breakpoint}{'pos2'}; + $strand2 = $defuse_data->{$defuse_breakpoint}{'strand2'}; + $defuse_splitr_seq = $defuse_data->{$defuse_breakpoint}{'sequence'}; + my @defuse_temp = split "_", 
$defuse_breakpoint; + $defuse_junction = $defuse_temp[0]; + $defuse_clusterid = $defuse_temp[1]; + } + if($length > 11){ + my $gene1_name = $fields[3]; + my $gene1_id = $fields[4]; + my $gene2_name = $fields[10]; + my $gene2_id = $fields[11]; + my $fusion_name = $gene1_name."--".$gene2_name; + my $transcript1_id = $fields[5]; + my $transcript1_src = $fields[6]; + my $exon1_number = $fields[7]; + my $exon1_start = $fields[8]; + my $exon1_end = $fields[9]; + my $transcript2_id = $fields[12]; + my $transcript2_src = $fields[13]; + my $exon2_number = $fields[14]; + my $exon2_start = $fields[15]; + my $exon2_end = $fields[16]; + + $source = reverse $source; + + print $ofh1 "$sample\t$fusion_name\t$source\t$confidence\t$star_breakpoint\t$tophat_breakpoint\t$defuse_junction\t$defuse_clusterid\t$star_junction_reads\t$star_spanning_frags\t$tophat_junction_reads\t$tophat_spanning_frags\t$defuse_splitr_count\t$defuse_span_count\t$gene1_name\t$gene1_id\t$chr1\t$pos1\t$strand1\t$gene2_name\t$gene2_id\t$chr2\t$pos2\t$strand2\t$transcript1_id\t$transcript1_src\t$exon1_number\t$exon1_start\t$exon1_end\t$transcript2_id\t$transcript2_src\t$exon2_number\t$exon2_start\t$exon2_end\t$defuse_splitr_seq\t$tophat_splitr_seq\n"; } else{ - print $ofh1 "$sample\tFUSION COULD NOT BE ANNOTATED\t$breakpoint\t$defuse_breakpoint\n"; + $source = reverse $source; + print $ofh1 "$sample\tFUSION COULD NOT BE ANNOTATED\t$source\t$confidence\t$star_breakpoint\t$tophat_breakpoint\t$defuse_junction\t$defuse_clusterid\t$star_junction_reads\t$star_spanning_frags\t$tophat_junction_reads\t$tophat_spanning_frags\t$defuse_splitr_count\t$defuse_span_count\t\t\t$chr1\t$pos1\t$strand1\t\t\t$chr2\t$pos2\t$strand2\t\t\t\t\t\t\t\t\t\t\t$defuse_splitr_seq\t$tophat_splitr_seq\n"; } } - close($ifh3); - } - else{ - print $ofh1 "$sample\tNO OVERLAPPING FUSIONS FOUND\n"; + close($ifh1); } close($ofh1); @@ -485,15 +633,16 @@ sub parse_annotation { $annotation{'breakpoint'} = $fields[3]; $annotation{'alt_breakpoint'} = 
$fields[4]; + $annotation{'alt_breakpoint2'} = $fields[8]; $annotation{'chr'} = $fields[0]; $annotation{'pos_start'} = $fields[1]; $annotation{'pos_end'} = $fields[2]; $annotation{'strand'} = $fields[5]; - $annotation{'feature'} = $fields[10]; - $annotation{'feature_start'} = $fields[11]; - $annotation{'feature_end'} = $fields[12]; - $annotation{'star_genename'} = $fields[6]; - $annotation{'star_geneid'} = $fields[7]; + $annotation{'feature'} = $fields[12]; + $annotation{'feature_start'} = $fields[13]; + $annotation{'feature_end'} = $fields[14]; + $annotation{'genename'} = $fields[6]; + $annotation{'source'} = $fields[9]; my $annot_column = scalar @fields; my @annot_fields = split /; /, $fields[$annot_column-1]; @@ -529,6 +678,41 @@ sub parse_break_data { return \%break; } +sub parse_defuse_file { + + my $defuse_file = shift; + + my %defuse_data; + open (my $ifh1, $defuse_file) or die "Could not open file '$defuse_file' $!"; + while (<$ifh1>) { + chomp; + my $line = $_; + next if($line =~ m/$DEFUSE_HEADER_PATTERN/); + my @fields = split $DEFUSE_SPLIT_CHAR, $line; + my $break_ref = $fields[$DEFUSE_BREAKREF-1]; + my $cluster_id = $fields[$DEFUSE_CLUSTER_ID-1]; + my $breakpoint = $break_ref."_".$cluster_id; + $defuse_data{$breakpoint}{'breakpoint'} = $break_ref; + $defuse_data{$breakpoint}{'cluster_id'} = $cluster_id; + $defuse_data{$breakpoint}{'chr1'} = $fields[$DEFUSE_CHR1-1]; + $defuse_data{$breakpoint}{'pos1'} = $fields[$DEFUSE_POS1-1]; + $defuse_data{$breakpoint}{'strand1'} = $fields[$DEFUSE_STRAND1-1]; + $defuse_data{$breakpoint}{'gene1_name'} = $fields[$DEFUSE_GENENAME1-1]; + $defuse_data{$breakpoint}{'gene1_id'} = $fields[$DEFUSE_GENEID1-1]; + $defuse_data{$breakpoint}{'chr2'} = $fields[$DEFUSE_CHR2-1]; + $defuse_data{$breakpoint}{'pos2'} = $fields[$DEFUSE_POS2-1]; + $defuse_data{$breakpoint}{'strand2'} = $fields[$DEFUSE_STRAND2-1]; + $defuse_data{$breakpoint}{'gene2_name'} = $fields[$DEFUSE_GENENAME2-1]; + $defuse_data{$breakpoint}{'gene2_id'} = 
$fields[$DEFUSE_GENEID2-1]; + $defuse_data{$breakpoint}{'sequence'} = $fields[$DEFUSE_SEQUENCE-1]; + $defuse_data{$breakpoint}{'split_reads'} = $fields[$DEFUSE_SPLIT_READS-1]; + $defuse_data{$breakpoint}{'span_reads'} = $fields[$DEFUSE_SPAN_READS-1]; + } + close ($ifh1); + + return \%defuse_data; +} + sub parse_gene_info { my $line = shift; $line =~ s/"//g; @@ -549,36 +733,112 @@ sub parse_gene_info { return \%gene_annotation; } +sub parse_intersection { + my ($line) = @_; + + my @fields = split " ", $line; + + my $fusion = new Sanger::CGP::CompareFusions::FusionAnnotation(); + + $fusion->breakpoint($fields[0]); + $fusion->chr1($fields[1]); + $fusion->pos1_start($fields[2]); + $fusion->pos1_end($fields[3]); + $fusion->strand1($fields[8]); + $fusion->chr2($fields[4]); + $fusion->pos2_start($fields[5]); + $fusion->pos2_end($fields[6]); + $fusion->strand2($fields[9]); + $fusion->alt_breakpoint($fields[17]); + $fusion->alt_breakpoint2($fields[36]); + + return $fusion; +} + sub parse_overlap { - my ($line, $cols) = @_; + my ($line) = @_; my @fields = split "\t", $line; - my $row_length = scalar @fields; - my $start = 0; - $start = $row_length / 2 if($cols == 2); - my $fusion = new Sanger::CGP::CompareFusions::FusionAnnotation( - -breakpoint => $fields[$start + 7], - -chr1 => $fields[$start], - -pos1_start => $fields[$start + 1], - -pos1_end => $fields[$start + 2], - -strand1 => $fields[$start + 8], - -chr2 => $fields[$start + 3], - -pos2_start => $fields[$start + 4], - -pos2_end => $fields[$start + 5], - -strand2 => $fields[$start + 9]); - - if($cols == 1){ - $fusion->alt_breakpoint($fields[17]); - } - else{ - $fusion->alt_breakpoint($fields[7]); - } + -breakpoint => $fields[7], + -chr1 => $fields[0], + -pos1_start => $fields[1], + -pos1_end => $fields[2], + -strand1 => $fields[8], + -chr2 => $fields[3], + -pos2_start => $fields[4], + -pos2_end => $fields[5], + -strand2 => $fields[9], + -alt_breakpoint => $fields[17], + -alt_breakpoint2 => 'NA'); return $fusion; } +sub 
parse_star_file { + + my $star_file = shift; + + my %star_data; + open (my $ifh1, $star_file) or die "Could not open file '$star_file' $!"; + while (<$ifh1>) { + chomp; + my $line = $_; + next if($line =~ m/$STAR_HEADER_PATTERN/); + my @fields = split $STAR_SPLIT_CHAR, $line; + my $breakpoint = $fields[0]; + $star_data{$breakpoint}{'fusion_name'} = $fields[$STAR_FUSION_NAME-1]; + $star_data{$breakpoint}{'junction_reads'} = $fields[$STAR_JUNCTION_READS-1]; + $star_data{$breakpoint}{'spanning_frags'} = $fields[$STAR_SPANNING_FRAGS-1]; + $star_data{$breakpoint}{'gene1_name'} = $fields[$STAR_GENENAME1-1]; + $star_data{$breakpoint}{'gene1_id'} = $fields[$STAR_GENEID1-1]; + $star_data{$breakpoint}{'chr1'} = $fields[$STAR_CHR1-1]; + $star_data{$breakpoint}{'pos1'} = $fields[$STAR_POS1-1]; + $star_data{$breakpoint}{'strand1'} = $fields[$STAR_STRAND1-1]; + $star_data{$breakpoint}{'dis_exon_1'} = $fields[$STAR_DIS_EXON1-1]; + $star_data{$breakpoint}{'gene2_name'} = $fields[$STAR_GENENAME2-1]; + $star_data{$breakpoint}{'gene2_id'} = $fields[$STAR_GENEID2-1]; + $star_data{$breakpoint}{'chr2'} = $fields[$STAR_CHR2-1]; + $star_data{$breakpoint}{'pos2'} = $fields[$STAR_POS2-1]; + $star_data{$breakpoint}{'strand2'} = $fields[$STAR_STRAND2-1]; + $star_data{$breakpoint}{'dis_exon_2'} = $fields[$STAR_DIS_EXON2-1]; + } + close ($ifh1); + + return \%star_data; +} + +sub parse_tophat_file { + + my $tophat_file = shift; + + my %tophat_data; + open (my $ifh1, $tophat_file) or die "Could not open file '$tophat_file' $!"; + while (<$ifh1>) { + chomp; + my $line = $_; + next if($line =~ m/$TOPHAT_HEADER_PATTERN/); + my @fields = split $TOPHAT_SPLIT_CHAR, $line; + my $breakpoint = $fields[$TOPHAT_BREAKREF-1]; + $tophat_data{$breakpoint}{'gene1_name'} = $fields[$TOPHAT_GENE1-1]; + $tophat_data{$breakpoint}{'chr1'} = $fields[$TOPHAT_CHR1-1]; + $tophat_data{$breakpoint}{'pos1'} = $fields[$TOPHAT_POS1-1]; + $tophat_data{$breakpoint}{'gene2_name'} = $fields[$TOPHAT_GENE2-1]; + 
$tophat_data{$breakpoint}{'chr2'} = $fields[$TOPHAT_CHR2-1]; + $tophat_data{$breakpoint}{'pos2'} = $fields[$TOPHAT_POS2-1]; + $tophat_data{$breakpoint}{'num_spanning_reads'} = $fields[$TOPHAT_SPAN_READS-1]; + $tophat_data{$breakpoint}{'num_spanning_mate_pairs'} = $fields[$TOPHAT_SPAN_MATE_PAIRS-1]; + $tophat_data{$breakpoint}{'num_spanning_mates_2'} = $fields[$TOPHAT_SPAN_MATE_PAIRS2-1]; + $tophat_data{$breakpoint}{'score'} = $fields[$TOPHAT_SCORE]; + $tophat_data{$breakpoint}{'strand1'} = $fields[$TOPHAT_STRAND1-1]; + $tophat_data{$breakpoint}{'strand2'} = $fields[$TOPHAT_STRAND2-1]; + } + close ($ifh1); + + return \%tophat_data; +} + sub parse_transcript_data { my ($fusion, $breaknum, $transcripts) = @_; @@ -630,6 +890,28 @@ sub parse_transcript_data { return $fusion; } +sub parse_vagrent_query_file { + my ($line) = @_; + + my @fields = split "\t", $line; + + my $fusion = new Sanger::CGP::CompareFusions::FusionAnnotation( + -breakpoint => $fields[0], + -alt_breakpoint => $fields[1], + -alt_breakpoint2 => $fields[2], + -chr1 => $fields[3], + -pos1_start => $fields[4], + -pos1_end => $fields[5], + -strand1 => $fields[6], + -chr2 => $fields[7], + -pos2_start => $fields[8], + -pos2_end => $fields[9], + -strand2 => $fields[10], + -feature1 => $fields[11]); + + return $fusion; +} + sub process_annotation_file { my ($options, $input, $output, $gene_info) = @_; @@ -641,6 +923,7 @@ sub process_annotation_file { my $curr_exon_end; my $break; my $alt_break; + my $alt_break2; my ($gtf, $path) = fileparse($options->{'gtf'}); @@ -652,7 +935,8 @@ sub process_annotation_file { my $annotation = parse_annotation($line); $break = $annotation->{'breakpoint'}; $alt_break = $annotation->{'alt_breakpoint'}; - next if($annotation->{'gene_name'} ne $annotation->{'star_genename'}); + $alt_break2 = $annotation->{'alt_breakpoint2'}; + next if($annotation->{'gene_name'} ne $annotation->{'genename'} && $annotation->{'source'} ne "D"); if($break ne $curr_break){ unless($curr_break eq ""){ 
$curr_pos = $curr_annotation->{'pos_end'}; @@ -660,18 +944,20 @@ sub process_annotation_file { $curr_exon_end = $curr_annotation->{'feature_end'}; if($curr_distance <= 10){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'source'}."\n"; } elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'source'}."\n"; } else{ + my $breakpoint = $curr_annotation->{'breakpoint'}; + my $name = $curr_annotation->{'genename'}; # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic - my $gene_start = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_start'}; - my $gene_end = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_end'}; + my $gene_start = $gene_info->{$curr_annotation->{'genename'}}{'feature_start'}; + my $gene_end = $gene_info->{$curr_annotation->{'genename'}}{'feature_end'}; my $break_pos = $curr_annotation->{'pos_end'}; if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\tIntronic\t".$gtf."\t-\t-\t-\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\tIntronic\t".$gtf."\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; } } $curr_distance = 10000000; @@ -695,23 +981,23 @@ sub process_annotation_file { $curr_exon_end = $curr_annotation->{'feature_end'}; if($curr_distance <= 10){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'source'}."\n"; } elsif($curr_pos > $curr_exon_start && $curr_pos < 
$curr_exon_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'source'}."\n"; } else{ # The breakpoint doesn't fall within 10bp of an exon boundary. We need to check it falls within the footprint of the star gene and, for now, print it as intronic - my $gene_start = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_start'}; - my $gene_end = $gene_info->{$curr_annotation->{'star_genename'}}{'feature_end'}; + my $gene_start = $gene_info->{$curr_annotation->{'genename'}}{'feature_start'}; + my $gene_end = $gene_info->{$curr_annotation->{'genename'}}{'feature_end'}; my $break_pos = $curr_annotation->{'pos_end'}; if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'star_genename'}."\t".$curr_annotation->{'star_geneid'}."\tIntronic\t".$gtf."\t-\t-\t-\n"; + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\tIntronic\t".$gtf."\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; } } } else{ - print $ofh1 $break."\t".$alt_break."\n"; + print $ofh1 $break."\t".$alt_break."\t".$alt_break2."\n"; } close ($ifh1); close 
($ofh1); @@ -719,57 +1005,344 @@ sub process_annotation_file { return 1; } -sub query_vagrent { +sub process_overlap_files { my $options = shift; my $tmp = $options->{'tmp'}; return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); - - my $sample = $options->{'sample'}; - # There will always be a 1_2 comparison file so deal with that first and build the fusions object. + my $sample = $options->{'sample'}; # Establish the source of 1 and 2 respectively + my $source_comb; my $source1 = $options->{'fusion_files'}->{'1'}->{'format'}; my $source2 = $options->{'fusion_files'}->{'2'}->{'format'}; - my $col_set = 1; - my $star_file = $options->{'fusion_files'}->{'1'}->{'name'}; - if($source2 eq 'star'){ - $col_set = 2; - $star_file = $options->{'fusion_files'}->{'2'}->{'name'}; + my $output_file = File::Spec->catfile($tmp, "$sample.vagrent.query.list"); + my %all_fusions; + my $cols; + + if($options->{'num'} == 3){ + my $source3 = $options->{'fusion_files'}->{'3'}->{'format'}; + $source_comb = uc(substr($source1,0,1).substr($source2,0,1).substr($source3,0,1)); + my $overlap_file1_2_3 = File::Spec->catfile($tmp, "1_2_3.$sample.bedpe_overlap"); + open (my $ifh1, $overlap_file1_2_3) or die "Could not open file '$overlap_file1_2_3' $!"; + while (<$ifh1>) { + chomp; + my $line = $_; + my $fusion = parse_intersection($line); + my $breakpoint = $fusion->breakpoint(); + + if(!exists $all_fusions{$breakpoint}){ + $all_fusions{$breakpoint} = $fusion; + $all_fusions{$breakpoint}{'source'} = $source_comb; + } + } + close($ifh1); + + $source_comb = uc(substr($source1,0,1).substr($source3,0,1)); + my $overlap_file1_3 = File::Spec->catfile($tmp, "1_3.$sample.bedpe_overlap"); + open (my $ifh2, $overlap_file1_3) or die "Could not open file '$overlap_file1_3' $!"; + while (<$ifh2>) { + chomp; + my $line = $_; + my $fusion = parse_overlap($line); + my $breakpoint = $fusion->breakpoint(); + + if(!exists $all_fusions{$breakpoint}){ + $all_fusions{$breakpoint} = 
$fusion; + $all_fusions{$breakpoint}{'source'} = $source_comb; + } + } + close($ifh2); + + $source_comb = uc(substr($source2,0,1).substr($source3,0,1)); + my $overlap_file2_3 = File::Spec->catfile($tmp, "2_3.$sample.bedpe_overlap"); + open (my $ifh3, $overlap_file2_3) or die "Could not open file '$overlap_file2_3' $!"; + while (<$ifh3>) { + chomp; + my $line = $_; + my $fusion = parse_overlap($line); + my $breakpoint = $fusion->breakpoint(); + + if(!exists $all_fusions{$breakpoint}){ + $all_fusions{$breakpoint} = $fusion; + $all_fusions{$breakpoint}{'source'} = $source_comb; + } + } + close($ifh3); } - + my $overlap_file1_2 = File::Spec->catfile($tmp, "1_2.$sample.bedpe_overlap"); + $source_comb = uc(substr($source1,0,1).substr($source2,0,1)); + open (my $ifh4, $overlap_file1_2) or die "Could not open file '$overlap_file1_2' $!"; + while (<$ifh4>) { + chomp; + my $line = $_; + my $fusion = parse_overlap($line); + my $breakpoint = $fusion->breakpoint(); + + if(!exists $all_fusions{$breakpoint}){ + $all_fusions{$breakpoint} = $fusion; + $all_fusions{$breakpoint}{'source'} = $source_comb; + } + } + close($ifh4); - my %star_gene_list; - open (my $ifh1, $star_file) or die "Could not open file '$star_file' $!"; + open(my $ofh1, '>', $output_file) or die "Could not open file '$output_file' $!"; + for my $brk (keys %all_fusions){ + my $output_line = $all_fusions{$brk}->format_fusion_line($all_fusions{$brk}{'source'}); + print $ofh1 $output_line."\n"; + + } + close($ofh1); + + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); + return 1; +} + +sub process_singletons { + my $options = shift; + + my $tmp = $options->{'tmp'}; + return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + my $sample = $options->{'sample'}; + + my %star_ids; + my %tophat_ids; + my %defuse_ids; + + my $overlaps_file = File::Spec->catfile($tmp, "$sample.vagrent.query.list"); + open (my $ifh1, $overlaps_file) or die "Could not open file 
'$overlaps_file' $!"; while (<$ifh1>) { chomp; my $line = $_; my @fields = split "\t", $line; - my $breakpoint = $fields[0]; - $star_gene_list{$breakpoint}{'gene1_name'} = $fields[4]; - $star_gene_list{$breakpoint}{'gene1_id'} = $fields[5]; - $star_gene_list{$breakpoint}{'gene2_name'} = $fields[10]; - $star_gene_list{$breakpoint}{'gene2_id'} = $fields[11]; + my $length = scalar @fields; + my $source = $fields[$length-1]; + + if($source eq 'STD'){ + $star_ids{$fields[0]} = 'STD'; + $tophat_ids{$fields[1]} = 'STD'; + my $defuse_brk_id = $fields[2]; + my @defuse_id_split = split "_", $defuse_brk_id; + my $defuse_junction = $defuse_id_split[0]; + $defuse_ids{$defuse_junction} = 'STD'; + } + elsif($source eq 'SD'){ + $star_ids{$fields[0]} = 'SD'; + my $defuse_brk_id = $fields[1]; + my @defuse_id_split = split "_", $defuse_brk_id; + my $defuse_junction = $defuse_id_split[0]; + $defuse_ids{$defuse_junction} = 'SD'; + } + elsif($source eq 'ST'){ + $star_ids{$fields[0]} = 'ST'; + $tophat_ids{$fields[1]} = 'ST'; + } + elsif($source eq 'TD'){ + $tophat_ids{$fields[0]} = 'TD'; + $defuse_ids{$fields[1]} = 'TD'; + my $defuse_brk_id = $fields[1]; + my @defuse_id_split = split "_", $defuse_brk_id; + my $defuse_junction = $defuse_id_split[0]; + $defuse_ids{$defuse_junction} = 'TD'; + } + else{ + die "Fusion does not have recognised source algorithms\n"; + } + } + close($ifh1); + + my $star_file = $options->{'fusion_files'}->{'1'}->{'name'}; + my $tophat_file = $options->{'fusion_files'}->{'2'}->{'name'}; + my $defuse_file = $options->{'fusion_files'}->{'3'}->{'name'}; + my $star_data = parse_star_file($star_file); + my $tophat_data = parse_tophat_file($tophat_file); + my $defuse_data = parse_defuse_file($defuse_file); + my %all_singletons; + + for my $star_brk (keys $star_data){ + if(!exists $star_ids{$star_brk}){ + + $star_ids{$star_brk} = 'S'; + + my $pos_start1 = $star_data->{$star_brk}{'pos1'} -1; + my $pos_start2 = $star_data->{$star_brk}{'pos2'} -1; + + my $fusion = new 
Sanger::CGP::CompareFusions::FusionAnnotation( + -breakpoint => $star_brk, + -chr1 => $star_data->{$star_brk}{'chr1'}, + -pos1_start => $star_data->{$star_brk}{'pos1'}-1, + -pos1_end => $star_data->{$star_brk}{'pos1'}, + -strand1 => $star_data->{$star_brk}{'strand1'}, + -chr2 => $star_data->{$star_brk}{'chr2'}, + -pos2_start => $star_data->{$star_brk}{'pos2'}-1, + -pos2_end => $star_data->{$star_brk}{'pos2'}, + -strand2 => $star_data->{$star_brk}{'strand2'}, + -alt_breakpoint => 'NA', + -alt_breakpoint2 => 'NA'); + + if(!exists $all_singletons{$star_brk}){ + $all_singletons{$star_brk} = $fusion; + $all_singletons{$star_brk}{'source'} = 'S'; + } + } + } + + for my $tophat_brk (keys $tophat_data){ + if(!exists $tophat_ids{$tophat_brk}){ + + $tophat_ids{$tophat_brk} = 'T'; + + my $fusion = new Sanger::CGP::CompareFusions::FusionAnnotation( + -breakpoint => $tophat_brk, + -chr1 => $tophat_data->{$tophat_brk}{'chr1'}, + -pos1_start => $tophat_data->{$tophat_brk}{'pos1'}-1, + -pos1_end => $tophat_data->{$tophat_brk}{'pos1'}, + -strand1 => $tophat_data->{$tophat_brk}{'strand1'}, + -chr2 => $tophat_data->{$tophat_brk}{'chr2'}, + -pos2_start => $tophat_data->{$tophat_brk}{'pos2'}-1, + -pos2_end => $tophat_data->{$tophat_brk}{'pos2'}, + -strand2 => $tophat_data->{$tophat_brk}{'strand2'}, + -alt_breakpoint => 'NA', + -alt_breakpoint2 => 'NA'); + + if(!exists $all_singletons{$tophat_brk}){ + $all_singletons{$tophat_brk} = $fusion; + $all_singletons{$tophat_brk}{'source'} = 'T'; + } + } + } + + for my $defuse_brk (keys $defuse_data){ + my @defuse_ids = split "_", $defuse_brk; + my $breakpoint = $defuse_ids[0]; + if(!exists $defuse_ids{$breakpoint}){ + $defuse_ids{$breakpoint} = 'D'; + + my $fusion = new Sanger::CGP::CompareFusions::FusionAnnotation( + -breakpoint => $defuse_brk, + -chr1 => $defuse_data->{$defuse_brk}{'chr1'}, + -pos1_start => $defuse_data->{$defuse_brk}{'pos1'}-1, + -pos1_end => $defuse_data->{$defuse_brk}{'pos1'}, + -strand1 => 
$defuse_data->{$defuse_brk}{'strand1'}, + -chr2 => $defuse_data->{$defuse_brk}{'chr2'}, + -pos2_start => $defuse_data->{$defuse_brk}{'pos2'}-1, + -pos2_end => $defuse_data->{$defuse_brk}{'pos2'}, + -strand2 => $defuse_data->{$defuse_brk}{'strand2'}, + -alt_breakpoint => 'NA', + -alt_breakpoint2 => 'NA'); + + if(!exists $all_singletons{$defuse_brk}){ + $all_singletons{$defuse_brk} = $fusion; + $all_singletons{$defuse_brk}{'source'} = 'D'; + } + } + } + open(my $ofh1, '>>', $overlaps_file) or die "Could not open file '$overlaps_file' $!"; + for my $brk (keys %all_singletons){ + my $output_line = $all_singletons{$brk}->format_fusion_line($all_singletons{$brk}{'source'}); + print $ofh1 $output_line."\n"; + } + close($ofh1); + + PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); + + return 1; +} + +sub query_vagrent { + my $options = shift; + + my $tmp = $options->{'tmp'}; + return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); + + my $sample = $options->{'sample'}; + + my $star_file = $options->{'fusion_files'}->{'1'}->{'name'}; + my $tophat_file = $options->{'fusion_files'}->{'2'}->{'name'}; + my $defuse_file = $options->{'fusion_files'}->{'3'}->{'name'}; + + my $gene_list = parse_star_file($star_file); + my $gene_gtf = filter_gtf($options, 'gene'); + + my %gene_info; + open (my $ifh1, $gene_gtf) or die "Could not open file '$gene_gtf' $!"; + while (<$ifh1>) { + chomp; + my $line = $_; + my $gene_annot = parse_gene_info($line); + if(!exists $gene_info{$gene_annot->{'gene_name'}}){ + $gene_info{$gene_annot->{'gene_name'}} = $gene_annot->{'gene_id'}; + } } close ($ifh1); + my %rev_gene_info = reverse %gene_info; + + my $tophat_data = parse_tophat_file($tophat_file); + + for my $brk (keys $tophat_data){ + if(!exists $gene_list->{$brk}){ + my $gene1_name = $tophat_data->{$brk}{'gene1_name'}; + my $gene2_name = $tophat_data->{$brk}{'gene2_name'}; + $gene_list->{$brk}{'gene1_name'} = $gene1_name; + if(defined 
$gene_info{$gene1_name}){ + $gene_list->{$brk}{'gene1_id'} = $gene_info{$gene1_name}; + } + elsif(defined $rev_gene_info{$gene1_name}){ + $gene_list->{$brk}{'gene1_id'} = $gene1_name; + $gene_list->{$brk}{'gene1_name'} = $rev_gene_info{$gene1_name}; + } + else{ + $gene_list->{$brk}{'gene1_id'} = $gene1_name; + } + $gene_list->{$brk}{'gene2_name'} = $gene2_name; + if(defined $gene_info{$gene2_name}){ + $gene_list->{$brk}{'gene2_id'} = $gene_info{$gene2_name}; + } + elsif(defined $rev_gene_info{$gene2_name}){ + $gene_list->{$brk}{'gene2_id'} = $gene2_name; + $gene_list->{$brk}{'gene2_name'} = $rev_gene_info{$gene2_name}; + } + else{ + $gene_list->{$brk}{'gene2_id'} = $gene1_name; + } + } + } + + my $defuse_data = parse_defuse_file($defuse_file); + + for my $defuse_brk (keys $defuse_data){ + if(!exists $gene_list->{$defuse_brk}){ + my $gene1_name = $defuse_data->{$defuse_brk}{'gene1_name'}; + my $gene2_name = $defuse_data->{$defuse_brk}{'gene2_name'}; + $gene_list->{$defuse_brk}{'gene1_name'} = $gene1_name; + $gene_list->{$defuse_brk}{'gene1_id'} = $gene_info{$gene1_name}; + $gene_list->{$defuse_brk}{'gene1_id'} = $gene1_name if(!defined $gene_list->{$defuse_brk}{'gene1_id'}); + $gene_list->{$defuse_brk}{'gene2_name'} = $gene2_name; + $gene_list->{$defuse_brk}{'gene2_id'} = $gene_info{$gene2_name}; + $gene_list->{$defuse_brk}{'gene2_id'} = $gene2_name if(!defined $gene_list->{$defuse_brk}{'gene2_id'}); + } + } + my $vagrent_version = "VAGrENT_".Sanger::CGP::Vagrent->VERSION; my $ts = Sanger::CGP::Vagrent::TranscriptSource::FileBasedTranscriptSource->new('cache' => $options->{'cache'}); my %breaklist; - open (my $ifh3, $overlap_file1_2) or die "Could not open file '$overlap_file1_2' $!"; - while (<$ifh3>) { + my $vagrent_query_file = File::Spec->catfile($tmp, "$sample.vagrent.query.list"); + + open (my $ifh2, $vagrent_query_file) or die "Could not open file '$vagrent_query_file' $!"; + while (<$ifh2>) { chomp; my $line = $_; - my $fusion = parse_overlap($line, 
$col_set); - $fusion->gene1($star_gene_list{$fusion->{'breakpoint'}}{'gene1_name'}); - $fusion->gene2($star_gene_list{$fusion->{'breakpoint'}}{'gene2_name'}); - $fusion->gene1_id($star_gene_list{$fusion->{'breakpoint'}}{'gene1_id'}); - $fusion->gene2_id($star_gene_list{$fusion->{'breakpoint'}}{'gene2_id'}); + my $fusion = parse_vagrent_query_file($line); + $fusion->gene1($gene_list->{$fusion->{'breakpoint'}}{'gene1_name'}); + $fusion->gene2($gene_list->{$fusion->{'breakpoint'}}{'gene2_name'}); + $fusion->gene1_id($gene_list->{$fusion->{'breakpoint'}}{'gene1_id'}); + $fusion->gene2_id($gene_list->{$fusion->{'breakpoint'}}{'gene2_id'}); my $genomic_pos1 = Sanger::CGP::Vagrent::Data::GenomicRegion->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $fusion->{'chr1'}, 'minpos' => $fusion->{'pos1_start'}, 'maxpos' => $fusion->{'pos1_end'}, 'id' => $fusion->{'breakpoint'}); my $genomic_pos2 = Sanger::CGP::Vagrent::Data::GenomicRegion->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $fusion->{'chr2'}, 'minpos' => $fusion->{'pos2_start'}, 'maxpos' => $fusion->{'pos2_end'}, 'id' => $fusion->{'breakpoint'}); @@ -779,10 +1352,9 @@ sub query_vagrent { parse_transcript_data($fusion, 1, \@trans1); parse_transcript_data($fusion, 2, \@trans2); - $breaklist{$fusion->{'breakpoint'}} = $fusion if(!exists $breaklist{$fusion->{'breakpoint'}}); } - close ($ifh3); + close ($ifh2); my $final_annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann_final"); my $final_annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann_final"); @@ -838,13 +1410,36 @@ sub run_bed_pairtopair { my $sample = $options->{'sample'}; my $prog = _which('bedtools'); + my @commands; # There will always be at least two input files so build the command for the first comparison - my $command = $prog . sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.bedpe"), + my $overlap_file1 = File::Spec->catfile($tmp, "1_2.$sample.bedpe_overlap"); + push @commands, $prog . 
sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.bedpe"), File::Spec->catfile($tmp, "2.$sample.bedpe"), - File::Spec->catfile($tmp, "1_2.$sample.bedpe_overlap"); + $overlap_file1; + + if($options->{'num'} == 3){ + + my $overlap_file2 = File::Spec->catfile($tmp, "1_3.$sample.bedpe_overlap"); + push @commands, $prog . sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "1.$sample.bedpe"), + File::Spec->catfile($tmp, "3.$sample.bedpe"), + $overlap_file2; + + my $overlap_file3 = File::Spec->catfile($tmp, "2_3.$sample.bedpe_overlap"); + push @commands, $prog . sprintf $BEDTOOLS_PAIRTOPAIR, File::Spec->catfile($tmp, "2.$sample.bedpe"), + File::Spec->catfile($tmp, "3.$sample.bedpe"), + $overlap_file3; + + # Also find the intersection between all three algorithms using Unix sort and join commands. + my $sorted_file1 = File::Spec->catfile($tmp, "1_2.$sample.bedpe_overlap.sorted"); + my $sorted_file2 = File::Spec->catfile($tmp, "1_3.$sample.bedpe_overlap.sorted"); + my $overlap_file123 = File::Spec->catfile($tmp, "1_2_3.$sample.bedpe_overlap"); + push @commands, sprintf $SORT, 8, 8, $overlap_file1, $sorted_file1; + push @commands, sprintf $SORT, 8,8, $overlap_file2, $sorted_file2; + push @commands, sprintf $JOIN, $sorted_file1,$sorted_file2, $overlap_file123; + } - PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), $command, 0); + PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, 0); PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return 1; From 75d22410d2d60db11cc2c14dcbe3fccb30ed3b01 Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 30 Nov 2015 13:45:01 +0000 Subject: [PATCH 08/40] Changed vargrent code to consider mid exon breakpoints rather than just the exon boundaries --- perl/lib/Sanger/CGP/CompareFusions/Implement.pm | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm 
b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index ab186c4..566aef7 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -855,6 +855,8 @@ sub parse_transcript_data { my $exon_end; my $transcript_id; my $gene_biotype; + my $found_exon_boundary = 0; + my $curr_distance = 10000000; my $num_transcripts = scalar @sortedTrans; for (my $x=0;$x<$num_transcripts; $x++){ my @exons = $sortedTrans[$x]->getExons; @@ -862,15 +864,25 @@ sub parse_transcript_data { for (my $y=0;$y<$num_exons; $y++){ my $e = $exons[$y]; if($fusion->{'pos'.$breaknum.'_end'} == $e->getMinPos || $fusion->{'pos'.$breaknum.'_end'} == $e->getMaxPos){ + $found_exon_boundary = 1; $transcript_id = $sortedTrans[$x]->getAccession; $gene_biotype = $sortedTrans[$x]->{'_genetype'}; $exon_start = $e->getMinPos; $exon_end = $e->getMaxPos; $exon_number = $y+1; last; + }elsif($fusion->{'pos'.$breaknum.'_end'} > $e->getMinPos && $fusion->{'pos'.$breaknum.'_end'} < $e->getMaxPos){ + my $distance = find_closest_boundary($fusion->{'pos'.$breaknum.'_end'}, $e->getMinPos, $e->getMaxPos); + if ($distance < $curr_distance){ + $transcript_id = $sortedTrans[$x]->getAccession; + $gene_biotype = $sortedTrans[$x]->{'_genetype'}; + $exon_start = $e->getMinPos; + $exon_end = $e->getMaxPos; + $exon_number = $y+1; + } } } - last if(defined $transcript_id); + last if($found_exon_boundary); } if($breaknum == 1){ $fusion->transcript1_id($transcript_id); @@ -936,7 +948,7 @@ sub process_annotation_file { $break = $annotation->{'breakpoint'}; $alt_break = $annotation->{'alt_breakpoint'}; $alt_break2 = $annotation->{'alt_breakpoint2'}; - next if($annotation->{'gene_name'} ne $annotation->{'genename'} && $annotation->{'source'} ne "D"); + #next if($annotation->{'gene_name'} ne $annotation->{'genename'} && $annotation->{'source'} ne "D"); if($break ne $curr_break){ unless($curr_break eq ""){ $curr_pos = $curr_annotation->{'pos_end'}; @@ -953,6 +965,7 @@ sub 
process_annotation_file { my $breakpoint = $curr_annotation->{'breakpoint'}; my $name = $curr_annotation->{'genename'}; # The breakpoint doesn't fall within 10bp of an exon boundary. We need to check it falls within the footprint of the star gene and, for now, print it as intronic + print "The gene is $name\n"; my $gene_start = $gene_info->{$curr_annotation->{'genename'}}{'feature_start'}; my $gene_end = $gene_info->{$curr_annotation->{'genename'}}{'feature_end'}; my $break_pos = $curr_annotation->{'pos_end'}; From 66ec37c28898c1fc2787b00fa5f6e9381fab6fca Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 30 Nov 2015 17:14:38 +0000 Subject: [PATCH 09/40] Changed the select process_annotation_file subroutine so that it can handle when the gene name called by the fusion algorithm does not match what is in the gtf file. It will now either annotated as intronic or unannotated --- .../Sanger/CGP/CompareFusions/Implement.pm | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index 566aef7..820dd65 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -962,15 +962,20 @@ sub process_annotation_file { print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'source'}."\n"; } else{ - my $breakpoint = $curr_annotation->{'breakpoint'}; - my $name = $curr_annotation->{'genename'}; # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic - print "The gene is $name\n"; my $gene_start = $gene_info->{$curr_annotation->{'genename'}}{'feature_start'}; my $gene_end = $gene_info->{$curr_annotation->{'genename'}}{'feature_end'}; my $break_pos = $curr_annotation->{'pos_end'}; - if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\tIntronic\t".$gtf."\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; + if(defined $gene_start && defined $gene_end){ + if($break_pos >= $gene_start && $break_pos <= $gene_end){ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\tIntronic\t".$gtf."\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; + } + else{ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\tNA\tUnannotated\tNA\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; + } + } + else{ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\tNA\tUnannotated\tNA\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; } } $curr_distance = 10000000; @@ -1004,9 +1009,17 @@ sub process_annotation_file { my $gene_start = $gene_info->{$curr_annotation->{'genename'}}{'feature_start'}; my $gene_end = $gene_info->{$curr_annotation->{'genename'}}{'feature_end'}; my $break_pos = $curr_annotation->{'pos_end'}; - if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 
$curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\tIntronic\t".$gtf."\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; + if(defined $gene_start && defined $gene_end){ + if($break_pos >= $gene_start && $break_pos <= $gene_end){ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\tIntronic\t".$gtf."\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; + } + else{ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\tNA\tUnannotated\tNA\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; + } } + else{ + print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\tNA\tUnannotated\tNA\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; + } } } else{ From 2b555825fc492bca06b853186145c44d545c36bc Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 4 Dec 2015 13:45:20 +0000 Subject: [PATCH 10/40] Changed vagrent and select annotation subroutines to check that the gene annotation is consistent with the algorithm call --- .../Sanger/CGP/CompareFusions/Implement.pm | 363 +++++++++++------- 1 file changed, 229 insertions(+), 134 deletions(-) diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index 820dd65..ff5edce 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -654,6 +654,43 @@ sub parse_annotation { return \%annotation; } +sub parse_bed_file { + my ($line, $breaknum) = @_; + + my @fields = split "\t", $line; + my $fusion; + + if($breaknum == 1){ + + 
$fusion = new Sanger::CGP::CompareFusions::FusionAnnotation( + -breakpoint => $fields[3], + -alt_breakpoint => $fields[4], + -alt_breakpoint2 => $fields[8], + -chr1 => $fields[0], + -pos1_start => $fields[1], + -pos1_end => $fields[2], + -strand1 => $fields[5], + -gene1 => $fields[6], + -gene1_id => $fields[7], + -feature1 => $fields[9]); + } + else{ + $fusion = new Sanger::CGP::CompareFusions::FusionAnnotation( + -breakpoint => $fields[3], + -alt_breakpoint => $fields[4], + -alt_breakpoint2 => $fields[8], + -chr2 => $fields[0], + -pos2_start => $fields[1], + -pos2_end => $fields[2], + -strand2 => $fields[5], + -gene2 => $fields[6], + -gene2_id => $fields[7], + -feature1 => $fields[9]); + } + + return $fusion; +} + sub parse_break_data { my $line = shift; @@ -713,6 +750,63 @@ sub parse_defuse_file { return \%defuse_data; } +sub parse_exon_data { + my ($fusion, $breaknum, $exons) = @_; + + my $exon_number; + my $exon_start; + my $exon_end; + my $transcript_id; + my $gene_biotype; + my $found_exon_boundary = 0; + my $found_exon = 0; + my $curr_distance = 10000000; + + my $break_pos = $fusion->{'pos'.$breaknum.'_end'}; + + for my $e (keys $exons){ + my $exon = $exons->{$e}; + if($break_pos == $exon->{'feature_start'} || $break_pos == $exon->{'feature_end'}){ + $found_exon_boundary = 1; + $transcript_id = $exon->{'transcript_id'}; + $gene_biotype = $exon->{'gene_biotype'}; + $exon_start = $exon->{'feature_start'}; + $exon_end = $exon->{'feature_end'}; + $exon_number = $exon->{'exon_number'}; + last; + } + elsif($break_pos > $exon->{'feature_start'} && $break_pos < $exon->{'feature_end'}){ + my $distance = find_closest_boundary($break_pos, $exon->{'feature_start'}, $exon->{'feature_end'}); + if ($distance < $curr_distance){ + $found_exon = 1; + $transcript_id = $exon->{'transcript_id'}; + $gene_biotype = $exon->{'gene_biotype'}; + $exon_start = $exon->{'feature_start'}; + $exon_end = $exon->{'feature_end'}; + $exon_number = $exon->{'exon_number'}; + } + } + last 
if($found_exon_boundary); + } + if ($found_exon_boundary || $found_exon){ + if($breaknum == 1){ + $fusion->transcript1_id($transcript_id); + $fusion->gene1_biotype($gene_biotype); + $fusion->exon1_num($exon_number); + $fusion->feature1_start($exon_start); + $fusion->feature1_end($exon_end); + } + else{ + $fusion->transcript2_id($transcript_id); + $fusion->gene2_biotype($gene_biotype); + $fusion->exon2_num($exon_number); + $fusion->feature2_start($exon_start); + $fusion->feature2_end($exon_end); + } + } + return $fusion; +} + sub parse_gene_info { my $line = shift; $line =~ s/"//g; @@ -841,7 +935,6 @@ sub parse_tophat_file { sub parse_transcript_data { my ($fusion, $breaknum, $transcripts) = @_; - my @filteredTrans; foreach my $t(@{$transcripts}){ push(@filteredTrans, $t) if ($fusion->{'pos'.$breaknum.'_end'} >= $t->getGenomicMinPos && $fusion->{'pos'.$breaknum.'_end'} <= $t->getGenomicMaxPos); @@ -849,13 +942,14 @@ sub parse_transcript_data { my @sortedTrans = sort{&annotation_sort} @filteredTrans; if(defined $sortedTrans[0]){ - + my $vagrent_genename; my $exon_number; my $exon_start; my $exon_end; my $transcript_id; my $gene_biotype; my $found_exon_boundary = 0; + my $found_exon = 0; my $curr_distance = 10000000; my $num_transcripts = scalar @sortedTrans; for (my $x=0;$x<$num_transcripts; $x++){ @@ -870,35 +964,43 @@ sub parse_transcript_data { $exon_start = $e->getMinPos; $exon_end = $e->getMaxPos; $exon_number = $y+1; + $vagrent_genename = $sortedTrans[$x]->{'_genename'}; last; }elsif($fusion->{'pos'.$breaknum.'_end'} > $e->getMinPos && $fusion->{'pos'.$breaknum.'_end'} < $e->getMaxPos){ my $distance = find_closest_boundary($fusion->{'pos'.$breaknum.'_end'}, $e->getMinPos, $e->getMaxPos); if ($distance < $curr_distance){ + $found_exon = 1; $transcript_id = $sortedTrans[$x]->getAccession; $gene_biotype = $sortedTrans[$x]->{'_genetype'}; $exon_start = $e->getMinPos; $exon_end = $e->getMaxPos; $exon_number = $y+1; + $vagrent_genename = 
$sortedTrans[$x]->{'_genename'}; } } } last if($found_exon_boundary); } - if($breaknum == 1){ - $fusion->transcript1_id($transcript_id); - $fusion->gene1_biotype($gene_biotype); - $fusion->exon1_num($exon_number); - $fusion->feature1_start($exon_start); - $fusion->feature1_end($exon_end); - } - else{ - $fusion->transcript2_id($transcript_id); - $fusion->gene2_biotype($gene_biotype); - $fusion->exon2_num($exon_number); - $fusion->feature2_start($exon_start); - $fusion->feature2_end($exon_end); - } - } + + if(defined $vagrent_genename){ + if($vagrent_genename eq $fusion->{'gene'.$breaknum} && ($found_exon_boundary || $found_exon)){ + if($breaknum == 1){ + $fusion->transcript1_id($transcript_id); + $fusion->gene1_biotype($gene_biotype); + $fusion->exon1_num($exon_number); + $fusion->feature1_start($exon_start); + $fusion->feature1_end($exon_end); + } + else{ + $fusion->transcript2_id($transcript_id); + $fusion->gene2_biotype($gene_biotype); + $fusion->exon2_num($exon_number); + $fusion->feature2_start($exon_start); + $fusion->feature2_end($exon_end); + } + } + } + } return $fusion; } @@ -925,110 +1027,24 @@ sub parse_vagrent_query_file { } sub process_annotation_file { - my ($options, $input, $output, $gene_info) = @_; - - my $curr_distance = 10000000; - my $curr_break = ""; - my $curr_annotation; - my $curr_pos; - my $curr_exon_start; - my $curr_exon_end; - my $break; - my $alt_break; - my $alt_break2; - - my ($gtf, $path) = fileparse($options->{'gtf'}); + my ($input) = @_; + + my %exon_annotation; - open(my $ofh1, '>>', $output) or die "Could not open file $output $!"; open (my $ifh1, $input) or die "Could not open file '$input' $!"; while (<$ifh1>){ chomp; my $line = $_; my $annotation = parse_annotation($line); - $break = $annotation->{'breakpoint'}; - $alt_break = $annotation->{'alt_breakpoint'}; - $alt_break2 = $annotation->{'alt_breakpoint2'}; - #next if($annotation->{'gene_name'} ne $annotation->{'genename'} && $annotation->{'source'} ne "D"); - if($break ne 
$curr_break){ - unless($curr_break eq ""){ - $curr_pos = $curr_annotation->{'pos_end'}; - $curr_exon_start = $curr_annotation->{'feature_start'}; - $curr_exon_end = $curr_annotation->{'feature_end'}; - - if($curr_distance <= 10){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'source'}."\n"; - } - elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'source'}."\n"; - } - else{ - # The breakpoint doesn't fall within 10bp of an exon boundary. 
We need to check it falls within the footprint of the star gene and, for now, print it as intronic - my $gene_start = $gene_info->{$curr_annotation->{'genename'}}{'feature_start'}; - my $gene_end = $gene_info->{$curr_annotation->{'genename'}}{'feature_end'}; - my $break_pos = $curr_annotation->{'pos_end'}; - if(defined $gene_start && defined $gene_end){ - if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\tIntronic\t".$gtf."\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; - } - else{ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\tNA\tUnannotated\tNA\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; - } - } - else{ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\tNA\tUnannotated\tNA\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; - } - } - $curr_distance = 10000000; - $curr_break = $break; - } - } - my $pos = $annotation->{'pos_end'}; - my $exon_start = $annotation->{'feature_start'}; - my $exon_end = $annotation->{'feature_end'}; - my $distance = find_closest_boundary($pos, $exon_start, $exon_end); - - if($distance < $curr_distance){ - $curr_distance = $distance; - $curr_annotation = $annotation; - $curr_break = $break; - } - } - if (defined $curr_annotation){ - $curr_pos = $curr_annotation->{'pos_end'}; - $curr_exon_start = $curr_annotation->{'feature_start'}; - $curr_exon_end = $curr_annotation->{'feature_end'}; - - if($curr_distance <= 10){ - print $ofh1 
$curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'source'}."\n"; - } - elsif($curr_pos > $curr_exon_start && $curr_pos < $curr_exon_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\t".$curr_annotation->{'transcript_id'}."\t".$gtf."\t".$curr_annotation->{'exon_number'}."\t".$curr_annotation->{'feature_start'}."\t".$curr_annotation->{'feature_end'}."\t".$curr_annotation->{'source'}."\n"; - } - else{ - # The breakpoint doesn't fall within 10bp of an exon boundary. We need to check it falls within the footprint of the star gene and, for now, print it as intronic - my $gene_start = $gene_info->{$curr_annotation->{'genename'}}{'feature_start'}; - my $gene_end = $gene_info->{$curr_annotation->{'genename'}}{'feature_end'}; - my $break_pos = $curr_annotation->{'pos_end'}; - if(defined $gene_start && defined $gene_end){ - if($break_pos >= $gene_start && $break_pos <= $gene_end){ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\t".$curr_annotation->{'gene_id'}."\tIntronic\t".$gtf."\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; - } - else{ - print $ofh1 $curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\tNA\tUnannotated\tNA\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; - } - } - else{ - print $ofh1 
$curr_annotation->{'breakpoint'}."\t".$curr_annotation->{'alt_breakpoint'}."\t".$curr_annotation->{'alt_breakpoint2'}."\t".$curr_annotation->{'genename'}."\tNA\tUnannotated\tNA\tNA\tNA\tNA\t".$curr_annotation->{'source'}."\n"; - } + if($annotation->{'gene_name'} eq $annotation->{'genename'}){ + my $break = $annotation->{'breakpoint'}; + my $exon = $annotation->{'exon_id'}; + $exon_annotation{$break}{$exon} = $annotation; } } - else{ - print $ofh1 $break."\t".$alt_break."\t".$alt_break2."\n"; - } close ($ifh1); - close ($ofh1); - return 1; + return \%exon_annotation; } sub process_overlap_files { @@ -1376,8 +1392,8 @@ sub query_vagrent { my @trans1 = $ts->getTranscripts($genomic_pos1); my @trans2 = $ts->getTranscripts($genomic_pos2); - parse_transcript_data($fusion, 1, \@trans1); - parse_transcript_data($fusion, 2, \@trans2); + $fusion = parse_transcript_data($fusion, 1, \@trans1); + $fusion = parse_transcript_data($fusion, 2, \@trans2); $breaklist{$fusion->{'breakpoint'}} = $fusion if(!exists $breaklist{$fusion->{'breakpoint'}}); } close ($ifh2); @@ -1480,12 +1496,27 @@ sub select_annotation { return 1 if PCAP::Threaded::success_exists(File::Spec->catdir($tmp, 'progress'), 0); my $sample = $options->{'sample'}; - my $annot_file1; - my $annot_file2; - my $final_annot_file1; - my $final_annot_file2; - my $gene_gtf = File::Spec->catfile($tmp, "filtered_gene.gtf"); + my ($gtf, $path) = fileparse($options->{'gtf'}); + + my $annot_file1 = File::Spec->catfile($tmp, "$sample.1.ann"); + my $annot_file2 = File::Spec->catfile($tmp, "$sample.2.ann"); + my $bed_file1 = File::Spec->catfile($tmp, "$sample.1.bed"); + my $bed_file2 = File::Spec->catfile($tmp, "$sample.2.bed"); + my $final_annot_file1 = $annot_file1."_final"; + my $final_annot_file2 = $annot_file2."_final"; + + my $exon_annotation1; + my $exon_annotation2; + + if(-s $annot_file1){ + $exon_annotation1 = process_annotation_file($annot_file1); + } + if(-s $annot_file2){ + $exon_annotation2 = 
process_annotation_file($annot_file2); + } + + my $gene_gtf = File::Spec->catfile($tmp, "filtered_gene.gtf"); my %gene_info; open (my $ifh1, $gene_gtf) or die "Could not open file '$gene_gtf' $!"; @@ -1500,24 +1531,88 @@ sub select_annotation { } close ($ifh1); - opendir(my $dh, $tmp); - while(my $file = readdir $dh){ - $annot_file1 = File::Spec->catfile($tmp, $file) if($file eq "$sample.1.ann"); - $annot_file2 = File::Spec->catfile($tmp, $file) if($file eq "$sample.2.ann"); - } - closedir($dh); + my $bed1; + my $bed2; + + if(-s $bed_file1){ + open(my $ofh1, '>>', $final_annot_file1) or die "Could not open file '$final_annot_file1' $!"; + open (my $ifh2, $bed_file1) or die "Could not open file '$bed_file1' $!"; + while (<$ifh2>) { + chomp; + my $line = $_; + my $fusion = parse_bed_file($line, 1); + if(exists $exon_annotation1->{$fusion->breakpoint}){ + $fusion = parse_exon_data($fusion, 1, $exon_annotation1->{$fusion->breakpoint}); + } + if(!exists $fusion->{'transcript1_id'}){ + $fusion->{'exon1_number'} = 'NA'; + $fusion->{'feature1_start'} = 'NA'; + $fusion->{'feature1_end'} = 'NA'; + # Check whether the breakpoint falls within the footprint of the gene. 
+ my $gene_start = $gene_info{$fusion->{'gene1'}}{'feature_start'}; + my $gene_end = $gene_info{$fusion->{'gene1'}}{'feature_end'}; + my $break_pos = $fusion->{'pos1_end'}; + if(defined $gene_start && defined $gene_end){ + if($break_pos >= $gene_start && $break_pos <= $gene_end){ + $fusion->{'transcript1_id'} = 'Intronic'; + } + else{ + my $location = 'upstream'; + $location = 'downstream' if($break_pos > $gene_end); + $fusion->{'transcript1_id'} = $location; + } + } + else{ + $fusion->{'transcript1_id'} = 'Unannotated'; + } + } + my $output_line = $fusion->format_break_line(1, $gtf); + print $ofh1 $output_line."\n"; + } + close($ifh2); + close($ofh1); + } - $final_annot_file1 = $annot_file1."_final"; - $final_annot_file2 = $annot_file2."_final"; - - if(-s $annot_file1){ - process_annotation_file($options, $annot_file1, $final_annot_file1, \%gene_info); + if(-s $bed_file2){ + open(my $ofh2, '>>', $final_annot_file2) or die "Could not open file '$final_annot_file2' $!"; + open (my $ifh3, $bed_file2) or die "Could not open file '$bed_file2' $!"; + while (<$ifh3>) { + chomp; + my $line = $_; + my $fusion = parse_bed_file($line, 2); + if(exists $exon_annotation1->{$fusion->breakpoint}){ + $fusion = parse_exon_data($fusion, 2, $exon_annotation1->{$fusion->breakpoint}); + } + if(!exists $fusion->{'transcript2_id'}){ + $fusion->{'exon2_number'} = 'NA'; + $fusion->{'feature2_start'} = 'NA'; + $fusion->{'feature2_end'} = 'NA'; + # Check whether the breakpoint falls within the footprint of the gene. 
+ my $gene_start = $gene_info{$fusion->{'gene2'}}{'feature_start'}; + my $gene_end = $gene_info{$fusion->{'gene2'}}{'feature_end'}; + my $break_pos = $fusion->{'pos2_end'}; + if(defined $gene_start && defined $gene_end){ + if($break_pos >= $gene_start && $break_pos <= $gene_end){ + $fusion->{'transcript2_id'} = 'Intronic'; + } + else{ + my $location = 'upstream'; + $location = 'downstream' if($break_pos > $gene_end); + $fusion->{'transcript2_id'} = $location; + } + } + else{ + $fusion->{'transcript2_id'} = 'Unannotated'; + } + } + my $output_line = $fusion->format_break_line(2, $gtf); + print $ofh2 $output_line."\n"; + } + close($ifh3); + close($ofh2); } - if(-s $annot_file2){ - process_annotation_file($options, $annot_file2, $final_annot_file2, \%gene_info); - } - - PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); + + #PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); return 1; } From 734b1db9a9cf93f036f2e829ff6279c1f094598e Mon Sep 17 00:00:00 2001 From: am26 Date: Tue, 8 Dec 2015 17:39:32 +0000 Subject: [PATCH 11/40] Debugging code after running over cell lines. 
Removed dependency on overlapping files and excluded certain GL00 chromosomes --- perl/bin/compare_overlapping_fusions.pl | 23 ++++++++-------- .../Sanger/CGP/CompareFusions/Implement.pm | 26 ++++++++++++------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/perl/bin/compare_overlapping_fusions.pl b/perl/bin/compare_overlapping_fusions.pl index 33ec3a8..25e08aa 100755 --- a/perl/bin/compare_overlapping_fusions.pl +++ b/perl/bin/compare_overlapping_fusions.pl @@ -80,20 +80,19 @@ BEGIN $threads->run($options->{'num'}, 'createjunctionbed', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'createjunctionbed'); Sanger::CGP::CompareFusions::Implement::run_bed_pairtopair($options) if(!exists $options->{'process'} || $options->{'process'} eq 'runbedpairtopair'); - if(-s File::Spec->catfile($options->{'tmp'}, "1_2.".$options->{'sample'}.".bedpe_overlap")){ - Sanger::CGP::CompareFusions::Implement::process_overlap_files($options) if(!exists $options->{'process'} || $options->{'process'} eq 'processoverlaps'); - Sanger::CGP::CompareFusions::Implement::process_singletons($options) if(!exists $options->{'process'} || $options->{'process'} eq 'singletons'); - Sanger::CGP::CompareFusions::Implement::query_vagrent($options) if(!exists $options->{'process'} || $options->{'process'} eq 'queryvagrent'); - if(-s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".1.bed") || -s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".2.bed")){ - Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); - Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); - Sanger::CGP::CompareFusions::Implement::collate_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'collateannotation'); - 
Sanger::CGP::CompareFusions::Implement::deduplicate_fusions($options) if(!exists $options->{'process'} || $options->{'process'} eq 'deduplicate'); - } + Sanger::CGP::CompareFusions::Implement::process_overlap_files($options) if(!exists $options->{'process'} || $options->{'process'} eq 'processoverlaps'); + Sanger::CGP::CompareFusions::Implement::process_singletons($options) if(!exists $options->{'process'} || $options->{'process'} eq 'singletons'); + Sanger::CGP::CompareFusions::Implement::query_vagrent($options) if(!exists $options->{'process'} || $options->{'process'} eq 'queryvagrent'); + if(-s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".1.bed") || -s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".2.bed")){ + Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed'); + Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation'); + Sanger::CGP::CompareFusions::Implement::collate_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'collateannotation'); + Sanger::CGP::CompareFusions::Implement::deduplicate_fusions($options) if(!exists $options->{'process'} || $options->{'process'} eq 'deduplicate'); } + if(!exists $options->{'process'} || $options->{'process'} eq 'output') { Sanger::CGP::CompareFusions::Implement::generate_output($options); - #cleanup($options); + cleanup($options); } } @@ -101,7 +100,7 @@ sub cleanup { my $options = shift; my $tmpdir = $options->{'tmp'}; my $sample = $options->{'sample'}; - move(File::Spec->catfile($tmpdir, "$sample.star-defuse.overlapping.fusions.txt"), $options->{'outdir'}) || die $!; + move(File::Spec->catfile($tmpdir, "$sample.detected.fusions.txt"), $options->{'outdir'}) || die $!; move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!; remove_tree 
$tmpdir if(-e $tmpdir); return 0; diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index ff5edce..400049c 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -83,12 +83,12 @@ my %ALLOWED_BIOTYPES = ( my %CONFIDENCE_SCORES = ( STD => '76%', - SD => '79%', + SD => '77%', ST => '48%', TD => '25%', - D => '59%', - T => '51%', - S => '24%', + D => '58%', + T => '50%', + S => '22%', ); # Position of the columns in the tophat-fusion filtered file used to format the bed file. @@ -1375,7 +1375,7 @@ sub query_vagrent { my %breaklist; my $vagrent_query_file = File::Spec->catfile($tmp, "$sample.vagrent.query.list"); - + open (my $ifh2, $vagrent_query_file) or die "Could not open file '$vagrent_query_file' $!"; while (<$ifh2>) { chomp; @@ -1388,12 +1388,18 @@ sub query_vagrent { my $genomic_pos1 = Sanger::CGP::Vagrent::Data::GenomicRegion->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $fusion->{'chr1'}, 'minpos' => $fusion->{'pos1_start'}, 'maxpos' => $fusion->{'pos1_end'}, 'id' => $fusion->{'breakpoint'}); my $genomic_pos2 = Sanger::CGP::Vagrent::Data::GenomicRegion->new('species' => 'human', 'genomeVersion' => 'GRCh38', 'chr' => $fusion->{'chr2'}, 'minpos' => $fusion->{'pos2_start'}, 'maxpos' => $fusion->{'pos2_end'}, 'id' => $fusion->{'breakpoint'}); - - my @trans1 = $ts->getTranscripts($genomic_pos1); - my @trans2 = $ts->getTranscripts($genomic_pos2); + + unless($genomic_pos1->{'_chr'} eq 'GL000219.1' || $genomic_pos2->{'_chr'} eq 'GL000219.1' || $genomic_pos1->{'_chr'} eq 'KI270726.1' || $genomic_pos2->{'_chr'} eq 'KI270726.1'){ + + print Dumper $genomic_pos1; + print Dumper $genomic_pos2; + my @trans1 = $ts->getTranscripts($genomic_pos1); + my @trans2 = $ts->getTranscripts($genomic_pos2); + + $fusion = parse_transcript_data($fusion, 1, \@trans1); + $fusion = parse_transcript_data($fusion, 2, \@trans2); + } - $fusion = 
parse_transcript_data($fusion, 1, \@trans1); - $fusion = parse_transcript_data($fusion, 2, \@trans2); $breaklist{$fusion->{'breakpoint'}} = $fusion if(!exists $breaklist{$fusion->{'breakpoint'}}); } close ($ifh2); From 89aaa91c679b18cd5f33baa819d2d68336427a73 Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 14 Dec 2015 12:08:37 +0000 Subject: [PATCH 12/40] Tidying up code --- perl/lib/Sanger/CGP/CompareFusions/Implement.pm | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index 400049c..e3358e5 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -1391,8 +1391,6 @@ sub query_vagrent { unless($genomic_pos1->{'_chr'} eq 'GL000219.1' || $genomic_pos2->{'_chr'} eq 'GL000219.1' || $genomic_pos1->{'_chr'} eq 'KI270726.1' || $genomic_pos2->{'_chr'} eq 'KI270726.1'){ - print Dumper $genomic_pos1; - print Dumper $genomic_pos2; my @trans1 = $ts->getTranscripts($genomic_pos1); my @trans2 = $ts->getTranscripts($genomic_pos2); @@ -1551,7 +1549,7 @@ sub select_annotation { $fusion = parse_exon_data($fusion, 1, $exon_annotation1->{$fusion->breakpoint}); } if(!exists $fusion->{'transcript1_id'}){ - $fusion->{'exon1_number'} = 'NA'; + $fusion->{'exon1_num'} = 'NA'; $fusion->{'feature1_start'} = 'NA'; $fusion->{'feature1_end'} = 'NA'; # Check whether the breakpoint falls within the footprint of the gene. @@ -1590,7 +1588,7 @@ sub select_annotation { $fusion = parse_exon_data($fusion, 2, $exon_annotation1->{$fusion->breakpoint}); } if(!exists $fusion->{'transcript2_id'}){ - $fusion->{'exon2_number'} = 'NA'; + $fusion->{'exon2_num'} = 'NA'; $fusion->{'feature2_start'} = 'NA'; $fusion->{'feature2_end'} = 'NA'; # Check whether the breakpoint falls within the footprint of the gene. 
From 30df67ba8ee25b18149f40ab7767489350b3fb1c Mon Sep 17 00:00:00 2001 From: am26 Date: Wed, 3 Feb 2016 16:55:35 +0000 Subject: [PATCH 13/40] Fixing bug where defuse orientation incorrectly picked up --- .../Sanger/CGP/CompareFusions/Implement.pm | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index e3358e5..45873cd 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -431,6 +431,15 @@ sub deduplicate_fusions { shift @fields; print $ofh2 join("\t", @fields)."\n"; } + } + elsif($source eq 'D'){ + $fields[1] =~ m/^(.*)_[0-9]+$/; + my $junction = $1; + if(!exists $seen{$junction}){ + $seen{$junction} = $fields[1]; + shift @fields; + print $ofh2 join("\t", @fields)."\n"; + } } else{ if(!exists $seen{$fields[1]}){ @@ -579,12 +588,15 @@ sub generate_output { $defuse_breakpoint = $fields[$defuse_pos]; $defuse_splitr_count = $defuse_data->{$defuse_breakpoint}{'split_reads'}; $defuse_span_count = $defuse_data->{$defuse_breakpoint}{'span_reads'}; - $chr1 = $defuse_data->{$defuse_breakpoint}{'chr1'}; - $pos1 = $defuse_data->{$defuse_breakpoint}{'pos1'}; - $strand1 = $defuse_data->{$defuse_breakpoint}{'strand1'}; - $chr2 = $defuse_data->{$defuse_breakpoint}{'chr2'}; - $pos2 = $defuse_data->{$defuse_breakpoint}{'pos2'}; - $strand2 = $defuse_data->{$defuse_breakpoint}{'strand2'}; + # If we already have the chr-pos-strand info we don't want to pick it up from deFuse in case the orientation is reported differently + if(!defined $star_pos && !defined $tophat_pos){ + $chr1 = $defuse_data->{$defuse_breakpoint}{'chr1'}; + $pos1 = $defuse_data->{$defuse_breakpoint}{'pos1'}; + $strand1 = $defuse_data->{$defuse_breakpoint}{'strand1'}; + $chr2 = $defuse_data->{$defuse_breakpoint}{'chr2'}; + $pos2 = $defuse_data->{$defuse_breakpoint}{'pos2'}; + $strand2 = 
$defuse_data->{$defuse_breakpoint}{'strand2'}; + } $defuse_splitr_seq = $defuse_data->{$defuse_breakpoint}{'sequence'}; my @defuse_temp = split "_", $defuse_breakpoint; $defuse_junction = $defuse_temp[0]; From 4a053341ba92b22038928e396cdf2207667080f6 Mon Sep 17 00:00:00 2001 From: am26 Date: Thu, 4 Feb 2016 12:55:02 +0000 Subject: [PATCH 14/40] Script to apply filters to the deFuse raw output based on Graham Bignells recommendations --- perl/bin/defuse_filters.pl | 160 +++++++++++++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100755 perl/bin/defuse_filters.pl diff --git a/perl/bin/defuse_filters.pl b/perl/bin/defuse_filters.pl new file mode 100755 index 0000000..17810df --- /dev/null +++ b/perl/bin/defuse_filters.pl @@ -0,0 +1,160 @@ +#!/usr/bin/perl +##########LICENCE ########## +#Copyright (c) 2015 Genome Research Ltd. +### +#Author: Cancer Genome Project +### +#This file is part of cgpRna. +### +#cgpRna is free software: you can redistribute it and/or modify it under +#the terms of the GNU Affero General Public License as published by the +#Free Software Foundation; either version 3 of the License, or (at your +#option) any later version. +### +#This program is distributed in the hope that it will be useful, but +#WITHOUT ANY WARRANTY; without even the implied warranty of +#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero +#General Public License for more details. +### +#You should have received a copy of the GNU Affero General Public +#License along with this program. If not, see +#. +### +#1. The usage of a range of years within a copyright statement contained +#within this distribution should be interpreted as being equivalent to a +#list of years including the first and last year specified and all +#consecutive years between them. 
For example, a copyright statement that +#reads ‘Copyright (c) 2005, 2007- 2009, 2011-2012’ should be interpreted +#as being identical to a statement that reads ‘Copyright (c) 2005, 2007, +#2008, 2009, 2011, 2012’ and a copyright statement that reads ‘Copyright +#(c) 2005-2012’ should be interpreted as being identical to a statement +#that reads ‘Copyright (c) 2005, 2006, 2007, 2008, 2009, 2010, 2011, +#2012’." +##########LICENCE ########## +########## +use strict; +use warnings; + +use autodie qw(:all); +use English qw( -no_match_vars ); +use File::Path qw(remove_tree make_path); +use Getopt::Long; +use File::Spec; +use Pod::Usage qw(pod2usage); +use Const::Fast qw(const); +use PCAP::Cli; + +# Columns in the deFuse output file that will be filtered. +const my $DEFUSE_SPLIT_CHAR => '\t'; + +const my $SPLITR_MIN_PVAL_COL => 7; +const my $BREAKSEQS_ESTISLANDS_PERCIDENT_COL => 14; +const my $CDNA_BREAKSEQS_PERCIDENT_COL => 15; +const my $EST_BREAKSEQS_PERCIDENT_COL => 17; +const my $GENOME_BREAKSEQS_PERCIDENT_COL => 38; +const my $SPAN_COVERAGE_MIN_COL => 66; + +const my $SPLITR_MIN_PVAL_VAL => 0.1; # splitr_min_pvalue - > 0.1 +const my $BREAKSEQS_ESTISLANDS_PERCIDENT_VAL => 0.3; # breakseqs_estislands_percident - < 0.3 +const my $CDNA_BREAKSEQS_PERCIDENT_VAL => 0.1; # cdna_breakseqs_percident - < 0.1 +const my $EST_BREAKSEQS_PERCIDENT_VAL => 0.3; # est_breakseqs_percident - < 0.3 +const my $GENOME_BREAKSEQS_PERCIDENT_VAL => 0.1; # genome_breakseqs_percident - < 0.1 +const my $SPAN_COVERAGE_MIN_VAL => 0.6; # span_coverage_min - > 0.6 + +{ + my $options = setup(); + + my $input = File::Spec->rel2abs($options->{'input'}); + my $sample = $options->{'sample'}; + my $outdir = $options->{'outdir'}; + my $output = File::Spec->catfile($outdir, "$sample.defuse-fusion.normals.ext.filtered.txt"); + + open (my $ifh, $input) or die "Could not open file '$input' $!"; + open(my $ofh, '>', $output) or die "Could not open file '$output' $!"; + + while (<$ifh>) { + chomp; + my $line = 
$_; + if($line =~ m/^breakpoint_ref/){ + print $ofh $line."\n"; + } + else{ + my @fields = split $DEFUSE_SPLIT_CHAR, $line; + if($fields[$SPLITR_MIN_PVAL_COL-1] > $SPLITR_MIN_PVAL_VAL && $fields[$BREAKSEQS_ESTISLANDS_PERCIDENT_COL-1] < $BREAKSEQS_ESTISLANDS_PERCIDENT_VAL && + $fields[$CDNA_BREAKSEQS_PERCIDENT_COL-1] < $CDNA_BREAKSEQS_PERCIDENT_VAL && $fields[$EST_BREAKSEQS_PERCIDENT_COL-1] < $EST_BREAKSEQS_PERCIDENT_VAL && + $fields[$GENOME_BREAKSEQS_PERCIDENT_COL-1] < $GENOME_BREAKSEQS_PERCIDENT_VAL && $fields[$SPAN_COVERAGE_MIN_COL-1] > $SPAN_COVERAGE_MIN_VAL) { + + print $ofh $line."\n"; + } + } + } + close($ifh); + close($ofh); + +} + +sub setup { + my %opts; + pod2usage(-msg => "\nERROR: Options must be defined.\n", -verbose => 1, -output => \*STDERR) if(scalar @ARGV == 0); + $opts{'cmd'} = join " ", $0, @ARGV; + + GetOptions( 'h|help' => \$opts{'h'}, + 'm|man' => \$opts{'m'}, + 'i|input=s' => \$opts{'input'}, + 'o|outdir=s' => \$opts{'outdir'}, + 's|sample=s' => \$opts{'sample'}, + ) or pod2usage(2); + + pod2usage(-verbose => 1) if(defined $opts{'h'}); + pod2usage(-verbose => 2) if(defined $opts{'m'}); + + PCAP::Cli::file_for_reading('input', $opts{'input'}); + + # Check the output directory exists and is writeable, create if not + PCAP::Cli::out_dir_check('outdir', $opts{'outdir'}); + + return \%opts; +} + +sub write_output { + my $options = shift; + + my $tmp = $options->{'tmp'}; + my $sample = $options-> {'sample'}; + my $program = $options-> {'program'}; + my $header = $options->{'header'}; + my $outdir = $options->{'outdir'}; + my $fusions_file = File::Spec->catfile($tmp,"$sample.fusions.filtered"); + my $output_file = File::Spec->catfile($outdir,"$sample.$program-fusion.normals.filtered.txt"); + PCAP::Cli::file_for_reading('filtered.fusions', $fusions_file); + + open (my $ifh, $fusions_file) or die "Could not open file $fusions_file $!"; + open(my $ofh, '>', $output_file) or die "Could not open file $output_file $!"; + print $ofh $header."\n"; + while 
(<$ifh>) { + chomp; + my $line = $_; + $line =~ s/\s/\t/g; + print $ofh $line."\n"; + } + close ($ifh); + close ($ofh); + + return 1; +} + +__END__ + +=head1 defuse_fusions.pl + +Filters the defuse data based on validation carried out by Graham Bignell on the CTTV RNA-Seq cell lines data set. Details of the filters are provided in the constants section at the top of the script. + +=head1 SYNOPSIS + +defuse_fusions.pl [options] + + Required parameters: + -outdir -o Folder to output result to. + -sample -s Sample name + -input -i deFuse input file containing fusions called by the cgpRna pipeline. + From 39ea4b221efb4aed4f3cc9d44a020f7c40641603 Mon Sep 17 00:00:00 2001 From: am26 Date: Thu, 4 Feb 2016 13:15:05 +0000 Subject: [PATCH 15/40] Adding a flag to the defuse output rather than actually filtering the data --- perl/bin/defuse_filters.pl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/perl/bin/defuse_filters.pl b/perl/bin/defuse_filters.pl index 17810df..621e8b3 100755 --- a/perl/bin/defuse_filters.pl +++ b/perl/bin/defuse_filters.pl @@ -76,7 +76,7 @@ chomp; my $line = $_; if($line =~ m/^breakpoint_ref/){ - print $ofh $line."\n"; + print $ofh $line."\tcgp_defuse_filter\n"; } else{ my @fields = split $DEFUSE_SPLIT_CHAR, $line; @@ -84,7 +84,10 @@ $fields[$CDNA_BREAKSEQS_PERCIDENT_COL-1] < $CDNA_BREAKSEQS_PERCIDENT_VAL && $fields[$EST_BREAKSEQS_PERCIDENT_COL-1] < $EST_BREAKSEQS_PERCIDENT_VAL && $fields[$GENOME_BREAKSEQS_PERCIDENT_COL-1] < $GENOME_BREAKSEQS_PERCIDENT_VAL && $fields[$SPAN_COVERAGE_MIN_COL-1] > $SPAN_COVERAGE_MIN_VAL) { - print $ofh $line."\n"; + print $ofh $line."\t1\n"; + } + else{ + print $ofh $line."\t0\n"; } } } @@ -147,7 +150,8 @@ sub write_output { =head1 defuse_fusions.pl -Filters the defuse data based on validation carried out by Graham Bignell on the CTTV RNA-Seq cell lines data set. Details of the filters are provided in the constants section at the top of the script. 
+Adds a flag (called cgp_defuse_filter) to the raw defuse data based on validation carried out by Graham Bignell on the CTTV RNA-Seq cell lines data set. +The flag can be used to filter the data in downstream analysis with the aim of reducing the number of false positive fusions called. Details of the filter thresholds can be found in the constants section at the top of the script. =head1 SYNOPSIS @@ -158,3 +162,4 @@ =head1 SYNOPSIS -sample -s Sample name -input -i deFuse input file containing fusions called by the cgpRna pipeline. +In the output file, a row with a 1 in the column cgp_defuse_filter means that this fusion has passed the set of filter thresholds whereas 0 means this fusion can potentially be filtered out. \ No newline at end of file From 1719f2d06c80e8e8af5d19bab92811b70d73ce68 Mon Sep 17 00:00:00 2001 From: am26 Date: Thu, 4 Feb 2016 13:16:16 +0000 Subject: [PATCH 16/40] Removing unnecessary subroutine write_output --- perl/bin/defuse_filters.pl | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/perl/bin/defuse_filters.pl b/perl/bin/defuse_filters.pl index 621e8b3..7023c64 100755 --- a/perl/bin/defuse_filters.pl +++ b/perl/bin/defuse_filters.pl @@ -119,33 +119,6 @@ sub setup { return \%opts; } -sub write_output { - my $options = shift; - - my $tmp = $options->{'tmp'}; - my $sample = $options-> {'sample'}; - my $program = $options-> {'program'}; - my $header = $options->{'header'}; - my $outdir = $options->{'outdir'}; - my $fusions_file = File::Spec->catfile($tmp,"$sample.fusions.filtered"); - my $output_file = File::Spec->catfile($outdir,"$sample.$program-fusion.normals.filtered.txt"); - PCAP::Cli::file_for_reading('filtered.fusions', $fusions_file); - - open (my $ifh, $fusions_file) or die "Could not open file $fusions_file $!"; - open(my $ofh, '>', $output_file) or die "Could not open file $output_file $!"; - print $ofh $header."\n"; - while (<$ifh>) { - chomp; - my $line = $_; - $line =~ s/\s/\t/g; - print $ofh 
$line."\n"; - } - close ($ifh); - close ($ofh); - - return 1; -} - __END__ =head1 defuse_fusions.pl From dd0041e090aa63a8869b7faef08b882f62095be3 Mon Sep 17 00:00:00 2001 From: am26 Date: Thu, 4 Feb 2016 13:39:46 +0000 Subject: [PATCH 17/40] Adding the cgp_defuse_filter column to the output where the fusion was called by deFuse --- perl/lib/Sanger/CGP/CompareFusions/Implement.pm | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm index 45873cd..b63a36e 100644 --- a/perl/lib/Sanger/CGP/CompareFusions/Implement.pm +++ b/perl/lib/Sanger/CGP/CompareFusions/Implement.pm @@ -56,7 +56,7 @@ const my $BEDTOOLS_PAIRTOPAIR => q{ pairtopair -a %s -b %s -slop 5 > %s}; const my $SORT => q{ sort -k%d,%d %s > %s}; const my $TRI_SORT => q{ sort -k%d,%d -k%d,%dr -k%d,%dr %s > %s}; const my $JOIN => q{ join -1 8 -2 8 %s %s > %s }; -const my $OUTPUT_HEADER => "sample\tfusion_name\talgorithm\tconfidence_score\tstar_junction\ttophat_junction\tdefuse_junction\tdefuse_cluster_id\tstar_junction_reads\tstar_spanning_frags\ttophat_junction_reads\ttophat_spanning_frags\tdefuse_splitr_count\tdefuse_span_count\t5'_gene\t5'_gene_id\t5'_chr\t5'_pos\t5'_strand\t3'_gene\t3'_gene_id\t3'_chr\t3'_pos\t3'_strand\t5'_transcript_id\t5'_transcript_src\t5'_exon_num\t5'_exon_start\t5'_exon_end\t3'_transcript_id\t3'_transcript_src\t3'_exon_num\t3'_exon_start\t3'_exon_end\tdefuse_splitr_sequence\ttophat_splitr_sequence\n"; +const my $OUTPUT_HEADER => 
"sample\tfusion_name\talgorithm\tconfidence_score\tstar_junction\ttophat_junction\tdefuse_junction\tdefuse_cluster_id\tstar_junction_reads\tstar_spanning_frags\ttophat_junction_reads\ttophat_spanning_frags\tdefuse_splitr_count\tdefuse_span_count\t5'_gene\t5'_gene_id\t5'_chr\t5'_pos\t5'_strand\t3'_gene\t3'_gene_id\t3'_chr\t3'_pos\t3'_strand\t5'_transcript_id\t5'_transcript_src\t5'_exon_num\t5'_exon_start\t5'_exon_end\t3'_transcript_id\t3'_transcript_src\t3'_exon_num\t3'_exon_start\t3'_exon_end\tdefuse_splitr_sequence\ttophat_splitr_sequence\tcgp_defuse_filter\n"; # This filter on biotypes is currently not used in subroutine filter_gtf (uncomment the line to switch on). my %ALLOWED_BIOTYPES = ( @@ -125,6 +125,7 @@ const my $DEFUSE_CLUSTER_ID => 2; const my $DEFUSE_SEQUENCE => 3; const my $DEFUSE_SPLIT_READS => 4; const my $DEFUSE_SPAN_READS => 62; +const my $DEFUSE_CGP_FILTER => 73; const my $DEFUSE_HEADER_PATTERN => 'cluster_id'; # Position of the columns in the star-fusion output file used to format fusion breakpoint references. 
@@ -561,6 +562,7 @@ sub generate_output { my $defuse_splitr_count = 'NA'; my $defuse_span_count = 'NA'; my $defuse_splitr_seq = 'NA'; + my $defuse_cgp_filter = 'NA'; if(defined $tophat_pos){ $tophat_breakpoint = $fields[$tophat_pos]; @@ -601,6 +603,7 @@ sub generate_output { my @defuse_temp = split "_", $defuse_breakpoint; $defuse_junction = $defuse_temp[0]; $defuse_clusterid = $defuse_temp[1]; + $defuse_cgp_filter = $defuse_data->{$defuse_breakpoint}{'cgp_filter'}; } if($length > 11){ my $gene1_name = $fields[3]; @@ -621,11 +624,11 @@ sub generate_output { $source = reverse $source; - print $ofh1 "$sample\t$fusion_name\t$source\t$confidence\t$star_breakpoint\t$tophat_breakpoint\t$defuse_junction\t$defuse_clusterid\t$star_junction_reads\t$star_spanning_frags\t$tophat_junction_reads\t$tophat_spanning_frags\t$defuse_splitr_count\t$defuse_span_count\t$gene1_name\t$gene1_id\t$chr1\t$pos1\t$strand1\t$gene2_name\t$gene2_id\t$chr2\t$pos2\t$strand2\t$transcript1_id\t$transcript1_src\t$exon1_number\t$exon1_start\t$exon1_end\t$transcript2_id\t$transcript2_src\t$exon2_number\t$exon2_start\t$exon2_end\t$defuse_splitr_seq\t$tophat_splitr_seq\n"; + print $ofh1 "$sample\t$fusion_name\t$source\t$confidence\t$star_breakpoint\t$tophat_breakpoint\t$defuse_junction\t$defuse_clusterid\t$star_junction_reads\t$star_spanning_frags\t$tophat_junction_reads\t$tophat_spanning_frags\t$defuse_splitr_count\t$defuse_span_count\t$gene1_name\t$gene1_id\t$chr1\t$pos1\t$strand1\t$gene2_name\t$gene2_id\t$chr2\t$pos2\t$strand2\t$transcript1_id\t$transcript1_src\t$exon1_number\t$exon1_start\t$exon1_end\t$transcript2_id\t$transcript2_src\t$exon2_number\t$exon2_start\t$exon2_end\t$defuse_splitr_seq\t$tophat_splitr_seq\t$defuse_cgp_filter\n"; } else{ $source = reverse $source; - print $ofh1 "$sample\tFUSION COULD NOT BE 
ANNOTATED\t$source\t$confidence\t$star_breakpoint\t$tophat_breakpoint\t$defuse_junction\t$defuse_clusterid\t$star_junction_reads\t$star_spanning_frags\t$tophat_junction_reads\t$tophat_spanning_frags\t$defuse_splitr_count\t$defuse_span_count\t\t\t$chr1\t$pos1\t$strand1\t\t\t$chr2\t$pos2\t$strand2\t\t\t\t\t\t\t\t\t\t\t$defuse_splitr_seq\t$tophat_splitr_seq\n"; + print $ofh1 "$sample\tFUSION COULD NOT BE ANNOTATED\t$source\t$confidence\t$star_breakpoint\t$tophat_breakpoint\t$defuse_junction\t$defuse_clusterid\t$star_junction_reads\t$star_spanning_frags\t$tophat_junction_reads\t$tophat_spanning_frags\t$defuse_splitr_count\t$defuse_span_count\t\t\t$chr1\t$pos1\t$strand1\t\t\t$chr2\t$pos2\t$strand2\t\t\t\t\t\t\t\t\t\t\t$defuse_splitr_seq\t$tophat_splitr_seq\t$defuse_cgp_filter\n"; } } close($ifh1); @@ -756,6 +759,7 @@ sub parse_defuse_file { $defuse_data{$breakpoint}{'sequence'} = $fields[$DEFUSE_SEQUENCE-1]; $defuse_data{$breakpoint}{'split_reads'} = $fields[$DEFUSE_SPLIT_READS-1]; $defuse_data{$breakpoint}{'span_reads'} = $fields[$DEFUSE_SPAN_READS-1]; + $defuse_data{$breakpoint}{'cgp_filter'} = $fields[$DEFUSE_CGP_FILTER-1]; } close ($ifh1); From cf869593020492dd6bb65aaa64e0db506c5b5958 Mon Sep 17 00:00:00 2001 From: am26 Date: Wed, 24 Feb 2016 17:05:39 +0000 Subject: [PATCH 18/40] Extending information in README file --- README.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/README.md b/README.md index 3b1724f..620ad5c 100644 --- a/README.md +++ b/README.md @@ -29,3 +29,22 @@ reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012’." + +cgpRna +====== + +cgpRna provides pipelines, for RNA-Seq data, that implement commonly used mapping +and analysis programs, such as TopHat and rna-star. 
+At the present time (May 2016), only pipelines for mapping (with STAR), lane QC +and fusion gene detection are included in this codebase but this will be added +to over time with; differential expression, gene/transcript quantification, splice +variant analysis and allele specific expression. + +Installation +============ + +./setup.sh path_to_install_to + +Installation Dependencies +========================= + From f5250742d4f7c4046e227d9906888248997d5505 Mon Sep 17 00:00:00 2001 From: Angela Matchan Date: Wed, 24 Feb 2016 17:43:49 +0000 Subject: [PATCH 19/40] Update README.md Adding installation and dependency information --- README.md | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 620ad5c..d41aa12 100644 --- a/README.md +++ b/README.md @@ -40,11 +40,23 @@ and fusion gene detection are included in this codebase but this will be added to over time with; differential expression, gene/transcript quantification, splice variant analysis and allele specific expression. -Installation -============ +### Dependencies/Installation ./setup.sh path_to_install_to -Installation Dependencies -========================= +Please install Perl package [PCAP-core](https://github.com/ICGC-TCGA-PanCancer/PCAP-core/releases) first. 
+Prerequisites for the [RSeQC](http://rseqc.sourceforge.net/#installation) software are: +* gcc +* [python2.7](https://www.python.org/downloads/) (The minimum version the pipeline has been tested with is python-2.7.6) +* [R](https://www.r-project.org/) +* [numpy](http://www.numpy.org/) + +Once that is done, run the following to install cgpRna: +./setup.sh path_to_install_to + +### Tools installed by setup.sh + +* Some CPAN hosted libraries, see perl/Makefile.PL +* [STAR](https://github.com/alexdobin/STAR/releases) +* [RSeQC](http://rseqc.sourceforge.net) From 285a264d5d567695d9d1efd5bb486589643da39f Mon Sep 17 00:00:00 2001 From: Angela Matchan Date: Wed, 24 Feb 2016 17:45:33 +0000 Subject: [PATCH 20/40] Update README.md Format changes --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index d41aa12..3c60e40 100644 --- a/README.md +++ b/README.md @@ -42,8 +42,6 @@ variant analysis and allele specific expression. ### Dependencies/Installation -./setup.sh path_to_install_to - Please install Perl package [PCAP-core](https://github.com/ICGC-TCGA-PanCancer/PCAP-core/releases) first.
Prerequisites for the [RSeQC](http://rseqc.sourceforge.net/#installation) software are: @@ -53,6 +51,7 @@ Prerequisites for the [RSeQC](http://rseqc.sourceforge.net/#installation) softwa * [numpy](http://www.numpy.org/) Once that is done, run the following to install cgpRna: + ./setup.sh path_to_install_to ### Tools installed by setup.sh From d5801dc5506aba2ee81db460efed86e2f578a85e Mon Sep 17 00:00:00 2001 From: am26 Date: Thu, 3 Mar 2016 12:02:36 +0000 Subject: [PATCH 21/40] Adding details about the install location --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3c60e40..f47554f 100644 --- a/README.md +++ b/README.md @@ -50,10 +50,12 @@ Prerequisites for the [RSeQC](http://rseqc.sourceforge.net/#installation) softwa * [R](https://www.r-project.org/) * [numpy](http://www.numpy.org/) -Once that is done, run the following to install cgpRna: +Once that is done and your $PATH environment variable has been updated so that newly installed software can be found, run the following to install cgpRna: ./setup.sh path_to_install_to +N.B. the path_to_install_to should be the same as the install location used for PCAP-Core above. + ### Tools installed by setup.sh * Some CPAN hosted libraries, see perl/Makefile.PL From 829ab2634e1827c0a0df096274163b4bde70357a Mon Sep 17 00:00:00 2001 From: am26 Date: Thu, 3 Mar 2016 12:04:55 +0000 Subject: [PATCH 22/40] Correcting small typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f47554f..cccdb8d 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ Once that is done and your $PATH environment variable has been updated so that n ./setup.sh path_to_install_to -N.B. the path_to_install_to should be the same as the install location used for PCAP-Core above. +N.B. the path_to_install_to should be the same as the install location used for PCAP-core above. 
### Tools installed by setup.sh From a6d83096ec8bc9dd49f8d6be2340f127c468075e Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 1 Apr 2016 14:07:19 +0100 Subject: [PATCH 23/40] Renaming file so as not to confuse with installed defuse script --- perl/bin/defuse_fusion.pl | 273 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 273 insertions(+) create mode 100755 perl/bin/defuse_fusion.pl diff --git a/perl/bin/defuse_fusion.pl b/perl/bin/defuse_fusion.pl new file mode 100755 index 0000000..175eb1b --- /dev/null +++ b/perl/bin/defuse_fusion.pl @@ -0,0 +1,273 @@ +#!/usr/bin/perl +##########LICENCE ########## +#Copyright (c) 2015 Genome Research Ltd. +### +#Author: Cancer Genome Project +### +#This file is part of cgpRna. +### +#cgpRna is free software: you can redistribute it and/or modify it under +#the terms of the GNU Affero General Public License as published by the +#Free Software Foundation; either version 3 of the License, or (at your +#option) any later version. +### +#This program is distributed in the hope that it will be useful, but +#WITHOUT ANY WARRANTY; without even the implied warranty of +#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero +#General Public License for more details. +### +#You should have received a copy of the GNU Affero General Public +#License along with this program. If not, see +#<http://www.gnu.org/licenses/>. +### +#1. The usage of a range of years within a copyright statement contained +#within this distribution should be interpreted as being equivalent to a +#list of years including the first and last year specified and all +#consecutive years between them.
For example, a copyright statement that +#reads ‘Copyright (c) 2005, 2007- 2009, 2011-2012’ should be interpreted +#as being identical to a statement that reads ‘Copyright (c) 2005, 2007, +#2008, 2009, 2011, 2012’ and a copyright statement that reads ‘Copyright +#(c) 2005-2012’ should be interpreted as being identical to a statement +#that reads ‘Copyright (c) 2005, 2006, 2007, 2008, 2009, 2010, 2011, +#2012’." +##########LICENCE ########## +########## +use FindBin; +use lib "$FindBin::Bin/../lib"; + +use strict; +use warnings FATAL => 'all'; +use autodie qw(:all); +use English qw( -no_match_vars ); + +use File::Path qw(remove_tree make_path); +use Getopt::Long; +use File::Spec; +use Pod::Usage qw(pod2usage); +use List::Util qw(first); +use Const::Fast qw(const); +use File::Copy; +use Config::IniFiles; +use version; +use Cwd; + +use PCAP::Cli; +use Sanger::CGP::Defuse::Implement; + +my $ini_file = "$FindBin::Bin/../config/defuse.ini"; # default config.ini file path +const my @REQUIRED_PARAMS => qw(outdir sample); +const my @VALID_PROCESS => qw(prepare merge defuse filter); +const my %INDEX_FACTOR => ( 'prepare' => -1, + 'merge' => 1, + 'defuse' => 1, + 'filter' => 1); + +{ + my $options = setup(); + + if(!exists $options->{'process'} || $options->{'process'} eq 'prepare'){ + # Process the input files. 
+ my $threads = PCAP::Threaded->new($options->{'threads'}); + #&PCAP::Threaded::disable_out_err if(exists $options->{'index'}); + $threads->add_function('prepare', \&Sanger::CGP::Defuse::Implement::prepare); + $threads->run($options->{'max_split'}, 'prepare', $options); + } + + # If multiple BAMs or pairs of fastq files have been input, merge into one pair of fastqs + if($options->{'max_split'} > 1){ + Sanger::CGP::Defuse::Implement::merge($options) if(!exists $options->{'process'} || $options->{'process'} eq 'merge'); + } + + Sanger::CGP::Defuse::Implement::defuse($options) if(!exists $options->{'process'} || $options->{'process'} eq 'defuse'); + + if(!exists $options->{'process'} || $options->{'process'} eq 'filter'){ + Sanger::CGP::Defuse::Implement::filter_fusions($options); + cleanup($options); + } +} + +sub cleanup { + my $options = shift; + my $tmpdir = $options->{'tmp'}; + Sanger::CGP::Defuse::Implement::compress_sam($options); + move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs_defuse')) || die $!; + remove_tree $tmpdir if(-e $tmpdir); + return 0; +} + +sub setup { + my %opts; + pod2usage(-msg => "\nERROR: Options must be defined.\n", -verbose => 1, -output => \*STDERR) if(scalar @ARGV == 0); + $opts{'cmd'} = join " ", $0, @ARGV; + + GetOptions( 'h|help' => \$opts{'h'}, + 'm|man' => \$opts{'m'}, + 'v|version' => \$opts{'version'}, + 'o|outdir=s' => \$opts{'outdir'}, + 's|sample=s' => \$opts{'sample'}, + 'sp|species=s' => \$opts{'species'}, + 'rb|refbuild=s' => \$opts{'referencebuild'}, + 'gb|genebuild=i' => \$opts{'genebuild'}, + 'r|refdataloc=s' => \$opts{'refdataloc'}, + 'n|normals=s' => \$opts{'normalfusionslist'}, + 'd|defuseconfig=s' => \$opts{'defuseconfig'}, + 't|threads=i' => \$opts{'threads'}, + 'p|process=s' => \$opts{'process'}, + 'i|index=i' => \$opts{'index'}, + 'c|config=s' => \$opts{'config'}, + ) or pod2usage(1); + + pod2usage(-verbose => 1) if(defined $opts{'h'}); + pod2usage(-verbose => 2) if(defined 
$opts{'m'}); + + # Read in the config.ini file + $ini_file = $opts{'config'} if(defined $opts{'config'}); + die "No config file has been specified." if($ini_file eq ''); + my $cfg = new Config::IniFiles( -file => $ini_file ) or die "Could not open config file: $ini_file"; + $opts{'config'} = $ini_file; + + # Populate the options hash with values from the config file + $opts{'refdataloc'} = $cfg->val('defuse-config','referenceloc') unless(defined $opts{'refdataloc'}); + $opts{'referencebuild'} = $cfg->val('defuse-config','referencebuild') unless(defined $opts{'referencebuild'}); + $opts{'genebuild'} = $cfg->val('defuse-config','genebuild') unless(defined $opts{'genebuild'}); + $opts{'normalfusionslist'} = $cfg->val('defuse-config','normalfusionslist') unless(defined $opts{'normalfusionslist'}); + $opts{'species'} = $cfg->val('defuse-config','species') unless(defined $opts{'species'}); + $opts{'defusepath'} = $cfg->val('defuse-config','defusepath'); + $opts{'defuseversion'} = $cfg->val('defuse-config','defuseversion'); + $opts{'defuseconfig'} = $cfg->val('defuse-config','defuseconfig') unless(defined $opts{'defuseconfig'}); + + # Print version information for this program (deFuse itself does not have a -v or --version option) + if($opts{'version'}) { + print 'CGP defuse.pl version: ',Sanger::CGP::Defuse::Implement->VERSION,"\n"; + print 'deFuse version: ',$opts{'defuseversion'},"\n" if(defined $opts{'defuseversion'}); + exit 0; + } + + for(@REQUIRED_PARAMS) { + pod2usage(-msg => "\nERROR: $_ is a required argument.\n", -verbose => 1, -output => \*STDERR) unless(defined $opts{$_}); + } + + # Check the output directory exists and is writeable, create if not + PCAP::Cli::out_dir_check('outdir', $opts{'outdir'}); + + my $tmpdir = File::Spec->catdir($opts{'outdir'}, 'tmpDefuse'); + make_path($tmpdir) unless(-d $tmpdir); + $opts{'tmp'} = $tmpdir; + my $progress = File::Spec->catdir($tmpdir, 'progress'); + make_path($progress) unless(-d $progress); + my $logs = 
File::Spec->catdir($tmpdir, 'logs'); + make_path($logs) unless(-d $logs); + my $input = File::Spec->catdir($tmpdir, 'input'); + make_path($input) unless(-d $input); + + # Check the input is fastq (paired only) or BAM and that a mixture of these file types hasn't been entered + $opts{'raw_files'} = \@ARGV; + Sanger::CGP::Defuse::Implement::check_input(\%opts); + + delete $opts{'process'} unless(defined $opts{'process'}); + delete $opts{'index'} unless(defined $opts{'index'}); + delete $opts{'config'} unless(defined $opts{'config'}); + + # Apply defaults + $opts{'threads'} = 1 unless(defined $opts{'threads'}); + + if(exists $opts{'process'}){ + PCAP::Cli::valid_process('process', $opts{'process'}, \@VALID_PROCESS); + my $max_index = $INDEX_FACTOR{$opts{'process'}}; + + $max_index = $opts{'max_split'} if($opts{'process'} eq 'prepare'); + + if(exists $opts{'index'}) { + if($opts{'process'} eq 'prepare'){ + PCAP::Cli::opt_requires_opts('index', \%opts, ['process']); + PCAP::Cli::valid_index_by_factor('index', $opts{'index'}, $max_index, 1); + $opts{'max_split'} = $opts{'index'}; + } + else{ + die "Index is not a valid for process $opts{'process'}, please re-run without the -i parameter.\n"; + } + } + } + elsif(exists $opts{'index'}) { + die "ERROR: -index cannot be defined without -process\n"; +} + + return \%opts; +} + +__END__ + +=head1 defuse.pl + +Cancer Genome Project implementation of the deFuse RNA-Seq algorithm +https://bitbucket.org/dranew/defuse + +=head1 SYNOPSIS + +defuse_fusion.pl [options] [file(s)...] + + Required parameters: + -outdir -o Folder to output result to. + -sample -s Sample name + + Optional + -defuseconfig -d Name of the defuse config file. It should reside under /refdataloc/species/refbuild/genebuild/ [defuse-config-GRCh38-77.txt] + -normals -n File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions-b38] + -threads -t Number of cores to use. 
[1] + -config -c Path to config.ini file. The file contains defaults for the reference data and deFuse software installation details [/perl/config/defuse.ini] + -refbuild -rb Reference assembly version. Can be UCSC or Ensembl format e.g. GRCh38 or hg38 [GRCh38] + -genebuild -gb Gene build version. This needs to be consistent with the reference build in terms of the version and chromosome name style. Please use the build number only minus any prefixes such as e/ensembl [77] + -refdataloc -r Parent directory of the reference data. + -species -sp Species [human] + + Targeted processing (further detail under OPTIONS): + -process -p Only process this step then exit + -index -i Only valid for process prepare - 1.. + + Other: + -help -h Brief help message. + -man -m Full documentation. + -version -v Version + + File list can be full file names or wildcard, e.g. + defuse_fusion.pl -t 16 -o myout -refbuild GRCh38 -genebuild 77 -s sample input/*.bam + + Run with '-m' for possible input file types. + +=head1 OPTIONS + +=over 2 + +=item B<-process> + +Available processes for this tool are: + + prepare + merge + defuse + filter + +=back + +=head2 INPUT FILE TYPES + +There are several types of file that the script is able to process. + +=over 8 + +=item f[ast]q + +A standard uncompressed fastq file. Requires a pair of inputs with standard suffix of '_1' and '_2' +immediately prior to '.f[ast]q'. + +=item f[ast]q.gz + +As *.f[ast]q but compressed with gzip. + +=item bam + +A list of single lane BAM files, RG line is transferred to aligned files. + +=back + +N.B. Interleaved fastq files are not valid for deFuse.
From a4a4f0f3237494f6f6f824c304d79c44cf361f6a Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 1 Apr 2016 14:10:41 +0100 Subject: [PATCH 24/40] Removing parameters specific to CGP environment --- perl/config/defuse.ini | 14 +++++++------- perl/config/tophat.ini | 30 +++++++++++++++--------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/perl/config/defuse.ini b/perl/config/defuse.ini index 31e698c..db8bca1 100755 --- a/perl/config/defuse.ini +++ b/perl/config/defuse.ini @@ -1,9 +1,9 @@ [defuse-config] -referencebuild=GRCh38 -genebuild=77 +referencebuild= +genebuild= species=human -referenceloc=/lustre/scratch112/sanger/am26/referenceData -normalfusionslist=normal-fusions-b38 -defusepath=/nfs/users/nfs_a/am26/software/dranew-defuse-d5955f9aafdc/scripts/defuse.pl -defuseversion=0.7.0 -defuseconfig=defuse-config-GRCh38-77.txt +referenceloc= +normalfusionslist= +defusepath= +defuseversion= +defuseconfig= diff --git a/perl/config/tophat.ini b/perl/config/tophat.ini index 164b3f0..2bc6c0a 100755 --- a/perl/config/tophat.ini +++ b/perl/config/tophat.ini @@ -1,21 +1,21 @@ [tophat-config] bowtieversion=1 -referencebuild=GRCh38 -genebuild=77 -species=human -referenceloc=/lustre/scratch112/sanger/am26/referenceData -referenceindex=GRCh38.genome -transcriptomeindex=GRCh38.77 -tophatpostbuild=hg38 -tophatpostindex=hg38.genome -ensgene=ensGene_e77.txt -refgene=refGene_hg38.txt +referencebuild= +genebuild= +species= +referenceloc= +referenceindex= +transcriptomeindex= +tophatpostbuild= +tophatpostindex= +ensgene=ensGene.txt +refgene=refGene.txt blastdb=blast -blastn=/nfs/users/nfs_a/am26/software/ncbi-blast-2.2.30+/bin -normalfusionslist=normal-fusions-b38 -bowtie1path=/nfs/users/nfs_a/am26/software/bowtie-1.1.1/bowtie -bowtie2path=/nfs/users/nfs_a/am26/software/bowtie2-2.2.3/bowtie2 -tophatpath=/nfs/users/nfs_a/am26/software/tophat-2.0.13.Linux_x86_64/tophat +blastn= +normalfusionslist= +bowtie1path= +bowtie2path= +tophatpath= librarytype=fr-unstranded 
[tophat-parameters] From cdd34b887aa4b906154eb7aaa49b4f33657bf093 Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 1 Apr 2016 14:11:09 +0100 Subject: [PATCH 25/40] Adding fusion pipeline files to install --- perl/Makefile.PL | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perl/Makefile.PL b/perl/Makefile.PL index db7e796..1d18ac1 100644 --- a/perl/Makefile.PL +++ b/perl/Makefile.PL @@ -41,6 +41,14 @@ WriteMakefile( EXE_FILES => [qw( bin/star_mapping.pl bin/process_qcstats.pl + bin/star_fusion.pl + bin/tophat_fusion.pl + bin/tophat_add_strand.pl + bin/defuse_fusion.pl + bin/defuse_filters.pl + bin/filter_fusions.pl + bin/compare_overlapping_fusions.pl + bin/compare_CN_and_fusion.pl )], PREREQ_PM => { 'Capture::Tiny' => 0.30, From f4e5d8e2d68b7e02692d6f81cfb710d3271b1caf Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 1 Apr 2016 14:12:24 +0100 Subject: [PATCH 26/40] Adding installation of software related to Tophat and deFuse for the fusion pipeline --- setup.sh | 217 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 200 insertions(+), 17 deletions(-) diff --git a/setup.sh b/setup.sh index 49bdc0c..a9aa1ee 100755 --- a/setup.sh +++ b/setup.sh @@ -33,6 +33,18 @@ SOURCE_STAR="https://github.com/alexdobin/STAR/archive/STAR_2.4.1c.tar.gz" SOURCE_RSEQC="http://sourceforge.net/projects/rseqc/files/RSeQC-2.6.3.tar.gz/download" +SOURCE_BOWTIE1="https://sourceforge.net/projects/bowtie-bio/files/bowtie/1.1.1/bowtie-1.1.1-linux-x86_64.zip/download" +VERSION_BOWTIE1="1.1.1" +SOURCE_BOWTIE2="https://sourceforge.net/projects/bowtie-bio/files/bowtie2/2.2.3/bowtie2-2.2.3-linux-x86_64.zip/download" +VERSION_BOWTIE2="2.2.3" +SOURCE_TOPHAT="http://ccb.jhu.edu/software/tophat/downloads/tophat-2.0.13.Linux_x86_64.tar.gz" +SOURCE_BLASTN="ftp://ftp.ncbi.nlm.nih.gov/blast/executables/blast+/2.2.30/ncbi-blast-2.2.30+-x64-linux.tar.gz" +SOURCE_DEFUSE="https://bitbucket.org/dranew/defuse/get/v0.7.0.tar.gz" +VERSION_DEFUSE="0.7.0" 
+SOURCE_GMAP="http://research-pub.gene.com/gmap/src/gmap-gsnap-2015-09-10.tar.gz" +SOURCE_BLAT="http://users.soe.ucsc.edu/~kent/src/blatSrc35.zip" +SOURCE_FATOTWOBIT="http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/faToTwoBit" +SOURCE_BEDTOOLS="https://github.com/arq5x/bedtools2/releases/download/v2.21.0/bedtools-2.21.0.tar.gz" done_message () { @@ -169,18 +181,174 @@ if [ -e $SETUP_DIR/star.success ]; then else ( cd $SETUP_DIR - get_distro "star" $SOURCE_STAR && - mkdir -p star && - tar --strip-components 1 -C star -zxf star.tar.gz && - cp star/bin/Linux_x86_64/STAR $INST_PATH/bin/. && - cp star/STAR-Fusion-*/STAR-Fusion $INST_PATH/bin/. && - cp star/STAR-Fusion-*/lib/* $PERLROOT/. && + get_distro "star" $SOURCE_STAR + mkdir -p star + tar --strip-components 1 -C star -zxf star.tar.gz + cp star/bin/Linux_x86_64/STAR $INST_PATH/bin/. + cp star/STAR-Fusion-*/STAR-Fusion $INST_PATH/bin/. + cp star/STAR-Fusion-*/lib/* $PERLROOT/. touch $SETUP_DIR/star.success )>>$INIT_DIR/setup.log 2>&1 fi done_message "" "Failed to build STAR." - +# Install bowtie1 +echo -n "Installing bowtie1 ..." +if [ -e $SETUP_DIR/bowtie1.success ]; then + echo -n " previously installed ..."; +else +( + cd $SETUP_DIR + get_distro "bowtie1" $SOURCE_BOWTIE1 + unzip -qu bowtie1.zip + cd $SETUP_DIR/bowtie-$VERSION_BOWTIE1 + cp bowtie* $INST_PATH/bin/. + touch $SETUP_DIR/bowtie1.success + )>>$INIT_DIR/setup.log 2>&1 +fi +done_message "" "Failed to build bowtie1." + +# Install bowtie2 +echo -n "Installing bowtie2 ..." +if [ -e $SETUP_DIR/bowtie2.success ]; then + echo -n " previously installed ..."; +else +( + cd $SETUP_DIR + get_distro "bowtie2" $SOURCE_BOWTIE2 + unzip -qu bowtie2.zip + cd $SETUP_DIR/bowtie2-$VERSION_BOWTIE2 + cp bowtie2* $INST_PATH/bin/. + touch $SETUP_DIR/bowtie2.success + )>>$INIT_DIR/setup.log 2>&1 +fi +done_message "" "Failed to build bowtie2." + +# Install tophat +echo -n "Installing tophat ..." 
+if [ -e $SETUP_DIR/tophat.success ]; then + echo -n " previously installed ..."; +else +( + cd $SETUP_DIR + get_distro "tophat" $SOURCE_TOPHAT + mkdir -p tophat + tar --strip-components 1 -C tophat -zxf tophat.tar.gz + cd tophat + rm ./AUTHORS ./COPYING ./README + cp ./* $INST_PATH/bin/. + touch $SETUP_DIR/tophat.success + )>>$INIT_DIR/setup.log 2>&1 +fi +done_message "" "Failed to build tophat." + +# Install blastn +echo -n "Installing blastn ..." +if [ -e $SETUP_DIR/blastn.success ]; then + echo -n " previously installed ..."; +else +( + cd $SETUP_DIR + get_distro "blastn" $SOURCE_BLASTN + mkdir -p blastn + tar --strip-components 1 -C blastn -zxf blastn.tar.gz + cp blastn/bin/blastn $INST_PATH/bin/. + touch $SETUP_DIR/blastn.success + )>>$INIT_DIR/setup.log 2>&1 +fi +done_message "" "Failed to build blastn." + +# Install defuse +echo -n "Installing defuse ..." +if [ -e $SETUP_DIR/defuse.success ]; then + echo -n " previously installed ..."; +else +( + cd $SETUP_DIR + get_distro "defuse" $SOURCE_DEFUSE + mkdir -p defuse + tar --strip-components 1 -C defuse -zxf defuse.tar.gz + mkdir -p $INST_PATH/bin/defuse_install + cp -r defuse/* $INST_PATH/bin/defuse_install + cp defuse/scripts/*pm $INST_PATH/lib/perl5 + ln -s $INST_PATH/bin/defuse_install/scripts/defuse.pl $INST_PATH/bin/defuse.pl + touch $SETUP_DIR/defuse.success + )>>$INIT_DIR/setup.log 2>&1 +fi +done_message "" "Failed to build defuse." + +# Install faToTwoBit +echo -n "Installing faToTwoBit ..." +if [ -e $SETUP_DIR/faToTwoBit.success ]; then + echo -n " previously installed ..."; +else +( + cd $SETUP_DIR + wget $SOURCE_FATOTWOBIT + chmod +x faToTwoBit + cp faToTwoBit $INST_PATH/bin/. + touch $SETUP_DIR/faToTwoBit.success + )>>$INIT_DIR/setup.log 2>&1 +fi +done_message "" "Failed to install faToTwoBit." + +# Install blat +echo -n "Installing blat ..." 
+if [ -e $SETUP_DIR/blat.success ] || [ -e $INST_PATH/bin/blat ]; then + echo -n " previously installed ..."; +else +( + cd $SETUP_DIR + get_distro "blat" $SOURCE_BLAT + unzip -qu blat.zip + cd $SETUP_DIR/blatSrc + BINDIR=$SETUP_DIR/blat/bin + export BINDIR + export MACHTYPE + mkdir -p $BINDIR + make -j$CPU + cp $BINDIR/blat $INST_PATH/bin/. + touch $SETUP_DIR/blat.success + )>>$INIT_DIR/setup.log 2>&1 +fi +done_message "" "Failed to build blat." + +# Install bedtools +echo -n "Installing bedtools ..." +if [ -e $SETUP_DIR/bedtools.success ] || [ -e $INST_PATH/bin/bedtools ]; then + echo -n " previously installed ..."; +else +( + cd $SETUP_DIR + get_distro "bedtools2" $SOURCE_BEDTOOLS + mkdir -p bedtools2 + tar --strip-components 1 -C bedtools2 -zxf bedtools2.tar.gz + make -C bedtools2 -j$CPU + cp bedtools2/bin/* $INST_PATH/bin/. + touch $SETUP_DIR/bedtools.success +)>>$INIT_DIR/setup.log 2>&1 +fi +done_message "" "Failed to build bedtools." + +# Install gmap +echo -n "Installing gmap ..." +if [ -e $SETUP_DIR/gmap.success ]; then + echo -n " previously installed ..."; +else +( + cd $SETUP_DIR + get_distro "gmap" $SOURCE_GMAP + mkdir -p gmap + tar --strip-components 1 -C gmap -zxf gmap.tar.gz + cd gmap + ./configure --prefix=$INST_PATH --with-gmapdb=$INST_PATH + make + make install + touch $SETUP_DIR/gmap.success + )>>$INIT_DIR/setup.log 2>&1 +fi +done_message "" "Failed to build gmap." + # Install RSeQC using PYTHONPATH location set above echo -n "Installing RSeQC ..." 
if [ -e $SETUP_DIR/rseqc.success ]; then @@ -188,11 +356,11 @@ if [ -e $SETUP_DIR/rseqc.success ]; then else ( cd $SETUP_DIR - get_distro "rseqc" $SOURCE_RSEQC && - mkdir -p rseqc && - tar --strip-components 1 -C rseqc -zxf rseqc.tar.gz && + get_distro "rseqc" $SOURCE_RSEQC + mkdir -p rseqc + tar --strip-components 1 -C rseqc -zxf rseqc.tar.gz cd $SETUP_DIR/rseqc && - python ./setup.py install --prefix=$INST_PATH && + python ./setup.py install --prefix=$INST_PATH touch $SETUP_DIR/rseqc.success )>>$INIT_DIR/setup.log 2>&1 fi @@ -219,12 +387,12 @@ done_message "" "Failed during installation of core dependencies." # Install cgpRna code echo -n "Installing cgpRna..." ( - cd $INIT_DIR/perl && - perl Makefile.PL INSTALL_BASE=$INST_PATH && - make && - make test && - make install && - cp $INIT_DIR/perl/config/star.ini $INST_PATH/config/ + cd $INIT_DIR/perl + perl Makefile.PL INSTALL_BASE=$INST_PATH + make + make test + make install + cp $INIT_DIR/perl/config/*.ini $INST_PATH/config/ ) >>$INIT_DIR/setup.log 2>&1 done_message "" "cgpRna install failed." @@ -240,5 +408,20 @@ echo " $PERLROOT" echo "Please add the following to beginning of PYTHONPATH:" echo " $PYTHONROOT" echo +echo "If you intend to use the fusion pipeline, open the defuse config file: $INST_PATH/bin/defuse_install/scripts/config.txt and update the following values..." +echo "source_directory = $INST_PATH/bin/defuse_install" +echo "Then further down in the section titled # Paths to external tools..." +echo "samtools_bin = $INST_PATH/bin/samtools" +echo "bowtie_bin = $INST_PATH/bin/bowtie" +echo "bowtie_build_bin = $INST_PATH/bin/bowtie-build" +echo "blat_bin = $INST_PATH/bin/blat" +echo "fatotwobit_bin = $INST_PATH/bin/faToTwoBit" +echo "r_bin = " +echo "rscript_bin = " +echo "gmap_bin = $INST_PATH/bin/gmap" +echo "gmap_build_bin = $INST_PATH/bin/gmap_build" +echo +echo "Finally, open the file: $INST_PATH/config/defuse.ini and update the following parameter to..." 
+echo "defuseversion=$VERSION_DEFUSE" exit 0 From 4fb8b60a5003e57e6c508d0ff30a6c420dfde843 Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 1 Apr 2016 14:12:59 +0100 Subject: [PATCH 27/40] Removing old file --- perl/bin/defuse.pl | 273 --------------------------------------------- 1 file changed, 273 deletions(-) delete mode 100755 perl/bin/defuse.pl diff --git a/perl/bin/defuse.pl b/perl/bin/defuse.pl deleted file mode 100755 index e3d74cf..0000000 --- a/perl/bin/defuse.pl +++ /dev/null @@ -1,273 +0,0 @@ -#!/usr/bin/perl -##########LICENCE ########## -#Copyright (c) 2015 Genome Research Ltd. -### -#Author: Cancer Genome Project -### -#This file is part of cgpRna. -### -#cgpRna is free software: you can redistribute it and/or modify it under -#the terms of the GNU Affero General Public License as published by the -#Free Software Foundation; either version 3 of the License, or (at your -#option) any later version. -### -#This program is distributed in the hope that it will be useful, but -#WITHOUT ANY WARRANTY; without even the implied warranty of -#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero -#General Public License for more details. -### -#You should have received a copy of the GNU Affero General Public -#License along with this program. If not, see -#. -### -#1. The usage of a range of years within a copyright statement contained -#within this distribution should be interpreted as being equivalent to a -#list of years including the first and last year specified and all -#consecutive years between them. For example, a copyright statement that -#reads ‘Copyright (c) 2005, 2007- 2009, 2011-2012’ should be interpreted -#as being identical to a statement that reads ‘Copyright (c) 2005, 2007, -#2008, 2009, 2011, 2012’ and a copyright statement that reads ‘Copyright -#(c) 2005-2012’ should be interpreted as being identical to a statement -#that reads ‘Copyright (c) 2005, 2006, 2007, 2008, 2009, 2010, 2011, -#2012’." 
-##########LICENCE ########## -########## -use FindBin; -use lib "$FindBin::Bin/../lib"; - -use strict; -use warnings FATAL => 'all'; -use autodie qw(:all); -use English qw( -no_match_vars ); - -use File::Path qw(remove_tree make_path); -use Getopt::Long; -use File::Spec; -use Pod::Usage qw(pod2usage); -use List::Util qw(first); -use Const::Fast qw(const); -use File::Copy; -use Config::IniFiles; -use version; -use Cwd; - -use PCAP::Cli; -use Sanger::CGP::Defuse::Implement; - -my $ini_file = "$FindBin::Bin/../config/defuse.ini"; # default config.ini file path -const my @REQUIRED_PARAMS => qw(outdir sample); -const my @VALID_PROCESS => qw(prepare merge defuse filter); -const my %INDEX_FACTOR => ( 'prepare' => -1, - 'merge' => 1, - 'defuse' => 1, - 'filter' => 1); - -{ - my $options = setup(); - - if(!exists $options->{'process'} || $options->{'process'} eq 'prepare'){ - # Process the input files. - my $threads = PCAP::Threaded->new($options->{'threads'}); - #&PCAP::Threaded::disable_out_err if(exists $options->{'index'}); - $threads->add_function('prepare', \&Sanger::CGP::Defuse::Implement::prepare); - $threads->run($options->{'max_split'}, 'prepare', $options); - } - - # If multiple BAMs or pairs of fastq files have been input, merge into one pair of fastqs - if($options->{'max_split'} > 1){ - Sanger::CGP::Defuse::Implement::merge($options) if(!exists $options->{'process'} || $options->{'process'} eq 'merge'); - } - - Sanger::CGP::Defuse::Implement::defuse($options) if(!exists $options->{'process'} || $options->{'process'} eq 'defuse'); - - if(!exists $options->{'process'} || $options->{'process'} eq 'filter'){ - Sanger::CGP::Defuse::Implement::filter_fusions($options); - cleanup($options); - } -} - -sub cleanup { - my $options = shift; - my $tmpdir = $options->{'tmp'}; - Sanger::CGP::Defuse::Implement::compress_sam($options); - move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs_defuse')) || die $!; - remove_tree $tmpdir if(-e 
$tmpdir); - return 0; -} - -sub setup { - my %opts; - pod2usage(-msg => "\nERROR: Options must be defined.\n", -verbose => 1, -output => \*STDERR) if(scalar @ARGV == 0); - $opts{'cmd'} = join " ", $0, @ARGV; - - GetOptions( 'h|help' => \$opts{'h'}, - 'm|man' => \$opts{'m'}, - 'v|version' => \$opts{'version'}, - 'o|outdir=s' => \$opts{'outdir'}, - 's|sample=s' => \$opts{'sample'}, - 'sp|species=s' => \$opts{'species'}, - 'rb|refbuild=s' => \$opts{'referencebuild'}, - 'gb|genebuild=i' => \$opts{'genebuild'}, - 'r|refdataloc=s' => \$opts{'refdataloc'}, - 'n|normals=s' => \$opts{'normalfusionslist'}, - 'd|defuseconfig=s' => \$opts{'defuseconfig'}, - 't|threads=i' => \$opts{'threads'}, - 'p|process=s' => \$opts{'process'}, - 'i|index=i' => \$opts{'index'}, - 'c|config=s' => \$opts{'config'}, - ) or pod2usage(1); - - pod2usage(-verbose => 1) if(defined $opts{'h'}); - pod2usage(-verbose => 2) if(defined $opts{'m'}); - - # Read in the config.ini file - $ini_file = $opts{'config'} if(defined $opts{'config'}); - die "No config file has been specified." 
if($ini_file eq ''); - my $cfg = new Config::IniFiles( -file => $ini_file ) or die "Could not open config file: $ini_file"; - $opts{'config'} = $ini_file; - - # Populate the options hash with values from the config file - $opts{'refdataloc'} = $cfg->val('defuse-config','referenceloc') unless(defined $opts{'refdataloc'}); - $opts{'referencebuild'} = $cfg->val('defuse-config','referencebuild') unless(defined $opts{'referencebuild'}); - $opts{'genebuild'} = $cfg->val('defuse-config','genebuild') unless(defined $opts{'genebuild'}); - $opts{'normalfusionslist'} = $cfg->val('defuse-config','normalfusionslist') unless(defined $opts{'normalfusionslist'}); - $opts{'species'} = $cfg->val('defuse-config','species') unless(defined $opts{'species'}); - $opts{'defusepath'} = $cfg->val('defuse-config','defusepath'); - $opts{'defuseversion'} = $cfg->val('defuse-config','defuseversion'); - $opts{'defuseconfig'} = $cfg->val('defuse-config','defuseconfig') unless(defined $opts{'defuseconfig'}); - - # Print version information for this program (deFuse itself does not have a -v or --version option) - if($opts{'version'}) { - print 'CGP defuse.pl version: ',Sanger::CGP::Defuse::Implement->VERSION,"\n"; - print 'deFuse version: ',$opts{'defuseversion'},"\n" if(defined $opts{'defuseversion'}); - exit 0; - } - - for(@REQUIRED_PARAMS) { - pod2usage(-msg => "\nERROR: $_ is a required argument.\n", -verbose => 1, -output => \*STDERR) unless(defined $opts{$_}); - } - - # Check the output directory exists and is writeable, create if not - PCAP::Cli::out_dir_check('outdir', $opts{'outdir'}); - - my $tmpdir = File::Spec->catdir($opts{'outdir'}, 'tmpDefuse'); - make_path($tmpdir) unless(-d $tmpdir); - $opts{'tmp'} = $tmpdir; - my $progress = File::Spec->catdir($tmpdir, 'progress'); - make_path($progress) unless(-d $progress); - my $logs = File::Spec->catdir($tmpdir, 'logs'); - make_path($logs) unless(-d $logs); - my $input = File::Spec->catdir($tmpdir, 'input'); - make_path($input) unless(-d 
$input); - - # Check the input is fastq (paired only) or BAM and that a mixture of these file types hasn't been entered - $opts{'raw_files'} = \@ARGV; - Sanger::CGP::Defuse::Implement::check_input(\%opts); - - delete $opts{'process'} unless(defined $opts{'process'}); - delete $opts{'index'} unless(defined $opts{'index'}); - delete $opts{'config'} unless(defined $opts{'config'}); - - # Apply defaults - $opts{'threads'} = 1 unless(defined $opts{'threads'}); - - if(exists $opts{'process'}){ - PCAP::Cli::valid_process('process', $opts{'process'}, \@VALID_PROCESS); - my $max_index = $INDEX_FACTOR{$opts{'process'}}; - - $max_index = $opts{'max_split'} if($opts{'process'} eq 'prepare'); - - if(exists $opts{'index'}) { - if($opts{'process'} eq 'prepare'){ - PCAP::Cli::opt_requires_opts('index', \%opts, ['process']); - PCAP::Cli::valid_index_by_factor('index', $opts{'index'}, $max_index, 1); - $opts{'max_split'} = $opts{'index'}; - } - else{ - die "Index is not a valid for process $opts{'process'}, please re-run without the -i parameter.\n"; - } - } - } - elsif(exists $opts{'index'}) { - die "ERROR: -index cannot be defined without -process\n"; -} - - return \%opts; -} - -__END__ - -=head1 defuse.pl - -Cancer Genome Project implementation of the deFuse RNA-Seq algorithm -https://bitbucket.org/dranew/defuse - -=head1 SYNOPSIS - -defuse.pl [options] [file(s)...] - - Required parameters: - -outdir -o Folder to output result to. - -sample -s Sample name - - Optional - -defuseconfig -d Name of the defuse config file. It should reside under /refdataloc/species/refbuild/genebuild/ [defuse-config-GRCh38-77.txt] - -normals -n File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions-b38] - -threads -t Number of cores to use. [1] - -config -c Path to config.ini file. 
The file contains defaults for the reference data and deFuse software installation details [/perl/config/defuse.ini] - -refbuild -rb Reference assembly version. Can be UCSC or Ensembl format e.g. GRCh38 or hg38 [GRCh38] - -genebuild -gb Gene build version. This needs to be consistent with the reference build in terms of the version and chromosome name style. Please use the build number only minus any prefixes such as e/ensembl [77] - -refdataloc -r Parent directory of the reference data. - -species -sp Species [human] - - Targeted processing (further detail under OPTIONS): - -process -p Only process this step then exit - -index -i Only valid for process prepare - 1.. - - Other: - -help -h Brief help message. - -man -m Full documentation. - -version -v Version - - File list can be full file names or wildcard, e.g. - defuse.pl -t 16 -o myout -refbuild GRCh38 -genebuild 77 -s sample input/*.bam - - Run with '-m' for possible input file types. - -=head1 OPTIONS - -=over 2 - -=item B<-process> - -Available processes for this tool are: - - prepare - merge - defuse - filter - -=back - -=head2 INPUT FILE TYPES - -There are several types of file that the script is able to process. - -=over 8 - -=item f[ast]q - -A standard uncompressed fastq file. Requires a pair of inputs with standard suffix of '_1' and '_2' -immediately prior to '.f[ast]q'. - -=item f[ast]q.gz - -As *.f[ast]q but compressed with gzip. - -=item bam - -A list of single lane BAM files, RG line is transfered to aligned files. - -=back - -N.B. Interleaved fastq files are not valid for deFuse. 
From c5cde7ecc9ff5237b6db0a08fbcf5f40f2aba7f5 Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 1 Apr 2016 14:23:24 +0100 Subject: [PATCH 28/40] Adding in a check and message for user to install Vagrent as well as PCAP-Core prior to installing cgpRna --- setup.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/setup.sh b/setup.sh index a9aa1ee..e411c81 100755 --- a/setup.sh +++ b/setup.sh @@ -151,6 +151,12 @@ if [[ "x$CHK" == "x" ]] ; then exit 1; fi +CHK2=`perl -le 'eval "require $ARGV[0]" and print $ARGV[0]->VERSION' Sanger::CGP::Vagrent` +if [[ "x$CHK2" == "x" ]] ; then + echo "PREREQUISITE: Please install VAGrENT before proceeding: https://github.com/cancerit/VAGrENT/releases" + exit 1; +fi + #create a location to build dependencies SETUP_DIR=$INIT_DIR/install_tmp mkdir -p $SETUP_DIR From 955059d1e05a0961394da83494b244a4dc4a6d8f Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 4 Apr 2016 11:24:06 +0100 Subject: [PATCH 29/40] Checking blastn is on the $PATH for tophat fusion post to run --- perl/lib/Sanger/CGP/Tophat/Implement.pm | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/perl/lib/Sanger/CGP/Tophat/Implement.pm b/perl/lib/Sanger/CGP/Tophat/Implement.pm index 919002b..cdd8f9c 100755 --- a/perl/lib/Sanger/CGP/Tophat/Implement.pm +++ b/perl/lib/Sanger/CGP/Tophat/Implement.pm @@ -517,6 +517,12 @@ sub tophatfusion_post { # Ensure the correct version of bowtie is on the path along with blastn my $bwtpath = dirname($options->{'bowtiepath'} ); my $blastnpath = $options->{'blastn'}; + + if(! 
defined $options->{'blastn'} || $options->{'blastn'} eq ''){ + $blastnpath = _which('blastn'); + $options->{'blastn'} = $blastnpath; + } + $ENV{PATH} = "$bwtpath:$ENV{PATH}" if($ENV{'PATH'} !~ /$bwtpath/); $ENV{PATH} = "$blastnpath:$ENV{PATH}" if($ENV{'PATH'} !~ /$blastnpath/); _which('bowtie'); From 197f03f16d8036fc5b537fa48e17611f573dd106 Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 4 Apr 2016 12:43:13 +0100 Subject: [PATCH 30/40] Adding info about the fusion pipeline tools --- README.md | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index cccdb8d..804ffd2 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ variant analysis and allele specific expression. ### Dependencies/Installation -Please install Perl package [PCAP-core](https://github.com/ICGC-TCGA-PanCancer/PCAP-core/releases) first. +Please install Perl packages [PCAP-core](https://github.com/ICGC-TCGA-PanCancer/PCAP-core/releases) and [VAGrENT](https://github.com/cancerit/VAGrENT/releases) first. Prerequisites for the [RSeQC](http://rseqc.sourceforge.net/#installation) software are: * gcc @@ -54,10 +54,23 @@ Once that is done and your $PATH environment variable has been updated so that n ./setup.sh path_to_install_to -N.B. the path_to_install_to should be the same as the install location used for PCAP-core above. +N.B. the path_to_install_to should be the same as the install location used for PCAP-core and VAGrENT above. ### Tools installed by setup.sh * Some CPAN hosted libraries, see perl/Makefile.PL * [STAR](https://github.com/alexdobin/STAR/releases) +* [Tophat](https://ccb.jhu.edu/software/tophat/index.shtml) +* [deFuse](https://bitbucket.org/dranew/defuse) * [RSeQC](http://rseqc.sourceforge.net) +* [bowtie](http://bowtie-bio.sourceforge.net/index.shtml) N.B. 
both bowtie and bowtie2 are installed and can be used with Tophat +* [blat](http://hgwdev.cse.ucsc.edu/~kent/src/) Unless already in the install location bin directory +* [gmap](http://research-pub.gene.com/gmap/) The aligner used by deFuse +* [faToTwoBit](http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/) deFuse dependency +* [bedtools](https://github.com/arq5x/bedtools2/) Unless already in the install location bin directory +* [blastn](ftp://ftp.ncbi.nlm.nih.gov/blast/) Used by tophat-fusion post + +N.B. samtools is also a dependency but this is installed by PCAP-Core which should have already been installed (see above). + +If you are planning to use the fusion pipeline, specifically defuse_fusion.pl, the deFuse config.txt file will need to be updated with the installed locations of a number of tools. +These paths are printed to screen if the setup.sh script completes successfully so make a note of the locations and update the file as instructed. From f7dbb2515b4d195fa7259f94978e78087aad6fdf Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 4 Apr 2016 12:49:07 +0100 Subject: [PATCH 31/40] Updating the blastn download link --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 804ffd2..dd04ab9 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ N.B. the path_to_install_to should be the same as the install location used for * [gmap](http://research-pub.gene.com/gmap/) The aligner used by deFuse * [faToTwoBit](http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/) deFuse dependency * [bedtools](https://github.com/arq5x/bedtools2/) Unless already in the install location bin directory -* [blastn](ftp://ftp.ncbi.nlm.nih.gov/blast/) Used by tophat-fusion post +* [blastn](http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download) Used by tophat-fusion post N.B. samtools is also a dependency but this is installed by PCAP-Core which should have already been installed (see above). 
From cdb75e69b7312c815bafb8afdd1a8301069c7c12 Mon Sep 17 00:00:00 2001 From: am26 Date: Tue, 5 Apr 2016 16:19:48 +0100 Subject: [PATCH 32/40] Changing the directory structure for the tophat reference data --- perl/bin/tophat_fusion.pl | 3 -- perl/config/tophat.ini | 9 +++--- perl/lib/Sanger/CGP/Tophat/Implement.pm | 41 ++++++++++++++----------- 3 files changed, 27 insertions(+), 26 deletions(-) diff --git a/perl/bin/tophat_fusion.pl b/perl/bin/tophat_fusion.pl index b210234..ae7da2f 100755 --- a/perl/bin/tophat_fusion.pl +++ b/perl/bin/tophat_fusion.pl @@ -119,7 +119,6 @@ sub setup { 'r|refdataloc=s' => \$opts{'refdataloc'}, 'ri|refindex=s' => \$opts{'referenceindex'}, 'ti|transindex=s' => \$opts{'transcriptomeindex'}, - 'ub|ucscbuild=s' => \$opts{'tophatpostbuild'}, 'ui|ucscindex=s' => \$opts{'tophatpostindex'}, 'n|normals=s' => \$opts{'normalfusionslist'}, 't|threads=i' => \$opts{'threads'}, @@ -144,7 +143,6 @@ sub setup { $opts{'bowtieversion'} = $cfg->val('tophat-config','bowtieversion') unless(defined $opts{'bowtieversion'}); $opts{'referenceindex'} = $cfg->val('tophat-config','referenceindex') unless(defined $opts{'referenceindex'}); $opts{'transcriptomeindex'} = $cfg->val('tophat-config','transcriptomeindex') unless(defined $opts{'transcriptomeindex'}); - $opts{'tophatpostbuild'} = $cfg->val('tophat-config','tophatpostbuild') unless(defined $opts{'tophatpostbuild'}); $opts{'tophatpostindex'} = $cfg->val('tophat-config','tophatpostindex') unless(defined $opts{'tophatpostindex'}); $opts{'ensgene'} = $cfg->val('tophat-config','ensgene') unless(defined $opts{'ensgene'}); $opts{'refgene'} = $cfg->val('tophat-config','refgene') unless(defined $opts{'refgene'}); @@ -264,7 +262,6 @@ =head1 SYNOPSIS -genebuild -gb Gene build version. 
This needs to be consistent with the reference build in terms of the version and chromosome name style [77] -refindex -ri Stem name of the bowtie index files for the reference which need to be compatible with the bowtie version [GRCh38.genome] -transindex -ti Stem name of the bowtie index files for the transcriptome which need to be compatible with the bowtie version [GRCh38.77] - -ucscbuild -ub Tophat fusion post requires a reference build in UCSC format which must be equivalent to the refbuild version specified e.g. if refbuild = GRCh37 ucscbuild should be hg19 [hg38] -ucscindex -ui Stem name of the bowtie index files for the transcriptome in ucsc format which should be compatible with the bowtie version and ucsc build [hg38.genome] -normals -n File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions-b38] -species -sp Species [human] diff --git a/perl/config/tophat.ini b/perl/config/tophat.ini index 2bc6c0a..4b1e9ec 100755 --- a/perl/config/tophat.ini +++ b/perl/config/tophat.ini @@ -4,15 +4,14 @@ referencebuild= genebuild= species= referenceloc= -referenceindex= -transcriptomeindex= -tophatpostbuild= -tophatpostindex= +referenceindex=genome +transcriptomeindex=transcriptome +tophatpostindex=genome ensgene=ensGene.txt refgene=refGene.txt blastdb=blast blastn= -normalfusionslist= +normalfusionslist=normal-fusions bowtie1path= bowtie2path= tophatpath= diff --git a/perl/lib/Sanger/CGP/Tophat/Implement.pm b/perl/lib/Sanger/CGP/Tophat/Implement.pm index cdd8f9c..4f0e738 100755 --- a/perl/lib/Sanger/CGP/Tophat/Implement.pm +++ b/perl/lib/Sanger/CGP/Tophat/Implement.pm @@ -132,33 +132,32 @@ sub check_input { if($options->{'bowtieversion'} == 1){ for $suffix(@BOWTIE1_SUFFIXES){ - PCAP::Cli::file_for_reading('bowtie1-ref-index',File::Spec->catfile($ens_refdata,'bowtie1',$ref_prefix.$suffix)); - 
PCAP::Cli::file_for_reading('bowtie1-transcriptome-index',File::Spec->catfile($ens_refdata,$options->{'genebuild'},'bowtie1',$trans_prefix.$suffix)); - $options->{'referencepath'} = File::Spec->catfile($ens_refdata,'bowtie1',$ref_prefix); - $options->{'transcriptomepath'} = File::Spec->catfile($ens_refdata,$options->{'genebuild'},'bowtie1',$trans_prefix); + PCAP::Cli::file_for_reading('bowtie1-ref-index',File::Spec->catfile($ens_refdata,'tophat',$ref_prefix.$suffix)); + PCAP::Cli::file_for_reading('bowtie1-transcriptome-index',File::Spec->catfile($ens_refdata,'tophat',$options->{'genebuild'},$trans_prefix.$suffix)); + $options->{'referencepath'} = File::Spec->catfile($ens_refdata,'tophat',$ref_prefix); + $options->{'transcriptomepath'} = File::Spec->catfile($ens_refdata,'tophat',$options->{'genebuild'},$trans_prefix); } } else{ for $suffix(@BOWTIE2_SUFFIXES){ - PCAP::Cli::file_for_reading('bowtie2-ref-index',File::Spec->catfile($ens_refdata,'bowtie2',$ref_prefix.$suffix)); - PCAP::Cli::file_for_reading('bowtie2-transcriptome-index',File::Spec->catfile($ens_refdata,$options->{'genebuild'},'bowtie2',$trans_prefix.$suffix)); - $options->{'referencepath'} = File::Spec->catfile($ens_refdata,'bowtie2',$ref_prefix); - $options->{'transcriptomepath'} = File::Spec->catfile($ens_refdata,$options->{'genebuild'},'bowtie2',$trans_prefix); + PCAP::Cli::file_for_reading('bowtie2-ref-index',File::Spec->catfile($ens_refdata,'tophat',$ref_prefix.$suffix)); + PCAP::Cli::file_for_reading('bowtie2-transcriptome-index',File::Spec->catfile($ens_refdata,'tophat',$options->{'genebuild'},$trans_prefix.$suffix)); + $options->{'referencepath'} = File::Spec->catfile($ens_refdata,'tophat',$ref_prefix); + $options->{'transcriptomepath'} = File::Spec->catfile($ens_refdata,'tophat',$options->{'genebuild'},$trans_prefix); } } # Check the TopHat Fusion Post files exist my $ucsc_prefix = $options->{'tophatpostindex'}; - my $ucsc_refdata = File::Spec->catdir($refdata,$options->{'tophatpostbuild'}); 
for $suffix(@BOWTIE1_SUFFIXES){ - PCAP::Cli::file_for_reading('bowtie1-tophatpost-index',File::Spec->catfile($ucsc_refdata,'bowtie1',$ucsc_prefix.$suffix)); + PCAP::Cli::file_for_reading('bowtie1-tophatpost-index',File::Spec->catfile($ens_refdata,'tophat',$ucsc_prefix.$suffix)); } - $options->{'tophatpostpath'} = File::Spec->catfile($ucsc_refdata,'bowtie1',$ucsc_prefix); - PCAP::Cli::file_for_reading('refGene',File::Spec->catfile($ucsc_refdata,$options->{'refgene'})); - PCAP::Cli::file_for_reading('ensGene',File::Spec->catfile($ucsc_refdata,$options->{'genebuild'},$options->{'ensgene'})); + $options->{'tophatpostpath'} = File::Spec->catfile($ens_refdata,'tophat',$ucsc_prefix); + PCAP::Cli::file_for_reading('refGene',File::Spec->catfile($ens_refdata,'tophat',$options->{'refgene'})); + PCAP::Cli::file_for_reading('ensGene',File::Spec->catfile($ens_refdata,'tophat',$options->{'genebuild'},$options->{'ensgene'})); # Check the normal fusions file exists for the filtering step - PCAP::Cli::file_for_reading('normals-list',File::Spec->catfile($ens_refdata,$options->{'normalfusionslist'})); + PCAP::Cli::file_for_reading('normals-list',File::Spec->catfile($ens_refdata,'cgpRna',$options->{'normalfusionslist'})); $options->{'meta_set'} = PCAP::Bwa::Meta::files_to_meta($options->{'tmp'}, $options->{'raw_files'}, $options->{'sample'}); @@ -229,7 +228,7 @@ sub filter_fusions { die "Please run tophatfusion_post step prior to filter\n" unless(-d $post_outdir); die "Please run tophatfusion_post step prior to filter\n" unless(-e $fusions_file); - my $normals_file = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},$options->{'normalfusionslist'}); + my $normals_file = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},'cgpRna',$options->{'normalfusionslist'}); my $command = "$^X "; $command .= _which('filter_fusions.pl'); @@ -384,9 +383,9 @@ sub split_setup { my $post_rundir = 
File::Spec->catdir($options->{'tmp'}, 'tophatpostrun'); make_path($post_rundir) unless(-d $post_rundir); - my $refgene = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'tophatpostbuild'},$options->{'refgene'}); - my $ensgene = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'tophatpostbuild'},$options->{'genebuild'},$options->{'ensgene'}); - my $blast = File::Spec->catdir($options->{'refdataloc'},$options->{'species'},$options->{'blastdb'}); + my $refgene = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},'tophat',$options->{'refgene'}); + my $ensgene = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},'tophat',$options->{'genebuild'},$options->{'ensgene'}); + my $blast = File::Spec->catdir($options->{'refdataloc'},$options->{'species'},'tophat',$options->{'blastdb'}); symlink($refgene, $post_rundir.'/refGene.txt') unless(-l File::Spec->catfile($post_rundir,'refGene.txt')); symlink($ensgene, $post_rundir.'/ensGene.txt') unless(-l File::Spec->catfile($post_rundir,'ensGene.txt')); symlink($blast, $post_rundir.'/blast') unless(-l $post_rundir.'/blast'); @@ -483,6 +482,9 @@ sub tophat_fusion { my $ref_index_stem = $options->{'referencepath'}; my $tophat_path = $options->{'tophatpath'}; + if(! defined $tophat_path || $tophat_path eq ''){ + $tophat_path = _which('tophat'); + } my $command = $tophat_path." ".$tophat_params." ".$ref_index_stem." ".join(",",@input1)." ".join(",",@input2); @@ -505,6 +507,9 @@ sub tophatfusion_post { my $tophatpost_params = process_tophatpost_params($options); my $tophatpost = $options->{'tophatpath'}; + if(! defined $tophatpost || $tophatpost eq ''){ + $tophatpost = _which('tophat-fusion-post'); + } my $tophatpostindex = $options->{'tophatpostpath'}; my $runcommand = $tophatpost.'-fusion-post'." ".$tophatpost_params." 
".$tophatpostindex; From 5db90e50769e83a86e87f0b9639b552c99b5bdfb Mon Sep 17 00:00:00 2001 From: am26 Date: Tue, 5 Apr 2016 17:15:52 +0100 Subject: [PATCH 33/40] Changes related to the change in reference data structure --- perl/bin/defuse_fusion.pl | 4 ++-- perl/bin/star_fusion.pl | 6 +++--- perl/bin/tophat_fusion.pl | 8 ++++---- perl/config/defuse.ini | 4 ++-- perl/config/star.ini | 2 +- perl/lib/Sanger/CGP/Defuse/Implement.pm | 6 +++--- perl/lib/Sanger/CGP/Star/Implement.pm | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/perl/bin/defuse_fusion.pl b/perl/bin/defuse_fusion.pl index 175eb1b..6e49897 100755 --- a/perl/bin/defuse_fusion.pl +++ b/perl/bin/defuse_fusion.pl @@ -211,8 +211,8 @@ =head1 SYNOPSIS -sample -s Sample name Optional - -defuseconfig -d Name of the defuse config file. It should reside under /refdataloc/species/refbuild/genebuild/ [defuse-config-GRCh38-77.txt] - -normals -n File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions-b38] + -defuseconfig -d Name of the defuse config file. It should reside under /refdataloc/species/refbuild/defuse/genebuild/ [defuse-config.txt] + -normals -n File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions] -threads -t Number of cores to use. [1] -config -c Path to config.ini file. The file contains defaults for the reference data and deFuse software installation details [/perl/config/defuse.ini] -refbuild -rb Reference assembly version. Can be UCSC or Ensembl format e.g. GRCh38 or hg38 [GRCh38] diff --git a/perl/bin/star_fusion.pl b/perl/bin/star_fusion.pl index 3da7a25..dc3ae07 100755 --- a/perl/bin/star_fusion.pl +++ b/perl/bin/star_fusion.pl @@ -213,12 +213,12 @@ =head1 SYNOPSIS -sample -s Sample name Optional - -gtffile -g GTF annotation file name which should be compatible with the refbuild and gene build versions. 
It should reside under /refdataloc/species/refbuild/star/genebuild/ [Homo_sapiens.GRCh38.77.gtf] - -normals -n File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions-b38] + -gtffile -g GTF annotation file name which should be compatible with the refbuild and gene build versions. It should reside under /refdataloc/species/refbuild/star/genebuild/ [ensembl.gtf] + -normals -n File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions] -threads -t Number of cores to use. [1] -config -c Path to config.ini file. It contains defaults for; the reference and gene build versions, star software and default star and star-fusion parameters [/perl/config/star.ini] -refbuild -rb Reference assembly version. Can be UCSC or Ensembl format e.g. GRCh38 or hg38 [GRCh38] - -genebuild -gb Gene build version. This needs to be consistent with the reference build in terms of the version and chromosome name style.[e77] + -genebuild -gb Gene build version. This needs to be consistent with the reference build in terms of the version and chromosome name style.[77] -refdataloc -r Parent directory of the reference data -species -sp Species [human] diff --git a/perl/bin/tophat_fusion.pl b/perl/bin/tophat_fusion.pl index ae7da2f..0da64ff 100755 --- a/perl/bin/tophat_fusion.pl +++ b/perl/bin/tophat_fusion.pl @@ -260,10 +260,10 @@ =head1 SYNOPSIS -config -c Path to config.ini file. Defaults for the reference and transcriptome related parameters are provided in the config.ini file. -refbuild -rb Reference assembly version. Can be UCSC or Ensembl format e.g. GRCh38 or hg38 [GRCh38] -genebuild -gb Gene build version. 
This needs to be consistent with the reference build in terms of the version and chromosome name style [77] - -refindex -ri Stem name of the bowtie index files for the reference which need to be compatible with the bowtie version [GRCh38.genome] - -transindex -ti Stem name of the bowtie index files for the transcriptome which need to be compatible with the bowtie version [GRCh38.77] - -ucscindex -ui Stem name of the bowtie index files for the transcriptome in ucsc format which should be compatible with the bowtie version and ucsc build [hg38.genome] - -normals -n File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions-b38] + -refindex -ri Stem name of the bowtie index files for the reference which need to be compatible with the bowtie version [genome] + -transindex -ti Stem name of the bowtie index files for the transcriptome which need to be compatible with the bowtie version [transcriptome] + -ucscindex -ui Stem name of the bowtie index files for the transcriptome in ucsc format which should be compatible with the bowtie version and ucsc build [genome] + -normals -n File containing list of gene fusions detected in normal samples. 
It should reside under /refdataloc/species/refbuild/ [normal-fusions] -species -sp Species [human] Targeted processing (further detail under OPTIONS): diff --git a/perl/config/defuse.ini b/perl/config/defuse.ini index db8bca1..2fe86b9 100755 --- a/perl/config/defuse.ini +++ b/perl/config/defuse.ini @@ -3,7 +3,7 @@ referencebuild= genebuild= species=human referenceloc= -normalfusionslist= +normalfusionslist=normal-fusions defusepath= defuseversion= -defuseconfig= +defuseconfig=defuse-config.txt diff --git a/perl/config/star.ini b/perl/config/star.ini index ce4ca83..1bb209d 100755 --- a/perl/config/star.ini +++ b/perl/config/star.ini @@ -4,7 +4,7 @@ genebuild= species= referenceloc= gtffilename= -normalfusionslist= +normalfusionslist=normal-fusions starpath= starfusionpath= diff --git a/perl/lib/Sanger/CGP/Defuse/Implement.pm b/perl/lib/Sanger/CGP/Defuse/Implement.pm index 48b67e4..776779a 100755 --- a/perl/lib/Sanger/CGP/Defuse/Implement.pm +++ b/perl/lib/Sanger/CGP/Defuse/Implement.pm @@ -66,10 +66,10 @@ sub check_input { my $ref_build_loc = File::Spec->catdir($ref_data, $species, $ref_build); # Check the normal fusions file exists for the filtering step - PCAP::Cli::file_for_reading('normals-list',File::Spec->catfile($ref_build_loc,$options->{'normalfusionslist'})); + PCAP::Cli::file_for_reading('normals-list',File::Spec->catfile($ref_build_loc,'cgpRna',$options->{'normalfusionslist'})); # Check the defuse config file exists for the ref-gene build - PCAP::Cli::file_for_reading('defuse-config',File::Spec->catfile($ref_build_loc,$gene_build,$options->{'defuseconfig'})); + PCAP::Cli::file_for_reading('defuse-config',File::Spec->catfile($ref_build_loc,'defuse',$gene_build,$options->{'defuseconfig'})); $options->{'meta_set'} = PCAP::Bwa::Meta::files_to_meta($options->{'tmp'}, $options->{'raw_files'}, $options->{'sample'}); @@ -164,7 +164,7 @@ sub defuse { } # Get the relevant defuse config file for the reference and gene builds - my $defuse_config = 
File::Spec->catfile($options->{'refdataloc'}, $options->{'species'}, $options->{'referencebuild'}, $options->{'genebuild'}, $options->{'defuseconfig'} ); + my $defuse_config = File::Spec->catfile($options->{'refdataloc'}, $options->{'species'}, $options->{'referencebuild'},'defuse',$options->{'genebuild'}, $options->{'defuseconfig'} ); my $command = sprintf $DEFUSE, $defuse, $defuse_config, $threads, diff --git a/perl/lib/Sanger/CGP/Star/Implement.pm b/perl/lib/Sanger/CGP/Star/Implement.pm index 00dfd5a..029ff91 100755 --- a/perl/lib/Sanger/CGP/Star/Implement.pm +++ b/perl/lib/Sanger/CGP/Star/Implement.pm @@ -83,7 +83,7 @@ sub check_input { PCAP::Cli::file_for_reading('gtf-file', File::Spec->catfile($ref_build_loc, 'star', $gene_build, $options->{'gtffilename'})); if($fusion_mode){ - PCAP::Cli::file_for_reading('normals-list',File::Spec->catfile($ref_build_loc,$options->{'normalfusionslist'})); + PCAP::Cli::file_for_reading('normals-list',File::Spec->catfile($ref_build_loc,'cgpRna',$options->{'normalfusionslist'})); } my $input_meta = PCAP::Bwa::Meta::files_to_meta($options->{'tmp'}, $options->{'raw_files'}, $options->{'sample'}); From 007e4bef6ff40d97292aceee645cf64d6ef49384 Mon Sep 17 00:00:00 2001 From: am26 Date: Wed, 6 Apr 2016 10:52:25 +0100 Subject: [PATCH 34/40] Only need to sort the transcriptome BAM if star has been run in mapping mode. NA for fusion mode --- perl/lib/Sanger/CGP/Defuse/Implement.pm | 4 ++++ perl/lib/Sanger/CGP/Star/Implement.pm | 13 ++++++++++--- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/perl/lib/Sanger/CGP/Defuse/Implement.pm b/perl/lib/Sanger/CGP/Defuse/Implement.pm index 776779a..94bae83 100755 --- a/perl/lib/Sanger/CGP/Defuse/Implement.pm +++ b/perl/lib/Sanger/CGP/Defuse/Implement.pm @@ -116,6 +116,10 @@ sub defuse { $threads = $options->{'threads'} if($options->{'threads'} < $DEFUSE_MAX_CORES); my $sample = $options->{'sample'}; my $defuse = $options->{'defusepath'}; + if(! 
defined $defuse || $defuse eq ''){ + $defuse = _which('defuse.pl'); + } + my $outdir = File::Spec->catdir($tmp, "defuse_$sample"); my $fastq1; my $fastq2; diff --git a/perl/lib/Sanger/CGP/Star/Implement.pm b/perl/lib/Sanger/CGP/Star/Implement.pm index 029ff91..f971efe 100755 --- a/perl/lib/Sanger/CGP/Star/Implement.pm +++ b/perl/lib/Sanger/CGP/Star/Implement.pm @@ -520,9 +520,16 @@ sub star { File::Spec->catfile($stardir, 'Aligned.toTranscriptome.sortedByCoord.out.bam'), $threads, $threads; - my @commands; - push @commands, $bamsort_command1; - push @commands, $bamsort_command2; + + my $fusion_mode; + + if(exists $options->{'fusion_mode'}){ + $fusion_mode = $options->{'fusion_mode'}; + } + + my @commands; + push @commands, $bamsort_command1; + push @commands, $bamsort_command2 unless(defined $fusion_mode); PCAP::Threaded::external_process_handler(File::Spec->catdir($tmp, 'logs'), \@commands, 0); PCAP::Threaded::touch_success(File::Spec->catdir($tmp, 'progress'), 0); From bf6cfd5ee3dd36dc6992c16275e33b91dc7f1591 Mon Sep 17 00:00:00 2001 From: am26 Date: Wed, 6 Apr 2016 16:37:12 +0100 Subject: [PATCH 35/40] Picking up STAR-Fusion from PATH if not in star.ini file --- perl/lib/Sanger/CGP/Star/Implement.pm | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/perl/lib/Sanger/CGP/Star/Implement.pm b/perl/lib/Sanger/CGP/Star/Implement.pm index f971efe..bae1807 100755 --- a/perl/lib/Sanger/CGP/Star/Implement.pm +++ b/perl/lib/Sanger/CGP/Star/Implement.pm @@ -551,7 +551,12 @@ sub star_fusion { my $sample = $options->{'sample'}; my $gtf = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'}, 'star',$options->{'genebuild'}, $options->{'gtffilename'}); - my $commands = sprintf $STAR_FUSION, $options->{'starfusionpath'}, + my $starfusionpath = $options->{'starfusionpath'}; + if(! 
defined $starfusionpath || $starfusionpath eq ''){ + $starfusionpath = _which('STAR-Fusion'); + } + + my $commands = sprintf $STAR_FUSION, $starfusionpath, $chimeric_sam, $chimeric_junction, $gtf, From ea33a4a4a859cff0f9c840c87bb2f27574af388a Mon Sep 17 00:00:00 2001 From: am26 Date: Wed, 6 Apr 2016 16:43:00 +0100 Subject: [PATCH 36/40] Updating location of normal-fusions file --- perl/lib/Sanger/CGP/Star/Implement.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perl/lib/Sanger/CGP/Star/Implement.pm b/perl/lib/Sanger/CGP/Star/Implement.pm index bae1807..3f7c4a9 100755 --- a/perl/lib/Sanger/CGP/Star/Implement.pm +++ b/perl/lib/Sanger/CGP/Star/Implement.pm @@ -138,7 +138,7 @@ sub filter_fusions { my $fusions_file = File::Spec->catfile($star_outdir, "$sample.fusion_candidates.txt"); die "The star fusion output files are missing, please run the starfusion step prior to filter.\n" unless(-e $fusions_file); - my $normals_file = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},$options->{'normalfusionslist'}); + my $normals_file = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},'cgpRna',$options->{'normalfusionslist'}); my $command = "$^X "; $command .= _which('filter_fusions.pl'); From 8e97c7b672ce0f264cbac93af037f43df6457f5d Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 8 Apr 2016 13:24:17 +0100 Subject: [PATCH 37/40] Updating location of blast database --- perl/lib/Sanger/CGP/Tophat/Implement.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perl/lib/Sanger/CGP/Tophat/Implement.pm b/perl/lib/Sanger/CGP/Tophat/Implement.pm index 4f0e738..191d1bc 100755 --- a/perl/lib/Sanger/CGP/Tophat/Implement.pm +++ b/perl/lib/Sanger/CGP/Tophat/Implement.pm @@ -385,7 +385,7 @@ sub split_setup { my $refgene = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},'tophat',$options->{'refgene'}); my $ensgene = 
File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},'tophat',$options->{'genebuild'},$options->{'ensgene'}); - my $blast = File::Spec->catdir($options->{'refdataloc'},$options->{'species'},'tophat',$options->{'blastdb'}); + my $blast = File::Spec->catdir($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},'tophat',$options->{'blastdb'}); symlink($refgene, $post_rundir.'/refGene.txt') unless(-l File::Spec->catfile($post_rundir,'refGene.txt')); symlink($ensgene, $post_rundir.'/ensGene.txt') unless(-l File::Spec->catfile($post_rundir,'ensGene.txt')); symlink($blast, $post_rundir.'/blast') unless(-l $post_rundir.'/blast'); From 37b537e7d42597fbc98f3ad498528151c51b0c2c Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 8 Apr 2016 13:33:59 +0100 Subject: [PATCH 38/40] Updating location of normal fusions file --- perl/lib/Sanger/CGP/Defuse/Implement.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perl/lib/Sanger/CGP/Defuse/Implement.pm b/perl/lib/Sanger/CGP/Defuse/Implement.pm index 94bae83..69bd4a9 100755 --- a/perl/lib/Sanger/CGP/Defuse/Implement.pm +++ b/perl/lib/Sanger/CGP/Defuse/Implement.pm @@ -196,7 +196,7 @@ sub filter_fusions { die "Please run the defuse step prior to filter\n" unless(-d $defuse_outdir); die "One of the deFuse output files is missing, please run the defuse step prior to filter.\n" unless(-e $fusions_file && -e File::Spec->catfile($defuse_outdir, 'cdna.pair.sam')); - my $normals_file = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},$options->{'normalfusionslist'}); + my $normals_file = File::Spec->catfile($options->{'refdataloc'},$options->{'species'},$options->{'referencebuild'},'cgpRna',$options->{'normalfusionslist'}); my $command = "$^X "; $command .= _which('filter_fusions.pl'); From 7d0fd5764ef67e177b45411890e8041b268e4b81 Mon Sep 17 00:00:00 2001 From: am26 Date: Fri, 8 Apr 2016 14:38:10 +0100 Subject: [PATCH 
39/40] Adding defuse make step --- setup.sh | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/setup.sh b/setup.sh index e411c81..5b36bbc 100755 --- a/setup.sh +++ b/setup.sh @@ -274,9 +274,16 @@ else get_distro "defuse" $SOURCE_DEFUSE mkdir -p defuse tar --strip-components 1 -C defuse -zxf defuse.tar.gz + cd ./defuse/tools + include_search=`grep "#include " ./Common.h | wc -l` + if [ $include_search -eq 0 ]; then + sed -i 's/#include /#include \n#include /' ./Common.h + fi + make + cd ../../ mkdir -p $INST_PATH/bin/defuse_install - cp -r defuse/* $INST_PATH/bin/defuse_install - cp defuse/scripts/*pm $INST_PATH/lib/perl5 + cp -r ./defuse/* $INST_PATH/bin/defuse_install + cp ./defuse/scripts/*pm $INST_PATH/lib/perl5 ln -s $INST_PATH/bin/defuse_install/scripts/defuse.pl $INST_PATH/bin/defuse.pl touch $SETUP_DIR/defuse.success )>>$INIT_DIR/setup.log 2>&1 From b3d73f11a21143eef15f982d091c11a40b8baa7c Mon Sep 17 00:00:00 2001 From: am26 Date: Mon, 11 Apr 2016 13:29:13 +0100 Subject: [PATCH 40/40] Upping version --- perl/lib/Sanger/CGP/CgpRna.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perl/lib/Sanger/CGP/CgpRna.pm b/perl/lib/Sanger/CGP/CgpRna.pm index c4c033f..1c9e37f 100644 --- a/perl/lib/Sanger/CGP/CgpRna.pm +++ b/perl/lib/Sanger/CGP/CgpRna.pm @@ -36,7 +36,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '2.0.6'; +our $VERSION = '2.0.7'; our @EXPORT = qw($VERSION); 1;