Merge branch 'release/2.0.7'

cancerit · Apr 11, 2016 · aa34ff4 · aa34ff4
2 parents 165ec56 + b3d73f1
commit aa34ff4
Show file tree

Hide file tree

Showing 17 changed files with 1,834 additions and 648 deletions.
diff --git a/README.md b/README.md
@@ -29,3 +29,48 @@ reads ‘Copyright (c) 2005, 2007, 2008, 2009, 2011, 2012’ and a copyright
 statement that reads ‘Copyright (c) 2005-2012’ should be interpreted as being
 identical to a statement that reads ‘Copyright (c) 2005, 2006, 2007, 2008,
 2009, 2010, 2011, 2012’."
+
+cgpRna
+======
+
+cgpRna provides pipelines, for RNA-Seq data, that implement commonly used mapping 
+and analysis programs, such as TopHat and rna-star.
+At the present time (May 2016), only pipelines for mapping (with STAR), lane QC
+and fusion gene detection are included in this codebase, but further pipelines will
+be added over time: differential expression, gene/transcript quantification, splice
+variant analysis and allele specific expression.
+
+### Dependencies/Installation
+
+Please install Perl packages [PCAP-core](https://github.com/ICGC-TCGA-PanCancer/PCAP-core/releases) and [VAGrENT](https://github.com/cancerit/VAGrENT/releases) first.
+
+Prerequisites for the [RSeQC](http://rseqc.sourceforge.net/#installation) software are:
+* gcc
+* [python2.7](https://www.python.org/downloads/) (the minimum version the pipeline has been tested with is python-2.7.6)
+* [R](https://www.r-project.org/)
+* [numpy](http://www.numpy.org/)
+
+Once that is done and your $PATH environment variable has been updated so that newly installed software can be found, run the following to install cgpRna:
+
+./setup.sh path_to_install_to
+
+N.B. the path_to_install_to should be the same as the install location used for PCAP-core and VAGrENT above.
+
+### Tools installed by setup.sh
+
+* Some CPAN hosted libraries, see perl/Makefile.PL
+* [STAR](https://github.com/alexdobin/STAR/releases)
+* [Tophat](https://ccb.jhu.edu/software/tophat/index.shtml)
+* [deFuse](https://bitbucket.org/dranew/defuse)
+* [RSeQC](http://rseqc.sourceforge.net)
+* [bowtie](http://bowtie-bio.sourceforge.net/index.shtml) N.B. both bowtie and bowtie2 are installed and can be used with Tophat
+* [blat](http://hgwdev.cse.ucsc.edu/~kent/src/) Unless already in the install location bin directory
+* [gmap](http://research-pub.gene.com/gmap/) The aligner used by deFuse
+* [faToTwoBit](http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/) deFuse dependency
+* [bedtools](https://github.com/arq5x/bedtools2/) Unless already in the install location bin directory
+* [blastn](http://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=Download) Used by tophat-fusion post
+
+N.B. samtools is also a dependency, but it is installed by PCAP-core, which should have already been installed (see above).
+
+If you are planning to use the fusion pipeline, specifically defuse_fusion.pl, the deFuse config.txt file will need to be updated with the installed locations of a number of tools.
+These paths are printed to screen if the setup.sh script completes successfully so make a note of the locations and update the file as instructed.
diff --git a/perl/Makefile.PL b/perl/Makefile.PL
@@ -41,6 +41,14 @@ WriteMakefile(
   EXE_FILES     => [qw(
                         bin/star_mapping.pl
                         bin/process_qcstats.pl
+                        bin/star_fusion.pl
+                        bin/tophat_fusion.pl
+                        bin/tophat_add_strand.pl
+                        bin/defuse_fusion.pl
+                        bin/defuse_filters.pl
+                        bin/filter_fusions.pl
+                        bin/compare_overlapping_fusions.pl
+                        bin/compare_CN_and_fusion.pl
                         )],
   PREREQ_PM     => {
                      'Capture::Tiny' => 0.30,

diff --git a/perl/bin/compare_overlapping_fusions.pl b/perl/bin/compare_overlapping_fusions.pl
@@ -58,33 +58,40 @@ BEGIN
 use Data::Dumper;
 
 const my @REQUIRED_PARAMS => qw(outdir sample gtf);
-const my @VALID_PROCESS => qw(createbed annotatebed selectannotation createbedpe runbedpairtopair compareoverlaps);
-const my %INDEX_FACTOR => (	'createbed' => -1,
-				'annotatebed' => -1,
-				'selectannotation' => -1,
-				'createbedpe' => -1,
+const my @VALID_PROCESS => qw(createjunctionbed runbedpairtopair processoverlaps singletons queryvagrent annotatebed selectannotation collateannotation deduplicate output);
+const my %INDEX_FACTOR => (	'createjunctionbed' => -1,
 				'runbedpairtopair' => 1,
-				'compareoverlaps' => 1);				
+				'processoverlaps' => 1,
+				'singletons' => 1,
+				'queryvagrent' => 1,
+				'annotatebed' => 1,
+				'selectannotation' => 1,
+				'collateannotation' => 1,
+				'deduplicate' => 1,
+				'output' => 1);				
 {
   my $options = setup();
 
   my $threads = PCAP::Threaded->new($options->{'threads'});
   &PCAP::Threaded::disable_out_err if(exists $options->{'index'});
 
-  $threads->add_function('createbed', \&Sanger::CGP::CompareFusions::Implement::create_bed);
-  $threads->add_function('annotatebed', \&Sanger::CGP::CompareFusions::Implement::annotate_bed);
-  $threads->add_function('selectannotation', \&Sanger::CGP::CompareFusions::Implement::select_annotation);
-  $threads->add_function('createbedpe', \&Sanger::CGP::CompareFusions::Implement::create_bedpe);
+  $threads->add_function('createjunctionbed', \&Sanger::CGP::CompareFusions::Implement::create_junction_bedpe);
 
-  $threads->run($options->{'num'}, 'createbed', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'createbed');
-  $threads->run($options->{'num'}, 'annotatebed', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed');
-  $threads->run($options->{'num'}, 'selectannotation', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation');
-  $threads->run($options->{'num'}, 'createbedpe', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'createbedpe');
+  $threads->run($options->{'num'}, 'createjunctionbed', $options) if(!exists $options->{'process'} || $options->{'process'} eq 'createjunctionbed');
 
   Sanger::CGP::CompareFusions::Implement::run_bed_pairtopair($options) if(!exists $options->{'process'} || $options->{'process'} eq 'runbedpairtopair');
+  Sanger::CGP::CompareFusions::Implement::process_overlap_files($options) if(!exists $options->{'process'} || $options->{'process'} eq 'processoverlaps');
+  Sanger::CGP::CompareFusions::Implement::process_singletons($options) if(!exists $options->{'process'} || $options->{'process'} eq 'singletons');
+  Sanger::CGP::CompareFusions::Implement::query_vagrent($options) if(!exists $options->{'process'} || $options->{'process'} eq 'queryvagrent');
+  if(-s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".1.bed") || -s File::Spec->catfile($options->{'tmp'}, $options->{'sample'}.".2.bed")){
+    Sanger::CGP::CompareFusions::Implement::annotate_bed($options) if(!exists $options->{'process'} || $options->{'process'} eq 'annotatebed');
+    Sanger::CGP::CompareFusions::Implement::select_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'selectannotation');
+    Sanger::CGP::CompareFusions::Implement::collate_annotation($options) if(!exists $options->{'process'} || $options->{'process'} eq 'collateannotation');
+    Sanger::CGP::CompareFusions::Implement::deduplicate_fusions($options) if(!exists $options->{'process'} || $options->{'process'} eq 'deduplicate');
+  }
 
-  if(!exists $options->{'process'} || $options->{'process'} eq 'compareoverlaps') {
-  Sanger::CGP::CompareFusions::Implement::compare_overlaps($options);
+  if(!exists $options->{'process'} || $options->{'process'} eq 'output') {
+    Sanger::CGP::CompareFusions::Implement::generate_output($options);
     cleanup($options);
   } 
 }
@@ -93,8 +100,7 @@ sub cleanup {
   my $options = shift;
   my $tmpdir = $options->{'tmp'};
   my $sample = $options->{'sample'};
-  move(File::Spec->catfile($tmpdir, "$sample.gene-fusions.txt"), $options->{'outdir'}) || die $!;
-  move(File::Spec->catfile($tmpdir, "$sample.exon-fusions.txt"), $options->{'outdir'}) || die $!;
+  move(File::Spec->catfile($tmpdir, "$sample.detected.fusions.txt"), $options->{'outdir'}) || die $!;
   move(File::Spec->catdir($tmpdir, 'logs'), File::Spec->catdir($options->{'outdir'}, 'logs')) || die $!;
   remove_tree $tmpdir if(-e $tmpdir);
   return 0;
@@ -113,6 +119,7 @@ sub setup {
 		't|threads=i' => \$opts{'threads'},
 		'p|process=s' => \$opts{'process'},
 		'i|index=i' => \$opts{'index'},
+		'c|cache=s' => \$opts{'cache'},
   ) or pod2usage(2);
 
   pod2usage(-verbose => 1) if(defined $opts{'h'});
@@ -143,13 +150,23 @@ sub setup {
   $opts{'input_files'} = \@ARGV;
 
   my $format;
+  my $format_num;
   my %fusion_files;
   my $input;
   for (my $iter=1; $iter <= $file_count; $iter++) {
     $input = $ARGV[$iter-1];
     $format = Sanger::CGP::CompareFusions::Implement::check_input($input);
-    $fusion_files{$iter}{'format'} = $format;
-    $fusion_files{$iter}{'name'} = $input;
+    if($format eq 'star'){
+      $format_num = 1;
+    }
+    elsif($format eq 'tophat'){
+      $format_num = 2;
+    }
+    else{
+      $format_num = 3;
+    }
+    $fusion_files{$format_num}{'format'} = $format;
+    $fusion_files{$format_num}{'name'} = $input;
   }
 
   $opts{'fusion_files'} = \%fusion_files;
@@ -181,7 +198,7 @@ sub setup {
 
 =head1 compare_overlapping_fusions.pl
 
-Produces a report of fusions that have been called by two algorithms (tophat, star or deFuse) or overlapping all three. Two output lists will be generated; one at the gene level and the other at the exon level.
+Produces a report of overlapping fusions that have been called by star-fusion and deFuse.
 
 =head1 SYNOPSIS
 
@@ -191,6 +208,7 @@ =head1 SYNOPSIS
     -outdir    		-o   	Folder to output result to.
     -sample   		-s   	Sample name
     -gtf    		-g   	GTF file to use with bedtools to annotate each fusion breakpoint position.
+    -cache    		-c   	VAGrENT cache file that should be the same reference and gene build as the GTF file being used e.g. GRCh38 e77.
     
   Optional:
     -threads    	-t   	Number of threads (cpus) to use [1].
@@ -199,7 +217,7 @@ =head1 SYNOPSIS
     -process   		-p   	Only process this step then exit
     -index    		-i   	Valid for processes; createbed, annotatebed and createbedpe - 1..<num_input_files>
     
-    Input files should be in the format generated by cgpRna pipelines; defuse.pl, tophat_fusion.pl or star_fusion.pl
+    Input files should be in the format generated by cgpRna pipelines; defuse.pl or star_fusion.pl
     
 =head1 OPTIONS
 
@@ -209,11 +227,12 @@ =head1 OPTIONS
 
 Available processes for this tool are:
 
-  createbed
+  createjunctionbed
+  runbedpairtopair
+  queryvagrent
   annotatebed
   selectannotation
-  createbedpe
-  runbedpairtopair
-  compareoverlaps
+  collateannotation
+  output
 
 =back
diff --git a/perl/bin/defuse_filters.pl b/perl/bin/defuse_filters.pl
@@ -0,0 +1,138 @@
+#!/usr/bin/perl
+##########LICENCE ##########
+#Copyright (c) 2015 Genome Research Ltd.
+###
+#Author: Cancer Genome Project <[email protected]>
+###
+#This file is part of cgpRna.
+###
+#cgpRna is free software: you can redistribute it and/or modify it under
+#the terms of the GNU Affero General Public License as published by the
+#Free Software Foundation; either version 3 of the License, or (at your
+#option) any later version.
+###
+#This program is distributed in the hope that it will be useful, but
+#WITHOUT ANY WARRANTY; without even the implied warranty of
+#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero
+#General Public License for more details.
+###
+#You should have received a copy of the GNU Affero General Public
+#License along with this program. If not, see
+#<http://www.gnu.org/licenses/>.
+###
+#1. The usage of a range of years within a copyright statement contained
+#within this distribution should be interpreted as being equivalent to a
+#list of years including the first and last year specified and all
+#consecutive years between them. For example, a copyright statement that
+#reads ‘Copyright (c) 2005, 2007- 2009, 2011-2012’ should be interpreted
+#as being identical to a statement that reads ‘Copyright (c) 2005, 2007,
+#2008, 2009, 2011, 2012’ and a copyright statement that reads ‘Copyright
+#(c) 2005-2012’ should be interpreted as being identical to a statement
+#that reads ‘Copyright (c) 2005, 2006, 2007, 2008, 2009, 2010, 2011,
+#2012’."
+##########LICENCE ##########
+##########
+use strict;
+use warnings;
+
+use autodie qw(:all);
+use English qw( -no_match_vars );
+use File::Path qw(remove_tree make_path);
+use Getopt::Long;
+use File::Spec;
+use Pod::Usage qw(pod2usage);
+use Const::Fast qw(const);
+use PCAP::Cli;
+
+# Field separator of the deFuse output file; the single-quoted '\t' is used as a split() pattern, where \t matches a tab.
+const my $DEFUSE_SPLIT_CHAR => '\t';
+# 1-based positions of the deFuse output columns that are filtered on (1 is subtracted before indexing the split fields).
+const my $SPLITR_MIN_PVAL_COL => 7;
+const my $BREAKSEQS_ESTISLANDS_PERCIDENT_COL => 14;
+const my $CDNA_BREAKSEQS_PERCIDENT_COL => 15;
+const my $EST_BREAKSEQS_PERCIDENT_COL => 17;
+const my $GENOME_BREAKSEQS_PERCIDENT_COL => 38;
+const my $SPAN_COVERAGE_MIN_COL => 66;
+# Thresholds for the columns above; a row is flagged 1 only when every comparison noted on the right holds.
+const my $SPLITR_MIN_PVAL_VAL =>  0.1; # pass when splitr_min_pvalue > 0.1
+const my $BREAKSEQS_ESTISLANDS_PERCIDENT_VAL => 0.3; # pass when breakseqs_estislands_percident < 0.3
+const my $CDNA_BREAKSEQS_PERCIDENT_VAL => 0.1; # pass when cdna_breakseqs_percident < 0.1
+const my $EST_BREAKSEQS_PERCIDENT_VAL => 0.3; # pass when est_breakseqs_percident < 0.3
+const my $GENOME_BREAKSEQS_PERCIDENT_VAL => 0.1; # pass when genome_breakseqs_percident < 0.1
+const my $SPAN_COVERAGE_MIN_VAL => 0.6; # pass when span_coverage_min > 0.6
+
+# Main: copy the deFuse fusions file, appending a cgp_defuse_filter column (1 = every threshold met, 0 = at least one failed).
+{
+  my $options = setup();
+  my $input = File::Spec->rel2abs($options->{'input'});
+  my $sample = $options->{'sample'};
+  my $outdir = $options->{'outdir'};
+  my $output = File::Spec->catfile($outdir, "$sample.defuse-fusion.normals.ext.filtered.txt");
+
+  # Three-argument open with an explicit read mode, so the mode can never be taken from the file name.
+  open(my $ifh, '<', $input) or die "Could not open file '$input' $!";
+  open(my $ofh, '>', $output) or die "Could not open file '$output' $!";
+
+  while (my $line = <$ifh>) {
+    chomp $line;
+    if($line =~ m/^breakpoint_ref/){
+      # Header row: append the name of the new flag column.
+      print $ofh $line."\tcgp_defuse_filter\n";
+    }
+    else{
+      # The *_COL constants are 1-based, hence the -1 when indexing @fields.
+      my @fields = split $DEFUSE_SPLIT_CHAR, $line;
+      if($fields[$SPLITR_MIN_PVAL_COL-1] > $SPLITR_MIN_PVAL_VAL && $fields[$BREAKSEQS_ESTISLANDS_PERCIDENT_COL-1] < $BREAKSEQS_ESTISLANDS_PERCIDENT_VAL &&
+         $fields[$CDNA_BREAKSEQS_PERCIDENT_COL-1] < $CDNA_BREAKSEQS_PERCIDENT_VAL && $fields[$EST_BREAKSEQS_PERCIDENT_COL-1] < $EST_BREAKSEQS_PERCIDENT_VAL &&
+         $fields[$GENOME_BREAKSEQS_PERCIDENT_COL-1] < $GENOME_BREAKSEQS_PERCIDENT_VAL && $fields[$SPAN_COVERAGE_MIN_COL-1] > $SPAN_COVERAGE_MIN_VAL) {
+        print $ofh $line."\t1\n";
+      }
+      else{
+        print $ofh $line."\t0\n";
+      }
+    }
+  }
+  close($ifh);
+  close($ofh);
+}
+
+# Parse and validate the command line; returns a hashref of options (keys: cmd, h, m, input, outdir, sample).
+sub setup {
+  my %opts;
+  pod2usage(-msg => "\nERROR: Options must be defined.\n", -verbose => 1, -output => \*STDERR) if(scalar @ARGV == 0);
+  $opts{'cmd'} = join " ", $0, @ARGV;
+  GetOptions(
+    'h|help' => \$opts{'h'},
+    'm|man' => \$opts{'m'},
+    'i|input=s' => \$opts{'input'},
+    'o|outdir=s' => \$opts{'outdir'},
+    's|sample=s' => \$opts{'sample'},
+  ) or pod2usage(2);
+
+  pod2usage(-verbose => 1) if(defined $opts{'h'});
+  pod2usage(-verbose => 2) if(defined $opts{'m'});
+  # The sample name is used to build the output file name, so refuse to run without one.
+  pod2usage(-msg => "\nERROR: Option 'sample' must be defined.\n", -verbose => 1, -output => \*STDERR) unless(defined $opts{'sample'});
+  PCAP::Cli::file_for_reading('input', $opts{'input'});
+  # Check the output directory exists and is writeable, create if not
+  PCAP::Cli::out_dir_check('outdir', $opts{'outdir'});
+  return \%opts;
+}
+
+__END__
+
+=head1 defuse_filters.pl
+
+Adds a flag (called cgp_defuse_filter) to the raw defuse data based on validation carried out by Graham Bignell on the CTTV RNA-Seq cell lines data set. 
+The flag can be used to filter the data in downstream analysis with the aim of reducing the number of false positive fusions called. Details of the filter thresholds can be found in the constants section at the top of the script.
+
+=head1 SYNOPSIS
+
+defuse_filters.pl [options]
+
+  Required parameters:
+    -outdir    		-o   	Folder to output result to.
+    -sample   		-s   	Sample name
+    -input    		-i   	deFuse input file containing fusions called by the cgpRna pipeline.
+
+In the output file, a row with a 1 in the column cgp_defuse_filter means that this fusion has passed the set of filter thresholds whereas 0 means this fusion can potentially be filtered out.
diff --git a/perl/bin/defuse.pl → perl/bin/defuse_fusion.pl b/perl/bin/defuse.pl → perl/bin/defuse_fusion.pl
@@ -204,15 +204,15 @@ =head1 defuse.pl
 
 =head1 SYNOPSIS
 
-defuse.pl [options] [file(s)...]
+defuse_fusion.pl [options] [file(s)...]
 
   Required parameters:
     -outdir    		-o   	Folder to output result to.
     -sample   		-s   	Sample name
 
   Optional
-    -defuseconfig 	-d  	Name of the defuse config file. It should reside under /refdataloc/species/refbuild/genebuild/ [defuse-config-GRCh38-77.txt]
-    -normals  	  	-n  	File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions-b38]
+    -defuseconfig 	-d  	Name of the defuse config file. It should reside under /refdataloc/species/refbuild/defuse/genebuild/ [defuse-config.txt]
+    -normals  	  	-n  	File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions]
     -threads   		-t  	Number of cores to use. [1]
     -config   		-c  	Path to config.ini file. The file contains defaults for the reference data and deFuse software installation details [<cgpRna-install-location>/perl/config/defuse.ini]
     -refbuild 		-rb 	Reference assembly version. Can be UCSC or Ensembl format e.g. GRCh38 or hg38 [GRCh38] 
@@ -230,7 +230,7 @@ =head1 SYNOPSIS
     -version   		-v   	Version
 
   File list can be full file names or wildcard, e.g.
-    defuse.pl -t 16 -o myout -refbuild GRCh38 -genebuild 77 -s sample input/*.bam
+    defuse_fusion.pl -t 16 -o myout -refbuild GRCh38 -genebuild 77 -s sample input/*.bam
 
   Run with '-m' for possible input file types.
 

diff --git a/perl/bin/star_fusion.pl b/perl/bin/star_fusion.pl
@@ -213,12 +213,12 @@ =head1 SYNOPSIS
     -sample   		-s   	Sample name
 
   Optional
-    -gtffile 		-g  	GTF annotation file name which should be compatible with the refbuild and gene build versions. It should reside under /refdataloc/species/refbuild/star/genebuild/ [Homo_sapiens.GRCh38.77.gtf]
-    -normals  	  	-n  	File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions-b38]
+    -gtffile 		-g  	GTF annotation file name which should be compatible with the refbuild and gene build versions. It should reside under /refdataloc/species/refbuild/star/genebuild/ [ensembl.gtf]
+    -normals  	  	-n  	File containing list of gene fusions detected in normal samples. It should reside under /refdataloc/species/refbuild/ [normal-fusions]
     -threads   		-t  	Number of cores to use. [1]
     -config   		-c  	Path to config.ini file. It contains defaults for; the reference and gene build versions, star software and default star and star-fusion parameters [<cgpRna-install-location>/perl/config/star.ini]
     -refbuild 		-rb 	Reference assembly version. Can be UCSC or Ensembl format e.g. GRCh38 or hg38 [GRCh38] 
-    -genebuild 		-gb 	Gene build version. This needs to be consistent with the reference build in terms of the version and chromosome name style.[e77]
+    -genebuild 		-gb 	Gene build version. This needs to be consistent with the reference build in terms of the version and chromosome name style.[77]
     -refdataloc  	-r  	Parent directory of the reference data
     -species  		-sp 	Species [human]