From a9b57c8ac7bce33122c105e007688f6e15eba662 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 30 Jul 2019 11:37:11 +0000 Subject: [PATCH 1/9] Base function as described in #80 --- perl/lib/Sanger/CGP/Pindel/InputGen.pm | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen.pm b/perl/lib/Sanger/CGP/Pindel/InputGen.pm index d9a0fba..faa241f 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen.pm @@ -296,6 +296,16 @@ sub reads_to_disk { } } +sub corrupt_pindel_input { + my ($filename, $expect_bytes) = @_; + # !! not an object method !! + + # stub until tests written + # will return name of corrupt file or undef + + return undef; +} + 1; __END__ From 667ed59682140f14dcb0139144aae6b129319798 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 30 Jul 2019 12:07:00 +0000 Subject: [PATCH 2/9] Tests 3/4 fail as expected for current stage of development as decribed in #81 --- perl/t/data/inputGen-NUL.txt.gz | Bin 0 -> 42 bytes perl/t/data/inputGen-goodfile.txt.gz | Bin 0 -> 42 bytes perl/t/inputGen.t | 56 +++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 perl/t/data/inputGen-NUL.txt.gz create mode 100644 perl/t/data/inputGen-goodfile.txt.gz create mode 100644 perl/t/inputGen.t diff --git a/perl/t/data/inputGen-NUL.txt.gz b/perl/t/data/inputGen-NUL.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..6bc6f47578622b459982aba722221e352de24248 GIT binary patch literal 42 ycmb2|=3sbl=n%`m?0x!-XXpw4V2yKYyh06(f;?8~c +# +# This file is part of cgpPindel. +# +# cgpPindel is free software: you can redistribute it and/or modify it under +# the terms of the GNU Affero General Public License as published by the Free +# Software Foundation; either version 3 of the License, or (at your option) any +# later version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more +# details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . +########## LICENCE ########## + +use strict; +use Test::More; +use Test::Fatal; +use Const::Fast qw(const); +use FindBin qw($Bin); + +const my $MODULE => 'Sanger::CGP::Pindel::InputGen'; +const my $DATA => "$Bin/data"; + +my $obj; +subtest 'Initialisation checks' => sub { + use_ok($MODULE); +}; + +subtest 'corrupt_pindel_input checks' => sub { + # File of correct size (no NUL) + is( Sanger::CGP::Pindel::InputGen::corrupt_pindel_input("$DATA/inputGen-goodfile.txt.gz", 22), + undef, + 'corrupt_pindel_input - well formed compressed file, expected size'); + # File of incorrect size (no NUL) + is( Sanger::CGP::Pindel::InputGen::corrupt_pindel_input("$DATA/inputGen-goodfile.txt.gz", 9), + "$DATA/inputGen-goodfile.txt.gz", + 'corrupt_pindel_input - well formed compressed file, UNexpected size'); + # File of incorrect size + NUL + is( Sanger::CGP::Pindel::InputGen::corrupt_pindel_input("$DATA/inputGen-NUL.txt.gz", 22), + "$DATA/inputGen-NUL.txt.gz", + 'corrupt_pindel_input - NUL character in compressed file, expected size'); + # File of correct size + NUL + is( Sanger::CGP::Pindel::InputGen::corrupt_pindel_input("$DATA/inputGen-NUL.txt.gz", 9), + "$DATA/inputGen-NUL.txt.gz", + 'corrupt_pindel_input - NUL character in compressed file, UNexpected size'); +}; + +done_testing(); + From 04ac6b00ccfc5256e1d81bfed35ff8d5b89e6bb3 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 30 Jul 2019 12:23:29 +0000 Subject: [PATCH 3/9] Completed function as per #82, tests now pass --- perl/lib/Sanger/CGP/Pindel/InputGen.pm | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen.pm b/perl/lib/Sanger/CGP/Pindel/InputGen.pm index faa241f..d483fea 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen.pm @@ -53,6 +53,8 @@ const my $PAIRS_PER_THREAD => 500_000; const my $BAMCOLLATE => q{%s outputformat=sam colsbs=268435456 collate=1 classes=F,F2 exclude=DUP,SECONDARY,SUPPLEMENTARY T=%s filename=%s reference=%s inputformat=%s}; +const my $VERIFY_GENERATED => q{bash -c 'gzip -cd %s | tee >(grep -alP "\x00" || true) | wc -c'}; + sub new { my ($class, $bam, $exclude, $ref) = @_; my $self = {'rname_fhs' => {}, @@ -300,10 +302,25 @@ sub corrupt_pindel_input { my ($filename, $expect_bytes) = @_; # !! not an object method !! - # stub until tests written # will return name of corrupt file or undef + my $result = undef; + + my $command = sprintf $VERIFY_GENERATED, $filename; + my ($pid, $process, $bytes); + $pid = open $process, q{-|}, $command or croak 'Could not fork: '.$OS_ERROR; + while (my $tmp = <$process>) { + chomp $tmp; + $bytes = $tmp; + } + close $process; + if($bytes !~ m/^[0-9]+$/) { + croak "corrupt_pindel_input() doesn't appear to have generated valid number as a byte count"; + } + if ($bytes != $expect_bytes) { + $result = $filename; + } - return undef; + return $result; } 1; From b1077a45dfe02ef52d985f6fe40e5cc91d52311a Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 30 Jul 2019 12:27:23 +0000 Subject: [PATCH 4/9] Fix clear copy/paste error in legacy code --- perl/lib/Sanger/CGP/Pindel/InputGen.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen.pm b/perl/lib/Sanger/CGP/Pindel/InputGen.pm index d483fea..dad607b 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen.pm @@ -61,7 +61,7 @@ sub new { 'threads' => 1, }; bless $self, $class; $self->set_input($bam) if(defined $bam); - $self->set_reference($ref) if(defined $bam); + $self->set_reference($ref) if(defined $ref); $self->set_exclude($exclude) if(defined $exclude); return $self; } From e64b83dcc5464b87bed763025a51728ce895eb96 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 30 Jul 2019 12:53:57 +0000 Subject: [PATCH 5/9] Failing test as required by #83 --- perl/t/inputGen.t | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/perl/t/inputGen.t b/perl/t/inputGen.t index ab9437c..bc08be5 100644 --- a/perl/t/inputGen.t +++ b/perl/t/inputGen.t @@ -20,6 +20,7 @@ ########## LICENCE ########## use strict; +use File::Temp qw(tempdir); use Test::More; use Test::Fatal; use Const::Fast qw(const); @@ -28,6 +29,13 @@ use FindBin qw($Bin); const my $MODULE => 'Sanger::CGP::Pindel::InputGen'; const my $DATA => "$Bin/data"; +const my $RECORD_SET => [ + [ "@2:2428:29677:5760/1_RG461144\nGTTAGGGTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGATAGGAAGAGCACA\n+\t22\t10001\t38\t364\tSAMPLE", + "@2:2428:29677:5760/2_RG461144\nTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCAAACCCAAACCCAAACA\n-\t22\t10130\t38\t364\tSAMPLE" + ] +]; +const my $RECORD_OUT_BYTES => 385; + my $obj; subtest 'Initialisation checks' => sub { use_ok($MODULE); @@ -52,5 +60,16 @@ subtest 'corrupt_pindel_input checks' => sub { 'corrupt_pindel_input - NUL character in compressed file, UNexpected size'); }; +subtest 'reads_to_disk checks' => sub{ + $obj = new_ok($MODULE, + [ "$DATA/test.bam", + undef, + "$DATA/genome_22.fa"]); + my $out_folder = tempdir( CLEANUP => 1 ); + $obj->set_outdir($out_folder); + ok($obj->reads_to_disk($RECORD_SET), 'create output'); + is($obj->{'rname_bytes'}->{'22'}, $RECORD_OUT_BYTES, 'Verify bytes written captured'); +}; + done_testing(); From f8ee85250ae77e37e6ff0e656f684b07ce801c05 Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 30 Jul 2019 13:19:05 +0000 Subject: [PATCH 6/9] Resolves #84, tests function now function changes to capture bytes written --- perl/lib/Sanger/CGP/Pindel/InputGen.pm | 7 ++++++- perl/t/inputGen.t | 10 ++++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen.pm b/perl/lib/Sanger/CGP/Pindel/InputGen.pm index dad607b..5bb4dc5 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen.pm @@ -58,6 +58,7 @@ const my $VERIFY_GENERATED => q{bash -c 'gzip -cd %s | tee >(grep -alP "\x00" || sub new { my ($class, $bam, $exclude, $ref) = @_; my $self = {'rname_fhs' => {}, + 'rname_bytes' => {}, 'threads' => 1, }; bless $self, $class; $self->set_input($bam) if(defined $bam); @@ -281,6 +282,7 @@ sub reads_to_disk { } } my $rname_fh = $self->{'rname_fhs'}; + my $rname_bytes = $self->{'rname_bytes'}; for my $rname(keys %grouped) { my $mode = '>>'; unless(exists $rname_fh->{$rname}) { @@ -292,10 +294,13 @@ sub reads_to_disk { my $gzip = sprintf 'gzip --fast -c %s %s', $mode, $rname_fh->{$rname}; open my $fh, '|-', $gzip or die "Can't start gzip"; for my $record(@{$grouped{$rname}}) { - print $fh (join "\n", $record),"\n"; + my $to_write = (join "\n", $record)."\n"; + print $fh $to_write; + $rname_bytes->{$rname} += length $to_write; } close $fh; } + return 1; } sub corrupt_pindel_input { diff --git a/perl/t/inputGen.t b/perl/t/inputGen.t index bc08be5..2150eb8 100644 --- a/perl/t/inputGen.t +++ b/perl/t/inputGen.t @@ -30,11 +30,12 @@ const my $MODULE => 'Sanger::CGP::Pindel::InputGen'; const my $DATA => "$Bin/data"; const my $RECORD_SET => [ - [ "@2:2428:29677:5760/1_RG461144\nGTTAGGGTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGATAGGAAGAGCACA\n+\t22\t10001\t38\t364\tSAMPLE", - "@2:2428:29677:5760/2_RG461144\nTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCAAACCCAAACCCAAACA\n-\t22\t10130\t38\t364\tSAMPLE" + [ "\@2:2428:29677:5760/1_RG461144\nGTTAGGGTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGATAGGAAGAGCACA\n+\t22\t10001\t38\t364\tSAMPLE", + "\@2:2428:29677:5760/2_RG461144\nTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCAAACCCAAACCCAAACA\n-\t22\t10130\t38\t364\tSAMPLE" ] ]; -const my $RECORD_OUT_BYTES => 385; +# length of each record plus line feed each (added in function) +const my $RECORD_OUT_BYTES => (length $RECORD_SET->[0]->[0]) + (length $RECORD_SET->[0]->[1]) +2; my $obj; subtest 'Initialisation checks' => sub { @@ -65,7 +66,8 @@ subtest 'reads_to_disk checks' => sub{ [ "$DATA/test.bam", undef, "$DATA/genome_22.fa"]); - my $out_folder = tempdir( CLEANUP => 1 ); + my $out_folder = tempdir( 'pindelTests_XXXX', CLEANUP => 1 ); + $obj->set_outdir($out_folder); ok($obj->reads_to_disk($RECORD_SET), 'create output'); is($obj->{'rname_bytes'}->{'22'}, $RECORD_OUT_BYTES, 'Verify bytes written captured'); From cec9ea0013a7c08918c81792af1265359a664c3f Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 30 Jul 2019 13:36:24 +0000 Subject: [PATCH 7/9] Ready for detailed testing --- perl/bin/pindel_input_gen.pl | 1 + perl/lib/Sanger/CGP/Pindel/InputGen.pm | 17 +++++++++++++++++ perl/t/inputGen.t | 2 ++ 3 files changed, 20 insertions(+) diff --git a/perl/bin/pindel_input_gen.pl b/perl/bin/pindel_input_gen.pl index 5e9dadd..e8792ae 100755 --- a/perl/bin/pindel_input_gen.pl +++ b/perl/bin/pindel_input_gen.pl @@ -49,6 +49,7 @@ BEGIN $generator->set_threads($options->{'threads'}); $generator->set_outdir($options->{'outdir'}); $generator->run; + $generator->validate; } sub setup { diff --git a/perl/lib/Sanger/CGP/Pindel/InputGen.pm b/perl/lib/Sanger/CGP/Pindel/InputGen.pm index 5bb4dc5..8894305 100644 --- a/perl/lib/Sanger/CGP/Pindel/InputGen.pm +++ b/perl/lib/Sanger/CGP/Pindel/InputGen.pm @@ -183,6 +183,23 @@ sub run { }; } +sub validate { + my $self = shift; + my $rname_fh = $self->{'rname_fhs'}; # paths, not handles + my $rname_bytes = $self->{'rname_bytes'}; + my @bad_files; + for my $chr(keys %{$rname_bytes}) { + my $problem = corrupt_pindel_input($rname_fh->{$chr}, $rname_bytes->{$chr}); + if(defined $problem) { + push @bad_files, $problem; + } + } + if(@bad_files > 0) { + croak join "\t\n", (sprintf q{%d generated files are corrupt:}, scalar @bad_files), @bad_files; + } + return 1; +} + sub _process_set { my ($self, $rg_pis, $sample_name, $pairs) = @_; my $max_threads = $self->{'threads'}; diff --git a/perl/t/inputGen.t b/perl/t/inputGen.t index 2150eb8..2046531 100644 --- a/perl/t/inputGen.t +++ b/perl/t/inputGen.t @@ -71,6 +71,8 @@ subtest 'reads_to_disk checks' => sub{ $obj->set_outdir($out_folder); ok($obj->reads_to_disk($RECORD_SET), 'create output'); is($obj->{'rname_bytes'}->{'22'}, $RECORD_OUT_BYTES, 'Verify bytes written captured'); + + ok($obj->validate, 'validate returns true') }; done_testing(); From 34f36cfef99d23c2fd4e56d04d31bc94c397474a Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Tue, 30 Jul 2019 13:40:10 +0000 Subject: [PATCH 8/9] Detail changes, update date in file header --- CHANGES.md | 4 ++++ perl/bin/pindel_input_gen.pl | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index f8c75a8..1cc5e7e 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,5 +1,9 @@ # CHANGES +## 3.3.0 + +* I/O hardening, see [milestone 3](https://github.com/cancerit/cgpPindel/milestone/3) + ## 3.2.2 * Handle Input files that may have no reads at all, specifically an issue when generating a normal panel. diff --git a/perl/bin/pindel_input_gen.pl b/perl/bin/pindel_input_gen.pl index e8792ae..5bca2ca 100755 --- a/perl/bin/pindel_input_gen.pl +++ b/perl/bin/pindel_input_gen.pl @@ -1,7 +1,7 @@ #!/usr/bin/perl ########## LICENCE ########## -# Copyright (c) 2014-2018 Genome Research Ltd. +# Copyright (c) 2014-2019 Genome Research Ltd. # # Author: CASM/Cancer IT # From 4d50da329327fca51e61f1fedc7dafbe539a06fa Mon Sep 17 00:00:00 2001 From: Keiran Raine Date: Wed, 31 Jul 2019 09:53:50 +0000 Subject: [PATCH 9/9] Versions for release tagging --- Dockerfile | 2 +- perl/lib/Sanger/CGP/Pindel.pm | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 243d258..950a6bc 100644 --- a/Dockerfile +++ b/Dockerfile @@ -37,7 +37,7 @@ FROM ubuntu:16.04 LABEL maintainer="cgphelp@sanger.ac.uk" \ uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Trust Sanger Institute" \ - version="v3.2.2" \ + version="v3.0.0" \ description="cgpPindel docker" RUN apt-get -yq update diff --git a/perl/lib/Sanger/CGP/Pindel.pm b/perl/lib/Sanger/CGP/Pindel.pm index 61849c5..7b518d1 100644 --- a/perl/lib/Sanger/CGP/Pindel.pm +++ b/perl/lib/Sanger/CGP/Pindel.pm @@ -26,7 +26,7 @@ use strict; use Const::Fast qw(const); use base 'Exporter'; -our $VERSION = '3.2.2'; +our $VERSION = '3.3.0'; our @EXPORT = qw($VERSION); 1;