Skip to content

Commit

Permalink
Merge branch 'release/v3.3.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
keiranmraine committed Jul 31, 2019
2 parents e142055 + 4d50da3 commit 0d82d09
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 5 deletions.
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# CHANGES

## 3.3.0

* I/O hardening, see [milestone 3](https://github.com/cancerit/cgpPindel/milestone/3)

## 3.2.2

* Handle Input files that may have no reads at all, specifically an issue when generating a normal panel.
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ FROM ubuntu:16.04

LABEL maintainer="cgphelp@sanger.ac.uk" \
uk.ac.sanger.cgp="Cancer, Ageing and Somatic Mutation, Wellcome Trust Sanger Institute" \
version="v3.2.2" \
version="v3.3.0" \
description="cgpPindel docker"

RUN apt-get -yq update
Expand Down
3 changes: 2 additions & 1 deletion perl/bin/pindel_input_gen.pl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/perl

########## LICENCE ##########
# Copyright (c) 2014-2018 Genome Research Ltd.
# Copyright (c) 2014-2019 Genome Research Ltd.
#
# Author: CASM/Cancer IT <cgphelp@sanger.ac.uk>
#
Expand Down Expand Up @@ -49,6 +49,7 @@ BEGIN
$generator->set_threads($options->{'threads'});
$generator->set_outdir($options->{'outdir'});
$generator->run;
$generator->validate;
}

sub setup {
Expand Down
2 changes: 1 addition & 1 deletion perl/lib/Sanger/CGP/Pindel.pm
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ use strict;
use Const::Fast qw(const);

use base 'Exporter';
our $VERSION = '3.2.2';
our $VERSION = '3.3.0';
our @EXPORT = qw($VERSION);

1;
53 changes: 51 additions & 2 deletions perl/lib/Sanger/CGP/Pindel/InputGen.pm
Original file line number Diff line number Diff line change
Expand Up @@ -53,13 +53,16 @@ const my $PAIRS_PER_THREAD => 500_000;

# sprintf template for the collation command. It has five %s placeholders:
# the first is presumably the executable to run (TODO confirm against the
# caller); the other four fill the T=, filename=, reference= and inputformat=
# options. Output is requested as SAM with duplicate, secondary and
# supplementary records excluded.
const my $BAMCOLLATE => q{%s outputformat=sam colsbs=268435456 collate=1 classes=F,F2 exclude=DUP,SECONDARY,SUPPLEMENTARY T=%s filename=%s reference=%s inputformat=%s};

# sprintf template for the integrity check run by corrupt_pindel_input();
# the single %s is the gzip file to inspect. The pipeline decompresses the
# file, tees the stream past a grep for NUL bytes ("|| true" stops a
# no-match grep from counting as a failure) and finally counts bytes with wc -c.
# NOTE(review): grep's -l output presumably lands in the same pipe that wc
# reads, so a NUL hit inflates the count and the size comparison fails -
# confirm against bash process-substitution semantics.
const my $VERIFY_GENERATED => q{bash -c 'gzip -cd %s | tee >(grep -alP "\x00" || true) | wc -c'};

# Constructor.
#
# Arguments (all optional, positional):
#   $bam     - input file, handed to set_input()
#   $exclude - exclusion value, handed to set_exclude()
#   $ref     - reference, handed to set_reference()
#
# Each setter is invoked only when its own argument is defined.
# Returns the blessed object, initialised with empty 'rname_fhs' and
# 'rname_bytes' maps and a 'threads' value of 1.
sub new {
  my ($class, $bam, $exclude, $ref) = @_;
  my $self = {'rname_fhs' => {},
              'rname_bytes' => {},
              'threads' => 1, };
  bless $self, $class;
  $self->set_input($bam) if(defined $bam);
  # Guard on $ref itself and call the setter once only: keying this on $bam
  # hands undef to set_reference() whenever a BAM is given without a reference.
  $self->set_reference($ref) if(defined $ref);
  $self->set_exclude($exclude) if(defined $exclude);
  return $self;
}
Expand Down Expand Up @@ -180,6 +183,23 @@ sub run {
};
}

# Verify every file recorded in 'rname_bytes' against the number of bytes
# that were counted for it, using corrupt_pindel_input().
#
# Returns 1 when no file is reported as a problem.
# Croaks with a summary line followed by one tab-indented path per bad file.
sub validate {
  my $self = shift;
  my $rname_fh = $self->{'rname_fhs'}; # paths, not handles
  my $rname_bytes = $self->{'rname_bytes'};
  my @bad_files;
  # sorted so the failure report lists files in the same order on every run
  for my $chr(sort keys %{$rname_bytes}) {
    my $problem = corrupt_pindel_input($rname_fh->{$chr}, $rname_bytes->{$chr});
    push @bad_files, $problem if(defined $problem);
  }
  if(@bad_files > 0) {
    # newline THEN tab: each path starts on its own indented line
    croak join "\n\t", (sprintf q{%d generated files are corrupt:}, scalar @bad_files), @bad_files;
  }
  return 1;
}

sub _process_set {
my ($self, $rg_pis, $sample_name, $pairs) = @_;
my $max_threads = $self->{'threads'};
Expand Down Expand Up @@ -279,6 +299,7 @@ sub reads_to_disk {
}
}
my $rname_fh = $self->{'rname_fhs'};
my $rname_bytes = $self->{'rname_bytes'};
for my $rname(keys %grouped) {
my $mode = '>>';
unless(exists $rname_fh->{$rname}) {
Expand All @@ -290,10 +311,38 @@ sub reads_to_disk {
my $gzip = sprintf 'gzip --fast -c %s %s', $mode, $rname_fh->{$rname};
open my $fh, '|-', $gzip or die "Can't start gzip";
for my $record(@{$grouped{$rname}}) {
print $fh (join "\n", $record),"\n";
my $to_write = (join "\n", $record)."\n";
print $fh $to_write;
$rname_bytes->{$rname} += length $to_write;
}
close $fh;
}
return 1;
}

# Check one generated gzip file against the byte count recorded while writing.
#
# Plain function, NOT an object method.
#
# Arguments:
#   $filename     - path of the gzip file to inspect
#   $expect_bytes - number of bytes the verification pipeline should report
#
# Returns $filename when the reported count differs from $expect_bytes,
# otherwise undef. The undef is returned explicitly (not via a bare return)
# so that callers using this in list context still receive exactly one value.
#
# Croaks if the check command cannot be started or if it does not yield a
# plain non-negative integer.
#
# NOTE(review): $filename is interpolated into a shell command without
# quoting, so paths containing whitespace or shell metacharacters will break.
sub corrupt_pindel_input {
  my ($filename, $expect_bytes) = @_;
  # !! not an object method !!

  my $command = sprintf $VERIFY_GENERATED, $filename;
  open my $process, q{-|}, $command or croak 'Could not fork: '.$OS_ERROR;
  # only the last line of output is kept as the byte count
  my $bytes;
  while (my $tmp = <$process>) {
    chomp $tmp;
    $bytes = $tmp;
  }
  close $process;

  # Some wc implementations (BSD/macOS) pad the count with blanks; trim it
  # so a valid count is not rejected by the digits-only check below.
  $bytes =~ s/^\s+|\s+$//g if(defined $bytes);

  # no output at all is treated the same as malformed output
  if(!defined $bytes || $bytes !~ m/^[0-9]+$/) {
    croak "corrupt_pindel_input() doesn't appear to have generated valid number as a byte count";
  }

  return $bytes != $expect_bytes ? $filename : undef;
}

1;
Expand Down
Binary file added perl/t/data/inputGen-NUL.txt.gz
Binary file not shown.
Binary file added perl/t/data/inputGen-goodfile.txt.gz
Binary file not shown.
79 changes: 79 additions & 0 deletions perl/t/inputGen.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
########## LICENCE ##########
# Copyright (c) 2019 Genome Research Ltd.
#
# Author: CASM/Cancer IT <cgphelp@sanger.ac.uk>
#
# This file is part of cgpPindel.
#
# cgpPindel is free software: you can redistribute it and/or modify it under
# the terms of the GNU Affero General Public License as published by the Free
# Software Foundation; either version 3 of the License, or (at your option) any
# later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
# details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
########## LICENCE ##########

use strict;
use File::Temp qw(tempdir);
use Test::More;
use Test::Fatal;
use Const::Fast qw(const);
use FindBin qw($Bin);

# Package under test and the directory holding the fixture files
# (resolved relative to this test script via $Bin).
const my $MODULE => 'Sanger::CGP::Pindel::InputGen';
const my $DATA => "$Bin/data";

# Input handed to reads_to_disk(): a single inner array holding two records
# (the /1 and /2 reads of one pair). Each record is three newline-separated
# lines; the last line is tab-separated and its second field is "22",
# presumably the reference name the records are grouped under (the test
# below checks the 'rname_bytes' entry keyed '22') - TODO confirm.
const my $RECORD_SET => [
[ "\@2:2428:29677:5760/1_RG461144\nGTTAGGGTTAGGGTTAGGGTTGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAAGATAGGAAGAGCACA\n+\t22\t10001\t38\t364\tSAMPLE",
"\@2:2428:29677:5760/2_RG461144\nTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCTAACCCAACCCAAACCCAAACCCAAACA\n-\t22\t10130\t38\t364\tSAMPLE"
]
];
# Expected byte total for the set above: the length of each of the two
# records plus one trailing line feed per record (the line feed is added
# by the function under test, hence the +2).
const my $RECORD_OUT_BYTES => (length $RECORD_SET->[0]->[0]) + (length $RECORD_SET->[0]->[1]) +2;

# File-scoped handle for the object under test; it is assigned inside the
# 'reads_to_disk checks' subtest further down.
my $obj;
# Confirms that the module under test can be loaded.
subtest 'Initialisation checks' => sub {
use_ok($MODULE);
};

subtest 'corrupt_pindel_input checks' => sub {
  my $good_file = "$DATA/inputGen-goodfile.txt.gz";
  my $nul_file  = "$DATA/inputGen-NUL.txt.gz";

  # One row per case:
  #   [ fixture, byte count passed in, expected return value, test name ]
  # The function returns undef for an acceptable file and the file's own
  # path otherwise. The NUL fixture must be rejected whichever byte count
  # is supplied.
  my @cases = (
    [ $good_file, 22, undef,      'corrupt_pindel_input - well formed compressed file, expected size' ],
    [ $good_file, 9,  $good_file, 'corrupt_pindel_input - well formed compressed file, UNexpected size' ],
    [ $nul_file,  22, $nul_file,  'corrupt_pindel_input - NUL character in compressed file, expected size' ],
    [ $nul_file,  9,  $nul_file,  'corrupt_pindel_input - NUL character in compressed file, UNexpected size' ],
  );

  for my $case (@cases) {
    my ($fixture, $byte_count, $wanted, $label) = @{$case};
    my $got = Sanger::CGP::Pindel::InputGen::corrupt_pindel_input($fixture, $byte_count);
    is($got, $wanted, $label);
  }
};

# End-to-end check of the write path: construct the object, write the
# fixture record set to a scratch directory, confirm the byte counter for
# reference '22' matches the expected total, then run validate() over the
# files that were produced.
subtest 'reads_to_disk checks' => sub{
  $obj = new_ok($MODULE,
                [ "$DATA/test.bam",
                  undef,
                  "$DATA/genome_22.fa"]);
  # TMPDIR => 1 places the scratch directory under the system temp area
  # instead of the current working directory, so the test neither needs a
  # writable checkout nor leaves litter there if it is interrupted.
  my $out_folder = tempdir( 'pindelTests_XXXX', TMPDIR => 1, CLEANUP => 1 );

  $obj->set_outdir($out_folder);
  ok($obj->reads_to_disk($RECORD_SET), 'create output');
  is($obj->{'rname_bytes'}->{'22'}, $RECORD_OUT_BYTES, 'Verify bytes written captured');

  ok($obj->validate, 'validate returns true');
};

done_testing();

0 comments on commit 0d82d09

Please sign in to comment.