diff --git a/_countit/batch_bed2wig.pl b/_countit/batch_bed2wig.pl index 32ee9af..ec48a9e 100755 --- a/_countit/batch_bed2wig.pl +++ b/_countit/batch_bed2wig.pl @@ -8,6 +8,8 @@ use Carp; use Data::Dumper; + +use MyConfig; use Quantas; my $prog = basename ($0); @@ -18,7 +20,13 @@ my $suffix = ""; my $inputFormat = "bed"; my $weight = ""; -my $cache = ""; +my $qsub = 0; +my $queueName = ""; +my $jobName = "job"; + +my $cache = getDefaultCache ($prog); +my $keepCache = 0; + #my $method = "mean"; #sum @@ -27,6 +35,9 @@ "weight"=>\$weight, "base:s"=>\$base, "suffix:s"=>\$suffix, + "qsub"=>\$qsub, + "queue:s"=>\$queueName, + "job:s"=>\$jobName, "cache:s"=>\$cache, "v|verbose"=>\$verbose ); @@ -35,12 +46,16 @@ { print "generate expression matrix\n"; print "Usage $prog [options] \n"; - print " -format [string] : input format ([bed]|bedgraph), gz or bz2 files are accepted\n"; - print " -weight [string] : weight individual tags (for bed files only; bedgraphs are always weighted)\n"; - print " -base [string] : base dir of input data\n"; - print " -suffix [string] : add suffix to the file names\n"; - print " -cache [string] : cache dir\n"; - print " -v : verbose\n"; + print " --format [string] : input format ([bed]|bedgraph), gz or bz2 files are accepted\n"; + print " --weight [string] : weight individual tags (for bed files only; bedgraphs are always weighted)\n"; + print " --base [string] : base dir of input data\n"; + print " --suffix [string] : add suffix to the file names\n"; + print " --qsub : run jobs using sge\n"; + print " --queue [string] : queque name\n"; + print " --job [string] : job name\n"; + print " --cache [string] : cache dir\n"; + print " --keep-cache : keep cache dir\n"; + print " -v : verbose\n"; exit (1); } @@ -51,7 +66,6 @@ Carp::croak "incorrect input format: $inputFormat\n" if $inputFormat ne 'bed' && $inputFormat ne 'bedgraph'; - print "loading configuration file from $configFile ...\n" if $verbose; Carp::croak "contig file $configFile does not exist\n" unless -f $configFile; my $groups = readExprConfigFile ($configFile, $base, $suffix); @@ -64,7 +78,19 @@ print "generating wig files for each group...\n" if $verbose; my @groupNames = sort {$groups->{$a}->{"id"} <=> $groups->{$b}->{"id"}} keys %$groups; +system ("mkdir $cache"); + my $iter = 0; +my $scriptListFile = "$cache/script.list"; +my $scriptDir = "$cache/scripts"; + +my $fout; +if ($qsub) +{ + open ($fout, ">$scriptListFile") || Carp::croak "cannot open $scriptListFile to write\n"; + system ("mkdir $scriptDir"); +} + foreach my $gName (@groupNames) { my $samples = $groups->{$gName}->{"samples"}; @@ -80,29 +106,64 @@ $cmd = "bzcat "; } - foreach my $s (@$samples) + my $outputFile = "$outDir/$gName"; + $outputFile .= $suffix if $suffix ne ""; + + if (@$samples == 1 && $inputFormat eq 'bedgraph') { - print "$iter: group=$gName, sample=$s\n" if $verbose; - my $inputFile = $s; + #only one bedgraph file, we just make a copy + my $inputFile = $samples->[0]; $inputFile = "$base/$inputFile" if $base ne ''; + + $cmd = "cp $inputFile $outputFile"; + } + else + { + foreach my $s (@$samples) + { + print "$iter: group=$gName, sample=$s\n" if $verbose; + my $inputFile = $s; + $inputFile = "$base/$inputFile" if $base ne ''; - $cmd .= " $inputFile"; + $cmd .= " $inputFile"; + } + + if ($inputFormat eq 'bedgraph') + { + $cmd .= " | awk '{print \$1\"\\t\"\$2\"\\t\"\$3\"\\tt\\t\"\$4}'"; + } + + my $weightFlag = $inputFormat eq 'bedgraph' || $weight ? '--weight' : ''; + $cmd .= " | perl $cmdDir/tag2profile.pl $verboseFlag -c $cache/tag2profile_$iter $weightFlag -big -exact -of bedgraph - $outputFile"; } - if ($inputFormat eq 'bedgraph') + print $cmd, "\n"; + + if ($qsub) { - $cmd .= " | awk '{print \$1\"\\t\"\$2\"\\t\"\$3\"\\tt\\t\"\$4}'"; + my $scriptFile = "$scriptDir/script_$iter.sh"; + my $fout2; + open ($fout2, ">$scriptFile") || Carp::croak "cannot open file $scriptFile to write\n"; + print $fout2 $cmd, "\n"; + close ($fout2); + print $fout $scriptFile, "\n"; } + else + { + my $ret = system ($cmd); + Carp::croak "CMD=$cmd failed:$?\n" if $ret != 0; + } + $iter++; +} - my $weightFlag = $inputFormat eq 'bedgraph' || $weight ? '--weight' : ''; - my $cacheFlag = $cache ne ''? "-c $cache" : ''; - $cmd .= " | perl $cmdDir/tag2profile.pl $verboseFlag $cacheFlag $weightFlag -big -exact -of bedgraph - $outDir/$gName.wig"; - - print $cmd, "\n"; - my $ret = system ($cmd); - Carp::croak "CMD=$cmd failed:$?\n" if $ret != 0; +if ($qsub) +{ + close ($fout); - $iter++; + my $cmd = "perl $cmdDir/batchQsub.pl $verboseFlag -c $cache/qsub -j $jobName -q $queueName --wait $scriptListFile"; + print $cmd, "\n" if $verbose; + system ($cmd); } +system ("rm -rf $cache") unless $keepCache; diff --git a/_countit/gen_expression_matrix.pl b/_countit/gen_expression_matrix.pl index 3975246..161f5b3 100755 --- a/_countit/gen_expression_matrix.pl +++ b/_countit/gen_expression_matrix.pl @@ -50,18 +50,21 @@ print "done.\n" if $verbose; -print "loading data of individual samples ...\n" if $verbose; +print "loading data of individual samples and aggregating samples in the same group...\n" if $verbose; -my %sampleData; +#my %sampleData; my $geneId; my $n = 0; my $iter = 0; my $geneInfo; my @groupNames = sort {$groups->{$a}->{"id"} <=> $groups->{$b}->{"id"}} keys %$groups; +my @groupData; -foreach my $gName (@groupNames) +for (my $g = 0; $g < @groupNames; $g++) { + my $gName = $groupNames[$g]; + my $samples = $groups->{$gName}->{"samples"}; foreach my $s (@$samples) { @@ -78,28 +81,7 @@ { $n = @$geneInfo; } - $sampleData{$s} = $sdata->{"data"}; - $iter++; - } -} - -print "$iter samples, $n genes loaded.\n" if $verbose; - - -print "aggregating samples in the same group ...\n" if $verbose; - - -my @groupData; - -for (my $g = 0; $g < @groupNames; $g++) -{ - my $gName = $groupNames[$g]; - my $samples = $groups->{$gName}->{"samples"}; - - foreach my $s (@$samples) - { - print "sample=$s\n" if $verbose; - my $data = $sampleData{$s}; + my $data = $sdata->{"data"}; for (my $i = 0; $i < $n; $i++) { my $d = $data->[$i]; @@ -107,9 +89,11 @@ $groupData[$g][$i][0] += $d->[0]; #tagNum $groupData[$g][$i][1] += $d->[1]; #RPKM } + $iter++; } } +print "$iter samples, $n genes loaded.\n" if $verbose; my $fout;