Skip to content

Commit

Permalink
modified: _countit/batch_bed2wig.pl
Browse files Browse the repository at this point in the history
	modified:   _countit/gen_expression_matrix.pl
  • Loading branch information
zhangchaolin committed Jun 10, 2024
1 parent 1ce55d8 commit 8a7b946
Show file tree
Hide file tree
Showing 2 changed files with 92 additions and 47 deletions.
105 changes: 83 additions & 22 deletions _countit/batch_bed2wig.pl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

use Carp;
use Data::Dumper;

use MyConfig;
use Quantas;

my $prog = basename ($0);
Expand All @@ -18,7 +20,13 @@
my $suffix = "";
my $inputFormat = "bed";
my $weight = "";
my $cache = "";
my $qsub = 0;
my $queueName = "";
my $jobName = "job";

my $cache = getDefaultCache ($prog);
my $keepCache = 0;


#my $method = "mean"; #sum

Expand All @@ -27,6 +35,9 @@
"weight"=>\$weight,
"base:s"=>\$base,
"suffix:s"=>\$suffix,
"qsub"=>\$qsub,
"queue:s"=>\$queueName,
"job:s"=>\$jobName,
"cache:s"=>\$cache,
"v|verbose"=>\$verbose
);
Expand All @@ -35,12 +46,16 @@
{
print "generate expression matrix\n";
print "Usage $prog [options] <in.conf> <outdir>\n";
print " -format [string] : input format ([bed]|bedgraph), gz or bz2 files are accepted\n";
print " -weight [string] : weight individual tags (for bed files only; bedgraphs are always weighted)\n";
print " -base [string] : base dir of input data\n";
print " -suffix [string] : add suffix to the file names\n";
print " -cache [string] : cache dir\n";
print " -v : verbose\n";
print " --format [string] : input format ([bed]|bedgraph), gz or bz2 files are accepted\n";
print " --weight [string] : weight individual tags (for bed files only; bedgraphs are always weighted)\n";
print " --base [string] : base dir of input data\n";
print " --suffix [string] : add suffix to the file names\n";
print " --qsub : run jobs using sge\n";
print " --queue [string] : queque name\n";
print " --job [string] : job name\n";
print " --cache [string] : cache dir\n";
print " --keep-cache : keep cache dir\n";
print " -v : verbose\n";
exit (1);
}

Expand All @@ -51,7 +66,6 @@
Carp::croak "incorrect input format: $inputFormat\n" if $inputFormat ne 'bed' && $inputFormat ne 'bedgraph';



print "loading configuration file from $configFile ...\n" if $verbose;
Carp::croak "contig file $configFile does not exist\n" unless -f $configFile;
my $groups = readExprConfigFile ($configFile, $base, $suffix);
Expand All @@ -64,7 +78,19 @@
print "generating wig files for each group...\n" if $verbose;
my @groupNames = sort {$groups->{$a}->{"id"} <=> $groups->{$b}->{"id"}} keys %$groups;

system ("mkdir $cache");

my $iter = 0;
my $scriptListFile = "$cache/script.list";
my $scriptDir = "$cache/scripts";

my $fout;
if ($qsub)
{
open ($fout, ">$scriptListFile") || Carp::croak "cannot open $scriptListFile to write\n";
system ("mkdir $scriptDir");
}

foreach my $gName (@groupNames)
{
my $samples = $groups->{$gName}->{"samples"};
Expand All @@ -80,29 +106,64 @@
$cmd = "bzcat ";
}

foreach my $s (@$samples)
my $outputFile = "$outDir/$gName";
$outputFile .= $suffix if $suffix ne "";

if (@$samples == 1 && $inputFormat eq 'bedgraph')
{
print "$iter: group=$gName, sample=$s\n" if $verbose;
my $inputFile = $s;
#only one bedgraph file, we just make a copy
my $inputFile = $samples->[0];
$inputFile = "$base/$inputFile" if $base ne '';

$cmd = "cp $inputFile $outputFile";
}
else
{
foreach my $s (@$samples)
{
print "$iter: group=$gName, sample=$s\n" if $verbose;
my $inputFile = $s;
$inputFile = "$base/$inputFile" if $base ne '';

$cmd .= " $inputFile";
$cmd .= " $inputFile";
}

if ($inputFormat eq 'bedgraph')
{
$cmd .= " | awk '{print \$1\"\\t\"\$2\"\\t\"\$3\"\\tt\\t\"\$4}'";
}

my $weightFlag = $inputFormat eq 'bedgraph' || $weight ? '--weight' : '';
$cmd .= " | perl $cmdDir/tag2profile.pl $verboseFlag -c $cache/tag2profile_$iter $weightFlag -big -exact -of bedgraph - $outputFile";
}

if ($inputFormat eq 'bedgraph')
print $cmd, "\n";

if ($qsub)
{
$cmd .= " | awk '{print \$1\"\\t\"\$2\"\\t\"\$3\"\\tt\\t\"\$4}'";
my $scriptFile = "$scriptDir/script_$iter.sh";
my $fout2;
open ($fout2, ">$scriptFile") || Carp::croak "cannot open file $scriptFile to write\n";
print $fout2 $cmd, "\n";
close ($fout2);
print $fout $scriptFile, "\n";
}
else
{
my $ret = system ($cmd);
Carp::croak "CMD=$cmd failed:$?\n" if $ret != 0;
}
$iter++;
}

my $weightFlag = $inputFormat eq 'bedgraph' || $weight ? '--weight' : '';
my $cacheFlag = $cache ne ''? "-c $cache" : '';
$cmd .= " | perl $cmdDir/tag2profile.pl $verboseFlag $cacheFlag $weightFlag -big -exact -of bedgraph - $outDir/$gName.wig";

print $cmd, "\n";
my $ret = system ($cmd);
Carp::croak "CMD=$cmd failed:$?\n" if $ret != 0;
if ($qsub)
{
close ($fout);

$iter++;
my $cmd = "perl $cmdDir/batchQsub.pl $verboseFlag -c $cache/qsub -j $jobName -q $queueName --wait $scriptListFile";
print $cmd, "\n" if $verbose;
system ($cmd);
}

system ("rm -rf $cache") unless $keepCache;

34 changes: 9 additions & 25 deletions _countit/gen_expression_matrix.pl
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,21 @@

print "done.\n" if $verbose;

print "loading data of individual samples ...\n" if $verbose;
print "loading data of individual samples and aggregating samples in the same group...\n" if $verbose;

my %sampleData;
#my %sampleData;
my $geneId;
my $n = 0;
my $iter = 0;
my $geneInfo;

my @groupNames = sort {$groups->{$a}->{"id"} <=> $groups->{$b}->{"id"}} keys %$groups;
my @groupData;

foreach my $gName (@groupNames)
for (my $g = 0; $g < @groupNames; $g++)
{
my $gName = $groupNames[$g];

my $samples = $groups->{$gName}->{"samples"};
foreach my $s (@$samples)
{
Expand All @@ -78,38 +81,19 @@
{
$n = @$geneInfo;
}
$sampleData{$s} = $sdata->{"data"};
$iter++;
}
}

print "$iter samples, $n genes loaded.\n" if $verbose;


print "aggregating samples in the same group ...\n" if $verbose;


my @groupData;

for (my $g = 0; $g < @groupNames; $g++)
{
my $gName = $groupNames[$g];
my $samples = $groups->{$gName}->{"samples"};

foreach my $s (@$samples)
{
print "sample=$s\n" if $verbose;
my $data = $sampleData{$s};
my $data = $sdata->{"data"};
for (my $i = 0; $i < $n; $i++)
{
my $d = $data->[$i];

$groupData[$g][$i][0] += $d->[0]; #tagNum
$groupData[$g][$i][1] += $d->[1]; #RPKM
}
$iter++;
}
}

print "$iter samples, $n genes loaded.\n" if $verbose;

my $fout;

Expand Down

0 comments on commit 8a7b946

Please sign in to comment.