-
Notifications
You must be signed in to change notification settings - Fork 16
/
bed2gene2symbol.pl
executable file
·105 lines (75 loc) · 3.2 KB
/
bed2gene2symbol.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env perl
use strict;
use warnings;
use File::Basename;
use MyConfig;
use Getopt::Long;
my $prog = basename ($0);
my $cmdDir = dirname ($0);
my $cache = getDefaultCache($prog);
my $verbose = 0;
my $separateStrand = 0;
my $noRegionId = 0;
my $big = 0;
GetOptions ("cache:s"=>\$cache,
"ss"=>\$separateStrand,
"no-region-id"=>\$noRegionId,
"big"=>\$big,
"v"=>\$verbose);
if (@ARGV != 5)
{
print "mapping regions to genes and symbols\n";
print "Usage: $prog [options] <in.bed> <refGene_knownGene.bed> <refGene_knownGene_To_gene.txt> <gene2symbol.txt> <out.txt>\n";
print " -ss : separate strand\n";
print " -big : big file\n";
print " --no-region-id : do not print region id\n";
print " -cache [string]: cache dir ($cache)\n";
print " -v : verbose\n";
exit (0);
}
my ($inBedFile, $transcriptBedFile, $transcriptToGeneFile, $gene2SymbolFile, $outFile) = @ARGV;
system ("mkdir $cache");
my $bigFlag = $big ? '-big' : '';
my $verboseFlag = $verbose ? '-v' : '';
my $tagVSTs = "$cache/in.vs.ts.bed";
my $ssFlag = $separateStrand ? "-ss" : " ";
system ("perl $cmdDir/tagoverlap.pl $bigFlag $ssFlag $verboseFlag -region $transcriptBedFile $inBedFile $tagVSTs");
Carp::croak "cmd failed\n" unless -f $tagVSTs;
my $tagVSTsIdpair = "$cache/in.vs.ts.idpair";
my $cmd = "awk \'{print \$4}\' $tagVSTs | awk -F \"//\" \'{print \$1\"\\t\"\$2}\' > $tagVSTsIdpair";
system ($cmd);
Carp::croak "cmd=$cmd failed\n" unless -f $tagVSTsIdpair;
my $tagVSTs_Tsid = "$cache/in.vs.ts.tsid";
$cmd = "awk \'{print \$2}\' $tagVSTsIdpair > $tagVSTs_Tsid";
system ($cmd);
my $tsidToGeneId = "$cache/tsid2geneid";
$cmd = "perl $cmdDir/selectRow.pl -p -pt \"\" -s $transcriptToGeneFile $tagVSTs_Tsid > $tsidToGeneId";
system ($cmd);
Carp::croak "cmd=$cmd failed\n" unless -f $tsidToGeneId;
my $tag2gene = "$cache/tag2gene";
$cmd = "paste $tagVSTsIdpair $tsidToGeneId | awk \'{if (NF>3) {print \$1\"\\t\"\$4}}' | sort | uniq > $tag2gene";
system ($cmd);
Carp::croak "cmd=$cmd failed\n" unless -f $tag2gene;
my $tag2geneUniq = "$cache/tag2gene.uniq";
$cmd = "perl $cmdDir/uniqRow.pl $verboseFlag $tag2gene $tag2geneUniq";
system ($cmd);
Carp::croak "cmd=$cmd failed\n" unless -f $tag2geneUniq;
my $tag2geneUniq_geneid = "$cache/tag2gene.uniq.geneid";
$cmd = "awk \'{print \$2}\' $tag2geneUniq > $tag2geneUniq_geneid";
system ($cmd);
my $gene2symbol = "$cache/gene2symbol";
$cmd = "perl $cmdDir/selectRow.pl -p -pt \"\" -s $gene2SymbolFile $tag2geneUniq_geneid > $gene2symbol";
system ($cmd);
Carp::croak "cmd=$cmd failed\n" unless -f $gene2symbol;
my $tag2gene2symbol = "$cache/tag2gene2symbol";
$cmd = "paste $tag2geneUniq $gene2symbol | awk \'{print \$1\"\\t\"\$3\"\\t\"\$4}\' > $tag2gene2symbol";
system ($cmd);
Carp::croak "cmd=$cmd failed\n" unless -f $tag2gene2symbol;
my $tagid = "$cache/tagid";
$cmd = "awk \'{print \$4}\' $inBedFile > $tagid";
system ($cmd);
Carp::croak "cmd=$cmd failed\n" unless -f $tagid;
$cmd = "perl $cmdDir/selectRow.pl -p -pt \"\" -s $tag2gene2symbol $tagid | awk '{print \$1\"\\t\"\$2\"\\t\"\$3}' > $outFile";
$cmd = "perl $cmdDir/selectRow.pl -p -pt \"\" -s $tag2gene2symbol $tagid | awk '{print \$2\"\\t\"\$3}' > $outFile" if $noRegionId;
system ($cmd);
system ("rm -rf $cache");