-
Notifications
You must be signed in to change notification settings - Fork 16
/
simulateCIMS.pl
executable file
·161 lines (127 loc) · 3.65 KB
/
simulateCIMS.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use File::Basename;
use Carp;
use Bed;
my $prog = basename ($0);
my $verbose = 0;
my $mutationSize = 1;
my $numSites = 0;
my $CIMSPositionFile = "";
my $rand_seed = 0;
GetOptions ("n:i"=>\$numSites,
'w:i'=>\$mutationSize,
'p:s'=>\$CIMSPositionFile,
'srand:f'=>\$rand_seed,
"v"=>\$verbose);
if (@ARGV != 2)
{
print "simulate CLIP mismatch sites\n";
print "Usage: $prog [options] <in.bed> <out.bed>\n";
print " -n [int] : number of mismatches\n";
print " -w [int] : mutation size ($mutationSize)\n";
print " -p [file] : specify the position of mismatch sites relative to read start\n";
print " -srand [float]: seed of random number (0=no fixed seeds)\n";
print " -v : verbose\n";
exit (0);
}
my ($inBedFile, $outBedFile) = @ARGV;
Carp::croak "-n or -p must be specified\n" if $numSites == 0 && $CIMSPositionFile eq '';
my $numTags = `grep -v \"track=\" $inBedFile | wc -l`;
chomp $numTags;
$numTags =~/^\s*(\d+)/;
$numTags = $1;
print "$numTags tags detected in $inBedFile\n" if $verbose;
my @CIMSPositions;
if (-f $CIMSPositionFile)
{
print "loading CIMS positions ...\n" if $verbose;
my $fin;
open ($fin, "<$CIMSPositionFile");
while (my $line=<$fin>)
{
chomp $line;
next if $line=~/^\s*$/;
Carp::croak "position < 0?\n" if $line < 0;
push @CIMSPositions, $line;
}
$numSites = @CIMSPositions;
}
elsif ($CIMSPositionFile ne '')
{
Carp::croak "can not read $CIMSPositionFile\n";
}
#record how many time each tag should be sampled and at which position
#if uniform sampling is used, hold the position undetermined
if ($rand_seed != 0)
{
srand ($rand_seed);
}
my %idxHash;
for (my $i = 0; $i < $numSites; $i++)
{
my $idx = int(rand($numTags)); #index of tags to sample
if (@CIMSPositions == $numSites)
{
push @{$idxHash{$idx}}, $CIMSPositions[$i];
}
else
{
push @{$idxHash{$idx}}, -1;
}
}
print "permute mutations ...\n" if $verbose;
my $fin;
my $fout;
open ($fin, "<$inBedFile") || Carp::croak "cannot open file $inBedFile to read\n";
open ($fout, ">$outBedFile") || Carp::croak "cannot open file $outBedFile to write\n";
my $idx = -1;
my $n = 0;
while (my $line=<$fin>)
{
chomp $line;
next if $line=~/^track/;
next if $line=~/^\s*$/;
my $t = lineToBed ($line);
$idx++;
print "$idx ...\n" if $verbose && $idx % 50000 == 0;
next unless exists $idxHash{$idx};
my $positions = $idxHash{$idx};
foreach my $pos (@$positions)
{
my $chromStart = $t->{'chromStart'};
my $chromEnd = $t->{'chromEnd'};
if ($pos >= 0)
{
$chromStart = $t->{'strand'} eq '+' ? $chromStart + $pos : $chromEnd - $pos;
if ($chromStart < 0)
{
#cz fix 06/12/2018
#in some very rare cases, the mutation go outside the range
#as a quick and dirty fix, we do uniform sampling for these
#uniform sampling
$chromStart = $t->{'chromStart'} + 1; #do not allow changes in the first and last position
$chromEnd = $t->{'chromEnd'} - 1;
my $tagLen = $chromEnd - $chromStart + 1;
$pos = int(rand($tagLen - $mutationSize + 1));
$chromStart += $pos;
}
}
else
{
#uniform sampling
$chromStart = $t->{'chromStart'} + 1; #do not allow changes in the first and last position
$chromEnd = $t->{'chromEnd'} - 1;
my $tagLen = $chromEnd - $chromStart + 1;
$pos = int(rand($tagLen - $mutationSize + 1));
$chromStart += $pos;
}
print $fout join ("\t", $t->{'chrom'}, $chromStart, $chromStart + $mutationSize, "CIM_". $n, 0, $t->{'strand'}), "\n";
$n++;
}
}
Carp::croak "the total number of mutation sites is not $numSites as expected\n" unless $n == $numSites;
close ($fout);
close ($fin);