-
Notifications
You must be signed in to change notification settings - Fork 1
/
extractSubSeqs.pl
executable file
·288 lines (247 loc) · 7.34 KB
/
extractSubSeqs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/perl -w
##---------------------------------------------------------------------------##
## File:
## @(#) extractSubSeqs.pl
## Author:
## Robert M. Hubley [email protected]
## Description:
## Given a set of coseg files extract the source sequences
## from the fasta file for a given subfamily.
##
#******************************************************************************
#* This software is provided ``AS IS'' and any express or implied *
#* warranties, including, but not limited to, the implied warranties of *
#* merchantability and fitness for a particular purpose, are disclaimed. *
#* In no event shall the authors or the Institute for Systems Biology *
#* liable for any direct, indirect, incidental, special, exemplary, or *
#* consequential damages (including, but not limited to, procurement of *
#* substitute goods or services; loss of use, data, or profits; or *
#* business interruption) however caused and on any theory of liability, *
#* whether in contract, strict liability, or tort (including negligence *
#* or otherwise) arising in any way out of the use of this software, even *
#* if advised of the possibility of such damage. *
#* *
#******************************************************************************
#
# ChangeLog
#
# $Log: extractSubSeqs.pl,v $
# Revision 1.7 2016/11/11 23:46:04 rhubley
# - Cleaning up perl scripts
#
# Revision 1.6 2012/11/02 23:02:19 rhubley
# - Cleaned up and consolidated EM routines.
# - Moved final minCount check to prune routine.
#
# Revision 1.5 2011/06/16 15:59:28 rhubley
# - Fixed extractSubSeqs.pl
#
# Revision 1.4 2008/09/25 17:10:10 rhubley
# - Improved code documentation
# - Single mutation significance cutoff ( SIGMATHRESH ) was
# pre-calculated for Alkes Alu analysis and hardcoded. This
# version calculates the correct sigma cutoff using the length
# of the input sequence.
# - Switched default pvalue method to Andy Siegel's method and
# provided a new "-k" switch to use Alkes Price's method.
# - Fixed bug where the program was exiting when calculations
# fell below the precision of the machine ( epsilon ). Message
# given was "Below epsilon..." and the runcoseg.pl script
# moved on even though coseg failed.
# - Begun to code CpG adjusted consensus routine
#
# Revision 1.2 2008/07/24 22:17:58 rhubley
# - Minor changes
#
#
###############################################################################
#
# To Do:
#
=head1 NAME
extractSubSeqs.pl - Extract fasta elements for a given subfamily.
=head1 SYNOPSIS
extractSubSeq.pl [-version] -seqFile <coseg seq file> -subFam #
[-fastaDB <fasta file>]
[-flankBases #]
=head1 DESCRIPTION
The options are:
=over 4
=item -seqFile <coseg seq file>
The name of the input sequence file to the coseg program. NOTE: This is
not the output file created by coseg.
=item -subFam #
The subfamily number for which you would like sequences extracted.
=item -fastaDB <fasta file>
The fasta file used to create the alignment file fed to coseg
=item -flankBases #
The number of flanking bases to capture from the fastaDB when
extracting the sequences. Default 100 -- only when fastaDB option
is given.
=item -version
Displays the version of the program
=back
=head1 SEE ALSO
=head1 COPYRIGHT
Copyright 2008 Robert Hubley, Institute for Systems Biology
=head1 AUTHOR
Robert Hubley <[email protected]>
=cut
#
# Module Dependence
#
use strict;
use Getopt::Long;
use Data::Dumper;
use FindBin;
use lib $FindBin::RealBin;
use CosegConfig;
use lib $CosegConfig::REPEATMASKER_DIR;
if ( ! -s "$CosegConfig::REPEATMASKER_DIR/FastaDB.pm" )
{
print "\nError: Could not locate the required module \"FastaDB.pm\" from\n";
print " the RepeatMasker directory ( $CosegConfig::REPEATMASKER_DIR ).\n";
print " Perhaps this path is old or incorrect. Please check the\n";
print " configuration file CosegConfig.pm before continuing.\n\n";
exit;
}
use FastaDB;
#
# Magic numbers/constants here
# ie. my $PI = 3.14159;
#
my $DEBUG = 0;
my $Version = 0.1;
#
# Option processing
# e.g.
# -t: Single letter binary option
# -t=s: String parameters
# -t=i: Number paramters
#
my @getopt_args = (
'-version', # print out the version and exit
'-subFam=n',
'-flankBases=n',
'-seqFile=s',
'-fastaDB=s'
);
my %options = ();
Getopt::Long::config("noignorecase", "bundling_override");
unless (GetOptions(\%options, @getopt_args)) {
usage();
}
sub usage {
print "$0 - $Version\n";
exec "pod2text $0";
exit;
}
if ($options{'version'}) {
print "$Version\n";
exit;
}
#
# ARGV Processing
#
if ( ! defined $options{'subFam'} ||
! defined $options{'seqFile'} ) {
usage();
}
my $prefix = $options{'seqFile'};
my $subnum = $options{'subFam'};
my $seqDB = undef;
if ( defined $options{'fastaDB'} && -f $options{'fastaDB'} )
{
$seqDB = FastaDB->new( fileName => $options{'fastaDB'},
openMode => SeqDBI::ReadOnly );
}
my $numFlankingBases = 100;
if ( $seqDB && $options{'flankBases'} )
{
$numFlankingBases = $options{'flankBases'};
}
open ASSIGN, "<$prefix.assign" or
die "Error: Could not open assignment file $options{'seqFile'}.assign\n";
# Create hash of sequences assigned to the subfamily
my %seqs = ();
while ( <ASSIGN> )
{
if ( /^(\d+)\s+(\d+)/ )
{
my $seqNum = $1;
my $subFamNum = $2;
if ( $subFamNum == $subnum )
{
$seqs{ $seqNum } = 1;
}
}
}
close ASSIGN;
$prefix =~ s/\.seqs/.fasta/g;
open IN,"<$prefix" or
die "Error: Could not open fasta file $prefix\n";
my $seqIndex = -1;
my $header = "";
my $seq = "";
my $cosegID = "";
my $fastaDBID="";
my $fastaDBStart=0;
my $fastaDBEnd=0;
while (<IN>)
{
#>Seq4 chr1_428953_431249:913-1804 alignment=3
if ( /^>Seq(\d+)\s+(\S+):(\d+)-(\d+)\s+alignment=\d+/ )
{
$cosegID = $1;
$fastaDBID = $2;
$fastaDBStart = $3;
$fastaDBEnd = $4;
# Is this sequence in the requested subfamily?
if ( $seqs{$seqIndex} )
{
# Should we give the sequence from the .fasta file
# ( only as long as the coseg sequence ) or should
# we give sequence from the fastaDB file ( with
# flanking sequences attached )?
if ( $seqDB )
{
# sequence from fastaDB
my $seqLen = $seqDB->getSeqLength( $fastaDBID );
# $fastaDBStart is 1-based
my $start = $fastaDBStart - $numFlankingBases - 1;
$start = 0 if ( $start < 0 );
my $end = $fastaDBEnd + $numFlankingBases;
$end = $seqLen if ( $end > $seqLen );
$seq = $seqDB->getSubstr( $fastaDBID, $start, $end-$start+1);
print ">$fastaDBID:$start-$end Seq$cosegID\n$seq\n";
}else
{
# .fasta sequences only
$seq =~ s/[\s\n\r]//g;
print "$header\n$seq\n";
}
}
$seqIndex++;
$header = $_;
$header =~ s/[\n\r]//g;
$seq = "";
}
else {
$seq .= $_;
}
}
close IN;
# Last entry
if ( $seqs{$seqIndex} )
{
if ( $seqDB )
{
# sequence from fastaDB
}else
{
# .fasta sequences only
$seq =~ s/[\s\n\r]//g;
print "$header\n$seq\n";
}
}
1;