-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_trap.pl
428 lines (291 loc) · 11.3 KB
/
update_trap.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
#!/usr/bin/perl -w
=head2 Authors
=head3 Created by
Guglielmo Roma
=head3 Modified by
Vincenza Maselli
=head2 Descritpion
This script executes a query in GenBank and retrieves the trap data in order to update the "trap" table
=head2 Usage
./update_trap.pl [-c check] [-mindate] [-h help] [-o output]
-c = check y if you want insert new trap n if is the first insert
-mindate = minimum date to retrieve from
-o = the complete path of the output file, optional, only if debug is required (see configuration file to set up the debug)
example: nohup perl update_trap.pl -c yes -mindate 2000/01/01 -o ~/src/scripts/data/tmp/update_trap.out &\n";
=cut
use strict;
BEGIN {
require "unitrap_conf.pl";
};
use DBI;
use Getopt::Long;
use Data::Dumper;
use Net::SMTP;
use Bio::DB::GenBank;
use Bio::SeqIO;
use Bio::Tools::RepeatMasker;
use Bio::Unitrap::Db;
my $USAGE = "update_trap.pl [-c check] [-mindate] [-h help] [-o output]";
my ($check, $mindate, $help,$output);
&GetOptions( 'check|c' => \$check,
'mindate|m=s' => \$mindate,
'output|o' => \$output,
'help|h' => \$help);
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime (time);
$year = $year+1900;
$mon = $mon+1;
=pod
This section (line 69 -81) check the input options
=cut
if (!$mindate) {
print OUT "You must specify a minimum date to retrieve from.\n";
exit;
}
if ($help) {
print OUT "HELP:
-c = check y if you want insert new trap n if is the first insert
-mindate = minimum date to retrieve from
-o = the complete path of the output file, optional, only if debug is required (see configuration file to set up the debug)
example: nohup perl update_trap.pl -c yes -mindate 2000/01/01 -o ~/src/scripts/data/tmp/update_trap.out &\n";
exit;
}
=pod
This section (lines 89-95) sets up the variables form the configuration file
=cut
my %conf = %::conf;
my $debug = $conf{'debug'};
my $debugSQL = $conf{'debugSQL'};
my $traphost = $conf{'traphost'};
my $trapuser = $conf{'trapuser'};
my $trapdbname = $conf{'trapdbname'};
my $trappass = $conf{'trappass'};
=pod
This section (lines 103-109) checks and opens the output file
=cut
if (!$output) {
print OUT "The output will be write into the default directory $ENV{'HOME'}/tmp/unitrap/.\n" if $debug;
}
my $fileout = "$ENV{'HOME'}/tmp/unitrap/";
if ($output){$fileout = $output}
open (OUT, ">>$fileout") || die "$! $fileout\n";
$debug && print OUT "Trying to download new traps from gss db\n";
=pod
This section (lines 120-121) is for the backup of the unitrap database. It has to be reviewed and rewrited
=cut
my $unitrapdb_obj = Bio::Unitrap::Db->new;
#$unitrapdb_obj->exec_dump($conf{'mysql_path'}, $conf{'trapuser'}, $conf{'trappass'}, $conf{'trapdbname'}, $conf{'traphost'}, $conf{'tmp_dir'}.$conf{'trapdbname'}."_"."$mday"."_"."$mon"."_"."$year".".sql", "0", $debug);
=pod
This section (lines 133-139) is used to
- Connect to unitrap_db
- Retrieve all the gene-trap projects from the trapdb, except tigem
Reminder: Tigem traps available in Gss are not updated, so we can retrieve them from the last UNITRAP database
=cut
my $trap_db = DBI->connect("DBI:mysql:database=$trapdbname;host=$traphost;port=3306", $trapuser, $trappass) || die "Can't connect to $trapdbname: $DBI::errstr";
my $sql_projects = qq{select project_id, project_name, wordkey from project where project_name != 'tigem' order by project_id};
my $sth_projects = $trap_db->prepare($sql_projects) || die "Can't prepare $sql_projects: $DBI::errstr";
$debug && print OUT "SQL CODE: $sql_projects\n";
my $n_projects = $sth_projects->execute|| die "Can't execute $sql_projects: $DBI::errstr";
$debug && print OUT "Number of projects in ".$conf{'trapdbname'}.": $n_projects\n";
my $base_keyword = 'gene trap and mouse AND GSS and ';
#my $base_keyword = 'gene trap and mouse AND gbdiv_gss[PROP] and ';
=pod
This section (lines 151- 334) queries GenBank with "gene trap and mouse AND GSS and " as static key plus the wordkey of the project retrieved from the database. It Checks if the trap is already in the database and if not, retrieve all the information and put them into the database
=cut
while (my $rhref_projects = $sth_projects->fetchrow_hashref) {
my $p_id = $rhref_projects->{'project_id'};
my $p_name = $rhref_projects->{'project_name'};
my $wordkey = $rhref_projects->{'wordkey'};
print OUT "wordkey: $wordkey\n";
=pod
Querying GenBank
=cut
my $seqio = &genbank_exec($base_keyword.$wordkey, $mindate);
while (my $seq = $seqio->next_seq) {
=pod
Step1 - Retrieving trap info from NCBI GenBank
Trap_name is supposed to be the 1st word in the description
=cut
my @desc = split (" ", $seq->desc());
my $trap_name = $desc[0];
=pod
Step2 - Building a hash containing trap info
=cut
my %trapinsert;
$trapinsert{'trap_name'}= $trap_name;
$debug && print Dumper @desc;
$debug && print OUT "Project: $p_name\tTrap_name: $trap_name\n";
#Some traps from egtc cannot be parsed correctly from GSS. Need some corrections
=pod
Correction for egtc traps
=cut
if ($trap_name eq 'Mus' && $p_name eq 'egtc') {
$trap_name = $desc[8];
if ($trap_name eq 'genomic') {
$trap_name = $desc[7];
}
$debug && print OUT "EGTC exception => TRAP NAME $trap_name\n";
$trap_name =~ s/,//;
$debug && print "EGTC exception => NEW NAME $trap_name\n";
if ($trap_name =~ ":") {
my @words = split(":", $trap_name);
$trap_name = $words[1];
}
$trapinsert{'trap_name'}= $trap_name;
}
##### Some traps from tigm cannot be parsed correctly from GSS. Need some corrections
=pod
Correction for tigm trpas
=cut
if ($trap_name eq 'Mus' && $p_name eq 'tigm') {
$trap_name = $desc[3];
$debug && print OUT "TIGM exception => TRAP NAME $trap_name\n";
$trap_name =~ s/,//;
$debug && print "TIGM exception => NEW NAME $trap_name\n";
$trapinsert{'trap_name'}= $trap_name;
}
=pod
Step3 - Check
=cut
my $trap_id;
my $exists = 0;
my $sth_check;
if ($check == 1) {
my $sql_check = qq{select distinct trap_id from trap where trap_name = ?};
$debug && print OUT "SQL CODE: $sql_check\n";
$sth_check = $trap_db->prepare($sql_check);
$exists = $sth_check->execute($trap_name) || die "select failed : $DBI::errstr";
}
my $splk=0;
=pod
Step4 - Insert new trap
=cut
if ($exists == 0) {
=pod
Step4-1 Extracting the race-type (3' or 5')
=cut
my $race;
#$seq is Bio::Seq object; $seq->annotation method returns a Bio::AnnotationCollectionI
# get_Annotation returns a list of Bio::AnnotationI
foreach my $value ($seq->annotation->get_Annotations("comment")) {
my $comment = $value->as_text;
# NOTE THAT $tag_name is not used in this loop
my $tag_name = $value->tagname;
if ($comment =~ /5'/) {
$race = "5'";
} elsif ($comment =~ /3'/) {
$race = "3'";
} else {
$race = "na";
}
if ($comment =~ /Splinkerette/) {
$splk = 1;
}
$debug && print OUT "Comment: $comment\nRACE $race\n";
}
my $gb_id = $seq->primary_id;
my $gb_locus = $seq->accession_number;
my $sequence = $seq->seq;
my $seq_length = $seq->length;
my $nrepeat = $seq->seq =~ tr/N/N/;
my $percent_masked = ($nrepeat/$seq_length)*100;
$debug && print OUT "SEQUENCE: $sequence\nGB-ID $gb_id\nGB-LOCUS $gb_locus\n\nSEQ LENGTH $seq_length\nN-REPEAT $nrepeat\n%MASKED $percent_masked\n";
=pod
Step4-2 Checking for sequence quality
check if there are repeated regions
if yes, check the longer non repeated fragment
=cut
my $seq_length_not_N = $seq_length - $nrepeat;
my $max_frag_length_N_splitted;
if ($seq_length_not_N < $seq_length) {
my @fragments = split (/N+/, $sequence);
foreach my $fragment (@fragments) {
$debug && print OUT "Fragment is $fragment\n";
my $frag_length = length($fragment);
# check for the maximum length of non repated fragments
if ($frag_length > $max_frag_length_N_splitted) {
$max_frag_length_N_splitted = $frag_length;
}
}
}
else {
$max_frag_length_N_splitted = $seq_length;
}
$debug && print OUT "Length not-N: $seq_length_not_N\nMax Fragment Length (splitted by N): $max_frag_length_N_splitted\n";
$trapinsert{'project_id'} = $p_id;
$trapinsert{'sequence'} = $sequence;
$trapinsert{'seq_length'} = $seq_length;
$trapinsert{'nrepeat'} = $nrepeat;
$trapinsert{'seq_length_not_N'}= $seq_length_not_N;
$trapinsert{'max_frag_length_N_splitted'} = $max_frag_length_N_splitted;
$trapinsert{'gb_id'} = $gb_id;
$trapinsert{'gb_locus'} = $gb_locus;
$trapinsert{'race'} = $race;
$trapinsert{'n_percent_masked'} = $percent_masked;
$trapinsert{'splk'} = $splk;
my @features = $seq->get_all_SeqFeatures;
foreach my $feat (@features) {
$debug && print OUT "TAG ".$feat->primary_tag."\n";
=pod
Step4-3 Collecting other trap info, such as vector_name and mol_type
=cut
if ($feat->primary_tag eq 'source') {
if ($feat->has_tag('note')) {
#### Extracting the mol_type
my @mol = $feat->get_tag_values('mol_type');
my $mol_type = join (",", @mol);
$debug && print OUT "mol_type: $mol_type\n";
#### Extracting the vector_name
my @notes = $feat->get_tag_values('note');
my $str = join (",", @notes). "\n";
$debug && print OUT "NOTE: $str\n";
my ($vec, $vector_desc) = split (":", $str);
my ($vector_name, $other) = split (";", $vector_desc);
$vector_name =~ s/ //g;
$vector_name =~ s/\n//g;
$debug && print OUT "vector name: $vector_name\n";
=pod
Correction for egtc
The way to parse the GenBank format is different for the egtc project, i.e. trap_name needs to be modified
=cut
if ($p_name eq 'egtc') {
my @clone = $feat->get_tag_values('clone');
$trap_name = $clone[0];
$trapinsert{'trap_name'}= $trap_name;
}
$trapinsert{'vector_name'} = $vector_name;
$trapinsert{'mol_type'} = $mol_type;
$debug && print OUT Dumper \%trapinsert;
}
}
}
=pod
Step4-4 Inserting trap info into the trap_db
=cut
my $stmt = $unitrapdb_obj->prepare_stmt($trap_db, \%trapinsert);
$trap_id = $unitrapdb_obj->insert_set($trap_db, $stmt, "trap", $debugSQL);
}
# trap already exists
else {
my $rhref_trap_id = $sth_check->fetchrow_hashref;
$trap_id = $rhref_trap_id->{'trap_id'};
$debug && print OUT "### Message => This trap already exists: $trap_id!!!\n";
}
}
}
=head1 SUBROUTINES
=head2 NAME genbank_exec
arguments: keywords (string) and date (date)
caller: main
return: Bio::Seq
exception: none
=cut
sub genbank_exec () {
my ($keywords, $mindate) = @_;
my $dbh = Bio::DB::GenBank->new();
my $query = Bio::DB::Query::GenBank->new ( -query => $keywords,
-db => 'Nucleotide',
-mindate => $mindate);
return my $result = $dbh->get_Stream_by_query($query);
}