-
Notifications
You must be signed in to change notification settings - Fork 4
/
05_scriptExtractMatch_v20_BLASTX.py
executable file
·674 lines (477 loc) · 24.2 KB
/
05_scriptExtractMatch_v20_BLASTX.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
#!/usr/bin/python
### TBLASTX formatting
### MATCH = Only the first match keeped
MATCH = 0 # Only 1rst match Wanted
#MATCH = 1 # All match wanted
### SUBMATCH = several part of a same sequence match with the query
SUBMATCH = 0 # SUBMATCH NOT WANTED (only the best hit)
#SUBMATCH =1 # SUBMATCH WANTED
### NAME FORMATTING:
# [A] FORMAT QUERY NAME 1st STEP [IN DEF1]
# [B] FORMAT MATCH NAME 1st STEP [IN DEF2.1]
# [C] FORMAT MATCH NAME 2nd STEP [MIDDLE of DEF 2.3]
# [D] FORMAT QUERY NAME 2nd STEP [END of DEF 2.3]
# [E] FORMAT MATCH NAME 3rd STEP [END of DEF 2.3]
### SPECIFICITY TBLASTX (/BLASTN) formatting:
## 1/ "TBLASTX" formatting => At start of "RUN RUN RUN" change the keyword
## 2/ change line "if keyword in nextline:" in function "split_file"
## 3/ change "Strand" by "Frame" in function "get_information_on_matches"
########################################
########################################
### DEF 1. Split each "BLASTN" event ###
########################################
########################################
#def split_file(file_in, file_out, keyword):
def split_file(path_in, keyword):
file_in = open(path_in, "r")
RUN = ''
BASH1={}
while 1:
nextline = file_in.readline()
#if not nextline:
# break
##################################
##################################
### [A] FORMATTING QUERY NAME ###
##################################
##################################
### Get query name ###
if nextline[0:6]=='Query=':
L1 = string.split(nextline, "||")
L2 = string.split(L1[0], " ")
query = L2[1]
if query[-1] == "\n":
query = query[:-1]
######################################
######################################
### [A] END FORMATTING QUERY NAME ###
######################################
######################################
### split the file with keyword ###
if keyword in nextline:
#if nextline[:6] == keyword: # start of a "RUN" block // Treatment of lines starting with the keyword ("BLASTN") only
# Two cases here:
#1# If it is the first "RUN" in the block (i.e. the first occurence of "BLASTN" in the file), we have just to add the new lines in the "RUN" list ... 2nd , we have also to detect the 'key' of bash1, which is the "query" name ... and third we will have to save this "RUN" in the bash1, once we will have detected a new "RUN" (i.e. a new line beginning with "BLASTN".
#2# If it isn't the first run, we have the save the previous "RUN" in the "bash1", before to re-initialize the RUN list (RUN =[]), before to append lines to the new "RUN"
if RUN == '': # case #1#
RUN = RUN + nextline # we just added the first line of the file
else: # case #2# (there was a run before)
BASH1[query] = RUN # add the previous run to the bash
RUN = '' # re-initialize the "RUN"
RUN = RUN + nextline # add the line starting with the keyword ("BLASTN") (except the first line of the file (the first "RUN")
else: # Treatment of the subsequent lines of the one starting with the keyword ("BLASTN") (which is not treated here but previously)
RUN = RUN + nextline
if not nextline: # when no more line, we should record the last "RUN" in the bash1
#print "END = %s" %query
BASH1[query] = RUN # add the last "RUN"
break
file_in.close()
#file_out.close()
#print "BASH1.keys() = %s" %BASH1.keys()
return(BASH1)
#########################################################
#########################################################
### 2. Parse blast output for each query
#########################################################
### 2.1. detect matches (i.e. 'Sequences producing significant alignments:' ###
def detect_Matches(query, MATCH, WORK_DIR):
F5 = open("%s/tmp/blastRun2.tmp" %WORK_DIR, 'w')
F5.write(bash1[query])
F5.close()
F6 = open("%s/tmp/blastRun2.tmp" %WORK_DIR, 'r')
list1 =[]
list2 =[]
while 1:
nexteu = F6.readline()
if not nexteu : break
if "***** No hits found ******" in nexteu :
hit = 0
#print "NO HITS FOUND"
break
if 'Sequences producing significant alignments:' in nexteu:
hit = 1
#print "HITS FOUND!!!!!!!!!!"
F6.readline() # jump a line
while 1:
nexteu2 = F6.readline()
if nexteu2[0]==">": break
######################################
######################################
### [B] FORMAT MATCH NAME 1st STEP ###
######################################
######################################
if nexteu2 != '\n':
LL1 = string.split(nexteu2, " ") # specific NORTH database names !!!!!!!
match = LL1[0] #### SOUTH databank // NORTH will have "|" separators
list1.append(match)
match2 = ">" + LL1[0] # more complete name // still specific NORTH database names !!!!!!!
list2.append(match2) #### SOUTH databank // NORTH will have "|" separators
if MATCH == 0: ## Only read the 1rst line (i.e. the First Match)
break
else: ## Read the other lines (i.e. All the Matches)
continue
##########################################
##########################################
### [B] END FORMAT MATCH NAME 1st STEP ###
##########################################
##########################################
break
##############################################
## [OPTION 1st HIT] ONLY KEEP THE FIRST ENTRY FOR EACH QUERY
#if list1 != []: # when no match, this list is empty
# hit1 = list1[0]
# list1 = [hit1]
#if list2 != []: # when no match this list is empty
# hit2 = list2[0]
# list2 = [hit2]
## [/OPTION 1st HIT]
##############################################
#print "LIST1 = %s" %list1
#print "LIST2 = %s" %list2
F6.close()
return(list1, list2, hit) # list1 = short name // list2 = more complete name
############################################
### 2.2. Get Information on matches ###
### Function used in the next function (2.3.)
#############################################
def get_information_on_matches(list_of_line):
for line in list_of_line:
## Score and Expect
if "Score" in line:
#print line
line = line[:-1] # remove "\n"
S_line = string.split(line, " = ")
Expect = S_line[-1] ## ***** Expect
S_line2 = string.split(S_line[1], " bits ")
Score = string.atof(S_line2[0])
## Identities/gaps/percent/divergence/length_matched
elif "Identities" in line:
line = line[:-1] # remove "\n"
g = 0
if "Gaps" in line:
#print "HIT!!!"
pre_S_line = string.split(line, ",")
identity_line = pre_S_line[0]
gaps_line = pre_S_line[1]
g = 1
else:
identity_line = line
g = 0
## treat identity line
S_line = string.split(identity_line, " ")
identities = S_line[-2] ## ***** identities
#print "\t\tIdentities = %s" %identities
S_line2 = string.split(identities, "/")
hits = string.atof(S_line2[0]) ## ***** hits
length_matched = string.atof(S_line2[1]) ## ***** length_matched
abs_nb_differences = length_matched - hits ## ***** abs_nb_differences
## identity_percent = S_line[-1]
identity_percent = hits/length_matched * 100 ## ***** identity_percent
#print "\t\tIdentity (percent) = %.2f" %identity_percent
divergence_percent = abs_nb_differences/length_matched*100 ## ***** divergence_percent
#print "\t\tDivergence (percent) = %.2f" %divergence_percent
## treat gap line if any
if g ==1: # means there are gaps
S_line3 = string.split(gaps_line, " ")
gaps_part = S_line3[-2]
S_line4 = string.split(gaps_part, "/")
gaps_number = string.atoi(S_line4[0]) ## ***** gaps_number
#print "\t\tGaps number = %s" %gaps_number
real_differences = abs_nb_differences - gaps_number ## ***** real_differences
real_divergence_percent = (real_differences/length_matched)*100 ## ***** real_divergence_percent
#print "\t\tReal divergence (percent)= %.2f" %real_divergence_percent
else:
gaps_number = 0
#print "\t\tGaps number = %s" %gaps_number
real_differences = 0
real_divergence_percent = divergence_percent
## Strand
#elif "Strand" in line:
# line = line[:-1] # remove "\n"
# S_line = string.split(line, " = ")
# strand = S_line[1]
# print "\t\tStrand = %s" %strand
## Frame
elif "Frame" in line:
line = line[:-1] # remove "\n"
S_line = string.split(line, " = ")
frame = S_line[1]
#print "\t\tFrame = %s" %frame
list_informations=[length_matched, Expect, Score, identities, hits, identity_percent, divergence_percent,gaps_number, real_divergence_percent, frame, length_matched]
return(list_informations)
###############################################
###############################################
### 2.3. get sequences ###
### [+ get informations from the function 2.2.]
###############################################
###############################################
def get_sequences(query, list2, SUBMATCHEU,WORK_DIR):
#print "\t[get_sequence BEGIN]"
list_Pairwise = []
F7 = open("%s/tmp/blastRun3.tmp" %WORK_DIR, 'w')
F7.write(bash1[query]) # bash1[query] ==> blast output for each query
F7.close()
F8 = open("%s/tmp/blastRun3.tmp" %WORK_DIR, 'r')
text1 = F8.readlines()
miniList = []
for name in list2: # "list2" contains name of matched sequences (long version! the list1 is the same list but for short version names). It was previously generated by "detect_Matches" function
l = -1
for n in text1:
l = l+1
if name in n:
i = l
miniList.append(i) # content positions in the list "text1", of all begining of match (e.g. >gnl|UG|Apo#S51012099 [...])
miniList.reverse()
if miniList != []:
length = len(miniList)
ii = 0
Listing1 = []
while ii < length:
iii = miniList[ii]
entry = text1[iii:]
text1 = text1[:iii]
Listing1.append(entry) # each "entry" = list of thing beginning by ">"
ii = ii+1 # Listing1 is a table of table!!
Listing1.append(text1) # "text1" = the first lines (begin with "BLASTN 2.2.1 ...]"
Listing1.reverse()
Listing2 = Listing1[1:] # remove the first thing ("BLASTN ...") and keep only table beginning with a line with ">"
SEK = len(Listing2)
NB_SEK = 0
for e1 in Listing2: # "Listing2" contents all the entries begining with ">"
NB_SEK = NB_SEK + 1
list51 = []
l = -1
for line in e1:
l = l+1
if "Score =" in line:
index = l
list51.append(l) # index of the lines with score
#break
list51.reverse()
Listing3 = []
for i5 in list51:
e2 = e1[i5:]
Listing3.append(e2)
e1 = e1[:i5]
######################################
######################################
### [C] FORMAT MATCH NAME 2nd STEP ###
######################################
######################################
BigFastaName = e1 ### LIST OF LINES <=> What is remaining after removing all the hit with "Score =", so all the text comprise between ">" and the first "Score =" ==> Include Match name & "Length & empty lines
SmallFastaName = BigFastaName[0] ## First line <=> MATCH NAME
#L1 = string.split(SmallFastaName[1:], " ")
#SmallFastaName = L1[0]
SmallFastaName = SmallFastaName[1:-1] ### remove ">" and "\n"
if SmallFastaName[-1] == " ":
SmallFastaName = SmallFastaName[:-1]
PutInFastaName1 = SmallFastaName
##########################################
##########################################
### [C] END FORMAT MATCH NAME 2nd STEP ###
##########################################
##########################################
SUBSEK = len(Listing3)
NB_SUBSEK = 0
list_inBatch = []
### IF NO SUBMATCH WANTED !!!! => ONLY KEEP THE FIRST HIT OF "LISTING3":
if SUBMATCHEU == 0: # NO SUBMATCH WANTED !!!!
Listing4 = []
Listing4.append(Listing3[-1]) # Remove this line if submatch wanted!!!
elif SUBMATCHEU == 1:
Listing4 = Listing3
for l in Listing4: ## "listing3" contents
NB_SUBSEK = NB_SUBSEK+1
ll1 = string.replace(l[0], " ", "")
ll2 = string.replace(l[1], " ", "")
ll3 = string.replace(l[2], " ", "")
PutInFastaName2 = ll1[:-1] + "||" + ll2[:-1] + "||" + ll3[:-1] # match information
#print PutInFastaName2
seq_query = ""
pos_query = []
seq_match = ""
pos_match = []
for line in l:
if "Query:" in line:
line = string.replace(line, " ", " ") # remove multiple spaces in line
line = string.replace(line, " ", " ")
line = string.replace(line, " ", " ")
lll1 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end
pos1 = lll1[1]
pos1 = string.atoi(pos1)
pos_query.append(pos1)
pos2 = lll1[3][:-1]
pos2 = string.atoi(pos2)
pos_query.append(pos2)
seq = lll1[2]
seq_query = seq_query + seq
if "Sbjct:" in line:
line = string.replace(line, " ", " ") # remove multiple spaces in line
line = string.replace(line, " ", " ")
line = string.replace(line, " ", " ")
lll2 = string.split(line, " ") # split the line, 0: "Query=", 1:start, 2:seq, 3:end
pos1 = lll2[1]
pos1 = string.atoi(pos1)
pos_match.append(pos1)
pos2 = lll2[3][:-1]
pos2 = string.atoi(pos2)
pos_match.append(pos2)
seq = lll2[2]
seq_match = seq_match + seq
## Get the query and matched sequences and the corresponding positions
pos_query.sort() # rank small to big
pos_query_start = pos_query[0] # get the smaller
pos_query_end = pos_query[-1] # get the bigger
PutInFastaName3 = "%d...%d" %(pos_query_start, pos_query_end)
######################################
######################################
### [D] FORMAT QUERY NAME 2nd STEP ###
######################################
######################################
FINAL_fasta_Name_Query = ">" + query + "||"+ PutInFastaName3 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)
##########################################
##########################################
### [D] END FORMAT QUERY NAME 2nd STEP ###
##########################################
##########################################
pos_match.sort()
pos_match_start = pos_match[0]
pos_match_end = pos_match[-1]
PutInFastaName4 = "%d...%d" %(pos_match_start, pos_match_end)
######################################
######################################
### [E] FORMAT MATCH NAME 3rd STEP ###
######################################
######################################
FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)
#FINAL_fasta_Name_Match = ">" + PutInFastaName1 + "||" + PutInFastaName4 + "||" + PutInFastaName2+ "||[[%d/%d]][[%d/%d]]" %(NB_SEK, SEK, NB_SUBSEK,SUBSEK)
##########################################
##########################################
### [E] END FORMAT MATCH NAME 3rd STEP ###
##########################################
##########################################
Pairwise = [FINAL_fasta_Name_Query , seq_query , FINAL_fasta_Name_Match , seq_match] # list with 4 members
list_Pairwise.append(Pairwise)
### Get informations about matches
list_info = get_information_on_matches(l) ### DEF 2.2. ###
#divergence = list_info[6]
F8.close()
#print "\t[get_sequence CLOSE]"
#print list_Pairwise
return(list_Pairwise, list_info)
#########################################
######################
### 2. RUN RUN RUN ###
######################
import string, os, time, re, sys
WORK_DIR = sys.argv[1]
path_in = "%s/04_outputBlast.txt" %WORK_DIR
file_out = open("%s/06_PairwiseMatch.fasta" %WORK_DIR,"w")
file_out3 = open("%s/06_pairwise.txt" %WORK_DIR, "w")
file_out4 = open("%s/06_pairwise_long_names.txt" %WORK_DIR, "w")
file_log = open("%s/06_ParseBLASToutput_ALL.log" %WORK_DIR, "w")
## create Bash1 ##
bash1 = split_file(path_in, "TBLASTX")
print bash1.keys()
## detect and save match ##
list_hits =[]
list_no_hits = []
j= 0
k = 0
lene = len(bash1.keys())
for query in bash1.keys():
j = j+1
print "\n\n***************** Nb: %d/%d *********************" %(j,lene)
## 2.1. detect matches ##
#print "\n#######"
print "QUERY = <%s>"%query
list_match, list_match2, hit=detect_Matches(query, MATCH, WORK_DIR) ### FUNCTION ###
#print "TEST"
#print "%s" %list_match
#print "OK"
if hit == 1: # match(es)
list_hits.append(query)
if hit == 0: # no match for that sequence
list_no_hits.append(query)
## 2.2. get sequences ##
if hit ==1:
#print ""
list_pairwiseMatch, list_info = get_sequences(query, list_match2, SUBMATCH, WORK_DIR) ### FUNCTION ###
# divergencve
divergence = list_info[6]
#print "Divergence = %s" %divergence
# gap number
gap_number = list_info[7]
#print "Gap number = %s" %gap_number
# real divergence (divergence without accounting INDELs)
real_divergence = list_info[8]
#print "Real Divergence = %s" %real_divergence
# length matched
length_matched = list_info[10]
#print "Length mathced = %s" %length_matched
### WRITE PAIRWISE ALIGNMENT IN OUTPUT FILES
for pairwise in list_pairwiseMatch:
k = k+1
query_name = pairwise[0]
query_seq = pairwise[1]
match_name = pairwise[2]
match_seq = pairwise[3]
len_query_seq = len(query_seq)
Lis1 = string.split(query_name, "||")
short_query_name = Lis1[0]
Lis2 = string.split(match_name, "||")
short_match_name = Lis2[0]
# [ONTROL FOR LENGHT]
# if len_query_seq >= 200:
# file_out.write(query_name)
# file_out.write("\n")
# file_out.write(query_seq)
# file_out.write("\n")
# file_out.write("\n")
# file_out.write(match_name)
# file_out.write("\n")
# file_out.write(match_seq)
# file_out.write("\n")
# file_out.write("\n")
# If NO CONTROL FOR LENGTH, USE THE FOLLOWING LINES INSTEAD:
file_out.write("%s||%s||%s||%s||%s" %(query_name,divergence,gap_number,real_divergence,length_matched))
file_out.write("\n")
file_out.write("%s" %query_seq)
file_out.write("\n")
#file_out.write("\n")
file_out.write("%s||%s||%s||%s||%s" %(match_name,divergence,gap_number,real_divergence,length_matched))
file_out.write("\n")
file_out.write("%s" %match_seq)
file_out.write("\n")
#file_out.write("\n")
#file_out2.write(match_name)
#file_out2.write("\n")
#file_out2.write(match_seq)
#file_out2.write("\n")
#file_out2.write("\n")
file_out3.write("%s,%s,%s,%s,%s,%s\n" %(short_query_name[1:], short_match_name[1:], divergence,gap_number,real_divergence,length_matched))
file_out4.write("%s,%s,%s,%s,%s,%s\n" %(query_name[1:],match_name[1:], divergence,gap_number,real_divergence,length_matched))
# [/CONTROL FOR LENGHT]
#file_out.write("")
#print "\n#######\n"
### Write Summary ###
file_log.write("\n\n************************************************\n")
file_log.write("******************* SUMMARY ********************\n")
file_log.write("************************************************\n\n")
file_log.write("\nNumber of sequences matching something = %d\n" % len(list_hits))
for hiti in list_hits:
file_log.write(hiti)
file_log.write("\n")
file_log.write("\nNumber of sequences matching nothing = %d\n" % len(list_no_hits))
for no_hiti in list_no_hits:
file_log.write(no_hiti)
file_log.write("\n")
file_log.write("\n\n************************************************\n\n")
nb_pairwiseMatches = k
file_log.write("Total number of pairwise matches = %d\n" %nb_pairwiseMatches)
file_out.close()
#file_out2.close()
#file_out3.close()
file_log.close()
#file_out4.close()
os.system("rm %s/tmp/*" %WORK_DIR)