forked from enormandeau/Scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
blast_find_duplicates.py
executable file
·79 lines (68 loc) · 2.16 KB
/
blast_find_duplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Search sequences that represent the same gene in a blastn result file.
Input file is the result file (outfmt 6) of fasta file blasted against itself.
Usage:
%program <input_file> <output_file>"""
import sys
import re
try:
in_file = sys.argv[1]
out_file = sys.argv[2]
except:
print __doc__
sys.exit(0)
def blast_generator(in_file):
"""Yield one blastplus result at a time
"""
with open(in_file) as f:
begin = False
while begin == False:
l = f.readline().rstrip()
if l.find("Query=") > -1:
query = [l]
begin = True
for line in f:
l = line.rstrip()
if l.find("Query=") > -1:
yield query
query = []
query.append(l)
yield query
def find_name(text):
"""Use regex to find the name of the sequence being treated
"""
contig_search = re.compile("Contig_[0-9]+")
est_search = re.compile("gi\|[0-9]+")
geneFP_search = re.compile("geneFP_[0-9]+")
array_search = re.compile("[a-zA-Z]+[0-9]+")
amdc_search = re.compile("all_abyss2_min_l200_min_c500_contig_[0-9]+")
name = re.findall(contig_search, text)
name += re.findall(est_search, text)
name += re.findall(geneFP_search, text)
name += re.findall(array_search, text)
name += re.findall(amdc_search, text)
try:
name = name[0]
except:
name = "No_name"
return name
blast_results = blast_generator(in_file)
my_sum = 0
with open(out_file, "w") as f:
for res in blast_results:
name = find_name(res[0])
results = [x.strip().split()[0] for x in res[1:] if
find_name(x) != "No_name" and not x.startswith(">")
and not x.strip().split()[0] == name]
line = name
if len(results) == 0:
my_sum += 1
line += "\n"
elif len(results) == 1:
my_sum += 0
line += "\t" + "\t".join(results) + "\n"
else:
line += "\t" + "\t".join(results) + "\n"
f.write(line)
print "There are", my_sum, "non-repeated sequences"