-
Notifications
You must be signed in to change notification settings - Fork 0
/
convert-search-logs.py
50 lines (44 loc) · 1.48 KB
/
convert-search-logs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import re
import csv
import urllib
try:  # Python 3 location of unquote
    from urllib.parse import unquote
except ImportError:  # Python 2, the interpreter this script was written for
    from urllib import unquote

# Default input: one search per line, "<count> <url-encoded term>".
LOG_PATH = "/home/tburdett/gwas/log-analysis/search-analysis.txt"
# Default output: tab-separated "<name>\t<size>" rows.
OUTPUT_PATH = 'gene-data.csv'

# Terms that start with "rs" followed by digits are pooled into one "rsID" row.
# This is a prefix match, so trailing characters after the digits are allowed.
RSID_PATTERN = re.compile(r'^rs[0-9]+')
# Terms made up solely of upper-case letters and digits are kept as-is.
GENE_PATTERN = re.compile(r'^[A-Z0-9]+$')


def summarise_searches(lines):
    """Aggregate search-log lines into (name, size) rows.

    Each line is split on whitespace; the first column is the hit count and
    the second is the URL-encoded search term (further columns are ignored).
    Lines with fewer than two columns are silently ignored.

    * Terms matching RSID_PATTERN have their counts summed into a single
      ("rsID", total) row, which is always first in the result (even when
      the total is 0).
    * Terms matching GENE_PATTERN each yield their own (term, count) row,
      in input order; repeated terms are not merged.
    * Every other term is reported on stdout as skipped.

    Args:
        lines: iterable of text lines (e.g. an open file).

    Returns:
        A list of (name, size) tuples, "rsID" row first.
    """
    snp_total = 0
    gene_rows = []
    for line in lines:
        cols = line.split()
        if len(cols) < 2:
            continue

        try:
            count = int(cols[0])
        except ValueError:
            # One malformed line should not abort the whole conversion.
            print("Skipping line with non-numeric count (" + cols[0] + ")")
            continue

        try:
            term = unquote(cols[1]).strip()
        except UnicodeDecodeError:
            print("Ignoring search string that can't be decoded (" + cols[1] + ")")
            continue

        if RSID_PATTERN.match(term):
            snp_total += count
        elif GENE_PATTERN.match(term):
            gene_rows.append((term, count))
        else:
            # Covers terms starting with punctuation as well as any other
            # non-matching term; both cases were reported identically.
            print("Skipping " + term)

    return [("rsID", snp_total)] + gene_rows


def main(log_path=LOG_PATH, output_path=OUTPUT_PATH):
    """Read the search log at log_path and write the summary to output_path.

    The output is tab-delimited, one "<name>\\t<size>" row per entry returned
    by summarise_searches. All rows are written; slice the list here if the
    output ever needs to be capped (e.g. rows[:100]).
    """
    with open(log_path) as logs:
        rows = summarise_searches(logs)

    with open(output_path, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerows(rows)
    print("Written data to CSV")


if __name__ == "__main__":
    main()
#with open('data.json', 'w') as f:
# json.dump(resultsList, f)
#print "Written data to json"