-
Notifications
You must be signed in to change notification settings - Fork 0
/
postmcl.py
254 lines (228 loc) · 10.9 KB
/
postmcl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
#! /bin/bash/env python
from Applications import NcbiblastpCommandline
from seqcurator import *
import gzip
import math
import random
import re
import urllib
class PostMCL(object):
def __init__(self):
pass
def matrixmaker(self, evalue):
header = False
for key in settings.species_id.keys():
key = key.strip()
# value = settings.species_id.values()
matrixline = []
with open('venninator_clusters_' + evalue + '.mcl', 'r') as clusterfile:
for line in clusterfile:
if key in line:
matrixline.append('1')
elif key not in line:
matrixline.append('0')
matrixline = ''.join(matrixline)
with open('matrix_' + evalue + '.nex', 'a') as matrixfile:
if header == False:
matrixfile.write('#NEXUS\nBegin data;\nDimensions ntax=' + str(len(settings.species_id)) + ' nchar=' + str(len(matrixline)) + ';\nformat symbols=\"01\";\nMatrix\n')
header = True
matrixfile.write(settings.species_id[key] + '\t' + matrixline + '\n')
with open('matrix_' + evalue + '.nex', 'a') as matrixfile:
matrixfile.write(';\nEnd;')
def annotater(self, evalue):
linenum = 0
### ANNOTATER STEP ONE:
#### Create master fasta file as "Genename\tSEQUENCE".
### Like creating deinterleaved file, but with tabs instead of linebreaks.
### (This way, when we find the gene name, we don't have to use a memory-intensive index step
### to retrieve the sequence on the next line; instead, we split by tab, and output each element
### as its own line in the outfile.)
# Only do this one time during first e-value...
if not os.path.isfile('../venninator_combined_forannotation.tmp'):
with open('../venninator_combined_forannotation.tmp', 'w') as outfile:
for name in settings.filelist:
with open('../' + name + '_uniqueid.tmp') as infile:
for line in infile:
line = line.strip()
if len(line) > 0:
if line[0] == '>':
line = line[1:] # Removing the '>', which we will add to the final file; this makes equivalency checks below much easier
if linenum == 0: # So the first line isn't just a carriage return
outfile.write(line + '\t')
linenum += 1
else:
outfile.write('\n' + line + '\t')
else:
outfile.write(line)
outfile.write('\n') # adds a linebreak at end of file such that 1st line of next file doesn't overlap with last line of previous
### ANNOTATOR STEP TWO:
### From master file above, create sub-fasta file of sequences to be searched against SWISS-PROT database.
print 'Retrieving sequences from clusters to be annotated ...'
# Populate a dictionary with all sequences from master file
genes_all = {}
with open('../venninator_combined_forannotation.tmp', 'r') as genes_all_file:
for line in genes_all_file:
if '\t' in line:
line = line.strip().split('\t')
genes_all[line[0]] = line[1]
# Generate the genes of interest you want to extract from genes_all dictionary above
# Also populate clusters_nameandnum, so we maintain association btwn cluster numbers and genes
genes_of_interest = set()
clusters_nameandnum = {}
clusternum = 1
with open('venninator_clusters_' + evalue + '.mcl', 'r') as genes_of_interest_file:
for line in genes_of_interest_file:
if '\t' in line:
seqname = line.strip().split('\t')[0]
else:
seqname = line.strip()
genes_of_interest.add(seqname)
clusters_nameandnum[seqname] = clusternum
clusternum += 1
# Find the overlap between the total sequences and the desired ones
genes_all_justnames = set(genes_all.keys())
toextract = genes_all_justnames.intersection(genes_of_interest)
# Use 'toextract' set to generate desired file
with open('annotation_tobeblasted.tmp', 'w') as extractfile:
for name in toextract:
extractfile.write('>' + name + '\n' + genes_all[name] + '\n')
### ANNOTATER STEP THREE:
### Execute BLASTP search of generated fasta file against Swiss-PROT database.
# First, download the latest Swiss-PROT database and decompress it:
# Do this only once ...
if not os.path.isfile('../swissprot.tmp'):
print 'Downloading and decompressing SWISS-PROT database ...'
urllib.urlretrieve('ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz', '../swissprot.fa.gz.tmp')
with gzip.open('../swissprot.fa.gz.tmp', 'rb') as swissprot_compressed:
with open('../swissprot.tmp', 'w') as swissprot_decompressed:
for line in swissprot_compressed:
swissprot_decompressed.write(line)
swissprot_compressed.close()
split = Splitter()
split.splitter('annotation_tobeblasted.tmp', 10)
# Set up BLAST search database:
if not os.path.isfile('../swissprot_database.phr'):
os.system('makeblastdb -in ../swissprot.tmp -dbtype prot -out ../swissprot_database')
# Do segmented BLASTs of files to be annotated versus SWISS-PROT:
annot_blast_files = [annot for annot in os.listdir('.') if annot.startswith("annotation_tobeblasted_")]
totalnum = len(annot_blast_files)
currentnum = 0
print 'Now searching sequences of each cluster against the SWISS-PROT database ...'
for file in annot_blast_files:
currentnum += 1
suffix = file.split('_',-1)[2]
outname = 'annot_blastp_output_' + suffix + '.tmp'
swissprotsearch = NcbiblastpCommandline(cmd='blastp', query=file, db='../swissprot_database', out=outname, outfmt='"6 qseqid sseqid qcovs evalue"', evalue='1e-5', max_target_seqs='1', max_hsps_per_subject='1', num_threads=settings.numthreads)
print 'Now searching %s of %s cluster sequence files ...' % (currentnum, totalnum)
swissprotsearch()
# Concatenate the segmented BLASTP output files
annot_blast_outfiles = [annotout for annotout in os.listdir('.') if annotout.startswith("annot_blastp_output_")]
with open('annot_blastp_output_all.tmp', 'w') as outfile:
for file in annot_blast_outfiles:
with open(file) as annotfile:
for line in annotfile:
line = line.strip()
outfile.write(line + '\n')
# Unique the concatenated output file
concatoutput_seen = set()
with open('annot_blastp_output_all_uniqued.tmp', 'w') as concatuniqued:
with open('annot_blastp_output_all.tmp', 'r') as concatoutput:
for line in concatoutput:
line = line.strip().split('\t')
if line[0] not in concatoutput_seen:
concatoutput_seen.add(line[0])
concatuniqued.write('\t'.join(line) + '\n')
### ANNOTATER STEP FOUR: Retrieve relevant SWISS-PROT flat files.
# Do this using several folders so that none holds > 10,000
# Alongside, create dictionary with folder number for each ID,
# so it can be retrieved for annotation file.
print 'Retrieving Swiss-PROT flat files ...'
# Create new folder and change working directory to that folder
def foldermaker(foldernumber):
if not os.path.exists('../flatfiles' + str(foldernumber)):
os.makedirs('../flatfiles' + str(foldernumber))
with open('annot_blastp_output_all_uniqued.tmp', 'r') as full_annotfile:
foldermaker(settings.flatfiles_foldernumber)
for line in full_annotfile:
line = line.strip().split('\t')
flatfile_id = line[1].split('|',-1)[1]
if flatfile_id not in settings.flatfiles_and_foldernums:
settings.flatfiles_added += 1
settings.flatfiles_and_foldernums[flatfile_id] = settings.flatfiles_foldernumber
working_foldernumber = str(settings.flatfiles_foldernumber)
if not os.path.exists('../flatfiles' + working_foldernumber + '/%s.uni' % flatfile_id):
try:
urllib.urlretrieve('http://www.uniprot.org/uniprot/' + flatfile_id + '.txt', '../flatfiles' + working_foldernumber + '/%s.uni' % flatfile_id)
except Exception:
continue
if settings.flatfiles_added > 9999:
settings.flatfiles_foldernumber += 1
foldermaker(settings.flatfiles_foldernumber)
settings.flatfiles_added = 0
### ANNOTATER STEP FIVE: Generate the annotation file.
# Create dictionary: 1:id1_PAC:15701388, 2:id1_PAC:15703879, etc.
# For each line of output_all, extract relevant information (GO, KEGG, PFAM, etc.) for each cluster rep
# When writing that line, retrieve associated key (cluster#) for that value (sequencename)
# Once all lines are written, sort by clusternum.
print 'Generating annotation file ...'
# get all the variables from the specific UniProt flatfile.
def flatfile_infograbber(current_flatfile):
flatfileID = 'No_ID'
description = 'No_Description'
KEGG = 'No_KEGG'
KEGG_KO = 'No_KEGGKO'
GO = 'No_GO'
Pfam = 'No_Pfam'
for line in open('../flatfiles' + str(current_flatfile_foldernum) + '/' + current_flatfile + '.uni', 'r'):
line = line.strip()
if line[0:3] == 'ID ':
flatfileID = line.split(' ')[3]
if line[0:2] == 'DE' and description == 'No_Description':
description = line.split('=')[1][:-1]
if line[0:2] == 'DR':
if line[5:9] == 'KEGG':
try:
KEGG = line.replace(' ','').split(';')[1]
except Exception:
continue
if line[5:7] == 'KO':
try:
KEGG_KO = line.replace(' ','').split(';')[1]
except Exception:
continue
if line[5:7] == 'GO':
line = line.strip().split(';')
if GO == 'No_GO':
GO = line[1]
else:
GO += '; %s' % line[1]
if line[5:9] == 'Pfam':
Pfam = line.replace(' ','').split(';')[1]
return '%s\t%s\t%s\t%s\t%s\t%s' % (flatfileID, description, KEGG, KEGG_KO, GO, Pfam)
# Cluster by cluster, identify the corresponding number, name,
# and then use above function to pull corresponding annotational info
masternum = 1
clusters_covered = []
with open(evalue + '_annotations_unsorted.tmp', 'w') as finalannotfile:
with open('annot_blastp_output_all_uniqued.tmp', 'r') as finalconcat:
finalannotfile.write('Cluster #\tRepresentative Sequence\tUni-PROT ID\tDescription\tKEGG\tKEGG KO\tGO\tPfam\n')
for line in finalconcat:
line = line.strip().split('\t')
current_seq = line[0]
current_flatfile = line[1].split('|',-1)[1]
current_flatfile_foldernum = settings.flatfiles_and_foldernums[current_flatfile]
current_clusternum = clusters_nameandnum[current_seq]
clusters_covered.append(current_clusternum)
# Find if identified flatfile exists; if so, retrieve information
if os.path.exists('../flatfiles' + str(current_flatfile_foldernum) + '/' + current_flatfile + '.uni'):
restofrow = flatfile_infograbber(current_flatfile)
else:
restofrow = 'NULL (no annotation found)'
# Write out line
finalannotfile.write(str(current_clusternum) + '\t' + current_seq.replace('@@', ',').replace('$$', ' ') + '\t' + restofrow + '\n')
num_clusters = sum(1 for line in open('annot_blastp_output_all_uniqued.tmp'))
for i in range(1, num_clusters):
if i not in clusters_covered:
finalannotfile.write(str(i) + '\t' + 'NULL (no annotation found)' + '\n')
# Sort final annotation file by cluster number and return to results folder
os.system('sort -n %s_annotations_unsorted.tmp > annotations_%s.txt' % (evalue, evalue))