forked from cplaisier/offYerBackV2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
pssm.py
253 lines (224 loc) · 10.2 KB
/
pssm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
#################################################################
# @Program: pssm.py #
# @Version: 1 #
# @Author: Chris Plaisier #
# @Sponsored by: #
# Nitin Baliga, ISB #
# Institute for Systems Biology #
# 1441 North 34th Street #
# Seattle, Washington 98103-8904 #
# (216) 732-2139 #
# @Also Sponsored by: #
# Luxembourg Systems Biology Grant #
# #
# If this program is used in your analysis please mention who #
# built it. Thanks. :-) #
# #
# Copyrighted by Chris Plaisier 12/4/2009 #
#################################################################
from math import log
# Libraries for plotting
#import numpy, corebio # http://numpy.scipy.org and http://code.google.com/p/corebio/
from numpy import array, float64, log10 # http://numpy.scipy.org
#from weblogolib import * # http://code.google.com/p/weblogo/
# A class designed to hold a position specific scoring matrix
# and be able to output this in many different formats
#
# Variables:
# pssmName - name
# eValue - the significance of the motif
# nsites - number of genes that have the motif
# genes - the genes that have the motif
# pssmMatrix - a matrix of x by 4 [A, C, G, T]
#
# Functions:
# readPssm(pssmFileName) - Reads in the pssm from the specified file, file name should be as absolute as necessary.
# getMatrix() - returns the pssm matrix as is
# getMemeFormatted() - returns the pssm matrix as a meme 3 formatted string
# colConsensus() - gets the consensus letter for a column of the pssm matrix
# getConsensusMotif() - repeatedly calls colConsensus to get the complete consensus motif
# getName() - return the name of the pssm "bicluster#_motif#<optional _3pUTR>"
# getNSites() - return the number of genes with the motif
# getEValue() - return the significance of the motif for the bicluster
# getGenes() - return the names of the genes that have the motif
#
class pssm:
# Initialize the pssm
def __init__(self, pssmFileName=None, biclusterName=None, nsites=None, eValue=None, pssm=None, genes=None, de_novo_method='meme'):
self.de_novo_method = de_novo_method
self.matches = [] # each entry should be a dictionary of {'factor':<factor_name>,'confidence':<confidence_value>}
self.permutedPValue = None
if not pssmFileName==None:
print biclusterName, pssmFileName, (((pssmFileName.split('.'))[0]).split('/'))[-1], de_novo_method
self.name = str(biclusterName)+'_'+(((pssmFileName.split('.'))[0]).split('/'))[-1]+'_'+de_novo_method
self.readPssm(pssmFileName)
else:
self.name = biclusterName
self.nsites = nsites
self.eValue = eValue
self.matrix = pssm
self.genes = genes
# Read in the PSSM matrix
def readPssm(self, pssmFileName):
pssmFile = open(pssmFileName,'r')
firstLine = pssmFile.readline().strip().split(',')
self.eValue = firstLine[0]
self.nsites = firstLine[1]
self.genes = []
for i in range(int(self.nsites)):
self.genes.append(pssmFile.readline().strip().split(',')[0])
self.matrix = []
self.matrix += [[float(i) for i in line.strip().split(',')] for line in pssmFile.readlines() if line]
pssmFile.close()
# Returns the name of the PSSM
def getName(self):
return self.name
# Sets the name of the PSSM
def setName(self,name):
self.name = name
# Returns the name of the PSSM
def getMethod(self):
return self.de_novo_method
# Returns the name of the PSSM
def setMethod(self, de_novo_method):
self.de_novo_method = de_novo_method
# Returns the E-value of the PSSM
def getEValue(self):
return self.eValue
# Returns the number of sites for the PSSM
def getNSites(self):
return self.nsites
# Returns the number of genes of the PSSM
def getNumGenes(self):
return len(self.genes)
# Returns the genes of the PSSM
def getGenes(self):
return self.genes
# Returns the matrix
def getMatrix(self):
return self.matrix
# Pads the meme nucleotide frequencies with zeros
def padMe(self,str1):
if len(str1)<8:
for i in range(8-len(str1)):
str1 += '0'
return str1
# Retunrs a log-odds value
def logOdds(self, p, f):
p = float(p)
f = float(f)
if p==float(0):
v1 = str(int(round(log(float(1E-300)/f,2),0)))
else:
v1 = str(int(round(log(p/f,2),0)))
if len(v1)<6:
for i in range(6-len(v1)):
v1 = ' ' + v1
return v1
# Returns a meme 3 formatted string (letter-probability matrix)
def getMemeFormatted(self,atFreq=0.25,cgFreq=0.25):
memeFormatted = 'MOTIF '+self.name+'\n'
memeFormatted += 'BL MOTIF '+self.name+' width=0 seqs=0\n'
if self.de_novo_method=='meme':
nsites = self.nsites
eValue = self.eValue
elif self.de_novo_method=='weeder':
nsites = len(self.nsites)
eValue = 0.05
memeFormatted += 'letter-probability matrix: alength= 4 w= '+str(len(self.matrix))+' nsites= '+str(nsites)+' E= '+str(eValue)
for i in self.matrix:
memeFormatted += '\n '+self.padMe(str(round(float(i[0]),6)))+' '+self.padMe(str(round(float(i[1]),6)))+' '+self.padMe(str(round(float(i[2]),6)))+' '+self.padMe(str(round(float(i[3]),6)))
return memeFormatted
# Returns a mast formatted string (log-odds matrix)
def getMastFormatted(self,atFreq=0.25,cgFreq=0.25):
mastFormatted = 'log-odds matrix: alength= 4 w= '+str(len(self.matrix))
for i in self.matrix:
mastFormatted += '\n '+self.logOdds(i[0],atFreq)+' '+self.logOdds(i[1],cgFreq)+' '+self.logOdds(i[2],cgFreq)+' '+self.logOdds(i[3],atFreq)
return mastFormatted
# Returns the consensus word for a motif
def getConsensusMotif(self, lim1=0.6, lim2=0.8, three=0):
consensus = ''
for i in range(len(self.matrix)):
consensus += self.colConsensus(self.matrix, i, lim1, lim2, three)
return consensus
# Returns the consensus letter for a motif column
def colConsensus(self, pssm, i, lim1, lim2, three):
two_base_l = ['Y','R','W','S','K','M']
three_base_l = ['V','H','D','B']
conLet = 'N'
if float(pssm[i][0])>=lim1:
conLet = 'A'
elif float(pssm[i][1])>=lim1:
conLet = 'C'
elif float(pssm[i][2])>=lim1:
conLet = 'G'
elif float(pssm[i][3])>=lim1:
conLet = 'T'
else:
two_base_c = [float(pssm[i][1])+float(pssm[i][3]), float(pssm[i][0])+float(pssm[i][2]), float(pssm[i][0])+float(pssm[i][3]), float(pssm[i][1])+float(pssm[i][2]), float(pssm[i][2])+float(pssm[i][3]), float(pssm[i][0])+float(pssm[i][1])]
three_base_c = [float(pssm[i][0])+float(pssm[i][1])+float(pssm[i][2]), float(pssm[i][0])+float(pssm[i][1])+float(pssm[i][3]), float(pssm[i][0])+float(pssm[i][2])+float(pssm[i][3]), float(pssm[i][1])+float(pssm[i][2])+float(pssm[i][3])]
pMax = 0
for k in range(0,6):
if two_base_c[k] > pMax:
pMax = two_base_c[k]
conLet = two_base_l[k]
if not pMax>lim2 and three==1:
for k in range(0,4):
if three_base_c[k] > pMax:
pMax = three_base_c[k]
conLet = three_base_l[k]
if not pMax>lim2:
conLet = 'N'
return conLet
# Plot a PSSM using weblogo
def plot(self, fileName):
dist = numpy.array( self.getMatrix(), numpy.float64 )
data = LogoData.from_counts(corebio.seq.unambiguous_dna_alphabet, dist*100)
options = LogoOptions()
options.color_scheme = colorscheme.nucleotide
format = LogoFormat(data, options)
fout = open(fileName, 'w')
png_formatter(data, format, fout)
fout.close()
# Add a match to a pssm
def addMatch(self, factor, confidence):
if not hasattr(self,'matches'):
self.matches = []
self.matches.append({'factor':factor, 'confidence':confidence})
# Add a match to a pssm
def getMatches(self):
if hasattr(self, 'matches') and not self.matches==[]:
return self.matches
else:
return None
# Add an expanded match to a pssm
def addExpandedMatch(self, factor, seedFactor):
if not hasattr(self,'expandedMatches'):
self.expandedMatches = []
self.expandedMatches.append({'factor':factor, 'seedFactor':seedFactor})
# Get expanded matches for a pssm
def getExpandedMatches(self):
if hasattr(self, 'expandedMatches') and not self.expandedMatches==[]:
return self.expandedMatches
else:
return None
# Add a correlated match to a pssm
def addCorrelatedMatch(self, factor, rho, pValue):
if not hasattr(self,'correlatedMatches'):
self.correlatedMatches = []
self.correlatedMatches.append({'factor':factor, 'rho':rho, 'pValue':pValue})
# Get correlated matches for a pssm
def getCorrelatedMatches(self):
if hasattr(self, 'correlatedMatches') and not self.correlatedMatches==[]:
return self.correlatedMatches
else:
return None
# Add a match to a pssm
def setPermutedPValue(self, permutedPValue):
self.permutedPValue = permutedPValue
# Add a match to a pssm
def getPermutedPValue(self):
if hasattr(self,'permutedPValue'):
return self.permutedPValue
else:
return None