-
Notifications
You must be signed in to change notification settings - Fork 17
/
naivebayesclassifier.py
79 lines (66 loc) · 2.43 KB
/
naivebayesclassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
'''
Implementation of Naive Bayes Classifier for tweets
'''
import sys
import math
from classifier import Classifier
class NaiveBayesClassifier(Classifier):
    '''
    Naive Bayes classifier for tweets.

    All scores are computed as log probabilities, so products of
    probabilities become sums and the thresholds are applied as
    additive log offsets. Feature extraction and the per-feature /
    per-class probabilities come from helpers that are not defined in
    this class (getFeatures, weightedProb, probC).
    '''

    def __init__(self, fname, *args, **kargs):
        '''
        Forward every argument to Classifier.__init__ and install the
        default thresholds.
        '''
        Classifier.__init__(self, fname, *args, **kargs)

        # A threshold is sometimes trained during Bayesian classification
        # so that not too many 'documents' end up in one class.
        # Layout is [negative, positive]; 1.0 adds no bias because
        # log(1.0) == 0.
        self.thresholds = [1.0, 1.0]

    def setThresholds(self, neg=1.0, pos=1.0):
        '''
        Replace the [negative, positive] thresholds used by classify().
        '''
        self.thresholds = [neg, pos]

    def probTweetClass(self, text, c):
        '''
        Returns the (log) probability of a tweet, given a particular class
        P(tweet | class)

        This is the sum of log(weightedProb(feature, c)) over every
        feature extracted from the text; with no features the result
        is 0.
        '''
        log_terms = (math.log(self.weightedProb(feature, c))
                     for feature in self.getFeatures(text))
        return sum(log_terms)

    def probClassTweet(self, text, c):
        '''
        Returns the (log) probability of a class, given a particular tweet
        P(class | tweet) = P(tweet | class) x P(class) / P(tweet)

        P(tweet) is the same for every class, so it is left out; the
        returned score is only meaningful when compared across classes.
        '''
        log_likelihood = self.probTweetClass(text, c)
        log_prior = math.log(self.probC(c))
        return log_likelihood + log_prior

    def classify(self, text):
        '''
        Returns 0 (negative) if P(class=0 | tweet) > P(class=1 | tweet) * thresholds[0]
        Returns 1 (positive) if P(class=1 | tweet) > P(class=0 | tweet) * thresholds[1]
        Otherwise returns -1 (neutral)

        The comparison is carried out on log scores, so each threshold
        is added as log(threshold) rather than multiplied.
        '''
        negative_score = self.probClassTweet(text, 0)
        positive_score = self.probClassTweet(text, 1)

        # The negative test takes precedence; the positive threshold is
        # only consulted when the negative test fails.
        if negative_score > positive_score + math.log(self.thresholds[0]):
            return 0
        if positive_score > negative_score + math.log(self.thresholds[1]):
            return 1
        return -1

    def __repr__(self):
        '''
        One-line summary of the classifier's weight, n-gram setting and
        thresholds.
        '''
        details = (self.weight, self.numgrams, self.thresholds)
        return "Classifier info: (weight=%s, grams=%s, thresholds=%s)" % details
def main():
    '''
    Train a NaiveBayesClassifier on the bundled training file and,
    when exactly one command-line argument is supplied, print the
    predicted sentiment (Negative / Positive / Neutral) for that text.

    Output uses the parenthesised single-argument form of print, which
    produces the same text under Python 2 (print statement) and
    Python 3 (print function).
    '''
    # file to get training data from
    fromf = 'trainingandtestdata/training.csv'
    naive = NaiveBayesClassifier(fromf)
    naive.trainClassifier()

    # optionally, pass in some tweet text to classify
    if len(sys.argv) != 2:
        return

    # Blank separator line; print("") rather than print() because the
    # latter would print "()" under Python 2.
    print("")

    text = sys.argv[1]
    # classify() yields 0 or 1 for a decided class; anything else is
    # reported as Neutral.
    labels = {0: "Negative", 1: "Positive"}
    label = labels.get(naive.classify(text), "Neutral")
    print("'%s' predicted to be %s" % (text, label))
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()