This repository has been archived by the owner on Dec 15, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiment.py
156 lines (129 loc) · 6.79 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 01 20:54:08 2016
@author: Pratik
"""
from math import log10
import numpy as np
import matplotlib.pyplot as plt
import re, math, collections, itertools, os
import nltk, nltk.classify.util
from nltk.metrics.scores import precision, recall, f_measure
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
POLARITY_DATA_DIR = os.path.join('polarityData', 'rt-polaritydata')
RT_POLARITY_POS_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-pos.txt')
RT_POLARITY_NEG_FILE = os.path.join(POLARITY_DATA_DIR, 'rt-polarity-neg.txt')
#this function takes a feature selection mechanism and returns its performance in a variety of metrics
def evaluate_features(feature_select):
    """Train a Naive Bayes classifier on the RT polarity data and print metrics.

    Trains on 3/4 of the positive and negative sentences and evaluates on the
    remaining 1/4, printing accuracy, per-class precision/recall/F1 and the
    ten most informative features.

    feature_select: callable mapping a list of word tokens to a feature dict
    (e.g. make_full_dict or best_word_features).
    """
    # NOTE(review): the original "global cnt; cnt += 1" was removed — 'cnt'
    # was never initialised anywhere in the file, so the first call raised
    # NameError; the counter was also never read.
    posFeatures = []
    negFeatures = []
    #http://stackoverflow.com/questions/367155/splitting-a-string-into-words-and-punctuation
    #breaks up the sentences into lists of individual words (as selected by the
    #input mechanism) and pairs each feature dict with its 'pos'/'neg' label
    with open(RT_POLARITY_POS_FILE, 'r') as posSentences:
        for i in posSentences:
            posWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            posFeatures.append([feature_select(posWords), 'pos'])
    with open(RT_POLARITY_NEG_FILE, 'r') as negSentences:
        for i in negSentences:
            negWords = re.findall(r"[\w']+|[.,!?;]", i.rstrip())
            negFeatures.append([feature_select(negWords), 'neg'])
    #selects 3/4 of the features to be used for training and 1/4 to be used for testing
    posCutoff = int(math.floor(len(posFeatures)*3/4))
    negCutoff = int(math.floor(len(negFeatures)*3/4))
    trainFeatures = posFeatures[:posCutoff] + negFeatures[:negCutoff]
    testFeatures = posFeatures[posCutoff:] + negFeatures[negCutoff:]
    #trains a Naive Bayes Classifier
    classifier = NaiveBayesClassifier.train(trainFeatures)
    #referenceSets holds the gold labels and testSets the predicted labels,
    #each as sets of test-sentence indices, so the nltk metrics can compare them
    referenceSets = collections.defaultdict(set)
    testSets = collections.defaultdict(set)
    for i, (features, label) in enumerate(testFeatures):
        referenceSets[label].add(i)
        predicted = classifier.classify(features)
        testSets[predicted].add(i)
    #prints metrics to show how well the feature selection did;
    #%-formatting keeps these statements valid on both Python 2 and 3
    print('train on %d instances, test on %d instances' % (len(trainFeatures), len(testFeatures)))
    print('accuracy: %s' % nltk.classify.util.accuracy(classifier, testFeatures))
    print('pos precision: %s' % precision(referenceSets['pos'], testSets['pos']))
    print('pos recall: %s' % recall(referenceSets['pos'], testSets['pos']))
    print('pos f1-score: %s' % f_measure(referenceSets['pos'], testSets['pos']))
    print('neg precision: %s' % precision(referenceSets['neg'], testSets['neg']))
    print('neg recall: %s' % recall(referenceSets['neg'], testSets['neg']))
    print('neg f1-score: %s' % f_measure(referenceSets['neg'], testSets['neg']))
    classifier.show_most_informative_features(10)
    print('=================================================')
#creates a feature selection mechanism that uses all words
def make_full_dict(words):
    """Feature extractor that marks every token in *words* as present (True)."""
    # One C-level call; equivalent to dict((word, True) for word in words).
    return dict.fromkeys(words, True)
#tries using all words as the feature selection mechanism
#(single-argument print() is valid on both Python 2 and 3; the stray
#trailing comma in the original call has been dropped)
print('using all words as features')
evaluate_features(make_full_dict)
#scores words based on chi-squared test to show information gain (http://streamhacker.com/2010/06/16/text-classification-sentiment-analysis-eliminate-low-information-features/)
def _read_word_tokens(path):
    """Return a flat list of word/punctuation tokens from every line of *path*."""
    words = []
    with open(path, 'r') as sentences:
        for line in sentences:
            # extend() directly instead of appending per-line lists and
            # flattening with itertools.chain afterwards
            words.extend(re.findall(r"[\w']+|[.,!?;]", line.rstrip()))
    return words

def create_word_scores():
    """Build a dict mapping each word to its chi-squared informativeness score.

    A word's score is the sum of its chi-squared association with the 'pos'
    label and with the 'neg' label; higher means more informative.
    """
    #creates flat lists of all positive and negative word tokens
    posWords = _read_word_tokens(RT_POLARITY_POS_FILE)
    negWords = _read_word_tokens(RT_POLARITY_NEG_FILE)
    #build frequency distibution of all words and then frequency distributions
    #of words within positive and negative labels (lower-cased for counting)
    word_fd = FreqDist()
    cond_word_fd = ConditionalFreqDist()
    for word in posWords:
        word_fd[word.lower()] += 1
        cond_word_fd['pos'][word.lower()] += 1
    for word in negWords:
        word_fd[word.lower()] += 1
        cond_word_fd['neg'][word.lower()] += 1
    #finds the number of positive and negative words, as well as the total number of words
    pos_word_count = cond_word_fd['pos'].N()
    neg_word_count = cond_word_fd['neg'].N()
    total_word_count = pos_word_count + neg_word_count
    #builds dictionary of word scores based on chi-squared test;
    #.items() replaces the Python-2-only iteritems()
    word_scores = {}
    for word, freq in word_fd.items():
        pos_score = BigramAssocMeasures.chi_sq(cond_word_fd['pos'][word], (freq, pos_word_count), total_word_count)
        neg_score = BigramAssocMeasures.chi_sq(cond_word_fd['neg'][word], (freq, neg_word_count), total_word_count)
        word_scores[word] = pos_score + neg_score
    return word_scores
#module level: computes the chi-squared informativeness scores once, for reuse
#by the feature-count experiments below
word_scores = create_word_scores()
#finds the best 'number' words based on word scores
def find_best_words(word_scores, number_i):
    """Return the set of the number_i highest-scoring words.

    word_scores: dict mapping word -> informativeness score.
    number_i: how many top-scoring words to keep.
    """
    # .items() and an ordinary lambda replace the Python-2-only iteritems()
    # and tuple-parameter lambda "lambda (w, s): s" (removed by PEP 3113),
    # which are syntax/API errors on Python 3.
    best_vals = sorted(word_scores.items(), key=lambda item: item[1], reverse=True)[:number_i]
    return set(word for word, score in best_vals)
#creates feature selection mechanism that only uses best words
def best_word_features(words):
    """Feature extractor keeping only tokens present in the global best_words set."""
    features = {}
    for word in words:
        # best_words is rebound at module level before each evaluation run
        if word in best_words:
            features[word] = True
    return features
#numbers of features to select
numbers_to_test = [10, 100, 1000, 5000, 10000, 15000, 20000, 25000]
#tries the best_word_features mechanism with each of the numbers_to_test of features.
#NOTE(review): the original "global c; c += 1" was removed — 'global' is a
#no-op at module level and 'c' was never initialised, so it raised NameError.
for i in numbers_to_test:
    print('evaluating best %d word features' % i)
    best_words = find_best_words(word_scores, i)
    evaluate_features(best_word_features)
#plots recorded classifier accuracy against log10(feature count), plus a
#least-squares trend line
features = [10, 50, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 12500, 15000, 20000, 25000]
logFeatures = [log10(count) for count in features]
#accuracies previously measured for each feature count above
accuracy = [0.5746436609152288, 0.6406601650412603, 0.6822955738934734, 0.7291822955738935, 0.7659414853713429, 0.7861965491372843, 0.7963240810202551, 0.8304576144036009, 0.8510877719429858, 0.8465866466616654, 0.8465866466616654, 0.8465866466616654, 0.8469617404351087, 0.773068267067, 0.773068267067]
#degree-1 polynomial fit: m is the slope, b the intercept of the trend line
m, b = np.polyfit(logFeatures, accuracy, 1)
regression = [b + m * x for x in logFeatures]
plt.plot(logFeatures, accuracy)
plt.plot(logFeatures, regression)
plt.show()