-
Notifications
You must be signed in to change notification settings - Fork 0
/
dbpedia_tweets.py
113 lines (86 loc) · 3.22 KB
/
dbpedia_tweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import json
import os

import spotlight
import tweepy

from clubfilehandler import ClubFileHandler
# Twitter API credentials.
# SECURITY: the literal fallbacks below are committed to source control and
# should be treated as exposed -- rotate them and supply fresh values via the
# environment. Each setting can be overridden by the named environment
# variable; the literals are kept only so existing setups keep working.
consumer_key = os.environ.get(
    'TWITTER_CONSUMER_KEY', 'fanorpDFidIhsaLNzQFeSOCOG')
consumer_secret = os.environ.get(
    'TWITTER_CONSUMER_SECRET',
    'i2ELm5Olc68yNFhIIZDNbeMEeqHB4coBde7PK2MELFbUgB6lU6')
access_token = os.environ.get(
    'TWITTER_ACCESS_TOKEN',
    '2887746393-2LkYKjiBCfo3uUaG34osy9YhM0BGCHHl7yPAc74')
access_token_secret = os.environ.get(
    'TWITTER_ACCESS_TOKEN_SECRET',
    'GL5szUVLnnFEIhOgEMNOLaBegNclSWmSfCU5ZZKjVGEZS')

# DBpedia Spotlight endpoints used by get_annotations / get_candidates.
annotations_host = 'http://model.dbpedia-spotlight.org/en/annotate'
candidates_host = 'http://model.dbpedia-spotlight.org/en/candidates'

# Values forwarded as the `confidence` and `support` arguments of the
# Spotlight calls.
confidence_level = 0.0
support_level = -1

# Entity-type filter forwarded to spotlight.annotate.
filters = {
    'types': "DBPedia:SoccerPlayer,DBPedia:SoccerClub"
}

# Short club names mapped to full club names. Values carry no surrounding
# whitespace and all end in "F.C." so they can be compared consistently.
nicknames = {
    "Heart FC": "Heart of Midlothian F.C.",
    "Spurs": "Tottenham Hotspur F.C.",
    "West Ham": "West Ham United F.C.",
    "Wigan": "Wigan Athletic F.C.",
    "Paris": "Paris Saint-Germain F.C.",
}
def get_twitter_api():
    """Create the OAuth handler used to authenticate against Twitter.

    Despite its name, this returns the ``tweepy.OAuthHandler`` itself, not a
    ``tweepy.API`` object; the caller wraps the result in ``tweepy.API``.
    The handler is built from the module-level consumer key/secret and then
    given the module-level access token/secret.
    """
    oauth_handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    oauth_handler.set_access_token(access_token, access_token_secret)
    return oauth_handler
def get_annotations(text):
    """Annotate *text* with DBpedia Spotlight.

    Calls ``spotlight.annotate`` against ``annotations_host`` using the
    module-level ``confidence_level``, ``support_level`` and ``filters``.

    Returns whatever ``spotlight.annotate`` returns, or the sentinel string
    ``"No annotations found"`` when it raises ``SpotlightException``.
    """
    try:
        return spotlight.annotate(
            annotations_host,
            text,
            # Pass the configured level as-is (same as get_candidates); a
            # negated value would be an invalid confidence for any non-zero
            # setting.
            confidence=confidence_level,
            support=support_level,
            filters=filters,
        )
    except spotlight.SpotlightException:
        return "No annotations found"
def get_candidates(text):
    """Ask DBpedia Spotlight for candidate entities in *text*.

    Queries ``candidates_host`` with the module-level ``confidence_level``
    and ``support_level``. Returns the result of ``spotlight.candidates``,
    or the sentinel string ``"No candidates"`` if the call raises
    ``SpotlightException``.
    """
    try:
        found = spotlight.candidates(
            candidates_host,
            text,
            confidence=confidence_level,
            support=support_level,
        )
    except spotlight.SpotlightException:
        return "No candidates"
    return found
def load_json_file(filename):
    """Parse the JSON document stored at *filename* and return the result.

    The file is opened in text mode for reading and closed again before the
    function returns.
    """
    with open(filename, 'r') as source:
        parsed = json.load(source)
    return parsed
def get_tweets():
    """Fetch every tweet listed in the module-level ``tweets_ids``.

    Reads the module globals ``api``, ``tweets_ids`` and ``no_of_tweets``.
    The first ``no_of_tweets`` IDs are sent to ``api.statuses_lookup`` in
    consecutive batches of at most 100, and the results of all calls are
    concatenated in order.

    Returns:
        A list with the combined results of all lookup calls (empty when
        there are no IDs).
    """
    # Batch size the original code was built around (lookups of 100 IDs).
    batch_size = 100
    list_of_tweets = []
    for start in range(0, no_of_tweets, batch_size):
        # Slice ends are exclusive, so ``start + batch_size`` yields a full
        # batch; capping at no_of_tweets keeps the final, shorter batch
        # complete, including the very last ID.
        stop = min(start + batch_size, no_of_tweets)
        list_of_tweets.extend(api.statuses_lookup(tweets_ids[start:stop]))
    return list_of_tweets
def format_tweet(tweet):
    """Return a cleaned-up copy of ``tweet.text``.

    Everything from the first occurrence of ``' http'`` onwards is dropped,
    and every ``'?'`` in the remainder is replaced by ``'.'``.
    """
    # Keep only what precedes the first " http" (a space followed by "http").
    leading, _, _ = tweet.text.partition(' http')
    cleaned = leading.replace('?', '.')
    # UTF-8 round trip retained from the original implementation.
    return cleaned.encode('utf-8').decode('utf-8')
# ---------------------------------------------------------------------------
# Script body: look up the stored tweets, annotate each one with DBpedia
# Spotlight and write a (player, club) URI pair per usable tweet to CSV.
# ---------------------------------------------------------------------------

# `api`, `tweets_ids` and `no_of_tweets` are read as globals by get_tweets().
api = tweepy.API(get_twitter_api())

tweets_ids = load_json_file('old_ids.json')
no_of_tweets = len(tweets_ids)
print(no_of_tweets)

tweets = get_tweets()
print(len(tweets))

fields = ['player', 'club']
handler = ClubFileHandler("player_club_uri.csv", fields)
writer = handler.get_csv_writer()
writer.writeheader()

no_of_valid_tweets = 0
for tweet in tweets:
    tweet_text = format_tweet(tweet)
    annotations = get_annotations(tweet_text)

    # get_annotations returns this sentinel string when Spotlight fails.
    if "No annotations found" in annotations:
        continue

    print(tweet_text)

    # Two annotations are needed: one for the player and one for the club.
    if len(annotations) <= 1:
        print("Not enough annotations found")
        continue

    # Word-order heuristic: when the text contains " to " or " trials " the
    # first annotation is taken as the player and the second as the club;
    # otherwise the two are swapped.
    if (" to " in tweet_text) or (" trials " in tweet_text):
        player_idx, club_idx = 0, 1
    else:
        player_idx, club_idx = 1, 0

    # NOTE(review): on Python 3 `.encode` yields bytes, which would be
    # printed/written as b'...' -- confirm the target interpreter.
    player = annotations[player_idx]['URI'].encode('utf-8')
    club = annotations[club_idx]['URI'].encode('utf-8')
    print(player)
    print(club)

    writer.writerow({'player': player, 'club': club})
    no_of_valid_tweets += 1

print(no_of_valid_tweets)