# extractor.py
# Written by Alexander M. Ruch ([email protected])
# Co-authors: Reid Ralston, Benjamin Cornwell
# Last update: November 9, 2017
# Stage 1 deliverables:
# TODO: Extract Dataset A: actor/actress data
# TODO: use wikipedia.py to get actor/actress data
# TODO: Extract Dataset M: movie data
# TODO: Identify and extract Dataset O: other relevant data
# TODO: Extract race (by text or pics)
# TODO: Determine data structure(s)
# TODO: Create Joined Dataset: joined Datasets A, M, O
# Import data extraction packages
import requests
import json
from time import sleep
import get_data # https://github.com/scraperwiki/wikipedia-infobox-tool
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
# Import data structuring packages
import numpy as np
import datetime as dt
# Import output packages
import pickle
import os.path
'''PART 1: EXTRACT ACTOR/ACTRESS DATA AND GENERATE DATASET A '''
# Initialize dict for datasetA
datasetA = {}
errors = []
# Set links for main pages
male_su = 'Category:American_male_actors_who_committed_suicide'
fem_su = 'Category:American_actresses_who_committed_suicide'
male_nosu = 'Category:American_male_film_actors'
fem_nosu = 'Category:American_film_actresses'
namesdict = {
    male_su:{'male':1, 'suicide':1},
    fem_su:{'male':0, 'suicide':1},
    male_nosu:{'male':1, 'suicide':0},
    fem_nosu:{'male':0, 'suicide':0},
}
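
# For reference, a sketch of the record structure the code below builds (the
# title and pageid here are hypothetical, not real data): after Stage 1, each
# datasetA entry maps a Wikipedia page title to its attributes, e.g.
#   datasetA['Some Actor'] == {'pageid': 12345, 'male': 1, 'suicide': 0}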
def getactnames(namesdict=namesdict):
    '''Extracts actors'/actresses' names and Wiki ids. Input is a dict of dicts
    with Wiki category links, sex, and suicide status of actors/actresses.
    This is a modified example of www.mediawiki.org/wiki/API:Query#Sample_query.'''
    global datasetA  # datasetA is rebuilt (filtered) at the end of this function
    if os.path.isfile('data/actnames.pickle'):
        run = input('actnames.pickle detected. Continue to run getactnames()? (Y/N): ')
        if (run.lower() == 'y') or (run.lower() == 'yes'):
            pass
        else:
            return
    # L0: Loop through Wiki categories
    for group in namesdict:
        print('Begin requesting', group)
        api = 'https://en.wikipedia.org/w/api.php'
        params = {
            'action':'query',
            'list':'categorymembers',
            'cmtitle':group,
            'cmlimit':'max',
            'cmcontinue':'',
            'continue':'',
            'format':'json'
        }
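        # The parsing loop below assumes the usual shape of a MediaWiki
        # list=categorymembers JSON response (values shown are illustrative):
        #   {"batchcomplete": "",
        #    "continue": {"cmcontinue": "page|...", "continue": "-||"},
        #    "query": {"categorymembers": [
        #        {"pageid": 12345, "ns": 0, "title": "Some Actor"}, ...]}}
        # The 'continue' block is absent once the last page of results is reached.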
        headers = {'User-agent':'[email protected]', 'request-delay':'50ms'}
        timeout = 10
        call = -1
        # L1: Keep calling until all group data extracted (500 results per call)
        while True:
            call += 1
            print('Call request', call)
            sleep(0.05)  # 50 ms
            result = requests.get(api, params=params, headers=headers, timeout=timeout)
            result.raise_for_status()  # raises requests.HTTPError on a bad status code
            result = result.json()
            if 'error' in result:
                raise RuntimeError(result['error'])
            if 'warnings' in result:
                print(result['warnings'])
            if 'query' in result:
                member = -1  # initialize member index
                # L2: start looping through members in categorymembers query
                while member < len(result['query']['categorymembers']) - 1:
                    member += 1
                    title = result['query']['categorymembers'][member]['title']
                    pageid = result['query']['categorymembers'][member]['pageid']
                    datasetA[title] = {}  # add title:dict pair to datasetA
                    print('Get data for', title)
                    datasetA[title]['pageid'] = pageid
                    print(' pageid:', pageid)
                    if namesdict[group]['male'] == 1:
                        datasetA[title]['male'] = 1
                        print(' male')
                    if namesdict[group]['male'] == 0:
                        datasetA[title]['male'] = 0
                        print(' female')
                    if namesdict[group]['suicide'] == 1:
                        datasetA[title]['suicide'] = 1
                        print(' suicide')
                    if namesdict[group]['suicide'] == 0:
                        datasetA[title]['suicide'] = 0
                        print(' no suicide')
            if 'continue' not in result:
                break
            params['cmcontinue'] = result['continue']['cmcontinue']
            params['continue'] = result['continue']['continue']
        # Reset continue parameters
        print('Finished requests in group', group)
        params['cmcontinue'] = ''
        params['continue'] = ''
    datasetA = {k:v for k,v in datasetA.items() if 'Category' not in k}
    print('Finished all requests for all groups. Saving as data/actnames.pickle.')
    with open('data/actnames.pickle', 'wb') as f:
        pickle.dump(datasetA, f, pickle.HIGHEST_PROTOCOL)
    print('Save complete. Function finished.')
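
# A minimal usage sketch for getactnames() (an illustration, not part of the
# original pipeline; it assumes the data/ directory already exists, since the
# function writes data/actnames.pickle without creating the folder):
def _example_stage1_names():
    getactnames()
    print(len(datasetA), 'actor/actress pages collected')
    print(sum(v.get('suicide', 0) for v in datasetA.values()), 'coded as suicide')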
def getactinfobox(datasetA=datasetA):
    '''Extracts Wikipedia infobox data (born, died, cause of death, nationality,
    years active, spouse(s), etc.). NOTE: Must run getactnames() first to extract
    datasetA, a dict of dicts of actors'/actresses' titles and pageids.'''
    # Modified from github.com/scraperwiki/wikipedia-infobox-tool/get_data.py
    # Check datasetA
    if len(datasetA) == 0:
        print('NOTE: datasetA is empty.')
        if os.path.isfile('data/actnames.pickle'):
            get = input('actnames.pickle detected. Open it as datasetA? (Y/N): ')
            if (get.lower() == 'y') or (get.lower() == 'yes'):
                with open('data/actnames.pickle', 'rb') as f:
                    datasetA = pickle.load(f)
            else:
                raise RuntimeError('Must run getactnames() first to get datasetA.')
        else:
            raise RuntimeError('Must run getactnames() first to get datasetA.')
    # L0: Loop through acts and give their pageids to scrape_infobox()
    call = -1
    for act in datasetA:
        pageid = datasetA[act]['pageid']
        call += 1
        if call % 1000 == 0:
            print('Called', call, 'out of', len(datasetA), 'requests')
        # Begin extracting infobox biography data via Wiki API
        data = get_data.scrape_infobox(pageid)
        # Append info dict to datasetA[act] dict
        try:
            datasetA[act] = {**datasetA[act], **data}
        except Exception:
            # e.g. scrape_infobox() returned something that is not a dict
            errors.append([act, type(data)])
    print(errors)
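
# Illustrative helper (an assumption, not part of the original pipeline): print
# one merged record after getactinfobox() to see which infobox fields
# (born, died, cause of death, etc.) scrape_infobox() actually returned.
def _example_inspect_record(title):
    for field, value in sorted(datasetA.get(title, {}).items()):
        print(field, ':', value)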
def getacttitle(searchdic, pageid):
    for k_searchdict in searchdic:
        try:
            for k_actdict, v_actvar in searchdic[k_searchdict].items():
                if v_actvar == pageid:
                    return k_searchdict
        except:
            pass

def getactpageid(searchdic, title):
    for k_searchdict in searchdic:
        try:
            if k_searchdict == title:
                return searchdic[k_searchdict]['pageid']
        except:
            pass

def getactsuicide(searchdic, title):
    for k_searchdict in searchdic:
        try:
            if k_searchdict == title:
                return searchdic[k_searchdict]['suicide']
        except:
            pass
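
# Hedged usage sketch for the three lookup helpers above (the title and pageid
# shown are placeholders, not real data):
#   getactpageid(datasetA, 'Some Actor')   # -> pageid stored for that title
#   getacttitle(datasetA, 12345)           # -> 'Some Actor'
#   getactsuicide(datasetA, 'Some Actor')  # -> 1 or 0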
def cleanactdata(datasetA=datasetA):
    '''Edits datasetA to remove any incorrectly extracted element and to remove
    any string from title keys that is not part of one's name (eg, "(actor)",
    "(actress)", etc.).'''
    # Search for elements with 'Category' in the title key
    [print(m) for m in list(datasetA.keys()) if 'category' in m.lower()]
    # Append erroneously extracted element ids to a list
    # Print list of errors and use input to confirm popping in datasetA
    # Search for elements with non-name text in the title key (eg, '(actor)')
    [print(m) for m in list(datasetA.keys()) if '(' in m or ')' in m]
    # Edit title to remove non-name text so only name remains in title key
    # (see the illustrative sketch below)
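
# A possible sketch of the title-cleaning step cleanactdata() describes
# (assumption: any trailing parenthetical qualifier such as "(actor)" or
# "(actress)" can simply be dropped from the title key):
def _example_strip_qualifiers(dataset):
    return {re.sub(r'\s*\([^)]*\)$', '', title): info for title, info in dataset.items()}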
''' PART 2: EXTRACT MOVIE DATA AND GENERATE Dataset M '''
# Initialize dict for datasetM
datasetM = {}
''' PART 3: EXTRACT OTHER RELEVANT DATA AND GENERATE Dataset O '''
# Initialize dict for datasetO
datasetO = {}
''' PART 4: RESTRUCTURE DATA '''
#
''' PART 5: JOIN Datasets A, M, O '''
#
################################################################################
################################################################################
''' Extracts titles, Wiki links, sex, and suicide. Input wikies_su is a list
of Wikipedia category pages for film actors/actresses who committed suicide.
def getactsu(wikies_su=wikies_su):
    male = 2  # initialize sex indicator (1=male, 0=female)
    # L0: start group loop
    for wiki in wikies_su:
        male -= 1
        try:
            page = urlopen(wiki)
            print('Opening:', wiki)
        except:
            print('EXCEPTION: urlopen() failed. Processing ended on', wiki)
        soup = BeautifulSoup(page, 'html.parser')
        names_box = soup.find_all("div", class_="mw-category-group")
        box = -1  # initialize name group index
        # L1: start name group loop
        while box < len(names_box) - 1:
            box += 1
            print('Processing name box', str(box), 'of', str(len(names_box)-1))
            # L2: start within name loop
            for l in names_box[box].find_all('a'):
                title = l.text  # get title
                print('Get title:', title)
                pageid = linkroot + l.get('href')  # get pageid
                print(' Get pageid:', pageid)
                datasetA[title] = {}  # add title to datasetA
                datasetA[title]['pageid'] = pageid  # add pageid to title dict
                datasetA[title]['suicide'] = True  # add suicide to title dict
                if male == 1:
                    datasetA[title]['male'] = 'male'
                    print(' Get sex: male')
                if male == 0:
                    datasetA[title]['male'] = 'female'
                    print(' Get sex: female')
        sleep(1)
'''
''' Extracts titles, Wiki links, sex, and no suicide. Input wikies_nosu is a
list of lists with links to Wikipedia pages for film actors/actresses who
did not commit suicide.
def getactnosu(wikies_nosu=wikies_nosu):
    male = 2  # initialize sex indicator (1=male, 0=female): get list's lists
    # L0: start genderlist loop
    for genderlist in wikies_nosu:
        male -= 1
        print('Begin loop in', genderlist)
        # L1: start looping through link pages of genderlist: get list elements
        for link in genderlist:
            try:
                # Request the category listing page (HTML)
                sleep(0.05)
                print('Opening:', link)
                req = requests.get(link)
            except:
                print('EXCEPTION: requests() failed at', link)
            soup = BeautifulSoup(req.text, 'html.parser')
            names_box = soup.find_all("div", class_="mw-content-ltr")[2]  # get titles/links
            names_box = names_box.find_all('a')
            box = -1  # initialize name group index
            # L2: loop through name groups on link page
            while box < len(names_box) - 1:
                box += 1
                print('Processing name box', str(box), 'of', str(len(names_box)-1))
                # L3: start within name loop
                for l in names_box[box].find_all('a'):
                    title = l.text  # get title
                    print('Get title:', title)
                    pageid = linkroot + l.get('href')  # get pageid
                    print(' Get pageid:', pageid)
                    datasetA[title] = {}  # add title to datasetA
                    datasetA[title]['pageid'] = pageid  # add pageid to title dict
                    datasetA[title]['suicide'] = False  # add suicide to title dict
                    if male == 1:
                        datasetA[title]['male'] = 'male'
                        print(' Get sex: male')
                    if male == 0:
                        datasetA[title]['male'] = 'female'
                        print(' Get sex: female')
            # After processing all names_box, get next page link
            try:
                nextlink = soup.find_all("div", id="mw-pages")  # get page links
                nextlink = str(nextlink)[str(nextlink).rfind('/w/'):]  # slice nextlink front
                if "previous page" in nextlink:  # = next page inactive, last page
                    print('CONTINUE: Last page processed or next page not extracted')
                    print("'nextlink' =", nextlink, '. Check previous/next pages')
                    continue  # Move to L1 if no next page (L2 for loop expires)
                nextlink = nextlink[:nextlink.index('"')]  # slice nextlink end
                nextlink = linkroot + nextlink  # add nextlink to linkroot
                genderlist.append(nextlink)  # append nextlink to the L1 list
            # If no more next page links or genderlist elements:
            except:
                print('BREAK: Processing ended on the following page:')
                print(link)
                print('Failed command: soup.find_all("div", id="mw-pages")')
                break
'''