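"""Extraction helpers for Weibo complaint pages (service.account.weibo.com):
given a Selenium WebDriver with a complaint page loaded, pull out the
reporters, their reports, the reported ("rumor") weibo, the official verdict,
and the viewers. Part of weibo-complaint-crawler (forked from
simoncos/weibo-complaint-crawler); written against the pre-Selenium-4
element API (find_element_by_xpath / find_elements_by_xpath)."""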
import traceback
import time
import re

from selenium.common.exceptions import StaleElementReferenceException

from conf import SLEEP_NEXT_REPORTER, SLEEP_NEXT_REPORTS_PAGE, REPORTER_MAX_ITER


def extractReporter(driver):
    """Extract the profile of the reporter currently shown in the left-hand
    reporter widget of the complaint page."""
    reporter = driver.find_element_by_xpath(
        '//div[@class="W_main_half_l"]//div[@class="user bg_blue2 clearfix"]')
    reporter_name = reporter.find_element_by_xpath('p[@class="mb W_f14"]/a[1]').text
    reporter_url = reporter.find_element_by_xpath('p[@class="mb W_f14"]/a[1]').get_attribute('href')
    reporter_img_url = reporter.find_element_by_xpath(
        '//*[@id="pl_service_common"]/div[2]/div[1]/div/div[2]/div/img').get_attribute('src')
    # Gender is encoded in the CSS class of the small icon next to the name.
    reporter_gender = reporter.find_element_by_xpath('p[@class="mb"]/img').get_attribute('class')
    reporter_location = reporter.find_element_by_xpath('p[@class="mb"]').text
    reporter_description = reporter.find_element_by_xpath('p[last()]').text
    return {'reporter_url': reporter_url,
            'reporter_name': reporter_name,
            'reporter_img_url': reporter_img_url,
            'reporter_gender': reporter_gender,
            'reporter_location': reporter_location,
            'reporter_description': reporter_description,
            }


def extractReporters(driver):
    """Cycle through the reporter widget and collect every distinct reporter.

    Returns (crawled_reporters, actual_reporter_count); the widget displays
    at most 20 reporters even when the page says more people reported.
    """
    # TODO: one reporter has multiple reports: http://service.account.weibo.com/show?rid=K1CaK6wBk7agg
    reporter_count_text = driver.find_element_by_xpath(
        '//*[@id="pl_service_common"]/div[2]/div[1]/div/div[1]/span[2]').text
    if re.match(r'\(共有', reporter_count_text):
        # '(共有N人' ('N people in total'): more reporters exist than the widget shows, which caps at 20
        reporter_count = 20
        actual_reporter_count = int(reporter_count_text.split(sep='共有')[1].split(sep='人')[0])
    else:
        # '共N人' ('N people'): every reporter is shown in the widget
        reporter_count = int(reporter_count_text.split(sep='共')[1].split(sep='人')[0])
        actual_reporter_count = reporter_count
    print('Reporter Count: {}, Actual: {}'.format(reporter_count, actual_reporter_count))
    if reporter_count == 1:
        crawled_reporters = [extractReporter(driver)]
    else:
        iteration = 0
        crawled_reporters = []
        crawled_reporter_names = []
        while True:
            if iteration > REPORTER_MAX_ITER:
                print(f'reporter iter exceeds {REPORTER_MAX_ITER}, break')
                break
            iteration += 1
            print('reporter iter: {}'.format(iteration))
            if len(crawled_reporter_names) == reporter_count:
                break
            # Without the try, re-reading the widget after a click can raise
            # StaleElementReferenceException ("element is not attached").
            try:
                reporter = driver.find_element_by_xpath(
                    '//div[@class="W_main_half_l"]//div[@class="user bg_blue2 clearfix"]')
                reporter_name = reporter.find_element_by_xpath('p[@class="mb W_f14"]/a[1]').text
                if reporter_name not in crawled_reporter_names:
                    crawled_reporters.append(extractReporter(driver))
                    crawled_reporter_names.append(reporter_name)
                else:
                    print('duplicated reporter')
                next_reporter = driver.find_element_by_xpath('//*[@id="pl_service_common"]/div[2]/div[1]/div/div[1]/a')
                next_reporter.click()
                time.sleep(SLEEP_NEXT_REPORTER)
            except Exception:
                print('Reporter iter exception: {}'.format(traceback.format_exc()))
                continue
    return crawled_reporters, actual_reporter_count


def extractReports(driver, crawled_reporters):
    """Walk the paginated report list and attach each report's time and text
    to the matching entry (matched by reporter_url) in crawled_reporters."""
    crawled_reports_count = 0
    while True:
        reports = driver.find_elements_by_xpath('//*[@id="pl_service_common"]/div[4]/div[1]/div/div/div[1]/div')
        for report in reports:
            report_time_text = report.find_element_by_xpath('p[@class="publisher"]').text
            # A bare '举报人:' ('reporter:') header means the statement was deleted.
            if report_time_text != '举报人:':
                # '举报人陈述时间:' = 'time of the reporter's statement'
                report_time = report_time_text.split('举报人陈述时间:')[1]
                report_text = report.find_element_by_xpath('div[@class="feed clearfix"]/div[@class="con"]').text
                reporter_url = report.find_element_by_xpath(
                    'div[@class="feed clearfix"]/div[@class="con"]/a').get_attribute('href')
                for r in crawled_reporters:
                    if r['reporter_url'] == reporter_url:
                        r['report_time'] = report_time
                        r['report_text'] = report_text
            else:
                print('Reporter deleted report')
        crawled_reports_count += len(reports)
        print('crawled reporters: {}, crawled reports: {}'.format(len(crawled_reporters), crawled_reports_count))
        try:
            next_page = driver.find_element_by_xpath(
                '//*[@id="pl_service_common"]/div[4]/div[1]/div/div/div[2]/div/a[@class="next"]')
            next_page.click()
            time.sleep(SLEEP_NEXT_REPORTS_PAGE)
            print('Go to next reports page')
        except Exception:  # no "next" link: last page reached
            break
    return crawled_reporters


def extractRumor(driver):
    """Extract the accused user ("rumorer") and, if it still exists, the
    reported weibo itself from the right-hand side of the complaint page."""
    rumorer = driver.find_element_by_xpath('//div[@class="W_main_half_r"]//div[@class="user bg_orange2 clearfix"]')
    rumorer_name = rumorer.find_element_by_xpath('p[@class="mb W_f14"]/a[1]').text
    rumorer_url = rumorer.find_element_by_xpath('p[@class="mb W_f14"]/a[1]').get_attribute('href')
    rumorer_gender = rumorer.find_element_by_xpath('p[@class="mb"]/img').get_attribute('class')
    rumorer_location = rumorer.find_element_by_xpath('p[@class="mb"]').text
    rumorer_description = rumorer.find_element_by_xpath('p[last()]').text
    crawled_rumor = {
        'rumorer_name': rumorer_name,
        'rumorer_url': rumorer_url,
        'rumorer_gender': rumorer_gender,
        'rumorer_location': rumorer_location,
        'rumorer_description': rumorer_description,
    }
    rumor = driver.find_element_by_xpath('//*[@id="pl_service_common"]/div[4]/div[2]/div/div/div/div')
    # TODO: handle deleted rumor weibo
    rumor_time_text = rumor.find_element_by_xpath('p[@class="publisher"]').text
    # A bare '被举报微博' ('reported weibo') header means the weibo was deleted.
    if rumor_time_text != '被举报微博':
        # '被举报微博 发布时间:' = 'reported weibo, published at'; '原文' = 'original post'
        rumor_time = rumor_time_text.split('被举报微博 发布时间:')[1].split(' | 原文')[0]
        try:
            rumor_url = rumor.find_element_by_xpath('p[@class="publisher"]/a').get_attribute('href')
        except Exception:
            print('Cannot find original url of rumor')
            rumor_url = ''
        rumor_text = rumor.find_element_by_xpath('div[@class="feed bg_orange2 clearfix"]/div[@class="con"]').text
        rumorer_url = rumor.find_element_by_xpath(
            'div[@class="feed bg_orange2 clearfix"]/div[@class="con"]/a').get_attribute('href')
        # The author link inside the weibo must point back to the same rumorer.
        assert rumorer_url == crawled_rumor['rumorer_url']
        crawled_rumor['rumor_url'] = rumor_url
        crawled_rumor['rumor_time'] = rumor_time
        crawled_rumor['rumor_text'] = rumor_text
    else:
        print('Rumor weibo deleted')
    return crawled_rumor


def extractOfficial(driver):
    """Extract the official Weibo verdict text on the complaint."""
    # TODO: a complaint can carry multiple official statements: http://service.account.weibo.com/show?rid=K1CaK6wBk7agg
    official_text = driver.find_element_by_xpath('//*[@id="pl_service_common"]/div[3]/div/div/p').text
    crawled_official = {
        'official_text': official_text
    }
    return crawled_official


def extractLooks(driver):
    """Extract the users listed in the 'looker' widget (apparently the users
    who viewed the complaint) as [looker_url, looker_name] pairs."""
    crawled_looks = []
    for look in driver.find_elements_by_xpath('//*[@id="pl_service_looker"]/div/ul/li'):
        looker_url = look.find_element_by_xpath('a').get_attribute('href')
        looker_name = look.find_element_by_xpath('a').get_attribute('title')
        crawled_looks.append([looker_url, looker_name])
    return crawled_looks
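

# Usage sketch: a minimal, hypothetical illustration of how these extractors
# might be chained for a single complaint page; the actual crawler entry point
# lives elsewhere in this repo. Assumes Selenium 3 (the find_element_by_* API
# used above), a local chromedriver on PATH, and the sample rid from the
# TODO links above.
if __name__ == '__main__':
    from selenium import webdriver

    driver = webdriver.Chrome()
    driver.get('http://service.account.weibo.com/show?rid=K1CaK6wBk7agg')
    reporters, actual_count = extractReporters(driver)
    reporters = extractReports(driver, reporters)
    rumor = extractRumor(driver)
    official = extractOfficial(driver)
    looks = extractLooks(driver)
    print('reporters: {}/{}, looks: {}'.format(len(reporters), actual_count, len(looks)))
    driver.quit()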