utils.py
import requests
from bs4 import BeautifulSoup

# The 'over18' cookie bypasses PTT's age-confirmation page; the other cookies
# come from a recorded browser session and are not strictly required.
cookies = {
    '__cfduid': 'd38fe948b503b113b6c9c8cf801c171481507175723',
    '_gat': '1',
    '_ga': 'GA1.2.135018015.1507175730',
    '_gid': 'GA1.2.1241147407.1507175730',
    'over18': '1',
}

# A desktop browser User-Agent so PTT serves the regular HTML pages.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}

def getPage_link(board, page_size):
    """Return the URLs of the newest `page_size` index pages of a PTT board."""
    host_link = 'https://www.ptt.cc/bbs/'
    link = host_link + board + '/index.html'
    r = requests.get(link, cookies=cookies, headers=headers)
    s = BeautifulSoup(r.text, 'lxml')
    # The second 'btn wide' button on an index page links to the previous page,
    # e.g. '/bbs/<board>/index1234.html'; parse its number to enumerate pages.
    buttons = s.find_all('a', class_='btn wide')
    prev_page = buttons[1]['href']
    prefix_len = len('/bbs/') + len(board) + len('/index')
    page_num = int(prev_page[prefix_len:-len('.html')])
    page_link = list()
    last_page_num = max(0, page_num - page_size)
    for i in range(page_num, last_page_num, -1):
        page_link.append(host_link + board + '/index' + str(i) + '.html')
    return page_link

def getSinglePage_link(link):
    """Return the full article URLs listed on one board index page."""
    r = requests.get(link, cookies=cookies, headers=headers)
    s = BeautifulSoup(r.text, 'lxml')
    entries = s.find_all('div', class_='r-ent')
    single_page_link = list()
    for entry in entries:
        # Deleted posts have no <a> tag inside the title block; skip them.
        a = entry.find('a')
        if a is None:
            continue
        single_page_link.append(a['href'])
    return ['https://www.ptt.cc' + href for href in single_page_link]

def getContent(link):
    """Return the plain text of a PTT article's main content block."""
    r = requests.get(link, cookies=cookies, headers=headers)
    s = BeautifulSoup(r.text, 'lxml')
    main = s.find('div', id='main-content')
    return main.text

def DownLoadFromBoard(Board, page_size, filename):
    """Collect article links from the newest `page_size` index pages of `Board`
    and write them to `filename`, one URL per line."""
    links = getPage_link(Board, page_size)
    with open(filename, 'w', encoding='utf-8') as f:
        for link in links:
            for article_link in getSinglePage_link(link):
                f.write(article_link + '\n')
            f.flush()

def DownLoadFromListOFLink(linkfile, docfile):
    """Read article URLs from `linkfile` and write each article's text to `docfile`."""
    cnt = 0
    with open(docfile, 'w', encoding='utf-8') as doc_fp, \
         open(linkfile, 'r', encoding='utf-8') as link_fp:
        for line in link_fp:
            url = line.strip()
            cnt += 1
            print("{}: {}".format(cnt, url))
            try:
                doc_fp.write(getContent(url) + '\n')
                doc_fp.flush()
            except (requests.RequestException, AttributeError):
                # Skip links that fail to download or that have no main content.
                continue

def String2File(s, filename):
    """Write a string to a file as UTF-8."""
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(s)


if __name__ == "__main__":
DownLoadFromListOFLink('LinkOfdata.txt', 'data.txt')
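    # A minimal end-to-end sketch (the board name 'Gossiping' below is an
    # assumption, not taken from this file): first dump article links from a
    # board, then fetch the article bodies from that link file.
    #
    #   DownLoadFromBoard('Gossiping', 10, 'LinkOfdata.txt')
    #   DownLoadFromListOFLink('LinkOfdata.txt', 'data.txt')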