crawler.py
import json
import re
import time

import requests
from bs4 import BeautifulSoup
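
# Simple crawler for the Portuguese Wikipedia: starting from a few seed
# articles, it extracts each page's paragraph text and outgoing links,
# follows a handful of article links per seed, and writes the results to
# data.json in JSON Lines format (one JSON object per line).
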
def take(link, result, key):
    """Fetch one page, append its text to `result`, and return its links."""
    try:
        r = requests.get(link, timeout=10)
    except requests.exceptions.RequestException as e:
        print(f"Couldn't access {link}. Error: {e}")
        return [], [], []

    # Extract the page text
    soup = BeautifulSoup(r.text, 'html.parser')
    paragraphs = soup.find_all('p')
    all_text = ' '.join(p.get_text() for p in paragraphs)
    headline = soup.title.string if soup.title else ''
    # Store a dict; json.dumps handles quoting and escaping when the file
    # is written, instead of hand-building a JSON string.
    result.append({"key": key, "link": link, "headline": headline, "content": all_text})

    # Extract all links from the page
    wiki_links = []
    other_links = []
    for a_tag in soup.find_all('a', href=True):
        full_link = a_tag['href']
        if full_link.startswith('/'):
            full_link = 'https://pt.wikipedia.org' + full_link
            wiki_links.append(full_link)
        elif full_link.startswith('http'):
            other_links.append(full_link)

    # Separate plain article links from other Wikipedia links (the regex
    # rejects nested paths, parentheses, and namespaces such as "Especial:")
    title_links = []
    other_wiki_links = []
    for wiki_link in wiki_links:
        if re.match(r'https://pt\.wikipedia\.org/wiki/[^/:(]*$', wiki_link) and not wiki_link.endswith('.png'):
            if wiki_link not in title_links:
                title_links.append(wiki_link)
        else:
            other_wiki_links.append(wiki_link)
    return title_links, other_wiki_links, other_links
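
# Illustrative call (not part of the original flow): crawl a single article.
#   titles, other_wiki, external = take('https://pt.wikipedia.org/wiki/Gato', [], 'Gato')
# `titles` holds plain article URLs, `other_wiki` holds other Wikipedia URLs
# (namespaces, nested paths), and `external` holds links outside Wikipedia.
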
def traverse(title_links, result, visited_links, key):
    """Follow up to 5 unvisited article links, collecting their text into `result`."""
    print()  # blank line between seeds in the console output
    for _ in range(5):
        if title_links:
            link = title_links.pop(0)
            if link not in visited_links:
                print(f"calling link: {link}")
                time.sleep(1)  # be polite: pause between requests
                new_title_links, other_wiki_links, other_links = take(link, result, key)
                title_links.extend(new_title_links)
                visited_links.append(link)
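
# Driver: crawl each seed article and its neighbours, then dump everything
# collected in `result` to data.json.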
result = []
visited_links = []
keys = ["Brasil", "Bolo", "Gato"]
for key in keys:
    link = 'https://pt.wikipedia.org/wiki/' + key
    title_links, other_wiki_links, other_links = take(link, result, key)
    traverse(title_links, result, visited_links, key)

file_name = "data.json"
try:
    with open(file_name, "w", encoding='utf-8') as file:
        for item in result:
            file.write(json.dumps(item, ensure_ascii=False))  # one JSON object per line
            file.write('\n')
    print(f"JSON saved successfully at: {file_name}")
except Exception as e:
    print(f"Error saving JSON: {e}")

# Read the .json back
# try:
#     with open(file_name, "r", encoding='utf-8') as file:
#         json_content = [json.loads(line) for line in file]
#     print(json_content)
# except Exception as e:
#     print(f"Error reading JSON: {e}")
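
# Example of one line in data.json (illustrative values, assuming the "Gato" seed):
#   {"key": "Gato", "link": "https://pt.wikipedia.org/wiki/Gato",
#    "headline": "Gato – Wikipédia, a enciclopédia livre", "content": "..."}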