-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl.py
91 lines (71 loc) · 2.28 KB
/
crawl.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import json
import requests
import time
import random
HERO_ENDPOINT = 'https://api.opendota.com/api/heroes?'
REG_MATCH_BATCH_ENDPOINT = 'https://api.opendota.com/api/publicMatches'
REG_MATCH_FILE = 'data/reg_matches.json'
HERO_PATH = 'data/heroes.json'
#### Functions #################################################################
def get_heroes():
    """Download the hero list from ``HERO_ENDPOINT`` into ``HERO_PATH``.

    The response body is written unchanged, replacing any previous
    contents of the file.

    Raises:
        requests.RequestException: if the request fails, times out, or
            the server answers with an HTTP error status.
    """
    # Fetch before opening the file: mode 'w' truncates on open, so a
    # failed request would otherwise leave an empty heroes file behind.
    response = requests.get(HERO_ENDPOINT, timeout=30)
    # Never replace a good heroes file with an HTTP error page.
    response.raise_for_status()
    with open(HERO_PATH, 'w') as f:
        f.write(response.text)
def load_heroes():
    """Read the file at ``HERO_PATH`` and return its parsed JSON content."""
    with open(HERO_PATH, 'r') as heroes_file:
        raw_text = heroes_file.read()
    # Surrounding whitespace is dropped before decoding.
    return json.loads(raw_text.strip())
def previous_progress_matches(infile):
    """Find the largest and smallest match ids already stored in *infile*.

    The file is read line by line; each line is expected to hold one JSON
    object with a ``match_id`` key. Blank lines and ``error`` marker lines
    are skipped. Lines that cannot be decoded, or that carry no usable
    ``match_id``, are printed and skipped.

    Args:
        infile: Path of the file to scan.

    Returns:
        A ``(max_id, min_id)`` tuple of ints, or ``(None, None)`` when the
        file does not exist or contains no usable match id.
    """
    try:
        f = open(infile, 'r')
    except FileNotFoundError:
        # Nothing has been written yet, so there is no progress to resume.
        return None, None

    newest = oldest = None
    with f:
        for line in f:
            # Strip first so a final 'error' line without a trailing
            # newline is recognised as well.
            record = line.strip()
            if not record or record == 'error':
                continue
            try:
                match_id = int(json.loads(record)['match_id'])
            except (ValueError, KeyError, TypeError):
                # Invalid JSON, a non-object value, or a missing/odd id.
                print(record)
                continue
            # Track the extremes directly instead of keeping every id.
            if newest is None or match_id > newest:
                newest = match_id
            if oldest is None or match_id < oldest:
                oldest = match_id
    return newest, oldest
def query_matches(endpoint, less_than_match_id, timeout=30):
    """Request one batch of matches from *endpoint* and decode the JSON body.

    Args:
        endpoint: URL to query.
        less_than_match_id: Value sent as the ``less_than_match_id`` query
            parameter, or ``None`` to query *endpoint* without it.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: if the request fails, times out, or
            the server answers with an HTTP error status.
        ValueError: if the response body is not valid JSON.
    """
    if less_than_match_id is None:
        request_string = endpoint
    else:
        request_string = endpoint +\
            '?less_than_match_id={}'.format(less_than_match_id)
    print(request_string)
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(request_string, timeout=timeout)
    # Surface HTTP errors as exceptions instead of returning the error
    # payload as if it were match data.
    response.raise_for_status()
    return json.loads(response.text)
def get_least_recent_match_id(list_of_matches):
    """Return the smallest ``match_id`` found in *list_of_matches*.

    Entries equal to the string ``'error'`` are skipped; every other entry
    is indexed with ``'match_id'``.

    Args:
        list_of_matches: Iterable of match records.

    Returns:
        The minimum match id, or an empty list when no entry supplied one
        (the empty-list result is kept for backward compatibility).

    Raises:
        KeyError, TypeError: if an entry other than ``'error'`` cannot be
            indexed with ``'match_id'``.
    """
    match_ids = [
        match['match_id']
        for match in list_of_matches
        if match != 'error'
    ]
    # `default` covers only the "no ids" case; other failures are no
    # longer hidden behind a bare except.
    return min(match_ids, default=[])
def write_matches(outfile, matches_list):
    """Append every item of *matches_list* to *outfile*, one JSON document per line."""
    with open(outfile, 'a') as sink:
        sink.writelines(json.dumps(match) + '\n' for match in matches_list)
#### Get heroes ################################################################
# Refresh the local hero list before crawling.
get_heroes()
#### Get matches ###############################################################
# Resume from the smallest match id already on disk (None if there is none).
most_recent_match, most_distant_match = previous_progress_matches(
    REG_MATCH_FILE
)
while True:
    try:
        results = query_matches(
            REG_MATCH_BATCH_ENDPOINT,
            most_distant_match,
        )
        # Pause between requests.
        time.sleep(1.5)
        write_matches(REG_MATCH_FILE, results)
        oldest_in_batch = get_least_recent_match_id(results)
    except Exception as error:
        # Catch Exception rather than everything, so Ctrl-C and SystemExit
        # still stop the crawl.
        print('Query failed: {!r}'.format(error))
        time.sleep(10)
        continue
    # Advance the cursor only when the batch actually held a match id;
    # otherwise keep the current position and ask again instead of sending
    # an empty placeholder as the next less_than_match_id.
    if oldest_in_batch:
        most_distant_match = oldest_in_batch
    print('Queried!')