-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_repos.py
106 lines (82 loc) · 2.54 KB
/
get_repos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import json
import time

import lxml.html
import requests
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from tqdm import tqdm
def get_repo(url):
    """Scrape a GitHub repository page and return its metadata.

    Parameters
    ----------
    url : str
        Repository path starting with '/', e.g. '/owner/name'.

    Returns
    -------
    dict
        Keys: 'url' (str), 'tags' (list of str), 'desc' (str or None),
        'stars' (int).
    """
    # Timeout so one stalled request can't hang the whole crawl.
    resp = requests.get('https://github.com{}'.format(url), timeout=30)
    html = lxml.html.fromstring(resp.content)
    # The second .social-count element holds the star count. It is rendered
    # with surrounding whitespace and thousands separators (e.g. ' 1,234 ');
    # the bare int() call used previously crashed on the commas.
    stars_text = html.cssselect('.social-count')[1].text
    stars = int(stars_text.strip().replace(',', ''))
    try:
        desc = html.cssselect('[itemprop=about]')[0].text.strip()
    except IndexError:
        # Repository has no description.
        desc = None
    tags = [t.text.strip() for t in html.cssselect('.topic-tag')]
    return {
        'url': url,
        'tags': tags,
        'desc': desc,
        'stars': stars
    }
# ---------------------------------------------------------------------------
# Phase 1: crawl each topic listing with a real browser. Topic pages load
# further results via an AJAX "Load more" button, so plain requests won't do.
# ---------------------------------------------------------------------------
repos = []
new_topics = set()
with open('topics.txt') as f:  # close the handle instead of leaking it
    topics = f.read().splitlines()
# Set copy for O(1) membership tests inside the per-repo tag loop;
# `topics` stays a list to preserve crawl order.
known_topics = set(topics)
driver = webdriver.Firefox()
seen = set()
try:
    bar = tqdm(topics)
    for topic in bar:
        bar.set_description(topic)
        topic_url = 'https://github.com/topics/{}'.format(topic)
        driver.get(topic_url)
        time.sleep(2)
        try:
            # Click "Load more" until the button disappears, which raises
            # NoSuchElementException and ends the loop.
            while True:
                # find_element_by_css_selector was removed in Selenium 4;
                # use the By locator API instead.
                driver.find_element(By.CSS_SELECTOR, '.ajax-pagination-btn').click()
                time.sleep(5)
        except NoSuchElementException:
            pass
        html = lxml.html.fromstring(driver.page_source)
        for article in html.cssselect('article'):
            url = article.cssselect('h1 a:last-child')[0].attrib['href']
            # More than two '/' segments: probably an issue link, not a repo.
            if url.count('/') > 2: continue
            # Already encountered
            if url in seen: continue
            repo = get_repo(url)
            for t in repo['tags']:
                if t not in known_topics:
                    new_topics.add(t)
            repos.append(repo)
            seen.add(url)
finally:
    # Always shut the browser down, even if scraping fails midway.
    driver.quit()
# ---------------------------------------------------------------------------
# Phase 2: run a quoted GitHub search for every term and scrape each result
# page until an empty page signals the end of pagination.
# ---------------------------------------------------------------------------
with open('terms.txt') as f:  # close the handle instead of leaking it
    terms = f.read().splitlines()
bar = tqdm(terms)
for term in bar:
    bar.set_description(term)
    page = 1
    while True:
        resp = requests.get(
            'https://github.com/search',
            params={'q': '"{}"'.format(term), 'p': page},
            timeout=30)  # don't let one stalled request hang the crawl
        html = lxml.html.fromstring(resp.content)
        results = html.cssselect('.repo-list-item')
        # An empty result page means we've paged past the last result.
        if not results:
            break
        for result in results:
            url = result.cssselect('a')[0].attrib['href']
            # Already encountered
            if url in seen: continue
            repo = get_repo(url)
            for t in repo['tags']:
                if t not in topics:
                    new_topics.add(t)
            repos.append(repo)
            seen.add(url)
        page += 1
# Persist results. new_topics is a set, so join its elements in sorted order
# to make the output file deterministic run-to-run (set order is arbitrary);
# the previous list() wrapper was redundant.
print(len(repos), 'repos')
with open('data/repos.json', 'w') as f:
    json.dump(repos, f)
with open('data/new_topics.txt', 'w') as f:
    f.write('\n'.join(sorted(new_topics)))