This repository has been archived by the owner on Sep 21, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
gre.py
79 lines (63 loc) · 2.03 KB
/
gre.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import re
import signal
from contextlib import contextmanager
# Path of the input text file; it is opened for reading in the __main__ section.
fileToRead = 'strs'
class TimeoutException(Exception):
    """Raised by the SIGALRM handler of time_limit when the time budget is used up."""
@contextmanager
def time_limit(seconds):
    """Context manager that aborts the wrapped block after ``seconds`` seconds.

    A SIGALRM handler is installed that raises ``TimeoutException("Timed out!")``
    and an alarm is scheduled with ``signal.alarm``. On exit (normal or via an
    exception) the alarm is cancelled and the handler that was active before
    entry is put back, so the timeout handler does not leak into later code.

    Args:
        seconds: Value handed to ``signal.alarm`` (whole seconds).

    Raises:
        TimeoutException: Inside the ``with`` body, when the alarm fires.

    Note:
        ``signal.alarm(0)`` on exit cancels whatever alarm is pending, so an
        alarm scheduled by an enclosing ``time_limit`` is cancelled as well.
    """
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # signal.signal returns the handler it replaces; keep it for restoration.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
        # None means the old handler was not installed from Python and cannot
        # be passed back to signal.signal, so only restore real handlers.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
def emailRegex(string):
    """Return the e-mail address found in ``string`` as a list.

    The pattern is anchored with ``^`` and ``$`` (no MULTILINE flag), so the
    whole string has to be an address and the result holds at most one item.
    Only lowercase letters and digits, with one optional ``.`` or ``_``, are
    accepted before the ``@``.

    Args:
        string: Text to test, typically a single whitespace-free token.

    Returns:
        A list with the matched address, or an empty list when nothing matches.
    """
    regex = r"^[a-z0-9]+[\._]?[a-z0-9]+[@]\w+[.]\w{2,3}$"
    # The pattern has no capture groups, so findall already yields the full
    # matched strings; indexing each result with [0] would keep only the
    # first character of every address.
    return re.findall(regex, string)
def httpRegex(string):
    """Collect every URL-like substring of ``string``.

    The case-insensitive pattern accepts text starting with ``http://``,
    ``https://``, ``www`` (optionally followed by up to three digits) and a
    dot, or a domain-looking prefix followed by a slash. Its first capture
    group spans the whole URL, and that group is what gets returned.

    Args:
        string: Text to scan.

    Returns:
        A list with one string per match, in order of appearance.
    """
    url_pattern = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    found = []
    for match in re.finditer(url_pattern, string):
        # Group 1 is the outermost group, i.e. the complete URL.
        found.append(match.group(1))
    return found
def writeFile(listData, filePath=None):
    """Write every item of ``listData`` to a file, one item per line.

    The file is created or truncated, and each item is followed by a newline.

    Args:
        listData: Iterable of strings to write.
        filePath: Destination path. When omitted, the module-level
            ``fileToWrite`` is used, which the caller must have set
            beforehand (the original calling convention).
    """
    if filePath is None:
        filePath = fileToWrite
    # The with-block closes the handle (and flushes the data) even on errors;
    # join builds the payload in one pass instead of repeated concatenation.
    with open(filePath, 'w') as out:
        out.write("".join(item + '\n' for item in listData))
if __name__ == "__main__":
    # Script entry point: scan every whitespace-separated token of the input
    # file for URLs and e-mail addresses, then write the unique hits of each
    # kind to its own output file.
    emails = []
    urls = []

    # The with-block makes sure the input handle is closed after reading.
    with open(fileToRead, 'r') as source:
        listLine = source.readlines()

    for counter, line in enumerate(listLine):
        # Progress trace; the line still carries its trailing newline.
        print(counter, line)
        # A separate name for the token keeps the outer loop variable intact.
        for token in line.split():
            # Each regex call gets a one-second budget; a token that exceeds
            # it is reported and skipped.
            try:
                with time_limit(1):
                    urls += httpRegex(token)
            except TimeoutException:
                print("Timed out!")
            try:
                with time_limit(1):
                    emails += emailRegex(token)
            except TimeoutException:
                print("Timed out!")

    # writeFile takes its destination from the module-level fileToWrite, so
    # the name is rebound before each call.
    fileToWrite = 'emailExtracted.txt'
    if emails:
        uniqEmail = set(emails)
        print(len(uniqEmail), "emails collected!")
        # Sorted so the output file is the same on every run.
        writeFile(sorted(uniqEmail))

    fileToWrite = 'urlsExtracted.txt'
    if urls:
        uniq = set(urls)
        print(len(uniq), "urls collected!")
        writeFile(sorted(uniq))