-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlTools.py
88 lines (75 loc) · 2.65 KB
/
htmlTools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
def desinfectString(string : str):
    """Strip every character outside a fixed whitelist, then drop ".." pairs.

    Allowed characters are ASCII letters and digits, the space, the German
    umlauts (äüöÄÜÖ) and the symbols ``- _ # * = ( ) [ ] { } .``; everything
    else is deleted.  Afterwards each non-overlapping ".." pair is removed,
    so a run of dots shrinks to at most a single dot.
    """
    forbiddenChars = "[^a-zA-Z 0-9\\-_äüöÄÜÖ#*=\\(\\)\\[\\]\\{\\}\\.]+"
    whitelisted = re.sub(forbiddenChars, "", string)
    # Done after filtering so that pairs formed by deleting characters
    # between two dots (e.g. "./.") are caught as well.
    withoutDotPairs = whitelisted.replace("..", "")
    return withoutDotPairs
def getElement(html, beginning, searchFrom = 0):
    """Return the element that starts with ``beginning`` and where it ends.

    The first occurrence of ``beginning`` at or after ``searchFrom`` marks the
    start of the element.  From there the markup is scanned while keeping a
    nesting counter: every "<" that is not part of "</" opens a level, every
    "</" or "/>" closes one.  The element ends at the ">" belonging to the tag
    that brings the counter back to zero.

    Returns a tuple ``(endPos, element)``:
      * ``(-1, "")`` if ``beginning`` does not occur;
      * otherwise ``endPos`` is the index just past the element's final ">"
        and ``element`` is ``html[start:endPos]``.
    If the markup runs out before the element is closed (no further tags, or
    no terminating ">"), the element extends to the end of ``html`` and
    ``endPos`` is ``len(html)``.

    Note: this is a plain text scan, not an HTML parser.  Tags that are never
    closed (e.g. "<br>" without "/>") or comments count as opening tags and
    will throw the nesting counter off.
    """
    beginPos = html.find(beginning, searchFrom)
    if beginPos == -1:
        return -1, ""

    docEnd = len(html)
    cPos = beginPos
    balance = 1  # the tag at beginPos itself is already open
    while balance > 0:
        s = html.find("<", cPos + 1)    # next tag start (opening or closing)
        e1 = html.find("</", cPos + 1)  # next closing tag
        e2 = html.find("/>", cPos + 1)  # next end of a self-closing tag
        if s == e1 == e2 == -1:
            # No more markup: the element is unterminated, take the rest.
            return docEnd, html[beginPos:]
        # Treat "not found" as "beyond the end" so the comparisons below work.
        if s == -1:
            s = docEnd
        if e1 == -1:
            e1 = docEnd
        if e2 == -1:
            e2 = docEnd
        if e1 <= s and e1 < e2:  # next is a closing tag
            balance -= 1
            cPos = e1
        elif e2 < s and e2 < e1:  # next is the end of a self-closing tag
            balance -= 1
            cPos = e2 + 1  # points at the ">" of "/>"
        elif s < e1 and s < e2:  # next is an opening tag
            balance += 1
            cPos = s

    # cPos is inside the tag that closed the element; step past its ">".
    closingBracket = html.find(">", cPos)
    if closingBracket == -1:
        # Truncated closing tag: without this guard find() + 1 would yield 0
        # and the result would be an empty element ending at position 0.
        endPos = docEnd
    else:
        endPos = closingBracket + 1
    return endPos, html[beginPos:endPos]
def removeElement(html, beginning):
    """Return ``html`` with the first element starting with ``beginning`` cut out.

    The element's extent is determined by ``getElement``.  If ``beginning``
    does not occur, both positions are -1 and the two slices below simply
    reassemble the original string.
    """
    elementEnd, _element = getElement(html, beginning)
    elementStart = html.find(beginning)
    before = html[:elementStart]
    after = html[elementEnd:]
    return before + after
def htmlUnescape(html):
    """Replace the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` by ``<``, ``>`` and ``&``.

    Only these three entities are handled; all other text is returned as is.
    ``&amp;`` is decoded last so that an escaped entity such as ``&amp;lt;``
    becomes the literal text ``&lt;`` instead of being unescaped twice.
    """
    entities = (
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&amp;", "&"),  # must stay last, see docstring
    )
    for entity, char in entities:
        html = html.replace(entity, char)
    return html
def parseSection(html):
    """Extract name and URL from the first ``<a href="..." class="">`` link.

    Returns ``(True, {"name": ..., "url": ...})`` for the first matching link,
    with the name stripped of surrounding whitespace and the URL passed
    through ``htmlUnescape``; returns ``(False, {})`` if no such link exists.
    """
    link = re.search('<a href="([^>]*)" class="">([^>]*)</a>', html)
    if link is None:
        return False, {}
    href, label = link.groups()
    return True, {
        "name": label.strip(),
        "url": htmlUnescape(href)
    }
def parseSectionEntry(html):
    """Extract URL, name and type from the first section-entry link.

    The expected markup is an ``<a class="" onclick="" href="...">`` link
    containing an ``<img .../>``, followed by a ``instancename`` span whose
    text is the name and a nested ``accesshide`` span whose text is the type.

    Returns ``(True, {"url": ..., "name": ..., "type": ...})`` for the first
    match (URL passed through ``htmlUnescape``, name and type stripped), or
    ``(False, {})`` if the markup does not match.
    """
    entry = re.search('<a class="" onclick="" href="([^>]*)"><img[^>]*/><span class="instancename">([^>]*)<span class="accesshide " >([^>]*)</span>', html)
    if entry is None:
        return False, {}
    href, label, kind = entry.groups()
    return True, {
        "url": htmlUnescape(href),
        "name": label.strip(),
        "type": kind.strip()
    }
def getFolderDetail(html):
    """Read the hidden ``id`` and ``sesskey`` form fields from ``html``.

    Looks for a hidden ``id`` input immediately followed by a hidden
    ``sesskey`` input.  Returns ``(True, {"id": ..., "sesskey": ...})`` for
    the first such pair, or ``(False, {})`` if none is present.
    """
    fields = re.search('<input type="hidden" name="id" value="([^>]*)" /><input type="hidden" name="sesskey" value="([^>]*)" />', html)
    if fields is None:
        return False, {}
    folderId, sessionKey = fields.groups()
    return True, {
        "id": folderId,
        "sesskey": sessionKey
    }
def getFileTreeFiles(html):
    """Collect every ``<a target="_blank" href="...">name</a>`` link in ``html``.

    Returns ``(True, files)`` where ``files`` holds one
    ``{"url": ..., "name": ...}`` dict per link, in document order, with the
    text ``&amp;forcedownload=1`` removed from each URL.  Returns
    ``(False, [])`` if there is no such link.
    """
    links = re.findall('<a target="_blank" href="([^>]*)">([^>]*)</a>', html)
    if not links:
        return False, []
    files = [
        {
            "url": href.replace("&amp;forcedownload=1", ""),
            "name": label
        }
        for href, label in links
    ]
    return True, files