#!/usr/bin/env python
#encoding:utf8
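# Crawl a single page, collect the same-origin links that carry a query
# string, and print each unique path together with its query parameter
# names and values.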
import requests
import sys
import urlparse
from bs4 import BeautifulSoup
from optparse import OptionParser
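# NOTE: this is a Python 2 script: it relies on print statements and the
# urlparse module (renamed urllib.parse in Python 3).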
def urlParser(url):
    # Split a URL into scheme, hostname, path and port (defaulting the port
    # from the scheme when it is not explicit).
    result = {}
    tempurl = urlparse.urlparse(url)
    result['scheme'] = tempurl.scheme
    result['hostname'] = tempurl.hostname
    result['path'] = tempurl.path
    if tempurl.port is None:
        result['port'] = 443 if tempurl.scheme == 'https' else 80
    else:
        result['port'] = tempurl.port
    return result
def getSchemeDomainPort(url):
    # Rebuild the base URL; the port is included only when explicit in the input.
    tempurl = urlparse.urlparse(url)
    if tempurl.port is None:
        return tempurl.scheme + "://" + tempurl.hostname + "/"
    return tempurl.scheme + "://" + tempurl.hostname + ":" + str(tempurl.port) + "/"
def getContent(url):
    # Fetch the page body; the timeout keeps the crawl from hanging on dead hosts.
    return requests.get(url, timeout=10).text
def getAllLinks(soup):
    # Matches any tag carrying an href attribute (<a>, <link>, ...); non-page
    # targets such as stylesheets are dropped later by the extension blacklist.
    return soup.findAll(href=True)
def getAllHrefs(links):
    # Collect the raw href values, deduplicated via a set.
    hrefs = set()
    for link in links:
        hrefs.add(link['href'])
    return hrefs
def judgeSameSource(url1, url2):
    # Two URLs are same-origin when scheme, hostname and port all match.
    _url1 = urlParser(url1)
    _url2 = urlParser(url2)
    return (_url1['scheme'] == _url2['scheme']
            and _url1['hostname'] == _url2['hostname']
            and _url1['port'] == _url2['port'])
def getFatherDomain(domain):
    # Drop the leftmost label: "a.example.com" -> "example.com".
    return ".".join(domain.split(".")[1:])
def judgeSameFatherDomain(url1, url2):
    # Same parent domain: scheme and port match and the hostnames share a parent.
    _url1 = urlParser(url1)
    _url2 = urlParser(url2)
    return (_url1['scheme'] == _url2['scheme']
            and getFatherDomain(_url1['hostname']) == getFatherDomain(_url2['hostname'])
            and _url1['port'] == _url2['port'])
def getAllSameFatherDomainLinks(links, url):
    # Keep only the links whose hostname shares a parent domain with the start url.
    result = set()
    for link in links:
        if judgeSameFatherDomain(link, url):
            result.add(link)
    return result
def getAllSameSourceLinks(links, url):
    # Keep only the links that share scheme, hostname and port with the start url.
    result = set()
    for link in links:
        if judgeSameSource(link, url):
            result.add(link)
    return result
def getCompleteLinks(links, domain):
    result = set()
    for link in links:
        if link.startswith("//"):
            # Protocol-relative URL ("//host/path"): this is not a path under
            # the current domain, so prepend a scheme instead of the base url.
            result.add("http:" + link)
            continue
        if not (link.startswith("http://") or link.startswith("https://")):
            result.add(domain + link)
        else:
            result.add(link)
    return result
def removeAllAnchors(links):
    result = set()
    for link in links:
        if link.startswith("#"):
            continue  # in-page anchor, skip it
        result.add(link)
    return result
def hrefsFilter(links, url):
    print "--------Removing all anchors...",
    links = removeAllAnchors(links)
    print "Success!"
    print "--------Fixing missing schemes...",
    links = getCompleteLinks(links, url)
    print "Success!"
    print "--------Filtering all same father domain links...",
    # links = getAllSameFatherDomainLinks(links, url)  # collect links across all sibling subdomains
    print "Success!"
    print "--------Filtering all same source links...",
    links = getAllSameSourceLinks(links, url)  # keep only links allowed by the same-origin policy
    print "Success!"
    print "--------Filtering all urls which are able to query...",
    links = getAllQueryLinks(links)  # keep only URLs that carry a query string
    print "Success!"
    print "--------Filtering urls such as : 'xxx.css?v=xxx'...",
    links = getAllTrueQueryLinks(links)  # blacklist static files like xxx.css?v=xxx
    print "Success!"
    print "--------Getting all paths and parameters...",
    links = analyseAllLinks(links)
    print "Success!"
    print "--------Merging the same paths and parameters...",
    links = mergeSameQuery(links)
    print "Success!"
    return links
def getAllQueryLinks(links):
    # A URL can be queried when it carries a "?" query string.
    tempLinks = set()
    for link in links:
        if "?" in link:
            tempLinks.add(link)
    return tempLinks
def getAllTrueQueryLinks(links):
    # Drop pseudo-queries on static files, e.g. cache busters like xxx.css?v=123.
    blackList = ['css', 'js', 'html', 'htm', 'shtml']
    tempLinks = set()
    for link in links:
        fileUrl = link.split("?")[0]
        SIGN = True
        for black in blackList:
            if fileUrl.endswith("." + black):
                SIGN = False
                break
        if SIGN:
            tempLinks.add(link)
    return tempLinks
def analyseAllLinks(links):
    # Split each URL into the queried file and its query parameters.
    result = []
    for link in links:
        templink, tempQuery = link.split("?", 1)
        tempResult = {}
        tempResult['url'] = templink
        queryResult = {}
        for pair in tempQuery.split("&"):
            # partition keeps bare keys like "?debug" from crashing the split
            key, _, value = pair.partition("=")
            queryResult[key] = value
        tempResult['value'] = queryResult
        result.append(tempResult)
    return result
def mergeSameQuery(links):
    # Merge entries that query the same file with the same parameter names;
    # the parameter values themselves may differ.
    results = []
    for link in links:
        SIGN = False
        for result in results:
            if (link['url'] == result['url']
                    and sorted(link['value'].keys()) == sorted(result['value'].keys())):
                SIGN = True
                break
        if SIGN:
            continue
        results.append({'url': link['url'], 'value': link['value']})
    return results
def formatUrl(url):
    # Normalise user input: default to http:// and end with a trailing slash
    # so relative links can be appended directly.
    if not (url.startswith("http://") or url.startswith("https://")):
        url = "http://" + url
    if not url.endswith("/"):
        url += "/"
    return url
def getQueryParameters(url):
    print "================Log================"
    url = formatUrl(url)
    print "Getting content of this url...",
    content = getContent(url)
    print "Success!"
    print "---------------------------"
    print "Creating document tree...",
    soup = BeautifulSoup(content, "html.parser")
    print "Success!"
    print "---------------------------"
    print "Finding all : <a href=''></a>...",
    links = getAllLinks(soup)
    print "Success!"
    print "---------------------------"
    print "Getting all of the href values...",
    hrefs = getAllHrefs(links)
    print "Success!"
    print "---------------------------"
    print "Filtering valuable urls..."
    links = hrefsFilter(hrefs, url)
    print "Success!"
    print "=============Result=============="
    for link in links:
        print link
    return links
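# The script imports optparse and sys, but the original entry point is not
# shown above. The block below is a minimal sketch of how the module could be
# driven from the command line; the -u/--url option name is an assumption,
# not part of the original script.
if __name__ == "__main__":
    optionParser = OptionParser(usage="usage: %prog -u <url>")
    optionParser.add_option("-u", "--url", dest="url",
                            help="target url to crawl for query parameters")
    (options, args) = optionParser.parse_args()
    if options.url is None:
        optionParser.print_help()
        sys.exit(1)
    getQueryParameters(options.url)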