Fixes NikolaiT#149 plus some styles
Ronald Schmidt committed Feb 21, 2017
1 parent 77a7aa1 commit f51b71f
Showing 4 changed files with 48 additions and 54 deletions.
11 changes: 6 additions & 5 deletions GoogleScraper/core.py
@@ -27,6 +27,7 @@
class WrongConfigurationError(Exception):
pass


def id_for_keywords(keywords):
"""Determine a unique id for the keywords.
@@ -97,7 +98,8 @@ def start_python_console(namespace=None, noipython=False, banner=''):
except ImportError:
pass
else:
- import rlcompleter
+ pass
+ # import rlcompleter

readline.parse_and_bind("tab:complete")
code.interact(banner=banner, local=namespace)
@@ -202,7 +204,7 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
proxy_db = config.get('mysql_proxy_db', '')

# when no search engine is specified, use google
- search_engines = config.get('search_engines', ['google',])
+ search_engines = config.get('search_engines', ['google'])
if not isinstance(search_engines, list):
if search_engines == '*':
search_engines = config.get('supported_search_engines')
@@ -238,8 +240,7 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
if not (keyword or keywords) and not kwfile:
# Just print the help.
get_command_line(True)
- print('No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and '
- 'keyword with --keyword.')
+ print('No keywords to scrape for. Please provide either an keyword file (Option: --keyword-file) or specify and keyword with --keyword.')
return

cache_manager = CacheManager(config)
@@ -456,4 +457,4 @@ def main(return_results=False, parse_cmd_line=True, config_from_dict=None):
session.commit()

if return_results:
- return scraper_search
+ return session
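With the last hunk, main(return_results=True) now hands back the database session instead of the scraper_search object, so callers run their own queries. A rough usage sketch: the config keys 'keyword' and 'search_engines' appear in this diff, while the SERP model and the other details below are assumptions for illustration only.

from GoogleScraper.core import main

# Run a scrape programmatically instead of via the command line.
session = main(
    return_results=True,
    parse_cmd_line=False,
    config_from_dict={
        'keyword': 'some query',        # config key referenced in this diff
        'search_engines': ['google'],   # config key referenced in this diff
    },
)

# The caller now owns the SQLAlchemy session and can query it directly,
# e.g. against a SERP table (model name assumed, not shown in this commit):
# from GoogleScraper.database import SERP
# for serp in session.query(SERP).all():
#     print(serp)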
48 changes: 23 additions & 25 deletions GoogleScraper/parsing.py
@@ -30,12 +30,12 @@ class Parser():
"""Parses SERP pages.
Each search engine results page (SERP) has a similar layout:
The main search results are usually in a html container element (#main, .results, #leftSide).
There might be separate columns for other search results (like ads for example). Then each
result contains basically a link, a snippet and a description (usually some text on the
target site). It's really astonishing how similar other search engines are to Google.
Each child class (that can actual parse a concrete search engine results page) needs
to specify css selectors for the different search types (Like normal search, news search, video search, ...).
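The docstring above explains that each concrete parser contributes CSS selectors per search type. A minimal sketch of what such a subclass definition can look like; the class name is made up, the 'search_engine' attribute name is assumed from the existing parsers, and the selector values are copied from the GoogleParser image selectors shown later in this diff:

class ExampleParser(Parser):
    # Hypothetical subclass for illustration only.
    search_engine = 'example'

    image_search_selectors = {
        'results': {
            'de_ip': {
                'container': '#isr_mc',           # element wrapping all results
                'result_container': 'div.rg_di',  # one element per result
                'link': 'a.rg_l::attr(href)',     # attribute to read the link from
            },
        }
    }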
@@ -73,10 +73,10 @@ def __init__(self, config={}, html='', query=''):
"""Create new Parser instance and parse all information.
Args:
html: The raw html from the search engine search. If not provided, you can parse
the data later by calling parse(html) directly.
searchtype: The search type. By default "normal"
Raises:
Assertion error if the subclassed
specific parser cannot handle the the settings.
@@ -109,8 +109,8 @@ def __init__(self, config={}, html='', query=''):

def parse(self, html=None):
"""Public function to start parsing the search engine results.
Args:
html: The raw html data to extract the SERP entries from.
"""
if html:
@@ -137,7 +137,7 @@ def _parse_lxml(self, cleaner=None):

def _parse(self, cleaner=None):
"""Internal parse the dom according to the provided css selectors.
Raises: InvalidSearchTypeException if no css selectors for the searchtype could be found.
"""
self.num_results = 0
@@ -152,8 +152,7 @@ def _parse(self, cleaner=None):

self.num_results_for_query = self.first_match(num_results_selector, self.dom)
if not self.num_results_for_query:
- logger.debug('{}: Cannot parse num_results from serp page with selectors {}'.format(self.__class__.__name__,
- num_results_selector))
+ logger.debug('{}: Cannot parse num_results from serp page with selectors {}'.format(self.__class__.__name__, num_results_selector))

# get the current page we are at. Sometimes we search engines don't show this.
try:
@@ -180,7 +179,7 @@ def _parse(self, cleaner=None):

self.search_results[result_type] = []

- for selector_specific, selectors in selector_class.items():
+ for _, selectors in selector_class.items():

if 'result_container' in selectors and selectors['result_container']:
css = '{container} {result_container}'.format(**selectors)
@@ -272,14 +271,14 @@ def first_match(self, selectors, element):
match = self.advanced_css(selector, element=element)
if match:
return match
- except IndexError as e:
+ except IndexError:
pass

return False

def after_parsing(self):
"""Subclass specific behaviour after parsing happened.
Override in subclass to add search engine specific behaviour.
Commonly used to clean the results.
"""
@@ -312,7 +311,7 @@ def iter_serp_items(self):


"""
Here follow the different classes that provide CSS selectors
for different types of SERP pages of several common search engines.
Just look at them and add your own selectors in a new class if you
@@ -404,7 +403,7 @@ class GoogleParser(Parser):
image_search_selectors = {
'results': {
'de_ip': {
- 'container': 'li#isr_mc',
+ 'container': '#isr_mc',
'result_container': 'div.rg_di',
'link': 'a.rg_l::attr(href)'
},
@@ -422,12 +421,12 @@ def __init__(self, *args, **kwargs):

def after_parsing(self):
"""Clean the urls.
A typical scraped results looks like the following:
'/url?q=http://www.youtube.com/user/Apple&sa=U&ei=\
lntiVN7JDsTfPZCMgKAO&ved=0CFQQFjAO&usg=AFQjCNGkX65O-hKLmyq1FX9HQqbb9iYn9A'
Clean with a short regex.
"""
super().after_parsing()
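The docstring above shows the wrapped redirect links Google returns, e.g. '/url?q=http://www.youtube.com/user/Apple&sa=U&...'. One way to recover the target URL from such a string is sketched below; this regex is illustrative and not necessarily the exact one GoogleScraper applies:

import re

raw = '/url?q=http://www.youtube.com/user/Apple&sa=U&ei=lntiVN7JDsTfPZCMgKAO&ved=0CFQQFjAO&usg=AFQjCNGkX65O-hKLmyq1FX9HQqbb9iYn9A'

# Take everything between 'q=' and the next '&' (or the end of the string).
match = re.search(r'/url\?q=(?P<url>.*?)(?:&|$)', raw)
if match:
    print(match.group('url'))  # http://www.youtube.com/user/Apple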
@@ -543,11 +542,10 @@ def after_parsing(self):
try:
i = self.html.index(substr)
if i:
- self.num_results_for_query = re.search(r'— (.)*?"', self.html[i:i+len(self.query) + 150]).group()
+ self.num_results_for_query = re.search(r'— (.)*?"', self.html[i:i + len(self.query) + 150]).group()
except Exception as e:
logger.debug(str(e))


if self.searchtype == 'image':
for key, i in self.iter_serp_items():
for regex in (
@@ -626,7 +624,7 @@ class BingParser(Parser):
'ch_ip': {
'container': '#dg_c .imgres',
'result_container': '.dg_u',
- 'link': 'a.dv_i::attr(m)'
+ 'link': 'a::attr(m)'
},
}
}
@@ -1049,12 +1047,12 @@ def parse_serp(config, html=None, parser=None, scraper=None, search_engine=None,

if __name__ == '__main__':
"""Originally part of https://github.com/NikolaiT/GoogleScraper.
Only for testing purposes: May be called directly with an search engine
search url. For example:
python3 parsing.py 'http://yandex.ru/yandsearch?text=GoogleScraper&lr=178&csg=82%2C4317%2C20%2C20%2C0%2C0%2C0'
Please note: Using this module directly makes little sense, because requesting such urls
directly without imitating a real browser (which is done in my GoogleScraper module) makes
the search engines return crippled html, which makes it impossible to parse.
18 changes: 9 additions & 9 deletions GoogleScraper/search_engine_parameters.py
@@ -73,7 +73,7 @@
# current geographic location.
'safe': 'off', # Turns the adult content filter on or off
'rls': None,
- #Source of query with version of the client and language set. With firefox set to 'org.mozilla:en-US:official'
+ # Source of query with version of the client and language set. With firefox set to 'org.mozilla:en-US:official'
'sa': None,
# User search behavior parameter sa=N: User searched, sa=X: User clicked on related searches in the SERP
'source': None, # Google navigational parameter specifying where you came from, univ: universal search
@@ -117,8 +117,8 @@
'oe': 'UTF-8', # Sets the character encoding that is used to encode the results.
'ip': None,
# When queries are made using the HTTP protocol, the ip parameter contains the IP address of the user
- #who submitted the search query. You do not supply this parameter with the search request. The ip
- #parameter is returned in the XML search results. For example:
+ # who submitted the search query. You do not supply this parameter with the search request. The ip
+ # parameter is returned in the XML search results. For example:
'sitesearch': None,
# Limits search results to documents in the specified domain, host, or web directory. Has no effect if the q
# parameter is empty. This parameter has the same effect as the site special query term.
@@ -147,19 +147,19 @@
# ft are: 'i': filetype and 'e': -filetype
'as_lq': None,
# Specifies a URL, and causes search results to show pages that link to the that URL. This parameter has
- #the same effect as the link special query term (see “Back Links” on page 20). No other query terms can
- #be used when using this parameter.
+ # the same effect as the link special query term (see “Back Links” on page 20). No other query terms can
+ # be used when using this parameter.
'as_occt': None,
# Specifies where the search engine is to look for the query terms on the page: anywhere on the page, in
- #the title, or in the URL.
+ # the title, or in the URL.
'as_oq': None,
# Combines the specified terms to the search query in parameter q, with an OR operation. This parameter
# has the same effect as the OR special query term (see “Boolean OR Search” on page 20).
'as_q': None, # Adds the specified query terms to the query terms in parameter q.
'as_sitesearch': None,
# Limits search results to documents in the specified domain, host or web directory, or excludes results
- #from the specified location, depending on the value of as_dt. This parameter has the same effect as the
- #site or -site special query terms. It has no effect if the q parameter is empty.
+ # from the specified location, depending on the value of as_dt. This parameter has the same effect as the
+ # site or -site special query terms. It has no effect if the q parameter is empty.
'entqr': None, # This parameter sets the query expansion policy according to the following valid values:
# 0: None
# 1: Standard Uses only the search appliance’s synonym file.
@@ -182,7 +182,7 @@
"""
bing_search_params = {

'adlt': 'off'
}

"""
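The dictionaries in this file document what the individual query-string parameters mean. A quick sketch of how such a dictionary could be turned into a request URL follows; the base URL and the example values are assumptions, and GoogleScraper may assemble its requests differently:

from urllib.parse import urlencode

# Parameter names taken from the dictionaries above; values are only examples.
params = {
    'q': 'GoogleScraper',        # search phrase
    'oe': 'UTF-8',               # output encoding
    'safe': 'off',               # adult content filter
    'sitesearch': 'github.com',  # restrict results to one domain
}

# Drop unset (None) parameters and build the query string.
query = urlencode({k: v for k, v in params.items() if v is not None})
print('https://www.google.com/search?' + query)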
25 changes: 10 additions & 15 deletions GoogleScraper/selenium_mode.py
@@ -58,7 +58,7 @@ class SelScrape(SearchEngineScrape, threading.Thread):
'google': '#pnnext',
'yandex': '.pager__button_kind_next',
'bing': '.sb_pagN',
- 'yahoo': '#pg-next',
+ 'yahoo': '.compPagination .next',
'baidu': '.n',
'ask': '#paging div a.txt3.l_nu',
'blekko': '',
@@ -301,7 +301,7 @@ def handle_request_denied(self, status_code):

if self.config.get('manual_captcha_solving', False):
with self.captcha_lock:
- import tempfile
+ # import tempfile

tf = tempfile.NamedTemporaryFile('wb')
tf.write(self.webdriver.get_screenshot_as_png())
@@ -450,15 +450,18 @@ def _find_next_page_element(self):
try:
# wait until the next page link is clickable
WebDriverWait(self.webdriver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
- except (WebDriverException, TimeoutException) as e:
+ except (WebDriverException, TimeoutException):
self._save_debug_screenshot()
- raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e)))
+ # raise Exception('{}: Cannot locate next page element: {}'.format(self.name, str(e)))

return self.webdriver.find_element_by_css_selector(selector)

elif self.search_type == 'image':
self.page_down()
- return True
+ if self.search_engine_name == 'google':
+     return self.webdriver.find_element_by_css_selector('input._kvc')
+ else:
+     return True

def wait_until_serp_loaded(self):
"""
Expand Down Expand Up @@ -595,17 +598,9 @@ def page_down(self):
Used for next page in image search mode or when the
next results are obtained by scrolling down a page.
"""
- js = '''
- var w = window,
- d = document,
- e = d.documentElement,
- g = d.getElementsByTagName('body')[0],
- y = w.innerHeight|| e.clientHeight|| g.clientHeight;
- window.scrollBy(0,y);
- return y;
- '''
+ js = 'window.scrollTo(0,document.body.scrollHeight);'

time.sleep(5)
self.webdriver.execute_script(js)

def run(self):
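The page_down change above replaces the viewport-height scrollBy snippet with a single scrollTo to the bottom of the document. For image or infinite-scroll result pages a common follow-up is to keep scrolling until the page height stops growing; the sketch below shows that idea using only standard Selenium calls and is not part of this commit:

import time

def scroll_to_end(webdriver, pause=2.0, max_rounds=10):
    """Scroll until document.body.scrollHeight stops increasing."""
    last_height = webdriver.execute_script('return document.body.scrollHeight;')
    for _ in range(max_rounds):
        webdriver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(pause)  # give the page time to load the next batch of results
        new_height = webdriver.execute_script('return document.body.scrollHeight;')
        if new_height == last_height:
            break
        last_height = new_height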
