From 8a5aff3c1d68f48759c09d13aadba3130ab1655b Mon Sep 17 00:00:00 2001
From: ecoron
Date: Thu, 7 Sep 2017 22:41:25 +0200
Subject: [PATCH 01/10] add example_chrome, improved chromedriver settings (#15)

---
 docs/conf.py                  |  2 +-
 docs/examples.rst             | 31 +++++++++++++++++++++++++++++++
 examples/example_chrome.py    | 18 ++++++++++++++++++
 scrapcore/scraper/selenium.py | 22 +++++++++++++++++++---
 setup.py                      |  2 +-
 5 files changed, 70 insertions(+), 5 deletions(-)
 create mode 100644 examples/example_chrome.py

diff --git a/docs/conf.py b/docs/conf.py
index 960d5e5..c172316 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -60,7 +60,7 @@
 # The short X.Y version.
 version = '0.9'
 # The full version, including alpha/beta/rc tags.
-release = '0.9.0'
+release = '0.9.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/examples.rst b/docs/examples.rst
index 623d7d7..d53a39d 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -59,6 +59,35 @@ custom path to the binary.
        if 'serp_title' in result and len(result['serp_title']) > 1:
            print(result['serp_title'])
 
+Using Chrome
+------------
+
+.. code-block:: bash
+
+    python examples\example_chrome.py
+
+It is possible to use Chrome, but we recommend PhantomJS, which is installed by default.
+To use Chrome you need to download the latest `chromedriver`_ and set the executable_path.
+
+.. code-block:: python
+
+    import pprint
+    import serpscrap
+
+    keywords = ['berlin']
+
+    config = serpscrap.Config()
+    config.set('sel_browser', 'chrome')
+    config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
+
+    scrap = serpscrap.SerpScrap()
+    scrap.init(config=config.get(), keywords=keywords)
+    results = scrap.run()
+
+    for result in results:
+        pprint.pprint(result)
+        print()
+
 Image search
 ------------
 
@@ -157,3 +186,5 @@ References
 .. _`examples`: https://github.com/ecoron/SerpScrap/tree/master/examples
 .. _`example_simple.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_simple.py
 .. _`example_related.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_related.py
+.. _`chromedriver`: https://sites.google.com/a/chromium.org/chromedriver/downloads
+
diff --git a/examples/example_chrome.py b/examples/example_chrome.py
new file mode 100644
index 0000000..6d88732
--- /dev/null
+++ b/examples/example_chrome.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import pprint
+import serpscrap
+
+keywords = ['berlin']
+
+config = serpscrap.Config()
+config.set('sel_browser', 'chrome')
+config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
+
+scrap = serpscrap.SerpScrap()
+scrap.init(config=config.get(), keywords=keywords)
+results = scrap.run()
+
+for result in results:
+    pprint.pprint(result)
+    print()
diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index a0c13cd..d4ebaf3 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -237,6 +237,7 @@ def _get_webdriver(self):
 
     def _get_Chrome(self):
         try:
+            chrome_ops = webdriver.ChromeOptions()
             if self.proxy:
                 chrome_ops = webdriver.ChromeOptions()
                 chrome_ops.add_argument(
@@ -250,10 +251,25 @@ def _get_Chrome(self):
                     executable_path=self.config['executebale_path'],
                     chrome_options=chrome_ops
                 )
-            else:
-                self.webdriver = webdriver.Chrome(
-                    executable_path=self.config['executable_path']
+
+            chrome_ops.add_argument('--no-sandbox')
+            chrome_ops.add_argument('--start-maximized')
+            chrome_ops.add_argument(
+                '--window-position={},{}'.format(
+                    randint(10, 30),
+                    randint(10, 30)
+                )
+            )
+            chrome_ops.add_argument(
+                '--window-size={},{}'.format(
+                    randint(800, 1024),
+                    randint(600, 900)
                 )
+            )
+            self.webdriver = webdriver.Chrome(
+                executable_path=self.config['executable_path'],
+                chrome_options=chrome_ops
+            )
             return True
         except WebDriverException:
             raise
diff --git a/setup.py b/setup.py
index 4df3ddb..935a60f 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 from setuptools import setup, find_packages
 
-version = '0.9.0'
+version = '0.9.1'
 
 setup(

From 23eb33f2be7003523f561d91a8724200c22e68a5 Mon Sep 17 00:00:00 2001
From: ecoron
Date: Sun, 10 Sep 2017 14:38:40 +0200
Subject: [PATCH 02/10] remove unused code

---
 scrapcore/scraper/selenium.py | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index d4ebaf3..57c07eb 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -60,7 +60,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'yahoo': '.compPagination .next',
         'baidu': '.n',
         'ask': '#paging div a.txt3.l_nu',
-        'blekko': '',
         'duckduckgo': '',
         'googleimg': '#pnnext',
         'baiduimg': '.n',
@@ -74,7 +73,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'baidu': (By.NAME, 'wd'),
         'duckduckgo': (By.NAME, 'q'),
         'ask': (By.NAME, 'q'),
-        'blekko': (By.NAME, 'q'),
         'google': (By.NAME, 'q'),
         'googleimg': (By.NAME, 'as_q'),
         'baiduimg': (By.NAME, 'word'),
@@ -102,7 +100,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'baidu': 'http://baidu.com/',
         'duckduckgo': 'https://duckduckgo.com/',
         'ask': 'http://ask.com/',
-        'blekko': 'http://blekko.com/',
     }
 
     image_search_locations = {
@@ -113,7 +110,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'baidu': 'http://image.baidu.com/',
         'duckduckgo': None,  # duckduckgo doesnt't support direct image search
         'ask': 'http://www.ask.com/pictures/',
-        'blekko': None,
         'googleimg': 'https://www.google.com/advanced_image_search',
         'baiduimg': 'http://image.baidu.com/',
     }
@@ -488,7 +484,7 @@ def _goto_next_page(self):
             element.click()
         except WebDriverException:
             # See http://stackoverflow.com/questions/11908249/debugging-element-is-not-clickable-at-point-error
-            # first move mouse to the next element, some times the element is not visibility, like blekko.com
+            # first move mouse to the next element, sometimes the element is not visible
             selector = self.next_page_selectors[self.search_engine_name]
             if selector:
                 try:
@@ -770,14 +766,6 @@ def wait_until_serp_loaded(self):
         super()._wait_until_search_input_field_appears()
 
 
-class BlekkoSelScrape(SelScrape):
-    def __init__(self, *args, **kwargs):
-        SelScrape.__init__(self, *args, **kwargs)
-
-    def _goto_next_page(self):
-        pass
-
-
 class AskSelScrape(SelScrape):
     def __init__(self, *args, **kwargs):
         SelScrape.__init__(self, *args, **kwargs)

From fd81ac680cef3b0b372f1eaedd36c4f914ff847e Mon Sep 17 00:00:00 2001
From: ecoron
Date: Sun, 10 Sep 2017 14:51:01 +0200
Subject: [PATCH 03/10] quit webdriver, also on exception #15

---
 scrapcore/scraper/selenium.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index 57c07eb..c6eb825 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -626,7 +626,9 @@ def search(self):
         if self.search_param_fields:
             wait_res = self._wait_until_search_param_fields_appears()
             if wait_res is False:
+                self.quit()
                 raise Exception('Waiting search param input fields time exceeds')
+
             for param, field in self.search_param_fields.items():
                 if field[0] == By.ID:
                     js_tpl = '''
@@ -647,7 +649,11 @@ def search(self):
                 self.search_input.send_keys(self.query + Keys.ENTER)
             except ElementNotVisibleException:
                 time.sleep(2)
-                self.search_input.send_keys(self.query + Keys.ENTER)
+                try:
+                    self.search_input.send_keys(self.query + Keys.ENTER)
+                except Exception:
+                    logger.error('send keys not possible, maybe the page could not be loaded')
+                    self.quit()
         except Exception:
             logger.error('send keys not possible')
             pass
@@ -719,8 +725,11 @@ def run(self):
         self.build_search()
         self.search()
 
-        if self.webdriver:
-            self.webdriver.quit()
+        self.quit()
+
+    def quit(self):
+        if self.webdriver:
+            self.webdriver.quit()
 
 
 """

From b96af2e1ad91ea4b2162ebd277a0218854b37abd Mon Sep 17 00:00:00 2001
From: ecoron
Date: Tue, 12 Sep 2017 21:13:27 +0200
Subject: [PATCH 04/10] handle connection errors #18

---
 scrapcore/scraper/selenium.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index c6eb825..b897f19 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -338,12 +338,16 @@ def _get_PhantomJS(self):
             logger.info('useragent: {}'.format(useragent))
             dcap = dict(DesiredCapabilities.PHANTOMJS)
             dcap["phantomjs.page.settings.userAgent"] = useragent
-            self.webdriver = webdriver.PhantomJS(
-                executable_path=self.config['executable_path'],
-                service_args=service_args,
-                desired_capabilities=dcap
-            )
-            return True
+            try:
+                self.webdriver = webdriver.PhantomJS(
+                    executable_path=self.config['executable_path'],
+                    service_args=service_args,
+                    desired_capabilities=dcap
+                )
+                return True
+            except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
+                logger.error(err)
+                return False
         except WebDriverException as e:
             logger.error(e)
             return False

From 094ff4f0f4fe6cb76c52049a2be41b730d054bd2 Mon Sep 17 00:00:00 2001
From: ecoron
Date: Tue, 12 Sep 2017 21:29:55 +0200
Subject: [PATCH 05/10] maybe avoids IntegrityError #17

---
 scrapcore/scraping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapcore/scraping.py b/scrapcore/scraping.py
index bb88625..07febfe 100644
--- a/scrapcore/scraping.py
+++ b/scrapcore/scraping.py
@@ -308,5 +308,5 @@ def update_proxy_status(self, status, ipinfo=None, online=True):
             proxy.status = status
             proxy.online = online
 
-            self.session.add(proxy)
+            self.session.merge(proxy, load=False)
             self.session.commit()

From 2f1a382d97ec25406531b48111770ad5fb352fca Mon Sep 17 00:00:00 2001
From: ecoron
Date: Tue, 12 Sep 2017 23:51:44 +0200
Subject: [PATCH 06/10] handle more connection errors #18

---
 scrapcore/scraper/selenium.py | 10 ++++++++--
 scrapcore/scraping.py         |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index b897f19..61cca3a 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -164,6 +164,7 @@ def proxy_check(self, proxy):
 
         try:
             self.webdriver.get(self.config.get('proxy_info_url'))
+            time.sleep(2)
             try:
                 text = re.search(
                     r'(\{.*?\})',
@@ -207,7 +208,10 @@ def _save_debug_screenshot(self):
                 str(self.page_number),
             )
         )
-        self.webdriver.get_screenshot_as_file(location)
+        try:
+            self.webdriver.get_screenshot_as_file(location)
+        except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
+            logger.error(err)
 
     def _set_xvfb_display(self):
         # TODO: should we check the format of the config?
@@ -244,7 +248,7 @@ def _get_Chrome(self):
                     )
                 )
                 self.webdriver = webdriver.Chrome(
-                    executable_path=self.config['executebale_path'],
+                    executable_path=self.config['executable_path'],
                     chrome_options=chrome_ops
                 )
 
@@ -678,6 +682,8 @@ def search(self):
             self._save_debug_screenshot()
             time.sleep(.5)
             self.html = self.webdriver.execute_script('return document.body.innerHTML;')
+        except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
+            logger.error(err)
         except WebDriverException:
             self.html = self.webdriver.page_source
 
diff --git a/scrapcore/scraping.py b/scrapcore/scraping.py
index 07febfe..8d2897e 100644
--- a/scrapcore/scraping.py
+++ b/scrapcore/scraping.py
@@ -308,5 +308,5 @@ def update_proxy_status(self, status, ipinfo=None, online=True):
             proxy.status = status
             proxy.online = online
 
-            self.session.merge(proxy, load=False)
+            self.session.merge(proxy, load=True)
             self.session.commit()

From a0ebd249a7d926d660c2907e777a8847c32129fd Mon Sep 17 00:00:00 2001
From: ecoron
Date: Tue, 12 Sep 2017 23:52:27 +0200
Subject: [PATCH 07/10] update proxy info url

---
 serpscrap/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/serpscrap/config.py b/serpscrap/config.py
index ef6f9e4..a66f91d 100644
--- a/serpscrap/config.py
+++ b/serpscrap/config.py
@@ -53,7 +53,7 @@ class Config():
         },
         'proxy_file': '',
         'proxy_check_url': 'http://canihazip.com/s',
-        'proxy_info_url': 'http://ipinfo.io/json',
+        'proxy_info_url': 'https://ipinfo.io/json',
         'stop_on_detection': True,
         'today': datetime.datetime.strftime(
             datetime.datetime.utcnow(),

From b08e83f0545c39ec740bf475968fd79fa8823b72 Mon Sep 17 00:00:00 2001
From: Ronald Schmidt
Date: Wed, 13 Sep 2017 17:11:45 +0200
Subject: [PATCH 08/10] shuffle proxy list, small fixes

---
 scrapcore/core.py             | 11 ++++++-----
 scrapcore/scraper/selenium.py |  4 ++--
 scrapcore/scraping.py         |  7 +++++--
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/scrapcore/core.py b/scrapcore/core.py
index 094fb3b..2a0ac02 100644
--- a/scrapcore/core.py
+++ b/scrapcore/core.py
@@ -4,6 +4,7 @@
 import queue
 import threading
 
+from random import shuffle
 from scrapcore.cachemanager import CacheManager
 from scrapcore.database import ScraperSearch
 from scrapcore.database import get_session, fixtures
@@ -78,6 +79,7 @@ def main(self, return_results=False, config=None):
             if not proxies:
                 raise Exception('''No proxies available. Turning down.''')
 
+        shuffle(proxies)
         # get a scoped sqlalchemy session
         session_cls = get_session(config, scoped=True)
 
@@ -122,10 +124,10 @@ def main(self, return_results=False, config=None):
         self.logger.info('''
             Going to scrape {num_keywords} keywords with {num_proxies} proxies
             by using {num_threads} threads.'''.format(
-            num_keywords=len(list(scrape_jobs)),
-            num_proxies=len(proxies),
-            num_threads=num_search_engines)
-        )
+                num_keywords=len(list(scrape_jobs)),
+                num_proxies=len(proxies),
+                num_threads=num_search_engines)
+            )
 
         progress_thread = None
 
@@ -139,7 +141,6 @@ def main(self, return_results=False, config=None):
 
         for search_engine in search_engines:
             for proxy in proxies:
-
                 for worker in range(num_workers):
                     num_worker += 1
                     workers.put(
diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index 61cca3a..d042d2b 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -570,7 +570,7 @@ def wait_until_serp_loaded(self):
         elif self.search_engine_name == 'ask':
             selector = '#paging .pgcsel .pg'
 
-        content = None
+        # content = None
         try:
             time.sleep(1)
             WebDriverWait(self.webdriver, 5).until(
@@ -582,7 +582,7 @@ def wait_until_serp_loaded(self):
         except TimeoutException:
             self._save_debug_screenshot()
             try:
-                content = self.webdriver.find_element_by_css_selector(selector).text
+                self.webdriver.find_element_by_css_selector(selector).text
             except NoSuchElementException:
                 logger.error('Skipp it, no such element - SeleniumSearchError')
                 raise SeleniumSearchError('Stop Scraping, seems we are blocked')
diff --git a/scrapcore/scraping.py b/scrapcore/scraping.py
index 8d2897e..71b0bc2 100644
--- a/scrapcore/scraping.py
+++ b/scrapcore/scraping.py
@@ -308,5 +308,8 @@ def update_proxy_status(self, status, ipinfo=None, online=True):
             proxy.status = status
             proxy.online = online
 
-            self.session.merge(proxy, load=True)
-            self.session.commit()
+            try:
+                self.session.merge(proxy, load=True)
+                self.session.commit()
+            except:
+                pass

From 740dce789659c31f2344c2be6753d0c6a766d17b Mon Sep 17 00:00:00 2001
From: Ronald Schmidt
Date: Thu, 14 Sep 2017 20:10:15 +0200
Subject: [PATCH 09/10] Update configuration.rst

---
 docs/configuration.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 71501a5..bc598ec 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -96,6 +96,9 @@ don't customize this setting, the default is used.
 Proxy file
 ----------
 
+This feature worksin versions <= 0.9.1 not stable, if you use more then one worker
+and have more then one proxy in your file.
+
 You can provide a list of proxys which should used for scraping the search engines.
 For this you have to create a proxy_file and to set the path to the file in the configuration.
 

From caf8cc37f892678183546b561f68fdbf517ab605 Mon Sep 17 00:00:00 2001
From: Ronald Schmidt
Date: Thu, 14 Sep 2017 20:11:20 +0200
Subject: [PATCH 10/10] Update configuration.rst

---
 docs/configuration.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index bc598ec..6e9e0a5 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -96,10 +96,10 @@ don't customize this setting, the default is used.
 Proxy file
 ----------
 
-This feature worksin versions <= 0.9.1 not stable, if you use more then one worker
+This feature is not stable in versions <= 0.9.1 if you use more than one worker
 and have more then one proxy in your file.
 
-You can provide a list of proxys which should used for scraping the search engines.
+You can provide a list of proxies which should be used for scraping the search engines.
 For this you have to create a proxy_file and to set the path to the file in the configuration.
 
 The proxy_file should look like this
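
A minimal sketch of wiring a proxy file into a scrape run, reusing the ``Config`` pattern
from ``examples/example_chrome.py`` above. The ``proxy_file`` key is taken from the defaults
shown in ``serpscrap/config.py``; the file path and the keyword are placeholders, and the
exact per-line proxy format should be taken from the full configuration docs rather than
from this sketch.

.. code-block:: python

    import pprint
    import serpscrap

    keywords = ['berlin']

    config = serpscrap.Config()
    # '/tmp/proxies.txt' is a placeholder path; the file lists one proxy per line
    # in the format described in the proxy_file section of docs/configuration.rst.
    config.set('proxy_file', '/tmp/proxies.txt')

    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    results = scrap.run()

    for result in results:
        pprint.pprint(result)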