From 8a5aff3c1d68f48759c09d13aadba3130ab1655b Mon Sep 17 00:00:00 2001
From: ecoron
Date: Thu, 7 Sep 2017 22:41:25 +0200
Subject: [PATCH 01/10] add example_chrome, improved chromedriver settings (#15)

---
 docs/conf.py                  |  2 +-
 docs/examples.rst             | 31 +++++++++++++++++++++++++++++++
 examples/example_chrome.py    | 18 ++++++++++++++++++
 scrapcore/scraper/selenium.py | 22 +++++++++++++++++++---
 setup.py                      |  2 +-
 5 files changed, 70 insertions(+), 5 deletions(-)
 create mode 100644 examples/example_chrome.py

diff --git a/docs/conf.py b/docs/conf.py
index 960d5e5..c172316 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -60,7 +60,7 @@
 # The short X.Y version.
 version = '0.9'
 # The full version, including alpha/beta/rc tags.
-release = '0.9.0'
+release = '0.9.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/examples.rst b/docs/examples.rst
index 623d7d7..d53a39d 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -59,6 +59,35 @@ custom path to the binary.
        if 'serp_title' in result and len(result['serp_title']) > 1:
            print(result['serp_title'])
 
+Using Chrome
+------------
+
+.. code-block:: bash
+
+    python examples\example_chrome.py
+
+It is possible to use Chrome, but we recommend PhantomJS, which is installed by default.
+To use Chrome you need to download the latest `chromedriver`_ and set the executable_path.
+
+.. code-block:: python
+
+    import pprint
+    import serpscrap
+
+    keywords = ['berlin']
+
+    config = serpscrap.Config()
+    config.set('sel_browser', 'chrome')
+    config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
+
+    scrap = serpscrap.SerpScrap()
+    scrap.init(config=config.get(), keywords=keywords)
+    results = scrap.run()
+
+    for result in results:
+        pprint.pprint(result)
+        print()
+
 Image search
 ------------
 
@@ -157,3 +186,5 @@ References
 .. _`examples`: https://github.com/ecoron/SerpScrap/tree/master/examples
 .. _`example_simple.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_simple.py
 .. _`example_related.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_related.py
+.. _`chromedriver`: https://sites.google.com/a/chromium.org/chromedriver/downloads
+
diff --git a/examples/example_chrome.py b/examples/example_chrome.py
new file mode 100644
index 0000000..6d88732
--- /dev/null
+++ b/examples/example_chrome.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import pprint
+import serpscrap
+
+keywords = ['berlin']
+
+config = serpscrap.Config()
+config.set('sel_browser', 'chrome')
+config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
+
+scrap = serpscrap.SerpScrap()
+scrap.init(config=config.get(), keywords=keywords)
+results = scrap.run()
+
+for result in results:
+    pprint.pprint(result)
+    print()
diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index a0c13cd..d4ebaf3 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -237,6 +237,7 @@ def _get_webdriver(self):
 
     def _get_Chrome(self):
         try:
+            chrome_ops = webdriver.ChromeOptions()
             if self.proxy:
                 chrome_ops = webdriver.ChromeOptions()
                 chrome_ops.add_argument(
@@ -250,10 +251,25 @@ def _get_Chrome(self):
                     executable_path=self.config['executebale_path'],
                     chrome_options=chrome_ops
                 )
-            else:
-                self.webdriver = webdriver.Chrome(
-                    executable_path=self.config['executable_path']
+
+            chrome_ops.add_argument('--no-sandbox')
+            chrome_ops.add_argument('--start-maximized')
+            chrome_ops.add_argument(
+                '--window-position={},{}'.format(
+                    randint(10, 30),
+                    randint(10, 30)
+                )
+            )
+            chrome_ops.add_argument(
+                '--window-size={},{}'.format(
+                    randint(800, 1024),
+                    randint(600, 900)
                 )
+            )
+            self.webdriver = webdriver.Chrome(
+                executable_path=self.config['executable_path'],
+                chrome_options=chrome_ops
+            )
             return True
         except WebDriverException:
             raise
diff --git a/setup.py b/setup.py
index 4df3ddb..935a60f 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 from setuptools import setup, find_packages
 
-version = '0.9.0'
+version = '0.9.1'
 
 setup(

From 23eb33f2be7003523f561d91a8724200c22e68a5 Mon Sep 17 00:00:00 2001
From: ecoron
Date: Sun, 10 Sep 2017 14:38:40 +0200
Subject: [PATCH 02/10] remove unused code

---
 scrapcore/scraper/selenium.py | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index d4ebaf3..57c07eb 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -60,7 +60,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'yahoo': '.compPagination .next',
         'baidu': '.n',
         'ask': '#paging div a.txt3.l_nu',
-        'blekko': '',
         'duckduckgo': '',
         'googleimg': '#pnnext',
         'baiduimg': '.n',
@@ -74,7 +73,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'baidu': (By.NAME, 'wd'),
         'duckduckgo': (By.NAME, 'q'),
         'ask': (By.NAME, 'q'),
-        'blekko': (By.NAME, 'q'),
         'google': (By.NAME, 'q'),
         'googleimg': (By.NAME, 'as_q'),
         'baiduimg': (By.NAME, 'word'),
@@ -102,7 +100,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'baidu': 'http://baidu.com/',
         'duckduckgo': 'https://duckduckgo.com/',
         'ask': 'http://ask.com/',
-        'blekko': 'http://blekko.com/',
     }
 
     image_search_locations = {
@@ -113,7 +110,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'baidu': 'http://image.baidu.com/',
         'duckduckgo': None,  # duckduckgo doesnt't support direct image search
         'ask': 'http://www.ask.com/pictures/',
-        'blekko': None,
         'googleimg': 'https://www.google.com/advanced_image_search',
         'baiduimg': 'http://image.baidu.com/',
     }
@@ -488,7 +484,7 @@ def _goto_next_page(self):
             element.click()
         except WebDriverException:
             # See http://stackoverflow.com/questions/11908249/debugging-element-is-not-clickable-at-point-error
-            # first move mouse to the next element, some times the element is not visibility, like blekko.com
+            # first move mouse to the next element, sometimes the element is not visible
             selector = self.next_page_selectors[self.search_engine_name]
             if selector:
                 try:
@@ -770,14 +766,6 @@ def wait_until_serp_loaded(self):
         super()._wait_until_search_input_field_appears()
 
 
-class BlekkoSelScrape(SelScrape):
-    def __init__(self, *args, **kwargs):
-        SelScrape.__init__(self, *args, **kwargs)
-
-    def _goto_next_page(self):
-        pass
-
-
 class AskSelScrape(SelScrape):
     def __init__(self, *args, **kwargs):
         SelScrape.__init__(self, *args, **kwargs)

From fd81ac680cef3b0b372f1eaedd36c4f914ff847e Mon Sep 17 00:00:00 2001
From: ecoron
Date: Sun, 10 Sep 2017 14:51:01 +0200
Subject: [PATCH 03/10] quit webdriver, also on exception #15

---
 scrapcore/scraper/selenium.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index 57c07eb..c6eb825 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -626,7 +626,9 @@ def search(self):
         if self.search_param_fields:
             wait_res = self._wait_until_search_param_fields_appears()
             if wait_res is False:
+                self.quit()
                 raise Exception('Waiting search param input fields time exceeds')
+
             for param, field in self.search_param_fields.items():
                 if field[0] == By.ID:
                     js_tpl = '''
@@ -647,7 +649,11 @@ def search(self):
                 self.search_input.send_keys(self.query + Keys.ENTER)
             except ElementNotVisibleException:
                 time.sleep(2)
-                self.search_input.send_keys(self.query + Keys.ENTER)
+                try:
+                    self.search_input.send_keys(self.query + Keys.ENTER)
+                except Exception:
+                    logger.error('send keys not possible, maybe the page could not be loaded')
+                    self.quit()
         except Exception:
             logger.error('send keys not possible')
             pass
@@ -719,8 +725,11 @@ def run(self):
         self.build_search()
         self.search()
 
-        if self.webdriver:
-            self.webdriver.quit()
+        self.quit()
+
+    def quit(self):
+        if self.webdriver:
+            self.webdriver.quit()
 
 
 """

From b96af2e1ad91ea4b2162ebd277a0218854b37abd Mon Sep 17 00:00:00 2001
From: ecoron
Date: Tue, 12 Sep 2017 21:13:27 +0200
Subject: [PATCH 04/10] handle connection errors #18

---
 scrapcore/scraper/selenium.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index c6eb825..b897f19 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -338,12 +338,16 @@ def _get_PhantomJS(self):
             logger.info('useragent: {}'.format(useragent))
             dcap = dict(DesiredCapabilities.PHANTOMJS)
             dcap["phantomjs.page.settings.userAgent"] = useragent
-            self.webdriver = webdriver.PhantomJS(
-                executable_path=self.config['executable_path'],
-                service_args=service_args,
-                desired_capabilities=dcap
-            )
-            return True
+            try:
+                self.webdriver = webdriver.PhantomJS(
+                    executable_path=self.config['executable_path'],
+                    service_args=service_args,
+                    desired_capabilities=dcap
+                )
+                return True
+            except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
+                logger.error(err)
+                return False
         except WebDriverException as e:
             logger.error(e)
             return False

From 094ff4f0f4fe6cb76c52049a2be41b730d054bd2 Mon Sep 17 00:00:00 2001
From: ecoron
Date: Tue, 12 Sep 2017 21:29:55 +0200
Subject: [PATCH 05/10] maybe avoids IntegrityError #17

---
 scrapcore/scraping.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrapcore/scraping.py b/scrapcore/scraping.py
index bb88625..07febfe 100644
--- a/scrapcore/scraping.py
+++ b/scrapcore/scraping.py
@@ -308,5 +308,5 @@ def update_proxy_status(self, status, ipinfo=None, online=True):
             proxy.status = status
             proxy.online = online
 
-            self.session.add(proxy)
+            self.session.merge(proxy, load=False)
             self.session.commit()

From 2f1a382d97ec25406531b48111770ad5fb352fca Mon Sep 17 00:00:00 2001
From: ecoron
Date: Tue, 12 Sep 2017 23:51:44 +0200
Subject: [PATCH 06/10] handle more connection errors #18

---
 scrapcore/scraper/selenium.py | 10 ++++++++--
 scrapcore/scraping.py         |  2 +-
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index b897f19..61cca3a 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -164,6 +164,7 @@ def proxy_check(self, proxy):
 
         try:
             self.webdriver.get(self.config.get('proxy_info_url'))
+            time.sleep(2)
             try:
                 text = re.search(
                     r'(\{.*?\})',
@@ -207,7 +208,10 @@ def _save_debug_screenshot(self):
                 str(self.page_number),
             )
         )
-        self.webdriver.get_screenshot_as_file(location)
+        try:
+            self.webdriver.get_screenshot_as_file(location)
+        except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
+            logger.error(err)
 
     def _set_xvfb_display(self):
         # TODO: should we check the format of the config?
@@ -244,7 +248,7 @@ def _get_Chrome(self):
                     )
                 )
                 self.webdriver = webdriver.Chrome(
-                    executable_path=self.config['executebale_path'],
+                    executable_path=self.config['executable_path'],
                     chrome_options=chrome_ops
                 )
 
@@ -678,6 +682,8 @@ def search(self):
             self._save_debug_screenshot()
             time.sleep(.5)
             self.html = self.webdriver.execute_script('return document.body.innerHTML;')
+        except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
+            logger.error(err)
         except WebDriverException:
             self.html = self.webdriver.page_source
 
diff --git a/scrapcore/scraping.py b/scrapcore/scraping.py
index 07febfe..8d2897e 100644
--- a/scrapcore/scraping.py
+++ b/scrapcore/scraping.py
@@ -308,5 +308,5 @@ def update_proxy_status(self, status, ipinfo=None, online=True):
             proxy.status = status
             proxy.online = online
 
-            self.session.merge(proxy, load=False)
+            self.session.merge(proxy, load=True)
             self.session.commit()

From a0ebd249a7d926d660c2907e777a8847c32129fd Mon Sep 17 00:00:00 2001
From: ecoron
Date: Tue, 12 Sep 2017 23:52:27 +0200
Subject: [PATCH 07/10] update proxy info url

---
 serpscrap/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/serpscrap/config.py b/serpscrap/config.py
index ef6f9e4..a66f91d 100644
--- a/serpscrap/config.py
+++ b/serpscrap/config.py
@@ -53,7 +53,7 @@ class Config():
         },
         'proxy_file': '',
         'proxy_check_url': 'http://canihazip.com/s',
-        'proxy_info_url': 'http://ipinfo.io/json',
+        'proxy_info_url': 'https://ipinfo.io/json',
         'stop_on_detection': True,
         'today': datetime.datetime.strftime(
             datetime.datetime.utcnow(),

From b08e83f0545c39ec740bf475968fd79fa8823b72 Mon Sep 17 00:00:00 2001
From: Ronald Schmidt
Date: Wed, 13 Sep 2017 17:11:45 +0200
Subject: [PATCH 08/10] shuffle proxy list, small fixes

---
 scrapcore/core.py             | 11 ++++++-----
 scrapcore/scraper/selenium.py |  4 ++--
 scrapcore/scraping.py         |  7 +++++--
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/scrapcore/core.py b/scrapcore/core.py
index 094fb3b..2a0ac02 100644
--- a/scrapcore/core.py
+++ b/scrapcore/core.py
@@ -4,6 +4,7 @@
 import queue
 import threading
 
+from random import shuffle
 from scrapcore.cachemanager import CacheManager
 from scrapcore.database import ScraperSearch
 from scrapcore.database import get_session, fixtures
@@ -78,6 +79,7 @@ def main(self, return_results=False, config=None):
             if not proxies:
                 raise Exception('''No proxies available. Turning down.''')
 
+        shuffle(proxies)
         # get a scoped sqlalchemy session
         session_cls = get_session(config, scoped=True)
 
@@ -122,10 +124,10 @@ def main(self, return_results=False, config=None):
         self.logger.info('''
             Going to scrape {num_keywords} keywords with {num_proxies} proxies
             by using {num_threads} threads.'''.format(
-            num_keywords=len(list(scrape_jobs)),
-            num_proxies=len(proxies),
-            num_threads=num_search_engines)
-        )
+                num_keywords=len(list(scrape_jobs)),
+                num_proxies=len(proxies),
+                num_threads=num_search_engines)
+            )
 
         progress_thread = None
 
@@ -139,7 +141,6 @@ def main(self, return_results=False, config=None):
 
         for search_engine in search_engines:
             for proxy in proxies:
-
                 for worker in range(num_workers):
                     num_worker += 1
                     workers.put(
diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index 61cca3a..d042d2b 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -570,7 +570,7 @@ def wait_until_serp_loaded(self):
         elif self.search_engine_name == 'ask':
             selector = '#paging .pgcsel .pg'
 
-        content = None
+        # content = None
         try:
             time.sleep(1)
             WebDriverWait(self.webdriver, 5).until(
@@ -582,7 +582,7 @@ def wait_until_serp_loaded(self):
         except TimeoutException:
             self._save_debug_screenshot()
             try:
-                content = self.webdriver.find_element_by_css_selector(selector).text
+                self.webdriver.find_element_by_css_selector(selector).text
             except NoSuchElementException:
                 logger.error('Skipp it, no such element - SeleniumSearchError')
                 raise SeleniumSearchError('Stop Scraping, seems we are blocked')
diff --git a/scrapcore/scraping.py b/scrapcore/scraping.py
index 8d2897e..71b0bc2 100644
--- a/scrapcore/scraping.py
+++ b/scrapcore/scraping.py
@@ -308,5 +308,8 @@ def update_proxy_status(self, status, ipinfo=None, online=True):
             proxy.status = status
             proxy.online = online
 
-            self.session.merge(proxy, load=True)
-            self.session.commit()
+            try:
+                self.session.merge(proxy, load=True)
+                self.session.commit()
+            except:
+                pass

From 740dce789659c31f2344c2be6753d0c6a766d17b Mon Sep 17 00:00:00 2001
From: Ronald Schmidt
Date: Thu, 14 Sep 2017 20:10:15 +0200
Subject: [PATCH 09/10] Update configuration.rst

---
 docs/configuration.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index 71501a5..bc598ec 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -96,6 +96,9 @@ don't customize this setting, the default is used.
 Proxy file
 ----------
 
+This feature worksin versions <= 0.9.1 not stable, if you use more then one worker
+and have more then one proxy in your file.
+
 You can provide a list of proxys which should used for scraping the search engines.
 For this you have to create a proxy_file and to set the path to the file in the configuration.
 

From caf8cc37f892678183546b561f68fdbf517ab605 Mon Sep 17 00:00:00 2001
From: Ronald Schmidt
Date: Thu, 14 Sep 2017 20:11:20 +0200
Subject: [PATCH 10/10] Update configuration.rst

---
 docs/configuration.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/configuration.rst b/docs/configuration.rst
index bc598ec..6e9e0a5 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -96,10 +96,10 @@ don't customize this setting, the default is used.
 Proxy file
 ----------
 
-This feature worksin versions <= 0.9.1 not stable, if you use more then one worker
+This feature is not stable in versions <= 0.9.1 if you use more than one worker
 and have more then one proxy in your file.
 
-You can provide a list of proxys which should used for scraping the search engines.
+You can provide a list of proxies which should be used for scraping the search engines.
 For this you have to create a proxy_file and to set the path to the file in the configuration.
 
 The proxy_file should look like this
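
A minimal sketch of wiring a proxy file into a scrape run, reusing the ``Config`` pattern
from ``examples/example_chrome.py`` above. The ``proxy_file`` key is taken from the defaults
shown in ``serpscrap/config.py``; the file path and the keyword are placeholders, and the
exact per-line proxy format should be taken from the full configuration docs rather than
from this sketch.

.. code-block:: python

    import pprint
    import serpscrap

    keywords = ['berlin']

    config = serpscrap.Config()
    # '/tmp/proxies.txt' is a placeholder path; the file lists one proxy per line
    # in the format described in the proxy_file section of docs/configuration.rst.
    config.set('proxy_file', '/tmp/proxies.txt')

    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    results = scrap.run()

    for result in results:
        pprint.pprint(result)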