diff --git a/docs/conf.py b/docs/conf.py
index 960d5e5..c172316 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -60,7 +60,7 @@
 # The short X.Y version.
 version = '0.9'
 # The full version, including alpha/beta/rc tags.
-release = '0.9.0'
+release = '0.9.1'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/configuration.rst b/docs/configuration.rst
index 71501a5..6e9e0a5 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -96,7 +96,10 @@ don't customize this setting, the default is used.
 Proxy file
 ----------
 
-You can provide a list of proxys which should used for scraping the search engines.
+This feature is not stable in versions <= 0.9.1 if you use more than one worker
+and more than one proxy in your file.
+
+You can provide a list of proxies which should be used for scraping the search engines.
 For this you have to create a proxy_file and to set the path to the file
 in the configuration. The proxy_file should look like this
 
diff --git a/docs/examples.rst b/docs/examples.rst
index 623d7d7..d53a39d 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -59,6 +59,35 @@ custom path to the binary.
     if 'serp_title' in result and len(result['serp_title']) > 1:
         print(result['serp_title'])
 
+Using Chrome
+------------
+
+.. code-block:: bash
+
+    python examples/example_chrome.py
+
+It is possible to use Chrome, but we recommend PhantomJS, which is installed by default.
+To use Chrome you need to download the latest `chromedriver`_ and set the executable_path.
+
+.. code-block:: python
+
+    import pprint
+    import serpscrap
+
+    keywords = ['berlin']
+
+    config = serpscrap.Config()
+    config.set('sel_browser', 'chrome')
+    config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
+
+    scrap = serpscrap.SerpScrap()
+    scrap.init(config=config.get(), keywords=keywords)
+    results = scrap.run()
+
+    for result in results:
+        pprint.pprint(result)
+        print()
+
 Image search
 ------------
 
@@ -157,3 +186,5 @@ References
 .. _`examples`: https://github.com/ecoron/SerpScrap/tree/master/examples
 .. _`example_simple.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_simple.py
 .. _`example_related.py`: https://github.com/ecoron/SerpScrap/blob/master/examples/example_related.py
+.. _`chromedriver`: https://sites.google.com/a/chromium.org/chromedriver/downloads
+
diff --git a/examples/example_chrome.py b/examples/example_chrome.py
new file mode 100644
index 0000000..6d88732
--- /dev/null
+++ b/examples/example_chrome.py
@@ -0,0 +1,18 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import pprint
+import serpscrap
+
+keywords = ['berlin']
+
+config = serpscrap.Config()
+config.set('sel_browser', 'chrome')
+config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
+
+scrap = serpscrap.SerpScrap()
+scrap.init(config=config.get(), keywords=keywords)
+results = scrap.run()
+
+for result in results:
+    pprint.pprint(result)
+    print()
diff --git a/scrapcore/core.py b/scrapcore/core.py
index 094fb3b..2a0ac02 100644
--- a/scrapcore/core.py
+++ b/scrapcore/core.py
@@ -4,6 +4,7 @@
 import queue
 import threading
 
+from random import shuffle
 from scrapcore.cachemanager import CacheManager
 from scrapcore.database import ScraperSearch
 from scrapcore.database import get_session, fixtures
@@ -78,6 +79,7 @@ def main(self, return_results=False, config=None):
         if not proxies:
             raise Exception('''No proxies available.
             Turning down.''')
+            shuffle(proxies)
 
             # get a scoped sqlalchemy session
             session_cls = get_session(config, scoped=True)
@@ -122,10 +124,10 @@ def main(self, return_results=False, config=None):
         self.logger.info('''
         Going to scrape {num_keywords} keywords with {num_proxies} proxies by
         using {num_threads} threads.'''.format(
-            num_keywords=len(list(scrape_jobs)),
-            num_proxies=len(proxies),
-            num_threads=num_search_engines)
-        )
+                num_keywords=len(list(scrape_jobs)),
+                num_proxies=len(proxies),
+                num_threads=num_search_engines)
+            )
 
         progress_thread = None
 
@@ -139,7 +141,6 @@ def main(self, return_results=False, config=None):
 
         for search_engine in search_engines:
             for proxy in proxies:
-
                 for worker in range(num_workers):
                     num_worker += 1
                     workers.put(
diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py
index a0c13cd..d042d2b 100644
--- a/scrapcore/scraper/selenium.py
+++ b/scrapcore/scraper/selenium.py
@@ -60,7 +60,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'yahoo': '.compPagination .next',
         'baidu': '.n',
         'ask': '#paging div a.txt3.l_nu',
-        'blekko': '',
         'duckduckgo': '',
         'googleimg': '#pnnext',
         'baiduimg': '.n',
@@ -74,7 +73,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'baidu': (By.NAME, 'wd'),
         'duckduckgo': (By.NAME, 'q'),
         'ask': (By.NAME, 'q'),
-        'blekko': (By.NAME, 'q'),
         'google': (By.NAME, 'q'),
         'googleimg': (By.NAME, 'as_q'),
         'baiduimg': (By.NAME, 'word'),
@@ -102,7 +100,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'baidu': 'http://baidu.com/',
         'duckduckgo': 'https://duckduckgo.com/',
         'ask': 'http://ask.com/',
-        'blekko': 'http://blekko.com/',
     }
 
     image_search_locations = {
@@ -113,7 +110,6 @@ class SelScrape(SearchEngineScrape, threading.Thread):
         'baidu': 'http://image.baidu.com/',
         'duckduckgo': None,  # duckduckgo doesnt't support direct image search
         'ask': 'http://www.ask.com/pictures/',
-        'blekko': None,
         'googleimg': 'https://www.google.com/advanced_image_search',
         'baiduimg': 'http://image.baidu.com/',
     }
@@ -168,6 +164,7 @@ def proxy_check(self, proxy):
 
         try:
             self.webdriver.get(self.config.get('proxy_info_url'))
+            time.sleep(2)
             try:
                 text = re.search(
                     r'(\{.*?\})',
@@ -211,7 +208,10 @@ def _save_debug_screenshot(self):
                 str(self.page_number),
             )
         )
-        self.webdriver.get_screenshot_as_file(location)
+        try:
+            self.webdriver.get_screenshot_as_file(location)
+        except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
+            logger.error(err)
 
     def _set_xvfb_display(self):
         # TODO: should we check the format of the config?
@@ -237,6 +237,7 @@ def _get_webdriver(self):
 
     def _get_Chrome(self):
         try:
+            chrome_ops = webdriver.ChromeOptions()
             if self.proxy:
                 chrome_ops = webdriver.ChromeOptions()
                 chrome_ops.add_argument(
@@ -247,13 +248,28 @@
                     )
                 )
                 self.webdriver = webdriver.Chrome(
-                    executable_path=self.config['executebale_path'],
+                    executable_path=self.config['executable_path'],
                     chrome_options=chrome_ops
                 )
-            else:
-                self.webdriver = webdriver.Chrome(
-                    executable_path=self.config['executable_path']
+
+            chrome_ops.add_argument('--no-sandbox')
+            chrome_ops.add_argument('--start-maximized')
+            chrome_ops.add_argument(
+                '--window-position={},{}'.format(
+                    randint(10, 30),
+                    randint(10, 30)
                 )
+            )
+            chrome_ops.add_argument(
+                '--window-size={},{}'.format(
+                    randint(800, 1024),
+                    randint(600, 900)
+                )
+            )
+            self.webdriver = webdriver.Chrome(
+                executable_path=self.config['executable_path'],
+                chrome_options=chrome_ops
+            )
             return True
         except WebDriverException:
             raise
@@ -326,12 +342,16 @@ def _get_PhantomJS(self):
             logger.info('useragent: {}'.format(useragent))
             dcap = dict(DesiredCapabilities.PHANTOMJS)
             dcap["phantomjs.page.settings.userAgent"] = useragent
-            self.webdriver = webdriver.PhantomJS(
-                executable_path=self.config['executable_path'],
-                service_args=service_args,
-                desired_capabilities=dcap
-            )
-            return True
+            try:
+                self.webdriver = webdriver.PhantomJS(
+                    executable_path=self.config['executable_path'],
+                    service_args=service_args,
+                    desired_capabilities=dcap
+                )
+                return True
+            except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
+                logger.error(err)
+                return False
         except WebDriverException as e:
             logger.error(e)
             return False
@@ -472,7 +492,7 @@ def _goto_next_page(self):
                 element.click()
             except WebDriverException:
                 # See http://stackoverflow.com/questions/11908249/debugging-element-is-not-clickable-at-point-error
-                # first move mouse to the next element, some times the element is not visibility, like blekko.com
+                # first move the mouse to the next element; sometimes the element is not visible
                 selector = self.next_page_selectors[self.search_engine_name]
                 if selector:
                     try:
@@ -550,7 +570,7 @@ def wait_until_serp_loaded(self):
         elif self.search_engine_name == 'ask':
             selector = '#paging .pgcsel .pg'
 
-        content = None
+        # content = None
         try:
             time.sleep(1)
             WebDriverWait(self.webdriver, 5).until(
@@ -562,7 +582,7 @@ def wait_until_serp_loaded(self):
         except TimeoutException:
             self._save_debug_screenshot()
             try:
-                content = self.webdriver.find_element_by_css_selector(selector).text
+                self.webdriver.find_element_by_css_selector(selector).text
             except NoSuchElementException:
                 logger.error('Skipp it, no such element - SeleniumSearchError')
                 raise SeleniumSearchError('Stop Scraping, seems we are blocked')
@@ -614,7 +634,9 @@ def search(self):
         if self.search_param_fields:
             wait_res = self._wait_until_search_param_fields_appears()
             if wait_res is False:
+                self.quit()
                 raise Exception('Waiting search param input fields time exceeds')
+
             for param, field in self.search_param_fields.items():
                 if field[0] == By.ID:
                     js_tpl = '''
@@ -635,7 +657,11 @@ def search(self):
             self.search_input.send_keys(self.query + Keys.ENTER)
         except ElementNotVisibleException:
             time.sleep(2)
-            self.search_input.send_keys(self.query + Keys.ENTER)
+            try:
+                self.search_input.send_keys(self.query + Keys.ENTER)
+            except Exception:
+                logger.error('send keys not possible, maybe the page could not be loaded')
+                self.quit()
         except Exception:
             logger.error('send keys not possible')
             pass
@@ -656,6 +682,8 @@ def search(self):
             self._save_debug_screenshot()
             time.sleep(.5)
             self.html = self.webdriver.execute_script('return document.body.innerHTML;')
+        except (ConnectionError, ConnectionRefusedError, ConnectionResetError) as err:
+            logger.error(err)
         except WebDriverException:
             self.html = self.webdriver.page_source
 
@@ -707,8 +735,11 @@ def run(self):
         self.build_search()
 
         self.search()
-        if self.webdriver:
-            self.webdriver.quit()
+        self.quit()
+
+    def quit(self):
+        if self.webdriver:
+            self.webdriver.quit()
 
 
 """
@@ -754,14 +785,6 @@ def wait_until_serp_loaded(self):
         super()._wait_until_search_input_field_appears()
 
 
-class BlekkoSelScrape(SelScrape):
-    def __init__(self, *args, **kwargs):
-        SelScrape.__init__(self, *args, **kwargs)
-
-    def _goto_next_page(self):
-        pass
-
-
 class AskSelScrape(SelScrape):
     def __init__(self, *args, **kwargs):
         SelScrape.__init__(self, *args, **kwargs)
diff --git a/scrapcore/scraping.py b/scrapcore/scraping.py
index bb88625..71b0bc2 100644
--- a/scrapcore/scraping.py
+++ b/scrapcore/scraping.py
@@ -308,5 +308,8 @@ def update_proxy_status(self, status, ipinfo=None, online=True):
             proxy.status = status
             proxy.online = online
 
-            self.session.add(proxy)
-            self.session.commit()
+            try:
+                self.session.merge(proxy, load=True)
+                self.session.commit()
+            except Exception:
+                pass
diff --git a/serpscrap/config.py b/serpscrap/config.py
index ef6f9e4..a66f91d 100644
--- a/serpscrap/config.py
+++ b/serpscrap/config.py
@@ -53,7 +53,7 @@ class Config():
         },
         'proxy_file': '',
         'proxy_check_url': 'http://canihazip.com/s',
-        'proxy_info_url': 'http://ipinfo.io/json',
+        'proxy_info_url': 'https://ipinfo.io/json',
         'stop_on_detection': True,
         'today': datetime.datetime.strftime(
             datetime.datetime.utcnow(),
diff --git a/setup.py b/setup.py
index 4df3ddb..935a60f 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 from setuptools import setup, find_packages
 
-version = '0.9.0'
+version = '0.9.1'
 
 
 setup(
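Illustrative usage sketch (not part of the changeset above): it combines the Chrome settings from the new examples/example_chrome.py with the ``proxy_file`` option from serpscrap/config.py. The chromedriver and proxy file paths are placeholders, and combining both options in a single run is an assumption, not something the diff demonstrates; only the config keys shown above are taken from the source.

.. code-block:: python

    import pprint
    import serpscrap

    keywords = ['berlin']

    config = serpscrap.Config()
    # use Chrome instead of the default PhantomJS (see docs/examples.rst above)
    config.set('sel_browser', 'chrome')
    config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
    # optional: path to a proxy file (placeholder path); as of 0.9.1 the proxy
    # list read from this file is shuffled before workers are assigned
    config.set('proxy_file', '/tmp/proxies.txt')

    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)
    results = scrap.run()

    for result in results:
        pprint.pprint(result)
        print()

With more than one proxy in the file, ``shuffle(proxies)`` in scrapcore/core.py now randomizes the proxy order before the worker queue is filled, so the order of entries in the proxy file no longer determines which proxy scrapes first.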