diff --git a/.gitignore b/.gitignore
index 4b3cb78..1ebb2de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -89,4 +89,5 @@
 ENV/
 .settings
 make.bat
-phantomjs/
\ No newline at end of file
+phantomjs/
+chromedriver/
\ No newline at end of file
diff --git a/.travis.yml b/.travis.yml
index 2f53b66..d8bc4bd 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,8 +8,7 @@
 sudo: required
 
 install:
-  - sh install_chrome.sh
-#  - pip install -r requirements.txt
+#  - sh install_chrome.sh
   - python setup.py -q install
 # command to run tests
 script: pytest
\ No newline at end of file
diff --git a/README.rst b/README.rst
index 04dc1bd..9b81e3b 100644
--- a/README.rst
+++ b/README.rst
@@ -29,7 +29,7 @@ Extract these result types
 * results - standard search result
 * shopping - shopping teaser within regular search results
 
-For each result in a resultspage get
+For each result of a resultspage get
 ====================================
 
 * domain
@@ -44,8 +44,8 @@ For each result in a resultspage get
 
 Also get a screenshot of each result page.
 You can also scrape the text content of each result url.
-It also possible to save the results as CSV for future analytics.
-If required you can use your own proxylist.
+It is also possible to save the results as CSV for future analytics.
+If required you can also use your own proxylist.
 
 
 Ressources
@@ -106,11 +106,26 @@ To avoid encode/decode issues use this command before you start using SerpScrap
 .. image:: https://raw.githubusercontent.com/ecoron/SerpScrap/master/docs/logo.png
    :target: https://github.com/ecoron/SerpScrap
 
+Supported OS
+------------
+
+* SerpScrap should work on Linux, Windows and Mac OS with Python >= 3.4 installed
+* SerpScrap requires lxml
+* Doesn't work on iOS
 
 Changes
 -------
 
 Notes about major changes between releases
 
+0.11.0
+======
+
+* Chrome headless is now the default browser; usage of phantomJS is deprecated
+* chromedriver is installed on the first run (tested on Linux and Windows; Mac OS should also work)
+* behavior of scraping raw text contents from serp urls, and from explicitly given urls, has changed
+* scraping of serp results and url contents can be run at once
+* csv output format changed; it is now tab separated and quoted
+
 0.10.0
 ======
@@ -132,13 +147,17 @@ Notes about major changes between releases
 References
 ----------
 
-SerpScrap is using `PhantomJs`_ a scriptable headless WebKit, which is installed automaticly on the first run (Linux, Windows).
-The scrapcore is based on `GoogleScraper`_ with several improvements.
+SerpScrap uses `Chrome headless`_ and `lxml`_ to scrape serp results. For the raw text contents of fetched URLs it uses `beautifulsoup4`_.
+SerpScrap also supports `PhantomJs`_, a scriptable headless WebKit, which is deprecated; it is installed automatically on the first run (Linux, Windows).
+The scrapcore was based on `GoogleScraper`_, an outdated project, and has many changes and improvements.
 
 
 .. target-notes::
 
 .. _`install`: http://serpscrap.readthedocs.io/en/latest/install.html
 .. _`examples`: http://serpscrap.readthedocs.io/en/latest/examples.html
+.. _`Chrome headless`: http://chromedriver.chromium.org/
+.. _`lxml`: https://lxml.de/
+.. _`beautifulsoup4`: https://www.crummy.com/software/BeautifulSoup/
 .. _`PhantomJs`: https://github.com/ariya/phantomjs
 .. _`GoogleScraper`: https://github.com/NikolaiT/GoogleScraper
diff --git a/docs/conf.py b/docs/conf.py
index b3ae453..e3995cf 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -58,9 +58,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = '0.10'
+version = '0.11'
 # The full version, including alpha/beta/rc tags.
-release = '0.10.4'
+release = '0.11.0'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/configuration.rst b/docs/configuration.rst
index b7dc7a2..7688d6e 100644
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@@ -21,12 +21,13 @@ Default configuration
 * database_name: '/tmp/serpscrap' - path and name sqlite db (stores scrape results)
 * dir_screenshot: '/tmp/screenshots' - basedir for saved screenshots
 * do_caching: True - enable / disable caching
-* executable_path: '/usr/local/bin/chromedriver' - path to chromedriver
+* executable_path: '/usr/local/bin/chromedriver' - path to chromedriver, should be detected automatically
 * google_search_url: 'https://www.google.com/search?' - base search url, modify for other countries
 * headers: - dict to customize request header, see below
 * num_pages_for_keyword: 2 - number of result pages to scrape
 * num_results_per_page: 10 - number results per searchengine page
 * proxy_file: '' - path to proxy file, see below
+* sel_browser: 'chrome' - browser (chrome, phantomjs)
 * scrape_urls: False - scrape urls of search results
 * screenshot: True - enable screenshots for each query
 * search_engines: ['google'] - search engines (google)
@@ -80,7 +81,7 @@ for not provided config keys the deault values still exists.
 Headers
 -------
 
-You can customize your searchengine request headers
+You can customize your searchengine request headers if you are using phantomJS
 by providing a dict in your configuration.
 If you don't customize this setting, the default is used.
 
diff --git a/docs/examples.rst b/docs/examples.rst
index 0a29583..8d9c064 100644
--- a/docs/examples.rst
+++ b/docs/examples.rst
@@ -34,62 +34,60 @@ You can disable url scraping by setting the config value scrape_urls to False.
     for result in results:
         print(result)
 
-Simple Example - custom phantomjs path
---------------------------------------
+Simple example using phantomjs (deprecated)
+-------------------------------------------
 
-If phantomjs could not installed, configure your
-custom path to the binary.
+.. code-block:: bash
 
-.. code-block:: python
+    python examples\example_phantomjs.py
 
+It is possible to use phantomJS, but we recommend Chrome. Depending on your choice the matching driver will be installed automatically.
+For using Chrome you need the latest `chromedriver`_ and have to set the executable_path.
+
+.. code-block:: python
+
+    import pprint
     import serpscrap
 
-    keywords = ['seo trends', 'seo news', 'seo tools']
+    keywords = ['berlin']
     config = serpscrap.Config()
-    # only required if phantomjs binary could not detected
-    config.set('executable_path', '../phantomjs/phantomjs.exe')
-    config.set('num_workers', 1)
-    config.set('scrape_urls', False)
+    config.set('sel_browser', 'phantomjs')
 
     scrap = serpscrap.SerpScrap()
     scrap.init(config=config.get(), keywords=keywords)
     results = scrap.run()
+
     for result in results:
-        if 'serp_title' in result and len(result['serp_title']) > 1:
-            print(result['serp_title'])
-
-Using Chrome
-------------
+        pprint.pprint(result)
+        print()
 
 
-.. code-block:: bash
 
-    python examples\example_chrome.py
+Simple Example - custom phantomjs path (deprecated)
+---------------------------------------------------
 
-It is possible to use Chrome, but we recomment PhantomJs, which is installed by default.
-For using Chrome u need to download the latest `chromedriver`_ and to set the executable_path.
+If phantomjs could not be installed, configure your
+custom path to the binary.
 
-.. code-block:: bash
+.. code-block:: python
 
-    import pprint
     import serpscrap
 
-    keywords = ['berlin']
+    keywords = ['seo trends', 'seo news', 'seo tools']
     config = serpscrap.Config()
-    config.set('sel_browser', 'chrome')
-    config.set('chrome_headless', True)
-    config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
-    # linux
-    # config.set('executable_path', '/usr/local/bin/chromedriver')
+    config.set('sel_browser', 'phantomjs')
+    # only required if phantomjs binary could not be detected
+    config.set('executable_path', '../phantomjs/phantomjs.exe')
+    config.set('num_workers', 1)
+    config.set('scrape_urls', False)
 
     scrap = serpscrap.SerpScrap()
     scrap.init(config=config.get(), keywords=keywords)
     results = scrap.run()
-
     for result in results:
-        pprint.pprint(result)
-        print()
+        if 'serp_title' in result and len(result['serp_title']) > 1:
+            print(result['serp_title'])
 
 Image search
 ------------
@@ -137,11 +135,10 @@ In this example we scrape only an url, without crawling any searchengine.
 
     config = serpscrap.Config()
     urlscrape = serpscrap.UrlScrape(config.get())
-    results = urlscrape.scrap_url(url)
+    result = urlscrape.scrap_url(url)
 
-    for result in results:
-        print(result)
-        print()
+    print(result)
+    print()
 
 
 Command Line
@@ -160,7 +157,7 @@ Example as_csv()
 
 save the results for later seo analytics
 by using the as_csv() method.
 this method needs as argument the path
-to the file.
+to the file. The saved file is tab separated and values are quoted.
 
 .. code-block:: python
@@ -173,7 +170,33 @@ to the file.
 
     scrap = serpscrap.SerpScrap()
     scrap.init(config=config.get(), keywords=keywords)
-    results = scrap.as_csv('/tmp/seo-research')
+    scrap.as_csv('/tmp/seo-research')
+
+
+Example serp results and raw text of result urls
+------------------------------------------------
+
+You can scrape serp results and fetch the raw text contents of result urls at once.
+
+.. code-block:: bash
+
+    python examples\example_serp_urls.py
+
+The resulting data will have additional fields containing data from the scraped urls.
+
+.. code-block:: python
+
+    import serpscrap
+
+    keywords = ['blockchain']
+
+    config = serpscrap.Config()
+    config.set('scrape_urls', True)
+
+    scrap = serpscrap.SerpScrap()
+    scrap.init(config=config.get(), keywords=keywords)
+    scrap.as_csv('/tmp/output')
+
 
 Example related
 ---------------
diff --git a/docs/index.rst b/docs/index.rst
index 90d86d6..efcec9b 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -32,7 +32,7 @@ Extract these result types
 * results - standard search result
 * shopping - shopping teaser within regular search results
 
-For each result in a resultspage get
+For each result of a resultspage get
 ====================================
 
 * domain
@@ -47,8 +47,8 @@ For each result in a resultspage get
 
 Also get a screenshot of each result page.
 You can also scrape the text content of each result url.
-It also possible to save the results as CSV for future analytics.
-If required you can use your own proxylist.
+It is also possible to save the results as CSV for future analytics.
+If required you can also use your own proxylist.
 
 
 Ressources
@@ -88,11 +88,31 @@ SerpScrap in your applications
 More detailes in the `examples`_ section of the documentation.
 
 
+Supported OS
+------------
+
+* SerpScrap should work on Linux, Windows and Mac OS with Python >= 3.4 installed
+* SerpScrap requires lxml
+* Doesn't work on iOS
 Changes
 =======
 
 Notes about major changes between releases
 
+0.11.0
+------
+
+* Chrome headless is now the default browser; usage of phantomJS is deprecated
+* chromedriver is installed on the first run (tested on Linux and Windows; Mac OS should also work)
+* behavior of scraping raw text contents from serp urls, and from explicitly given urls, has changed
+* scraping of serp results and url contents can be run at once
+* csv output format changed; it is now tab separated and quoted
+
+0.10.0
+------
+
+* support for headless chrome, adjusted default time between scrapes
+
 0.9.0
 -----
 
@@ -109,11 +129,17 @@ Notes about major changes between releases
 References
 ==========
 
-SerpScrap is using `PhantomJs`_ a scriptable headless WebKit, which is installed automaticly on the first run (Linux, Windows)
-The scrapcore is based on `GoogleScraper`_ with several improvements.
+SerpScrap uses `Chrome headless`_ and `lxml`_ to scrape serp results. For the raw text contents of fetched URLs it uses `beautifulsoup4`_.
+SerpScrap also supports `PhantomJs`_, a scriptable headless WebKit, which is deprecated; it is installed automatically on the first run (Linux, Windows).
+The scrapcore was based on `GoogleScraper`_, an outdated project, and has many changes and improvements.
 
 .. target-notes::
 
+.. _`install`: http://serpscrap.readthedocs.io/en/latest/install.html
+.. _`examples`: http://serpscrap.readthedocs.io/en/latest/examples.html
+.. _`Chrome headless`: http://chromedriver.chromium.org/
+.. _`lxml`: https://lxml.de/
+.. _`beautifulsoup4`: https://www.crummy.com/software/BeautifulSoup/
 .. _`PhantomJs`: https://github.com/ariya/phantomjs
 .. _`GoogleScraper`: https://github.com/NikolaiT/GoogleScraper
 .. _`examples`: http://serpscrap.readthedocs.io/en/latest/examples.html
diff --git a/docs/install.rst b/docs/install.rst
index 45a242d..e7662ce 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -7,19 +7,34 @@ Install
     pip uninstall SerpScrap -y
     pip install SerpScrap --upgrade
 
-On the first run SerpScrap will try to install the required PhantomJS binary on Windows and Linux instances.
-If self install doesnt work you can configure your custom path to the phantomjs binary.
+On the first run SerpScrap will try to install the required Chromedriver or PhantomJS binary on Windows and Linux instances.
+If the self install doesn't work you can configure your custom path to the chromedriver or phantomjs binary.
+For Linux SerpScrap provides https://github.com/ecoron/SerpScrap/blob/master/install_chrome.sh, which should be executed automatically on the first run.
 
-Requirements Windows
---------------------
+Chrome headless is recommended
+------------------------------
 
-for windows some dependecies are provided as binaries for python extension packages.
-you can find them under: http://www.lfd.uci.edu/~gohlke/pythonlibs/
-For your convenience here are the direct links:
+By default SerpScrap uses headless Chrome.
+You can also use phantomJS, but it is deprecated and gets blocked by the searchengine very fast.
+We recommend using headless Chrome.
+
+lxml
+----
+lxml is required.
+
+Windows
+=======
+For windows you may need the lxml binary from here: http://www.lfd.uci.edu/~gohlke/pythonlibs/
+For your convenience here are the direct links:
 
 * `lxml`_
 
-maybe you need also `Microsoft Visual C++ Build Tools`_ installed.
+In some cases you may also need `Microsoft Visual C++ Build Tools`_ installed.
+
+iOS
+===
+is not supported yet
+
 
 cli encoding issues
 -------------------
@@ -33,8 +48,9 @@ To avoid encode/decode issues use this command before you start using SerpScrap
 
 
 References
+==========
 
 .. target-notes::
 
+.. _`lxml`: http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml
 .. _`Microsoft Visual C++ Build Tools`: http://landinghub.visualstudio.com/visual-cpp-build-tools
-.. _`lxml`: http://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml
\ No newline at end of file
diff --git a/docs/results.rst b/docs/results.rst
index 3722567..c32da19 100644
--- a/docs/results.rst
+++ b/docs/results.rst
@@ -28,6 +28,21 @@ If you prefer to save the results use the as_csv() method.
     }
 
 
+If scrape_urls is True, additional fields are appended to the resultset
+
+.. code-block:: python
+
+    {
+        'meta_robots': 'index, follow', # value of meta tag robots
+        'meta_title': 'Title of the page', # title of the url
+        'status': '200', # response code
+        'url': 'https://de.wikipedia.org', # scraped url
+        'encoding': 'utf-8', # encoding of the url
+        'last_modified': '26.08.2018 11:35:40', # datetime the url was last modified
+        'text_raw': 'The raw text content scraped from url'
+    }
+
+
 serp_type
 ---------
diff --git a/examples/example_csv.py b/examples/example_csv.py
index f028f6b..c141661 100644
--- a/examples/example_csv.py
+++ b/examples/example_csv.py
@@ -5,11 +5,6 @@
 keywords = ['stellar']
 
 config = serpscrap.Config()
-config.set('sel_browser', 'chrome')
-config.set('chrome_headless', True)
-config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
-# for linux
-# config.set('executable_path', '/usr/local/bin/chromedriver')
 config.set('scrape_urls', False)
 
 scrap = serpscrap.SerpScrap()
diff --git a/examples/example_chrome.py b/examples/example_phantomjs.py
similarity index 52%
rename from examples/example_chrome.py
rename to examples/example_phantomjs.py
index e43aa0f..d452dbd 100644
--- a/examples/example_chrome.py
+++ b/examples/example_phantomjs.py
@@ -3,14 +3,10 @@
 import pprint
 import serpscrap
 
-keywords = ['sommer']
+keywords = ['herbst']
 
 config = serpscrap.Config()
-config.set('sel_browser', 'chrome')
-config.set('chrome_headless', True)
-config.set('executable_path', '/tmp/chromedriver_win32/chromedriver.exe')
-# for linux
-# config.set('executable_path', '/usr/local/bin/chromedriver')
+config.set('sel_browser', 'phantomjs')
 
 scrap = serpscrap.SerpScrap()
 scrap.init(config=config.get(), keywords=keywords)
diff --git a/examples/example_related.py b/examples/example_related.py
index 1b971e2..0d7d2d7 100644
--- a/examples/example_related.py
+++ b/examples/example_related.py
@@ -23,7 +23,6 @@ def get_related(config, keywords, related):
 
 config = serpscrap.Config()
 config.set('scrape_urls', False)
-config.set('num_workers', 1)
 
 keywords = ['cryptocurrency']
 
diff --git a/examples/example_serp_urls.py b/examples/example_serp_urls.py
new file mode 100644
index 0000000..8b135d9
--- /dev/null
+++ b/examples/example_serp_urls.py
@@ -0,0 +1,12 @@
+#!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import serpscrap
+
+keywords = ['blockchain']
+
+config = serpscrap.Config()
+config.set('scrape_urls', True)
+
+scrap = serpscrap.SerpScrap()
+scrap.init(config=config.get(), keywords=keywords)
+scrap.as_csv('/tmp/output')
\ No newline at end of file
diff --git a/examples/example_simple.py b/examples/example_simple.py
index 72acaeb..5ea8e7a 100644
--- a/examples/example_simple.py
+++ b/examples/example_simple.py
@@ -3,7 +3,7 @@
 import pprint
 import serpscrap
 
-keywords = ['berlin']
+keywords = ['bienen']
 
 config = serpscrap.Config()
 
diff --git a/examples/example_url.py b/examples/example_url.py
index b842939..d22410a 100644
--- a/examples/example_url.py
+++ b/examples/example_url.py
@@ -7,8 +7,7 @@
 
 config = serpscrap.Config()
 urlscrape = serpscrap.UrlScrape(config.get())
-results = urlscrape.scrap_url(url)
+result = urlscrape.scrap_url(url)
 
-for result in results:
-    print(result)
-    print()
+print(result)
+print()
diff --git a/install_chrome.sh b/install_chrome.sh
index 0152e28..d7990c5 100644
--- a/install_chrome.sh
+++ b/install_chrome.sh
@@ -12,7 +12,7 @@ SELENIUM_SUBDIR=$(echo "$SELENIUM_STANDALONE_VERSION" | cut -d"." -f-2)
 
 # Remove existing downloads and binaries so we can start from scratch.
 sudo apt-get remove google-chrome-stable
 rm ~/chromedriver_linux64.zip
-sudo rm /usr/local/bin/chromedriver
+# sudo rm /usr/local/bin/chromedriver
 
 # Install dependencies.
 sudo apt-get install -y unzip openjdk-8-jre-headless xvfb libxi6 libgconf-2-4
@@ -24,9 +24,9 @@ sudo apt-get -y update
 sudo apt-get -y install google-chrome-stable
 
 # Install ChromeDriver.
-wget -N http://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P ~/
-unzip ~/chromedriver_linux64.zip -d ~/
-rm ~/chromedriver_linux64.zip
-sudo mv -f ~/chromedriver /usr/local/bin/chromedriver
-sudo chown root:root /usr/local/bin/chromedriver
-sudo chmod 0755 /usr/local/bin/chromedriver
+# wget -N http://chromedriver.storage.googleapis.com/$CHROME_DRIVER_VERSION/chromedriver_linux64.zip -P ~/
+# unzip ~/chromedriver_linux64.zip -d ~/
+# rm ~/chromedriver_linux64.zip
+# sudo mv -f ~/chromedriver /usr/local/bin/chromedriver
+# sudo chown root:root /usr/local/bin/chromedriver
+# sudo chmod 0755 /usr/local/bin/chromedriver
diff --git a/serpscrap/chrome_install.py b/serpscrap/chrome_install.py
new file mode 100644
index 0000000..cdc970c
--- /dev/null
+++ b/serpscrap/chrome_install.py
@@ -0,0 +1,81 @@
+from scrapcore.logger import Logger
+import os
+import platform
+import stat
+import subprocess
+import tempfile
+import urllib.request
+import zipfile
+
+logger = Logger()
+logger.setup_logger()
+logger = logger.get_logger()
+
+
+class ChromeInstall():
+
+    home_dir = os.path.expanduser('chromedriver/')
+    binary_win = 'chromedriver.exe'
+    binary_linux = 'chromedriver'
+    binary_mac64 = 'chromedriver'
+
+    def get_os(self):
+        return platform.system()
+
+    def detect_chromedriver(self):
+        logger.info('detecting chromedriver')
+        this_os = self.get_os().lower()
+        if 'windows' in this_os:
+            if os.path.isfile(self.home_dir + self.binary_win):
+                os.chmod(self.home_dir + self.binary_win, 0o755)
+                return self.home_dir + self.binary_win
+        elif 'linux' in this_os:
+            if os.path.isfile(self.home_dir + self.binary_linux):
+                os.chmod(self.home_dir + self.binary_linux, 0o755)
+                return self.home_dir + self.binary_linux
+        elif 'darwin' in this_os:
+            if os.path.isfile(self.home_dir + self.binary_mac64):
+                os.chmod(self.home_dir + self.binary_mac64, 0o755)
+                return self.home_dir + self.binary_mac64
+        else:
+            raise Exception('''
+                Platform not supported.
+                install chromedriver yourself and update the path in your config
+                ''')
+
+    def download(self):
+        logger.info('downloading chromedriver')
+        this_os = self.get_os().lower()
+        base_url = 'http://chromedriver.storage.googleapis.com/2.41/'
+
+        if 'windows' in this_os:
+            file_name = 'chromedriver_win32.zip'
+            archive = 'zip'
+        elif 'linux' in this_os:
+            os.chmod('install_chrome.sh', 0o755 | stat.S_IEXEC)
+            subprocess.call('install_chrome.sh')
+            archive = 'zip'
+            file_name = 'chromedriver_linux64.zip'
+        elif 'darwin' in this_os:
+            file_name = 'chromedriver_mac64.zip'
+            archive = 'zip'
+        else:
+            raise Exception('''
+                Platform not supported.
+                install chromedriver yourself and update the path in your config
+                ''')
+        # Download the file from `url` and save it under `file_name`:
+        tmp_dir = tempfile.gettempdir() + '/'
+        try:
+            urllib.request.urlretrieve(base_url + file_name, tmp_dir + file_name)
+            self.unpack(tmp_dir + file_name, archive)
+        except:
+            raise Exception('Download and unpack of chromedriver failed. Check if %(tmp_dir)s exists and has write permissions' % {'tmp_dir': tmp_dir})
+
+    def unpack(self, file_path, archive):
+        logger.info('unpacking chromedriver')
+        if os.path.isdir(self.home_dir) is False:
+            os.mkdir(self.home_dir)
+        if 'zip' in archive:
+            with zipfile.ZipFile(file_path, 'r') as zip_ref:
+                zip_ref.extractall(self.home_dir)
diff --git a/serpscrap/config.py b/serpscrap/config.py
index acefce5..e96daa2 100644
--- a/serpscrap/config.py
+++ b/serpscrap/config.py
@@ -23,7 +23,7 @@ class Config():
         'search_engines': ['google'],
         'num_pages_for_keyword': 2,
         'scrape_method': 'selenium',
-        'sel_browser': 'phantomjs',
+        'sel_browser': 'chrome',
         'chrome_headless': True,
         'executable_path': '',
         'do_caching': True,
diff --git a/serpscrap/csv_writer.py b/serpscrap/csv_writer.py
index 382cf2f..dee98cc 100644
--- a/serpscrap/csv_writer.py
+++ b/serpscrap/csv_writer.py
@@ -8,7 +8,7 @@ class CsvWriter():
     def write(self, file_name, my_dict):
         try:
             with open(file_name, 'w', encoding='utf-8', newline='') as f:
-                w = csv.DictWriter(f, my_dict[0].keys(), dialect='excel', delimiter='\t')
+                w = csv.DictWriter(f, my_dict[0].keys(), dialect='excel', delimiter='\t', quotechar='"')
                 w.writeheader()
                 for row in my_dict[0:]:
                     w.writerow(row)
diff --git a/serpscrap/serpscrap.py b/serpscrap/serpscrap.py
index ef2d57b..6cf533e 100644
--- a/serpscrap/serpscrap.py
+++ b/serpscrap/serpscrap.py
@@ -12,6 +12,7 @@
 from scrapcore.logger import Logger
 from serpscrap.config import Config
 from serpscrap.csv_writer import CsvWriter
+from serpscrap.chrome_install import ChromeInstall
 from serpscrap.phantom_install import PhantomInstall
 from serpscrap.urlscrape import UrlScrape
 
@@ -76,7 +77,7 @@ def init(self, config=None, keywords=None):
         else:
             self.config = Config().get()
 
-        if self.config['executable_path'] == '':
+        if self.config['executable_path'] == '' and self.config['sel_browser'] == 'phantomjs':
             logger.info('preparing phantomjs')
             firstrun = PhantomInstall()
             phantomjs = firstrun.detect_phantomjs()
@@ -89,6 +90,19 @@ def init(self, config=None, keywords=None):
                     provide custom path in config''')
             self.config.__setitem__('executable_path', phantomjs)
             logger.info('using ' + str(phantomjs))
+        elif self.config['executable_path'] == '' and self.config['sel_browser'] == 'chrome':
+            logger.info('preparing chromedriver')
+            firstrun = ChromeInstall()
+            chromedriver = firstrun.detect_chromedriver()
+            if chromedriver is None:
+                firstrun.download()
+                chromedriver = firstrun.detect_chromedriver()
+                if chromedriver is None:
+                    raise Exception('''
+                        chromedriver binary not found,
+                        provide custom path in config''')
+            self.config.__setitem__('executable_path', chromedriver)
+            logger.info('using ' + str(chromedriver))
 
         # cleanup screenshot dir on init
         if os.path.exists(self.config['dir_screenshot']):
@@ -121,9 +135,9 @@ def run(self):
         if self.config['scrape_urls']:
             for index, result in enumerate(self.results):
                 if 'serp_type' in result and \
-                   'ads_main' not in result['serp_type'] and \
                    'serp_url' in result:
-                    result_url = self.scrap_url(result['serp_url'])[0]
+                    logger.info('Scraping URL: ' + result['serp_url'])
+                    result_url = self.scrap_url(result['serp_url'])
                     if 'status' in result_url:
                         self.results[index].update(result_url)
         return self.results if isinstance(self.results, list) else [self.results]
diff --git a/serpscrap/urlscrape.py b/serpscrap/urlscrape.py
index 73ab184..1c878ad 100644
--- a/serpscrap/urlscrape.py
+++ b/serpscrap/urlscrape.py
@@ -2,7 +2,6 @@
 # -*- coding: utf-8 -*-
 """SerpScrap.UrlScrape"""
 import chardet
-import concurrent.futures
 import hashlib
 import html2text
 import json
@@ -60,11 +59,10 @@ def scrap_url(self, url):
                 UrlScrape.results.append(result)
         except:
             try:
-                with concurrent.futures.ThreadPoolExecutor(max_workers=self.url_threads) as executor:
-                    executor.submit(UrlScrape.fetch_url, url, cache_file)
+                result = UrlScrape.fetch_url(url, cache_file)
             except:
                 pass
-        return UrlScrape.results
+        return result
 
     @staticmethod
     def fetch_url(url, cache_file):
@@ -130,7 +128,7 @@ def fetch_url(url, cache_file):
                 json.dump(result, fp)
         except:
             pass
-        UrlScrape.results.append(result)
+        return result
 
     ascii_lowercase = "abcdefghijklmnopqrstuvwxyz"
     ascii_uppercase = ascii_lowercase.upper()
diff --git a/setup.py b/setup.py
index 75f7ba5..5e6d1eb 100644
--- a/setup.py
+++ b/setup.py
@@ -2,7 +2,7 @@
 # -*- coding: utf-8 -*-
 from setuptools import setup, find_packages
 
-version = '0.10.4'
+version = '0.11.0'
 
 
 setup(
@@ -24,13 +24,14 @@
     install_requires=[
        'PySocks==1.6.8',
        'chardet==3.0.4',
-       'beautifulsoup4==4.6.0',
+       'beautifulsoup4==4.6.3',
        'html2text==2018.1.9',
        'lxml==4.2.3',
        'sqlalchemy==1.2.10',
        'selenium==3.13.0',
        'cssselect==1.0.3',
    ],
+    scripts=['install_chrome.sh'],
     classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
diff --git a/tests/test_basic.py b/tests/test_basic.py
index cc4a2f8..14ccfb8 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -37,9 +37,9 @@ def test_simple(self):
 
         keywords = random.choice(self.keyword_list)
         config = Config()
-        config.set('sel_browser', 'chrome')
-        config.set('chrome_headless', True)
-        config.set('executable_path', '/usr/local/bin/chromedriver')
+#        config.set('sel_browser', 'chrome')
+#        config.set('chrome_headless', True)
+#        config.set('executable_path', '/usr/local/bin/chromedriver')
         scrap = SerpScrap()
         scrap.init(config=config.get(), keywords=keywords)
         results = scrap.run()
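With this change Chrome headless becomes the default browser and the matching chromedriver is fetched on the first run, so the explicit chromedriver setup shown in the old examples is no longer required. Below is a minimal usage sketch under that assumption; the keyword is a placeholder, and `sel_browser` / `chrome_headless` are set explicitly only for clarity, since `'chrome'` and `True` are the new defaults from serpscrap/config.py.

.. code-block:: python

    import serpscrap

    keywords = ['example keyword']

    config = serpscrap.Config()
    # 'chrome' and headless mode are the new defaults set in serpscrap/config.py;
    # they are repeated here only to make the behaviour explicit
    config.set('sel_browser', 'chrome')
    config.set('chrome_headless', True)
    # only needed if the automatic chromedriver install fails
    # config.set('executable_path', '/usr/local/bin/chromedriver')

    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=keywords)

    for result in scrap.run():
        print(result)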