diff --git a/docs/conf.py b/docs/conf.py index 494e8ad..62f5711 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -58,9 +58,9 @@ # built documents. # # The short X.Y version. -version = '0.5' +version = '0.6' # The full version, including alpha/beta/rc tags. -release = '0.5.1' +release = '0.6.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/configuration.rst b/docs/configuration.rst index 43d9092..a2fb6b0 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -20,7 +20,8 @@ Default configuration * search_engines: ['google'] - search engines (google) * url_threads: 3 - number of threads if scrape_urls is true * use_own_ip: True - if using proxies set to False - +* sleeping_min: 5 - min seconds to sleep between scrapes +* sleeping_max: 15 - max seconds to sleep between scrapes Custom configuration -------------------- diff --git a/examples/example_advanced.py b/examples/example_advanced.py index 14d0382..fe36f0a 100644 --- a/examples/example_advanced.py +++ b/examples/example_advanced.py @@ -6,8 +6,8 @@ config = serpscrap.Config() -config.set('scrape_urls', True) -config.set('num_pages_for_keyword', 2) +config.set('scrape_urls', False) +config.set('num_pages_for_keyword', 5) config.set('url_threads', 5) scrap = serpscrap.SerpScrap() @@ -18,11 +18,17 @@ models = [] +print('--- origin titles ---') for result in results: if 'serp_title' in result and len(result['serp_title']) > 1: - model = markovi.get_model(result['serp_title'], 1) - if model.state_size > 0: - models.append(model) + print(result['serp_title']) + try: + model = markovi.get_model(result['serp_title'], 1) + if model.state_size > 0: + models.append(model) + except Exception: + pass +print('--- --- ---') model = markovi.get_combined_model(models) @@ -32,21 +38,26 @@ char_limit=150, tries=10, max_overlap_ratio=0.7, - max_overlap_total=25 + max_overlap_total=20 ) if isinstance(text, str): texts.append(text) +print('--- Generated Titles 1. iteration ---') for text in texts: - print(text+'\n') + print(text) +print('--- --- ---') tf = serpscrap.TfIdf().get_tfidf(texts) -print(tf[0:10]) +print('--- TfIdf Titles ---') +print(tf) +print('--- --- ---') model = markovi.get_model("\n".join(texts), 1) +print('--- Generated Titles 2. 
iteration ---') for _ in range(10): text = model.make_short_sentence( - char_limit=80, + max_chars=80, tries=10, max_overlap_ratio=0.7, max_overlap_total=20 diff --git a/examples/example_markovi.py b/examples/example_markovi.py index 4f1b930..d16871e 100644 --- a/examples/example_markovi.py +++ b/examples/example_markovi.py @@ -1,9 +1,10 @@ #!/usr/bin/python3 # -*- coding: utf-8 -*- -from serpscrap.markovi import Markovi +import pprint + from serpscrap.config import Config +from serpscrap.markovi import Markovi from serpscrap.urlscrape import UrlScrape -import pprint url = 'http://gutenberg.spiegel.de/buch/johann-wolfgang-goethe-gedichte-3670/231' @@ -18,4 +19,5 @@ for _ in range(5): texts.append(markovi.generate(content.__getitem__('text_raw'), 1)) -pprint.pprint(texts, width=120) +for text in texts: + pprint.pprint(text, width=120) diff --git a/requirements.txt b/requirements.txt index 125298b..a571b0a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,13 +2,13 @@ numpy==1.12.1 scipy==0.19.0 scikit-learn==0.18.1 lxml -chardet==2.3.0 -beautifulsoup4==4.5.3 +chardet==3.0.2 +beautifulsoup4==4.6.0 html2text==2016.9.19 -markovify==0.5.4 +markovify==0.6.0 PySocks==1.6.7 sqlalchemy==1.0.12 -selenium==3.3.3 -cssselect==0.9.1 +selenium==3.4.1 +cssselect==1.0.1 requests==2.13.0 aiohttp==0.21.5 \ No newline at end of file diff --git a/scrapcore/core.py b/scrapcore/core.py index 2b1991a..e7b56b3 100644 --- a/scrapcore/core.py +++ b/scrapcore/core.py @@ -55,7 +55,7 @@ def main(self, return_results=False, config=None): num_workers = int(config.get('num_workers')) scrape_method = config.get('scrape_method') pages = int(config.get('num_pages_for_keyword', 1)) - method = config.get('scrape_method', 'http') + method = config.get('scrape_method', 'selenium') result_writer = ResultWriter() result_writer.init_outfile(config, force_reload=True) diff --git a/scrapcore/database.py b/scrapcore/database.py index c8bb5f2..95d3cb9 100644 --- a/scrapcore/database.py +++ b/scrapcore/database.py @@ -149,11 +149,7 @@ def set_values_from_scraper(self, scraper): """Populate itself from a scraper object. A scraper may be any object of type: - - SelScrape - - HttpScrape - - AsyncHttpScrape - Args: A scraper object. 
""" diff --git a/scrapcore/parser/google_parser.py b/scrapcore/parser/google_parser.py index 52a8c07..b27c482 100644 --- a/scrapcore/parser/google_parser.py +++ b/scrapcore/parser/google_parser.py @@ -67,12 +67,12 @@ class GoogleParser(Parser): 'de_ip': { 'container': '#center_col', 'result_container': '.ads-ad', - 'link': 'h3 > a:first-child::attr(href)', + 'link': 'h3 > a:nth-child(2)::attr(href)', 'snippet': '.ads-creative::text', - 'title': 'h3 > a:first-child::text', + 'title': 'h3 > a:nth-child(2)::text', 'visible_link': '.ads-visurl cite::text', 'rating': 'div._Ond _Bu span::text', - 'sitelinks': 'div.osl::text' + 'sitelinks': 'ul._wEo::text' } }, 'ads_aside': { diff --git a/scrapcore/scraper/http.py b/scrapcore/scraper/http.py deleted file mode 100644 index bc7f49f..0000000 --- a/scrapcore/scraper/http.py +++ /dev/null @@ -1,297 +0,0 @@ -# -*- coding: utf-8 -*- -import datetime -import json -import logging -import socket -import threading -from urllib.parse import urlencode - -from scrapcore.parsing import Parsing -from scrapcore.scraping import SearchEngineScrape -from scrapcore.scraping import get_base_search_url_by_search_engine -from scrapcore.user_agent import random_user_agent -from scrapcore.tools import BlockedSearchException -import socks - -logger = logging.getLogger(__name__) - - -def get_GET_params_for_search_engine(query, - search_engine, - page_number=1, - num_results_per_page=10, - search_type='normal'): - """Returns the params of the url for the search engine and the search mode. - Args: - search_engine: The search engine. Example: 'google' - search_mode: The search mode. Example: 'image' or 'normal' - query: The search query - page_number: Which SERP page. - num_results_per_page: How many entries per page. - Returns: - The params for the GET url. - """ - - search_params = {} - - if search_engine == 'google': - # always use the english interface, such that we can detect - # state by some hard coded needles. - search_params['hl'] = 'en' - search_params['q'] = query - # only set when other num results than 10. - if num_results_per_page != 10: - search_params['num'] = str(num_results_per_page) - - if page_number > 1: - search_params['start'] = str((page_number - 1) * int(num_results_per_page)) - - if search_type == 'image': - search_params.update({ - 'oq': query, - 'site': 'imghp', - 'tbm': 'isch', - 'source': 'hp', - # 'sa': 'X', - 'biw': 1920, - 'bih': 881 - }) - elif search_type == 'video': - search_params.update({ - 'tbm': 'vid', - 'source': 'lnms', - 'sa': 'X', - 'biw': 1920, - 'bih': 881 - }) - elif search_type == 'news': - search_params.update({ - 'tbm': 'nws', - 'source': 'lnms', - 'sa': 'X' - }) - - elif search_engine == 'yandex': - search_params['text'] = query - if page_number > 1: - search_params['p'] = str(page_number - 1) - - elif search_engine == 'bing': - search_params['q'] = query - # bing doesn't support variable number of results (As far as I know). 
- if page_number > 1: - search_params['first'] = str(1 + ((page_number - 1) * 10)) - - elif search_engine == 'yahoo': - search_params['p'] = query - if page_number > 1: - search_params['b'] = str(1 + ((page_number - 1) * 10)) - search_params['ei'] = 'UTF-8' - - elif search_engine == 'baidu': - search_params['wd'] = query - if page_number > 1: - search_params['pn'] = str((page_number - 1) * 10) - search_params['ie'] = 'utf-8' - elif search_engine == 'duckduckgo': - search_params['q'] = query - elif search_engine == 'ask': - search_params['q'] = query - search_params['qsrc'] = '0' - search_params['l'] = 'dir' - search_params['qo'] = 'homepageSearchBox' - if page_number > 1: - search_params['page'] = str(page_number) - elif search_engine == 'blekko': - search_params['q'] = query - - return search_params - - -class HttpScrape(SearchEngineScrape, threading.Timer): - """Offers a fast way to query any search engine using raw HTTP requests. - Overrides the run() method of the superclass threading.Timer. - Each thread represents a crawl for one Search Engine SERP page. Inheriting - from threading.Timer allows the deriving class to delay execution - of the run() method. - Attributes: - results: Returns the found results. - """ - - def __init__(self, config, *args, time_offset=0.0, **kwargs): - """Initialize an HttScrape object to scrape over blocking http. - - HttpScrape inherits from SearchEngineScrape - and from threading.Timer. - """ - threading.Timer.__init__(self, time_offset, self.search) - SearchEngineScrape.__init__(self, config, *args, **kwargs) - - # Bind the requests module to this instance such that each - # instance may have an own proxy - self.requests = __import__('requests') - - # initialize the GET parameters for the search request - self.search_params = {} - - # Host and User-Agent field need to be set additionally. - self.headers = config.get('headers') - - self.scrape_method = 'http' - - self.base_search_url = get_base_search_url_by_search_engine( - self.config, - self.search_engine_name, - self.scrape_method - ) - - super().instance_creation_info(self.__class__.__name__) - - if self.search_engine_name == 'blekko': - logger.critical('blekko does not support http mode.') - self.startable = False - - def set_proxy(self): - """Setup a socks connection for the socks module bound to this instance. - - Args: - proxy: Namedtuple, Proxy to use for this thread. - """ - - def create_connection(address, timeout=None, source_address=None): - sock = socks.socksocket() - sock.connect(address) - return sock - - pmapping = { - 'socks4': 1, - 'socks5': 2, - 'http': 3 - } - # Patch the socket module - # rdns is by default on true. Never use rnds=False with TOR, otherwise you are screwed! - socks.setdefaultproxy(pmapping.get(self.proxy.proto), self.proxy.host, int(self.proxy.port), rdns=True) - socks.wrap_module(socket) - socket.create_connection = create_connection - - def switch_proxy(self, proxy): - super().switch_proxy() - - def proxy_check(self, proxy): - assert self.proxy and self.requests, 'Worker needs valid proxy instance and requests library to make ' \ - 'the proxy check.' 
- - online = False - status = 'Proxy check failed: {host}:{port} is not used while requesting'.format(host=self.proxy.host, port=self.proxy.port) - ipinfo = {} - - try: - text = self.requests.get(self.config.get('proxy_info_url')).text - try: - ipinfo = json.loads(text) - except ValueError: - pass - except self.requests.ConnectionError as e: - status = 'No connection to proxy server possible, aborting: {}'.format(e) - except self.requests.Timeout as e: - status = 'Timeout while connecting to proxy server: {}'.format(e) - except self.requests.exceptions.RequestException as e: - status = 'Unknown exception: {}'.format(e) - - if 'ip' in ipinfo and ipinfo['ip']: - online = True - status = 'Proxy is working.' - else: - logger.warning(status) - - super().update_proxy_status(status, ipinfo, online) - - return online - - def handle_request_denied(self, status_code=''): - """Handle request denied by the search engine. - - This is the perfect place to distinguish the different responses - if search engine detect exhaustive searching. - - Args: - status_code: The status code of the HTTP response. - - Returns: - """ - super().handle_request_denied(status_code) - - def build_search(self): - """Build the headers and params for the search request for the search engine.""" - - self.search_params = get_GET_params_for_search_engine(self.query, self.search_engine_name, - self.page_number, self.num_results_per_page, - self.search_type) - - self.parser = Parsing().get_parser_by_search_engine(self.search_engine_name) - self.parser = self.parser(config=self.config) - - def search(self, rand=True, timeout=15): - """The actual search for the search engine. - - When raising StopScrapingException, the scraper will stop. - - When return False, the scraper tries to continue with next keyword. - """ - - success = True - - self.build_search() - - if rand: - self.headers['User-Agent'] = random_user_agent() - - try: - super().detection_prevention_sleep() - super().keyword_info() - - request = self.requests.get(self.base_search_url + urlencode(self.search_params), - headers=self.headers, timeout=timeout) - - self.requested_at = datetime.datetime.utcnow() - self.html = request.text - - logger.debug('[HTTP - {url}, headers={headers}, params={params}'.format( - url=request.url, - headers=self.headers, - params=self.search_params)) - - needles = self.malicious_request_needles[self.search_engine_name] - if needles and needles['inhtml'] in self.html: - success = False - raise BlockedSearchException('Search temporary is blocked, slow down and try again later') - - except self.requests.ConnectionError as ce: - self.status = 'Network problem occurred {}'.format(ce) - success = False - except self.requests.Timeout as te: - self.status = 'Connection timeout {}'.format(te) - success = False - except self.requests.exceptions.RequestException as e: - # In case of any http networking exception that wasn't caught - # in the actual request, just end the worker. 
- self.status = 'Stopping scraping because {}'.format(e) - else: - if not request.ok: - self.handle_request_denied(request.status_code) - success = False - - super().after_search() - - return success - - def run(self): - super().before_search() - - if self.startable: - for self.query, self.pages_per_keyword in self.jobs.items(): - - for self.page_number in self.pages_per_keyword: - - if not self.search(rand=True): - self.missed_keywords.add(self.query) diff --git a/scrapcore/scraper/scrape_worker_factory.py b/scrapcore/scraper/scrape_worker_factory.py index 92e3167..946b7ab 100644 --- a/scrapcore/scraper/scrape_worker_factory.py +++ b/scrapcore/scraper/scrape_worker_factory.py @@ -1,8 +1,11 @@ # -*- coding: utf-8 -*- + class ScrapeWorkerFactory(): - def __init__(self, config, cache_manager=None, mode=None, proxy=None, search_engine=None, session=None, db_lock=None, - cache_lock=None, scraper_search=None, captcha_lock=None, progress_queue=None, browser_num=1): + def __init__(self, config, cache_manager=None, mode=None, proxy=None, + search_engine=None, session=None, db_lock=None, + cache_lock=None, scraper_search=None, captcha_lock=None, + progress_queue=None, browser_num=1): self.config = config self.cache_manager = cache_manager @@ -55,19 +58,4 @@ def get_worker(self): browser_num=self.browser_num, ) - elif self.mode == 'http': - from scrapcore.scraper.http import HttpScrape - return HttpScrape( - self.config, - cache_manager=self.cache_manager, - search_engine=self.search_engine, - jobs=self.jobs, - session=self.session, - scraper_search=self.scraper_search, - cache_lock=self.cache_lock, - db_lock=self.db_lock, - proxy=self.proxy, - progress_queue=self.progress_queue, - ) - return None diff --git a/scrapcore/scraper/selenium.py b/scrapcore/scraper/selenium.py index 17851e7..115adf0 100644 --- a/scrapcore/scraper/selenium.py +++ b/scrapcore/scraper/selenium.py @@ -11,10 +11,6 @@ import time from urllib.parse import quote -from scrapcore.scraping import MaliciousRequestDetected -from scrapcore.scraping import SearchEngineScrape, SeleniumSearchError -from scrapcore.scraping import get_base_search_url_by_search_engine -from scrapcore.user_agent import random_user_agent from selenium import webdriver from selenium.common.exceptions import ElementNotVisibleException from selenium.common.exceptions import NoSuchElementException @@ -25,6 +21,11 @@ from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait +from scrapcore.scraping import MaliciousRequestDetected +from scrapcore.scraping import SearchEngineScrape, SeleniumSearchError +from scrapcore.scraping import get_base_search_url_by_search_engine +from scrapcore.user_agent import random_user_agent + logger = logging.getLogger(__name__) @@ -199,10 +200,9 @@ def _save_debug_screenshot(self): """ tempdir = tempfile.gettempdir() location = os.path.join( - tempdir, 'serpscrap_{}_{}_{}_debug_screenshot.png'.format( + tempdir, 'serpscrap_{}_{}debug_screenshot.png'.format( self.search_engine_name, - self.browser_type, - str(time.time()) + self.query ) ) self.webdriver.get_screenshot_as_file(location) @@ -546,19 +546,23 @@ def wait_until_serp_loaded(self): content = None try: - time.sleep(0.5) - WebDriverWait(self.webdriver, 5).until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, selector), str(self.page_number))) + time.sleep(1) + WebDriverWait(self.webdriver, 5).until( + EC.text_to_be_present_in_element( + (By.CSS_SELECTOR, selector), + str(self.page_number) + ) + ) except 
TimeoutException: self._save_debug_screenshot() try: content = self.webdriver.find_element_by_css_selector(selector).text except NoSuchElementException: - # logger.error('SLEEPING FOR {} sec'.format(str(60 * 5))) - # time.sleep(60 * 5) logger.error('Skipp it, no such element - SeleniumSearchError') raise SeleniumSearchError('Stop Scraping, seems we are blocked') except Exception: - logger.error('Pagenumber={} did not appear in navigation. Got "{}" instead'.format(self.page_number, content)) + logger.error('Scrape Exception pass. Selector: ' + str(selector)) + self._save_debug_screenshot() pass elif self.search_type == 'image': @@ -576,91 +580,86 @@ def wait_until_title_contains_keyword(self): def search(self): """Search with webdriver. - - Fills out the search form of the search engine for each keyword. + Fills out the search form of the search engine for the keyword. Clicks the next link while pages_per_keyword is not reached. """ - for self.query, self.pages_per_keyword in self.jobs.items(): + self.search_input = self._wait_until_search_input_field_appears() + time.sleep(.25) - self.search_input = self._wait_until_search_input_field_appears() - time.sleep(.25) + if self.search_input is False and self.config.get('stop_on_detection'): + self.status = 'Malicious request detected' + return - if self.search_input is False and self.config.get('stop_on_detection'): - self.status = 'Malicious request detected' - return + if self.search_input is False: + # @todo: pass status_code + self.search_input = self.handle_request_denied() - if self.search_input is False: - # @todo: pass status_code - self.search_input = self.handle_request_denied() + if self.search_input: + try: + self.search_input.clear() + except Exception as e: + logger.error('Possible blocked search, sleep 30 sec, Scrape Exception: ' + str(e)) + self._save_debug_screenshot() + time.sleep(30) + time.sleep(.25) - if self.search_input: - try: - self.search_input.clear() - except Exception: - logger.error('Possible blocked search, sleep 30 sec') - time.sleep(30) - # return - time.sleep(.25) - - self.search_param_fields = self._get_search_param_fields() - - if self.search_param_fields: - wait_res = self._wait_until_search_param_fields_appears() - if wait_res is False: - raise Exception('Waiting search param input fields time exceeds') - for param, field in self.search_param_fields.items(): - if field[0] == By.ID: - js_tpl = ''' - var field = document.getElementById("%s"); - field.setAttribute("value", "%s"); - ''' - elif field[0] == By.NAME: - js_tpl = ''' - var fields = document.getElementsByName("%s"); - for (var f in fields) { - f.setAttribute("value", "%s"); - } - ''' - js_str = js_tpl % (field[1], self.search_param_values[param]) - self.webdriver.execute_script(js_str) + self.search_param_fields = self._get_search_param_fields() + + if self.search_param_fields: + wait_res = self._wait_until_search_param_fields_appears() + if wait_res is False: + raise Exception('Waiting search param input fields time exceeds') + for param, field in self.search_param_fields.items(): + if field[0] == By.ID: + js_tpl = ''' + var field = document.getElementById("%s"); + field.setAttribute("value", "%s"); + ''' + elif field[0] == By.NAME: + js_tpl = ''' + var fields = document.getElementsByName("%s"); + for (var f in fields) { + f.setAttribute("value", "%s"); + } + ''' + js_str = js_tpl % (field[1], self.search_param_values[param]) + self.webdriver.execute_script(js_str) - try: - self.search_input.send_keys(self.query + Keys.ENTER) - except 
ElementNotVisibleException: - time.sleep(2) - self.search_input.send_keys(self.query + Keys.ENTER) - except Exception: - logger.error('send keys not possible') - # time.sleep(60) - pass + try: + self.search_input.send_keys(self.query + Keys.ENTER) + except ElementNotVisibleException: + time.sleep(2) + self.search_input.send_keys(self.query + Keys.ENTER) + except Exception: + logger.error('send keys not possible') + pass - self.requested_at = datetime.datetime.utcnow() - else: - logger.debug('{}: Cannot get handle to the input form for keyword {}.'.format(self.name, self.query)) - continue + self.requested_at = datetime.datetime.utcnow() + else: + logger.debug('{}: Cannot get handle to the input form for keyword {}.'.format(self.name, self.query)) - super().detection_prevention_sleep() - super().keyword_info() + super().detection_prevention_sleep() + super().keyword_info() - for self.page_number in self.pages_per_keyword: + for self.page_number in self.pages_per_keyword: - self.wait_until_serp_loaded() + self.wait_until_serp_loaded() - try: - self.html = self.webdriver.execute_script('return document.body.innerHTML;') - except WebDriverException: - self.html = self.webdriver.page_source + try: + self.html = self.webdriver.execute_script('return document.body.innerHTML;') + except WebDriverException: + self.html = self.webdriver.page_source - super().after_search() + super().after_search() - # Click the next page link not when leaving the loop - # in the next iteration. - if self.page_number in self.pages_per_keyword: - next_url = self._goto_next_page() - self.requested_at = datetime.datetime.utcnow() + # Click the next page link not when leaving the loop + # in the next iteration. + if self.page_number in self.pages_per_keyword: + next_url = self._goto_next_page() + self.requested_at = datetime.datetime.utcnow() - if not next_url: - break + if not next_url: + break def page_down(self): """Scrolls down a page with javascript. 
@@ -676,27 +675,30 @@ def page_down(self): def run(self): """Run the SelScraper.""" - self._set_xvfb_display() + for self.query, self.pages_per_keyword in self.jobs.items(): + # for each keyword request a fresh webdriver instance + # with random useragent and window_size + self._set_xvfb_display() - if not self._get_webdriver(): - raise Exception('{}: Aborting due to no available selenium webdriver.'.format(self.name)) + if not self._get_webdriver(): + raise Exception('{}: Aborting due to no available selenium webdriver.'.format(self.name)) - try: - x = randint(800, 1024) - y = randint(600, 900) - self.webdriver.set_window_size(x, y) - self.webdriver.set_window_position(x * (self.browser_num % 4), y * (math.floor(self.browser_num // 4))) - except WebDriverException as e: - logger.error('Cannot set window size: {}'.format(e)) + try: + x = randint(800, 1024) + y = randint(600, 900) + self.webdriver.set_window_size(x, y) + self.webdriver.set_window_position(x * (self.browser_num % 4), y * (math.floor(self.browser_num // 4))) + except WebDriverException as e: + logger.error('Cannot set window size: {}'.format(e)) - super().before_search() + super().before_search() - if self.startable: - self.build_search() - self.search() + if self.startable: + self.build_search() + self.search() - if self.webdriver: - self.webdriver.quit() + if self.webdriver: + self.webdriver.quit() """ diff --git a/scrapcore/scraping.py b/scrapcore/scraping.py index d74febb..bb88625 100644 --- a/scrapcore/scraping.py +++ b/scrapcore/scraping.py @@ -11,7 +11,7 @@ from scrapcore.tools import Proxies logger = logging.getLogger(__name__) -SEARCH_MODES = ('http', 'selenium') +SEARCH_MODES = ('selenium') class GoogleSearchError(Exception): @@ -146,14 +146,11 @@ def __init__(self, self.requested_at = None # The name of the scraper self.name = '[{}]'.format(self.search_engine_name) + self.__class__.__name__ - # How long to sleep (in seconds) after every n-th request - # self.config.get(self.config.get('sleeping_ranges')) - self.sleeping_ranges = { - 1: (2, 5), - 5: (5, 10), - 30: (10, 20), - 127: (30, 50), - } + + # How long to sleep (in seconds) after every request + self.sleeping_min = self.config.get('sleeping_min') + self.sleeping_max = self.config.get('sleeping_max') + # the default timeout self.timeout = 5 # the status of the thread after finishing or failing @@ -258,22 +255,12 @@ def cache_results(self): db_lock=self.db_lock ) - def _largest_sleep_range(self, search_number): - """Sleep a given amount of time - dependent on the number of searches done.""" - - assert search_number >= 0 - if search_number != 0: - s = sorted(self.sleeping_ranges.keys(), reverse=True) - for n in s: - if search_number % n == 0: - return self.sleeping_ranges[n] - # sleep one second - return 1, 2 - def detection_prevention_sleep(self): - # match the largest sleep range - self.current_delay = random.randrange(*self._largest_sleep_range(self.search_number)) + # randomly delay from sleep range + self.current_delay = random.randrange( + self.sleeping_min, + self.sleeping_max + ) time.sleep(self.current_delay) def after_search(self): diff --git a/scrapcore/tools.py b/scrapcore/tools.py index f9f354d..e3ac05a 100644 --- a/scrapcore/tools.py +++ b/scrapcore/tools.py @@ -73,9 +73,9 @@ class Proxies(): def parse_proxy_file(self, fname): """Parses a proxy file The format should be like the following: - socks5 23.212.45.13:1080 username:password - socks4 23.212.45.13:80 username:password - http 23.212.45.13:80 + socks5 XX.XXX.XX.XX:1080 username:password + 
socks4 XX.XXX.XX.XX:80 username:password + http XX.XXX.XX.XX:80 If username and password aren't provided, we assumes that the proxy doesn't need auth credentials. Args: diff --git a/scrapcore/user_agent.py b/scrapcore/user_agent.py index 206b74b..3debc5f 100644 --- a/scrapcore/user_agent.py +++ b/scrapcore/user_agent.py @@ -49,7 +49,7 @@ 'Mozilla/5.0 (Macintosh, Intel Mac OS X 10_12_4) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.1 Safari/603.1.30', 'Mozilla/5.0 (Windows NT 6.1, rv:52.0) Gecko/20100101 Firefox/52.0', 'Mozilla/5.0 (Windows NT 6.1, WOW64, rv:45.0) Gecko/20100101 Firefox/45.0', - 'Mozilla/5.0 (Windows NT 10.0, WOW64, Trident/7.0, rv:11.0) like Gecko', + # 'Mozilla/5.0 (Windows NT 10.0, WOW64, Trident/7.0, rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 6.3, WOW64, Trident/7.0, rv:11.0) like Gecko', 'Mozilla/5.0 (Macintosh, Intel Mac OS X 10_12_3) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8', 'Mozilla/5.0 (Windows NT 10.0, Win64, x64, rv:52.0) Gecko/20100101 Firefox/52.0', diff --git a/scrapcore/validator_config.py b/scrapcore/validator_config.py index c28e3ea..2a51e08 100644 --- a/scrapcore/validator_config.py +++ b/scrapcore/validator_config.py @@ -19,5 +19,5 @@ def validate(self, config): if config.get('use_own_ip') != True and len(config.get('proxy_file')) == 0: raise Error('No proxy_file provided and using own IP is disabled.') - if config.get('scrape_method') not in ('http', 'selenium'): + if config.get('scrape_method') not in ('selenium'): raise Error('No such scrape_method {}'.format(config.get('scrape_method'))) diff --git a/serpscrap/config.py b/serpscrap/config.py index bb50ad7..2cd33fb 100644 --- a/serpscrap/config.py +++ b/serpscrap/config.py @@ -21,7 +21,7 @@ class Config(): 'use_own_ip': True, 'search_engines': ['google'], 'num_pages_for_keyword': 2, - 'scrape_method': 'selenium', # http + 'scrape_method': 'selenium', 'sel_browser': 'phantomjs', 'executable_path': '', 'do_caching': True, @@ -36,6 +36,8 @@ class Config(): 'log_level': 'INFO', 'num_workers': 1, 'num_results_per_page': 10, + 'sleeping_min': 5, + 'sleeping_max': 15, 'search_type': 'normal', 'google_search_url': 'https://www.google.com/search?', 'bing_search_url': 'http://www.bing.com/search?', diff --git a/serpscrap/serpscrap.py b/serpscrap/serpscrap.py index 4685c2e..ed98109 100644 --- a/serpscrap/serpscrap.py +++ b/serpscrap/serpscrap.py @@ -71,8 +71,7 @@ def init(self, config=None, keywords=None): else: self.config = Config().get() - if 'selenium' in self.config['scrape_method'] and \ - self.config['executable_path'] == '': + if self.config['executable_path'] == '': logger.info('preparing phantomjs') firstrun = PhantomInstall() phantomjs = firstrun.detect_phantomjs() diff --git a/setup.py b/setup.py index a1512a7..0888a29 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- from setuptools import setup, find_packages -version = '0.5.1' +version = '0.6.0' setup( @@ -17,22 +17,23 @@ long_description=open('README.rst').read(), author='Ronald Schmidt', author_email='ronald.schmidt@zu-web.de', + doc_url='http://serpscrap.readthedocs.io/en/latest/', url='https://github.com/ecoron/SerpScrap', license='MIT', packages=find_packages(), install_requires=[ 'PySocks==1.6.7', - 'chardet==2.3.0', - 'beautifulsoup4==4.5.3', + 'chardet==3.0.2', + 'beautifulsoup4==4.6.0', 'html2text==2016.9.19', - 'markovify==0.5.4', + 'markovify==0.6.0', 'numpy==1.12.1', 'scipy==0.19.0', 'scikit-learn==0.18.1', 'lxml', 'sqlalchemy==1.0.12', - 'selenium==3.3.3', - 
'cssselect==0.9.1', + 'selenium==3.4.1', + 'cssselect==1.0.1', 'requests==2.13.0', 'aiohttp==0.21.5', ], diff --git a/tests/test_basic.py b/tests/test_basic.py index 16b3aa7..00ed1b6 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -26,6 +26,6 @@ def test_simple(self): scrap.init(config=config.get(), keywords=keywords) results = scrap.run() - assert len(config.get()) == 26 + assert len(config.get()) == 28 assert len(results) > 0 assert len(results[0]) > 0
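
The new throttling keys and the selenium-only scrape method shown in the serpscrap/config.py and docs/configuration.rst hunks above can be exercised together; a minimal usage sketch, assuming the 0.6.0 defaults, with an illustrative keyword value:

    #!/usr/bin/python3
    # -*- coding: utf-8 -*-
    import serpscrap

    config = serpscrap.Config()
    # 'http' mode was removed in 0.6.0 and no longer passes the check
    # in scrapcore/validator_config.py
    config.set('scrape_method', 'selenium')
    # detection_prevention_sleep() now pauses random.randrange(5, 15)
    # seconds after every request, i.e. between 5 and 14 seconds
    config.set('sleeping_min', 5)
    config.set('sleeping_max', 15)
    config.set('num_pages_for_keyword', 2)

    scrap = serpscrap.SerpScrap()
    scrap.init(config=config.get(), keywords=['example keyword'])
    results = scrap.run()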
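
The markovify bump from 0.5.4 to 0.6.0 is what the char_limit -> max_chars change in examples/example_advanced.py tracks; a short sketch of the renamed keyword argument, assuming markovify 0.6.x is installed and using an illustrative corpus:

    import markovify

    corpus = ('Scraped titles feed the markov model. '
              'The markov model generates new titles. '
              'New titles are filtered by length.')
    model = markovify.Text(corpus, state_size=1)

    # markovify 0.6 expects max_chars where the pinned 0.5.4 accepted char_limit
    text = model.make_short_sentence(max_chars=80, tries=10)
    # make_short_sentence may return None for a tiny corpus, so guard like the example does
    if isinstance(text, str):
        print(text)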
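
For runs without the local IP (use_own_ip set to False), scrapcore/validator_config.py still requires a proxy_file; a small sketch of the file format documented in scrapcore/tools.py, with placeholder addresses and a hypothetical path:

    import serpscrap

    config = serpscrap.Config()
    # proxies.txt -- one proxy per line, credentials are optional:
    #   socks5 192.0.2.10:1080 username:password
    #   socks4 192.0.2.11:80 username:password
    #   http 192.0.2.12:80
    config.set('use_own_ip', False)
    config.set('proxy_file', '/path/to/proxies.txt')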