diff --git a/README.md b/README.md
index 3984e9c..a01edcc 100644
--- a/README.md
+++ b/README.md
@@ -88,9 +88,9 @@ Here is the list of available actions:
 - `Click(selector, click_options, wait_options)` - click on element on page
 - `Scroll(selector, wait_options)` - scroll page
 - `Screenshot(options)` - take screenshot
+- `RecaptchaSolver(solve_recaptcha, close_on_empty)` - find or solve recaptcha on page
 - `Har()` - to get the HAR file, pass the `har_recording=True` argument to `PuppeteerRequest` at the start of execution.
 - `FillForm(input_mapping, submit_button)` - to fill out and submit forms on page.
-- `RecaptchaSolver(solve_recaptcha)` - find or solve recaptcha on page
 - `CustomJsAction(js_function)` - evaluate JS function on page
 
 Available options essentially mirror [service](https://github.com/ispras/scrapy-puppeteer-service) method parameters, which in turn mirror puppeteer API functions to some extent.
@@ -174,6 +174,42 @@ and will notify you about number of found captchas on the page.
 If you don't want the middleware to work on specific request you may provide special meta key: `'dont_recaptcha': True`.
 In this case RecaptchaMiddleware will just skip the request.
 
+## Automatic context restoring
+
+Sometimes you may receive responses with status 422 (Unprocessable Entity).
+This means that scrapy-puppeteer-service failed to find the provided context or page in its memory.
+In such situations you can use `PuppeteerContextRestoreDownloaderMiddleware` to restore the context.
+
+Enabling the middleware:
+```Python
+DOWNLOADER_MIDDLEWARES = {  # Strict order of middlewares
+    # 'scrapypuppeteer.middleware.PuppeteerRecaptchaDownloaderMiddleware': 1040,  # You may also use the recaptcha middleware
+    'scrapypuppeteer.middleware.PuppeteerContextRestoreDownloaderMiddleware': 1041,
+    'scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware': 1042,
+}
+```
+
+Middleware settings:
+```Python
+N_RETRY_RESTORING = 3  # Number of attempts to restore a context
+RESTORING_LENGTH = 2  # Number of restorable requests in a sequence
+```
+
+Currently, the middleware can only restart from the beginning of the request-response sequence.
+You mark the start of such a sequence by setting the `recover_context` meta-key to `True`.
+Example:
+```Python
+...
+yield PuppeteerRequest(
+    url,
+    callback=self.click_on_navigation,
+    errback=self.errback,
+    close_page=False,
+    meta={'recover_context': True}
+)
+...
+``` + ## TODO - [x] skeleton that could handle goto, click, scroll, and actions diff --git a/examples/spiders/dead_context.py b/examples/spiders/dead_context.py new file mode 100644 index 0000000..93dbee5 --- /dev/null +++ b/examples/spiders/dead_context.py @@ -0,0 +1,74 @@ +import scrapy + +from asyncio import sleep + +from scrapypuppeteer import PuppeteerRequest, PuppeteerResponse +from scrapypuppeteer.actions import Click, GoTo +from twisted.python.failure import Failure + + +class DeadContextSpider(scrapy.Spider): + custom_settings = { + "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor", + "DOWNLOADER_MIDDLEWARES": { + "scrapypuppeteer.middleware.PuppeteerContextRestoreDownloaderMiddleware": 1041, + "scrapypuppeteer.middleware.PuppeteerServiceDownloaderMiddleware": 1042, + }, + "N_RETRY_RESTORING": 3, + "RESTORING_LENGTH": 2, + } + name = "dead_context" + + def start_requests(self): + urls = [ + "https://www.google.com/recaptcha/api2/demo", + "https://scrapy.org", + "https://pptr.dev", + ] + + for url in urls: + yield PuppeteerRequest( + url, + callback=self.click_on_navigation, + errback=self.errback, + close_page=False, + meta={"recover_context": True}, + ) + + async def click_on_navigation(self, response: PuppeteerResponse): + await sleep(4) + + click = Click( + "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a:nth-child(3)" + ) + yield response.follow( + click, callback=self.click_back, errback=self.errback, close_page=False + ) + + async def click_back(self, response: PuppeteerResponse): + await sleep(4) + + click = Click( + "#__docusaurus > nav > div.navbar__inner > div:nth-child(1) > a.navbar__brand > b" + ) + yield response.follow( + click, callback=self.goto_api, errback=self.errback, close_page=False + ) + + async def goto_api(self, response): + await sleep(4) + + yield response.follow( + GoTo("api/puppeteer.puppeteernode"), + callback=self.empty_action, + errback=self.errback, + close_page=False, + ) + + @staticmethod + async def empty_action(response, **kwargs): + await sleep(4) + + @staticmethod + def errback(failure: Failure): + print(failure) diff --git a/scrapypuppeteer/middleware.py b/scrapypuppeteer/middleware.py index 99568c5..68d454a 100644 --- a/scrapypuppeteer/middleware.py +++ b/scrapypuppeteer/middleware.py @@ -1,320 +1,24 @@ -import logging -from collections import defaultdict -from typing import List, Union +import warnings -from scrapy import signals -from scrapy.crawler import Crawler -from scrapy.exceptions import IgnoreRequest, NotConfigured +import scrapy.exceptions - -from scrapypuppeteer.actions import ( - Click, - RecaptchaSolver, - Screenshot, - Scroll, - CustomJsAction, -) -from scrapypuppeteer.response import ( - PuppeteerResponse, - PuppeteerHtmlResponse, -) -from scrapypuppeteer.request import ActionRequest, PuppeteerRequest, CloseContextRequest -from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import ( - PyppeteerBrowserManager, +from .middlewares import ( + PuppeteerServiceDownloaderMiddleware, + PuppeteerRecaptchaDownloaderMiddleware, + PuppeteerContextRestoreDownloaderMiddleware, ) -from scrapypuppeteer.browser_managers.service_browser_manager import ( - ServiceBrowserManager, -) -from scrapypuppeteer.browser_managers.playwright_browser_manager import ( - PlaywrightBrowserManager, -) - -from scrapypuppeteer.browser_managers import BrowserManager - - -class PuppeteerServiceDownloaderMiddleware: - """ - This downloader middleware converts PuppeteerRequest instances to - Puppeteer service 
API requests and then converts its responses to - PuppeteerResponse instances. Additionally, it tracks all browser contexts - that spider uses and performs cleanup request to service right before - spider is closed. - - Additionally, the middleware uses these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - 'puppeteer_request', 'dont_obey_robotstxt', 'proxy' - - Settings: - - PUPPETEER_SERVICE_URL (str) - Service URL, e.g. 'http://localhost:3000' - - PUPPETEER_INCLUDE_HEADERS (bool|list[str]) - Determines which request headers will be sent to remote site by puppeteer service. - Either True (all headers), False (no headers) or list of header names. - May be overridden per request. - By default, only cookies are sent. - - PUPPETEER_INCLUDE_META (bool) - Determines whether to send or not user's meta attached by user. - Default to False. - """ - - SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" - INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" - SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" - DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately - - EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" - - service_logger = logging.getLogger(__name__) - - def __init__( - self, - crawler: Crawler, - service_url: str, - include_headers: Union[bool, List[str]], - include_meta: bool, - browser_manager: BrowserManager, - ): - self.service_base_url = service_url - self.include_headers = include_headers - self.include_meta = include_meta - self.crawler = crawler - self.used_contexts = defaultdict(set) - self.browser_manager = browser_manager - - @classmethod - def from_crawler(cls, crawler): - service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) - if cls.INCLUDE_HEADERS_SETTING in crawler.settings: - try: - include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) - except ValueError: - include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) - else: - include_headers = cls.DEFAULT_INCLUDE_HEADERS - include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) - - execution_method = crawler.settings.get( - cls.EXECUTION_METHOD_SETTING, "PUPPETEER" - ).lower() - - if execution_method == "pyppeteer": - browser_manager = PyppeteerBrowserManager() - elif execution_method == "puppeteer": - browser_manager = ServiceBrowserManager( - service_url, include_meta, include_headers, crawler - ) - elif execution_method == "playwright": - browser_manager = PlaywrightBrowserManager() - else: - raise NameError("Wrong EXECUTION_METHOD") - - middleware = cls( - crawler, service_url, include_headers, include_meta, browser_manager - ) - crawler.signals.connect( - middleware.browser_manager.close_used_contexts, signal=signals.spider_idle - ) - return middleware - - def process_request(self, request, spider): - return self.browser_manager.process_request(request) - - def process_response(self, request, response, spider): - return self.browser_manager.process_response(self, request, response, spider) - -class PuppeteerRecaptchaDownloaderMiddleware: - """ - This middleware is supposed to solve recaptcha on the page automatically. - If there is no captcha on the page then this middleware will do nothing - on the page, so your 2captcha balance will remain the same. - It can submit recaptcha if "submit button" is provided. - It will not "submit" captcha if there is no submit-selector. - If you want to turn Recaptcha solving off on the exact request provide - meta-key 'dont_recaptcha' with True value. 
The middleware will skip the request - through itself. - - The middleware uses additionally these meta-keys, do not use them, because their changing - could possibly (almost probably) break determined behaviour: - '_captcha_submission', '_captcha_solving' - - Settings: - - RECAPTCHA_ACTIVATION: bool = True - activates or not the middleware (if not - raises NotConfigured) - RECAPTCHA_SOLVING: bool = True - whether solve captcha automatically or not - RECAPTCHA_SUBMIT_SELECTORS: str | dict = {} - dictionary consisting of domains and - these domains' submit selectors, e.g. - 'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit' - it could be also squeezed to - 'ecaptcha/api2/de': '#recaptcha-demo-submit' - also you can use not just strings but Click actions with required parameters: - 'ogle.com/recaptcha': Click('#recaptcha-demo-submit') - In general - domain is a unique identifying string which is contained in web-page url - If there is no button to submit recaptcha then provide empty string to a domain. - This setting can also be a string. If so the middleware will only click the button - related to this selector. - This setting can also be unprovided. In this case every web-page you crawl is supposed to be - without submit button, or you manually do it yourself. - """ - - MIDDLEWARE_ACTIVATION_SETTING = "RECAPTCHA_ACTIVATION" - RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING" - SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS" - - def __init__(self, recaptcha_solving: bool, submit_selectors: dict): - self.submit_selectors = submit_selectors - self.recaptcha_solving = recaptcha_solving - self._page_responses = dict() - self._page_closing = set() - - @classmethod - def from_crawler(cls, crawler: Crawler): - activation = crawler.settings.get(cls.MIDDLEWARE_ACTIVATION_SETTING, True) - if not activation: - raise NotConfigured - recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True) - - try: - submit_selectors = crawler.settings.getdict( - cls.SUBMIT_SELECTORS_SETTING, dict() - ) - except ValueError: - submit_selectors = { - "": crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, "") - } - except Exception as exception: - raise ValueError( - f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}" - ) - - for key in submit_selectors.keys(): - submit_selector = submit_selectors[key] - if isinstance(submit_selector, str): - submit_selectors[key] = Click(selector=submit_selector) - elif not isinstance(submit_selector, Click): - raise ValueError( - "Submit selector must be str or Click," - f"but {type(submit_selector)} provided" - ) - return cls(recaptcha_solving, submit_selectors) - - @staticmethod - def is_recaptcha_producing_action(action) -> bool: - return not isinstance( - action, - (Screenshot, Scroll, CustomJsAction, RecaptchaSolver), - ) - - def process_request(self, request, **_): - if request.meta.get("dont_recaptcha", False): - return None - - # Checking if we need to close page after action - if isinstance(request, PuppeteerRequest): - if self.is_recaptcha_producing_action(request.action): - if request.close_page and not request.meta.get( - "_captcha_submission", False - ): - request.close_page = False - request.dont_filter = True - self._page_closing.add(request) - return request - - def process_response(self, request, response, spider): - if not isinstance( - response, PuppeteerResponse - ): # We only work with PuppeteerResponses - return response - - puppeteer_request = response.puppeteer_request - if 
puppeteer_request.meta.get("dont_recaptcha", False): # Skip such responses - return response - - if puppeteer_request.meta.pop( - "_captcha_submission", False - ): # Submitted captcha - return self.__gen_response(response) - - if puppeteer_request.meta.pop("_captcha_solving", False): - # RECaptchaSolver was called by recaptcha middleware - return self._submit_recaptcha(request, response, spider) - - if not self.is_recaptcha_producing_action(puppeteer_request.action): - # No recaptcha after these actions - return response - - # Any puppeteer response besides PuppeteerRecaptchaSolverResponse - return self._solve_recaptcha(request, response) - - def _solve_recaptcha(self, request, response): - self._page_responses[response.page_id] = ( - response # Saving main response to return it later - ) - - recaptcha_solver = RecaptchaSolver( - solve_recaptcha=self.recaptcha_solving, - close_on_empty=self.__is_closing(response, remove_request=False), - ) - return response.follow( - recaptcha_solver, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - meta={"_captcha_solving": True}, - close_page=False, - ) - - def _submit_recaptcha(self, request, response, spider): - if not response.puppeteer_request.action.solve_recaptcha: - spider.log( - message=f"Found {len(response.recaptcha_data['captchas'])} captcha " - f"but did not solve due to argument", - level=logging.INFO, - ) - return self.__gen_response(response) - # Click "submit button"? - if response.recaptcha_data["captchas"] and self.submit_selectors: - # We need to click "submit button" - for domain, submitting in self.submit_selectors.items(): - if domain in response.url: - if not submitting.selector: - return self.__gen_response(response) - return response.follow( - action=submitting, - callback=request.callback, - cb_kwargs=request.cb_kwargs, - errback=request.errback, - close_page=self.__is_closing(response), - meta={"_captcha_submission": True}, - ) - raise IgnoreRequest( - "No submit selector found to click on the page but captcha found" - ) - return self.__gen_response(response) - - def __gen_response(self, response): - main_response_data = dict() - main_response_data["page_id"] = ( - None if self.__is_closing(response) else response.puppeteer_request.page_id - ) - - main_response = self._page_responses.pop(response.page_id) - - if isinstance(main_response, PuppeteerHtmlResponse): - if isinstance(response.puppeteer_request.action, RecaptchaSolver): - main_response_data["body"] = response.html - elif isinstance(response.puppeteer_request.action, Click): - main_response_data["body"] = response.body +warnings.warn( + "Import from `scrapypuppeteer.middleware` is deprecated. 
" + "Use `scrapypuppeteer.middlewares` instead.", + scrapy.exceptions.ScrapyDeprecationWarning, + stacklevel=2, +) - return main_response.replace(**main_response_data) - def __is_closing(self, response, remove_request: bool = True) -> bool: - main_request = self._page_responses[response.page_id].puppeteer_request - close_page = main_request in self._page_closing - if close_page and remove_request: - self._page_closing.remove(main_request) - return close_page +__all__ = [ + "PuppeteerServiceDownloaderMiddleware", + "PuppeteerRecaptchaDownloaderMiddleware", + "PuppeteerContextRestoreDownloaderMiddleware", +] diff --git a/scrapypuppeteer/middlewares/__init__.py b/scrapypuppeteer/middlewares/__init__.py new file mode 100644 index 0000000..fa2a319 --- /dev/null +++ b/scrapypuppeteer/middlewares/__init__.py @@ -0,0 +1,3 @@ +from .service import PuppeteerServiceDownloaderMiddleware +from .recaptcha import PuppeteerRecaptchaDownloaderMiddleware +from .restore import PuppeteerContextRestoreDownloaderMiddleware diff --git a/scrapypuppeteer/middlewares/recaptcha.py b/scrapypuppeteer/middlewares/recaptcha.py new file mode 100644 index 0000000..4755d9e --- /dev/null +++ b/scrapypuppeteer/middlewares/recaptcha.py @@ -0,0 +1,204 @@ +import logging + +from scrapy.crawler import Crawler +from scrapy.exceptions import IgnoreRequest, NotConfigured + +from scrapypuppeteer.actions import ( + Click, + RecaptchaSolver, + Screenshot, + Scroll, + CustomJsAction, +) +from scrapypuppeteer.response import PuppeteerResponse, PuppeteerHtmlResponse +from scrapypuppeteer.request import PuppeteerRequest + +recaptcha_logger = logging.getLogger(__name__) + + +class PuppeteerRecaptchaDownloaderMiddleware: + """ + This middleware is supposed to solve recaptcha on the page automatically. + If there is no captcha on the page then this middleware will do nothing + on the page, so your 2captcha balance will remain the same. + It can submit recaptcha if "submit button" is provided. + It will not "submit" captcha if there is no submit-selector. + + If you want to turn Recaptcha solving off on the exact request provide + meta-key 'dont_recaptcha' with True value. The middleware will skip the request + through itself. + + The middleware uses additionally these meta-keys, do not use them, because their changing + could possibly (almost probably) break determined behaviour: + '_captcha_submission', '_captcha_solving' + + Settings: + + RECAPTCHA_ACTIVATION: bool = True - activates or not the middleware (if not - raises NotConfigured) + RECAPTCHA_SOLVING: bool = True - whether solve captcha automatically or not + RECAPTCHA_SUBMIT_SELECTORS: str | dict = {} - dictionary consisting of domains and + these domains' submit selectors, e.g. + 'www.google.com/recaptcha/api2/demo': '#recaptcha-demo-submit' + it could be also squeezed to + 'ecaptcha/api2/de': '#recaptcha-demo-submit' + also you can use not just strings but Click actions with required parameters: + 'ogle.com/recaptcha': Click('#recaptcha-demo-submit') + In general - domain is a unique identifying string which is contained in web-page url + If there is no button to submit recaptcha then provide empty string to a domain. + This setting can also be a string. If so the middleware will only click the button + related to this selector. + This setting can also be unprovided. In this case every web-page you crawl is supposed to be + without submit button, or you manually do it yourself. 
+ """ + + MIDDLEWARE_ACTIVATION_SETTING = "RECAPTCHA_ACTIVATION" + RECAPTCHA_SOLVING_SETTING = "RECAPTCHA_SOLVING" + SUBMIT_SELECTORS_SETTING = "RECAPTCHA_SUBMIT_SELECTORS" + + def __init__(self, recaptcha_solving: bool, submit_selectors: dict): + self.submit_selectors = submit_selectors + self.recaptcha_solving = recaptcha_solving + self._page_responses = dict() + self._page_closing = set() + + @classmethod + def from_crawler(cls, crawler: Crawler): + activation = crawler.settings.get(cls.MIDDLEWARE_ACTIVATION_SETTING, True) + if not activation: + raise NotConfigured + recaptcha_solving = crawler.settings.get(cls.RECAPTCHA_SOLVING_SETTING, True) + + try: + submit_selectors = crawler.settings.getdict( + cls.SUBMIT_SELECTORS_SETTING, dict() + ) + except ValueError: + submit_selectors = { + "": crawler.settings.get(cls.SUBMIT_SELECTORS_SETTING, "") + } + except Exception as exception: + raise ValueError( + f"Wrong argument(s) inside {cls.SUBMIT_SELECTORS_SETTING}: {exception}" + ) + + for key in submit_selectors.keys(): + submit_selector = submit_selectors[key] + if isinstance(submit_selector, str): + submit_selectors[key] = Click(selector=submit_selector) + elif not isinstance(submit_selector, Click): + raise TypeError( + f"Submit selector must be str or Click, got {type(submit_selector)}" + ) + return cls(recaptcha_solving, submit_selectors) + + def process_request(self, request, spider): + if request.meta.get("dont_recaptcha", False): + return None + + if isinstance(request, PuppeteerRequest): + if request.close_page and not request.meta.get( + "_captcha_submission", False + ): + request.close_page = False + request.dont_filter = True + self._page_closing.add(request) + return request + return None + + def process_response(self, request, response, spider): + if not isinstance( + response, PuppeteerResponse + ): # We only work with PuppeteerResponses + return response + + puppeteer_request = response.puppeteer_request + if puppeteer_request.meta.get("dont_recaptcha", False): # Skip such responses + return response + + if puppeteer_request.meta.pop( + "_captcha_submission", False + ): # Submitted captcha + return self.__gen_response(response) + + if puppeteer_request.meta.pop("_captcha_solving", False): + # RECaptchaSolver was called by recaptcha middleware + return self._submit_recaptcha(request, response, spider) + + if isinstance( + puppeteer_request.action, + (Screenshot, Scroll, CustomJsAction, RecaptchaSolver), + ): + # No recaptcha after these actions + return response + + # Any puppeteer response besides PuppeteerRecaptchaSolverResponse + return self._solve_recaptcha(request, response) + + def _solve_recaptcha(self, request, response): + self._page_responses[response.page_id] = ( + response # Saving main response to return it later + ) + + recaptcha_solver = RecaptchaSolver( + solve_recaptcha=self.recaptcha_solving, + close_on_empty=self.__is_closing(response, remove_request=False), + ) + return response.follow( + recaptcha_solver, + callback=request.callback, + cb_kwargs=request.cb_kwargs, + errback=request.errback, + meta={"_captcha_solving": True}, + close_page=False, + ) + + def _submit_recaptcha(self, request, response, spider): + if not response.puppeteer_request.action.solve_recaptcha: + recaptcha_logger.log( + level=logging.INFO, + msg=f"Found {len(response.recaptcha_data['captchas'])} captcha " + f"but did not solve due to argument", + ) + return self.__gen_response(response) + # Click "submit button"? 
+        if response.recaptcha_data["captchas"] and self.submit_selectors:
+            # We need to click "submit button"
+            for domain, submitting in self.submit_selectors.items():
+                if domain in response.url:
+                    if not submitting.selector:
+                        return self.__gen_response(response)
+                    return response.follow(
+                        action=submitting,
+                        callback=request.callback,
+                        cb_kwargs=request.cb_kwargs,
+                        errback=request.errback,
+                        close_page=self.__is_closing(response),
+                        meta={"_captcha_submission": True},
+                    )
+            raise IgnoreRequest(
+                "No submit selector found to click on the page but captcha found"
+            )
+        return self.__gen_response(response)
+
+    def __gen_response(self, response):
+        main_response_data = dict()
+        main_response_data["page_id"] = (
+            None if self.__is_closing(response) else response.puppeteer_request.page_id
+        )
+
+        main_response = self._page_responses.pop(response.page_id)
+
+        if isinstance(main_response, PuppeteerHtmlResponse):
+            if isinstance(response.puppeteer_request.action, RecaptchaSolver):
+                main_response_data["body"] = response.html
+            elif isinstance(response.puppeteer_request.action, Click):
+                main_response_data["body"] = response.body
+
+        return main_response.replace(**main_response_data)
+
+    def __is_closing(self, response, remove_request: bool = True) -> bool:
+        main_request = self._page_responses[response.page_id].puppeteer_request
+        close_page = main_request in self._page_closing
+        if close_page and remove_request:
+            self._page_closing.remove(main_request)
+        return close_page
diff --git a/scrapypuppeteer/middlewares/restore.py b/scrapypuppeteer/middlewares/restore.py
new file mode 100644
index 0000000..db6e1ad
--- /dev/null
+++ b/scrapypuppeteer/middlewares/restore.py
@@ -0,0 +1,157 @@
+import json
+import logging
+
+from typing import Union
+from http import HTTPStatus
+
+from scrapy.crawler import Crawler
+from scrapy.exceptions import IgnoreRequest
+
+from scrapypuppeteer.response import PuppeteerResponse
+from scrapypuppeteer.request import PuppeteerRequest
+
+restore_logger = logging.getLogger(__name__)
+
+
+class PuppeteerContextRestoreDownloaderMiddleware:
+    """
+    This middleware allows you to recover a lost puppeteer context.
+
+    If you want to recover a context starting from a specific first request, provide
+    the `recover_context` meta-key with a `True` value in that request.
+
+    The middleware also uses these meta-keys internally; do not set them yourself,
+    as changing them will almost certainly break the expected behaviour:
+    `__request_binding`, `__restore_count`, `__context_id`.
+
+    Settings:
+
+    RESTORING_LENGTH: int = 1 - number of restorable requests in a sequence.
+    N_RETRY_RESTORING: int = 1 - number of attempts to restore a context.
+ """ + + N_RETRY_RESTORING_SETTING = "N_RETRY_RESTORING" + RESTORING_LENGTH_SETTING = "RESTORING_LENGTH" + + def __init__(self, restoring_length: int, n_retry_restoring: int): + self.restoring_length = restoring_length + self.n_retry_restoring = n_retry_restoring + self.context_requests = {} + self.context_length = {} + + @classmethod + def from_crawler(cls, crawler: Crawler): + restoring_length = crawler.settings.get(cls.RESTORING_LENGTH_SETTING, 1) + if not isinstance(restoring_length, int): + raise TypeError( + f"`{cls.RESTORING_LENGTH_SETTING}` must be an integer, got {type(restoring_length)}" + ) + elif restoring_length < 1: + raise ValueError( + f"`{cls.RESTORING_LENGTH_SETTING}` must be greater than or equal to 1" + ) + + n_retry_restoring = crawler.settings.get(cls.N_RETRY_RESTORING_SETTING, 1) + if not isinstance(n_retry_restoring, int): + raise TypeError( + f"`{cls.N_RETRY_RESTORING_SETTING}` must be an integer, got {type(n_retry_restoring)}" + ) + elif n_retry_restoring < 1: + raise ValueError( + f"`{cls.N_RETRY_RESTORING_SETTING}` must be greater than or equal to 1" + ) + + return cls(restoring_length, n_retry_restoring) + + @staticmethod + def process_request(request, spider): + if not isinstance(request, PuppeteerRequest): + return None + + if not request.meta.pop("recover_context", False): + return None + + if request.context_id or request.page_id: + raise IgnoreRequest( + f"Request {request} is not in the beginning of the request-response sequence" + ) + + request.meta["__request_binding"] = True + request.dont_filter = True + return None + + def process_response(self, request, response, spider): + puppeteer_request: Union[PuppeteerRequest, None] = request.meta.get( + "puppeteer_request", None + ) + request_binding = puppeteer_request is not None and puppeteer_request.meta.get( + "__request_binding", False + ) + + if isinstance(response, PuppeteerResponse): + if request_binding: + self._bind_context(request, response) + if response.context_id in self.context_length: + # Update number of actions in context + self.context_length[response.context_id] += 1 + elif ( + puppeteer_request is not None + and response.status == HTTPStatus.UNPROCESSABLE_ENTITY + ): + # One PuppeteerRequest has failed with 422 error + if request_binding: + # Could not get context, retry + if request.meta.get("__restore_count", 0) < self.n_retry_restoring: + request.meta["__restore_count"] += 1 + return request + else: + return self._restore_context(response) + return response + + def _bind_context(self, request, response): + if request.meta.get("__context_id", None) is not None: + # Need to update context_id + self.__delete_context(request.meta["__context_id"], None) + restoring_request = request.copy() + restoring_request.meta["__restore_count"] = restoring_request.meta.get( + "__restore_count", 0 + ) + restoring_request.meta["__context_id"] = response.context_id + self.context_requests[response.context_id] = restoring_request + self.context_length[response.context_id] = 0 + + def _restore_context(self, response): + context_id = json.loads(response.text).get("contextId", None) + + if context_id in self.context_requests: + restoring_request = self.context_requests[context_id] + + if self.context_length[context_id] >= self.restoring_length + 1: + # Too many actions in context + self.__delete_context( + context_id, + f"Too many actions in context ({restoring_request}). 
Deleting it.",
+                )
+            elif restoring_request.meta["__restore_count"] >= self.n_retry_restoring:
+                # Too many retries
+                self.__delete_context(
+                    context_id,
+                    f"Too many retries in context ({restoring_request}). Deleting it.",
+                )
+            else:
+                # Restoring
+                restoring_request.meta["__restore_count"] += 1
+                restore_logger.log(
+                    level=logging.DEBUG,
+                    msg=f"Restoring the request {restoring_request}",
+                )
+                self.context_length[context_id] = 1
+                return restoring_request
+        return response
+
+    def __delete_context(self, context_id: str, reason: Union[str, None]):
+        del self.context_length[context_id]
+        del self.context_requests[context_id]
+
+        if reason is not None:
+            restore_logger.log(level=logging.INFO, msg=reason)
diff --git a/scrapypuppeteer/middlewares/service.py b/scrapypuppeteer/middlewares/service.py
new file mode 100644
index 0000000..4ec41a8
--- /dev/null
+++ b/scrapypuppeteer/middlewares/service.py
@@ -0,0 +1,112 @@
+import logging
+from collections import defaultdict
+from typing import List, Union
+
+from scrapy import signals
+from scrapy.crawler import Crawler
+
+from scrapypuppeteer.browser_managers.pyppeteer_browser_manager import (
+    PyppeteerBrowserManager,
+)
+from scrapypuppeteer.browser_managers.service_browser_manager import (
+    ServiceBrowserManager,
+)
+from scrapypuppeteer.browser_managers.playwright_browser_manager import (
+    PlaywrightBrowserManager,
+)
+
+from scrapypuppeteer.browser_managers import BrowserManager
+
+
+class PuppeteerServiceDownloaderMiddleware:
+    """
+    This downloader middleware converts PuppeteerRequest instances into
+    Puppeteer service API requests and then converts service responses into
+    PuppeteerResponse instances. Additionally, it tracks all browser contexts
+    that the spider uses and performs a cleanup request to the service right
+    before the spider is closed.
+
+    The middleware also uses these meta-keys internally; do not set them yourself,
+    as changing them will almost certainly break the expected behaviour:
+    'puppeteer_request', 'dont_obey_robotstxt', 'proxy'
+
+    Settings:
+
+    PUPPETEER_SERVICE_URL (str)
+        Service URL, e.g. 'http://localhost:3000'
+
+    PUPPETEER_INCLUDE_HEADERS (bool|list[str])
+        Determines which request headers will be sent to the remote site by the puppeteer service.
+        Either True (all headers), False (no headers) or a list of header names.
+        May be overridden per request.
+        By default, only cookies are sent.
+
+    PUPPETEER_INCLUDE_META (bool)
+        Determines whether to forward the meta attached by the user to the service.
+        Defaults to False.
+ """ + + SERVICE_URL_SETTING = "PUPPETEER_SERVICE_URL" + INCLUDE_HEADERS_SETTING = "PUPPETEER_INCLUDE_HEADERS" + SERVICE_META_SETTING = "PUPPETEER_INCLUDE_META" + DEFAULT_INCLUDE_HEADERS = ["Cookie"] # TODO send them separately + + EXECUTION_METHOD_SETTING = "EXECUTION_METHOD" + + service_logger = logging.getLogger(__name__) + + def __init__( + self, + crawler: Crawler, + service_url: str, + include_headers: Union[bool, List[str]], + include_meta: bool, + browser_manager: BrowserManager, + ): + self.service_base_url = service_url + self.include_headers = include_headers + self.include_meta = include_meta + self.crawler = crawler + self.used_contexts = defaultdict(set) + self.browser_manager = browser_manager + + @classmethod + def from_crawler(cls, crawler): + service_url = crawler.settings.get(cls.SERVICE_URL_SETTING) + if cls.INCLUDE_HEADERS_SETTING in crawler.settings: + try: + include_headers = crawler.settings.getbool(cls.INCLUDE_HEADERS_SETTING) + except ValueError: + include_headers = crawler.settings.getlist(cls.INCLUDE_HEADERS_SETTING) + else: + include_headers = cls.DEFAULT_INCLUDE_HEADERS + include_meta = crawler.settings.getbool(cls.SERVICE_META_SETTING, False) + + execution_method = crawler.settings.get( + cls.EXECUTION_METHOD_SETTING, "PUPPETEER" + ).lower() + + if execution_method == "pyppeteer": + browser_manager = PyppeteerBrowserManager() + elif execution_method == "puppeteer": + browser_manager = ServiceBrowserManager( + service_url, include_meta, include_headers, crawler + ) + elif execution_method == "playwright": + browser_manager = PlaywrightBrowserManager() + else: + raise NameError("Wrong EXECUTION_METHOD") + + middleware = cls( + crawler, service_url, include_headers, include_meta, browser_manager + ) + crawler.signals.connect( + middleware.browser_manager.close_used_contexts, signal=signals.spider_idle + ) + return middleware + + def process_request(self, request, spider): + return self.browser_manager.process_request(request) + + def process_response(self, request, response, spider): + return self.browser_manager.process_response(self, request, response, spider)
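Taken together, the middlewares introduced above are configured in one place. The following sketch consolidates the settings from the README section and the middleware docstrings; it is illustrative only: the service URL is a placeholder, and the priorities 1040-1042 simply follow the README's ordering comment.

```Python
# settings.py: a minimal consolidated sketch.
# PUPPETEER_SERVICE_URL is a placeholder; point it at your
# scrapy-puppeteer-service instance.
PUPPETEER_SERVICE_URL = "http://localhost:3000"

DOWNLOADER_MIDDLEWARES = {
    # Strict order: recaptcha solving (optional) -> context restoring -> service.
    # The old dotted paths under `scrapypuppeteer.middleware` still resolve
    # through the deprecation shim.
    "scrapypuppeteer.middlewares.PuppeteerRecaptchaDownloaderMiddleware": 1040,
    "scrapypuppeteer.middlewares.PuppeteerContextRestoreDownloaderMiddleware": 1041,
    "scrapypuppeteer.middlewares.PuppeteerServiceDownloaderMiddleware": 1042,
}

# Context-restoring settings (both default to 1).
RESTORING_LENGTH = 2   # number of restorable requests in a sequence
N_RETRY_RESTORING = 3  # number of attempts to restore a context
```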
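The `RECAPTCHA_SUBMIT_SELECTORS` setting accepts either a single selector string or a mapping, as the recaptcha docstring describes. Below is a minimal sketch of the mapping form; the second key is a hypothetical site added purely for illustration.

```Python
from scrapypuppeteer.actions import Click

RECAPTCHA_SUBMIT_SELECTORS = {
    # A key is any substring that uniquely identifies the target page URL.
    # A plain CSS selector is wrapped into a Click action by the middleware;
    # pass a Click yourself if you need click or wait options.
    "www.google.com/recaptcha/api2/demo": Click("#recaptcha-demo-submit"),
    # Map a domain to an empty string when its captcha has no submit button.
    # (hypothetical site, for illustration only)
    "some-site-without-submit.example": "",
}
RECAPTCHA_SOLVING = True  # solve found captchas automatically (the default)
```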
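Since `scrapypuppeteer/middleware.py` is now only a deprecation shim, existing imports keep working but should migrate to the new `middlewares` package:

```Python
# Deprecated path: resolved through the shim and emits
# ScrapyDeprecationWarning when the module is imported.
from scrapypuppeteer.middleware import PuppeteerRecaptchaDownloaderMiddleware

# Preferred path after this change:
from scrapypuppeteer.middlewares import PuppeteerRecaptchaDownloaderMiddleware
```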