From 45bb6e3d00d87045782a5d09a9c00e18073759c4 Mon Sep 17 00:00:00 2001 From: Hiroya Matsubara Date: Sat, 5 Oct 2024 08:27:57 +0900 Subject: [PATCH] add dpk_connector to dpk (#637) * add bluecrawl connector to dpk Signed-off-by: Hiroya Matsubara * update test data Signed-off-by: Hiroya Matsubara * rename Signed-off-by: Hiroya Matsubara * correct build Signed-off-by: Hiroya Matsubara * remove unnecessary file Signed-off-by: Hiroya Matsubara * add documentation Signed-off-by: Hiroya Matsubara * rename folder Signed-off-by: Hiroya Matsubara * renamed library workflow files Signed-off-by: David Wood --------- Signed-off-by: Hiroya Matsubara Signed-off-by: David Wood Co-authored-by: David Wood --- .github/workflows/test-connector-lib.yml | 32 ++ .../{test-lib.yml => test-processing-lib.yml} | 2 +- data-connector-lib/Makefile | 49 +++ data-connector-lib/README.md | 30 ++ data-connector-lib/doc/overview.md | 47 +++ data-connector-lib/pyproject.toml | 61 ++++ .../src/dpk_connector/__init__.py | 13 + .../src/dpk_connector/core/__init__.py | 11 + .../src/dpk_connector/core/crawler.py | 216 +++++++++++ .../src/dpk_connector/core/item.py | 21 ++ .../src/dpk_connector/core/logging.py | 22 ++ .../src/dpk_connector/core/middlewares.py | 263 ++++++++++++++ .../src/dpk_connector/core/pipelines.py | 29 ++ .../src/dpk_connector/core/settings.py | 70 ++++ .../dpk_connector/core/spiders/__init__.py | 11 + .../src/dpk_connector/core/spiders/sitemap.py | 334 ++++++++++++++++++ .../src/dpk_connector/core/utils.py | 92 +++++ .../test/dpk_connector/core/__init__.py | 0 .../test/dpk_connector/core/test_crawler.py | 28 ++ .../dpk_connector/core/test_middlewares.py | 59 ++++ .../dpk_connector/core/test_sitemap_spider.py | 97 +++++ .../core/test_sitemap_spider/index.html | 39 ++ .../test/dpk_connector/core/test_utils.py | 152 ++++++++ 23 files changed, 1677 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/test-connector-lib.yml rename .github/workflows/{test-lib.yml => test-processing-lib.yml} (98%) create mode 100644 data-connector-lib/Makefile create mode 100644 data-connector-lib/README.md create mode 100644 data-connector-lib/doc/overview.md create mode 100644 data-connector-lib/pyproject.toml create mode 100644 data-connector-lib/src/dpk_connector/__init__.py create mode 100644 data-connector-lib/src/dpk_connector/core/__init__.py create mode 100644 data-connector-lib/src/dpk_connector/core/crawler.py create mode 100644 data-connector-lib/src/dpk_connector/core/item.py create mode 100644 data-connector-lib/src/dpk_connector/core/logging.py create mode 100644 data-connector-lib/src/dpk_connector/core/middlewares.py create mode 100644 data-connector-lib/src/dpk_connector/core/pipelines.py create mode 100644 data-connector-lib/src/dpk_connector/core/settings.py create mode 100644 data-connector-lib/src/dpk_connector/core/spiders/__init__.py create mode 100644 data-connector-lib/src/dpk_connector/core/spiders/sitemap.py create mode 100644 data-connector-lib/src/dpk_connector/core/utils.py create mode 100644 data-connector-lib/test/dpk_connector/core/__init__.py create mode 100644 data-connector-lib/test/dpk_connector/core/test_crawler.py create mode 100644 data-connector-lib/test/dpk_connector/core/test_middlewares.py create mode 100644 data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py create mode 100644 data-connector-lib/test/dpk_connector/core/test_sitemap_spider/index.html create mode 100644 data-connector-lib/test/dpk_connector/core/test_utils.py diff --git 
a/.github/workflows/test-connector-lib.yml b/.github/workflows/test-connector-lib.yml new file mode 100644 index 000000000..26e3490d7 --- /dev/null +++ b/.github/workflows/test-connector-lib.yml @@ -0,0 +1,32 @@ +name: Test Data Connector lib + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - "data-connector-lib/**" + - "!data-connector-lib/**.md" + - ".make.*" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - "data-connector-lib/**" + - "!data-connector-lib/**.md" + - ".make.*" + +jobs: + test-dpk-connector: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Test dpk_connector + run: | + make -C data-connector-lib venv test diff --git a/.github/workflows/test-lib.yml b/.github/workflows/test-processing-lib.yml similarity index 98% rename from .github/workflows/test-lib.yml rename to .github/workflows/test-processing-lib.yml index 5a1cff872..5d76c13f7 100644 --- a/.github/workflows/test-lib.yml +++ b/.github/workflows/test-processing-lib.yml @@ -1,4 +1,4 @@ -name: Test DPK libs and (Optionally) Push base DPK images +name: Test Data Processing libs and (Optionally) Push base DPK images on: workflow_dispatch: diff --git a/data-connector-lib/Makefile b/data-connector-lib/Makefile new file mode 100644 index 000000000..c72aeb74d --- /dev/null +++ b/data-connector-lib/Makefile @@ -0,0 +1,49 @@ +# Use make help to see the available rules +REPOROOT=.. +include $(REPOROOT)/.make.defaults + +clean:: + @# Help: Clean up the distribution build and the venv + rm -rf dist venv + rm -rf src/*egg-info + +.check-env:: + @echo "Checks passed" + +setup:: + +set-versions: .check-env + $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml + +build:: build-dist + +#build:: update-toml .defaults.build-dist +build-dist :: .defaults.build-dist + +publish:: publish-dist + +publish-dist :: .check-env .defaults.publish-dist + +venv:: pyproject.toml + @# Help: Create the virtual environment using pyproject.toml + rm -r dist venv || true + rm -rf src/*egg-info || true + rm makeenv || true + $(PYTHON) -m venv venv + source venv/bin/activate; \ + pip install --upgrade pip; \ + pip install -e .; \ + pip install pytest pytest-mock pytest-datadir pytest-cov moto==5.0.5 markupsafe==2.0.1 + +image:: + @# Help: Placeholder does nothing for now. + @echo "Image building for ray is in the works (coming soon)." + +# Here we run each test directory of tests and each ray-launched test separately, because +# it seems that running multiple ray-launch tests in a single pytest run causes some sort of ray.init() duplication. +# pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped. +# TODO: the following fails. Why? source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) . +.PHONY: test +test:: venv + @# Help: Use the already-built virtual environment to run pytest on the test directory. + source venv/bin/activate; $(PYTEST); diff --git a/data-connector-lib/README.md b/data-connector-lib/README.md new file mode 100644 index 000000000..facacaf04 --- /dev/null +++ b/data-connector-lib/README.md @@ -0,0 +1,30 @@ +# DPK Connector + +DPK Connector is a scalable and compliant web crawler developed to acquire data for LLM development. It is built on [Scrapy](https://scrapy.org/). +For more details, read [the documentation](doc/overview.md). + +## Virtual Environment + +The project uses `pyproject.toml` and a Makefile for operations.
+To do development, first create the virtual environment +```shell +make venv +``` +and then either activate it +```shell +source venv/bin/activate +``` +or set up your IDE to use the venv directory when developing in this project. + +## Library Artifact Build and Publish + +To test, build and publish the library: +```shell +make test build publish +``` + +To bump the version number, edit the Makefile to change VERSION and rerun the above. This requires committing both the `Makefile` and the automatically updated `pyproject.toml` file. + +## How to use + +See [the overview](doc/overview.md). diff --git a/data-connector-lib/doc/overview.md b/data-connector-lib/doc/overview.md new file mode 100644 index 000000000..4e48001a2 --- /dev/null +++ b/data-connector-lib/doc/overview.md @@ -0,0 +1,47 @@ +# DPK Connector Overview + +The Data Prep Kit Connector (DPK Connector) is a Python library for scalable and compliant web crawling. + +Features: +- Robots.txt compliance: The Connector follows allow/disallow rules and extended directives such as `Crawl-delay` in each website's robots.txt. +- Sitemap support: The Connector automatically recognizes sitemap URLs in the input and also tries to discover them via robots.txt. +- User agent and header customization: You can use your own user agent string and request headers. +- Domain and path focus: You can limit the domains and paths accessed by the library. +- MIME type filters: You can restrict the MIME types that are downloaded. +- Parallel processing: Requests to websites are processed in parallel. + +## Example usage + +```python +from dpk_connector import crawl, shutdown + + +def main(): + """ + An example of running a crawl. + """ + + def on_downloaded(url: str, body: bytes, headers: dict) -> None: + """ + Callback function called when a page has been downloaded. + You have access to the request URL, response body and headers.
""" + print(f"url: {url}, headers: {headers}, body: {body[:64]}") + + user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0" + + # Start crawling + crawl( + ["https://crawler-test.com/"], + on_downloaded, + user_agent=user_agent, + depth_limit=0, + ) # blocking call + + # Shutdown all crawls + shutdown() + + +if __name__ == "__main__": + main() +``` diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml new file mode 100644 index 000000000..cc4e8571a --- /dev/null +++ b/data-connector-lib/pyproject.toml @@ -0,0 +1,61 @@ +[project] +name = "dpk_connector" +version = "0.2.2.dev0" +requires-python = ">=3.10" +keywords = [ + "data", + "data acquisition", + "crawler", + "web crawler", + "llm", + "generative", + "ai", + "fine-tuning", + "llmapps", +] +description = "Scalable and Compliant Web Crawler" +license = { text = "Apache-2.0" } +readme = { file = "README.md", content-type = "text/markdown" } +authors = [{ name = "Hiroya Matsubara", email = "hmtbr@jp.ibm.com" }] +dependencies = [ + "scrapy>=2.11.2", + "pydantic>=2.8.1", + "tldextract>=5.1.2", +] + +[project.urls] +Repository = "https://github.com/IBM/data-prep-kit" +Issues = "https://github.com/IBM/data-prep-kit/issues" +Documentation = "https://ibm.github.io/data-prep-kit/" + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", + "pytest-datadir>=1.5.0", + "moto==5.0.5", + "markupsafe==2.0.1", +] + +[options] +package_dir = ["src", "test"] + +[options.packages.find] +where = ["src/dpk_connector"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/data-connector-lib/src/dpk_connector/__init__.py b/data-connector-lib/src/dpk_connector/__init__.py new file mode 100644 index 000000000..14780b08c --- /dev/null +++ b/data-connector-lib/src/dpk_connector/__init__.py @@ -0,0 +1,13 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from dpk_connector.core.crawler import async_crawl, crawl, shutdown # noqa diff --git a/data-connector-lib/src/dpk_connector/core/__init__.py b/data-connector-lib/src/dpk_connector/core/__init__.py new file mode 100644 index 000000000..0e134987e --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/__init__.py @@ -0,0 +1,11 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ diff --git a/data-connector-lib/src/dpk_connector/core/crawler.py b/data-connector-lib/src/dpk_connector/core/crawler.py new file mode 100644 index 000000000..f024e63b1 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/crawler.py @@ -0,0 +1,216 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import threading +from typing import Any, Callable, Collection, Type, cast + +from scrapy import Spider +from scrapy.crawler import Crawler, CrawlerRunner +from scrapy.settings import Settings +from twisted.internet.defer import Deferred + +from dpk_connector.core.utils import validate_domain, validate_url + +_lock = threading.Lock() +_reactor_initialized = False +_reactor_started = False + + +def _run_reactor(): + from twisted.internet import reactor + + reactor.run(installSignalHandlers=False) + + +_reactor_thread: threading.Thread = threading.Thread( + target=_run_reactor, + daemon=True, +) + + +def _start_reactor(): + with _lock: + global _reactor_started + if not _reactor_started: + _reactor_thread.start() + _reactor_started = True + + +def _stop_reactor(): + from twisted.internet import reactor + + try: + reactor.stop() + except RuntimeError: + pass + + +class MultiThreadedCrawlerRunner(CrawlerRunner): + def _create_crawler(self, spidercls: str | type[Spider]) -> Crawler: + if isinstance(spidercls, str): + spidercls = self.spider_loader.load(spidercls) + with _lock: + global _reactor_initialized + init_reactor = not _reactor_initialized + crawler = Crawler( + cast(Type[Spider], spidercls), self.settings, init_reactor + ) + _reactor_initialized = True + return crawler + + +def async_crawl( + seed_urls: Collection[str], + on_downloaded: Callable[[str, bytes, dict[str, str]], None], + user_agent: str = "", + headers: dict[str, str] = {}, + allow_domains: Collection[str] = (), + path_focus: bool = False, + allow_mime_types: Collection[str] = ( + "application/pdf", + "text/html", + "text/markdown", + "text/plain", + ), + disallow_mime_types: Collection[str] = (), + depth_limit: int = -1, + download_limit: int = -1, +) -> Deferred[None]: + # Assisted by WCA@IBM + # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2 + """ + Do crawl asynchronously. + + Parameters: + seed_urls (Collection[str]): A collection of seed URLs to start the crawl from. + on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page. 
+ user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)". + headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary. + allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs. + path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted. + allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain". + disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection. + depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit. + download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit. + + Returns: + Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish. + """ + if not seed_urls: + raise ValueError(f"Empty seed URLs.") + for url in seed_urls: + if not validate_url(url): + raise ValueError(f"Seed URL {url} is not valid.") + for domain in allow_domains: + if not validate_domain(domain): + raise ValueError(f"Allow domain {domain} is not valid.") + if depth_limit < -1: + raise ValueError(f"Invalid depth limit {depth_limit}") + if download_limit < -1: + raise ValueError(f"Invalid download limit {download_limit}") + + settings = Settings() + settings.setmodule("dpk_connector.core.settings", priority="project") + + if user_agent: + settings.set("USER_AGENT", user_agent, priority="spider") + if headers: + settings.set("DEFAULT_REQUEST_HEADERS", headers) + if depth_limit == 0: + depth_limit = -1 + elif depth_limit == -1: + depth_limit = 0 + settings.set("DEPTH_LIMIT", depth_limit, priority="spider") + if download_limit == -1: + download_limit = 0 + settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider") + + runner = MultiThreadedCrawlerRunner(settings) + runner.crawl( + "dpk-connector-sitemap", + seed_urls=seed_urls, + callback=on_downloaded, + allow_domains=allow_domains, + path_focus=path_focus, + allow_mime_types=allow_mime_types, + disallow_mime_types=disallow_mime_types, + disable_sitemap_search=True, + ) + _start_reactor() + return runner.join() + + +def crawl( + seed_urls: Collection[str], + on_downloaded: Callable[[str, bytes, dict[str, str]], None], + user_agent: str = "", + headers: dict[str, str] = {}, + allow_domains: Collection[str] = (), + path_focus: bool = False, + allow_mime_types: Collection[str] = ( + "application/pdf", + "text/html", + "text/markdown", + "text/plain", + ), + disallow_mime_types: Collection[str] = (), + depth_limit: int = -1, + download_limit: int = -1, +) -> None: + # Assisted by WCA@IBM + # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2 + """ + Do crawl synchronously. + + Parameters: + seed_urls (Collection[str]): A collection of seed URLs to start the crawl from. + on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page. + user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)". + headers (dict[str, str]): A dictionary of additional headers to send with each request. 
Default is an empty dictionary. + allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs. + path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted. + allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain". + disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection. + depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit. + download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit. + + Returns: + None + """ + condition = threading.Condition() + + def on_completed(result: Any): + with condition: + condition.notify() + + d = async_crawl( + seed_urls, + on_downloaded, + user_agent, + headers, + allow_domains, + path_focus, + allow_mime_types, + disallow_mime_types, + depth_limit, + download_limit, + ) + d.addBoth(on_completed) + with condition: + condition.wait() + + +def shutdown(): + """ + Shutdown all crawls. + """ + _stop_reactor() diff --git a/data-connector-lib/src/dpk_connector/core/item.py b/data-connector-lib/src/dpk_connector/core/item.py new file mode 100644 index 000000000..5fcf68b49 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/item.py @@ -0,0 +1,21 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from dataclasses import dataclass + + +@dataclass +class ConnectorItem: + dropped: bool = False + downloaded: bool = False + system_request: bool = False + sitemap: bool = False diff --git a/data-connector-lib/src/dpk_connector/core/logging.py b/data-connector-lib/src/dpk_connector/core/logging.py new file mode 100644 index 000000000..523a1a7c3 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/logging.py @@ -0,0 +1,22 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from scrapy.logformatter import LogFormatter as ScrapyLogFormatter + + +class QuietLogFormatter(ScrapyLogFormatter): + def scraped(self, item, response, spider): + return ( + super().scraped(item, response, spider) + if spider.settings.getbool("LOG_SCRAPED_ITEMS") + else None + ) diff --git a/data-connector-lib/src/dpk_connector/core/middlewares.py b/data-connector-lib/src/dpk_connector/core/middlewares.py new file mode 100644 index 000000000..7d0738b79 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/middlewares.py @@ -0,0 +1,263 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import logging +from typing import Any, Generator, Iterable + +from scrapy import Spider, signals +from scrapy.crawler import Crawler +from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware +from scrapy.downloadermiddlewares.stats import DownloaderStats +from scrapy.exceptions import NotConfigured +from scrapy.http import Request, Response +from scrapy.http.request import NO_CALLBACK +from scrapy.robotstxt import ProtegoRobotParser, RobotParser +from scrapy.statscollectors import StatsCollector +from scrapy.utils.httpobj import urlparse_cached +from scrapy.utils.python import to_unicode +from twisted.internet.defer import Deferred + +from dpk_connector.core.item import ConnectorItem +from dpk_connector.core.utils import get_content_type, get_etld1, get_mime_type, get_netloc + +logger = logging.getLogger(__name__) + + +class DelayingProtegoRobotParser(ProtegoRobotParser): + """ + Robots.txt parser supporting crawl-delay/request-rate. + """ + + def __init__(self, robotstxt_body: str | bytes, spider: Spider): + super().__init__(robotstxt_body, spider) + self.max_delay = spider.crawler.settings.getfloat("ROBOTS_MAX_CRAWL_DELAY", 60) + + def delay(self, user_agent: str | bytes) -> float | None: + user_agent = to_unicode(user_agent) + crawl_delay = self.rp.crawl_delay(user_agent) + request_rate = self.rp.request_rate(user_agent) + if crawl_delay is None and request_rate is None: + return None + crawl_delay = crawl_delay or 0 + request_rate = ( + request_rate.seconds / request_rate.requests if request_rate else 0 + ) + delay = min(max(crawl_delay, request_rate), self.max_delay) + return delay + + +class DelayingRobotsTxtMiddleware(RobotsTxtMiddleware): + """ + Downloader middleware to follow crawl-delay/request-rate directives of robots.txt. 
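+    It extends Scrapy's RobotsTxtMiddleware and, when robots.txt specifies a delay for a domain, raises that domain's download-slot delay accordingly.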
+ """ + + def __init__(self, crawler: Crawler, download_timeout: float): + super().__init__(crawler) + self.download_timeout = download_timeout + self._delays: dict[str, float] = {} + crawler.signals.connect( + self._request_reached_downloader, signal=signals.request_reached_downloader + ) + + @classmethod + def from_crawler(cls, crawler: Crawler): + download_timeout = crawler.settings.getfloat("ROBOTSTXT_DOWNLOAD_TIMEOUT") + if not download_timeout: + download_timeout = crawler.settings.getfloat("DOWNLOAD_TIMEOUT") + return cls(crawler, download_timeout) + + def _request_reached_downloader(self, request: Request, spider: Spider) -> None: + key = request.meta.get("download_slot") + if slot := self.crawler.engine.downloader.slots.get(key): + parts = urlparse_cached(request) + domain = parts.netloc + if domain in self._delays: + delay = self._delays[domain] + if delay and slot.delay < delay: + slot.delay = delay + slot.randomize_delay = False + + def process_request_2( + self, rp: RobotParser, request: Request, spider: Spider + ) -> None: + super().process_request_2(rp, request, spider) + if isinstance(rp, DelayingProtegoRobotParser): + parts = urlparse_cached(request) + domain = parts.netloc + if domain not in self._delays: + user_agent = self._robotstxt_useragent + if not user_agent: + user_agent = request.headers.get( + b"User-Agent", self._default_useragent + ) + delay = rp.delay(user_agent) or 0.0 + self._delays[domain] = delay + if delay: + logger.info( + f"Set download delay to {delay} according to robots.txt. domain: {domain}" + ) + + def robot_parser(self, request: Request, spider: Spider): + url = urlparse_cached(request) + netloc = url.netloc + + if netloc not in self._parsers: + self._parsers[netloc] = Deferred() + robotsurl = f"{url.scheme}://{url.netloc}/robots.txt" + robotsreq = Request( + robotsurl, + priority=self.DOWNLOAD_PRIORITY, + meta={ + "dont_obey_robotstxt": True, + "system_request": True, + "download_timeout": self.download_timeout, + }, + callback=NO_CALLBACK, + ) + dfd = self.crawler.engine.download(robotsreq) + dfd.addCallback(self._parse_robots, netloc, spider) + dfd.addErrback(self._logerror, robotsreq, spider) + dfd.addErrback(self._robots_error, netloc) + self.crawler.stats.inc_value("robotstxt/request_count") + + if isinstance(self._parsers[netloc], Deferred): + d = Deferred() + + def cb(result): + d.callback(result) + return result + + self._parsers[netloc].addCallback(cb) + return d + return self._parsers[netloc] + + +def _update_request_stats( + stats: StatsCollector, + request: Request, + spider: Spider, + prefix: str, + skip_domains: bool = False, +): + # request count + stats.inc_value(prefix, spider=spider) + # proxy distribution + proxy = request.meta.get("proxy", "None") + stats.inc_value(f"{prefix}/proxy/{proxy}", spider=spider) + if not skip_domains: + # domain distribution + domain = get_etld1(to_unicode(request.url)) + stats.inc_value(f"{prefix}/domain/{domain}", spider=spider) + # subdomain distribution + sub_domain = get_netloc(request) + stats.inc_value(f"{prefix}/subdomain/{sub_domain}", spider=spider) + + +def _update_stats( + stats: StatsCollector, + request: Request, + response: Response, + spider: Spider, + prefix: str, + skip_domains: bool = False, +): + _update_request_stats(stats, request, spider, prefix, skip_domains) + # mime type distribution + content_type = get_content_type(response) + if not content_type: + stats.inc_value(f"{prefix}/mime_type/None", spider=spider) + else: + mime_type = get_mime_type(content_type) + 
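# get_mime_type() strips any parameters such as charset, so the stats are keyed by the bare MIME type. +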
stats.inc_value(f"{prefix}/mime_type/{mime_type}", spider=spider) + # status code distribution + stats.inc_value(f"{prefix}/status_code/{response.status}", spider=spider) + + +def _update_sitemap_stats(stats: StatsCollector, spider: Spider, prefix: str): + # sitemap + stats.inc_value(f"{prefix}/sitemap", spider=spider) + + +class ConnectorRequestedStats(DownloaderStats): + """ + Downloader middleware to expose additional stats. + """ + + def __init__(self, stats: StatsCollector, skip_domains: bool): + super().__init__(stats) + self.skip_domains = skip_domains + + @classmethod + def from_crawler(cls, crawler: Crawler): + if not crawler.settings.getbool("DOWNLOADER_STATS"): + raise NotConfigured + skip_domains = crawler.settings.getbool("STATS_SKIP_DOMAINS") + return cls(crawler.stats, skip_domains) + + def process_request(self, request: Request, spider: Spider): + super().process_request(request, spider) + prefix = "dpk_connector/requested" + if not request.meta.get("system_request", False): + _update_request_stats( + self.stats, request, spider, prefix, self.skip_domains + ) + if request.meta.get("sitemap", False): + _update_sitemap_stats(self.stats, spider, prefix) + + def process_response(self, request: Request, response: Response, spider: Spider): + ret = super().process_response(request, response, spider) + prefix = "dpk_connector/accessed" + if not request.meta.get("system_request", False): + _update_stats( + self.stats, request, response, spider, prefix, self.skip_domains + ) + if request.meta.get("sitemap", False): + _update_sitemap_stats(self.stats, spider, prefix) + return ret + + +class ConnectorDownloadedStats: + """ + Spider middleware to expose additional stats. + """ + + def __init__(self, stats: StatsCollector, skip_domains: bool): + self.stats = stats + self.skip_domains = skip_domains + + @classmethod + def from_crawler(cls, crawler: Crawler): + if not crawler.settings.getbool("DOWNLOADER_STATS"): + raise NotConfigured + skip_domains = crawler.settings.getbool("STATS_SKIP_DOMAINS") + return cls(crawler.stats, skip_domains) + + def process_spider_output( + self, + response: Response, + result: Iterable[Request | ConnectorItem], + spider: Spider, + ) -> Generator[Any, Any, None]: + for r in result: + if isinstance(r, ConnectorItem): + if (not r.system_request) and r.downloaded: + _update_stats( + self.stats, + response.request, + response, + spider, + "dpk_connector/downloaded", + self.skip_domains, + ) + if r.sitemap: + _update_sitemap_stats(self.stats, spider, "dpk_connector/downloaded") + yield r diff --git a/data-connector-lib/src/dpk_connector/core/pipelines.py b/data-connector-lib/src/dpk_connector/core/pipelines.py new file mode 100644 index 000000000..b9c35d896 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/pipelines.py @@ -0,0 +1,29 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from typing import Any +from scrapy import Spider +from scrapy.crawler import Crawler +from scrapy.exceptions import DropItem + +from dpk_connector.core.item import ConnectorItem + + +class DropPipeline: + @classmethod + def from_crawler(cls, crawler: Crawler): + return cls() + + def process_item(self, item: ConnectorItem, spider: Spider) -> Any: + if item.system_request or (not item.downloaded): + raise DropItem + return item diff --git a/data-connector-lib/src/dpk_connector/core/settings.py b/data-connector-lib/src/dpk_connector/core/settings.py new file mode 100644 index 000000000..041ada253 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/settings.py @@ -0,0 +1,70 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +BOT_NAME = "dpk-connector" + +SPIDER_MODULES = ["dpk_connector.core.spiders"] + +# Robots +ROBOTSTXT_OBEY = True +ROBOTS_MAX_CRAWL_DELAY = 60 +ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser" + +# Downloader parameters +CONCURRENT_REQUESTS = 20 +CONCURRENT_REQUESTS_PER_DOMAIN = 10 +DOWNLOAD_DELAY = 0 +RANDOMIZE_DOWNLOAD_DELAY = True +DOWNLOAD_TIMEOUT = 180 + +# Autothrottle +AUTOTHROTTLE_ENABLED = True +AUTOTHROTTLE_START_DELAY = 0 +AUTOTHROTTLE_MAX_DELAY = 300 +AUTOTHROTTLE_TARGET_CONCURRENCY = 10 +AUTOTHROTTLE_DEBUG = False + +# Middlewares/pipelines/extensions +SPIDER_MIDDLEWARES = { + "dpk_connector.core.middlewares.ConnectorDownloadedStats": 10, +} +DOWNLOADER_MIDDLEWARES = { + "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": None, + "dpk_connector.core.middlewares.DelayingRobotsTxtMiddleware": 100, + "scrapy.downloadermiddlewares.stats.DownloaderStats": None, + "dpk_connector.core.middlewares.ConnectorRequestedStats": 850, +} +ITEM_PIPELINES = { + "dpk_connector.core.pipelines.DropPipeline": 100, +} +EXTENSIONS = { + "scrapy.extensions.telnet.TelnetConsole": None, + "scrapy.extensions.memdebug.MemoryDebugger": None, +} + +# Queue +SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.LifoMemoryQueue" + +# Logging +LOG_LEVEL = "INFO" +LOG_SCRAPED_ITEMS = False +LOG_FORMATTER = "dpk_connector.core.logging.QuietLogFormatter" + +# Periodic logging +PERIODIC_LOG_DELTA = True +PERIODIC_LOG_STATS = True +PERIODIC_LOG_TIMING_ENABLED = True +LOGSTATS_INTERVAL = 300 + +# Misc +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" diff --git a/data-connector-lib/src/dpk_connector/core/spiders/__init__.py b/data-connector-lib/src/dpk_connector/core/spiders/__init__.py new file mode 100644 index 000000000..0e134987e --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/spiders/__init__.py @@ -0,0 +1,11 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ diff --git a/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py b/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py new file mode 100644 index 000000000..f24d4088b --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py @@ -0,0 +1,334 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import logging +from typing import Any, Callable, Collection, Generator +from urllib.parse import ParseResult + +from scrapy import Request +from scrapy.http import Response +from scrapy.link import Link +from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import SitemapSpider +from scrapy.spiders.sitemap import iterloc +from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots + +from dpk_connector.core.item import ConnectorItem +from dpk_connector.core.utils import ( + get_base_url, + get_content_type, + get_etld1, + get_focus_path, + is_allowed_path, + urlparse_cached, +) + + +class BaseSitemapSpider(SitemapSpider): + SITEMAP_DOWNLOAD_PRIORITY = 10 + + name = "base-sitemap" + + def __init__( + self, + seed_urls: Collection[str], + allow_domains: Collection[str] = (), + path_focus: bool = False, + allow_mime_types: Collection[str] = (), + disallow_mime_types: Collection[str] = (), + depth_limit: int = 0, + disable_sitemap_search: bool = False, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.depth_limit = depth_limit + self.sitemap_search = (not disable_sitemap_search) and depth_limit >= 0 + + # Build sitemap url candidates + self.input_seed_urls = seed_urls + sitemap_urls = [] + sitemaps_seen = [] + for seed_url in seed_urls: + parts = urlparse_cached(seed_url) + if seed_url.endswith( + ( + ".xml", + ".xml.gz", + "/robots.txt", + "robots.txt/", + "/sitemap", + "/sitemap/", + ) + ): + sitemap_urls.append(seed_url) + elif self.sitemap_search: + sitemap_urls.extend(self._get_sitemap_urls(parts)) + sitemaps_seen.append(parts.netloc) + self.seed_urls = set(seed_urls) - set(sitemap_urls) + self.sitemap_urls = set(sitemap_urls) + self.sitemaps_seen = set(sitemaps_seen) + + # Extract focus paths + self.focus_paths: set[str] = set() + if path_focus: + for seed_url in self.seed_urls: + path = get_focus_path(seed_url) + if path is not None: + self.focus_paths.add(path) + + # Domains and mime types filtering + self.allowed_domains = set( + allow_domains + if len(allow_domains) > 0 + else [get_etld1(url) for url in seed_urls] + ) + 
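# MIME type filters are lower-cased here so the later content-type checks are case-insensitive. +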
self.allow_mime_types = set( + [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else () + ) + self.disallow_mime_types = set( + [m.lower() for m in disallow_mime_types] + if len(disallow_mime_types) > 0 + else () + ) + + # Link extraction from html + self.link_extractor = LinkExtractor( + allow_domains=self.allowed_domains, + unique=True, + deny_extensions=(), + tags=("a", "area", "link"), + ) + + self.log( + f"Seed URLs: {self.seed_urls}, sitemap URLs: {self.sitemap_urls}, allow domains: {self.allowed_domains}, focus paths: {self.focus_paths}, allow mime types: {self.allow_mime_types}, disallow mime types: {self.disallow_mime_types}, depth limit: {self.depth_limit}, sitemap search: {self.sitemap_search}", + logging.INFO, + ) + + def _get_sitemap_urls(self, parts: ParseResult) -> list[str]: + base_url = get_base_url(parts) + sitemap_variations = ( + "robots.txt", + "robots.txt/", + "sitemap.xml", + "sitemap_index.xml", + "sitemapindex.xml", + "sitemap", + "sitemap-index.xml", + "sitemap/index.xml", + "sitemap/sitemap.xml", + "sitemap1.xml", + ) + return [f"{base_url}/{sitemap}" for sitemap in sitemap_variations] + + def start_requests(self): + for url in self.sitemap_urls: + yield Request( + url, + self._parse_sitemap, + priority=self.SITEMAP_DOWNLOAD_PRIORITY, + meta={ + "seed_url": url, + "previous_url": "", + "system_request": True, + "sitemap": True, + }, + ) + for url in self.seed_urls: + yield Request( + url, + self.parse, + meta={ + "seed_url": url, + "previous_url": "", + }, + ) + + def _parse_sitemap(self, response: Response): + yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True) + + seed_url = response.meta["seed_url"] + + if response.url.endswith("/robots.txt") or response.url.endswith( + "/robots.txt/" + ): + for url in sitemap_urls_from_robots(response.text, base_url=response.url): + yield Request( + url, + callback=self._parse_sitemap, + priority=self.SITEMAP_DOWNLOAD_PRIORITY, + meta={ + "seed_url": seed_url, + "previous_url": response.url, + "system_request": True, + "sitemap": True, + }, + ) + else: + body = self._get_sitemap_body(response) + if not body: + self.log( + f"Ignoring invalid sitemap: {response}", + logging.WARN, + extra={"spider": self}, + ) + return + + s = Sitemap(body) + it = self.sitemap_filter(s) + + if s.type == "sitemapindex": + for loc in iterloc(it, self.sitemap_alternate_links): + if any( + x.search(loc) for x in self._follow + ) and self._is_allowed_path(loc): + yield Request( + loc, + callback=self._parse_sitemap, + priority=self.SITEMAP_DOWNLOAD_PRIORITY, + meta={ + "seed_url": seed_url, + "previous_url": response.url, + "system_request": True, + "sitemap": True, + }, + ) + elif s.type == "urlset": + for loc in iterloc(it, self.sitemap_alternate_links): + for r, c in self._cbs: + if r.search(loc) and self._is_allowed_path(loc): + yield Request( + loc, + callback=c, + meta={ + "seed_url": seed_url, + "previous_url": response.url, + }, + ) + break + + def _is_allowed_path(self, input: str | Request | Response) -> bool: + return is_allowed_path(input, self.focus_paths) + + def _is_allowed_content_type(self, content_type: str) -> bool: + return any([mtype in content_type for mtype in self.allow_mime_types]) + + def _is_disallowed_content_type(self, content_type: str) -> bool: + return any([mtype in content_type for mtype in self.disallow_mime_types]) + + def _should_download(self, content_type: str | None) -> bool: + if (not self.allow_mime_types) and (not self.disallow_mime_types): + return True + 
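# Filters are configured but the response carries no Content-Type header; it cannot be classified, so it is not downloaded. +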
if not content_type: + return False + ctype = content_type.lower() + if not self.allow_mime_types: + return not self._is_disallowed_content_type(ctype) + if not self.disallow_mime_types: + return self._is_allowed_content_type(ctype) + return ( + not self._is_disallowed_content_type(ctype) + ) and self._is_allowed_content_type(ctype) + + def _explore_sitemap(self, response: Response) -> Generator[Request, Any, None]: + depth = response.meta.get("depth", 0) + depth_limit = self.depth_limit + if (depth_limit == 0 or depth < depth_limit) and self.sitemap_search: + parts = urlparse_cached(response) + domain = parts.netloc + if domain not in self.sitemaps_seen: + self.log( + f"New domain {domain} found. Search for sitemap.", logging.INFO + ) + self.sitemaps_seen.add(domain) + for sitemap in self._get_sitemap_urls(parts): + yield Request( + sitemap, + callback=self._parse_sitemap, + priority=self.SITEMAP_DOWNLOAD_PRIORITY, + meta={ + "seed_url": response.meta["seed_url"], + "previous_url": response.url, + "system_request": True, + "sitemap": True, + }, + ) + + def _explore_links( + self, response: Response, links: list[Link] + ) -> Generator[Request, Any, None]: + depth = response.meta.get("depth", 0) + depth_limit = self.depth_limit + if depth_limit == 0 or depth < depth_limit: + for link in links: + if self._is_allowed_path(link.url): + yield Request( + link.url, + callback=self.parse, + meta={ + "seed_url": response.meta["seed_url"], + "previous_url": response.url, + }, + ) + + +class ConnectorSitemapSpider(BaseSitemapSpider): + name = "dpk-connector-sitemap" + + def __init__( + self, + callback: Callable[[str, bytes, dict[str, str]], None], + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.callback = callback + + def parse( + self, response: Response, **kwargs: Any + ) -> Generator[Request | ConnectorItem, Any, None]: + drop = False + content_type = get_content_type(response) + if not content_type: + drop = True + is_html = "text/html" in content_type.lower() + should_download = self._should_download(content_type) + if not (is_html or should_download): + drop = True + if drop: + yield ConnectorItem( + dropped=True, downloaded=False, system_request=False, sitemap=False + ) + return + + # Download contents + if should_download: + self.callback( + str(response.url), response.body, response.headers.to_unicode_dict() + ) + # to count up downloaded pages and collect stats + yield ConnectorItem( + dropped=False, downloaded=True, system_request=False, sitemap=False + ) + else: + yield ConnectorItem( + dropped=False, downloaded=False, system_request=False, sitemap=False + ) + + # Search for sitemap + yield from self._explore_sitemap(response) + + # Extract links and dispatch them + links = self.link_extractor.extract_links(response) if is_html else [] + yield from self._explore_links(response, links) diff --git a/data-connector-lib/src/dpk_connector/core/utils.py b/data-connector-lib/src/dpk_connector/core/utils.py new file mode 100644 index 000000000..d2dfa760d --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/utils.py @@ -0,0 +1,92 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import re +from urllib.parse import ParseResult, urlparse + +import tldextract +from scrapy.http import Request, Response +from scrapy.http.headers import Headers +from scrapy.utils.httpobj import urlparse_cached as _urlparse_cached + + +def _get_header_value(headers: Headers, key: str) -> str | None: + value = headers.get(key) + return value.decode("utf-8") if value else None + + +def get_header_value(response: Response, key: str) -> str | None: + return _get_header_value(response.headers, key) + + +def get_content_type(response: Response) -> str | None: + return get_header_value(response, "Content-Type") + + +def get_mime_type(content_type: str) -> str: + return content_type.split(";")[0].strip() + + +def urlparse_cached(input: str | Request | Response) -> ParseResult: + return urlparse(input) if isinstance(input, str) else _urlparse_cached(input) + + +def get_netloc(input: str | Request | Response) -> str: + return urlparse_cached(input).netloc + + +def get_base_url(input: str | Request | Response | ParseResult) -> str: + if isinstance(input, ParseResult): + parts = input + else: + parts = urlparse_cached(input) + return f"{parts.scheme}://{parts.netloc}" + + +def get_etld1(url: str) -> str: + ext = tldextract.extract(url) + return f"{ext.domain}.{ext.suffix}" + + +def get_focus_path(url: str) -> str | None: + parts = urlparse_cached(url) + if len(parts.path.split("/")) > 2: + return "/".join(parts.path.split("/")[:-1]) + "/" + return None + + +def _check_path(url_path: str, focus_element: str): + if focus_element.startswith("/"): + return url_path.startswith(focus_element) + else: + return focus_element in url_path + + +def is_allowed_path(input: str | Request | Response, focus_paths: set[str]) -> bool: + if not focus_paths: + return True + url_path = urlparse_cached(input).path + return any(_check_path(url_path.lower(), p.lower()) for p in focus_paths) + + +def validate_url(url: str) -> bool: + result = urlparse(url) + if result.scheme not in ("http", "https"): + return False + if not result.netloc: + return False + return True + + +def validate_domain(domain: str) -> bool: + pattern = r"^([a-zA-Z0-9][a-zA-Z0-9\-]{1,61}[a-zA-Z0-9]\.)+[a-zA-Z]{2,}$" + return bool(re.match(pattern, domain)) diff --git a/data-connector-lib/test/dpk_connector/core/__init__.py b/data-connector-lib/test/dpk_connector/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/data-connector-lib/test/dpk_connector/core/test_crawler.py b/data-connector-lib/test/dpk_connector/core/test_crawler.py new file mode 100644 index 000000000..88d90293a --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_crawler.py @@ -0,0 +1,28 @@ +import pytest + +from dpk_connector.core.crawler import crawl + + +def test_invalid_crawler(): + def on_downloaded(url: str, body: bytes, headers: dict[str, str]): + pass + + with pytest.raises(ValueError) as e: + crawl([], on_downloaded) + assert isinstance(e.value, ValueError) is True + + with pytest.raises(ValueError) as e: + crawl(["invalidseedurl"], on_downloaded) + 
assert isinstance(e.value, ValueError) is True + + with pytest.raises(ValueError) as e: + crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",)) + assert isinstance(e.value, ValueError) is True + + with pytest.raises(ValueError) as e: + crawl(["http://example.com"], on_downloaded, depth_limit=-10) + assert isinstance(e.value, ValueError) is True + + with pytest.raises(ValueError) as e: + crawl(["http://example.com"], on_downloaded, download_limit=-10) + assert isinstance(e.value, ValueError) is True diff --git a/data-connector-lib/test/dpk_connector/core/test_middlewares.py b/data-connector-lib/test/dpk_connector/core/test_middlewares.py new file mode 100644 index 000000000..a2346888f --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_middlewares.py @@ -0,0 +1,59 @@ +import pytest +from dpk_connector.core.middlewares import DelayingProtegoRobotParser +from pytest_mock import MockerFixture + + +@pytest.mark.parametrize( + "ua,expected", + [ + ("Mozilla", 7.7), + ("MSIE", 10), + ("Macintosh", 60), + ("ua1", None), + ("ua2", None), + ], +) +def test_robots_crawl_delay(mocker: MockerFixture, ua: str, expected: float): + robots_txt = """ +User-agent: * +Crawl-delay: 7.7 +User-agent: MSIE +Crawl-delay: 10 +User-agent: Macintosh +Crawl-delay: 10000 +User-agent: ua1 +Crawl-delay: not numeric value +User-agent: ua2 +""" + crawler = mocker.patch("dpk_connector.core.middlewares.Crawler") + parser = DelayingProtegoRobotParser(robots_txt.encode("utf-8"), crawler.spider) + parser.max_delay = 60 + assert parser.delay(ua) == expected + + +@pytest.mark.parametrize( + "ua,expected", + [ + ("Mozilla", 10), + ("MSIE", 60), + ("Macintosh", 60), + ("ua1", None), + ("ua2", None), + ], +) +def test_robots_request_rate(mocker: MockerFixture, ua: str, expected: float): + robots_txt = """ +User-agent: * +Request-rate: 1/10s +User-agent: MSIE +Request-rate: 1/1m 0900-1730 +User-agent: Macintosh +Request-rate: 2/1h 0000-1736 +User-agent: ua1 +Request-rate: invalid value +User-agent: ua2 +""" + crawler = mocker.patch("dpk_connector.core.middlewares.Crawler") + parser = DelayingProtegoRobotParser(robots_txt.encode("utf-8"), crawler.spider) + parser.max_delay = 60 + assert parser.delay(ua) == expected diff --git a/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py new file mode 100644 index 000000000..826963f2f --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py @@ -0,0 +1,97 @@ +from pathlib import Path + +import pytest +from scrapy import Request +from scrapy.crawler import Crawler +from scrapy.http import HtmlResponse + +from dpk_connector.core.item import ConnectorItem +from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider + + +@pytest.fixture +def crawler() -> Crawler: + crawler = Crawler( + ConnectorSitemapSpider, + settings={ + "STATS_CLASS": "scrapy.statscollectors.MemoryStatsCollector", + "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", + }, + ) + crawler._apply_settings() + return crawler + + +def test_init_path_focus(): + spider = BaseSitemapSpider( + seed_urls=( + "https://openshiftjsonschema.dev/v4.9.18-standalone-strict/", + "https://openshiftjsonschema.dev/", + ), + path_focus=True, + ) + assert spider.seed_urls == { + "https://openshiftjsonschema.dev/v4.9.18-standalone-strict/", + "https://openshiftjsonschema.dev/", + } + assert spider.sitemap_urls == { + "https://openshiftjsonschema.dev/robots.txt", + 
"https://openshiftjsonschema.dev/robots.txt/", + "https://openshiftjsonschema.dev/sitemap.xml", + "https://openshiftjsonschema.dev/sitemap_index.xml", + "https://openshiftjsonschema.dev/sitemapindex.xml", + "https://openshiftjsonschema.dev/sitemap", + "https://openshiftjsonschema.dev/sitemap-index.xml", + "https://openshiftjsonschema.dev/sitemap/index.xml", + "https://openshiftjsonschema.dev/sitemap/sitemap.xml", + "https://openshiftjsonschema.dev/sitemap1.xml", + } + assert spider.allowed_domains == {"openshiftjsonschema.dev"} + assert spider.sitemaps_seen == {"openshiftjsonschema.dev"} + assert spider.focus_paths == {"/v4.9.18-standalone-strict/"} + + +def test_parse(datadir: Path, crawler: Crawler): + response_body = (datadir / "index.html").read_text() + + def callback(url: str, body: bytes, headers: dict): + assert url == "http://example.com/index.html" + assert body.decode("utf-8") == response_body + assert headers == {"Content-Type": "text/html"} + + spider = ConnectorSitemapSpider.from_crawler( + crawler, seed_urls=("http://example.com",), callback=callback + ) + request = Request( + "http://example.com/index.html", + meta={ + "seed_url": "http://example.com", + "depth": 1, + }, + ) + response = HtmlResponse( + "http://example.com/index.html", + headers={"Content-Type": "text/html"}, + body=response_body, + request=request, + encoding="utf-8", + ) + parsed = spider.parse(response) + + item = next(parsed) + assert item == ConnectorItem( + dropped=False, downloaded=True, system_request=False, sitemap=False + ) + + for next_request in parsed: + assert isinstance(next_request, Request) is True + assert next_request.url in ( + "http://example.com/blog/", + "http://example.com/contents/", + "http://example.com/css/app.css", + "http://example.com/favicon.ico?r=1.6", + "http://example.com/", + ) + assert next_request.callback == spider.parse + assert next_request.meta["seed_url"] == "http://example.com" + assert next_request.meta["previous_url"] == "http://example.com/index.html" diff --git a/data-connector-lib/test/dpk_connector/core/test_sitemap_spider/index.html b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider/index.html new file mode 100644 index 000000000..f77ae5828 --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider/index.html @@ -0,0 +1,39 @@ + + + + + Max External Links + + + + + + + + + + + + + +
+ [index.html test fixture — markup stripped in this view: a page titled "Max External Links" containing three external links ("link1 external", "link2 external", "link3 external") and internal links to /blog/, /contents/, css/app.css, and favicon.ico?r=1.6]
+ + + + diff --git a/data-connector-lib/test/dpk_connector/core/test_utils.py b/data-connector-lib/test/dpk_connector/core/test_utils.py new file mode 100644 index 000000000..096a4e194 --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_utils.py @@ -0,0 +1,152 @@ +# Assisted by WCA@IBM +# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2 + +import pytest +from dpk_connector.core.utils import ( + get_base_url, + get_content_type, + get_etld1, + get_focus_path, + get_header_value, + get_mime_type, + is_allowed_path, + urlparse_cached, + validate_domain, + validate_url, +) +from pytest_mock import MockerFixture +from scrapy.http import Request, Response + + +def test_get_header_value(): + response = Response( + "http://example.com", headers={"Content-Type": "application/json"} + ) + assert get_header_value(response, "Content-Type") == "application/json" + + +def test_get_header_value_not_found(): + response = Response("http://example.com") + assert get_header_value(response, "Content-Type") is None + + +def test_get_content_type(): + response = Response("http://example.com", headers={"Content-Type": "text/html"}) + assert get_content_type(response) == "text/html" + + +def test_get_content_type_not_found(): + response = Response("http://example.com") + assert get_content_type(response) is None + + +def test_urlparse_cached_with_str(mocker: MockerFixture): + import dpk_connector.core.utils + + spy = mocker.spy(dpk_connector.core.utils, "urlparse") + urlparse_cached("http://example.com") + spy.assert_called_once_with("http://example.com") + + +def test_urlparse_cached_with_request(mocker: MockerFixture): + import dpk_connector.core.utils + + spy = mocker.spy(dpk_connector.core.utils, "_urlparse_cached") + request = Request("http://example.com") + urlparse_cached(request) + spy.assert_called_once_with(request) + + +def test_urlparse_cached_with_response(mocker: MockerFixture): + import dpk_connector.core.utils + + spy = mocker.spy(dpk_connector.core.utils, "_urlparse_cached") + response = Response("http://example.com") + urlparse_cached(response) + spy.assert_called_once_with(response) + + +def test_get_base_url(): + url = "http://localhost:8000/test?param=value" + assert get_base_url(url) == "http://localhost:8000" + + +@pytest.mark.parametrize( + "url,expected", + [ + ("http://www.example.com", "example.com"), + ("https://www.example.co.uk", "example.co.uk"), + ("http://www.example.com/path?query=string#fragment", "example.com"), + ], +) +def test_get_etld1(url: str, expected: str): + assert get_etld1(url) == expected + + +@pytest.mark.parametrize( + "url,expected", + [ + ("https://www.example.com", None), + ("https://www.example.com/page", None), + ("https://www.example.com/page/", "/page/"), + ("https://www.example.com/page/subpage", "/page/"), + ("https://www.example.com/page/subpage/", "/page/subpage/"), + ], +) +def test_get_focus_path(url: str, expected: str | None): + assert get_focus_path(url) == expected + + +@pytest.mark.parametrize( + "url,paths,expected", + [ + ("http://localhost/", set(), True), + ("http://localhost/test/", {"/test/"}, True), + ("http://localhost/test/", {"/test2/"}, False), + ("http://localhost/test", {"/test/"}, False), + ("http://localhost/test/", {"/test/", "/test2/"}, True), + ("http://localhost/test3/", {"/test/", "/test2/"}, False), + ], +) +def test_is_allowed_path(url: str, paths: set[str], expected: bool): + assert is_allowed_path(url, paths) is expected + + +@pytest.mark.parametrize( + "url,expected", + [ + 
("http://www.cwi.nl:80/%7Eguido/Python.html", True), + ("/data/Python.html", False), + ("532", False), + ("dkakasdkjdjakdjadjfalskdjfalk", False), + ("https://stackoverflow.com", True), + ], +) +def test_validate_url(url: str, expected: bool): + assert validate_url(url) is expected + + +@pytest.mark.parametrize( + "content_type,expected", + [ + ("text/html", "text/html"), + ("application/json; charset=utf-8", "application/json"), + ], +) +def test_get_mime_type(content_type: str, expected: str): + assert get_mime_type(content_type) == expected + + +@pytest.mark.parametrize( + "domain,expected", + [ + ("example.com", True), + ("sub-domain.example.com", True), + ("a" * 254 + ".com", False), + ("", False), + ("example", False), + ("sub_domain.example.com", False), + ], +) +def test_validate_domain(domain: str, expected: bool): + assert validate_domain(domain) is expected