From 45bb6e3d00d87045782a5d09a9c00e18073759c4 Mon Sep 17 00:00:00 2001 From: Hiroya Matsubara Date: Sat, 5 Oct 2024 08:27:57 +0900 Subject: [PATCH] add dpk_connector to dpk (#637) * add bluecrawl connector to dpk Signed-off-by: Hiroya Matsubara * update test data Signed-off-by: Hiroya Matsubara * rename Signed-off-by: Hiroya Matsubara * correct build Signed-off-by: Hiroya Matsubara * remove unnecessary file Signed-off-by: Hiroya Matsubara * add documentation Signed-off-by: Hiroya Matsubara * rename folder Signed-off-by: Hiroya Matsubara * renamed library workflow files Signed-off-by: David Wood --------- Signed-off-by: Hiroya Matsubara Signed-off-by: David Wood Co-authored-by: David Wood --- .github/workflows/test-connector-lib.yml | 32 ++ .../{test-lib.yml => test-processing-lib.yml} | 2 +- data-connector-lib/Makefile | 49 +++ data-connector-lib/README.md | 30 ++ data-connector-lib/doc/overview.md | 47 +++ data-connector-lib/pyproject.toml | 61 ++++ .../src/dpk_connector/__init__.py | 13 + .../src/dpk_connector/core/__init__.py | 11 + .../src/dpk_connector/core/crawler.py | 216 +++++++++++ .../src/dpk_connector/core/item.py | 21 ++ .../src/dpk_connector/core/logging.py | 22 ++ .../src/dpk_connector/core/middlewares.py | 263 ++++++++++++++ .../src/dpk_connector/core/pipelines.py | 29 ++ .../src/dpk_connector/core/settings.py | 70 ++++ .../dpk_connector/core/spiders/__init__.py | 11 + .../src/dpk_connector/core/spiders/sitemap.py | 334 ++++++++++++++++++ .../src/dpk_connector/core/utils.py | 92 +++++ .../test/dpk_connector/core/__init__.py | 0 .../test/dpk_connector/core/test_crawler.py | 28 ++ .../dpk_connector/core/test_middlewares.py | 59 ++++ .../dpk_connector/core/test_sitemap_spider.py | 97 +++++ .../core/test_sitemap_spider/index.html | 39 ++ .../test/dpk_connector/core/test_utils.py | 152 ++++++++ 23 files changed, 1677 insertions(+), 1 deletion(-) create mode 100644 .github/workflows/test-connector-lib.yml rename .github/workflows/{test-lib.yml => test-processing-lib.yml} (98%) create mode 100644 data-connector-lib/Makefile create mode 100644 data-connector-lib/README.md create mode 100644 data-connector-lib/doc/overview.md create mode 100644 data-connector-lib/pyproject.toml create mode 100644 data-connector-lib/src/dpk_connector/__init__.py create mode 100644 data-connector-lib/src/dpk_connector/core/__init__.py create mode 100644 data-connector-lib/src/dpk_connector/core/crawler.py create mode 100644 data-connector-lib/src/dpk_connector/core/item.py create mode 100644 data-connector-lib/src/dpk_connector/core/logging.py create mode 100644 data-connector-lib/src/dpk_connector/core/middlewares.py create mode 100644 data-connector-lib/src/dpk_connector/core/pipelines.py create mode 100644 data-connector-lib/src/dpk_connector/core/settings.py create mode 100644 data-connector-lib/src/dpk_connector/core/spiders/__init__.py create mode 100644 data-connector-lib/src/dpk_connector/core/spiders/sitemap.py create mode 100644 data-connector-lib/src/dpk_connector/core/utils.py create mode 100644 data-connector-lib/test/dpk_connector/core/__init__.py create mode 100644 data-connector-lib/test/dpk_connector/core/test_crawler.py create mode 100644 data-connector-lib/test/dpk_connector/core/test_middlewares.py create mode 100644 data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py create mode 100644 data-connector-lib/test/dpk_connector/core/test_sitemap_spider/index.html create mode 100644 data-connector-lib/test/dpk_connector/core/test_utils.py diff --git 
a/.github/workflows/test-connector-lib.yml b/.github/workflows/test-connector-lib.yml new file mode 100644 index 000000000..26e3490d7 --- /dev/null +++ b/.github/workflows/test-connector-lib.yml @@ -0,0 +1,32 @@ +name: Test Data Connector lib + +on: + workflow_dispatch: + push: + branches: + - "dev" + - "releases/**" + tags: + - "*" + paths: + - "data-connector-lib/**" + - "!data-connector-lib/**.md" + - ".make.*" + pull_request: + branches: + - "dev" + - "releases/**" + paths: + - "data-connector-lib/**" + - "!data-connector-lib/**.md" + - ".make.*" + +jobs: + test-dpk-connector: + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Test dpk_connector + run: | + make -C data-connector-lib venv test diff --git a/.github/workflows/test-lib.yml b/.github/workflows/test-processing-lib.yml similarity index 98% rename from .github/workflows/test-lib.yml rename to .github/workflows/test-processing-lib.yml index 5a1cff872..5d76c13f7 100644 --- a/.github/workflows/test-lib.yml +++ b/.github/workflows/test-processing-lib.yml @@ -1,4 +1,4 @@ -name: Test DPK libs and (Optionally) Push base DPK images +name: Test Data Processing libs and (Optionally) Push base DPK images on: workflow_dispatch: diff --git a/data-connector-lib/Makefile b/data-connector-lib/Makefile new file mode 100644 index 000000000..c72aeb74d --- /dev/null +++ b/data-connector-lib/Makefile @@ -0,0 +1,49 @@ +# Use make help to see the available rules +REPOROOT=.. +include $(REPOROOT)/.make.defaults + +clean:: + @# Help: Clean up the distribution build and the venv + rm -rf dist venv + rm -rf src/*egg-info + +.check-env:: + @echo "Checks passed" + +setup:: + +set-versions: .check-env + $(MAKE) TOML_VERSION=$(DPK_LIB_VERSION) .defaults.update-toml + +build:: build-dist + +#build:: update-toml .defaults.build-dist +build-dist :: .defaults.build-dist + +publish:: publish-dist + +publish-dist :: .check-env .defaults.publish-dist + +venv:: pyproject.toml + @# Help: Create the virtual environment using pyproject.toml + rm -r dist venv || true + rm -rf src/*egg-info || true + rm makeenv || true + $(PYTHON) -m venv venv + source venv/bin/activate; \ + pip install --upgrade pip; \ + pip install -e .; \ + pip install pytest pytest-mock pytest-datadir pytest-cov moto==5.0.5 markupsafe==2.0.1 + +image:: + @# Help: Placeholder does nothing for now. + @echo "Image building for ray is in the works (coming soon)." + +# Here we run each test directory of tests and each ray-launched test separately, because +# it seems that running multiple ray-launch tests in a single pytest run causes some sort of ray.init() duplication. +# pytest-forked was tried, but then we get SIGABRT in pytest when running the s3 tests, some of which are skipped. +# TODO: the following fails. Why? source venv/bin/activate; export PYTHONPATH=../src; cd test; $(PYTEST) . +.PHONY: test +test:: venv + @# Help: Use the already-built virtual environment to run pytest on the test directory. + source venv/bin/activate; $(PYTEST); diff --git a/data-connector-lib/README.md b/data-connector-lib/README.md new file mode 100644 index 000000000..facacaf04 --- /dev/null +++ b/data-connector-lib/README.md @@ -0,0 +1,30 @@ +# DPK Connector + +DPK Connector is a scalable and compliant web crawler developed to acquire data for LLM development. It is built on [Scrapy](https://scrapy.org/). +For more details, read [the documentation](doc/overview.md). + +## Virtual Environment + +The project uses `pyproject.toml` and a Makefile for operations.
+To do development, first create the virtual environment +```shell +make venv +``` +and then either activate it +```shell +source venv/bin/activate +``` +or set up your IDE to use the venv directory when developing in this project. + +## Library Artifact Build and Publish + +To test, build and publish the library: +```shell +make test build publish +``` + +To bump the version number, edit the Makefile to change VERSION and rerun the above. This requires committing both the `Makefile` and the automatically updated `pyproject.toml` file. + +## How to use + +See [the overview](doc/overview.md). diff --git a/data-connector-lib/doc/overview.md b/data-connector-lib/doc/overview.md new file mode 100644 index 000000000..4e48001a2 --- /dev/null +++ b/data-connector-lib/doc/overview.md @@ -0,0 +1,47 @@ +# DPK Connector Overview + +The Data Prep Kit Connector (DPK Connector) is a Python library for scalable and compliant web crawling. + +Features: +- Robots.txt compliance: The Connector follows allow/disallow rules and extended directives such as `Crawl-delay` in each website's robots.txt. +- Sitemap support: The Connector automatically recognizes sitemap URLs in the input and also tries to discover them via robots.txt. +- User agent and header customization: You can use your own user agent string and request headers. +- Domain and path focus: You can limit the domains and paths accessed by the library. +- MIME type filters: You can restrict the MIME types that are downloaded. +- Parallel processing: Requests to websites are processed in parallel. + +## Example usage + +```python +from dpk_connector import crawl, shutdown + + +def main(): + """ + An example of running a crawl. + """ + + def on_downloaded(url: str, body: bytes, headers: dict) -> None: + """ + Callback function called when a page has been downloaded. + You have access to the request URL, response body and headers.
""" + print(f"url: {url}, headers: {headers}, body: {body[:64]}") + + user_agent = "Mozilla/5.0 (X11; Linux i686; rv:125.0) Gecko/20100101 Firefox/125.0" + + # Start crawling + crawl( + ["https://crawler-test.com/"], + on_downloaded, + user_agent=user_agent, + depth_limit=0, + ) # blocking call + + # Shutdown all crawls + shutdown() + + +if __name__ == "__main__": + main() +``` diff --git a/data-connector-lib/pyproject.toml b/data-connector-lib/pyproject.toml new file mode 100644 index 000000000..cc4e8571a --- /dev/null +++ b/data-connector-lib/pyproject.toml @@ -0,0 +1,61 @@ +[project] +name = "dpk_connector" +version = "0.2.2.dev0" +requires-python = ">=3.10" +keywords = [ + "data", + "data acquisition", + "crawler", + "web crawler", + "llm", + "generative", + "ai", + "fine-tuning", + "llmapps", +] +description = "Scalable and Compliant Web Crawler" +license = { text = "Apache-2.0" } +readme = { file = "README.md", content-type = "text/markdown" } +authors = [{ name = "Hiroya Matsubara", email = "hmtbr@jp.ibm.com" }] +dependencies = [ + "scrapy>=2.11.2", + "pydantic>=2.8.1", + "tldextract>=5.1.2", +] + +[project.urls] +Repository = "https://github.com/IBM/data-prep-kit" +Issues = "https://github.com/IBM/data-prep-kit/issues" +Documentation = "https://ibm.github.io/data-prep-kit/" + +[build-system] +requires = ["setuptools>=68.0.0", "wheel", "setuptools_scm[toml]>=7.1.0"] +build-backend = "setuptools.build_meta" + +[project.optional-dependencies] +dev = [ + "twine", + "pytest>=7.3.2", + "pytest-dotenv>=0.5.2", + "pytest-env>=1.0.0", + "pre-commit>=3.3.2", + "pytest-cov>=4.1.0", + "pytest-mock>=3.10.0", + "pytest-datadir>=1.5.0", + "moto==5.0.5", + "markupsafe==2.0.1", +] + +[options] +package_dir = ["src", "test"] + +[options.packages.find] +where = ["src/dpk_connector"] + +[tool.pytest.ini_options] +# Currently we use low coverage since we have to run tests separately (see makefile) +#addopts = "--cov --cov-report term-missing --cov-fail-under 25" +markers = ["unit: unit tests", "integration: integration tests"] + +[tool.coverage.run] +include = ["src/*"] diff --git a/data-connector-lib/src/dpk_connector/__init__.py b/data-connector-lib/src/dpk_connector/__init__.py new file mode 100644 index 000000000..14780b08c --- /dev/null +++ b/data-connector-lib/src/dpk_connector/__init__.py @@ -0,0 +1,13 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from dpk_connector.core.crawler import async_crawl, crawl, shutdown # noqa diff --git a/data-connector-lib/src/dpk_connector/core/__init__.py b/data-connector-lib/src/dpk_connector/core/__init__.py new file mode 100644 index 000000000..0e134987e --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/__init__.py @@ -0,0 +1,11 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ diff --git a/data-connector-lib/src/dpk_connector/core/crawler.py b/data-connector-lib/src/dpk_connector/core/crawler.py new file mode 100644 index 000000000..f024e63b1 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/crawler.py @@ -0,0 +1,216 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import threading +from typing import Any, Callable, Collection, Type, cast + +from scrapy import Spider +from scrapy.crawler import Crawler, CrawlerRunner +from scrapy.settings import Settings +from twisted.internet.defer import Deferred + +from dpk_connector.core.utils import validate_domain, validate_url + +_lock = threading.Lock() +_reactor_initialized = False +_reactor_started = False + + +def _run_reactor(): + from twisted.internet import reactor + + reactor.run(installSignalHandlers=False) + + +_reactor_thread: threading.Thread = threading.Thread( + target=_run_reactor, + daemon=True, +) + + +def _start_reactor(): + with _lock: + global _reactor_started + if not _reactor_started: + _reactor_thread.start() + _reactor_started = True + + +def _stop_reactor(): + from twisted.internet import reactor + + try: + reactor.stop() + except RuntimeError: + pass + + +class MultiThreadedCrawlerRunner(CrawlerRunner): + def _create_crawler(self, spidercls: str | type[Spider]) -> Crawler: + if isinstance(spidercls, str): + spidercls = self.spider_loader.load(spidercls) + with _lock: + global _reactor_initialized + init_reactor = not _reactor_initialized + crawler = Crawler( + cast(Type[Spider], spidercls), self.settings, init_reactor + ) + _reactor_initialized = True + return crawler + + +def async_crawl( + seed_urls: Collection[str], + on_downloaded: Callable[[str, bytes, dict[str, str]], None], + user_agent: str = "", + headers: dict[str, str] = {}, + allow_domains: Collection[str] = (), + path_focus: bool = False, + allow_mime_types: Collection[str] = ( + "application/pdf", + "text/html", + "text/markdown", + "text/plain", + ), + disallow_mime_types: Collection[str] = (), + depth_limit: int = -1, + download_limit: int = -1, +) -> Deferred[None]: + # Assisted by WCA@IBM + # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2 + """ + Do crawl asynchronously. + + Parameters: + seed_urls (Collection[str]): A collection of seed URLs to start the crawl from. + on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page. 
+ user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)". + headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary. + allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs. + path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted. + allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain". + disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection. + depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit. + download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit. + + Returns: + Deferred[None]: A Twisted deferred object that can be used to wait for the crawler to finish. + """ + if not seed_urls: + raise ValueError(f"Empty seed URLs.") + for url in seed_urls: + if not validate_url(url): + raise ValueError(f"Seed URL {url} is not valid.") + for domain in allow_domains: + if not validate_domain(domain): + raise ValueError(f"Allow domain {domain} is not valid.") + if depth_limit < -1: + raise ValueError(f"Invalid depth limit {depth_limit}") + if download_limit < -1: + raise ValueError(f"Invalid download limit {download_limit}") + + settings = Settings() + settings.setmodule("dpk_connector.core.settings", priority="project") + + if user_agent: + settings.set("USER_AGENT", user_agent, priority="spider") + if headers: + settings.set("DEFAULT_REQUEST_HEADERS", headers) + if depth_limit == 0: + depth_limit = -1 + elif depth_limit == -1: + depth_limit = 0 + settings.set("DEPTH_LIMIT", depth_limit, priority="spider") + if download_limit == -1: + download_limit = 0 + settings.set("CLOSESPIDER_ITEMCOUNT", download_limit, priority="spider") + + runner = MultiThreadedCrawlerRunner(settings) + runner.crawl( + "dpk-connector-sitemap", + seed_urls=seed_urls, + callback=on_downloaded, + allow_domains=allow_domains, + path_focus=path_focus, + allow_mime_types=allow_mime_types, + disallow_mime_types=disallow_mime_types, + disable_sitemap_search=True, + ) + _start_reactor() + return runner.join() + + +def crawl( + seed_urls: Collection[str], + on_downloaded: Callable[[str, bytes, dict[str, str]], None], + user_agent: str = "", + headers: dict[str, str] = {}, + allow_domains: Collection[str] = (), + path_focus: bool = False, + allow_mime_types: Collection[str] = ( + "application/pdf", + "text/html", + "text/markdown", + "text/plain", + ), + disallow_mime_types: Collection[str] = (), + depth_limit: int = -1, + download_limit: int = -1, +) -> None: + # Assisted by WCA@IBM + # Latest GenAI contribution: ibm/granite-20b-code-instruct-v2 + """ + Do crawl synchronously. + + Parameters: + seed_urls (Collection[str]): A collection of seed URLs to start the crawl from. + on_downloaded (Callable[[str, bytes, dict[str, str]], None]): The callback function to be called for each downloaded page. + user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)". + headers (dict[str, str]): A dictionary of additional headers to send with each request. 
Default is an empty dictionary. + allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs. + path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted. + allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain". + disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection. + depth_limit (int): The maximum depth of the crawl. Default is -1, which means no limit. + download_limit (int): The maximum number of pages to download. Default is -1, which means no limit. This is a soft limit, meaning that a crawler may download more pages than the specified limit. + + Returns: + None + """ + condition = threading.Condition() + + def on_completed(result: Any): + with condition: + condition.notify() + + d = async_crawl( + seed_urls, + on_downloaded, + user_agent, + headers, + allow_domains, + path_focus, + allow_mime_types, + disallow_mime_types, + depth_limit, + download_limit, + ) + d.addBoth(on_completed) + with condition: + condition.wait() + + +def shutdown(): + """ + Shutdown all crawls. + """ + _stop_reactor() diff --git a/data-connector-lib/src/dpk_connector/core/item.py b/data-connector-lib/src/dpk_connector/core/item.py new file mode 100644 index 000000000..5fcf68b49 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/item.py @@ -0,0 +1,21 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +from dataclasses import dataclass + + +@dataclass +class ConnectorItem: + dropped: bool = False + downloaded: bool = False + system_request: bool = False + sitemap: bool = False diff --git a/data-connector-lib/src/dpk_connector/core/logging.py b/data-connector-lib/src/dpk_connector/core/logging.py new file mode 100644 index 000000000..523a1a7c3 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/logging.py @@ -0,0 +1,22 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from scrapy.logformatter import LogFormatter as ScrapyLogFormatter + + +class QuietLogFormatter(ScrapyLogFormatter): + def scraped(self, item, response, spider): + return ( + super().scraped(item, response, spider) + if spider.settings.getbool("LOG_SCRAPED_ITEMS") + else None + ) diff --git a/data-connector-lib/src/dpk_connector/core/middlewares.py b/data-connector-lib/src/dpk_connector/core/middlewares.py new file mode 100644 index 000000000..7d0738b79 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/middlewares.py @@ -0,0 +1,263 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import logging +from typing import Any, Generator, Iterable + +from scrapy import Spider, signals +from scrapy.crawler import Crawler +from scrapy.downloadermiddlewares.robotstxt import RobotsTxtMiddleware +from scrapy.downloadermiddlewares.stats import DownloaderStats +from scrapy.exceptions import NotConfigured +from scrapy.http import Request, Response +from scrapy.http.request import NO_CALLBACK +from scrapy.robotstxt import ProtegoRobotParser, RobotParser +from scrapy.statscollectors import StatsCollector +from scrapy.utils.httpobj import urlparse_cached +from scrapy.utils.python import to_unicode +from twisted.internet.defer import Deferred + +from dpk_connector.core.item import ConnectorItem +from dpk_connector.core.utils import get_content_type, get_etld1, get_mime_type, get_netloc + +logger = logging.getLogger(__name__) + + +class DelayingProtegoRobotParser(ProtegoRobotParser): + """ + Robots.txt parser supporting crawl-delay/request-rate. + """ + + def __init__(self, robotstxt_body: str | bytes, spider: Spider): + super().__init__(robotstxt_body, spider) + self.max_delay = spider.crawler.settings.getfloat("ROBOTS_MAX_CRAWL_DELAY", 60) + + def delay(self, user_agent: str | bytes) -> float | None: + user_agent = to_unicode(user_agent) + crawl_delay = self.rp.crawl_delay(user_agent) + request_rate = self.rp.request_rate(user_agent) + if crawl_delay is None and request_rate is None: + return None + crawl_delay = crawl_delay or 0 + request_rate = ( + request_rate.seconds / request_rate.requests if request_rate else 0 + ) + delay = min(max(crawl_delay, request_rate), self.max_delay) + return delay + + +class DelayingRobotsTxtMiddleware(RobotsTxtMiddleware): + """ + Downloader middleware to follow crawl-delay/request-rate directives of robots.txt. 
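+    It extends Scrapy's RobotsTxtMiddleware and, when robots.txt specifies a delay for a domain, raises that domain's download-slot delay accordingly.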
+ """ + + def __init__(self, crawler: Crawler, download_timeout: float): + super().__init__(crawler) + self.download_timeout = download_timeout + self._delays: dict[str, float] = {} + crawler.signals.connect( + self._request_reached_downloader, signal=signals.request_reached_downloader + ) + + @classmethod + def from_crawler(cls, crawler: Crawler): + download_timeout = crawler.settings.getfloat("ROBOTSTXT_DOWNLOAD_TIMEOUT") + if not download_timeout: + download_timeout = crawler.settings.getfloat("DOWNLOAD_TIMEOUT") + return cls(crawler, download_timeout) + + def _request_reached_downloader(self, request: Request, spider: Spider) -> None: + key = request.meta.get("download_slot") + if slot := self.crawler.engine.downloader.slots.get(key): + parts = urlparse_cached(request) + domain = parts.netloc + if domain in self._delays: + delay = self._delays[domain] + if delay and slot.delay < delay: + slot.delay = delay + slot.randomize_delay = False + + def process_request_2( + self, rp: RobotParser, request: Request, spider: Spider + ) -> None: + super().process_request_2(rp, request, spider) + if isinstance(rp, DelayingProtegoRobotParser): + parts = urlparse_cached(request) + domain = parts.netloc + if domain not in self._delays: + user_agent = self._robotstxt_useragent + if not user_agent: + user_agent = request.headers.get( + b"User-Agent", self._default_useragent + ) + delay = rp.delay(user_agent) or 0.0 + self._delays[domain] = delay + if delay: + logger.info( + f"Set download delay to {delay} according to robots.txt. domain: {domain}" + ) + + def robot_parser(self, request: Request, spider: Spider): + url = urlparse_cached(request) + netloc = url.netloc + + if netloc not in self._parsers: + self._parsers[netloc] = Deferred() + robotsurl = f"{url.scheme}://{url.netloc}/robots.txt" + robotsreq = Request( + robotsurl, + priority=self.DOWNLOAD_PRIORITY, + meta={ + "dont_obey_robotstxt": True, + "system_request": True, + "download_timeout": self.download_timeout, + }, + callback=NO_CALLBACK, + ) + dfd = self.crawler.engine.download(robotsreq) + dfd.addCallback(self._parse_robots, netloc, spider) + dfd.addErrback(self._logerror, robotsreq, spider) + dfd.addErrback(self._robots_error, netloc) + self.crawler.stats.inc_value("robotstxt/request_count") + + if isinstance(self._parsers[netloc], Deferred): + d = Deferred() + + def cb(result): + d.callback(result) + return result + + self._parsers[netloc].addCallback(cb) + return d + return self._parsers[netloc] + + +def _update_request_stats( + stats: StatsCollector, + request: Request, + spider: Spider, + prefix: str, + skip_domains: bool = False, +): + # request count + stats.inc_value(prefix, spider=spider) + # proxy distribution + proxy = request.meta.get("proxy", "None") + stats.inc_value(f"{prefix}/proxy/{proxy}", spider=spider) + if not skip_domains: + # domain distribution + domain = get_etld1(to_unicode(request.url)) + stats.inc_value(f"{prefix}/domain/{domain}", spider=spider) + # subdomain distribution + sub_domain = get_netloc(request) + stats.inc_value(f"{prefix}/subdomain/{sub_domain}", spider=spider) + + +def _update_stats( + stats: StatsCollector, + request: Request, + response: Response, + spider: Spider, + prefix: str, + skip_domains: bool = False, +): + _update_request_stats(stats, request, spider, prefix, skip_domains) + # mime type distribution + content_type = get_content_type(response) + if not content_type: + stats.inc_value(f"{prefix}/mime_type/None", spider=spider) + else: + mime_type = get_mime_type(content_type) + 
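# get_mime_type() strips any parameters such as charset, so the stats are keyed by the bare MIME type. +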
stats.inc_value(f"{prefix}/mime_type/{mime_type}", spider=spider) + # status code distribution + stats.inc_value(f"{prefix}/status_code/{response.status}", spider=spider) + + +def _update_sitemap_stats(stats: StatsCollector, spider: Spider, prefix: str): + # sitemap + stats.inc_value(f"{prefix}/sitemap", spider=spider) + + +class ConnectorRequestedStats(DownloaderStats): + """ + Downloader middleware to expose additional stats. + """ + + def __init__(self, stats: StatsCollector, skip_domains: bool): + super().__init__(stats) + self.skip_domains = skip_domains + + @classmethod + def from_crawler(cls, crawler: Crawler): + if not crawler.settings.getbool("DOWNLOADER_STATS"): + raise NotConfigured + skip_domains = crawler.settings.getbool("STATS_SKIP_DOMAINS") + return cls(crawler.stats, skip_domains) + + def process_request(self, request: Request, spider: Spider): + super().process_request(request, spider) + prefix = "dpk_connector/requested" + if not request.meta.get("system_request", False): + _update_request_stats( + self.stats, request, spider, prefix, self.skip_domains + ) + if request.meta.get("sitemap", False): + _update_sitemap_stats(self.stats, spider, prefix) + + def process_response(self, request: Request, response: Response, spider: Spider): + ret = super().process_response(request, response, spider) + prefix = "dpk_connector/accessed" + if not request.meta.get("system_request", False): + _update_stats( + self.stats, request, response, spider, prefix, self.skip_domains + ) + if request.meta.get("sitemap", False): + _update_sitemap_stats(self.stats, spider, prefix) + return ret + + +class ConnectorDownloadedStats: + """ + Spider middleware to expose additional stats. + """ + + def __init__(self, stats: StatsCollector, skip_domains: bool): + self.stats = stats + self.skip_domains = skip_domains + + @classmethod + def from_crawler(cls, crawler: Crawler): + if not crawler.settings.getbool("DOWNLOADER_STATS"): + raise NotConfigured + skip_domains = crawler.settings.getbool("STATS_SKIP_DOMAINS") + return cls(crawler.stats, skip_domains) + + def process_spider_output( + self, + response: Response, + result: Iterable[Request | ConnectorItem], + spider: Spider, + ) -> Generator[Any, Any, None]: + for r in result: + if isinstance(r, ConnectorItem): + if (not r.system_request) and r.downloaded: + _update_stats( + self.stats, + response.request, + response, + spider, + "dpk_connector/downloaded", + self.skip_domains, + ) + if r.sitemap: + _update_sitemap_stats(self.stats, spider, "dpk_connector/downloaded") + yield r diff --git a/data-connector-lib/src/dpk_connector/core/pipelines.py b/data-connector-lib/src/dpk_connector/core/pipelines.py new file mode 100644 index 000000000..b9c35d896 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/pipelines.py @@ -0,0 +1,29 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+################################################################################ + +from typing import Any +from scrapy import Spider +from scrapy.crawler import Crawler +from scrapy.exceptions import DropItem + +from dpk_connector.core.item import ConnectorItem + + +class DropPipeline: + @classmethod + def from_crawler(cls, crawler: Crawler): + return cls() + + def process_item(self, item: ConnectorItem, spider: Spider) -> Any: + if item.system_request or (not item.downloaded): + raise DropItem + return item diff --git a/data-connector-lib/src/dpk_connector/core/settings.py b/data-connector-lib/src/dpk_connector/core/settings.py new file mode 100644 index 000000000..041ada253 --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/settings.py @@ -0,0 +1,70 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +BOT_NAME = "dpk-connector" + +SPIDER_MODULES = ["dpk_connector.core.spiders"] + +# Robots +ROBOTSTXT_OBEY = True +ROBOTS_MAX_CRAWL_DELAY = 60 +ROBOTSTXT_PARSER = "dpk_connector.core.middlewares.DelayingProtegoRobotParser" + +# Downloader parameters +CONCURRENT_REQUESTS = 20 +CONCURRENT_REQUESTS_PER_DOMAIN = 10 +DOWNLOAD_DELAY = 0 +RANDOMIZE_DOWNLOAD_DELAY = True +DOWNLOAD_TIMEOUT = 180 + +# Autothrottle +AUTOTHROTTLE_ENABLED = True +AUTOTHROTTLE_START_DELAY = 0 +AUTOTHROTTLE_MAX_DELAY = 300 +AUTOTHROTTLE_TARGET_CONCURRENCY = 10 +AUTOTHROTTLE_DEBUG = False + +# Middlewares/pipelines/extensions +SPIDER_MIDDLEWARES = { + "dpk_connector.core.middlewares.ConnectorDownloadedStats": 10, +} +DOWNLOADER_MIDDLEWARES = { + "scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware": None, + "dpk_connector.core.middlewares.DelayingRobotsTxtMiddleware": 100, + "scrapy.downloadermiddlewares.stats.DownloaderStats": None, + "dpk_connector.core.middlewares.ConnectorRequestedStats": 850, +} +ITEM_PIPELINES = { + "dpk_connector.core.pipelines.DropPipeline": 100, +} +EXTENSIONS = { + "scrapy.extensions.telnet.TelnetConsole": None, + "scrapy.extensions.memdebug.MemoryDebugger": None, +} + +# Queue +SCHEDULER_MEMORY_QUEUE = "scrapy.squeues.LifoMemoryQueue" + +# Logging +LOG_LEVEL = "INFO" +LOG_SCRAPED_ITEMS = False +LOG_FORMATTER = "dpk_connector.core.logging.QuietLogFormatter" + +# Periodic logging +PERIODIC_LOG_DELTA = True +PERIODIC_LOG_STATS = True +PERIODIC_LOG_TIMING_ENABLED = True +LOGSTATS_INTERVAL = 300 + +# Misc +REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" +TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" diff --git a/data-connector-lib/src/dpk_connector/core/spiders/__init__.py b/data-connector-lib/src/dpk_connector/core/spiders/__init__.py new file mode 100644 index 000000000..0e134987e --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/spiders/__init__.py @@ -0,0 +1,11 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ diff --git a/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py b/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py new file mode 100644 index 000000000..f24d4088b --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/spiders/sitemap.py @@ -0,0 +1,334 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import logging +from typing import Any, Callable, Collection, Generator +from urllib.parse import ParseResult + +from scrapy import Request +from scrapy.http import Response +from scrapy.link import Link +from scrapy.linkextractors import LinkExtractor +from scrapy.spiders import SitemapSpider +from scrapy.spiders.sitemap import iterloc +from scrapy.utils.sitemap import Sitemap, sitemap_urls_from_robots + +from dpk_connector.core.item import ConnectorItem +from dpk_connector.core.utils import ( + get_base_url, + get_content_type, + get_etld1, + get_focus_path, + is_allowed_path, + urlparse_cached, +) + + +class BaseSitemapSpider(SitemapSpider): + SITEMAP_DOWNLOAD_PRIORITY = 10 + + name = "base-sitemap" + + def __init__( + self, + seed_urls: Collection[str], + allow_domains: Collection[str] = (), + path_focus: bool = False, + allow_mime_types: Collection[str] = (), + disallow_mime_types: Collection[str] = (), + depth_limit: int = 0, + disable_sitemap_search: bool = False, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.depth_limit = depth_limit + self.sitemap_search = (not disable_sitemap_search) and depth_limit >= 0 + + # Build sitemap url candidates + self.input_seed_urls = seed_urls + sitemap_urls = [] + sitemaps_seen = [] + for seed_url in seed_urls: + parts = urlparse_cached(seed_url) + if seed_url.endswith( + ( + ".xml", + ".xml.gz", + "/robots.txt", + "robots.txt/", + "/sitemap", + "/sitemap/", + ) + ): + sitemap_urls.append(seed_url) + elif self.sitemap_search: + sitemap_urls.extend(self._get_sitemap_urls(parts)) + sitemaps_seen.append(parts.netloc) + self.seed_urls = set(seed_urls) - set(sitemap_urls) + self.sitemap_urls = set(sitemap_urls) + self.sitemaps_seen = set(sitemaps_seen) + + # Extract focus paths + self.focus_paths: set[str] = set() + if path_focus: + for seed_url in self.seed_urls: + path = get_focus_path(seed_url) + if path is not None: + self.focus_paths.add(path) + + # Domains and mime types filtering + self.allowed_domains = set( + allow_domains + if len(allow_domains) > 0 + else [get_etld1(url) for url in seed_urls] + ) + 
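# MIME type filters are lower-cased here so the later content-type checks are case-insensitive. +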
self.allow_mime_types = set( + [m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else () + ) + self.disallow_mime_types = set( + [m.lower() for m in disallow_mime_types] + if len(disallow_mime_types) > 0 + else () + ) + + # Link extraction from html + self.link_extractor = LinkExtractor( + allow_domains=self.allowed_domains, + unique=True, + deny_extensions=(), + tags=("a", "area", "link"), + ) + + self.log( + f"Seed URLs: {self.seed_urls}, sitemap URLs: {self.sitemap_urls}, allow domains: {self.allowed_domains}, focus paths: {self.focus_paths}, allow mime types: {self.allow_mime_types}, disallow mime types: {self.disallow_mime_types}, depth limit: {self.depth_limit}, sitemap search: {self.sitemap_search}", + logging.INFO, + ) + + def _get_sitemap_urls(self, parts: ParseResult) -> list[str]: + base_url = get_base_url(parts) + sitemap_variations = ( + "robots.txt", + "robots.txt/", + "sitemap.xml", + "sitemap_index.xml", + "sitemapindex.xml", + "sitemap", + "sitemap-index.xml", + "sitemap/index.xml", + "sitemap/sitemap.xml", + "sitemap1.xml", + ) + return [f"{base_url}/{sitemap}" for sitemap in sitemap_variations] + + def start_requests(self): + for url in self.sitemap_urls: + yield Request( + url, + self._parse_sitemap, + priority=self.SITEMAP_DOWNLOAD_PRIORITY, + meta={ + "seed_url": url, + "previous_url": "", + "system_request": True, + "sitemap": True, + }, + ) + for url in self.seed_urls: + yield Request( + url, + self.parse, + meta={ + "seed_url": url, + "previous_url": "", + }, + ) + + def _parse_sitemap(self, response: Response): + yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True) + + seed_url = response.meta["seed_url"] + + if response.url.endswith("/robots.txt") or response.url.endswith( + "/robots.txt/" + ): + for url in sitemap_urls_from_robots(response.text, base_url=response.url): + yield Request( + url, + callback=self._parse_sitemap, + priority=self.SITEMAP_DOWNLOAD_PRIORITY, + meta={ + "seed_url": seed_url, + "previous_url": response.url, + "system_request": True, + "sitemap": True, + }, + ) + else: + body = self._get_sitemap_body(response) + if not body: + self.log( + f"Ignoring invalid sitemap: {response}", + logging.WARN, + extra={"spider": self}, + ) + return + + s = Sitemap(body) + it = self.sitemap_filter(s) + + if s.type == "sitemapindex": + for loc in iterloc(it, self.sitemap_alternate_links): + if any( + x.search(loc) for x in self._follow + ) and self._is_allowed_path(loc): + yield Request( + loc, + callback=self._parse_sitemap, + priority=self.SITEMAP_DOWNLOAD_PRIORITY, + meta={ + "seed_url": seed_url, + "previous_url": response.url, + "system_request": True, + "sitemap": True, + }, + ) + elif s.type == "urlset": + for loc in iterloc(it, self.sitemap_alternate_links): + for r, c in self._cbs: + if r.search(loc) and self._is_allowed_path(loc): + yield Request( + loc, + callback=c, + meta={ + "seed_url": seed_url, + "previous_url": response.url, + }, + ) + break + + def _is_allowed_path(self, input: str | Request | Response) -> bool: + return is_allowed_path(input, self.focus_paths) + + def _is_allowed_content_type(self, content_type: str) -> bool: + return any([mtype in content_type for mtype in self.allow_mime_types]) + + def _is_disallowed_content_type(self, content_type: str) -> bool: + return any([mtype in content_type for mtype in self.disallow_mime_types]) + + def _should_download(self, content_type: str | None) -> bool: + if (not self.allow_mime_types) and (not self.disallow_mime_types): + return True + 
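# Filters are configured but the response carries no Content-Type header; it cannot be classified, so it is not downloaded. +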
if not content_type: + return False + ctype = content_type.lower() + if not self.allow_mime_types: + return not self._is_disallowed_content_type(ctype) + if not self.disallow_mime_types: + return self._is_allowed_content_type(ctype) + return ( + not self._is_disallowed_content_type(ctype) + ) and self._is_allowed_content_type(ctype) + + def _explore_sitemap(self, response: Response) -> Generator[Request, Any, None]: + depth = response.meta.get("depth", 0) + depth_limit = self.depth_limit + if (depth_limit == 0 or depth < depth_limit) and self.sitemap_search: + parts = urlparse_cached(response) + domain = parts.netloc + if domain not in self.sitemaps_seen: + self.log( + f"New domain {domain} found. Search for sitemap.", logging.INFO + ) + self.sitemaps_seen.add(domain) + for sitemap in self._get_sitemap_urls(parts): + yield Request( + sitemap, + callback=self._parse_sitemap, + priority=self.SITEMAP_DOWNLOAD_PRIORITY, + meta={ + "seed_url": response.meta["seed_url"], + "previous_url": response.url, + "system_request": True, + "sitemap": True, + }, + ) + + def _explore_links( + self, response: Response, links: list[Link] + ) -> Generator[Request, Any, None]: + depth = response.meta.get("depth", 0) + depth_limit = self.depth_limit + if depth_limit == 0 or depth < depth_limit: + for link in links: + if self._is_allowed_path(link.url): + yield Request( + link.url, + callback=self.parse, + meta={ + "seed_url": response.meta["seed_url"], + "previous_url": response.url, + }, + ) + + +class ConnectorSitemapSpider(BaseSitemapSpider): + name = "dpk-connector-sitemap" + + def __init__( + self, + callback: Callable[[str, bytes, dict[str, str]], None], + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.callback = callback + + def parse( + self, response: Response, **kwargs: Any + ) -> Generator[Request | ConnectorItem, Any, None]: + drop = False + content_type = get_content_type(response) + if not content_type: + drop = True + is_html = "text/html" in content_type.lower() + should_download = self._should_download(content_type) + if not (is_html or should_download): + drop = True + if drop: + yield ConnectorItem( + dropped=True, downloaded=False, system_request=False, sitemap=False + ) + return + + # Download contents + if should_download: + self.callback( + str(response.url), response.body, response.headers.to_unicode_dict() + ) + # to count up downloaded pages and collect stats + yield ConnectorItem( + dropped=False, downloaded=True, system_request=False, sitemap=False + ) + else: + yield ConnectorItem( + dropped=False, downloaded=False, system_request=False, sitemap=False + ) + + # Search for sitemap + yield from self._explore_sitemap(response) + + # Extract links and dispatch them + links = self.link_extractor.extract_links(response) if is_html else [] + yield from self._explore_links(response, links) diff --git a/data-connector-lib/src/dpk_connector/core/utils.py b/data-connector-lib/src/dpk_connector/core/utils.py new file mode 100644 index 000000000..d2dfa760d --- /dev/null +++ b/data-connector-lib/src/dpk_connector/core/utils.py @@ -0,0 +1,92 @@ +# (C) Copyright IBM Corp. 2024. +# Licensed under the Apache License, Version 2.0 (the “License”); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an “AS IS” BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +import re +from urllib.parse import ParseResult, urlparse + +import tldextract +from scrapy.http import Request, Response +from scrapy.http.headers import Headers +from scrapy.utils.httpobj import urlparse_cached as _urlparse_cached + + +def _get_header_value(headers: Headers, key: str) -> str | None: + value = headers.get(key) + return value.decode("utf-8") if value else None + + +def get_header_value(response: Response, key: str) -> str | None: + return _get_header_value(response.headers, key) + + +def get_content_type(response: Response) -> str | None: + return get_header_value(response, "Content-Type") + + +def get_mime_type(content_type: str) -> str: + return content_type.split(";")[0].strip() + + +def urlparse_cached(input: str | Request | Response) -> ParseResult: + return urlparse(input) if isinstance(input, str) else _urlparse_cached(input) + + +def get_netloc(input: str | Request | Response) -> str: + return urlparse_cached(input).netloc + + +def get_base_url(input: str | Request | Response | ParseResult) -> str: + if isinstance(input, ParseResult): + parts = input + else: + parts = urlparse_cached(input) + return f"{parts.scheme}://{parts.netloc}" + + +def get_etld1(url: str) -> str: + ext = tldextract.extract(url) + return f"{ext.domain}.{ext.suffix}" + + +def get_focus_path(url: str) -> str | None: + parts = urlparse_cached(url) + if len(parts.path.split("/")) > 2: + return "/".join(parts.path.split("/")[:-1]) + "/" + return None + + +def _check_path(url_path: str, focus_element: str): + if focus_element.startswith("/"): + return url_path.startswith(focus_element) + else: + return focus_element in url_path + + +def is_allowed_path(input: str | Request | Response, focus_paths: set[str]) -> bool: + if not focus_paths: + return True + url_path = urlparse_cached(input).path + return any(_check_path(url_path.lower(), p.lower()) for p in focus_paths) + + +def validate_url(url: str) -> bool: + result = urlparse(url) + if result.scheme not in ("http", "https"): + return False + if not result.netloc: + return False + return True + + +def validate_domain(domain: str) -> bool: + pattern = r"^([a-zA-Z0-9][a-zA-Z0-9\-]{1,61}[a-zA-Z0-9]\.)+[a-zA-Z]{2,}$" + return bool(re.match(pattern, domain)) diff --git a/data-connector-lib/test/dpk_connector/core/__init__.py b/data-connector-lib/test/dpk_connector/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/data-connector-lib/test/dpk_connector/core/test_crawler.py b/data-connector-lib/test/dpk_connector/core/test_crawler.py new file mode 100644 index 000000000..88d90293a --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_crawler.py @@ -0,0 +1,28 @@ +import pytest + +from dpk_connector.core.crawler import crawl + + +def test_invalid_crawler(): + def on_downloaded(url: str, body: bytes, headers: dict[str, str]): + pass + + with pytest.raises(ValueError) as e: + crawl([], on_downloaded) + assert isinstance(e.value, ValueError) is True + + with pytest.raises(ValueError) as e: + crawl(["invalidseedurl"], on_downloaded) + 
assert isinstance(e.value, ValueError) is True + + with pytest.raises(ValueError) as e: + crawl(["http://example.com"], on_downloaded, allow_domains=("invaliddomain",)) + assert isinstance(e.value, ValueError) is True + + with pytest.raises(ValueError) as e: + crawl(["http://example.com"], on_downloaded, depth_limit=-10) + assert isinstance(e.value, ValueError) is True + + with pytest.raises(ValueError) as e: + crawl(["http://example.com"], on_downloaded, download_limit=-10) + assert isinstance(e.value, ValueError) is True diff --git a/data-connector-lib/test/dpk_connector/core/test_middlewares.py b/data-connector-lib/test/dpk_connector/core/test_middlewares.py new file mode 100644 index 000000000..a2346888f --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_middlewares.py @@ -0,0 +1,59 @@ +import pytest +from dpk_connector.core.middlewares import DelayingProtegoRobotParser +from pytest_mock import MockerFixture + + +@pytest.mark.parametrize( + "ua,expected", + [ + ("Mozilla", 7.7), + ("MSIE", 10), + ("Macintosh", 60), + ("ua1", None), + ("ua2", None), + ], +) +def test_robots_crawl_delay(mocker: MockerFixture, ua: str, expected: float): + robots_txt = """ +User-agent: * +Crawl-delay: 7.7 +User-agent: MSIE +Crawl-delay: 10 +User-agent: Macintosh +Crawl-delay: 10000 +User-agent: ua1 +Crawl-delay: not numeric value +User-agent: ua2 +""" + crawler = mocker.patch("dpk_connector.core.middlewares.Crawler") + parser = DelayingProtegoRobotParser(robots_txt.encode("utf-8"), crawler.spider) + parser.max_delay = 60 + assert parser.delay(ua) == expected + + +@pytest.mark.parametrize( + "ua,expected", + [ + ("Mozilla", 10), + ("MSIE", 60), + ("Macintosh", 60), + ("ua1", None), + ("ua2", None), + ], +) +def test_robots_request_rate(mocker: MockerFixture, ua: str, expected: float): + robots_txt = """ +User-agent: * +Request-rate: 1/10s +User-agent: MSIE +Request-rate: 1/1m 0900-1730 +User-agent: Macintosh +Request-rate: 2/1h 0000-1736 +User-agent: ua1 +Request-rate: invalid value +User-agent: ua2 +""" + crawler = mocker.patch("dpk_connector.core.middlewares.Crawler") + parser = DelayingProtegoRobotParser(robots_txt.encode("utf-8"), crawler.spider) + parser.max_delay = 60 + assert parser.delay(ua) == expected diff --git a/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py new file mode 100644 index 000000000..826963f2f --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py @@ -0,0 +1,97 @@ +from pathlib import Path + +import pytest +from scrapy import Request +from scrapy.crawler import Crawler +from scrapy.http import HtmlResponse + +from dpk_connector.core.item import ConnectorItem +from dpk_connector.core.spiders.sitemap import BaseSitemapSpider, ConnectorSitemapSpider + + +@pytest.fixture +def crawler() -> Crawler: + crawler = Crawler( + ConnectorSitemapSpider, + settings={ + "STATS_CLASS": "scrapy.statscollectors.MemoryStatsCollector", + "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", + }, + ) + crawler._apply_settings() + return crawler + + +def test_init_path_focus(): + spider = BaseSitemapSpider( + seed_urls=( + "https://openshiftjsonschema.dev/v4.9.18-standalone-strict/", + "https://openshiftjsonschema.dev/", + ), + path_focus=True, + ) + assert spider.seed_urls == { + "https://openshiftjsonschema.dev/v4.9.18-standalone-strict/", + "https://openshiftjsonschema.dev/", + } + assert spider.sitemap_urls == { + "https://openshiftjsonschema.dev/robots.txt", + 
"https://openshiftjsonschema.dev/robots.txt/", + "https://openshiftjsonschema.dev/sitemap.xml", + "https://openshiftjsonschema.dev/sitemap_index.xml", + "https://openshiftjsonschema.dev/sitemapindex.xml", + "https://openshiftjsonschema.dev/sitemap", + "https://openshiftjsonschema.dev/sitemap-index.xml", + "https://openshiftjsonschema.dev/sitemap/index.xml", + "https://openshiftjsonschema.dev/sitemap/sitemap.xml", + "https://openshiftjsonschema.dev/sitemap1.xml", + } + assert spider.allowed_domains == {"openshiftjsonschema.dev"} + assert spider.sitemaps_seen == {"openshiftjsonschema.dev"} + assert spider.focus_paths == {"/v4.9.18-standalone-strict/"} + + +def test_parse(datadir: Path, crawler: Crawler): + response_body = (datadir / "index.html").read_text() + + def callback(url: str, body: bytes, headers: dict): + assert url == "http://example.com/index.html" + assert body.decode("utf-8") == response_body + assert headers == {"Content-Type": "text/html"} + + spider = ConnectorSitemapSpider.from_crawler( + crawler, seed_urls=("http://example.com",), callback=callback + ) + request = Request( + "http://example.com/index.html", + meta={ + "seed_url": "http://example.com", + "depth": 1, + }, + ) + response = HtmlResponse( + "http://example.com/index.html", + headers={"Content-Type": "text/html"}, + body=response_body, + request=request, + encoding="utf-8", + ) + parsed = spider.parse(response) + + item = next(parsed) + assert item == ConnectorItem( + dropped=False, downloaded=True, system_request=False, sitemap=False + ) + + for next_request in parsed: + assert isinstance(next_request, Request) is True + assert next_request.url in ( + "http://example.com/blog/", + "http://example.com/contents/", + "http://example.com/css/app.css", + "http://example.com/favicon.ico?r=1.6", + "http://example.com/", + ) + assert next_request.callback == spider.parse + assert next_request.meta["seed_url"] == "http://example.com" + assert next_request.meta["previous_url"] == "http://example.com/index.html" diff --git a/data-connector-lib/test/dpk_connector/core/test_sitemap_spider/index.html b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider/index.html new file mode 100644 index 000000000..f77ae5828 --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_sitemap_spider/index.html @@ -0,0 +1,39 @@ + + + + + Max External Links + + + + + + + + + + + + + +
+ [index.html test fixture — markup stripped in this view: a page titled "Max External Links" containing three external links ("link1 external", "link2 external", "link3 external") and internal links to /blog/, /contents/, css/app.css, and favicon.ico?r=1.6]
+ + + + diff --git a/data-connector-lib/test/dpk_connector/core/test_utils.py b/data-connector-lib/test/dpk_connector/core/test_utils.py new file mode 100644 index 000000000..096a4e194 --- /dev/null +++ b/data-connector-lib/test/dpk_connector/core/test_utils.py @@ -0,0 +1,152 @@ +# Assisted by WCA@IBM +# Latest GenAI contribution: ibm/granite-20b-code-instruct-v2 + +import pytest +from dpk_connector.core.utils import ( + get_base_url, + get_content_type, + get_etld1, + get_focus_path, + get_header_value, + get_mime_type, + is_allowed_path, + urlparse_cached, + validate_domain, + validate_url, +) +from pytest_mock import MockerFixture +from scrapy.http import Request, Response + + +def test_get_header_value(): + response = Response( + "http://example.com", headers={"Content-Type": "application/json"} + ) + assert get_header_value(response, "Content-Type") == "application/json" + + +def test_get_header_value_not_found(): + response = Response("http://example.com") + assert get_header_value(response, "Content-Type") is None + + +def test_get_content_type(): + response = Response("http://example.com", headers={"Content-Type": "text/html"}) + assert get_content_type(response) == "text/html" + + +def test_get_content_type_not_found(): + response = Response("http://example.com") + assert get_content_type(response) is None + + +def test_urlparse_cached_with_str(mocker: MockerFixture): + import dpk_connector.core.utils + + spy = mocker.spy(dpk_connector.core.utils, "urlparse") + urlparse_cached("http://example.com") + spy.assert_called_once_with("http://example.com") + + +def test_urlparse_cached_with_request(mocker: MockerFixture): + import dpk_connector.core.utils + + spy = mocker.spy(dpk_connector.core.utils, "_urlparse_cached") + request = Request("http://example.com") + urlparse_cached(request) + spy.assert_called_once_with(request) + + +def test_urlparse_cached_with_response(mocker: MockerFixture): + import dpk_connector.core.utils + + spy = mocker.spy(dpk_connector.core.utils, "_urlparse_cached") + response = Response("http://example.com") + urlparse_cached(response) + spy.assert_called_once_with(response) + + +def test_get_base_url(): + url = "http://localhost:8000/test?param=value" + assert get_base_url(url) == "http://localhost:8000" + + +@pytest.mark.parametrize( + "url,expected", + [ + ("http://www.example.com", "example.com"), + ("https://www.example.co.uk", "example.co.uk"), + ("http://www.example.com/path?query=string#fragment", "example.com"), + ], +) +def test_get_etld1(url: str, expected: str): + assert get_etld1(url) == expected + + +@pytest.mark.parametrize( + "url,expected", + [ + ("https://www.example.com", None), + ("https://www.example.com/page", None), + ("https://www.example.com/page/", "/page/"), + ("https://www.example.com/page/subpage", "/page/"), + ("https://www.example.com/page/subpage/", "/page/subpage/"), + ], +) +def test_get_focus_path(url: str, expected: str | None): + assert get_focus_path(url) == expected + + +@pytest.mark.parametrize( + "url,paths,expected", + [ + ("http://localhost/", set(), True), + ("http://localhost/test/", {"/test/"}, True), + ("http://localhost/test/", {"/test2/"}, False), + ("http://localhost/test", {"/test/"}, False), + ("http://localhost/test/", {"/test/", "/test2/"}, True), + ("http://localhost/test3/", {"/test/", "/test2/"}, False), + ], +) +def test_is_allowed_path(url: str, paths: set[str], expected: bool): + assert is_allowed_path(url, paths) is expected + + +@pytest.mark.parametrize( + "url,expected", + [ + 
("http://www.cwi.nl:80/%7Eguido/Python.html", True), + ("/data/Python.html", False), + ("532", False), + ("dkakasdkjdjakdjadjfalskdjfalk", False), + ("https://stackoverflow.com", True), + ], +) +def test_validate_url(url: str, expected: bool): + assert validate_url(url) is expected + + +@pytest.mark.parametrize( + "content_type,expected", + [ + ("text/html", "text/html"), + ("application/json; charset=utf-8", "application/json"), + ], +) +def test_get_mime_type(content_type: str, expected: str): + assert get_mime_type(content_type) == expected + + +@pytest.mark.parametrize( + "domain,expected", + [ + ("example.com", True), + ("sub-domain.example.com", True), + ("a" * 254 + ".com", False), + ("", False), + ("example", False), + ("sub_domain.example.com", False), + ], +) +def test_validate_domain(domain: str, expected: bool): + assert validate_domain(domain) is expected