Merge branch 'dev' into licenses
daw3rd committed Oct 22, 2024
2 parents c15b4e9 + b297156 commit 4624a40
Showing 44 changed files with 11,259 additions and 80 deletions.
2 changes: 1 addition & 1 deletion data-connector-lib/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "data_prep_connector"
version = "0.2.2.dev0"
version = "0.2.2.dev1"
requires-python = ">=3.10"
keywords = [
"data",
6 changes: 6 additions & 0 deletions data-connector-lib/src/dpk_connector/core/crawler.py
@@ -74,6 +74,7 @@ def async_crawl(
user_agent: str = "",
headers: dict[str, str] = {},
allow_domains: Collection[str] = (),
subdomain_focus: bool = False,
path_focus: bool = False,
allow_mime_types: Collection[str] = (
"application/pdf",
@@ -96,6 +97,7 @@ def async_crawl(
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -140,6 +142,7 @@ def async_crawl(
seed_urls=seed_urls,
callback=on_downloaded,
allow_domains=allow_domains,
subdomain_focus=subdomain_focus,
path_focus=path_focus,
allow_mime_types=allow_mime_types,
disallow_mime_types=disallow_mime_types,
@@ -155,6 +158,7 @@ def crawl(
user_agent: str = "",
headers: dict[str, str] = {},
allow_domains: Collection[str] = (),
subdomain_focus: bool = False,
path_focus: bool = False,
allow_mime_types: Collection[str] = (
"application/pdf",
@@ -177,6 +181,7 @@ def crawl(
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -198,6 +203,7 @@ def on_completed(result: Any):
user_agent,
headers,
allow_domains,
subdomain_focus,
path_focus,
allow_mime_types,
disallow_mime_types,
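For reference, a minimal usage sketch of the new `subdomain_focus` flag. It assumes, as the docstrings above suggest, that `crawl()` takes the seed URLs and an `on_downloaded` callback as its leading arguments; the callback parameters shown here are illustrative, not taken from this diff.

```python
# Minimal sketch: crawl a blog subdomain without following links to sibling
# subdomains. Assumption: crawl() accepts the seed URLs and a downloaded-page
# callback as its leading arguments; the callback signature is illustrative.
from dpk_connector.core.crawler import crawl


def on_downloaded(url: str, body: bytes, headers: dict) -> None:
    # Illustrative callback: just report what was fetched.
    print(f"Downloaded {len(body)} bytes from {url}")


crawl(
    ["https://blog.example.com/"],
    on_downloaded,
    subdomain_focus=True,  # restrict the crawl to blog.example.com instead of
                           # the default eTLD+1 (example.com)
)
```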
20 changes: 14 additions & 6 deletions data-connector-lib/src/dpk_connector/core/spiders/sitemap.py
@@ -28,6 +28,7 @@
get_content_type,
get_etld1,
get_focus_path,
get_fqdn,
is_allowed_path,
urlparse_cached,
)
@@ -42,6 +43,7 @@ def __init__(
self,
seed_urls: Collection[str],
allow_domains: Collection[str] = (),
subdomain_focus: bool = False,
path_focus: bool = False,
allow_mime_types: Collection[str] = (),
disallow_mime_types: Collection[str] = (),
@@ -88,11 +90,15 @@ def __init__(
self.focus_paths.add(path)

# Domains and mime types filtering
self.allowed_domains = set(
allow_domains
if len(allow_domains) > 0
else [get_etld1(url) for url in seed_urls]
)
if allow_domains:
self.allowed_domains = set(allow_domains)
elif subdomain_focus:
self.allowed_domains = set()
for url in seed_urls:
if fqdn := get_fqdn(url):
self.allowed_domains.add(fqdn)
else:
self.allowed_domains = set(get_etld1(url) for url in seed_urls)
self.allow_mime_types = set(
[m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
)
@@ -155,7 +161,9 @@ def start_requests(self):
)

def _parse_sitemap(self, response: Response):
yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
yield ConnectorItem(
dropped=False, downloaded=False, system_request=True, sitemap=True
)

seed_url = response.meta["seed_url"]

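The effect of the new domain-filtering branches can be seen by constructing the spider directly, mirroring `test_init_subdomain_focus` further down in this diff. The `example.org` allow list in the last case is an illustrative assumption, not a value from the tests.

```python
# Sketch of how allowed_domains is derived for a pair of subdomain seeds.
from dpk_connector.core.spiders.sitemap import BaseSitemapSpider

seeds = ("http://blog.example.com/", "http://contents.example.com/")

# Default: the registered domain (eTLD+1) of each seed.
assert BaseSitemapSpider(seed_urls=seeds).allowed_domains == {"example.com"}

# subdomain_focus=True: keep the full host of each seed.
assert BaseSitemapSpider(seed_urls=seeds, subdomain_focus=True).allowed_domains == {
    "blog.example.com",
    "contents.example.com",
}

# An explicit allow_domains list takes precedence; subdomain_focus is ignored.
# (The example.org value here is an illustrative assumption.)
assert BaseSitemapSpider(
    seed_urls=seeds, allow_domains=("example.org",), subdomain_focus=True
).allowed_domains == {"example.org"}
```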
5 changes: 5 additions & 0 deletions data-connector-lib/src/dpk_connector/core/utils.py
@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
return f"{ext.domain}.{ext.suffix}"


def get_fqdn(url: str) -> str:
ext = tldextract.extract(url)
return ext.fqdn


def get_focus_path(url: str) -> str | None:
parts = urlparse_cached(url)
if len(parts.path.split("/")) > 2:
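For a quick comparison of the new helper with `get_etld1`: `tldextract` distinguishes the full host from the registered domain. The URLs below are the ones exercised in the tests added later in this diff.

```python
# get_fqdn keeps the full host; get_etld1 keeps only the registered domain.
import tldextract

ext = tldextract.extract("http://www.sub.example.com:8080/")
print(ext.fqdn)                      # www.sub.example.com  (what get_fqdn returns)
print(f"{ext.domain}.{ext.suffix}")  # example.com          (what get_etld1 returns)

# Hosts without a public suffix have an empty fqdn.
print(tldextract.extract("http://localhost:8080/").fqdn)  # ""
```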
15 changes: 15 additions & 0 deletions data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py
@@ -33,6 +33,21 @@ def crawler() -> Crawler:
return crawler


def test_init_subdomain_focus():
spider = BaseSitemapSpider(
seed_urls=(
"http://blog.example.com/",
"http://contents.example.com/",
),
subdomain_focus=True,
)
assert spider.seed_urls == {
"http://blog.example.com/",
"http://contents.example.com/",
}
assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}


def test_init_path_focus():
spider = BaseSitemapSpider(
seed_urls=(
16 changes: 16 additions & 0 deletions data-connector-lib/test/dpk_connector/core/test_utils.py
@@ -19,6 +19,7 @@
get_content_type,
get_etld1,
get_focus_path,
get_fqdn,
get_header_value,
get_mime_type,
is_allowed_path,
@@ -93,6 +94,21 @@ def test_get_etld1(url: str, expected: str):
assert get_etld1(url) == expected


@pytest.mark.parametrize(
"url,expected",
[
("http://www.example.com", "www.example.com"),
("https://www.example.co.uk", "www.example.co.uk"),
("http://www.example.com/path?query=string#fragment", "www.example.com"),
("http://localhost:8080/", ""),
("http://www.example.com:8080/", "www.example.com"),
("http://www.sub.example.com:8080/", "www.sub.example.com"),
],
)
def test_get_fqdn(url: str, expected: str):
assert get_fqdn(url) == expected


@pytest.mark.parametrize(
"url,expected",
[
10 changes: 10 additions & 0 deletions examples/notebooks/intro/.gitignore
@@ -0,0 +1,10 @@
output*/

## File system artifacts
.directory
.DS_Store


## Python output
__pycache__
.ipynb_checkpoints/
36 changes: 36 additions & 0 deletions examples/notebooks/intro/README.md
@@ -0,0 +1,36 @@
# Data Prep Kit Introduction

This example showcases some of the features of Data Prep Kit.

## Running the code

The code can be run in either of two ways:

1. Google Colab: very easy to run; no local setup needed.
2. Your local Python environment. A quick guide follows; instructions for the latest version are [here](../../../README.md#-getting-started).

```bash
conda create -n data-prep-kit -y python=3.11
conda activate data-prep-kit

# install the following in 'data-prep-kit' environment
pip3 install data-prep-toolkit==0.2.1
pip3 install data-prep-toolkit-transforms==0.2.1
pip3 install data-prep-toolkit-transforms-ray==0.2.1
pip3 install jupyterlab ipykernel ipywidgets

## install custom kernel
## Important: Use this kernel when running example notebooks!
python -m ipykernel install --user --name=data-prep-kit --display-name "dataprepkit"

# start jupyter and run the notebooks with this jupyter
jupyter lab
```

## Intro

This notebook demonstrates processing PDFs:

`PDFs ---> text ---> chunks ---> exact dedupe ---> fuzzy dedupe ---> embeddings`

[Python version](dpk_intro_1_python.ipynb) | [Ray version](dpk_intro_1_ray.ipynb)