Merge branch 'dev' into licenses
daw3rd committed Oct 22, 2024
2 parents c15b4e9 + b297156 commit 4624a40
Showing 44 changed files with 11,259 additions and 80 deletions.
2 changes: 1 addition & 1 deletion data-connector-lib/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "data_prep_connector"
version = "0.2.2.dev0"
version = "0.2.2.dev1"
requires-python = ">=3.10"
keywords = [
"data",
6 changes: 6 additions & 0 deletions data-connector-lib/src/dpk_connector/core/crawler.py
@@ -74,6 +74,7 @@ def async_crawl(
user_agent: str = "",
headers: dict[str, str] = {},
allow_domains: Collection[str] = (),
subdomain_focus: bool = False,
path_focus: bool = False,
allow_mime_types: Collection[str] = (
"application/pdf",
@@ -96,6 +97,7 @@ def async_crawl(
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -140,6 +142,7 @@ def async_crawl(
seed_urls=seed_urls,
callback=on_downloaded,
allow_domains=allow_domains,
subdomain_focus=subdomain_focus,
path_focus=path_focus,
allow_mime_types=allow_mime_types,
disallow_mime_types=disallow_mime_types,
@@ -155,6 +158,7 @@ def crawl(
user_agent: str = "",
headers: dict[str, str] = {},
allow_domains: Collection[str] = (),
subdomain_focus: bool = False,
path_focus: bool = False,
allow_mime_types: Collection[str] = (
"application/pdf",
@@ -177,6 +181,7 @@ def crawl(
user_agent (str): The user agent string to use for the crawler. Defaults to "Scrapy/VERSION (+https://scrapy.org)".
headers (dict[str, str]): A dictionary of additional headers to send with each request. Default is an empty dictionary.
allow_domains (Collection[str]): A collection of domains to restrict the crawler to. Default is the domains of the seed URLs.
subdomain_focus (bool): If specified, only links under the subdomains of the input seed URLs will be extracted. Ignored if `allow_domains` is specified.
path_focus (bool): If specified, only links under the paths of the input seed URLs will be extracted.
allow_mime_types (Collection[str]): A collection of MIME types to allow during the crawl. Default is a collection containing "application/pdf", "text/html", "text/markdown", and "text/plain".
disallow_mime_types (Collection[str]): A collection of MIME types to disallow during the crawl. Default is an empty collection.
@@ -198,6 +203,7 @@ def on_completed(result: Any):
user_agent,
headers,
allow_domains,
subdomain_focus,
path_focus,
allow_mime_types,
disallow_mime_types,
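For reference, a minimal usage sketch of the new `subdomain_focus` flag. It assumes, as the docstrings above suggest, that `crawl()` takes the seed URLs and an `on_downloaded` callback as its leading arguments; the callback parameters shown here are illustrative, not taken from this diff.

```python
# Minimal sketch: crawl a blog subdomain without following links to sibling
# subdomains. Assumption: crawl() accepts the seed URLs and a downloaded-page
# callback as its leading arguments; the callback signature is illustrative.
from dpk_connector.core.crawler import crawl


def on_downloaded(url: str, body: bytes, headers: dict) -> None:
    # Illustrative callback: just report what was fetched.
    print(f"Downloaded {len(body)} bytes from {url}")


crawl(
    ["https://blog.example.com/"],
    on_downloaded,
    subdomain_focus=True,  # restrict the crawl to blog.example.com instead of
                           # the default eTLD+1 (example.com)
)
```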
20 changes: 14 additions & 6 deletions data-connector-lib/src/dpk_connector/core/spiders/sitemap.py
@@ -28,6 +28,7 @@
get_content_type,
get_etld1,
get_focus_path,
get_fqdn,
is_allowed_path,
urlparse_cached,
)
@@ -42,6 +43,7 @@ def __init__(
self,
seed_urls: Collection[str],
allow_domains: Collection[str] = (),
subdomain_focus: bool = False,
path_focus: bool = False,
allow_mime_types: Collection[str] = (),
disallow_mime_types: Collection[str] = (),
@@ -88,11 +90,15 @@ def __init__(
self.focus_paths.add(path)

# Domains and mime types filtering
self.allowed_domains = set(
allow_domains
if len(allow_domains) > 0
else [get_etld1(url) for url in seed_urls]
)
if allow_domains:
self.allowed_domains = set(allow_domains)
elif subdomain_focus:
self.allowed_domains = set()
for url in seed_urls:
if fqdn := get_fqdn(url):
self.allowed_domains.add(fqdn)
else:
self.allowed_domains = set(get_etld1(url) for url in seed_urls)
self.allow_mime_types = set(
[m.lower() for m in allow_mime_types] if len(allow_mime_types) > 0 else ()
)
@@ -155,7 +161,9 @@ def start_requests(self):
)

def _parse_sitemap(self, response: Response):
yield ConnectorItem(dropped=False, downloaded=False, system_request=True, sitemap=True)
yield ConnectorItem(
dropped=False, downloaded=False, system_request=True, sitemap=True
)

seed_url = response.meta["seed_url"]

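The effect of the new domain-filtering branches can be seen by constructing the spider directly, mirroring `test_init_subdomain_focus` further down in this diff. The `example.org` allow list in the last case is an illustrative assumption, not a value from the tests.

```python
# Sketch of how allowed_domains is derived for a pair of subdomain seeds.
from dpk_connector.core.spiders.sitemap import BaseSitemapSpider

seeds = ("http://blog.example.com/", "http://contents.example.com/")

# Default: the registered domain (eTLD+1) of each seed.
assert BaseSitemapSpider(seed_urls=seeds).allowed_domains == {"example.com"}

# subdomain_focus=True: keep the full host of each seed.
assert BaseSitemapSpider(seed_urls=seeds, subdomain_focus=True).allowed_domains == {
    "blog.example.com",
    "contents.example.com",
}

# An explicit allow_domains list takes precedence; subdomain_focus is ignored.
# (The example.org value here is an illustrative assumption.)
assert BaseSitemapSpider(
    seed_urls=seeds, allow_domains=("example.org",), subdomain_focus=True
).allowed_domains == {"example.org"}
```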
5 changes: 5 additions & 0 deletions data-connector-lib/src/dpk_connector/core/utils.py
@@ -57,6 +57,11 @@ def get_etld1(url: str) -> str:
return f"{ext.domain}.{ext.suffix}"


def get_fqdn(url: str) -> str:
ext = tldextract.extract(url)
return ext.fqdn


def get_focus_path(url: str) -> str | None:
parts = urlparse_cached(url)
if len(parts.path.split("/")) > 2:
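For a quick comparison of the new helper with `get_etld1`: `tldextract` distinguishes the full host from the registered domain. The URLs below are the ones exercised in the tests added later in this diff.

```python
# get_fqdn keeps the full host; get_etld1 keeps only the registered domain.
import tldextract

ext = tldextract.extract("http://www.sub.example.com:8080/")
print(ext.fqdn)                      # www.sub.example.com  (what get_fqdn returns)
print(f"{ext.domain}.{ext.suffix}")  # example.com          (what get_etld1 returns)

# Hosts without a public suffix have an empty fqdn.
print(tldextract.extract("http://localhost:8080/").fqdn)  # ""
```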
15 changes: 15 additions & 0 deletions data-connector-lib/test/dpk_connector/core/test_sitemap_spider.py
@@ -33,6 +33,21 @@ def crawler() -> Crawler:
return crawler


def test_init_subdomain_focus():
spider = BaseSitemapSpider(
seed_urls=(
"http://blog.example.com/",
"http://contents.example.com/",
),
subdomain_focus=True,
)
assert spider.seed_urls == {
"http://blog.example.com/",
"http://contents.example.com/",
}
assert spider.allowed_domains == {"blog.example.com", "contents.example.com"}


def test_init_path_focus():
spider = BaseSitemapSpider(
seed_urls=(
16 changes: 16 additions & 0 deletions data-connector-lib/test/dpk_connector/core/test_utils.py
@@ -19,6 +19,7 @@
get_content_type,
get_etld1,
get_focus_path,
get_fqdn,
get_header_value,
get_mime_type,
is_allowed_path,
@@ -93,6 +94,21 @@ def test_get_etld1(url: str, expected: str):
assert get_etld1(url) == expected


@pytest.mark.parametrize(
"url,expected",
[
("http://www.example.com", "www.example.com"),
("https://www.example.co.uk", "www.example.co.uk"),
("http://www.example.com/path?query=string#fragment", "www.example.com"),
("http://localhost:8080/", ""),
("http://www.example.com:8080/", "www.example.com"),
("http://www.sub.example.com:8080/", "www.sub.example.com"),
],
)
def test_get_fqdn(url: str, expected: str):
assert get_fqdn(url) == expected


@pytest.mark.parametrize(
"url,expected",
[
10 changes: 10 additions & 0 deletions examples/notebooks/intro/.gitignore
@@ -0,0 +1,10 @@
output*/

## File system artifacts
.directory
.DS_Store


## Python output
__pycache__
.ipynb_checkpoints/
36 changes: 36 additions & 0 deletions examples/notebooks/intro/README.md
@@ -0,0 +1,36 @@
# Data Prep Kit Introduction

This example showcases some of the features of Data Prep Kit.

## Running the code

The code can be run in either of two ways:

1. Google Colab: very easy to run; no local setup needed.
2. Your local Python environment. A quick guide follows; instructions for the latest version are [here](../../../README.md#-getting-started).

```bash
conda create -n data-prep-kit -y python=3.11
conda activate data-prep-kit

# install the following in 'data-prep-kit' environment
pip3 install data-prep-toolkit==0.2.1
pip3 install data-prep-toolkit-transforms==0.2.1
pip3 install data-prep-toolkit-transforms-ray==0.2.1
pip3 install jupyterlab ipykernel ipywidgets

## install custom kernel
## Important: Use this kernel when running example notebooks!
python -m ipykernel install --user --name=data-prep-kit --display-name "dataprepkit"

# start jupyter and run the notebooks with this jupyter
jupyter lab
```

## Intro

This notebook demonstrates processing PDFs:

`PDFs ---> text ---> chunks ---> exact dedupe ---> fuzzy dedupe ---> embeddings`

[Python version](dpk_intro_1_python.ipynb) | [Ray version](dpk_intro_1_ray.ipynb)