diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..357913c8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,20 @@ +venv +*.egg-info +dist +data +docs +integrations +notebooks +scripts +.gitattributes +.gitignore +config.override.yml +build +.pytest_cache +.github +.idea +*.iml +examples +Dockerfile +coverage.xml +.coverage diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c7f8fad6..aa9db2be 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,101 +1,241 @@ -name: "CI" +name: CI on: push: jobs: - code-format: - name: "🔍 Python code format" + python-build: + name: 🏗️ Build Python wheels + strategy: + matrix: + python: + - '3.10' runs-on: ubuntu-latest steps: - - name: "📥 Check-out" - uses: actions/checkout@v4 - - name: "🧰 Install Python" - uses: actions/setup-python@v5 - with: - python-version: "3.10" - cache: pip - cache-dependency-path: pyproject.toml - - name: "🧰 Install Protoc" - run: sudo apt install protobuf-compiler - - name: "🧰 Install dependencies" - run: | - python -m pip install --upgrade pip - pip install .
- pip install .[tests] - - name: "🔍 Check Python code format" - run: flake8 archive_query_log/ - lint: - name: "🔍 Python Lint" + - name: 📥 Check-out + uses: actions/checkout@v3 + - name: 🧰 Install Protoc + run: sudo apt install protobuf-compiler + - name: 🧰 Install Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: pyproject.toml + - name: 🧰 Install dependencies + run: pip install build twine + - name: 🏗️ Build Python wheels + run: python -m build + - name: 🧪 Check package bundles + run: twine check dist/* + - name: 📤 Upload Python wheels + uses: actions/upload-artifact@v3 + if: matrix.python == '3.10' + with: + name: wheels + path: dist + python-code-check: + name: 🔍 Check Python code + strategy: + matrix: + python: + - '3.10' runs-on: ubuntu-latest steps: - - name: "📥 Check-out" - uses: actions/checkout@v4 - - name: "🧰 Install Python" - uses: actions/setup-python@v5 - with: - python-version: "3.10" - cache: pip - cache-dependency-path: pyproject.toml - - name: "🧰 Install Protoc" - run: sudo apt install protobuf-compiler - - name: "🧰 Install dependencies" - run: | - python -m pip install --upgrade pip - pip install . - pip install .[tests] - - name: "🔍 Lint Python code" - run: pylint -E archive_query_log - unit-tests: - name: "🧪 Python unit tests" + - name: 📥 Check-out + uses: actions/checkout@v3 + - name: 🧰 Install Protoc + run: sudo apt install protobuf-compiler + - name: 🧰 Install Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: pyproject.toml + - name: 🧰 Install dependencies + run: pip install .[tests] + - name: 🔍 Check Python code + run: ruff .
+ python-typing: + name: 🔍 Check Python static typing + strategy: + matrix: + python: + - '3.10' runs-on: ubuntu-latest steps: - - name: "📥 Check-out" - uses: actions/checkout@v4 - - name: "🧰 Install Python" - uses: actions/setup-python@v5 - with: - python-version: "3.10" - cache: pip - cache-dependency-path: pyproject.toml - - name: "🧰 Install Protoc" - run: sudo apt install protobuf-compiler - - name: "🧰 Install dependencies" - run: | - python -m pip install --upgrade pip - pip install . - pip install .[tests] - - name: "🧪 Test Python code" - run: pytest --cov=./ --cov-report=xml --capture=no archive_query_log/ - - name: "📤 Upload coverage to Codecov" - uses: codecov/codecov-action@v4 - with: - token: ${{ secrets.CODECOV_TOKEN }} - release: - name: "🚀 Create GitHub release" + - name: 📥 Check-out + uses: actions/checkout@v3 + - name: 🧰 Install Protoc + run: sudo apt install protobuf-compiler + - name: 🧰 Install Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: pyproject.toml + - name: 🧰 Install dependencies + run: pip install .[tests] + - name: 🔍 Check Python static typing + run: mypy . + python-security: + name: 🔍 Check Python code security + strategy: + matrix: + python: + - '3.10' + runs-on: ubuntu-latest + steps: + - name: 📥 Check-out + uses: actions/checkout@v3 + - name: 🧰 Install Protoc + run: sudo apt install protobuf-compiler + - name: 🧰 Install Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: pyproject.toml + - name: 🧰 Install dependencies + run: pip install .[tests] + - name: 🔍 Check Python code security + run: bandit -c pyproject.toml -r .
+ python-test: + name: 🧪 Test Python code + strategy: + matrix: + python: + - '3.10' + runs-on: ubuntu-latest + steps: + - name: 📥 Check-out + uses: actions/checkout@v3 + - name: 🧰 Install Protoc + run: sudo apt install protobuf-compiler + - name: 🧰 Install Python + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python }} + cache: pip + cache-dependency-path: pyproject.toml + - name: 🧰 Install dependencies + run: pip install .[tests] + - name: 🧪 Test Python code + run: pytest --cov --cov-report=xml archive_query_log + - name: 📤 Upload coverage to Codecov + uses: codecov/codecov-action@v3 + if: matrix.python == '3.10' + with: + fail_ci_if_error: true + token: ${{ secrets.CODECOV_TOKEN }} + docker-build: + name: 🏗️ Build Docker image + runs-on: ubuntu-latest + steps: + - name: 📥 Check-out + uses: actions/checkout@v3 + - name: 🧰 Set up QEMU + uses: docker/setup-qemu-action@v2 + - name: 🧰 Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: 🏗️ Build Docker image + uses: docker/build-push-action@v4 + with: + context: .
+ push: false + cache-from: type=gha + cache-to: type=gha,mode=max + python-publish: + name: 🚀 Publish Python wheels + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') + needs: + - python-build + - python-code-check + - python-typing + - python-security + - python-test + - docker-build + runs-on: ubuntu-latest + permissions: + id-token: write + steps: + - name: 📥 Check-out + uses: actions/checkout@v3 + - name: 📥 Download Python wheels + uses: actions/download-artifact@v3 + with: + name: wheels + path: dist + - name: 🚀 Publish Python wheels + uses: pypa/gh-action-pypi-publish@release/v1 + docker-publish: + name: 🚀 Publish Docker image + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') + permissions: + packages: write + needs: + - python-build + - python-code-check + - python-typing + - python-security + - python-test + - docker-build + runs-on: ubuntu-latest + steps: + - name: 📥 Check-out + uses: actions/checkout@v3 + - name: 🧰 Set up QEMU + uses: docker/setup-qemu-action@v2 + - name: 🧰 Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: 🔑 Login to GitHub Packages + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: ℹ️ Extract image metadata + id: meta + uses: docker/metadata-action@v4 + with: + images: | + ghcr.io/${{ github.repository }} + - name: 🚀 Build and push image + uses: docker/build-push-action@v4 + with: + context: .
+ push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + github-release: + name: 🚀 Create GitHub release if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags') needs: - - code-format - - lint - - unit-tests + - python-build + - python-code-check + - python-typing + - python-security + - python-test + - docker-build + permissions: + contents: write runs-on: ubuntu-latest steps: - - name: "📥 Check-out" - uses: actions/checkout@v4 - - name: "🏷️ Get version tag" - id: get-version - run: echo ::set-output name=tag::${GITHUB_REF/refs\/tags\//} - - name: "📥 Download Python wheel" - uses: actions/upload-artifact@v4 - with: - name: wheel - path: dist/* - - name: "🚀 Create GitHub release" - uses: softprops/action-gh-release@v1 - with: - name: Release ${{ steps.get-version.outputs.tag }} - files: dist/* - fail_on_unmatched_files: true - draft: false - prerelease: false - generate_release_notes: true \ No newline at end of file + - name: 📥 Check-out + uses: actions/checkout@v3 + - name: 📥 Download Python wheels + uses: actions/download-artifact@v3 + with: + name: wheels + path: dist + - name: 🚀 Create GitHub release + uses: softprops/action-gh-release@v1 + with: + name: Release ${{ github.ref_name }} + files: dist/* + fail_on_unmatched_files: true + draft: false + prerelease: false + generate_release_notes: true diff --git a/.gitignore b/.gitignore index 8bd84a93..745c1e5a 100644 --- a/.gitignore +++ b/.gitignore @@ -498,5 +498,5 @@ cython_debug/ # Custom exclusions *.lop -reimer*.pdf -cd-cover.pdf +.idea/kubernetes-settings.xml +config.override.yml diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml new file mode 100644 index 00000000..6b9ec1b6 --- /dev/null +++ b/.idea/dataSources.xml @@ -0,0 +1,23 @@ + + + + + sqlite.xerial + true + org.sqlite.JDBC + jdbc:sqlite:$PROJECT_DIR$/data/stats-cache/cache.db + + + + $ProjectFileDir$ + + +
file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.39.2/sqlite-jdbc-3.39.2.jar + + + file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.34.0/sqlite-jdbc-3.34.0.jar + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 00000000..8affd009 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,17 @@ + + + + \ No newline at end of file diff --git a/.idea/jsLibraryMappings.xml b/.idea/jsLibraryMappings.xml new file mode 100644 index 00000000..f54474e3 --- /dev/null +++ b/.idea/jsLibraryMappings.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..8cb97236 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,8 @@ + + + + + + + \ No newline at end of file diff --git a/.markdownlint.yml b/.markdownlint.yml new file mode 100644 index 00000000..ce696e1f --- /dev/null +++ b/.markdownlint.yml @@ -0,0 +1,3 @@ +default: true +MD024: false +MD013: false diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..55816d1a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,23 @@ +FROM python:3.10-slim + +RUN --mount=type=cache,target=/var/cache/apt \ + apt-get -y update && \ + apt-get -y install git build-essential zlib1g-dev protobuf-compiler + +RUN --mount=type=cache,target=/root/.cache/pip \ + ([ -d /venv ] || python3.10 -m venv /venv) && \ + /venv/bin/pip install --upgrade pip + +WORKDIR /workspace/ +ADD pyproject.toml pyproject.toml +ARG PSEUDO_VERSION=1 +RUN --mount=type=cache,target=/root/.cache/pip \ + SETUPTOOLS_SCM_PRETEND_VERSION=${PSEUDO_VERSION} \ + /venv/bin/pip install -e . +RUN --mount=source=.git,target=.git,type=bind \ + --mount=type=cache,target=/root/.cache/pip \ + /venv/bin/pip install -e . + +ADD . . 
+ +ENTRYPOINT ["/venv/bin/python", "-m", "archive_query_log"] diff --git a/README.md b/README.md index 8aabfeb4..5bb642f5 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@ -[![CI](https://img.shields.io/github/actions/workflow/status/webis-de/archive-query-log/ci.yml?branch=main&style=flat-square)](https://github.com/webis-de/archive-query-log/actions/workflows/ci.yml) -[![Code coverage](https://img.shields.io/codecov/c/github/webis-de/archive-query-log?style=flat-square)](https://codecov.io/github/webis-de/archive-query-log/) +[![Paper DOI](https://img.shields.io/badge/DOI-10.1145%2F3539618.3591890-blue?style=flat-square)](https://doi.org/10.1145/3539618.3591890) [![arXiv preprint](https://img.shields.io/badge/arXiv-2304.00413-blue?style=flat-square)](https://arxiv.org/abs/2304.00413) -[![Papers with Code](https://img.shields.io/badge/papers%20with%20code-AQL--22-blue?style=flat-square)](https://paperswithcode.com/paper/the-archive-query-log-mining-millions-of) +[![Papers with Code](https://img.shields.io/badge/papers%20with%20code-AQL--22-blue?style=flat-square)](https://paperswithcode.com/paper/the-archive-query-log-mining-millions-of) +[![CI status](https://img.shields.io/github/actions/workflow/status/webis-de/archive-query-log/ci.yml?branch=main&style=flat-square)](https://github.com/webis-de/archive-query-log/actions/workflows/ci.yml) +[![Code coverage](https://img.shields.io/codecov/c/github/webis-de/archive-query-log?style=flat-square)](https://codecov.io/github/webis-de/archive-query-log/) +[![Maintenance](https://img.shields.io/maintenance/yes/2024?style=flat-square)](https://github.com/webis-de/archive-query-log/graphs/contributors) [![Issues](https://img.shields.io/github/issues/webis-de/archive-query-log?style=flat-square)](https://github.com/webis-de/archive-query-log/issues) +[![Pull requests](https://img.shields.io/github/issues-pr/webis-de/archive-query-log?style=flat-square)](https://github.com/webis-de/archive-query-log/pulls) [![Commit 
activity](https://img.shields.io/github/commit-activity/m/webis-de/archive-query-log?style=flat-square)](https://github.com/webis-de/archive-query-log/commits) [![License](https://img.shields.io/github/license/webis-de/archive-query-log?style=flat-square)](LICENSE) @@ -12,13 +15,12 @@ Mining Millions of Search Result Pages of Hundreds of Search Engines from 25&nbs [![Queries TSNE](docs/queries-tsne-teaser.png)](docs/queries-tsne.png) -Start now by running [your custom analysis/experiment](#integrations), scraping [your own query log](#tldr), or just look at [our example files](data/examples). +Start now by running [your custom analysis/experiment](#integrations), scraping [your query log](#crawling), or looking at [our example files](data/examples). ## Contents - [Integrations](#integrations) -- [Installation](#installation) -- [Usage](#tldr) +- [Crawling](#crawling) - [Development](#development) - [Contribute](#contribute) - [Abstract](#abstract) @@ -27,312 +29,340 @@ Start now by running [your custom analysis/experiment](#integrations), scraping ### Running Experiments on the AQL -The data in the Archive Query Log is highly sensitive (still, you can [re-crawl everything from the Wayback Machine](#usage)). For that reason, we ensure that custom experiments or analyises can not leak sensitive data (please [get in touch](#contribute) if you have questions) by using [TIRA](https://tira.io) as a platform for custom analyses/experiments. In TIRA, you submit a Docker image that implements your experiment. Your software is then executed in sandboxed mode (without internet connection) to ensure that your software does not leak sensitive information. After your software execution finished, administrators will review your submission and unblind it so that you can access the outputs. -Please refer to our [dedicated TIRA tutorial](integrations/tira/) as starting point for your experiments. 
+The data in the Archive Query Log is highly sensitive (still, you can [re-crawl everything from the Wayback Machine](#crawling)). For that reason, we ensure that custom experiments or analyses cannot leak sensitive data (please [get in touch](#contribute) if you have questions) by using [TIRA](https://tira.io) as a platform for custom analyses/experiments. In TIRA, you submit a Docker image that implements your experiment. Your software is then executed in sandboxed mode (without an internet connection) to ensure that your software does not leak sensitive information. After your software execution is finished, administrators will review your submission and unblind it so that you can access the outputs. +Please refer to our [dedicated TIRA tutorial](integrations/tira/README.md) as the starting point for your experiments. -## Installation +## Crawling -1. Install [Python 3.10](https://python.org/downloads/) -2. Create and activate virtual environment: - ```shell - python3.10 -m venv venv/ - source venv/bin/activate - ``` -4. Install dependencies: - ```shell - pip install -e . - ``` +To run the CLI and crawl a query log on your own machine, please refer to the [instructions for single-machine deployments](#single-machine-pypidocker). +If instead you want to scale up and run the crawling pipelines on a cluster, please refer to the [instructions for cluster deployments](#cluster-helmkubernetes). -## Usage +### Single-Machine (PyPi/Docker) -To quickly scrape a sample query log, jump to the [TL;DR](#tldr). +To run the Archive Query Log CLI on your machine, you can either use our [PyPi package](#installation-pypi) or the [Docker image](#installation-docker). +(If you absolutely need to, you can also install the [Python CLI](#installation-python-from-source) or the Docker image from source.) -If you want to learn more about each step here are some more detailed guides: +#### Installation (PyPi) -1. [Search providers](#1-search-providers) -2.
[Fetch archived URLs](#2-archived-urls) -3. [Parse archived query URLs](#3-archived-query-urls) -4. [Download archived raw SERPs](#4-archived-raw-serps) -5. [Parse archived SERPs](#5-archived-parsed-serps) +First, you need to install [Python 3.10](https://python.org/downloads/), the [Protobuf compiler](https://grpc.io/docs/protoc-installation/), and [pipx](https://pypa.github.io/pipx/installation/) (this allows you to install the AQL CLI in a virtual environment). Then, you can install the Archive Query Log CLI by running: -### TL;DR +```shell +pipx install archive-query-log +``` -Let's start with a small example and construct a query log for the [ChatNoir](https://chatnoir.eu) search engine: +Now you can run the Archive Query Log CLI by running: -1. `python -m archive_query_log make archived-urls chatnoir` -2. `python -m archive_query_log make archived-query-urls chatnoir` -3. `python -m archive_query_log make archived-raw-serps chatnoir` -4. `python -m archive_query_log make archived-parsed-serps chatnoir` +```shell +aql --help +``` -Got the idea? Now you're ready to scrape your own query logs! To scale things up and understand the data, just keep on reading. For more details on how to add more search providers, see [below](#contribute). +#### Installation (Python from source) +
-### 1. Search providers +First, install [Python 3.10](https://python.org/downloads/) and the [Protobuf compiler](https://grpc.io/docs/protoc-installation/) and then clone this repository. From inside the repository directory, create a virtual environment and activate it: -Manually or semi-automatically collect a list of search providers that you would like to scrape query logs from. +```shell +python3.10 -m venv venv/ +source venv/bin/activate +``` -The list of search providers should be stored in a single [YAML][yaml-spec] file at [`data/selected-services.yaml`](data/selected-services.yaml) and contain one entry per search provider, like shown below: +Install the Archive Query Log by running: -```yaml -- name: string # search providers name (alexa_domain - alexa_public_suffix) - public_suffix: string # public suffix (https://publicsuffix.org/) of alexa_domain - alexa_domain: string # domain as it appears in Alexa top-1M ranks - alexa_rank: int # rank from fused Alexa top-1M rankings - category: string # manual annotation - notes: string # manual annotation - input_field: bool # manual annotation - search_form: bool # manual annotation - search_div: bool # manual annotation - domains: # known domains of the search providers (including the main domain) - - string - - string - - ... - query_parsers: # query parsers in order of precedence - - pattern: regex - type: query_parameter # for URLs like https://example.com/search?q=foo - parameter: string - - pattern: regex - type: fragment_parameter # for URLs like https://example.com/search#q=foo - parameter: string - - pattern: regex - type: query_parameter # for URLs like https://example.com/search/foo - path_prefix: string - - ... - page_parsers: # page number parsers in order of precedence - - pattern: regex - type: query_parameter # for URLs like https://example.com/search?page=2 - parameter: string - - ... 
- offset_parsers: # page offset parsers in order of precedence - - pattern: regex - type: query_parameter # for URLs like https://example.com/search?start=11 - parameter: string - - ... - interpreted_query_parsers: # interpreted query parsers in order of precedence - - ... - results_parsers: # search result and snippet parsers in order of precedence - - ... -- ... +```shell +pip install -e . ``` -In the source code, a search provider corresponds to the Python class [`Service`](archive_query_log/model/__init__.py). +Now you can run the Archive Query Log CLI by running: -### 2. Archived URLs +```shell +aql --help +``` -Fetch all archived URLs for a search provider from the Internet Archive's Wayback Machine. +
-You can run this step with the following command line, where `` is the name of the search provider you want to fetch archived URLs from: +#### Installation (Docker) -```shell: -python -m archive_query_log make archived-urls +You only need to install [Docker](https://docs.docker.com/get-docker/). + +**Note:** The commands below use the syntax of the [PyPi installation](#installation-pypi). To run the same commands with the Docker installation, replace `aql` with `docker run -it -v "$(pwd)"/config.override.yml:/workspace/config.override.yml ghcr.io/webis-de/archive-query-log`, for example: + +```shell +docker run -it -v "$(pwd)"/config.override.yml:/workspace/config.override.yml ghcr.io/webis-de/archive-query-log --help ``` -This will create multiple files in the `archived-urls` subdirectory under the [data directory](#pro-tip--specify-a-custom-data-directory), based on the search provider's name (``), domain (``), and the Wayback Machine's CDX [page number][cdx-pagination] (``) from which the URLs were originally fetched: +#### Installation (Docker from source) +
+ +First, install [Docker](https://docs.docker.com/get-docker/) and clone this repository. From inside the repository directory, build the Docker image like this: + +```shell +docker build -t aql . ``` -/archived-urls///.jsonl.gz + +**Note:** The commands below use the syntax of the [PyPi installation](#installation-pypi). To run the same commands with the Docker installation, replace `aql` with `docker run -it -v "$(pwd)"/config.override.yml:/workspace/config.override.yml aql`, for example: + +```shell +docker run -it -v "$(pwd)"/config.override.yml:/workspace/config.override.yml aql --help ``` -Here, the `` is a 10-digit number with leading zeros, e.g., `0000000001`. +
-Each individual file is a GZIP-compressed [JSONL][jsonl-spec] file with one archived URL per line, in arbitrary order. Each line contains the following fields: +#### Configuration -```json -{ - "url": "string", - // archived URL - "timestamp": "int" - // archive timestamp as POSIX integer -} +Crawling the Archive Query Log requires access to an Elasticsearch cluster and some S3 block storage. To configure access to the Elasticsearch cluster and S3, add a `config.override.yml` file in the current directory with the following contents. Replace the placeholders with your actual credentials: + +```yaml +es: + host: "" + port: 9200 + username: "" + password: "" +s3: + endpoint_url: "" + bucket_name: archive-query-log + access_key: "" + secret_key: "" ``` -In the source code, an archived URL corresponds to the Python class [`ArchivedUrl`](archive_query_log/model/__init__.py). +#### Toy Example: Crawl ChatNoir SERPs from the Wayback Machine + +The crawling pipeline of the Archive Query Log can best be understood by looking at a small toy example. Here, we want to crawl and parse SERPs of the [ChatNoir search engine](https://chatnoir.eu) from the [Wayback Machine](https://web.archive.org). -### 3. Archived Query URLs +> TODO: Add example instructions. -Parse and filter archived URLs that contain a query and may point to a search engine result page (SERP). 
+#### Add an archive service -You can run this step with the following command line, where `` is the name of the search provider you want to parse query URLs from: +Add new web archive services (e.g., the [Wayback Machine](https://web.archive.org)) to the AQL by running: -```shell: -python -m archive_query_log make archived-query-urls +```shell +aql archives add ``` -This will create multiple files in the `archived-query-urls` subdirectory under the [data directory](#pro-tip--specify-a-custom-data-directory), based on the search provider's name (``), domain (``), and the Wayback Machine's CDX [page number][cdx-pagination] (``) from which the URLs were originally fetched: +We maintain a list of compatible web archives [below](#compatible-archives). -``` -/archived-query-urls///.jsonl.gz +##### Compatible archives + +The web archives below are known to be compatible with the Archive Query Log crawler and can be used to mine SERPs. + + + +| Name | CDX API URL | Memento API URL | +|:--|:--|:--| +| [Wayback Machine](https://web.archive.org) | | | + +#### Add a search provider + +Add new search providers (e.g., [Google](https://google.com)) to the AQL by running: + +```shell +aql providers add ``` -Here, the `` is a 10-digit number with leading zeros, e.g., `0000000001`. - -Each individual file is a GZIP-compressed [JSONL][jsonl-spec] file with one archived query URL per line, in arbitrary order. Each line contains the following fields: - -```json -{ - "url": "string", - // archived URL - "timestamp": "int", - // archive timestamp as POSIX integer - "query": "string", - // parsed query - "page": "int", - // result page number (optional) - "offset": "int" - // result page offset (optional) -} +A search provider can be any website that offers some search functionality. Ideally, you should also look at common prefixes of the URLs of the search results pages (e.g., `/search` for Google). 
Narrowing down URL prefixes helps to avoid crawling too many captures that do not contain search results. + +Refer to the [import instructions below](#import) to import providers from the AQL-22 YAML file format. + +#### Build source pairs + +Once you have added at least one [archive](#add-an-archive-service) and one [search provider](#add-a-search-provider), we want to crawl archived captures of SERPs for each search provider and for each archive service. That is, we compute the cross-product of archives and the search providers' domains and URL prefixes (roughly: archive×provider). Start building source pairs (i.e., archive–provider pairs) by running: + +```shell +aql sources build ``` -In the source code, an archived query URL corresponds to the Python class [`ArchivedQueryUrl`](archive_query_log/model/__init__.py). +Running the command again after adding more archives or providers will automatically create the missing source pairs. -### 4. Archived Raw SERPs +#### Fetch captures -Download the raw HTML content of archived search engine result pages (SERPs). +For each [source pair](#build-source-pairs), we now fetch captures from the archive service that corresponds to the provider's domain and URL prefix given in the source pair. Again, rerunning the command after adding more source pairs fetches just the missing captures. -You can run this step with the following command line, where `` is the name of the search provider you want to download raw SERP HTML contents from: +#### Parse SERP URLs + +Not every capture necessarily points to a search engine result page (SERP). But usually, SERPs contain the user query in the URL, so we can filter out non-SERP captures by parsing the URLs.
+ +```shell +aql serps parse url-query -```shell: -python -m archive_query_log make archived-raw-serps ``` -This will create multiple files in the `archived-urls` subdirectory under the [data directory](#pro-tip--specify-a-custom-data-directory), based on the search provider's name (``), domain (``), and the Wayback Machine's CDX [page number][cdx-pagination] (``) from which the URLs were originally fetched. Archived raw SERPs are stored as 1GB-sized WARC chunk files, that is, WARC chunks are "filled" sequentially up to a size of 1GB each. If a chunk is full, a new chunk is created. +Parsing the query from the capture URL will add SERPs to a new, more focused index that only contains SERPs. From the SERPs, we can also parse the page number and offset of the SERP, if available. +```shell +aql serps parse url-page +aql serps parse url-offset ``` -/archived-raw-serps////.jsonl.gz + +All the above commands can be run in parallel, and they can be run multiple times to update the SERP index. Already parsed SERPs will be skipped. + +#### Download SERP WARCs + +Up to this point, we have only fetched the metadata of the captures, most prominently the URL. However, the snippets of the SERPs are not contained in the metadata but only on the web page. So, we need to download the actual web pages from the archive service. + +```shell +aql serps download warc ``` -Here, the `` and `` are both 10-digit numbers with leading zeros, e.g., `0000000001`. - -Each individual file is a GZIP-compressed [WARC][warc-spec] file with one WARC request and one WARC response per archived raw SERP. WARC records are arbitrarily ordered within or across chunks, but the WARC request and response for the same archived query URL are kept together. 
The archived query URL is stored in the WARC request's and response's `Archived-URL` field in [JSONL][jsonl-spec] format (the same format as in the previous step): - -```json -{ - "url": "string", - // archived URL - "timestamp": "int", - // archive timestamp as POSIX integer - "query": "string", - // parsed query - "page": "int", - // result page number (optional) - "offset": "int" - // result page offset (optional) -} +This command will download the contents of each SERP to a WARC file that is stored in the configured S3 bucket. A pointer to the WARC file is stored in the SERP index so that we can quickly access a specific SERP's contents later. + +#### Parsing SERP WARCs + +From the WARC, we can again parse the query as it appears on the SERP. + +```shell +aql serps parse serp-query ``` -In the source code, an archived raw SERP corresponds to the Python class [`ArchivedRawSerp`](archive_query_log/model/__init__.py). +More importantly, we can parse the snippets of the SERP. + +```shell +aql serps parse serp-snippets +``` -### 5. Archived Parsed SERPs +Parsing the snippets from the SERP's WARC contents will also add the SERP's results to a new index. -Parse and filter archived SERPs from raw contents. +#### Download SERP snippet WARCs -You can run this step with the following command line, where `` is the name of the search provider you want to parse SERPs from: +To get the full text of each referenced result from the SERP, we need to download a capture of the result from the web archive. Intuitively, we would like to download a capture of the result at the exact same time as the SERP was captured. But often, web archives crawl the results later or not at all. Therefore, the implementation searches for the nearest captures before and after the SERP's timestamp and downloads these two captures for each result, if any can be found. 
-```shell: -python -m archive_query_log make archived-parsed-serps +```shell +aql results download warc ``` -This will create multiple files in the `archived-serps` subdirectory under the [data directory](#pro-tip--specify-a-custom-data-directory), based on the search provider's name (``), domain (``), and the Wayback Machine's CDX [page number][cdx-pagination] (``) from which the URLs were originally fetched: +This command will again download the result's contents to a WARC file that is stored in the configured S3 bucket. A pointer to the WARC file is stored in the result index for random access to the contents of a specific result. + +### Import +We support automatically importing providers and parsers from the AQL-22 YAML-file format (see [`data/selected-services.yaml`](data/selected-services.yaml)). To import the services and parsers from the AQL-22 YAML file, run the following commands: + +```shell +aql providers import +aql parsers url-query import +aql parsers url-page import +aql parsers url-offset import +aql parsers warc-query import +aql parsers warc-snippets import ``` -/archived-serps///.jsonl.gz + +We also support importing a previous crawl of captures from the AQL-22 file system backend: + +```shell +aql captures import aql-22 ``` -Here, the `` is a 10-digit number with leading zeros, e.g., `0000000001`. - -Each individual file is a GZIP-compressed [JSONL][jsonl-spec] file with one archived parsed SERP per line, in arbitrary order. Each line contains the following fields: - -```json -{ - "url": "string", - // archived URL - "timestamp": "int", - // archive timestamp as POSIX integer - "query": "string", - // parsed query - "page": "int", - // result page number (optional) - "offset": "int", - // result page offset (optional) - "interpreted_query": "string", - // query displayed on the SERP (e.g. 
with spelling correction; optional) - "results": [ - { - "url": "string", - // URL of the result - "title": "string", - // title of the result - "snippet": "string" - // snippet of the result (highlighting normalized to ) - }, - ... - ] -} +Last, we support importing all archives from the [Archive-It](https://archive-it.org/) web archive service: + +```shell +aql archives import archive-it ``` -In the source code, an archived parsed SERP corresponds to the Python class [`ArchivedParsedSerp`](archive_query_log/model/__init__.py). +### Cluster (Helm/Kubernetes) -### Pro Tip: Specify a Custom Data Directory +Running the Archive Query Log on a cluster is recommended for large-scale crawls. We provide a Helm chart that automatically starts crawling and parsing jobs for you and stores the results in an Elasticsearch cluster. -By default, the data directory is set to [`data/`](data). You can change this with the `--data-directory` option, e.g.: +#### Installation -```shell -python -m archive_query_log make archived-urls --data-directory /mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/ +Just install [Helm](https://helm.sh/docs/intro/quickstart/) and configure `kubectl` for your cluster. + +#### Configuration + +Crawling the Archive Query Log requires access to an Elasticsearch cluster and S3-compatible object storage. Configure the Elasticsearch and S3 credentials in a `values.override.yaml` file like this: + +```yaml +elasticsearch: + host: "" + port: 9200 + username: "" + password: "" +s3: + endpoint_url: "" + bucket_name: archive-query-log + access_key: "" + secret_key: "" ``` -### Pro Tip: Limit Scraping for Testing +#### Deployment -If the search provider you're scraping queries for is very large and has many domains, testing your settings on a smaller sample from that search provider can be helpful.
You can specify a single domain to scrape from like this: +Let us deploy the Helm chart on the cluster (we are testing first with `--dry-run` to see if everything works): ```shell -python -m archive_query_log make archived-urls +helm upgrade --install --values ./helm/values.override.yaml --dry-run archive-query-log ./helm ``` -If a domain is very popular and therefore has many archived URLs, -you can further limit the number of archived URLs to scrape by selecting -a [page](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#pagination-api) -from the Wayback Machine's -[CDX API](https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#pagination-api): +If everything works and the output looks good, you can remove the `--dry-run` flag to actually deploy the chart. + +#### Uninstall + +If you no longer need the chart, you can uninstall it: ```shell -python -m archive_query_log make archived-urls +helm uninstall archive-query-log ``` ## Citation -If you use the Archive Query Log dataset or the code to generate it in your research, please cite the following paper describing the AQL and its use-cases: +If you use the Archive Query Log dataset or the crawling code in your research, please cite the following paper describing the AQL and its use cases: -> TODO +> Jan Heinrich Reimer, Sebastian Schmidt, Maik Fröbe, Lukas Gienapp, Harrisen Scells, Benno Stein, Matthias Hagen, and Martin Potthast. [The Archive Query Log: Mining Millions of Search Result Pages of Hundreds of Search Engines from 25 Years of Web Archives.](https://webis.de/publications.html?q=archive#reimer_2023) In Hsin-Hsi Chen et al., editors, _46th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2023)_, pages 2848–2860, July 2023. ACM.
You can use the following BibTeX entry for citation: ```bibtex -% TODO +@InProceedings{reimer:2023, + author = {{Jan Heinrich} Reimer and Sebastian Schmidt and Maik Fr{\"o}be and Lukas Gienapp and Harrisen Scells and Benno Stein and Matthias Hagen and Martin Potthast}, + booktitle = {46th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR 2023)}, + doi = {10.1145/3539618.3591890}, + editor = {Hsin{-}Hsi Chen and Wei{-}Jou (Edward) Duh and Hen{-}Hsen Huang and Makoto P. Kato and Josiane Mothe and Barbara Poblete}, + ids = {potthast:2023u}, + isbn = {9781450394086}, + month = jul, + numpages = 13, + pages = {2848--2860}, + publisher = {ACM}, + site = {Taipei, Taiwan}, + title = {{The Archive Query Log: Mining Millions of Search Result Pages of Hundreds of Search Engines from 25 Years of Web Archives}}, + url = {https://dl.acm.org/doi/10.1145/3539618.3591890}, + year = 2023 +} ``` ## Development -Run tests: +Refer to the local [Python installation](#installation-python-from-source) instructions to set up the development environment and install the dependencies. + +Then, also install the test dependencies: + +```shell +pip install -e .[tests] +``` + +After having implemented a new feature, please check the code format, inspect common LINT errors, and run all unit tests with the following commands: + ```shell -flake8 archive_query_log -pylint -E archive_query_log -pytest archive_query_log +ruff . # Code format and LINT +mypy . # Static typing +bandit -c pyproject.toml -r . # Security +pytest . # Unit tests ``` -Add new tests for parsers: +### Add new tests for parsers + +At the moment, our workflow for adding new tests for parsers goes like this: 1. Select the number of tests to run per service and the number of services. -2. Auto-generate unit tests and download WARCs with [generate_tests.py](archive_query_log/results/test/generate_tests.py) +2. 
Auto-generate unit tests and download WARCs with [generate_tests.py](archive_query_log/legacy/results/test/generate_tests.py) 3. Run the tests. 4. Failing tests will open a diff editor with the approval and a web browser tab with the Wayback URL. -5. Use the web browser dev tools to find the query input field and search result CSS paths. +5. Use the web browser dev tools to find the query input field and the search result CSS paths. 6. Close diffs and tabs and re-run tests. ## Contribute -If you've found an important search provider to be missing from this query log, please suggest it by creating an [issue][repo-issues]. We also very gratefully accept [pull requests][repo-prs] for adding [search providers](#1-search-providers) or new parser configurations! +If you have found an important search provider missing from this query log, please suggest it by creating an [issue](https://github.com/webis-de/archive-query-log/issues). We also gratefully accept [pull requests](https://github.com/webis-de/archive-query-log/pulls) for adding search providers or new parser configurations! + +If you are unsure about anything, post an [issue](https://github.com/webis-de/archive-query-log/issues/new) or contact us: -If you're unsure about anything, post an [issue][repo-issues], or contact us: -- [heinrich.reimer@uni-jena.de](mailto:heinrich.reimer@uni-jena.de) +- [heinrich.merker@uni-jena.de](mailto:heinrich.merker@uni-jena.de) - [s.schmidt@uni-leipzig.de](mailto:s.schmidt@uni-leipzig.de) - [maik.froebe@uni-jena.de](mailto:maik.froebe@uni-jena.de) - [lukas.gienapp@uni-leipzig.de](mailto:lukas.gienapp@uni-leipzig.de) @@ -341,25 +371,12 @@ If you're unsure about anything, post an [issue][repo-issues], or contact us: - [matthias.hagen@uni-jena.de](mailto:matthias.hagen@uni-jena.de) - [martin.potthast@uni-leipzig.de](mailto:martin.potthast@uni-leipzig.de) -We're happy to help! +We are happy to help! ## License -This repository is released under the [MIT license](LICENSE). 
Files in the `data/` directory are exempt from this license. -If you use the AQL in your research, we'd be glad if you'd [cite us](#citation). +This repository is released under the [MIT license](LICENSE). Files in the `data/` directory are exempt from this license. If you use the AQL in your research, we would be glad if you could [cite us](#citation). ## Abstract The Archive Query Log (AQL) is a previously unused, comprehensive query log collected at the Internet Archive over the last 25 years. Its first version includes 356 million queries, 166 million search result pages, and 1.7 billion search results across 550 search providers. Although many query logs have been studied in the literature, the search providers that own them generally do not publish their logs to protect user privacy and vital business data. Of the few query logs publicly available, none combines size, scope, and diversity. The AQL is the first to do so, enabling research on new retrieval models and (diachronic) search engine analyses. Provided in a privacy-preserving manner, it promotes open research as well as more transparency and accountability in the search industry. 
- -[repo-issues]: https://git.webis.de/code-research/web-search/web-archive-query-log/-/issues - -[repo-prs]: https://git.webis.de/code-research/web-search/web-archive-query-log/-/merge_requests - -[cdx-pagination]: https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#pagination-api - -[warc-spec]: https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/ - -[jsonl-spec]: https://jsonlines.org/ - -[yaml-spec]: https://yaml.org/ diff --git a/archive_query_log/__init__.py b/archive_query_log/__init__.py index 2941cb21..a583e879 100644 --- a/archive_query_log/__init__.py +++ b/archive_query_log/__init__.py @@ -1,12 +1,3 @@ -from logging import getLogger -from pathlib import Path +from importlib_metadata import version -__version__ = "0.1.0" - -PROJECT_DIRECTORY_PATH = Path(__file__).parent.parent -DATA_DIRECTORY_PATH = PROJECT_DIRECTORY_PATH / "data" -# DATA_DIRECTORY_PATH = Path("/mnt/ceph/storage/TODO") - -CDX_API_URL = "https://web.archive.org/cdx/search/cdx" - -LOGGER = getLogger(__name__) +__version__ = version("archive-query-log") diff --git a/archive_query_log/__main__.py b/archive_query_log/__main__.py index b16ac59e..2328a1ce 100644 --- a/archive_query_log/__main__.py +++ b/archive_query_log/__main__.py @@ -1,4 +1,4 @@ -from archive_query_log.cli import main +from archive_query_log.cli import cli if __name__ == "__main__": - main() + cli() diff --git a/archive_query_log/api/main.py b/archive_query_log/api/main.py new file mode 100644 index 00000000..415baaca --- /dev/null +++ b/archive_query_log/api/main.py @@ -0,0 +1,393 @@ +from datetime import datetime +from pathlib import Path +from typing import Type + +from elasticsearch_dsl.query import Exists, Query, Term +from expiringdict import ExpiringDict +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +from mergedeep import Strategy, merge +from pydantic import BaseModel +from yaml import safe_load + +from 
archive_query_log.config import Config +from archive_query_log.orm import Archive, Provider, Source, Capture, \ + BaseDocument, Serp, Result, UrlQueryParser, UrlPageParser, \ + UrlOffsetParser, WarcQueryParser, WarcSnippetsParser + +_CACHE_SECONDS_STATISTICS = 60 * 5 # 5 minutes +_CACHE_SECONDS_PROGRESS = 60 * 10 # 10 minutes + +_DEFAULT_CONFIG_PATH = Path("../../config.yml") +_DEFAULT_CONFIG_OVERRIDE_PATH = Path("../../config.override.yml") +_DEFAULT_CONFIG_PATHS = [] +if _DEFAULT_CONFIG_PATH.exists(): + _DEFAULT_CONFIG_PATHS.append(_DEFAULT_CONFIG_PATH) +if _DEFAULT_CONFIG_OVERRIDE_PATH.exists(): + _DEFAULT_CONFIG_PATHS.append(_DEFAULT_CONFIG_OVERRIDE_PATH) + + +class Statistics(BaseModel): + name: str + description: str + total: int + disk_size: str | None + last_modified: datetime | None + + +class Progress(BaseModel): + input_name: str + output_name: str + description: str + total: int + current: int + + +DocumentType = Type[BaseDocument] + +_statistics_cache: dict[ + tuple[DocumentType, str], + Statistics, +] = ExpiringDict( + max_len=100, + max_age_seconds=_CACHE_SECONDS_STATISTICS, +) + + +def _convert_bytes(bytes_count: int) -> str: + step_unit = 1000.0 + bytes_count_decimal: float = bytes_count + for unit in ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "RB"]: + if bytes_count_decimal < step_unit: + return f"{bytes_count_decimal:3.1f}β€―{unit}" + bytes_count_decimal /= step_unit + return f"{bytes_count_decimal:3.1f}β€―QB" + + +def _get_statistics( + config: Config, + name: str, + description: str, + document: DocumentType, + filter_query: Query | None = None, +) -> Statistics: + key = (document, repr(filter_query)) + if key in _statistics_cache: + return _statistics_cache[key] + + document.index().refresh(using=config.es.client) + stats = document.index().stats(using=config.es.client) + search = document.search(using=config.es.client) + if filter_query is not None: + search = search.filter(filter_query) + total = search.count() + 
last_modified_response = ( + search + .query(Exists(field="last_modified")) + .sort("-last_modified") + .extra(size=1) + .execute() + ) + if last_modified_response.hits.total.value == 0: + last_modified = None + else: + last_modified = last_modified_response.hits[0].last_modified + + statistics = Statistics( + name=name, + description=description, + total=total, + disk_size=str( + _convert_bytes(stats["_all"]["total"]["store"]["size_in_bytes"]) + if filter_query is None else None + ), + last_modified=last_modified, + ) + _statistics_cache[key] = statistics + return statistics + + +_progress_cache: dict[ + tuple[DocumentType, str, str], + Progress, +] = ExpiringDict( + max_len=100, + max_age_seconds=_CACHE_SECONDS_PROGRESS, +) + + +def _get_processed_progress( + config: Config, + input_name: str, + output_name: str, + description: str, + document: DocumentType, + status_field: str, + filter_query: Query | None = None, +) -> Progress: + key = (document, repr(filter_query), status_field) + if key in _progress_cache: + return _progress_cache[key] + + document.index().refresh(using=config.es.client) + search = document.search(using=config.es.client) + if filter_query is not None: + search = search.filter(filter_query) + total = search.count() + search_processed = search.filter(Term(**{status_field: False})) + total_processed = search_processed.count() + progress = Progress( + input_name=input_name, + output_name=output_name, + description=description, + total=total, + current=total_processed, + ) + _progress_cache[key] = progress + return progress + +# how to start api in bash: "uvicorn main:app --reload" + +app = FastAPI() + +# CORS settings +origins = [ + "http://localhost:3000", +] + +app.add_middleware( + CORSMiddleware, + allow_origins=origins, + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + +@app.get("/statistics") +def get_statistics(): + + if len(_DEFAULT_CONFIG_PATHS ) == 0: + raise RuntimeError("No config file specified.") + 
config_dict: dict = {} + for config_path in _DEFAULT_CONFIG_PATHS : + with config_path.open("rb") as config_file: + next_config_dict = safe_load(config_file) + merge(config_dict, next_config_dict, strategy=Strategy.REPLACE) + config: Config = Config.from_dict(config_dict) + + statistics_list = [ + _get_statistics( + config=config, + name="Archives", + description="Web archiving services that offer CDX and Memento APIs.", + document=Archive, + ), + _get_statistics( + config=config, + name="Providers", + description="Search providers, i.e., websites that offer a search functionality.", + document=Provider, + ), + _get_statistics( + config=config, + name="Sources", + description="The cross product of all archives and the provider's domains and URL prefixes.", + document=Source, + ), + _get_statistics( + config=config, + name="Captures", + description="Captures matching from the archives that match domain and URL prefixes.", + document=Capture, + ), + _get_statistics( + config=config, + name="SERPs", + description="Search engine result pages that have been identified among the captures.", + document=Serp, + ), + _get_statistics( + config=config, + name="+ URL query", + description="SERPs for which the query has been parsed from the URL.", + document=Serp, + filter_query=Exists(field="url_query"), + ), + _get_statistics( + config=config, + name="+ URL page", + description="SERPs for which the page has been parsed from the URL.", + document=Serp, + filter_query=Exists(field="url_page"), + ), + _get_statistics( + config=config, + name="+ URL offset", + description="SERPs for which the offset has been parsed from the URL.", + document=Serp, + filter_query=Exists(field="url_offset"), + ), + _get_statistics( + config=config, + name="+ WARC", + description="SERPs for which the WARC has been downloaded.", + document=Serp, + filter_query=Exists(field="warc_location"), + ), + _get_statistics( + config=config, + name="+ WARC query", + description="SERPs for which the query has 
been parsed from the WARC.", + document=Serp, + filter_query=Exists(field="warc_query"), + ), + _get_statistics( + config=config, + name="+ WARC snippets", + description="SERPs for which the snippets have been parsed from the WARC.", + document=Serp, + filter_query=Exists(field="warc_snippets_parser.id"), + ), + _get_statistics( + config=config, + name="Results", + description="Search result from the SERPs.", + document=Result, + ), + _get_statistics( + config=config, + name="URL query parsers", + description="Parser to get the query from a SERP's URL.", + document=UrlQueryParser, + ), + _get_statistics( + config=config, + name="URL page parsers", + description="Parser to get the page from a SERP's URL.", + document=UrlPageParser, + ), + _get_statistics( + config=config, + name="URL offset parsers", + description="Parser to get the offset from a SERP's URL.", + document=UrlOffsetParser, + ), + _get_statistics( + config=config, + name="WARC query parsers", + description="Parser to get the query from a SERP's WARC contents.", + document=WarcQueryParser, + ), + _get_statistics( + config=config, + name="WARC snippets parsers", + description="Parser to get the snippets from a SERP's WARC contents.", + document=WarcSnippetsParser, + ), + ] + + return statistics_list + + +@app.get("/progress") +def get_progress(): + + if len(_DEFAULT_CONFIG_PATHS ) == 0: + raise RuntimeError("No config file specified.") + config_dict: dict = {} + for config_path in _DEFAULT_CONFIG_PATHS : + with config_path.open("rb") as config_file: + next_config_dict = safe_load(config_file) + merge(config_dict, next_config_dict, strategy=Strategy.REPLACE) + config: Config = Config.from_dict(config_dict) + + progress_list = [ + _get_processed_progress( + config=config, + input_name="Archives", + output_name="Sources", + description="Build sources for all archives.", + document=Archive, + filter_query=~Exists(field="exclusion_reason"), + status_field="should_build_sources", + ), + 
_get_processed_progress( + config=config, + input_name="Providers", + output_name="Sources", + description="Build sources for all search providers.", + document=Provider, + filter_query=~Exists(field="exclusion_reason"), + status_field="should_build_sources", + ), + _get_processed_progress( + config=config, + input_name="Sources", + output_name="Captures", + description="Fetch CDX captures for all domains and prefixes in the sources.", + document=Source, + status_field="should_fetch_captures", + ), + _get_processed_progress( + config=config, + input_name="Captures", + output_name="SERPs", + description="Parse queries from capture URLs.", + document=Capture, + status_field="url_query_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + output_name="SERPs", + description="Parse page from SERP URLs.", + document=Serp, + status_field="url_page_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + output_name="SERPs", + description="Parse offset from SERP URLs.", + document=Serp, + status_field="url_offset_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + output_name="SERPs", + description="Download WARCs.", + document=Serp, + filter_query=Term(capture__status_code=200), + status_field="warc_downloader.should_download", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + output_name="SERPs", + description="Parse query from WARC contents.", + document=Serp, + filter_query=Exists(field="warc_location"), + status_field="warc_query_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + output_name="SERPs", + description="Parse snippets from WARC contents.", + document=Serp, + filter_query=Exists(field="warc_location"), + status_field="warc_snippets_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="Results", + output_name="Results", + 
description="Download WARCs.", + document=Result, + filter_query=Exists(field="snippet.url"), + status_field="warc_downloader.should_download", + ), + ] + + return progress_list diff --git a/archive_query_log/archives/__init__.py b/archive_query_log/archives/__init__.py new file mode 100644 index 00000000..ed7724e8 --- /dev/null +++ b/archive_query_log/archives/__init__.py @@ -0,0 +1,81 @@ +from uuid import uuid4 + +from click import echo, prompt +from elasticsearch_dsl import Search +from elasticsearch_dsl.query import Term +from elasticsearch_dsl.response import Response + +from archive_query_log.config import Config +from archive_query_log.orm import Archive +from archive_query_log.utils.time import utc_now + + +def add_archive( + config: Config, + name: str | None, + description: str | None, + cdx_api_url: str, + memento_api_url: str, + priority: float | None, + no_merge: bool = False, + auto_merge: bool = False, +) -> None: + if priority is not None and priority <= 0: + raise ValueError("Priority must be strictly positive.") + Archive.index().refresh(using=config.es.client) + last_modified = utc_now() + should_build_sources = True + existing_archive_search: Search = ( + Archive.search(using=config.es.client) + .query( + Term(cdx_api_url=cdx_api_url) | + Term(memento_api_url=memento_api_url) + ) + ) + existing_archive_response: Response = existing_archive_search.execute() + if existing_archive_response.hits.total.value > 0: + if no_merge: + return + existing_archive: Archive = existing_archive_response[0] + archive_id = existing_archive.id + if auto_merge: + should_merge = True + else: + echo(f"Archive {archive_id} already exists with " + f"conflicting API endpoints.") + add_to_existing = prompt("Merge with existing archive? 
[y/N]", + type=str, default="n", show_default=False) + should_merge = add_to_existing.lower() == "y" + if not should_merge: + return + + if name is None: + name = existing_archive.name + if description is None: + description = existing_archive.description + if priority is None: + priority = existing_archive.priority + + if cdx_api_url == existing_archive.cdx_api_url and \ + memento_api_url == existing_archive.memento_api_url: + last_modified = existing_archive.last_modified + should_build_sources = existing_archive.should_build_sources + + if not auto_merge: + echo(f"Update archive {archive_id}.") + else: + archive_id = str(uuid4()) + if not no_merge and not auto_merge: + echo(f"Add new archive {archive_id}.") + + archive = Archive( + id=archive_id, + last_modified=last_modified, + name=name, + description=description, + cdx_api_url=cdx_api_url, + memento_api_url=memento_api_url, + priority=priority, + should_build_sources=should_build_sources, + ) + archive.save(using=config.es.client) diff --git a/archive_query_log/captures/__init__.py b/archive_query_log/captures/__init__.py new file mode 100644 index 00000000..f4077381 --- /dev/null +++ b/archive_query_log/captures/__init__.py @@ -0,0 +1,154 @@ +from itertools import chain +from typing import Iterable, Iterator +from urllib.parse import urljoin +from uuid import uuid5 +from warnings import warn + +from click import echo +from elasticsearch_dsl import Search +from elasticsearch_dsl.function import RandomScore +from elasticsearch_dsl.query import FunctionScore, RankFeature, Term +from requests import ConnectTimeout, HTTPError, Response +from tqdm.auto import tqdm +from web_archive_api.cdx import CdxApi, CdxMatchType + +from archive_query_log.config import Config +from archive_query_log.namespaces import NAMESPACE_CAPTURE +from archive_query_log.orm import Source, Capture, InnerParser +from archive_query_log.utils.es import safe_iter_scan, update_action +from archive_query_log.utils.time import utc_now, UTC + + 
+def _iter_captures( + config: Config, + source: Source, +) -> Iterator[Capture]: + cdx_api = CdxApi( + api_url=source.archive.cdx_api_url, + session=config.http.session, + ) + url = f"https://{source.provider.domain}" + url = urljoin(url, source.provider.url_path_prefix) + url = url.removeprefix("https://") + cdx_captures = cdx_api.iter_captures( + url=url, + match_type=CdxMatchType.PREFIX, + ) + for cdx_capture in cdx_captures: + if len(cdx_capture.url) > 32766: + warn(RuntimeWarning( + f"The URL {cdx_capture.url} exceeds the " + f"maximum length of Elasticsearch." + f" It will be skipped." + )) + continue + + capture_utc_timestamp_text = ( + cdx_capture.timestamp.astimezone(UTC).strftime("%Y%m%d%H%M%S")) + capture_id_components = ( + source.archive.cdx_api_url, + cdx_capture.url, + capture_utc_timestamp_text, + ) + capture_id = str(uuid5( + NAMESPACE_CAPTURE, + ":".join(capture_id_components), + )) + yield Capture( + id=capture_id, + last_modified=utc_now(), + archive=source.archive, + provider=source.provider, + url=cdx_capture.url, + url_key=cdx_capture.url_key, + timestamp=cdx_capture.timestamp.astimezone(UTC), + status_code=cdx_capture.status_code, + digest=cdx_capture.digest, + mimetype=cdx_capture.mimetype, + filename=cdx_capture.filename, + offset=cdx_capture.offset, + length=cdx_capture.length, + access=cdx_capture.access, + redirect_url=cdx_capture.redirect_url, + flags=([flag.value for flag in cdx_capture.flags] + if cdx_capture.flags is not None else None), + collection=cdx_capture.collection, + source=cdx_capture.source, + source_collection=cdx_capture.source_collection, + url_query_parser=InnerParser( + should_parse=True, + ), + ) + + +def _add_captures_actions( + config: Config, + source: Source, +) -> Iterator[dict]: + # Re-check if fetching captures is necessary. 
+ if (source.should_fetch_captures is not None and + not source.should_fetch_captures): + return + + captures_iter = _iter_captures(config, source) + try: + for capture in captures_iter: + yield capture.to_dict(include_meta=True) + except ConnectTimeout as e: + # The archives' CDX are usually very slow, so we expect timeouts. + # Rather than failing, we just warn and continue with the next source. + # But we do not mark this source as fetched, so that we try again. + warn(RuntimeWarning( + f"Connection timeout while fetching captures " + f"for source {source.id}: {e}")) + return + except HTTPError as e: + ignored = False + response: Response = e.response + if response.status_code == 403: + warn(RuntimeWarning( + f"Unauthorized to fetch captures for source " + f"domain {source.provider.domain} and " + f"URL prefix {source.provider.url_path_prefix}." + )) + ignored = True + if not ignored: + raise e + + yield update_action( + source, + should_fetch_captures=False, + last_fetched_captures=utc_now(), + ) + + +def fetch_captures(config: Config) -> None: + changed_sources_search: Search = ( + Source.search(using=config.es.client) + .filter(~Term(should_fetch_captures=False)) + .query( + RankFeature(field="archive.priority", saturation={}) | + RankFeature(field="provider.priority", saturation={}) | + FunctionScore(functions=[RandomScore()]) + ) + ) + num_changed_sources = changed_sources_search.count() + if num_changed_sources > 0: + echo(f"Fetching captures for {num_changed_sources} " + f"new/changed sources.") + changed_sources: Iterable[Source] = ( + changed_sources_search + .params(preserve_order=True) + .scan() + ) + changed_sources = safe_iter_scan(changed_sources) + # noinspection PyTypeChecker + changed_sources = tqdm(changed_sources, total=num_changed_sources, + desc="Fetching captures", unit="source") + actions = chain.from_iterable( + _add_captures_actions(config, source) + for source in changed_sources + ) + config.es.bulk(actions) + else: + echo("No 
new/changed sources.") diff --git a/archive_query_log/cli/__init__.py b/archive_query_log/cli/__init__.py index a1b026fc..7a994abc 100644 --- a/archive_query_log/cli/__init__.py +++ b/archive_query_log/cli/__init__.py @@ -1,8 +1,100 @@ -# flake8: noqa -from archive_query_log.cli.main import main -from archive_query_log.cli.alexa import alexa -from archive_query_log.cli.external import external -from archive_query_log.cli.make import make_group -from archive_query_log.cli.stats import stats_command -from archive_query_log.cli.corpus import corpus_command -from archive_query_log.cli.index import index_command +from pathlib import Path +from typing import Any, Type, Iterable + +from click import group, Context, Parameter, echo, option, pass_context, \ + Path as PathType, UsageError +from elasticsearch_dsl import Document +from mergedeep import merge, Strategy +from tqdm.auto import tqdm +from yaml import safe_load + +from archive_query_log import __version__ as app_version +from archive_query_log.cli.archives import archives +from archive_query_log.cli.captures import captures +from archive_query_log.cli.monitoring import monitoring +from archive_query_log.cli.parsers import parsers +from archive_query_log.cli.providers import providers +from archive_query_log.cli.results import results +from archive_query_log.cli.serps import serps +from archive_query_log.cli.sources import sources +from archive_query_log.cli.util import pass_config +from archive_query_log.config import Config +from archive_query_log.orm import ( + Archive, Provider, Source, Capture, Serp, Result, UrlQueryParser, + UrlPageParser, UrlOffsetParser, WarcQueryParser) + + +def echo_version( + context: Context, + _parameter: Parameter, + value: Any, +) -> None: + if not value or context.resilient_parsing: + return + echo(app_version) + context.exit() + + +_DEFAULT_CONFIG_PATH = Path("config.yml") +_DEFAULT_CONFIG_OVERRIDE_PATH = Path("config.override.yml") +_DEFAULT_CONFIG_PATHS = [] +if 
_DEFAULT_CONFIG_PATH.exists(): + _DEFAULT_CONFIG_PATHS.append(_DEFAULT_CONFIG_PATH) +if _DEFAULT_CONFIG_OVERRIDE_PATH.exists(): + _DEFAULT_CONFIG_PATHS.append(_DEFAULT_CONFIG_OVERRIDE_PATH) + + +@group() +@option("-V", "--version", is_flag=True, callback=echo_version, + expose_value=False, is_eager=True) +@option("-f", "--config-file", "config_paths", + type=PathType(path_type=Path, exists=True, file_okay=True, + dir_okay=False, readable=True, writable=False, + resolve_path=True, allow_dash=False), + default=_DEFAULT_CONFIG_PATHS, multiple=True, required=True) +@pass_context +def cli(context: Context, config_paths: list[Path]) -> None: + if len(config_paths) == 0: + raise UsageError("No config file specified.") + config_dict: dict = {} + for config_path in config_paths: + with config_path.open("rb") as config_file: + next_config_dict = safe_load(config_file) + merge(config_dict, next_config_dict, strategy=Strategy.REPLACE) + config: Config = Config.from_dict(config_dict) + context.obj = config + + +@cli.command() +@pass_config +def init(config: Config) -> None: + indices_list: list[Type[Document]] = [ + Archive, + Provider, + Source, + Capture, + Serp, + Result, + UrlQueryParser, + UrlPageParser, + UrlOffsetParser, + WarcQueryParser, + ] + # noinspection PyTypeChecker + indices: Iterable[Type[Document]] = tqdm( + indices_list, + desc="Initialize indices", + unit="index", + ) + for index in indices: + index.init(using=config.es.client) + + +cli.add_command(archives) +cli.add_command(providers) +cli.add_command(parsers) +cli.add_command(sources) +cli.add_command(captures) +cli.add_command(serps) +cli.add_command(results) +cli.add_command(monitoring) diff --git a/archive_query_log/cli/archives.py b/archive_query_log/cli/archives.py new file mode 100644 index 00000000..41fd54d1 --- /dev/null +++ b/archive_query_log/cli/archives.py @@ -0,0 +1,81 @@ +from click import group, option, IntRange, FloatRange + +from archive_query_log.cli.util import pass_config +from 
archive_query_log.config import Config +from archive_query_log.imports.archive_it import \ + DEFAULT_ARCHIVE_IT_PAGE_SIZE, DEFAULT_ARCHIVE_IT_WAYBACK_URL, \ + DEFAULT_ARCHIVE_IT_API_URL +from archive_query_log.orm import Archive + + +@group() +def archives() -> None: + pass + + +@archives.command() +@option("-n", "--name", type=str, required=True, + prompt="Name") +@option("-d", "--description", type=str) +@option("-c", "--cdx-api-url", type=str, required=True, + prompt="CDX API URL", metavar="URL") +@option("-m", "--memento-api-url", type=str, required=True, + prompt="Memento API URL", metavar="URL") +@option("--priority", type=FloatRange(min=0, min_open=False)) +@pass_config +def add( + config: Config, + name: str, + description: str | None, + cdx_api_url: str, + memento_api_url: str, + priority: float | None, +) -> None: + from archive_query_log.archives import add_archive + Archive.init(using=config.es.client) + add_archive( + config=config, + name=name, + description=description, + cdx_api_url=cdx_api_url, + memento_api_url=memento_api_url, + priority=priority, + ) + + +@archives.group("import") +def import_() -> None: + pass + + +@import_.command() +@option("--api-url", type=str, required=True, + default=DEFAULT_ARCHIVE_IT_API_URL, metavar="URL") +@option("--wayback-url", type=str, required=True, + default=DEFAULT_ARCHIVE_IT_WAYBACK_URL, metavar="URL") +@option("--page-size", type=IntRange(min=1), required=True, + default=DEFAULT_ARCHIVE_IT_PAGE_SIZE) +@option("--priority", type=FloatRange(min=0, min_open=False)) +@option("--no-merge", is_flag=True, default=False, type=bool) +@option("--auto-merge", is_flag=True, default=False, type=bool) +@pass_config +def archive_it( + config: Config, + api_url: str, + wayback_url: str, + page_size: int, + priority: float | None, + no_merge: bool, + auto_merge: bool, +) -> None: + from archive_query_log.imports.archive_it import import_archives + Archive.init(using=config.es.client) + import_archives( + config=config, + 
api_url=api_url, + wayback_url=wayback_url, + page_size=page_size, + no_merge=no_merge, + auto_merge=auto_merge, + priority=priority, + ) diff --git a/archive_query_log/cli/captures.py b/archive_query_log/cli/captures.py new file mode 100644 index 00000000..67fe0916 --- /dev/null +++ b/archive_query_log/cli/captures.py @@ -0,0 +1,62 @@ +from pathlib import Path + +from click import group, Path as PathType, argument, option + +from archive_query_log.cli.util import pass_config +from archive_query_log.config import Config +from archive_query_log.orm import Capture + + +@group() +def captures() -> None: + pass + + +@captures.command() +@pass_config +def fetch(config: Config) -> None: + from archive_query_log.captures import fetch_captures + Capture.init(using=config.es.client) + fetch_captures(config) + + +@captures.group("import") +def import_() -> None: + pass + + +_CEPH_DIR = Path("/mnt/ceph/storage/") +_DEFAULT_DATA_DIR = ( + _CEPH_DIR / "data-in-progress/data-research/web-search/" + "archive-query-log/focused/" + if _CEPH_DIR.is_mount() and _CEPH_DIR.exists() + else None) + + +@import_.command(help="Import captures from the AQL-22 dataset.") +@argument("data_dir_path", + type=PathType(path_type=Path, exists=True, file_okay=False, + dir_okay=True, readable=True, writable=False, + resolve_path=True, allow_dash=False), + metavar="DATA_DIR", required=True, default=_DEFAULT_DATA_DIR) +@option("--check-memento/--no-check-memento", default=True) +@option("--search-provider", type=str, envvar="SEARCH_PROVIDER") +@option("--search-provider-index", type=int, + envvar="SEARCH_PROVIDER_INDEX") +@pass_config +def aql_22( + config: Config, + data_dir_path: Path, + check_memento: bool, + search_provider: str | None, + search_provider_index: int | None, +) -> None: + from archive_query_log.imports.aql22 import import_captures + Capture.init(using=config.es.client) + import_captures( + config=config, + data_dir_path=data_dir_path, + check_memento=check_memento, + 
search_provider=search_provider, + search_provider_index=search_provider_index, + ) diff --git a/archive_query_log/cli/external.py b/archive_query_log/cli/external.py deleted file mode 100644 index 35fb9b05..00000000 --- a/archive_query_log/cli/external.py +++ /dev/null @@ -1,228 +0,0 @@ -from json import loads -from math import nan -from pathlib import Path -from re import compile, escape -from urllib.parse import quote - -from click import argument -from pandas import DataFrame, read_csv, Series, concat -from yaml import dump - -from archive_query_log import DATA_DIRECTORY_PATH -from archive_query_log.cli import main -from archive_query_log.cli.util import PathParam - -sheets_id = "1LnIJYFBYQtZ32rxnT6RPGMOvuRIUQMoEx7tOS0z7Mi8" -sheet_services = "Services" -sheet_domains = "Domains" -sheet_url_prefixes = "URL Prefixes" -sheet_query_parsers = "Query Parsers" -sheet_page_parsers = "Page Parsers" - - -@main.group("external") -def external(): - pass - - -def from_sheets(sheet_name: str, transpose: bool = False) -> DataFrame: - url = f"https://docs.google.com/spreadsheets/d/{sheets_id}/" \ - f"gviz/tq?tqx=out:csv&sheet={quote(sheet_name)}" - if transpose: - df = read_csv( - url, low_memory=False, na_values=[""], keep_default_na=False, - ) - return DataFrame([ - { - "name": column, - "value": value, - } - for column in df.columns - for value in df[column].dropna() - ]) - else: - return read_csv(url) - - -def load_services() -> DataFrame: - df = from_sheets(sheet_services) - df = df[~df["service"].str.contains(".", regex=False)] - df["name"] = df["service"] - df["public_suffix"] = df["tld"] - df["alexa_domain"] = df["name"] + "." 
+ df["public_suffix"] - df["alexa_rank"] = df["rank"] - df["notes"] = df["Notes"] - for col in ["has_input_field", "has_search_form", "has_search_div"]: - df[col.removeprefix("has_")] = df[col].replace("FALSCH", False) - df[col].replace("False", False, inplace=True) - df[col].replace("True", True, inplace=True) - df["alexa_rank"].astype(int, copy=False) - df["alexa_rank"].replace(99999, nan, inplace=True) - return df[["name", "public_suffix", "alexa_domain", "alexa_rank", - "category", "notes", "input_field", - "search_form", "search_div"]] - - -def load_domains() -> DataFrame: - df = from_sheets(sheet_domains, transpose=True) - df["domain"] = df["value"] - return df[["name", "domain"]] - - -def url_prefix_pattern(url_prefix: str) -> str | None: - if url_prefix == "": - return None - return f"^https?://[^/]+/{escape(url_prefix)}" - - -compile(r"[^/]+/images/search\?") - - -def load_url_prefixes() -> DataFrame: - df = from_sheets(sheet_url_prefixes, transpose=True) - df["value"].replace("NULL", "", inplace=True) - df["pattern"] = df["value"].map(url_prefix_pattern) - return df[["name", "pattern"]] - - -def load_query_parsers() -> DataFrame: - df = from_sheets(sheet_query_parsers, transpose=True) - df["query_parser"] = df["value"] - return df[["name", "query_parser"]] - - -def load_page_offset_parsers() -> DataFrame: - df = from_sheets(sheet_page_parsers, transpose=True) - df["value"].replace("NULL", "{}", inplace=True) - df["page_offset_parser"] = df["value"] - return df[["name", "page_offset_parser"]] - - -def service_domains(domains: DataFrame, service: Series) -> list[str]: - return sorted( - set(list(domains[domains["name"] == service["name"]]["domain"])) | { - service["alexa_domain"]}) - - -def query_parser(row: Series) -> dict: - row = row.to_dict() - row.update(loads(row["query_parser"])) - url_pattern = "" if row["pattern"] is None else row["pattern"] - if row["type"] == "qp": - return { - "url_pattern": url_pattern, - "type": "query_parameter", - 
"parameter": row["key"] - } - elif row["type"] == "fp": - return { - "url_pattern": url_pattern, - "type": "fragment_parameter", - "parameter": row["key"] - } - elif row["type"] == "ps": - return { - "url_pattern": url_pattern, - "type": "path_suffix", - "path_prefix": row["key"] - } - else: - raise NotImplementedError() - - -page_offset_parser_map = {"parameter": "query_parameter", - "suffix": "path_suffix", - "fragment": "fragment_parameter"} - - -def page_offset_parser(row: Series, count="results") -> dict: - row = row.to_dict() - row.update(loads(row["page_offset_parser"])) - if row["count"] == count: - url_pattern = "" if row["pattern"] is None else row["pattern"] - return { - "url_pattern": url_pattern, - "type": page_offset_parser_map[row["type"]], - "parameter": row["key"] - } - else: - raise NotImplementedError() - - -def page_offset_parser_series(page_offset_parsers, services, count): - return [ - sorted(( - page_offset_parser(row, count=count) - for _, row in - page_offset_parsers[ - (page_offset_parsers["name"].str.fullmatch(service["name"])) & - (page_offset_parsers["page_offset_parser"].str.contains( - f'"count": "{count}"' - )) - ].iterrows() - ), key=lambda pp: str(pp["url_pattern"])) - for _, service in services.iterrows() - ] - - -@external.command("import-services") -@argument( - "services-file", - type=PathParam( - exists=False, - file_okay=True, - dir_okay=False, - writable=True, - readable=False, - resolve_path=True, - path_type=Path, - ), - default=DATA_DIRECTORY_PATH / "services.yaml", -) -def import_services(services_file: Path): - services = load_services() - domains = load_domains() - services["domains"] = [ - service_domains(domains, row) - for _, row in services.iterrows() - ] - query_parsers = concat( - [ - load_url_prefixes(), - load_query_parsers()[["query_parser"]] - ], - axis="columns") - services["query_parsers"] = [ - sorted(( - query_parser(row) - for _, row in - query_parsers[ - 
query_parsers["name"].str.endswith(service["name"]) - ].iterrows() - ), key=lambda qp: str(qp["url_pattern"])) - for _, service in services.iterrows() - ] - page_offset_parsers = concat( - [ - load_url_prefixes(), - load_page_offset_parsers()[["page_offset_parser"]] - ], - axis="columns") - services["page_parsers"] = page_offset_parser_series( - page_offset_parsers, services, count="pages" - ) - services["offset_parsers"] = page_offset_parser_series( - page_offset_parsers, services, count="results" - ) - services["interpreted_query_parsers"] = [ - [] - for _, service in services.iterrows() - ] - services["results_parsers"] = [ - [] - for _, service in services.iterrows() - ] - services.replace({nan: None}, inplace=True) - services_dict = services.to_dict(orient="records") - with services_file.open("wt") as file: - dump(services_dict, stream=file, sort_keys=False) diff --git a/archive_query_log/cli/index.py b/archive_query_log/cli/index.py deleted file mode 100644 index 6a724642..00000000 --- a/archive_query_log/cli/index.py +++ /dev/null @@ -1,108 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor -from contextlib import ExitStack -from pathlib import Path - -from click import option, BOOL -from tqdm.auto import tqdm - -from archive_query_log import DATA_DIRECTORY_PATH -from archive_query_log.cli import main -from archive_query_log.cli.util import PathParam -from archive_query_log.index import ArchivedRawSerpIndex, \ - ArchivedUrlIndex, ArchivedQueryUrlIndex, ArchivedParsedSerpIndex, \ - ArchivedSearchResultSnippetIndex, ArchivedRawSearchResultIndex - - -@main.command( - "index", - help="Generate helper indices.", -) -@option( - "-d", "--data-directory", "--data-directory-path", - type=PathParam( - exists=True, - file_okay=False, - dir_okay=True, - writable=True, - readable=False, - resolve_path=True, - path_type=Path, - ), - default=DATA_DIRECTORY_PATH -) -@option( - "-f", "--focused", - type=BOOL, - default=False, - is_flag=True, -) -def index_command( - 
data_directory: Path, - focused: bool, -) -> None: - with ExitStack() as stack: - archived_url_index = stack.enter_context( - ArchivedUrlIndex( - data_directory=data_directory, - focused=focused, - ) - ) - archived_query_url_index = stack.enter_context( - ArchivedQueryUrlIndex( - data_directory=data_directory, - focused=focused, - ) - ) - archived_raw_serp_index = stack.enter_context( - ArchivedRawSerpIndex( - data_directory=data_directory, - focused=focused, - ) - ) - archived_parsed_serp_index = stack.enter_context( - ArchivedParsedSerpIndex( - data_directory=data_directory, - focused=focused, - ) - ) - archived_search_result_snippet_index = stack.enter_context( - ArchivedSearchResultSnippetIndex( - data_directory=data_directory, - focused=focused, - ) - ) - archived_raw_search_result_index = stack.enter_context( - ArchivedRawSearchResultIndex( - data_directory=data_directory, - focused=focused, - ) - ) - # archived_parsed_search_result_index = stack.enter_context( - # ArchivedParsedSearchResultIndex( - # data_directory=data_directory, - # focused=focused, - # ) - # ) - indexes = [ - archived_url_index, - archived_query_url_index, - archived_raw_serp_index, - archived_parsed_serp_index, - archived_search_result_snippet_index, - archived_raw_search_result_index, - # archived_parsed_search_result_index, - ] - - pool = ThreadPoolExecutor() - progress = tqdm( - total=len(indexes), - desc="Build indices", - unit="index", - ) - - def run_index(index) -> None: - index.index() - progress.update() - - for _ in pool.map(run_index, indexes): - pass diff --git a/archive_query_log/cli/main.py b/archive_query_log/cli/main.py deleted file mode 100644 index 7d25cf02..00000000 --- a/archive_query_log/cli/main.py +++ /dev/null @@ -1,6 +0,0 @@ -from click import group - - -@group() -def main(): - pass diff --git a/archive_query_log/cli/make.py b/archive_query_log/cli/make.py deleted file mode 100644 index 0ce6f16f..00000000 --- a/archive_query_log/cli/make.py +++ /dev/null @@ 
-1,281 +0,0 @@ -from asyncio import run -from pathlib import Path - -from click import option, argument, STRING, IntRange, BOOL - -from archive_query_log import DATA_DIRECTORY_PATH, CDX_API_URL, LOGGER -from archive_query_log.cli import main -from archive_query_log.cli.util import PathParam, ServiceChoice - - -@main.group("make") -def make_group(): - pass - - -def _data_directory_option(): - return option( - "-d", "--data-directory", "--data-directory-path", - type=PathParam( - exists=True, - file_okay=False, - dir_okay=True, - writable=True, - readable=False, - resolve_path=True, - path_type=Path, - ), - default=DATA_DIRECTORY_PATH - ) - - -def _focused_argument(): - return option( - "-f", "--focused", - type=BOOL, - default=False, - is_flag=True, - ) - - -def _service_name_argument(): - return argument( - "service", - type=ServiceChoice(), - required=False, - ) - - -def _domain_argument(): - return argument( - "domain", - type=STRING, - required=False, - ) - - -def _cdx_page_argument(): - return argument( - "cdx_page", - type=IntRange(min=0), - required=False, - ) - - -@make_group.command( - "archived-urls", - help="Fetch archived URLs from the Wayback Machine's CDX API.", -) -@_data_directory_option() -@_focused_argument() -@_service_name_argument() -@_domain_argument() -@_cdx_page_argument() -def archived_urls_command( - data_directory: Path, - focused: bool, - service: str | None, - domain: str | None, - cdx_page: int | None, -) -> None: - from archive_query_log.config import SERVICES - from archive_query_log.urls.fetch import ArchivedUrlsFetcher, \ - UrlMatchScope - if focused: - data_directory = data_directory / "focused" - service_configs = [SERVICES[service]] if service else SERVICES.values() - for service_config in service_configs: - match_scope = UrlMatchScope.PREFIX if focused else UrlMatchScope.DOMAIN - fetcher = ArchivedUrlsFetcher( - match_scope=match_scope, - include_status_codes={200}, - exclude_status_codes=set(), - 
include_mime_types={"text/html"}, - exclude_mime_types=set(), - cdx_api_url=CDX_API_URL - ) - if focused and len(service_config.focused_url_prefixes) == 0: - LOGGER.warning( - f"No focused URL prefixes configured for service {service}." - ) - run(fetcher.fetch_service( - data_directory=data_directory, - focused=focused, - service=service_config, - domain=domain, - cdx_page=cdx_page, - )) - - -@make_group.command( - "archived-query-urls", - help="Parse queries from fetched archived URLs.", -) -@_data_directory_option() -@_focused_argument() -@_service_name_argument() -@_domain_argument() -@_cdx_page_argument() -def archived_query_urls_command( - data_directory: Path, - focused: bool, - service: str | None, - domain: str | None, - cdx_page: int | None, -) -> None: - from archive_query_log.config import SERVICES - from archive_query_log.queries.parse import ArchivedQueryUrlParser - if focused: - data_directory = data_directory / "focused" - service_configs = [SERVICES[service]] if service else SERVICES.values() - for service_config in service_configs: - if len(service_config.query_parsers) == 0: - LOGGER.warning( - f"No query parsers configured for service {service}." - ) - if len(service_config.page_parsers) == 0 \ - and len(service_config.offset_parsers) == 0: - LOGGER.warning( - f"No page or offset parsers configured for service {service}." 
- ) - parser = ArchivedQueryUrlParser( - query_parsers=service_config.query_parsers, - page_parsers=service_config.page_parsers, - offset_parsers=service_config.offset_parsers, - ) - parser.parse_service( - data_directory=data_directory, - focused=focused, - service=service_config, - domain=domain, - cdx_page=cdx_page, - ) - - -@make_group.command( - "archived-raw-serps", - help="Download raw SERP contents (as WARC files) for parsed queries.", -) -@_data_directory_option() -@_focused_argument() -@_service_name_argument() -@_domain_argument() -@_cdx_page_argument() -def archived_raw_serps_command( - data_directory: Path, - focused: bool, - service: str | None, - domain: str | None, - cdx_page: int | None, -) -> None: - from archive_query_log.config import SERVICES - from archive_query_log.download.warc import WebArchiveWarcDownloader - if focused: - data_directory = data_directory / "focused" - service_configs = [SERVICES[service]] if service else SERVICES.values() - for service_config in service_configs: - downloader = WebArchiveWarcDownloader(verbose=True) - run(downloader.download_service( - data_directory=data_directory, - focused=focused, - service=service_config, - domain=domain, - cdx_page=cdx_page, - )) - - -@make_group.command( - "archived-parsed-serps", - help="Parse SERP results from raw SERPs.", -) -@_data_directory_option() -@_focused_argument() -@_service_name_argument() -@_domain_argument() -@_cdx_page_argument() -def archived_parsed_serps_command( - data_directory: Path, - focused: bool, - service: str | None, - domain: str | None, - cdx_page: int | None, -) -> None: - from archive_query_log.config import SERVICES - from archive_query_log.results.parse import ArchivedParsedSerpParser - if focused: - data_directory = data_directory / "focused" - service_configs = [SERVICES[service]] if service else SERVICES.values() - for service_config in service_configs: - if len(service_config.results_parsers) == 0: - LOGGER.warning( - f"No result parsers 
configured for service {service}." - ) - if len(service_config.interpreted_query_parsers) == 0: - LOGGER.warning( - f"No interpreted query parsers configured" - f"for service {service}." - ) - parser = ArchivedParsedSerpParser( - results_parsers=service_config.results_parsers, - interpreted_query_parsers=service_config.interpreted_query_parsers, - ) - parser.parse_service( - data_directory=data_directory, - focused=focused, - service=service_config, - domain=domain, - cdx_page=cdx_page, - ) - - -@make_group.command( - "archived-raw-search-results", - help="Download raw search result contents (as WARC files) " - "for parsed SERPs.", -) -@_data_directory_option() -@_focused_argument() -@_service_name_argument() -@_domain_argument() -@_cdx_page_argument() -def archived_raw_search_results_command( - data_directory: Path, - focused: bool, - service: str | None, - domain: str | None, - cdx_page: int | None, -) -> None: - from archive_query_log.config import SERVICES - from archive_query_log.download.warc import WebArchiveWarcDownloader - service_configs = [SERVICES[service]] if service else SERVICES.values() - if focused: - data_directory = data_directory / "focused" - for service_config in service_configs: - downloader = WebArchiveWarcDownloader(verbose=True) - run(downloader.download_service( - data_directory=data_directory, - focused=focused, - service=service_config, - domain=domain, - cdx_page=cdx_page, - snippets=True, - )) - - -@make_group.command( - "archived-parsed-search-results", - help="Parse search results from raw search result contents.", -) -@_data_directory_option() -@_focused_argument() -@_service_name_argument() -@_domain_argument() -@_cdx_page_argument() -def archived_parsed_search_results_command( - data_directory: Path, - focused: bool, - service: str | None, - domain: str | None, - cdx_page: int | None, -) -> None: - raise NotImplementedError() diff --git a/archive_query_log/cli/monitoring.py b/archive_query_log/cli/monitoring.py new file mode 
100644
index 00000000..7ff9e78f
--- /dev/null
+++ b/archive_query_log/cli/monitoring.py
@@ -0,0 +1,24 @@
+from click import group, option
+
+from archive_query_log.cli.util import pass_config
+from archive_query_log.config import Config
+
+
+@group()
+def monitoring() -> None:
+    pass
+
+
+@monitoring.command()
+@option("-h", "--host", type=str, default="127.0.0.1",
+        help="The interface to bind to.")
+@option("-p", "--port", type=int, default=5000,
+        help="The port to bind to.")
+@pass_config
+def run(
+        config: Config,
+        host: str,
+        port: int,
+) -> None:
+    from archive_query_log.monitoring import run_monitoring
+    run_monitoring(config, host, port)
diff --git a/archive_query_log/cli/parsers.py b/archive_query_log/cli/parsers.py
new file mode 100644
index 00000000..56cabb55
--- /dev/null
+++ b/archive_query_log/cli/parsers.py
@@ -0,0 +1,474 @@
+from pathlib import Path
+
+from click import group, option, Choice, Path as PathType, UsageError, \
+    FloatRange
+
+from archive_query_log.cli.util import pass_config
+from archive_query_log.config import Config
+from archive_query_log.orm import UrlQueryParserType, \
+    UrlQueryParser, UrlPageParserType, UrlPageParser, \
+    UrlOffsetParser, UrlOffsetParserType, WarcQueryParserType, \
+    WarcQueryParser, WarcSnippetsParserType, WarcSnippetsParser, \
+    WarcDirectAnswersParserType, WarcDirectAnswersParser, \
+    WarcMainContentParserType, WarcMainContentParser
+
+
+@group()
+def parsers() -> None:
+    pass
+
+
+@parsers.group()
+def url_query() -> None:
+    pass
+
+
+CHOICES_URL_QUERY_PARSER_TYPE = [
+    "query-parameter",
+    "fragment-parameter",
+    "path-segment",
+]
+
+
+@url_query.command("add")
+@option("--provider-id", type=str)
+@option("--url-pattern-regex", type=str)
+@option("--priority", type=FloatRange(min=0, min_open=False))
+@option("--parser-type",
+        type=Choice(CHOICES_URL_QUERY_PARSER_TYPE), required=True)
+@option("--parameter", type=str)
+@option("--segment", type=int)
+@option("--remove-pattern-regex", type=str)
+@option("--space-pattern-regex", type=str)
+@pass_config
+def url_query_add(
+        config: Config,
+        provider_id: str | None,
+        url_pattern_regex: str | None,
+        priority: float | None,
+        parser_type: str,
+        parameter: str | None,
+        segment: int | None,
+        remove_pattern_regex: str | None,
+        space_pattern_regex: str | None,
+) -> None:
+    from archive_query_log.parsers.url_query import add_url_query_parser
+    parser_type_strict: UrlQueryParserType  # CLI spelling -> ORM spelling
+    if parser_type == "query-parameter":
+        parser_type_strict = "query_parameter"
+        if parameter is None:
+            raise UsageError("No query parameter given.")
+    elif parser_type == "fragment-parameter":
+        parser_type_strict = "fragment_parameter"
+        if parameter is None:  # the fragment key comes from --parameter
+            raise UsageError("No fragment parameter given.")
+    elif parser_type == "path-segment":
+        parser_type_strict = "path_segment"
+        if segment is None:
+            raise UsageError("No path segment given.")
+    else:  # not reachable while Choice() guards --parser-type
+        raise ValueError(f"Invalid parser type: {parser_type}")
+    UrlQueryParser.init(using=config.es.client)
+    add_url_query_parser(
+        config=config,
+        provider_id=provider_id,
+        url_pattern_regex=url_pattern_regex,
+        priority=priority,
+        parser_type=parser_type_strict,
+        parameter=parameter,
+        segment=segment,
+        remove_pattern_regex=remove_pattern_regex,
+        space_pattern_regex=space_pattern_regex,
+    )
+
+
+@url_query.command("import")
+@option("-s", "--services-file", "services_path",
+        type=PathType(path_type=Path, exists=True, file_okay=True,
+                      dir_okay=False, readable=True, resolve_path=True,
+                      allow_dash=False),
+        default=Path("data") / "selected-services.yaml")
+@pass_config
+def url_query_import(config: Config, services_path: Path) -> None:
+    from archive_query_log.imports.yaml import import_url_query_parsers
+    UrlQueryParser.init(using=config.es.client)
+    import_url_query_parsers(config, services_path)
+
+
+@parsers.group()
+def url_page() -> None:
+    pass
+
+
+CHOICES_URL_PAGE_PARSER_TYPE = [
+    "query-parameter",
+    "fragment-parameter",
+    "path-segment",
+]
+
+
+@url_page.command("add")
+@option("--provider-id", type=str)
+@option("--url-pattern-regex", type=str)
+@option("--priority", type=FloatRange(min=0, min_open=False))
+@option("--parser-type",
+        type=Choice(CHOICES_URL_PAGE_PARSER_TYPE), required=True)
+@option("--parameter", type=str)
+@option("--segment", type=int)
+@option("--remove-pattern-regex", type=str)
+@option("--space-pattern-regex", type=str)
+@pass_config
+def url_page_add(
+        config: Config,
+        provider_id: str | None,
+        url_pattern_regex: str | None,
+        priority: float | None,
+        parser_type: str,
+        parameter: str | None,
+        segment: int | None,
+        remove_pattern_regex: str | None,
+        space_pattern_regex: str | None,
+) -> None:
+    from archive_query_log.parsers.url_page import add_url_page_parser
+    parser_type_strict: UrlPageParserType  # CLI spelling -> ORM spelling
+    if parser_type == "query-parameter":
+        parser_type_strict = "query_parameter"
+        if parameter is None:
+            raise UsageError("No query parameter given.")
+    elif parser_type == "fragment-parameter":
+        parser_type_strict = "fragment_parameter"
+        if parameter is None:  # the fragment key comes from --parameter
+            raise UsageError("No fragment parameter given.")
+    elif parser_type == "path-segment":
+        parser_type_strict = "path_segment"
+        if segment is None:
+            raise UsageError("No path segment given.")
+    else:  # not reachable while Choice() guards --parser-type
+        raise ValueError(f"Invalid parser type: {parser_type}")
+    UrlPageParser.init(using=config.es.client)
+    add_url_page_parser(
+        config=config,
+        provider_id=provider_id,
+        url_pattern_regex=url_pattern_regex,
+        priority=priority,
+        parser_type=parser_type_strict,
+        parameter=parameter,
+        segment=segment,
+        remove_pattern_regex=remove_pattern_regex,
+        space_pattern_regex=space_pattern_regex,
+    )
+
+
+@url_page.command("import")
+@option("-s", "--services-file", "services_path",
+        type=PathType(path_type=Path, exists=True, file_okay=True,
+                      dir_okay=False, readable=True, resolve_path=True,
+                      allow_dash=False),
+        default=Path("data") / "selected-services.yaml")
+@pass_config
+def url_page_import(config: Config, services_path: Path) -> None:
+    from archive_query_log.imports.yaml import import_url_page_parsers
+    UrlPageParser.init(using=config.es.client)
+    import_url_page_parsers(config, services_path)
+
+
+@parsers.group()
+def url_offset() -> None:
+    pass
+
+
+CHOICES_URL_OFFSET_PARSER_TYPE = [
+    "query-parameter",
+    "fragment-parameter",
+    "path-segment",
+]
+
+
+@url_offset.command("add")
+@option("--provider-id", type=str)
+@option("--url-pattern-regex", type=str)
+@option("--priority", type=FloatRange(min=0, min_open=False))
+@option("--parser-type",
+        type=Choice(CHOICES_URL_OFFSET_PARSER_TYPE), required=True)
+@option("--parameter", type=str)
+@option("--segment", type=int)
+@option("--remove-pattern-regex", type=str)
+@option("--space-pattern-regex", type=str)
+@pass_config
+def url_offset_add(
+        config: Config,
+        provider_id: str | None,
+        url_pattern_regex: str | None,
+        priority: float | None,
+        parser_type: str,
+        parameter: str | None,
+        segment: int | None,
+        remove_pattern_regex: str | None,
+        space_pattern_regex: str | None,
+) -> None:
+    from archive_query_log.parsers.url_offset import add_url_offset_parser
+    parser_type_strict: UrlOffsetParserType  # CLI spelling -> ORM spelling
+    if parser_type == "query-parameter":
+        parser_type_strict = "query_parameter"
+        if parameter is None:
+            raise UsageError("No query parameter given.")
+    elif parser_type == "fragment-parameter":
+        parser_type_strict = "fragment_parameter"
+        if parameter is None:  # the fragment key comes from --parameter
+            raise UsageError("No fragment parameter given.")
+    elif parser_type == "path-segment":
+        parser_type_strict = "path_segment"
+        if segment is None:
+            raise UsageError("No path segment given.")
+    else:  # not reachable while Choice() guards --parser-type
+        raise ValueError(f"Invalid parser type: {parser_type}")
+    UrlOffsetParser.init(using=config.es.client)
+    add_url_offset_parser(
+        config=config,
+        provider_id=provider_id,
+        url_pattern_regex=url_pattern_regex,
+        priority=priority,
+        parser_type=parser_type_strict,
+        parameter=parameter,
+        segment=segment,
+        remove_pattern_regex=remove_pattern_regex,
+        space_pattern_regex=space_pattern_regex,
+    )
+
+
+@url_offset.command("import")
+@option("-s", "--services-file", "services_path",
+        type=PathType(path_type=Path, exists=True, file_okay=True,
+                      dir_okay=False, readable=True, resolve_path=True,
+                      allow_dash=False),
+        default=Path("data") / "selected-services.yaml")
+@pass_config
+def url_offset_import(config: Config, services_path: Path) -> None:
+    from archive_query_log.imports.yaml import import_url_offset_parsers
+    UrlOffsetParser.init(using=config.es.client)
+    import_url_offset_parsers(config, services_path)
+
+
+@parsers.group()
+def warc_query() -> None:
+    pass
+
+
+CHOICES_WARC_QUERY_PARSER_TYPE = [
+    "xpath",
+]
+
+
+@warc_query.command("add")
+@option("--provider-id", type=str)
+@option("--url-pattern-regex", type=str)
+@option("--priority", type=FloatRange(min=0, min_open=False))
+@option("--parser-type",
+        type=Choice(CHOICES_WARC_QUERY_PARSER_TYPE), required=True)
+@option("--xpath", type=str)
+@option("--remove-pattern-regex", type=str)
+@option("--space-pattern-regex", type=str)
+@pass_config
+def warc_query_add(
+        config: Config,
+        provider_id: str | None,
+        url_pattern_regex: str | None,
+        priority: float | None,
+        parser_type: str,
+        xpath: str | None,
+        remove_pattern_regex: str | None,
+        space_pattern_regex: str | None,
+) -> None:
+    from archive_query_log.parsers.warc_query import add_warc_query_parser
+    parser_type_strict: WarcQueryParserType
+    if parser_type == "xpath":
+        parser_type_strict = "xpath"
+        if xpath is None:  # --xpath is mandatory for this parser type
+            raise UsageError("No XPath given.")
+    else:  # not reachable while Choice() guards --parser-type
+        raise ValueError(f"Invalid parser type: {parser_type}")
+    WarcQueryParser.init(using=config.es.client)
+    add_warc_query_parser(
+        config=config,
+        provider_id=provider_id,
+        url_pattern_regex=url_pattern_regex,
+        priority=priority,
+        parser_type=parser_type_strict,
+        xpath=xpath,
+        remove_pattern_regex=remove_pattern_regex,
+        space_pattern_regex=space_pattern_regex,
+    )
+
+
+@warc_query.command("import")
+@option("-s", "--services-file", "services_path",
+        type=PathType(path_type=Path, exists=True, file_okay=True,
+                      dir_okay=False, readable=True, resolve_path=True,
+                      allow_dash=False),
+        default=Path("data") / "selected-services.yaml")
+@pass_config
+def warc_query_import(config: Config, services_path: Path) -> None:
+    from archive_query_log.imports.yaml import import_warc_query_parsers
+    WarcQueryParser.init(using=config.es.client)  # init index, then import
+    import_warc_query_parsers(config, services_path)
+
+
+@parsers.group()
+def warc_snippets() -> None:  # container group for the commands below
+    pass
+
+
+CHOICES_WARC_SNIPPETS_PARSER_TYPE = [
+    "xpath",
+]
+
+
+@warc_snippets.command("add")
+@option("--provider-id", type=str)
+@option("--url-pattern-regex", type=str)
+@option("--priority", type=FloatRange(min=0, min_open=False))
+@option("--parser-type",
+        type=Choice(CHOICES_WARC_SNIPPETS_PARSER_TYPE), required=True)
+@option("--xpath", type=str)
+@option("--url-xpath", type=str)
+@option("--title-xpath", type=str)
+@option("--text-xpath", type=str)
+@pass_config
+def warc_snippets_add(
+        config: Config,
+        provider_id: str | None,
+        url_pattern_regex: str | None,
+        priority: float | None,
+        parser_type: str,
+        xpath: str | None,
+        url_xpath: str | None,
+        title_xpath: str | None,
+        text_xpath: str | None,
+) -> None:
+    from archive_query_log.parsers.warc_snippets import \
+        add_warc_snippets_parser
+    parser_type_strict: WarcSnippetsParserType
+    if parser_type == "xpath":
+        parser_type_strict = "xpath"
+        if xpath is None:  # --xpath is mandatory for this parser type
+            raise UsageError("No XPath given.")
+    else:  # not reachable while Choice() guards --parser-type
+        raise ValueError(f"Invalid parser type: {parser_type}")
+    WarcSnippetsParser.init(using=config.es.client)
+    add_warc_snippets_parser(
+        config=config,
+        provider_id=provider_id,
+        url_pattern_regex=url_pattern_regex,
+        priority=priority,
+        parser_type=parser_type_strict,
+        xpath=xpath,
+        url_xpath=url_xpath,
+        title_xpath=title_xpath,
+        text_xpath=text_xpath,
+    )
+
+
+@warc_snippets.command("import")
+@option("-s", "--services-file", "services_path",
+        type=PathType(path_type=Path, exists=True, file_okay=True,
+                      dir_okay=False, readable=True, resolve_path=True,
+                      allow_dash=False),
+        default=Path("data") / "selected-services.yaml")
+@pass_config
+def warc_snippets_import(config: Config, services_path: Path) -> None:
+    from archive_query_log.imports.yaml import import_warc_snippets_parsers
+    WarcSnippetsParser.init(using=config.es.client)  # init index first
+    import_warc_snippets_parsers(config, services_path)
+
+
+@parsers.group()
+def warc_direct_answers() -> None:  # container group for commands below
+    pass
+
+
+CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE = [
+    "xpath",
+]
+
+
+@warc_direct_answers.command("add")
+@option("--provider-id", type=str)
+@option("--url-pattern-regex", type=str)
+@option("--priority", type=FloatRange(min=0, min_open=False))
+@option("--parser-type",
+        type=Choice(CHOICES_WARC_DIRECT_ANSWERS_PARSER_TYPE), required=True)
+@option("--xpath", type=str)
+@option("--url-xpath", type=str)
+@option("--text-xpath", type=str)
+@pass_config
+def warc_direct_answers_add(
+        config: Config,
+        provider_id: str | None,
+        url_pattern_regex: str | None,
+        priority: float | None,
+        parser_type: str,
+        xpath: str | None,
+        url_xpath: str | None,
+        text_xpath: str | None,
+) -> None:
+    from archive_query_log.parsers.warc_direct_answers import \
+        add_warc_direct_answers_parser
+    parser_type_strict: WarcDirectAnswersParserType
+    if parser_type == "xpath":
+        parser_type_strict = "xpath"
+        if xpath is None:  # --xpath is mandatory for this parser type
+            raise UsageError("No XPath given.")
+    else:  # not reachable while Choice() guards --parser-type
+        raise ValueError(f"Invalid parser type: {parser_type}")
+    WarcDirectAnswersParser.init(using=config.es.client)
+    add_warc_direct_answers_parser(
+        config=config,
+        provider_id=provider_id,
+        url_pattern_regex=url_pattern_regex,
+        priority=priority,
+        parser_type=parser_type_strict,
+        xpath=xpath,
+        url_xpath=url_xpath,
+        text_xpath=text_xpath,
+    )
+
+
+@parsers.group()
+def warc_main_content() -> None:
+    pass
+
+
+CHOICES_WARC_MAIN_CONTENT_PARSER_TYPE = [
+    "resiliparse",
+]
+
+
+@warc_main_content.command("add") +@option("--provider-id", type=str) +@option("--url-pattern-regex", type=str) +@option("--priority", type=FloatRange(min=0, min_open=False)) +@option("--parser-type", + type=Choice(CHOICES_WARC_MAIN_CONTENT_PARSER_TYPE), required=True) +@pass_config +def warc_main_content_add( + config: Config, + provider_id: str | None, + url_pattern_regex: str | None, + priority: float | None, + parser_type: str, +) -> None: + from archive_query_log.parsers.warc_main_content import \ + add_warc_main_content_parser + parser_type_strict: WarcMainContentParserType + if parser_type == "resiliparse": + parser_type_strict = "resiliparse" + else: + raise ValueError(f"Invalid parser type: {parser_type}") + WarcMainContentParser.init(using=config.es.client) + add_warc_main_content_parser( + config=config, + provider_id=provider_id, + url_pattern_regex=url_pattern_regex, + priority=priority, + parser_type=parser_type_strict, + ) diff --git a/archive_query_log/cli/providers.py b/archive_query_log/cli/providers.py new file mode 100644 index 00000000..f3e95883 --- /dev/null +++ b/archive_query_log/cli/providers.py @@ -0,0 +1,82 @@ +from pathlib import Path + +from click import group, option, Path as PathType, FloatRange + +from archive_query_log.cli.util import validate_split_domains, pass_config +from archive_query_log.config import Config +from archive_query_log.orm import Provider + + +@group() +def providers() -> None: + pass + + +@providers.command() +@option("--name", type=str) +@option("--description", type=str) +@option("--notes", type=str) +@option("--exclusion-reason", "--exclusion", type=str) +@option("--domains", "--domain", type=str, multiple=True, + required=True, callback=validate_split_domains) +@option("--url-path-prefixes", "--url-path-prefix", type=str, + multiple=True, required=True, metavar="PREFIXES") +@option("--priority", type=FloatRange(min=0, min_open=False)) +@pass_config +def add( + config: Config, + name: str | None, + 
description: str | None, + notes: str | None, + exclusion_reason: str | None, + domains: list[str], + url_path_prefixes: list[str], + priority: float | None, +) -> None: + from archive_query_log.providers import add_provider + Provider.init(using=config.es.client) + add_provider( + config=config, + name=name, + description=description, + notes=notes, + exclusion_reason=exclusion_reason, + domains=set(domains), + url_path_prefixes=set(url_path_prefixes), + priority=priority, + ) + + +@providers.command("import") +@option("-s", "--services-file", "services_path", + type=PathType(path_type=Path, exists=True, file_okay=True, + dir_okay=False, readable=True, resolve_path=True, + allow_dash=False), + default=Path("data") / "selected-services.yaml") +@option("-c", "--cache-dir", "cache_path", + type=PathType(path_type=Path, exists=False, file_okay=False, + dir_okay=True, readable=True, writable=True, + resolve_path=True, allow_dash=False), + default=Path("data") / "cache" / "provider-names") +@option("--review", type=int) +@option("--no-merge", is_flag=True, default=False, type=bool) +@option("--auto-merge", is_flag=True, default=False, type=bool) +@pass_config +def import_( + config: Config, + services_path: Path, + cache_path: Path, + review: int | None, + no_merge: bool, + auto_merge: bool, +) -> None: + from archive_query_log.imports.yaml import import_providers + Provider.init(using=config.es.client) + import_providers( + config=config, + services_path=services_path, + cache_path=cache_path, + review=review, + no_merge=no_merge, + auto_merge=auto_merge, + ) diff --git a/archive_query_log/cli/results.py b/archive_query_log/cli/results.py new file mode 100644 index 00000000..f31ebea2 --- /dev/null +++ b/archive_query_log/cli/results.py @@ -0,0 +1,21 @@ +from click import group + +from archive_query_log.cli.util import pass_config +from archive_query_log.config import Config + + +@group() +def results(): + pass + + +@results.group() +def download(): + pass + + 
+@download.command(help="Download archived documents of captures as WARC.") +@pass_config +def warc(config: Config) -> None: + from archive_query_log.downloaders.warc import download_results_warc + download_results_warc(config) diff --git a/archive_query_log/cli/serps.py b/archive_query_log/cli/serps.py new file mode 100644 index 00000000..bd9c8a05 --- /dev/null +++ b/archive_query_log/cli/serps.py @@ -0,0 +1,73 @@ +from click import group + +from archive_query_log.cli.util import pass_config +from archive_query_log.config import Config +from archive_query_log.orm import Serp, Result + + +@group() +def serps(): + pass + + +@serps.group() +def parse(): + pass + + +@parse.command() +@pass_config +def url_query(config: Config) -> None: + from archive_query_log.parsers.url_query import parse_serps_url_query + Serp.init(using=config.es.client) + parse_serps_url_query(config) + + +@parse.command() +@pass_config +def url_page(config: Config) -> None: + from archive_query_log.parsers.url_page import parse_serps_url_page + parse_serps_url_page(config) + + +@parse.command() +@pass_config +def url_offset(config: Config) -> None: + from archive_query_log.parsers.url_offset import parse_serps_url_offset + parse_serps_url_offset(config) + + +@parse.command() +@pass_config +def warc_query(config: Config) -> None: + from archive_query_log.parsers.warc_query import parse_serps_warc_query + parse_serps_warc_query(config) + + +@parse.command() +@pass_config +def warc_snippets(config: Config) -> None: + from archive_query_log.parsers.warc_snippets import \ + parse_serps_warc_snippets + Result.init(using=config.es.client) + parse_serps_warc_snippets(config) + + +@parse.command() +@pass_config +def warc_direct_answers(config: Config) -> None: + from archive_query_log.parsers.warc_direct_answers import \ + parse_serps_warc_direct_answers + parse_serps_warc_direct_answers(config) + + +@serps.group() +def download(): + pass + + +@download.command(help="Download archived documents of 
captures as WARC.") +@pass_config +def warc(config: Config) -> None: + from archive_query_log.downloaders.warc import download_serps_warc + download_serps_warc(config) diff --git a/archive_query_log/cli/sources.py b/archive_query_log/cli/sources.py new file mode 100644 index 00000000..c2e0a900 --- /dev/null +++ b/archive_query_log/cli/sources.py @@ -0,0 +1,28 @@ +from click import group, option + +from archive_query_log.cli.util import pass_config +from archive_query_log.config import Config +from archive_query_log.orm import Source + + +@group() +def sources(): + pass + + +@sources.command() +@option("--skip-archives", is_flag=True) +@option("--skip-providers", is_flag=True) +@pass_config +def build( + config: Config, + skip_archives: bool, + skip_providers: bool, +) -> None: + from archive_query_log.sources import build_sources + Source.init(using=config.es.client) + build_sources( + config=config, + skip_archives=skip_archives, + skip_providers=skip_providers, + ) diff --git a/archive_query_log/cli/stats.py b/archive_query_log/cli/stats.py deleted file mode 100644 index 8ecd9cf6..00000000 --- a/archive_query_log/cli/stats.py +++ /dev/null @@ -1,232 +0,0 @@ -from asyncio import run -from gzip import open as gzip_open -from json import loads -from math import inf -from pathlib import Path - -from click import option, BOOL, IntRange -from pandas import DataFrame -from tqdm.auto import tqdm - -from archive_query_log import DATA_DIRECTORY_PATH, CDX_API_URL, LOGGER -from archive_query_log.cli import main -from archive_query_log.cli.util import PathParam -from archive_query_log.config import SERVICES - -# See: -# https://github.com/internetarchive/wayback/blob/master/wayback-cdx-server/README.md#pagination-api -_URLS_PER_BLOCK = 3000 -_BLOCKS_PER_PAGE = 50 - - -def _all_archived_urls( - data_directory: Path, - focused: bool, - service: str, -) -> int: - from archive_query_log.config import SERVICES - from archive_query_log.urls.fetch import ArchivedUrlsFetcher, \ - 
UrlMatchScope - service_config = SERVICES[service] - match_scope = UrlMatchScope.PREFIX if focused else UrlMatchScope.DOMAIN - fetcher = ArchivedUrlsFetcher( - match_scope=match_scope, - include_status_codes={200}, - exclude_status_codes=set(), - include_mime_types={"text/html"}, - exclude_mime_types=set(), - cdx_api_url=CDX_API_URL, - ) - if focused: - if len(service_config.focused_url_prefixes) == 0: - LOGGER.warning( - f"No focused URL prefixes configured for service {service}." - ) - num_pages = run(fetcher.num_service_pages( - data_directory=data_directory, - focused=focused, - service=service_config, - )) - return num_pages * _BLOCKS_PER_PAGE * _URLS_PER_BLOCK - - -@main.command( - "stats", - help="Get stats for the most recent exported corpus.", -) -@option( - "-d", "--data-directory", "--data-directory-path", - type=PathParam( - exists=True, - file_okay=False, - dir_okay=True, - writable=True, - readable=False, - resolve_path=True, - path_type=Path, - ), - default=DATA_DIRECTORY_PATH -) -@option( - "-f", "--focused", - type=BOOL, - default=False, - is_flag=True, -) -@option( - "--min-rank", "--min-alexa-rank", - type=IntRange(min=1), - required=False, -) -@option( - "--max-rank", "--max-alexa-rank", - type=IntRange(min=1), - required=False, -) -@option( - "-c", "--corpus-directory", "--corpus-directory-path", - type=PathParam( - exists=False, - file_okay=False, - dir_okay=True, - writable=True, - readable=True, - resolve_path=True, - path_type=Path, - ), - required=False, -) -@option( - "-o", "--output", "--output-path", - type=PathParam( - exists=False, - file_okay=True, - dir_okay=False, - writable=True, - readable=False, - resolve_path=True, - path_type=Path, - ), - required=False, -) -def stats_command( - data_directory: Path, - focused: bool, - min_rank: int | None, - max_rank: int | None, - corpus_directory: Path | None, - output: Path | None, -) -> None: - services = SERVICES.values() - if min_rank is not None: - services = ( - service - for service 
in services - if (service.alexa_rank is not None and - service.alexa_rank >= min_rank) - ) - if max_rank is not None: - services = ( - service - for service in services - if (service.alexa_rank is not None and - service.alexa_rank <= max_rank) - ) - services = sorted(services, key=lambda service: service.alexa_rank or inf) - - results: dict[str, dict[str, int]] = { - service.name: { - "all-archived-urls": 0, - "archived-urls": 0, - "archived-query-urls": 0, - "archived-raw-serps": 0, - "archived-parsed-serps": 0, - "archived-snippets": 0, - "archived-raw-search-results": 0, - "archived-parsed-search-results": 0, - } - for service in services - } - - for service in services: - results[service.name]["all-archived-urls"] = _all_archived_urls( - data_directory, - focused, - service.name, - ) - - corpus_path: Path - if corpus_directory is not None: - corpus_path = corpus_directory - elif focused: - corpus_path = data_directory / "focused" / "corpus" - else: - corpus_path = data_directory / "corpus" - - if corpus_path.exists(): - queries_paths = sorted( - corpus_path.glob("queries-*.jsonl.gz"), - reverse=True, - ) - documents_paths = sorted( - corpus_path.glob("documents-*.jsonl.gz"), - reverse=True, - ) - if len(queries_paths) > 0 and len(documents_paths) > 0: - queries_path = queries_paths[0] - documents_path = documents_paths[0] - - with gzip_open(queries_path, "rt") as queries_file: - lines = tqdm( - queries_file, - desc="Read queries corpus" - ) - for line in lines: - query = loads(line) - service_name = query["service"] - if query["archived_url_location"] is not None: - results[service_name]["archived-urls"] += 1 - if query["archived_query_url_location"] is not None: - results[service_name]["archived-query-urls"] += 1 - if query["archived_raw_serp_location"] is not None: - results[service_name]["archived-raw-serps"] += 1 - if query["archived_parsed_serp_location"] is not None: - results[service_name]["archived-parsed-serps"] += 1 - - with gzip_open(documents_path, 
"rt") as documents_file: - lines = tqdm( - documents_file, - desc="Read documents corpus" - ) - for line in lines: - document = loads(line) - service_name = document["service"] - if document["archived_snippet_location"] is not None: - results[service_name]["archived-snippets"] += 1 - if document[ - "archived_raw_search_result_location" - ] is not None: - results[service_name][ - "archived-raw-search-results"] += 1 - if document[ - "archived_parsed_search_result_location" - ] is not None: - results[service_name][ - "archived-parsed-search-results"] += 1 - - output_path: Path - if output is not None: - output_path = output - elif focused: - output_path = data_directory / "focused" / "stats.csv" - else: - output_path = data_directory / "stats.csv" - - df = DataFrame([ - { - "service": service_name, - **service_results, - } - for service_name, service_results in results.items() - ]) - df.to_csv(output_path, index=False) diff --git a/archive_query_log/cli/util.py b/archive_query_log/cli/util.py index 52c795ce..f76144b7 100644 --- a/archive_query_log/cli/util.py +++ b/archive_query_log/cli/util.py @@ -1,69 +1,25 @@ -from typing import Dict, Any, List -from urllib.parse import urlparse +from typing import Sequence -from click import Parameter, Context -from click.shell_completion import CompletionItem -from click.types import StringParamType, Path, Choice +from click import Parameter, Context, BadParameter, make_pass_decorator +from archive_query_log.config import Config -class UrlParam(StringParamType): - name = "url" - def convert(self, value, param, ctx): - value = super().convert(value, param, ctx) - if value is None: - return None - tokens = urlparse(value) - if not tokens.scheme or not tokens.netloc: - self.fail(f"{value} is not a valid URL", param, ctx) - return value +def validate_split_domains( + _context: Context, + _parameter: Parameter, + value: Sequence[str], +) -> Sequence[str]: + valid_domains = [] + for domains in value: + for domain in domains.split(","): 
+ domain = domain.strip() + if not domain.islower(): + raise BadParameter(f"Domain must be lowercase: {domain}") + if "." not in domain: + raise BadParameter(f"Not a valid domain: {domain}") + valid_domains.append(domain) + return valid_domains -URL = UrlParam() - -PathParam = Path - - -class ServiceChoice(Choice): - - def __init__(self) -> None: - super().__init__(choices=[], case_sensitive=False) - - def _ensure_choices(self): - if len(self.choices) == 0: - from archive_query_log.config import SERVICES - self.choices = sorted(SERVICES.keys()) - - def to_info_dict(self) -> Dict[str, Any]: - self._ensure_choices() - return super().to_info_dict() - - def get_metavar(self, param: Parameter) -> str: - self._ensure_choices() - return super().get_metavar(param) - - def get_missing_message(self, param: Parameter) -> str: - self._ensure_choices() - return super().get_missing_message(param) - - def convert( - self, - value: Any, - param: Parameter | None, - ctx: Context | None, - ) -> Any: - self._ensure_choices() - return super().convert(value, param, ctx) - - def __repr__(self) -> str: - self._ensure_choices() - return super().__repr__() - - def shell_complete( - self, - ctx: Context, - param: Parameter, - incomplete: str, - ) -> List[CompletionItem]: - self._ensure_choices() - return super().shell_complete(ctx, param, incomplete) +pass_config = make_pass_decorator(Config) diff --git a/archive_query_log/config.py b/archive_query_log/config.py index 71cd35b0..1fcf4d6a 100644 --- a/archive_query_log/config.py +++ b/archive_query_log/config.py @@ -1,9 +1,136 @@ -from typing import Mapping +from dataclasses import dataclass +from functools import cached_property +from typing import Iterable, Any -from archive_query_log import DATA_DIRECTORY_PATH -from archive_query_log.model import Service -from archive_query_log.services import read_services +from dataclasses_json import DataClassJsonMixin +from elasticsearch import Elasticsearch +from elasticsearch.helpers import 
streaming_bulk +from pyrate_limiter import Limiter, RequestRate, Duration +from requests import Session +from requests_ratelimiter import LimiterAdapter +from urllib3 import Retry +from warc_s3 import WarcS3Store -# Load all services that have parsers and create the services for them. -SERVICES_PATH = DATA_DIRECTORY_PATH / "selected-services.yaml" -SERVICES: Mapping[str, Service] = read_services(SERVICES_PATH) +from archive_query_log import __version__ as version + + +@dataclass(frozen=True) +class EsConfig(DataClassJsonMixin): + host: str + port: int + username: str + password: str + max_retries: int = 5 + bulk_chunk_size: int = 500 + bulk_max_chunk_bytes: int = 100 * 1024 * 1024 + bulk_initial_backoff: int = 2 + bulk_max_backoff: int = 60 + + @cached_property + def client(self) -> Elasticsearch: + return Elasticsearch( + hosts=f"https://{self.host}:{self.port}", + http_auth=(self.username, self.password), + timeout=60, + max_retries=self.max_retries, + retry_on_status=(502, 503, 504), + retry_on_timeout=True, + ) + + def streaming_bulk( + self, + actions: Iterable[dict], + ) -> Iterable[tuple[bool, Any]]: + return streaming_bulk( + client=self.client, + actions=actions, + chunk_size=self.bulk_chunk_size, + max_chunk_bytes=self.bulk_max_chunk_bytes, + initial_backoff=self.bulk_initial_backoff, + max_backoff=self.bulk_max_backoff, + max_retries=self.max_retries, + raise_on_error=True, + raise_on_exception=True, + yield_ok=True, + ) + + def bulk(self, actions: Iterable[dict]) -> None: + for _ in self.streaming_bulk(actions): + pass + + +@dataclass(frozen=True) +class S3Config(DataClassJsonMixin): + endpoint_url: str + access_key: str + secret_key: str + bucket_name: str + + @cached_property + def warc_store(self) -> WarcS3Store: + return WarcS3Store( + endpoint_url=self.endpoint_url, + access_key=self.access_key, + secret_key=self.secret_key, + bucket_name=self.bucket_name, + max_file_records=1000, + quiet=True, + ) + + +@dataclass(frozen=True) +class 
HttpConfig(DataClassJsonMixin): + max_retries: int = 5 + + @cached_property + def session(self) -> Session: + session = Session() + session.headers.update({ + "User-Agent": f"AQL/{version} (Webis group)", + }) + _retries = Retry( + total=20, + connect=5, + read=5, + redirect=10, + status=10, + backoff_factor=1, + status_forcelist=[502, 503, 504], + respect_retry_after_header=True, + ) + _limiter = Limiter( + RequestRate(1, Duration.SECOND * 10), + ) + _adapter = LimiterAdapter( + max_retries=_retries, + limiter=_limiter, + per_host=True, + ) + # noinspection HttpUrlsUsage + session.mount("http://", _adapter) + session.mount("https://", _adapter) + return session + + @cached_property + def session_no_retry(self) -> Session: + session = Session() + session.headers.update({ + "User-Agent": f"AQL/{version} (Webis group)", + }) + _limiter = Limiter( + RequestRate(1, Duration.SECOND * 10), + ) + _adapter = LimiterAdapter( + limiter=_limiter, + ) + # noinspection HttpUrlsUsage + session.mount("http://", _adapter) + session.mount("https://", _adapter) + return session + + +@dataclass(frozen=True) +class Config(DataClassJsonMixin): + es: EsConfig + s3: S3Config + http: HttpConfig = HttpConfig() diff --git a/archive_query_log/dashboard/.browserslistrc b/archive_query_log/dashboard/.browserslistrc new file mode 100644 index 00000000..dc3bc09a --- /dev/null +++ b/archive_query_log/dashboard/.browserslistrc @@ -0,0 +1,4 @@ +> 1% +last 2 versions +not dead +not ie 11 diff --git a/archive_query_log/dashboard/.editorconfig b/archive_query_log/dashboard/.editorconfig new file mode 100644 index 00000000..7053c49a --- /dev/null +++ b/archive_query_log/dashboard/.editorconfig @@ -0,0 +1,5 @@ +[*.{js,jsx,ts,tsx,vue}] +indent_style = space +indent_size = 2 +trim_trailing_whitespace = true +insert_final_newline = true diff --git a/archive_query_log/dashboard/.gitignore b/archive_query_log/dashboard/.gitignore new file mode 100644 index 00000000..11f5d714 --- /dev/null +++ 
b/archive_query_log/dashboard/.gitignore @@ -0,0 +1,22 @@ +.DS_Store +node_modules +/dist + +# local env files +.env.local +.env.*.local + +# Log files +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* + +# Editor directories and files +.idea +.vscode +*.suo +*.ntvs* +*.njsproj +*.sln +*.sw? diff --git a/archive_query_log/dashboard/README.md b/archive_query_log/dashboard/README.md new file mode 100644 index 00000000..8a6488fc --- /dev/null +++ b/archive_query_log/dashboard/README.md @@ -0,0 +1,17 @@ +# πŸ“ˆ aql-monitoring + +Monitor and manage the crawling of the [Archive Query Log](https://github.com/webis-de/archive-query-log). + +## Installation + +```shell +npm install +``` + +TODO: Add better instructions. + +## Usage + +For starting the website, navigate to the repo and do +```npm run dev-vite``` +in your terminal diff --git a/archive_query_log/dashboard/components.d.ts b/archive_query_log/dashboard/components.d.ts new file mode 100644 index 00000000..744ee4d9 --- /dev/null +++ b/archive_query_log/dashboard/components.d.ts @@ -0,0 +1,16 @@ +/* eslint-disable */ +/* prettier-ignore */ +// @ts-nocheck +// Generated by unplugin-vue-components +// Read more: https://github.com/vuejs/core/pull/3399 +export {} + +declare module 'vue' { + export interface GlobalComponents { + Footer: typeof import('./src/components/Footer.vue')['default'] + Header: typeof import('./src/components/Header.vue')['default'] + Home: typeof import('./src/components/Home.vue')['default'] + ProgressTable: typeof import('./src/components/ProgressTable.vue')['default'] + StatisticsTable: typeof import('./src/components/StatisticsTable.vue')['default'] + } +} diff --git a/archive_query_log/dashboard/index.html b/archive_query_log/dashboard/index.html new file mode 100644 index 00000000..e1d03f93 --- /dev/null +++ b/archive_query_log/dashboard/index.html @@ -0,0 +1,14 @@ + + + + + + Archive Query Log + + + +
+ + + + diff --git a/archive_query_log/dashboard/package-lock.json b/archive_query_log/dashboard/package-lock.json new file mode 100644 index 00000000..77446dd2 --- /dev/null +++ b/archive_query_log/dashboard/package-lock.json @@ -0,0 +1,3633 @@ +{ + "name": "aql-dashboard", + "version": "0.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "aql-dashboard", + "version": "0.0.0", + "license": "ISC", + "dependencies": { + "@mdi/font": "6.2.95", + "axios": "^1.7.2", + "elastic-tiny-client": "^0.1.4", + "http-proxy": "^1.18.1", + "roboto-fontface": "*", + "vue": "^3.4.21", + "vuetify": "^3.5.8" + }, + "devDependencies": { + "@babel/types": "^7.24.0", + "@types/node": "^20.11.25", + "@vitejs/plugin-vue": "^5.0.4", + "concurrently": "^8.2.2", + "sass": "^1.71.1", + "typescript": "^5.4.2", + "unplugin-fonts": "^1.1.1", + "unplugin-vue-components": "^0.26.0", + "vite": "^5.1.5", + "vite-plugin-node-polyfills": "^0.21.0", + "vite-plugin-vuetify": "^2.0.3", + "vue-tsc": "^2.0.6" + } + }, + "node_modules/@antfu/utils": { + "version": "0.7.7", + "resolved": "https://registry.npmjs.org/@antfu/utils/-/utils-0.7.7.tgz", + "integrity": "sha512-gFPqTG7otEJ8uP6wrhDv6mqwGWYZKNvAcCq6u9hOj0c+IKCEsY4L1oC9trPq2SaWIzAfHvqfBDxF591JkMf+kg==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/@babel/helper-string-parser": { + "version": "7.24.1", + "resolved": "https://registry.npmjs.org/@babel/helper-string-parser/-/helper-string-parser-7.24.1.tgz", + "integrity": "sha512-2ofRCjnnA9y+wk8b9IAREroeUP02KHp431N2mhKniy2yKIDKpbrHv9eXwm8cBeWQYcJmzv5qKCu65P47eCF7CQ==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/helper-validator-identifier": { + "version": "7.22.20", + "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.22.20.tgz", + "integrity": 
"sha512-Y4OZ+ytlatR8AI+8KZfKuL5urKp7qey08ha31L8b3BwewJAoJamTzyvxPR/5D+KkdJCGPq/+8TukHBlY10FX9A==", + "dev": true, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/parser": { + "version": "7.24.4", + "resolved": "https://registry.npmjs.org/@babel/parser/-/parser-7.24.4.tgz", + "integrity": "sha512-zTvEBcghmeBma9QIGunWevvBAp4/Qu9Bdq+2k0Ot4fVMD6v3dsC9WOcRSKk7tRRyBM/53yKMJko9xOatGQAwSg==", + "bin": { + "parser": "bin/babel-parser.js" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@babel/runtime": { + "version": "7.24.4", + "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.24.4.tgz", + "integrity": "sha512-dkxf7+hn8mFBwKjs9bvBlArzLVxVbS8usaPUDd5p2a9JCL9tB8OaOVN1isD4+Xyk4ns89/xeOmbQvgdK7IIVdA==", + "dev": true, + "dependencies": { + "regenerator-runtime": "^0.14.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@babel/types": { + "version": "7.24.0", + "resolved": "https://registry.npmjs.org/@babel/types/-/types-7.24.0.tgz", + "integrity": "sha512-+j7a5c253RfKh8iABBhywc8NSfP5LURe7Uh4qpsh6jc+aLJguvmIUBdjSdEMQv2bENrCR5MfRdjGo7vzS/ob7w==", + "dev": true, + "dependencies": { + "@babel/helper-string-parser": "^7.23.4", + "@babel/helper-validator-identifier": "^7.22.20", + "to-fast-properties": "^2.0.0" + }, + "engines": { + "node": ">=6.9.0" + } + }, + "node_modules/@esbuild/aix-ppc64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/aix-ppc64/-/aix-ppc64-0.20.2.tgz", + "integrity": "sha512-D+EBOJHXdNZcLJRBkhENNG8Wji2kgc9AZ9KiPr1JuZjsNtyHzrsfLRrY0tk2H2aoFu6RANO1y1iPPUCDYWkb5g==", + "cpu": [ + "ppc64" + ], + "optional": true, + "os": [ + "aix" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm/-/android-arm-0.20.2.tgz", + "integrity": "sha512-t98Ra6pw2VaDhqNWO2Oph2LXbz/EJcnLmKLGBJwEwXX/JAN83Fym1rU8l0JUWK6HkIbWONCSSatf4sf2NBRx/w==", + "cpu": [ + "arm" + ], + 
"optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-arm64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-arm64/-/android-arm64-0.20.2.tgz", + "integrity": "sha512-mRzjLacRtl/tWU0SvD8lUEwb61yP9cqQo6noDZP/O8VkwafSYwZ4yWy24kan8jE/IMERpYncRt2dw438LP3Xmg==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/android-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/android-x64/-/android-x64-0.20.2.tgz", + "integrity": "sha512-btzExgV+/lMGDDa194CcUQm53ncxzeBrWJcncOBxuC6ndBkKxnHdFJn86mCIgTELsooUmwUm9FkhSp5HYu00Rg==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-arm64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-arm64/-/darwin-arm64-0.20.2.tgz", + "integrity": "sha512-4J6IRT+10J3aJH3l1yzEg9y3wkTDgDk7TSDFX+wKFiWjqWp/iCfLIYzGyasx9l0SAFPT1HwSCR+0w/h1ES/MjA==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/darwin-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/darwin-x64/-/darwin-x64-0.20.2.tgz", + "integrity": "sha512-tBcXp9KNphnNH0dfhv8KYkZhjc+H3XBkF5DKtswJblV7KlT9EI2+jeA8DgBjp908WEuYll6pF+UStUCfEpdysA==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/freebsd-arm64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-arm64/-/freebsd-arm64-0.20.2.tgz", + "integrity": "sha512-d3qI41G4SuLiCGCFGUrKsSeTXyWG6yem1KcGZVS+3FYlYhtNoNgYrWcvkOoaqMhwXSMrZRl69ArHsGJ9mYdbbw==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + 
}, + "node_modules/@esbuild/freebsd-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/freebsd-x64/-/freebsd-x64-0.20.2.tgz", + "integrity": "sha512-d+DipyvHRuqEeM5zDivKV1KuXn9WeRX6vqSqIDgwIfPQtwMP4jaDsQsDncjTDDsExT4lR/91OLjRo8bmC1e+Cw==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "freebsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm/-/linux-arm-0.20.2.tgz", + "integrity": "sha512-VhLPeR8HTMPccbuWWcEUD1Az68TqaTYyj6nfE4QByZIQEQVWBB8vup8PpR7y1QHL3CpcF6xd5WVBU/+SBEvGTg==", + "cpu": [ + "arm" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-arm64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-arm64/-/linux-arm64-0.20.2.tgz", + "integrity": "sha512-9pb6rBjGvTFNira2FLIWqDk/uaf42sSyLE8j1rnUpuzsODBq7FvpwHYZxQ/It/8b+QOS1RYfqgGFNLRI+qlq2A==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ia32": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ia32/-/linux-ia32-0.20.2.tgz", + "integrity": "sha512-o10utieEkNPFDZFQm9CoP7Tvb33UutoJqg3qKf1PWVeeJhJw0Q347PxMvBgVVFgouYLGIhFYG0UGdBumROyiig==", + "cpu": [ + "ia32" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-loong64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-loong64/-/linux-loong64-0.20.2.tgz", + "integrity": "sha512-PR7sp6R/UC4CFVomVINKJ80pMFlfDfMQMYynX7t1tNTeivQ6XdX5r2XovMmha/VjR1YN/HgHWsVcTRIMkymrgQ==", + "cpu": [ + "loong64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-mips64el": { + "version": "0.20.2", + "resolved": 
"https://registry.npmjs.org/@esbuild/linux-mips64el/-/linux-mips64el-0.20.2.tgz", + "integrity": "sha512-4BlTqeutE/KnOiTG5Y6Sb/Hw6hsBOZapOVF6njAESHInhlQAghVVZL1ZpIctBOoTFbQyGW+LsVYZ8lSSB3wkjA==", + "cpu": [ + "mips64el" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-ppc64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-ppc64/-/linux-ppc64-0.20.2.tgz", + "integrity": "sha512-rD3KsaDprDcfajSKdn25ooz5J5/fWBylaaXkuotBDGnMnDP1Uv5DLAN/45qfnf3JDYyJv/ytGHQaziHUdyzaAg==", + "cpu": [ + "ppc64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-riscv64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-riscv64/-/linux-riscv64-0.20.2.tgz", + "integrity": "sha512-snwmBKacKmwTMmhLlz/3aH1Q9T8v45bKYGE3j26TsaOVtjIag4wLfWSiZykXzXuE1kbCE+zJRmwp+ZbIHinnVg==", + "cpu": [ + "riscv64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-s390x": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-s390x/-/linux-s390x-0.20.2.tgz", + "integrity": "sha512-wcWISOobRWNm3cezm5HOZcYz1sKoHLd8VL1dl309DiixxVFoFe/o8HnwuIwn6sXre88Nwj+VwZUvJf4AFxkyrQ==", + "cpu": [ + "s390x" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/linux-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/linux-x64/-/linux-x64-0.20.2.tgz", + "integrity": "sha512-1MdwI6OOTsfQfek8sLwgyjOXAu+wKhLEoaOLTjbijk6E2WONYpH9ZU2mNtR+lZ2B4uwr+usqGuVfFT9tMtGvGw==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/netbsd-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/netbsd-x64/-/netbsd-x64-0.20.2.tgz", + "integrity": 
"sha512-K8/DhBxcVQkzYc43yJXDSyjlFeHQJBiowJ0uVL6Tor3jGQfSGHNNJcWxNbOI8v5k82prYqzPuwkzHt3J1T1iZQ==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "netbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/openbsd-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/openbsd-x64/-/openbsd-x64-0.20.2.tgz", + "integrity": "sha512-eMpKlV0SThJmmJgiVyN9jTPJ2VBPquf6Kt/nAoo6DgHAoN57K15ZghiHaMvqjCye/uU4X5u3YSMgVBI1h3vKrQ==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "openbsd" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/sunos-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/sunos-x64/-/sunos-x64-0.20.2.tgz", + "integrity": "sha512-2UyFtRC6cXLyejf/YEld4Hajo7UHILetzE1vsRcGL3earZEW77JxrFjH4Ez2qaTiEfMgAXxfAZCm1fvM/G/o8w==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "sunos" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-arm64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-arm64/-/win32-arm64-0.20.2.tgz", + "integrity": "sha512-GRibxoawM9ZCnDxnP3usoUDO9vUkpAxIIZ6GQI+IlVmr5kP3zUq+l17xELTHMWTWzjxa2guPNyrpq1GWmPvcGQ==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-ia32": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-ia32/-/win32-ia32-0.20.2.tgz", + "integrity": "sha512-HfLOfn9YWmkSKRQqovpnITazdtquEW8/SoHW7pWpuEeguaZI4QnCRW6b+oZTztdBnZOS2hqJ6im/D5cPzBTTlQ==", + "cpu": [ + "ia32" + ], + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@esbuild/win32-x64": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/@esbuild/win32-x64/-/win32-x64-0.20.2.tgz", + "integrity": "sha512-N49X4lJX27+l9jbLKSqZ6bKNjzQvHaT8IIFUy+YIqmXQdjYCToGWwOItDrfby14c78aDd5NHQl29xingXfCdLQ==", + "cpu": [ + "x64" 
+ ], + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">=12" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.4.15", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.15.tgz", + "integrity": "sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg==" + }, + "node_modules/@mdi/font": { + "version": "6.2.95", + "resolved": "https://registry.npmjs.org/@mdi/font/-/font-6.2.95.tgz", + "integrity": "sha512-0RKkhabkFZP3ALwKqrjhdKdhydQpoydIjX6cvjIwLyjADCsE0pG68YkGY+S3qnfdErmhS4m8adwvgrAFXp2AYQ==" + }, + "node_modules/@nodelib/fs.scandir": { + "version": "2.1.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.scandir/-/fs.scandir-2.1.5.tgz", + "integrity": "sha512-vq24Bq3ym5HEQm2NKCr3yXDwjc7vTsEThRDnkp2DK9p1uqLR+DHurm/NOTo0KG7HYHU7eppKZj3MyqYuMBf62g==", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "2.0.5", + "run-parallel": "^1.1.9" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.stat": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-2.0.5.tgz", + "integrity": "sha512-RkhPPp2zrqDAQA/2jNhnztcPAlv64XdhIp7a7454A5ovI7Bukxgt7MX7udwAu3zg1DcpPU0rz3VV1SeaqvY4+A==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@nodelib/fs.walk": { + "version": "1.2.8", + "resolved": "https://registry.npmjs.org/@nodelib/fs.walk/-/fs.walk-1.2.8.tgz", + "integrity": "sha512-oGB+UxlgWcgQkgwo8GcEGwemoTFt3FIO9ababBmaGwXIoBKZ+GTy0pP185beGg7Llih/NSHSV2XAs1lnznocSg==", + "dev": true, + "dependencies": { + "@nodelib/fs.scandir": "2.1.5", + "fastq": "^1.6.0" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/@rollup/plugin-inject": { + "version": "5.0.5", + "resolved": "https://registry.npmjs.org/@rollup/plugin-inject/-/plugin-inject-5.0.5.tgz", + "integrity": "sha512-2+DEJbNBoPROPkgTDNe8/1YXWcqxbN5DTjASVIOx8HS+pITXushyNiBV56RB08zuptzz8gT3YfkqriTBVycepg==", + 
"dev": true, + "dependencies": { + "@rollup/pluginutils": "^5.0.1", + "estree-walker": "^2.0.2", + "magic-string": "^0.30.3" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "rollup": "^1.20.0||^2.0.0||^3.0.0||^4.0.0" + }, + "peerDependenciesMeta": { + "rollup": { + "optional": true + } + } + }, + "node_modules/@rollup/pluginutils": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/@rollup/pluginutils/-/pluginutils-5.1.0.tgz", + "integrity": "sha512-XTIWOPPcpvyKI6L1NHo0lFlCyznUEyPmPY1mc3KpPVDYulHSTvyeLNVW00QTLIAFNhR3kYnJTQHeGqU4M3n09g==", + "dev": true, + "dependencies": { + "@types/estree": "^1.0.0", + "estree-walker": "^2.0.2", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=14.0.0" + }, + "peerDependencies": { + "rollup": "^1.20.0||^2.0.0||^3.0.0||^4.0.0" + }, + "peerDependenciesMeta": { + "rollup": { + "optional": true + } + } + }, + "node_modules/@rollup/rollup-android-arm-eabi": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.16.3.tgz", + "integrity": "sha512-1ACInKIT0pXmTYuPoJAL8sOT0lV3PEACFSVxnD03hGIojJ1CmbzZmLJyk2xew+yxqTlmx7xydkiJcBzdp0V+AQ==", + "cpu": [ + "arm" + ], + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-android-arm64": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm64/-/rollup-android-arm64-4.16.3.tgz", + "integrity": "sha512-vGl+Bny8cawCM7ExugzqEB8ke3t7Pm9/mo+ciA9kJh6pMuNyM+31qhewMwHwseDZ/LtdW0SCocW1CsMxcq1Lsg==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "android" + ] + }, + "node_modules/@rollup/rollup-darwin-arm64": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-arm64/-/rollup-darwin-arm64-4.16.3.tgz", + "integrity": "sha512-Lj8J9WzQRvfWO4GfI+bBkIThUFV1PtI+es/YH/3cwUQ+edXu8Mre0JRJfRrAeRjPiHDPFFZaX51zfgHHEhgRAg==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "darwin" 
+ ] + }, + "node_modules/@rollup/rollup-darwin-x64": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-darwin-x64/-/rollup-darwin-x64-4.16.3.tgz", + "integrity": "sha512-NPPOXMTIWJk50lgZmRReEYJFvLG5rgMDzaVauWNB2MgFQYm9HuNXQdVVg3iEZ3A5StIzxhMlPjVyS5fsv4PJmg==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "darwin" + ] + }, + "node_modules/@rollup/rollup-linux-arm-gnueabihf": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-gnueabihf/-/rollup-linux-arm-gnueabihf-4.16.3.tgz", + "integrity": "sha512-ij4tv1XtWcDScaTgoMnvDEYZ2Wjl2ZhDFEyftjBKu6sNNLHIkKuXBol/bVSh+md5zSJ6em9hUXyPO3cVPCsl4Q==", + "cpu": [ + "arm" + ], + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm-musleabihf": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm-musleabihf/-/rollup-linux-arm-musleabihf-4.16.3.tgz", + "integrity": "sha512-MTMAl30dzcfYB+smHe1sJuS2P1/hB8pqylkCe0/8/Lo8CADjy/eM8x43nBoR5eqcYgpOtCh7IgHpvqSMAE38xw==", + "cpu": [ + "arm" + ], + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-gnu": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-gnu/-/rollup-linux-arm64-gnu-4.16.3.tgz", + "integrity": "sha512-vY3fAg6JLDoNh781HHHMPvt8K6RWG3OmEj3xI9BOFSQTD5PNaGKvCB815MyGlDnFYUw7lH+WvvQqoBwLtRDR1A==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-arm64-musl": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-arm64-musl/-/rollup-linux-arm64-musl-4.16.3.tgz", + "integrity": "sha512-61SpQGBSb8QkfV/hUYWezlEig4ro55t8NcE5wWmy1bqRsRVHCEDkF534d+Lln/YeLUoSWtJHvvG3bx9lH/S6uA==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-powerpc64le-gnu": { + "version": "4.16.3", + "resolved": 
"https://registry.npmjs.org/@rollup/rollup-linux-powerpc64le-gnu/-/rollup-linux-powerpc64le-gnu-4.16.3.tgz", + "integrity": "sha512-4XGexJthsNhEEgv/zK4/NnAOjYKoeCsIoT+GkqTY2u3rse0lbJ8ft1bpDCdlkvifsLDL2uwe4fn8PLR4IMTKQQ==", + "cpu": [ + "ppc64" + ], + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-riscv64-gnu": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-riscv64-gnu/-/rollup-linux-riscv64-gnu-4.16.3.tgz", + "integrity": "sha512-/pArXjqnEdhbQ1qe4CTTlJ6/GjWGdWNRucKAp4fqKnKf7QC0BES3QEV34ACumHHQ4uEGt4GctF2ISCMRhkli0A==", + "cpu": [ + "riscv64" + ], + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-s390x-gnu": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-s390x-gnu/-/rollup-linux-s390x-gnu-4.16.3.tgz", + "integrity": "sha512-vu4f3Y8iwjtRfSZdmtP8nC1jmRx1IrRVo2cLQlQfpFZ0e2AE9YbPgfIzpuK+i3C4zFETaLLNGezbBns2NuS/uA==", + "cpu": [ + "s390x" + ], + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-gnu": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-gnu/-/rollup-linux-x64-gnu-4.16.3.tgz", + "integrity": "sha512-n4HEgIJulNSmAKT3SYF/1wuzf9od14woSBseNkzur7a+KJIbh2Jb+J9KIsdGt3jJnsLW0BT1Sj6MiwL4Zzku6Q==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-linux-x64-musl": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-linux-x64-musl/-/rollup-linux-x64-musl-4.16.3.tgz", + "integrity": "sha512-guO/4N1884ig2AzTKPc6qA7OTnFMUEg/X2wiesywRO1eRD7FzHiaiTQQOLFmnUXWj2pgQXIT1g5g3e2RpezXcQ==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "linux" + ] + }, + "node_modules/@rollup/rollup-win32-arm64-msvc": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-arm64-msvc/-/rollup-win32-arm64-msvc-4.16.3.tgz", + "integrity": 
"sha512-+rxD3memdkhGz0NhNqbYHXBoA33MoHBK4uubZjF1IeQv1Psi6tqgsCcC6vwQjxBM1qoCqOQQBy0cgNbbZKnGUg==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-ia32-msvc": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-ia32-msvc/-/rollup-win32-ia32-msvc-4.16.3.tgz", + "integrity": "sha512-0NxVbLhBXmwANWWbgZY/RdSkeuHEgF+u8Dc0qBowUVBYsR2y2vwVGjKgUcj1wtu3jpjs057io5g9HAPr3Icqjg==", + "cpu": [ + "ia32" + ], + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@rollup/rollup-win32-x64-msvc": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/@rollup/rollup-win32-x64-msvc/-/rollup-win32-x64-msvc-4.16.3.tgz", + "integrity": "sha512-hutnZavtOx/G4uVdgoZz5279By9NVbgmxOmGGgnzUjZYuwp2+NzGq6KXQmHXBWz7W/vottXn38QmKYAdQLa/vQ==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "win32" + ] + }, + "node_modules/@types/estree": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/@types/estree/-/estree-1.0.5.tgz", + "integrity": "sha512-/kYRxGDLWzHOB7q+wtSUQlFrtcdUccpfy+X+9iMBpHK8QLLhx2wIPYuS5DYtR9Wa/YlZAbIovy7qVdB1Aq6Lyw==", + "devOptional": true + }, + "node_modules/@types/node": { + "version": "20.12.7", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz", + "integrity": "sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==", + "devOptional": true, + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/@vitejs/plugin-vue": { + "version": "5.0.4", + "resolved": "https://registry.npmjs.org/@vitejs/plugin-vue/-/plugin-vue-5.0.4.tgz", + "integrity": "sha512-WS3hevEszI6CEVEx28F8RjTX97k3KsrcY6kvTg7+Whm5y3oYvcqzVeGCU3hxSAn4uY2CLCkeokkGKpoctccilQ==", + "dev": true, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "peerDependencies": { + "vite": "^5.0.0", + "vue": "^3.2.25" + } + }, + "node_modules/@volar/language-core": { + "version": "2.2.0-alpha.10", + 
"resolved": "https://registry.npmjs.org/@volar/language-core/-/language-core-2.2.0-alpha.10.tgz", + "integrity": "sha512-njVJLtpu0zMvDaEk7K5q4BRpOgbyEUljU++un9TfJoJNhxG0z/hWwpwgTRImO42EKvwIxF3XUzeMk+qatAFy7Q==", + "dev": true, + "dependencies": { + "@volar/source-map": "2.2.0-alpha.10" + } + }, + "node_modules/@volar/source-map": { + "version": "2.2.0-alpha.10", + "resolved": "https://registry.npmjs.org/@volar/source-map/-/source-map-2.2.0-alpha.10.tgz", + "integrity": "sha512-nrdWApVkP5cksAnDEyy1JD9rKdwOJsEq1B+seWO4vNXmZNcxQQCx4DULLBvKt7AzRUAQiAuw5aQkb9RBaSqdVA==", + "dev": true, + "dependencies": { + "muggle-string": "^0.4.0" + } + }, + "node_modules/@volar/typescript": { + "version": "2.2.0-alpha.10", + "resolved": "https://registry.npmjs.org/@volar/typescript/-/typescript-2.2.0-alpha.10.tgz", + "integrity": "sha512-GCa0vTVVdA9ULUsu2Rx7jwsIuyZQPvPVT9o3NrANTbYv+523Ao1gv3glC5vzNSDPM6bUl37r94HbCj7KINQr+g==", + "dev": true, + "dependencies": { + "@volar/language-core": "2.2.0-alpha.10", + "path-browserify": "^1.0.1" + } + }, + "node_modules/@vue/compiler-core": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/@vue/compiler-core/-/compiler-core-3.4.24.tgz", + "integrity": "sha512-vbW/tgbwJYj62N/Ww99x0zhFTkZDTcGh3uwJEuadZ/nF9/xuFMC4693P9r+3sxGXISABpDKvffY5ApH9pmdd1A==", + "dependencies": { + "@babel/parser": "^7.24.4", + "@vue/shared": "3.4.24", + "entities": "^4.5.0", + "estree-walker": "^2.0.2", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-dom": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/@vue/compiler-dom/-/compiler-dom-3.4.24.tgz", + "integrity": "sha512-4XgABML/4cNndVsQndG6BbGN7+EoisDwi3oXNovqL/4jdNhwvP8/rfRMTb6FxkxIxUUtg6AI1/qZvwfSjxJiWA==", + "dependencies": { + "@vue/compiler-core": "3.4.24", + "@vue/shared": "3.4.24" + } + }, + "node_modules/@vue/compiler-sfc": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/@vue/compiler-sfc/-/compiler-sfc-3.4.24.tgz", + "integrity": 
"sha512-nRAlJUK02FTWfA2nuvNBAqsDZuERGFgxZ8sGH62XgFSvMxO2URblzulExsmj4gFZ8e+VAyDooU9oAoXfEDNxTA==", + "dependencies": { + "@babel/parser": "^7.24.4", + "@vue/compiler-core": "3.4.24", + "@vue/compiler-dom": "3.4.24", + "@vue/compiler-ssr": "3.4.24", + "@vue/shared": "3.4.24", + "estree-walker": "^2.0.2", + "magic-string": "^0.30.10", + "postcss": "^8.4.38", + "source-map-js": "^1.2.0" + } + }, + "node_modules/@vue/compiler-ssr": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/@vue/compiler-ssr/-/compiler-ssr-3.4.24.tgz", + "integrity": "sha512-ZsAtr4fhaUFnVcDqwW3bYCSDwq+9Gk69q2r/7dAHDrOMw41kylaMgOP4zRnn6GIEJkQznKgrMOGPMFnLB52RbQ==", + "dependencies": { + "@vue/compiler-dom": "3.4.24", + "@vue/shared": "3.4.24" + } + }, + "node_modules/@vue/language-core": { + "version": "2.0.14", + "resolved": "https://registry.npmjs.org/@vue/language-core/-/language-core-2.0.14.tgz", + "integrity": "sha512-3q8mHSNcGTR7sfp2X6jZdcb4yt8AjBXAfKk0qkZIh7GAJxOnoZ10h5HToZglw4ToFvAnq+xu/Z2FFbglh9Icag==", + "dev": true, + "dependencies": { + "@volar/language-core": "2.2.0-alpha.10", + "@vue/compiler-dom": "^3.4.0", + "@vue/shared": "^3.4.0", + "computeds": "^0.0.1", + "minimatch": "^9.0.3", + "path-browserify": "^1.0.1", + "vue-template-compiler": "^2.7.14" + }, + "peerDependencies": { + "typescript": "*" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/@vue/reactivity": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/@vue/reactivity/-/reactivity-3.4.24.tgz", + "integrity": "sha512-nup3fSYg4i4LtNvu9slF/HF/0dkMQYfepUdORBcMSsankzRPzE7ypAFurpwyRBfU1i7Dn1kcwpYsE1wETSh91g==", + "dependencies": { + "@vue/shared": "3.4.24" + } + }, + "node_modules/@vue/runtime-core": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/@vue/runtime-core/-/runtime-core-3.4.24.tgz", + "integrity": "sha512-c7iMfj6cJMeAG3s5yOn9Rc5D9e2/wIuaozmGf/ICGCY3KV5H7mbTVdvEkd4ZshTq7RUZqj2k7LMJWVx+EBiY1g==", + "dependencies": { + 
"@vue/reactivity": "3.4.24", + "@vue/shared": "3.4.24" + } + }, + "node_modules/@vue/runtime-dom": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/@vue/runtime-dom/-/runtime-dom-3.4.24.tgz", + "integrity": "sha512-uXKzuh/Emfad2Y7Qm0ABsLZZV6H3mAJ5ZVqmAOlrNQRf+T5mxpPGZBfec1hkP41t6h6FwF6RSGCs/gd8WbuySQ==", + "dependencies": { + "@vue/runtime-core": "3.4.24", + "@vue/shared": "3.4.24", + "csstype": "^3.1.3" + } + }, + "node_modules/@vue/server-renderer": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/@vue/server-renderer/-/server-renderer-3.4.24.tgz", + "integrity": "sha512-H+DLK4sQF6sRgzKyofmlEVBIV/9KrQU6HIV7nt6yIwSGGKvSwlV8pqJlebUKLpbXaNHugdSfAbP6YmXF69lxow==", + "dependencies": { + "@vue/compiler-ssr": "3.4.24", + "@vue/shared": "3.4.24" + }, + "peerDependencies": { + "vue": "3.4.24" + } + }, + "node_modules/@vue/shared": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/@vue/shared/-/shared-3.4.24.tgz", + "integrity": "sha512-BW4tajrJBM9AGAknnyEw5tO2xTmnqgup0VTnDAMcxYmqOX0RG0b9aSUGAbEKolD91tdwpA6oCwbltoJoNzpItw==" + }, + "node_modules/@vuetify/loader-shared": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/@vuetify/loader-shared/-/loader-shared-2.0.3.tgz", + "integrity": "sha512-Ss3GC7eJYkp2SF6xVzsT7FAruEmdihmn4OCk2+UocREerlXKWgOKKzTN5PN3ZVN5q05jHHrsNhTuWbhN61Bpdg==", + "devOptional": true, + "dependencies": { + "upath": "^2.0.1" + }, + "peerDependencies": { + "vue": "^3.0.0", + "vuetify": "^3.0.0" + } + }, + "node_modules/acorn": { + "version": "8.11.3", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", + "integrity": "sha512-Y9rRfJG5jcKOE0CLisYbojUjIrIEE7AGMzA/Sm4BslANhbS+cDMpgBdcPT91oJ7OuJ9hYJBx59RjbhxVnrF8Xg==", + "dev": true, + "bin": { + "acorn": "bin/acorn" + }, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/ansi-regex": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", + "integrity": 
"sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/ansi-styles": { + "version": "4.3.0", + "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", + "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", + "dev": true, + "dependencies": { + "color-convert": "^2.0.1" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/chalk/ansi-styles?sponsor=1" + } + }, + "node_modules/anymatch": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz", + "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==", + "devOptional": true, + "dependencies": { + "normalize-path": "^3.0.0", + "picomatch": "^2.0.4" + }, + "engines": { + "node": ">= 8" + } + }, + "node_modules/asn1.js": { + "version": "4.10.1", + "resolved": "https://registry.npmjs.org/asn1.js/-/asn1.js-4.10.1.tgz", + "integrity": "sha512-p32cOF5q0Zqs9uBiONKYLm6BClCoBCM5O9JfeUSlnQLBTxYdTK+pW+nXflm8UkKd2UYlEbYz5qEi0JuZR9ckSw==", + "dev": true, + "dependencies": { + "bn.js": "^4.0.0", + "inherits": "^2.0.1", + "minimalistic-assert": "^1.0.0" + } + }, + "node_modules/asn1.js/node_modules/bn.js": { + "version": "4.12.0", + "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-4.12.0.tgz", + "integrity": "sha512-c98Bf3tPniI+scsdk237ku1Dc3ujXQTSgyiPUDEOe7tRkhrqridvh8klBv0HCEso1OLOYcHuCv/cS6DNxKH+ZA==", + "dev": true + }, + "node_modules/assert": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/assert/-/assert-2.1.0.tgz", + "integrity": "sha512-eLHpSK/Y4nhMJ07gDaAzoX/XAKS8PSaojml3M0DM4JpV1LAi5JOJ/p6H/XWrl8L+DzVEvVCW1z3vWAaB9oTsQw==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.2", + "is-nan": "^1.3.2", + "object-is": "^1.1.5", + "object.assign": "^4.1.4", + "util": "^0.12.5" + } + 
}, + "node_modules/asynckit": { + "version": "0.4.0", + "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz", + "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==" + }, + "node_modules/available-typed-arrays": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/available-typed-arrays/-/available-typed-arrays-1.0.7.tgz", + "integrity": "sha512-wvUjBtSGN7+7SjNpq/9M2Tg350UZD3q62IFZLbRAR1bSMlCo1ZaeW+BJ+D090e4hIIZLBcTDWe4Mh4jvUDajzQ==", + "dev": true, + "dependencies": { + "possible-typed-array-names": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/axios": { + "version": "1.7.2", + "resolved": "https://registry.npmjs.org/axios/-/axios-1.7.2.tgz", + "integrity": "sha512-2A8QhOMrbomlDuiLeK9XibIBzuHeRcqqNOHp0Cyp5EoJ1IFDh+XZH3A6BkXtv0K4gFGCI0Y4BM7B1wOEi0Rmgw==", + "dependencies": { + "follow-redirects": "^1.15.6", + "form-data": "^4.0.0", + "proxy-from-env": "^1.1.0" + } + }, + "node_modules/balanced-match": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", + "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", + "dev": true + }, + "node_modules/base64-js": { + "version": "1.5.1", + "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", + "integrity": "sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, + "node_modules/binary-extensions": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz", + 
"integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==", + "devOptional": true, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/bn.js": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-5.2.1.tgz", + "integrity": "sha512-eXRvHzWyYPBuB4NBy0cmYQjGitUrtqwbvlzP3G6VFnNRbsZQIxQ10PbKKHt8gZ/HW/D/747aDl+QkDqg3KQLMQ==", + "dev": true + }, + "node_modules/brace-expansion": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.1.tgz", + "integrity": "sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==", + "dev": true, + "dependencies": { + "balanced-match": "^1.0.0" + } + }, + "node_modules/braces": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz", + "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==", + "devOptional": true, + "dependencies": { + "fill-range": "^7.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/brorand": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/brorand/-/brorand-1.1.0.tgz", + "integrity": "sha512-cKV8tMCEpQs4hK/ik71d6LrPOnpkpGBR0wzxqr68g2m/LB2GxVYQroAjMJZRVM1Y4BCjCKc3vAamxSzOY2RP+w==", + "dev": true + }, + "node_modules/browser-resolve": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/browser-resolve/-/browser-resolve-2.0.0.tgz", + "integrity": "sha512-7sWsQlYL2rGLy2IWm8WL8DCTJvYLc/qlOnsakDac87SOoCd16WLsaAMdCiAqsTNHIe+SXfaqyxyo6THoWqs8WQ==", + "dev": true, + "dependencies": { + "resolve": "^1.17.0" + } + }, + "node_modules/browserify-aes": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/browserify-aes/-/browserify-aes-1.2.0.tgz", + "integrity": 
"sha512-+7CHXqGuspUn/Sl5aO7Ea0xWGAtETPXNSAjHo48JfLdPWcMng33Xe4znFvQweqc/uzk5zSOI3H52CYnjCfb5hA==", + "dev": true, + "dependencies": { + "buffer-xor": "^1.0.3", + "cipher-base": "^1.0.0", + "create-hash": "^1.1.0", + "evp_bytestokey": "^1.0.3", + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/browserify-cipher": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/browserify-cipher/-/browserify-cipher-1.0.1.tgz", + "integrity": "sha512-sPhkz0ARKbf4rRQt2hTpAHqn47X3llLkUGn+xEJzLjwY8LRs2p0v7ljvI5EyoRO/mexrNunNECisZs+gw2zz1w==", + "dev": true, + "dependencies": { + "browserify-aes": "^1.0.4", + "browserify-des": "^1.0.0", + "evp_bytestokey": "^1.0.0" + } + }, + "node_modules/browserify-des": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/browserify-des/-/browserify-des-1.0.2.tgz", + "integrity": "sha512-BioO1xf3hFwz4kc6iBhI3ieDFompMhrMlnDFC4/0/vd5MokpuAc3R+LYbwTA9A5Yc9pq9UYPqffKpW2ObuwX5A==", + "dev": true, + "dependencies": { + "cipher-base": "^1.0.1", + "des.js": "^1.0.0", + "inherits": "^2.0.1", + "safe-buffer": "^5.1.2" + } + }, + "node_modules/browserify-rsa": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/browserify-rsa/-/browserify-rsa-4.1.0.tgz", + "integrity": "sha512-AdEER0Hkspgno2aR97SAf6vi0y0k8NuOpGnVH3O99rcA5Q6sh8QxcngtHuJ6uXwnfAXNM4Gn1Gb7/MV1+Ymbog==", + "dev": true, + "dependencies": { + "bn.js": "^5.0.0", + "randombytes": "^2.0.1" + } + }, + "node_modules/browserify-sign": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/browserify-sign/-/browserify-sign-4.2.3.tgz", + "integrity": "sha512-JWCZW6SKhfhjJxO8Tyiiy+XYB7cqd2S5/+WeYHsKdNKFlCBhKbblba1A/HN/90YwtxKc8tCErjffZl++UNmGiw==", + "dev": true, + "dependencies": { + "bn.js": "^5.2.1", + "browserify-rsa": "^4.1.0", + "create-hash": "^1.2.0", + "create-hmac": "^1.1.7", + "elliptic": "^6.5.5", + "hash-base": "~3.0", + "inherits": "^2.0.4", + "parse-asn1": "^5.1.7", + "readable-stream": "^2.3.8", + "safe-buffer": 
"^5.2.1" + }, + "engines": { + "node": ">= 0.12" + } + }, + "node_modules/browserify-sign/node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "dev": true, + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/browserify-sign/node_modules/readable-stream/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "dev": true + }, + "node_modules/browserify-sign/node_modules/string_decoder": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", + "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "dev": true, + "dependencies": { + "safe-buffer": "~5.1.0" + } + }, + "node_modules/browserify-sign/node_modules/string_decoder/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "dev": true + }, + "node_modules/browserify-zlib": { + "version": "0.2.0", + "resolved": "https://registry.npmjs.org/browserify-zlib/-/browserify-zlib-0.2.0.tgz", + "integrity": "sha512-Z942RysHXmJrhqk88FmKBVq/v5tqmSkDz7p54G/MGyjMnCFFnC79XWNbg+Vta8W6Wb2qtSZTSxIGkJrRpCFEiA==", + "dev": true, + "dependencies": { + "pako": "~1.0.5" + } + }, + "node_modules/buffer": { + "version": "5.7.1", + "resolved": 
"https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz", + "integrity": "sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "dependencies": { + "base64-js": "^1.3.1", + "ieee754": "^1.1.13" + } + }, + "node_modules/buffer-xor": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/buffer-xor/-/buffer-xor-1.0.3.tgz", + "integrity": "sha512-571s0T7nZWK6vB67HI5dyUF7wXiNcfaPPPTl6zYCNApANjIvYJTg7hlud/+cJpdAhS7dVzqMLmfhfHR3rAcOjQ==", + "dev": true + }, + "node_modules/builtin-status-codes": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/builtin-status-codes/-/builtin-status-codes-3.0.0.tgz", + "integrity": "sha512-HpGFw18DgFWlncDfjTa2rcQ4W88O1mC8e8yZ2AvQY5KDaktSTwo+KRf6nHK6FRI5FyRyb/5T6+TSxfP7QyGsmQ==", + "dev": true + }, + "node_modules/call-bind": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", + "integrity": "sha512-GHTSNSYICQ7scH7sZ+M2rFopRoLh8t2bLSW6BbgrtLsahOIB5iyAVJf9GjWK3cYTDaMj4XdBpM1cA6pIS0Kv2w==", + "dev": true, + "dependencies": { + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.2.4", + "set-function-length": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/chalk": { + "version": "4.1.2", + "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz", + "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.1.0", + "supports-color": "^7.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": 
"https://github.com/chalk/chalk?sponsor=1" + } + }, + "node_modules/chalk/node_modules/supports-color": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz", + "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/chokidar": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz", + "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==", + "devOptional": true, + "dependencies": { + "anymatch": "~3.1.2", + "braces": "~3.0.2", + "glob-parent": "~5.1.2", + "is-binary-path": "~2.1.0", + "is-glob": "~4.0.1", + "normalize-path": "~3.0.0", + "readdirp": "~3.6.0" + }, + "engines": { + "node": ">= 8.10.0" + }, + "funding": { + "url": "https://paulmillr.com/funding/" + }, + "optionalDependencies": { + "fsevents": "~2.3.2" + } + }, + "node_modules/cipher-base": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/cipher-base/-/cipher-base-1.0.4.tgz", + "integrity": "sha512-Kkht5ye6ZGmwv40uUDZztayT2ThLQGfnj/T71N/XzeZeo3nf8foyW7zGTsPYkEya3m5f3cAypH+qe7YOrM1U2Q==", + "dev": true, + "dependencies": { + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/cliui": { + "version": "8.0.1", + "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", + "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", + "dev": true, + "dependencies": { + "string-width": "^4.2.0", + "strip-ansi": "^6.0.1", + "wrap-ansi": "^7.0.0" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/color-convert": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", + "integrity": 
"sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", + "dev": true, + "dependencies": { + "color-name": "~1.1.4" + }, + "engines": { + "node": ">=7.0.0" + } + }, + "node_modules/color-name": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", + "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", + "dev": true + }, + "node_modules/combined-stream": { + "version": "1.0.8", + "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz", + "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==", + "dependencies": { + "delayed-stream": "~1.0.0" + }, + "engines": { + "node": ">= 0.8" + } + }, + "node_modules/computeds": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/computeds/-/computeds-0.0.1.tgz", + "integrity": "sha512-7CEBgcMjVmitjYo5q8JTJVra6X5mQ20uTThdK+0kR7UEaDrAWEQcRiBtWJzga4eRpP6afNwwLsX2SET2JhVB1Q==", + "dev": true + }, + "node_modules/concurrently": { + "version": "8.2.2", + "resolved": "https://registry.npmjs.org/concurrently/-/concurrently-8.2.2.tgz", + "integrity": "sha512-1dP4gpXFhei8IOtlXRE/T/4H88ElHgTiUzh71YUmtjTEHMSRS2Z/fgOxHSxxusGHogsRfxNq1vyAwxSC+EVyDg==", + "dev": true, + "dependencies": { + "chalk": "^4.1.2", + "date-fns": "^2.30.0", + "lodash": "^4.17.21", + "rxjs": "^7.8.1", + "shell-quote": "^1.8.1", + "spawn-command": "0.0.2", + "supports-color": "^8.1.1", + "tree-kill": "^1.2.2", + "yargs": "^17.7.2" + }, + "bin": { + "conc": "dist/bin/concurrently.js", + "concurrently": "dist/bin/concurrently.js" + }, + "engines": { + "node": "^14.13.0 || >=16.0.0" + }, + "funding": { + "url": "https://github.com/open-cli-tools/concurrently?sponsor=1" + } + }, + "node_modules/concurrently/node_modules/date-fns": { + "version": "2.30.0", + "resolved": "https://registry.npmjs.org/date-fns/-/date-fns-2.30.0.tgz", 
+ "integrity": "sha512-fnULvOpxnC5/Vg3NCiWelDsLiUc9bRwAPs/+LfTLNvetFCtCTN+yQz15C/fs4AwX1R9K5GLtLfn8QW+dWisaAw==", + "dev": true, + "dependencies": { + "@babel/runtime": "^7.21.0" + }, + "engines": { + "node": ">=0.11" + }, + "funding": { + "type": "opencollective", + "url": "https://opencollective.com/date-fns" + } + }, + "node_modules/console-browserify": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/console-browserify/-/console-browserify-1.2.0.tgz", + "integrity": "sha512-ZMkYO/LkF17QvCPqM0gxw8yUzigAOZOSWSHg91FH6orS7vcEj5dVZTidN2fQ14yBSdg97RqhSNwLUXInd52OTA==", + "dev": true + }, + "node_modules/constants-browserify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/constants-browserify/-/constants-browserify-1.0.0.tgz", + "integrity": "sha512-xFxOwqIzR/e1k1gLiWEophSCMqXcwVHIH7akf7b/vxcUeGunlj3hvZaaqxwHsTgn+IndtkQJgSztIDWeumWJDQ==", + "dev": true + }, + "node_modules/core-util-is": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", + "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", + "dev": true + }, + "node_modules/create-ecdh": { + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/create-ecdh/-/create-ecdh-4.0.4.tgz", + "integrity": "sha512-mf+TCx8wWc9VpuxfP2ht0iSISLZnt0JgWlrOKZiNqyUZWnjIaCIVNQArMHnCZKfEYRg6IM7A+NeJoN8gf/Ws0A==", + "dev": true, + "dependencies": { + "bn.js": "^4.1.0", + "elliptic": "^6.5.3" + } + }, + "node_modules/create-ecdh/node_modules/bn.js": { + "version": "4.12.0", + "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-4.12.0.tgz", + "integrity": "sha512-c98Bf3tPniI+scsdk237ku1Dc3ujXQTSgyiPUDEOe7tRkhrqridvh8klBv0HCEso1OLOYcHuCv/cS6DNxKH+ZA==", + "dev": true + }, + "node_modules/create-hash": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz", + "integrity": 
"sha512-z00bCGNHDG8mHAkP7CtT1qVu+bFQUPjYq/4Iv3C3kWjTFV10zIjfSoeqXo9Asws8gwSHDGj/hl2u4OGIjapeCg==", + "dev": true, + "dependencies": { + "cipher-base": "^1.0.1", + "inherits": "^2.0.1", + "md5.js": "^1.3.4", + "ripemd160": "^2.0.1", + "sha.js": "^2.4.0" + } + }, + "node_modules/create-hmac": { + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/create-hmac/-/create-hmac-1.1.7.tgz", + "integrity": "sha512-MJG9liiZ+ogc4TzUwuvbER1JRdgvUFSB5+VR/g5h82fGaIRWMWddtKBHi7/sVhfjQZ6SehlyhvQYrcYkaUIpLg==", + "dev": true, + "dependencies": { + "cipher-base": "^1.0.3", + "create-hash": "^1.1.0", + "inherits": "^2.0.1", + "ripemd160": "^2.0.0", + "safe-buffer": "^5.0.1", + "sha.js": "^2.4.8" + } + }, + "node_modules/create-require": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", + "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==", + "dev": true + }, + "node_modules/crypto-browserify": { + "version": "3.12.0", + "resolved": "https://registry.npmjs.org/crypto-browserify/-/crypto-browserify-3.12.0.tgz", + "integrity": "sha512-fz4spIh+znjO2VjL+IdhEpRJ3YN6sMzITSBijk6FK2UvTqruSQW+/cCZTSNsMiZNvUeq0CqurF+dAbyiGOY6Wg==", + "dev": true, + "dependencies": { + "browserify-cipher": "^1.0.0", + "browserify-sign": "^4.0.0", + "create-ecdh": "^4.0.0", + "create-hash": "^1.1.0", + "create-hmac": "^1.1.0", + "diffie-hellman": "^5.0.0", + "inherits": "^2.0.1", + "pbkdf2": "^3.0.3", + "public-encrypt": "^4.0.0", + "randombytes": "^2.0.0", + "randomfill": "^1.0.3" + }, + "engines": { + "node": "*" + } + }, + "node_modules/csstype": { + "version": "3.1.3", + "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz", + "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==" + }, + "node_modules/de-indent": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/de-indent/-/de-indent-1.0.2.tgz", + 
"integrity": "sha512-e/1zu3xH5MQryN2zdVaF0OrdNLUbvWxzMbi+iNA6Bky7l1RoP8a2fIbRocyHclXt/arDrrR6lL3TqFD9pMQTsg==", + "dev": true + }, + "node_modules/debug": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "devOptional": true, + "dependencies": { + "ms": "2.1.2" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, + "node_modules/define-data-property": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", + "integrity": "sha512-rBMvIzlpA8v6E+SJZoo++HAYqsLrkg7MSfIinMPFhmkorw7X+dOXVJQs+QT69zGkzMyfDnIMN2Wid1+NbL3T+A==", + "dev": true, + "dependencies": { + "es-define-property": "^1.0.0", + "es-errors": "^1.3.0", + "gopd": "^1.0.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/define-properties": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/define-properties/-/define-properties-1.2.1.tgz", + "integrity": "sha512-8QmQKqEASLd5nx0U1B1okLElbUuuttJ/AnYmRXbbbGDWh6uS208EjD4Xqq/I9wK7u0v6O08XhTWnt5XtEbR6Dg==", + "dev": true, + "dependencies": { + "define-data-property": "^1.0.1", + "has-property-descriptors": "^1.0.0", + "object-keys": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/delayed-stream": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", + "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/des.js": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/des.js/-/des.js-1.1.0.tgz", + "integrity": 
"sha512-r17GxjhUCjSRy8aiJpr8/UadFIzMzJGexI3Nmz4ADi9LYSFx4gTBp80+NaX/YsXWWLhpZ7v/v/ubEc/bCNfKwg==", + "dev": true, + "dependencies": { + "inherits": "^2.0.1", + "minimalistic-assert": "^1.0.0" + } + }, + "node_modules/diffie-hellman": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/diffie-hellman/-/diffie-hellman-5.0.3.tgz", + "integrity": "sha512-kqag/Nl+f3GwyK25fhUMYj81BUOrZ9IuJsjIcDE5icNM9FJHAVm3VcUDxdLPoQtTuUylWm6ZIknYJwwaPxsUzg==", + "dev": true, + "dependencies": { + "bn.js": "^4.1.0", + "miller-rabin": "^4.0.0", + "randombytes": "^2.0.0" + } + }, + "node_modules/diffie-hellman/node_modules/bn.js": { + "version": "4.12.0", + "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-4.12.0.tgz", + "integrity": "sha512-c98Bf3tPniI+scsdk237ku1Dc3ujXQTSgyiPUDEOe7tRkhrqridvh8klBv0HCEso1OLOYcHuCv/cS6DNxKH+ZA==", + "dev": true + }, + "node_modules/domain-browser": { + "version": "4.23.0", + "resolved": "https://registry.npmjs.org/domain-browser/-/domain-browser-4.23.0.tgz", + "integrity": "sha512-ArzcM/II1wCCujdCNyQjXrAFwS4mrLh4C7DZWlaI8mdh7h3BfKdNd3bKXITfl2PT9FtfQqaGvhi1vPRQPimjGA==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://bevry.me/fund" + } + }, + "node_modules/elastic-tiny-client": { + "version": "0.1.4", + "resolved": "https://registry.npmjs.org/elastic-tiny-client/-/elastic-tiny-client-0.1.4.tgz", + "integrity": "sha512-bkDaZlXAaC5cdsUtW38BdgBYX1yADNKnI1zDR0E1mE2maCRml03HD+2AMtJCp1GJ5D1zViFpl9WYmkByDFxlYw==" + }, + "node_modules/elliptic": { + "version": "6.5.5", + "resolved": "https://registry.npmjs.org/elliptic/-/elliptic-6.5.5.tgz", + "integrity": "sha512-7EjbcmUm17NQFu4Pmgmq2olYMj8nwMnpcddByChSUjArp8F5DQWcIcpriwO4ZToLNAJig0yiyjswfyGNje/ixw==", + "dev": true, + "dependencies": { + "bn.js": "^4.11.9", + "brorand": "^1.1.0", + "hash.js": "^1.0.0", + "hmac-drbg": "^1.0.1", + "inherits": "^2.0.4", + "minimalistic-assert": "^1.0.1", + "minimalistic-crypto-utils": "^1.0.1" + } + }, + 
"node_modules/elliptic/node_modules/bn.js": { + "version": "4.12.0", + "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-4.12.0.tgz", + "integrity": "sha512-c98Bf3tPniI+scsdk237ku1Dc3ujXQTSgyiPUDEOe7tRkhrqridvh8klBv0HCEso1OLOYcHuCv/cS6DNxKH+ZA==", + "dev": true + }, + "node_modules/emoji-regex": { + "version": "8.0.0", + "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", + "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", + "dev": true + }, + "node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/es-define-property": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.0.tgz", + "integrity": "sha512-jxayLKShrEqqzJ0eumQbVhTYQM27CfT1T35+gCgDFoL82JLsXqTJ76zv6A0YLOgEnLUMvLzsDsGIrl8NFpT2gQ==", + "dev": true, + "dependencies": { + "get-intrinsic": "^1.2.4" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/es-errors": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz", + "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/esbuild": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz", + "integrity": "sha512-WdOOppmUNU+IbZ0PaDiTst80zjnrOkyJNHoKupIcVyU8Lvla3Ugx94VzkQ32Ijqd7UhHJy75gNWDMUekcrSJ6g==", + "devOptional": true, + "hasInstallScript": true, + "bin": { + "esbuild": "bin/esbuild" + }, + "engines": { + "node": ">=12" + }, + "optionalDependencies": { + "@esbuild/aix-ppc64": 
"0.20.2", + "@esbuild/android-arm": "0.20.2", + "@esbuild/android-arm64": "0.20.2", + "@esbuild/android-x64": "0.20.2", + "@esbuild/darwin-arm64": "0.20.2", + "@esbuild/darwin-x64": "0.20.2", + "@esbuild/freebsd-arm64": "0.20.2", + "@esbuild/freebsd-x64": "0.20.2", + "@esbuild/linux-arm": "0.20.2", + "@esbuild/linux-arm64": "0.20.2", + "@esbuild/linux-ia32": "0.20.2", + "@esbuild/linux-loong64": "0.20.2", + "@esbuild/linux-mips64el": "0.20.2", + "@esbuild/linux-ppc64": "0.20.2", + "@esbuild/linux-riscv64": "0.20.2", + "@esbuild/linux-s390x": "0.20.2", + "@esbuild/linux-x64": "0.20.2", + "@esbuild/netbsd-x64": "0.20.2", + "@esbuild/openbsd-x64": "0.20.2", + "@esbuild/sunos-x64": "0.20.2", + "@esbuild/win32-arm64": "0.20.2", + "@esbuild/win32-ia32": "0.20.2", + "@esbuild/win32-x64": "0.20.2" + } + }, + "node_modules/escalade": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.1.2.tgz", + "integrity": "sha512-ErCHMCae19vR8vQGe50xIsVomy19rg6gFu3+r3jkEO46suLMWBksvVyoGgQV+jOfl84ZSOSlmv6Gxa89PmTGmA==", + "dev": true, + "engines": { + "node": ">=6" + } + }, + "node_modules/estree-walker": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/estree-walker/-/estree-walker-2.0.2.tgz", + "integrity": "sha512-Rfkk/Mp/DL7JVje3u18FxFujQlTNR2q6QfMSMB7AvCBx91NGj/ba3kCfza0f6dVDbw7YlRf/nDrn7pQrCCyQ/w==" + }, + "node_modules/eventemitter3": { + "version": "4.0.7", + "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-4.0.7.tgz", + "integrity": "sha512-8guHBZCwKnFhYdHr2ysuRWErTwhoN2X8XELRlrRwpmfeY2jjuUN4taQMsULKUVo1K4DvZl+0pgfyoysHxvmvEw==" + }, + "node_modules/events": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/events/-/events-3.3.0.tgz", + "integrity": "sha512-mQw+2fkQbALzQ7V0MY0IqdnXNOeTtP4r0lN9z7AAawCXgqea7bDii20AYrIBrFd/Hx0M2Ocz6S111CaFkUcb0Q==", + "dev": true, + "engines": { + "node": ">=0.8.x" + } + }, + "node_modules/evp_bytestokey": { + "version": "1.0.3", + "resolved": 
"https://registry.npmjs.org/evp_bytestokey/-/evp_bytestokey-1.0.3.tgz", + "integrity": "sha512-/f2Go4TognH/KvCISP7OUsHn85hT9nUkxxA9BEWxFn+Oj9o8ZNLm/40hdlgSLyuOimsrTKLUMEorQexp/aPQeA==", + "dev": true, + "dependencies": { + "md5.js": "^1.3.4", + "safe-buffer": "^5.1.1" + } + }, + "node_modules/fast-glob": { + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-3.3.2.tgz", + "integrity": "sha512-oX2ruAFQwf/Orj8m737Y5adxDQO0LAB7/S5MnxCdTNDd4p6BsyIVsv9JQsATbTSq8KHRpLwIHbVlUNatxd+1Ow==", + "dev": true, + "dependencies": { + "@nodelib/fs.stat": "^2.0.2", + "@nodelib/fs.walk": "^1.2.3", + "glob-parent": "^5.1.2", + "merge2": "^1.3.0", + "micromatch": "^4.0.4" + }, + "engines": { + "node": ">=8.6.0" + } + }, + "node_modules/fastq": { + "version": "1.17.1", + "resolved": "https://registry.npmjs.org/fastq/-/fastq-1.17.1.tgz", + "integrity": "sha512-sRVD3lWVIXWg6By68ZN7vho9a1pQcN/WBFaAAsDDFzlJjvoGx0P8z7V1t72grFJfJhu3YPZBuu25f7Kaw2jN1w==", + "dev": true, + "dependencies": { + "reusify": "^1.0.4" + } + }, + "node_modules/fill-range": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz", + "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==", + "devOptional": true, + "dependencies": { + "to-regex-range": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/find-up": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", + "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", + "dev": true, + "dependencies": { + "locate-path": "^6.0.0", + "path-exists": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/follow-redirects": { + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + 
"integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", + "funding": [ + { + "type": "individual", + "url": "https://github.com/sponsors/RubenVerborgh" + } + ], + "engines": { + "node": ">=4.0" + }, + "peerDependenciesMeta": { + "debug": { + "optional": true + } + } + }, + "node_modules/for-each": { + "version": "0.3.3", + "resolved": "https://registry.npmjs.org/for-each/-/for-each-0.3.3.tgz", + "integrity": "sha512-jqYfLp7mo9vIyQf8ykW2v7A+2N4QjeCeI5+Dz9XraiO1ign81wjiH7Fb9vSOWvQfNtmSa4H2RoQTrrXivdUZmw==", + "dev": true, + "dependencies": { + "is-callable": "^1.1.3" + } + }, + "node_modules/form-data": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz", + "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==", + "dependencies": { + "asynckit": "^0.4.0", + "combined-stream": "^1.0.8", + "mime-types": "^2.1.12" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/fsevents": { + "version": "2.3.3", + "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz", + "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==", + "hasInstallScript": true, + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": "^8.16.0 || ^10.6.0 || >=11.0.0" + } + }, + "node_modules/function-bind": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz", + "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/get-caller-file": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", + "integrity": 
"sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", + "dev": true, + "engines": { + "node": "6.* || 8.* || >= 10.*" + } + }, + "node_modules/get-intrinsic": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.2.4.tgz", + "integrity": "sha512-5uYhsJH8VJBTv7oslg4BznJYhDoRI6waYCxMmCdnTrcCrHA/fCFKoTFz2JKKE0HdDFUF7/oQuhzumXJK7paBRQ==", + "dev": true, + "dependencies": { + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "has-proto": "^1.0.1", + "has-symbols": "^1.0.3", + "hasown": "^2.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/glob-parent": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz", + "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==", + "devOptional": true, + "dependencies": { + "is-glob": "^4.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/gopd": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.0.1.tgz", + "integrity": "sha512-d65bNlIadxvpb/A2abVdlqKqV563juRnZ1Wtk6s1sIR8uNsXR70xqIzVqxVf1eTqDunwT2MkczEeaezCKTZhwA==", + "dev": true, + "dependencies": { + "get-intrinsic": "^1.1.3" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-flag": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz", + "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/has-property-descriptors": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-property-descriptors/-/has-property-descriptors-1.0.2.tgz", + "integrity": 
"sha512-55JNKuIW+vq4Ke1BjOTjM2YctQIvCT7GFzHwmfZPGo5wnrgkid0YQtnAleFSqumZm4az3n2BS+erby5ipJdgrg==", + "dev": true, + "dependencies": { + "es-define-property": "^1.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-proto": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/has-proto/-/has-proto-1.0.3.tgz", + "integrity": "sha512-SJ1amZAJUiZS+PhsVLf5tGydlaVB8EdFpaSO4gmiUKUOxk8qzn5AIy4ZeJUmh22znIdk/uMAUT2pl3FxzVUH+Q==", + "dev": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-symbols": { + "version": "1.0.3", + "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.0.3.tgz", + "integrity": "sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==", + "dev": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/has-tostringtag": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz", + "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==", + "dev": true, + "dependencies": { + "has-symbols": "^1.0.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/hash-base": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/hash-base/-/hash-base-3.0.4.tgz", + "integrity": "sha512-EeeoJKjTyt868liAlVmcv2ZsUfGHlE3Q+BICOXcZiwN3osr5Q/zFGYmTJpoIzuaSTAwndFy+GqhEwlU4L3j4Ow==", + "dev": true, + "dependencies": { + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + }, + "engines": { + "node": ">=4" + } + }, + "node_modules/hash.js": { + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/hash.js/-/hash.js-1.1.7.tgz", + "integrity": 
"sha512-taOaskGt4z4SOANNseOviYDvjEJinIkRgmp7LbKP2YTTmVxWBl87s/uzK9r+44BclBSp2X7K1hqeNfz9JbBeXA==", + "dev": true, + "dependencies": { + "inherits": "^2.0.3", + "minimalistic-assert": "^1.0.1" + } + }, + "node_modules/hasown": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz", + "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==", + "dev": true, + "dependencies": { + "function-bind": "^1.1.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/he": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/he/-/he-1.2.0.tgz", + "integrity": "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw==", + "dev": true, + "bin": { + "he": "bin/he" + } + }, + "node_modules/hmac-drbg": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/hmac-drbg/-/hmac-drbg-1.0.1.tgz", + "integrity": "sha512-Tti3gMqLdZfhOQY1Mzf/AanLiqh1WTiJgEj26ZuYQ9fbkLomzGchCws4FyrSd4VkpBfiNhaE1On+lOz894jvXg==", + "dev": true, + "dependencies": { + "hash.js": "^1.0.3", + "minimalistic-assert": "^1.0.0", + "minimalistic-crypto-utils": "^1.0.1" + } + }, + "node_modules/http-proxy": { + "version": "1.18.1", + "resolved": "https://registry.npmjs.org/http-proxy/-/http-proxy-1.18.1.tgz", + "integrity": "sha512-7mz/721AbnJwIVbnaSv1Cz3Am0ZLT/UBwkC92VlxhXv/k/BBQfM2fXElQNC27BVGr0uwUpplYPQM9LnaBMR5NQ==", + "dependencies": { + "eventemitter3": "^4.0.0", + "follow-redirects": "^1.0.0", + "requires-port": "^1.0.0" + }, + "engines": { + "node": ">=8.0.0" + } + }, + "node_modules/https-browserify": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/https-browserify/-/https-browserify-1.0.0.tgz", + "integrity": "sha512-J+FkSdyD+0mA0N+81tMotaRMfSL9SGi+xpD3T6YApKsc3bGSXJlfXri3VyFOeYkfLRQisDk1W+jIFFKBeUBbBg==", + "dev": true + }, + "node_modules/ieee754": { + "version": "1.2.1", + "resolved": 
"https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", + "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, + "node_modules/immutable": { + "version": "4.3.5", + "resolved": "https://registry.npmjs.org/immutable/-/immutable-4.3.5.tgz", + "integrity": "sha512-8eabxkth9gZatlwl5TBuJnCsoTADlL6ftEr7A4qgdaTsPyreilDSnUk57SO+jfKcNtxPa22U5KK6DSeAYhpBJw==", + "devOptional": true + }, + "node_modules/inherits": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz", + "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==", + "dev": true + }, + "node_modules/is-arguments": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/is-arguments/-/is-arguments-1.1.1.tgz", + "integrity": "sha512-8Q7EARjzEnKpt/PCD7e1cgUS0a6X8u5tdSiMqXhojOdoV9TsMsiO+9VLC5vAmO8N7/GmXn7yjR8qnA6bVAEzfA==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.2", + "has-tostringtag": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-binary-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz", + "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==", + "devOptional": true, + "dependencies": { + "binary-extensions": "^2.0.0" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-callable": { + "version": "1.2.7", + "resolved": "https://registry.npmjs.org/is-callable/-/is-callable-1.2.7.tgz", + "integrity": 
"sha512-1BC0BVFhS/p0qtw6enp8e+8OD0UrK0oFLztSjNzhcKA3WDuJxxAPXzPuPtKkjEY9UUoEWlX/8fgKeu2S8i9JTA==", + "dev": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-core-module": { + "version": "2.13.1", + "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.13.1.tgz", + "integrity": "sha512-hHrIjvZsftOsvKSn2TRYl63zvxsgE0K+0mYMoH6gD4omR5IWB2KynivBQczo3+wF1cCkjzvptnI9Q0sPU66ilw==", + "dev": true, + "dependencies": { + "hasown": "^2.0.0" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-extglob": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", + "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==", + "devOptional": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-fullwidth-code-point": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", + "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/is-generator-function": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/is-generator-function/-/is-generator-function-1.0.10.tgz", + "integrity": "sha512-jsEjy9l3yiXEQ+PsXdmBwEPcOxaXWLspKdplFUVI9vq1iZgIekeC0L167qeu86czQaxed3q/Uzuw0swL0irL8A==", + "dev": true, + "dependencies": { + "has-tostringtag": "^1.0.0" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-glob": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz", + "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==", + "devOptional": true, + 
"dependencies": { + "is-extglob": "^2.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/is-nan": { + "version": "1.3.2", + "resolved": "https://registry.npmjs.org/is-nan/-/is-nan-1.3.2.tgz", + "integrity": "sha512-E+zBKpQ2t6MEo1VsonYmluk9NxGrbzpeeLC2xIViuO2EjU2xsXsBPwTr3Ykv9l08UYEVEdWeRZNouaZqF6RN0w==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.0", + "define-properties": "^1.1.3" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/is-number": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz", + "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==", + "devOptional": true, + "engines": { + "node": ">=0.12.0" + } + }, + "node_modules/is-typed-array": { + "version": "1.1.13", + "resolved": "https://registry.npmjs.org/is-typed-array/-/is-typed-array-1.1.13.tgz", + "integrity": "sha512-uZ25/bUAlUY5fR4OKT4rZQEBrzQWYV9ZJYGGsUmEJ6thodVJ1HX64ePQ6Z0qPWP+m+Uq6e9UugrE38jeYsDSMw==", + "dev": true, + "dependencies": { + "which-typed-array": "^1.1.14" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "dev": true + }, + "node_modules/isomorphic-timers-promises": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/isomorphic-timers-promises/-/isomorphic-timers-promises-1.0.1.tgz", + "integrity": "sha512-u4sej9B1LPSxTGKB/HiuzvEQnXH0ECYkSVQU39koSwmFAxhlEAFl9RdTvLv4TOTQUgBS5O3O5fwUxk6byBZ+IQ==", + "dev": true, + "engines": { + "node": ">=10" + } + }, + "node_modules/local-pkg": { + "version": "0.4.3", + "resolved": 
"https://registry.npmjs.org/local-pkg/-/local-pkg-0.4.3.tgz", + "integrity": "sha512-SFppqq5p42fe2qcZQqqEOiVRXl+WCP1MdT6k7BDEW1j++sp5fIY+/fdRQitvKgB5BrBcmrs5m/L0v2FrU5MY1g==", + "dev": true, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + } + }, + "node_modules/locate-path": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", + "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", + "dev": true, + "dependencies": { + "p-locate": "^5.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/lodash": { + "version": "4.17.21", + "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.21.tgz", + "integrity": "sha512-v2kDEe57lecTulaDIuNTPy3Ry4gLGJ6Z1O3vE1krgXZNrsQ+LFTGHVxVjcXPs17LhbZVGedAJv8XZ1tvj5FvSg==", + "dev": true + }, + "node_modules/lru-cache": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz", + "integrity": "sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==", + "dev": true, + "dependencies": { + "yallist": "^4.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/magic-string": { + "version": "0.30.10", + "resolved": "https://registry.npmjs.org/magic-string/-/magic-string-0.30.10.tgz", + "integrity": "sha512-iIRwTIf0QKV3UAnYK4PU8uiEc4SRh5jX0mwpIwETPpHdhVM4f53RSwS/vXvN1JhGX+Cs7B8qIq3d6AH49O5fAQ==", + "dependencies": { + "@jridgewell/sourcemap-codec": "^1.4.15" + } + }, + "node_modules/md5.js": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/md5.js/-/md5.js-1.3.5.tgz", + "integrity": "sha512-xitP+WxNPcTTOgnTJcrhM0xvdPepipPSf3I8EIpGKeFLjt3PlJLIDG3u8EX53ZIubkb+5U2+3rELYpEhHhzdkg==", + "dev": true, + "dependencies": { + "hash-base": "^3.0.0", + "inherits": "^2.0.1", + "safe-buffer": "^5.1.2" + } + 
}, + "node_modules/merge2": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/merge2/-/merge2-1.4.1.tgz", + "integrity": "sha512-8q7VEgMJW4J8tcfVPy8g09NcQwZdbwFEqhe/WZkoIzjn/3TGDwtOCYtXGxA3O8tPzpczCCDgv+P2P5y00ZJOOg==", + "dev": true, + "engines": { + "node": ">= 8" + } + }, + "node_modules/micromatch": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/micromatch/-/micromatch-4.0.5.tgz", + "integrity": "sha512-DMy+ERcEW2q8Z2Po+WNXuw3c5YaUSFjAO5GsJqfEl7UjvtIuFKO6ZrKvcItdy98dwFI2N1tg3zNIdKaQT+aNdA==", + "dev": true, + "dependencies": { + "braces": "^3.0.2", + "picomatch": "^2.3.1" + }, + "engines": { + "node": ">=8.6" + } + }, + "node_modules/miller-rabin": { + "version": "4.0.1", + "resolved": "https://registry.npmjs.org/miller-rabin/-/miller-rabin-4.0.1.tgz", + "integrity": "sha512-115fLhvZVqWwHPbClyntxEVfVDfl9DLLTuJvq3g2O/Oxi8AiNouAHvDSzHS0viUJc+V5vm3eq91Xwqn9dp4jRA==", + "dev": true, + "dependencies": { + "bn.js": "^4.0.0", + "brorand": "^1.0.1" + }, + "bin": { + "miller-rabin": "bin/miller-rabin" + } + }, + "node_modules/miller-rabin/node_modules/bn.js": { + "version": "4.12.0", + "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-4.12.0.tgz", + "integrity": "sha512-c98Bf3tPniI+scsdk237ku1Dc3ujXQTSgyiPUDEOe7tRkhrqridvh8klBv0HCEso1OLOYcHuCv/cS6DNxKH+ZA==", + "dev": true + }, + "node_modules/mime-db": { + "version": "1.52.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz", + "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==", + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/mime-types": { + "version": "2.1.35", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz", + "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==", + "dependencies": { + "mime-db": "1.52.0" + }, + "engines": { + "node": ">= 0.6" + } + }, + "node_modules/minimalistic-assert": { + 
"version": "1.0.1", + "resolved": "https://registry.npmjs.org/minimalistic-assert/-/minimalistic-assert-1.0.1.tgz", + "integrity": "sha512-UtJcAD4yEaGtjPezWuO9wC4nwUnVH/8/Im3yEHQP4b67cXlD/Qr9hdITCU1xDbSEXg2XKNaP8jsReV7vQd00/A==", + "dev": true + }, + "node_modules/minimalistic-crypto-utils": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/minimalistic-crypto-utils/-/minimalistic-crypto-utils-1.0.1.tgz", + "integrity": "sha512-JIYlbt6g8i5jKfJ3xz7rF0LXmv2TkDxBLUkiBeZ7bAx4GnnNMr8xFpGnOxn6GhTEHx3SjRrZEoU+j04prX1ktg==", + "dev": true + }, + "node_modules/minimatch": { + "version": "9.0.4", + "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-9.0.4.tgz", + "integrity": "sha512-KqWh+VchfxcMNRAJjj2tnsSJdNbHsVgnkBhTNrW7AjVo6OvLtxw8zfT9oLw1JSohlFzJ8jCoTgaoXvJ+kHt6fw==", + "dev": true, + "dependencies": { + "brace-expansion": "^2.0.1" + }, + "engines": { + "node": ">=16 || 14 >=14.17" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/ms": { + "version": "2.1.2", + "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz", + "integrity": "sha512-sGkPx+VjMtmA6MX27oA4FBFELFCZZ4S4XqeGOXCv68tT+jb3vk/RyaKWP0PTKyWtmLSM0b+adUTEvbs1PEaH2w==", + "devOptional": true + }, + "node_modules/muggle-string": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/muggle-string/-/muggle-string-0.4.1.tgz", + "integrity": "sha512-VNTrAak/KhO2i8dqqnqnAHOa3cYBwXEZe9h+D5h/1ZqFSTEFHdM65lR7RoIqq3tBBYavsOXV84NoHXZ0AkPyqQ==", + "dev": true + }, + "node_modules/nanoid": { + "version": "3.3.7", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.7.tgz", + "integrity": "sha512-eSRppjcPIatRIMC1U6UngP8XFcz8MQWGQdt1MTBQ7NaAmvXDfvNxbvWV3x2y6CdEUciCSsDHDQZbhYaB8QEo2g==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, + "node_modules/node-stdlib-browser": { + 
"version": "1.2.0", + "resolved": "https://registry.npmjs.org/node-stdlib-browser/-/node-stdlib-browser-1.2.0.tgz", + "integrity": "sha512-VSjFxUhRhkyed8AtLwSCkMrJRfQ3e2lGtG3sP6FEgaLKBBbxM/dLfjRe1+iLhjvyLFW3tBQ8+c0pcOtXGbAZJg==", + "dev": true, + "dependencies": { + "assert": "^2.0.0", + "browser-resolve": "^2.0.0", + "browserify-zlib": "^0.2.0", + "buffer": "^5.7.1", + "console-browserify": "^1.1.0", + "constants-browserify": "^1.0.0", + "create-require": "^1.1.1", + "crypto-browserify": "^3.11.0", + "domain-browser": "^4.22.0", + "events": "^3.0.0", + "https-browserify": "^1.0.0", + "isomorphic-timers-promises": "^1.0.1", + "os-browserify": "^0.3.0", + "path-browserify": "^1.0.1", + "pkg-dir": "^5.0.0", + "process": "^0.11.10", + "punycode": "^1.4.1", + "querystring-es3": "^0.2.1", + "readable-stream": "^3.6.0", + "stream-browserify": "^3.0.0", + "stream-http": "^3.2.0", + "string_decoder": "^1.0.0", + "timers-browserify": "^2.0.4", + "tty-browserify": "0.0.1", + "url": "^0.11.0", + "util": "^0.12.4", + "vm-browserify": "^1.0.1" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/normalize-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz", + "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==", + "devOptional": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/object-inspect": { + "version": "1.13.1", + "resolved": "https://registry.npmjs.org/object-inspect/-/object-inspect-1.13.1.tgz", + "integrity": "sha512-5qoj1RUiKOMsCCNLV1CBiPYE10sziTsnmNxkAI/rZhiD63CF7IqdFGC/XzjWjpSgLf0LxXX3bDFIh0E18f6UhQ==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/object-is": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/object-is/-/object-is-1.1.6.tgz", + "integrity": 
"sha512-F8cZ+KfGlSGi09lJT7/Nd6KJZ9ygtvYC0/UYYLI9nmQKLMnydpB9yvbv9K1uSkEu7FU9vYPmVwLg328tX+ot3Q==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.7", + "define-properties": "^1.2.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/object-keys": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.1.1.tgz", + "integrity": "sha512-NuAESUOUMrlIXOfHKzD6bpPu3tYt3xvjNdRIQ+FeT0lNb4K8WR70CaDxhuNguS2XG+GjkyMwOzsN5ZktImfhLA==", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/object.assign": { + "version": "4.1.5", + "resolved": "https://registry.npmjs.org/object.assign/-/object.assign-4.1.5.tgz", + "integrity": "sha512-byy+U7gp+FVwmyzKPYhW2h5l3crpmGsxl7X2s8y43IgxvG4g3QZ6CffDtsNQy1WsmZpQbO+ybo0AlW7TY6DcBQ==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.5", + "define-properties": "^1.2.1", + "has-symbols": "^1.0.3", + "object-keys": "^1.1.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/os-browserify": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/os-browserify/-/os-browserify-0.3.0.tgz", + "integrity": "sha512-gjcpUc3clBf9+210TRaDWbf+rZZZEshZ+DlXMRCeAjp0xhTrnQsKHypIy1J3d5hKdUzj69t708EHtU8P6bUn0A==", + "dev": true + }, + "node_modules/p-limit": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", + "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", + "dev": true, + "dependencies": { + "yocto-queue": "^0.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/p-locate": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", + "integrity": 
"sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", + "dev": true, + "dependencies": { + "p-limit": "^3.0.2" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/pako": { + "version": "1.0.11", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", + "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", + "dev": true + }, + "node_modules/parse-asn1": { + "version": "5.1.7", + "resolved": "https://registry.npmjs.org/parse-asn1/-/parse-asn1-5.1.7.tgz", + "integrity": "sha512-CTM5kuWR3sx9IFamcl5ErfPl6ea/N8IYwiJ+vpeB2g+1iknv7zBl5uPwbMbRVznRVbrNY6lGuDoE5b30grmbqg==", + "dev": true, + "dependencies": { + "asn1.js": "^4.10.1", + "browserify-aes": "^1.2.0", + "evp_bytestokey": "^1.0.3", + "hash-base": "~3.0", + "pbkdf2": "^3.1.2", + "safe-buffer": "^5.2.1" + }, + "engines": { + "node": ">= 0.10" + } + }, + "node_modules/path-browserify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz", + "integrity": "sha512-b7uo2UCUOYZcnF/3ID0lulOJi/bafxa1xPe7ZPsammBSpjSWQkjNxlt635YGS2MiR9GjvuXCtz2emr3jbsz98g==", + "dev": true + }, + "node_modules/path-exists": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/path-exists/-/path-exists-4.0.0.tgz", + "integrity": "sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w==", + "dev": true, + "engines": { + "node": ">=8" + } + }, + "node_modules/path-parse": { + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/path-parse/-/path-parse-1.0.7.tgz", + "integrity": "sha512-LDJzPVEEEPR+y48z93A0Ed0yXb8pAByGWo/k5YYdYgpY2/2EsOsksJrq7lOHxryrVOn1ejG6oAp8ahvOIQD8sw==", + "dev": true + }, + "node_modules/pbkdf2": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/pbkdf2/-/pbkdf2-3.1.2.tgz", + "integrity": 
"sha512-iuh7L6jA7JEGu2WxDwtQP1ddOpaJNC4KlDEFfdQajSGgGPNi4OyDc2R7QnbY2bR9QjBVGwgvTdNJZoE7RaxUMA==", + "dev": true, + "dependencies": { + "create-hash": "^1.1.2", + "create-hmac": "^1.1.4", + "ripemd160": "^2.0.1", + "safe-buffer": "^5.0.1", + "sha.js": "^2.4.8" + }, + "engines": { + "node": ">=0.12" + } + }, + "node_modules/picocolors": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.0.0.tgz", + "integrity": "sha512-1fygroTLlHu66zi26VoTDv8yRgm0Fccecssto+MhsZ0D/DGW2sm8E8AjW7NU5VVTRt5GxbeZ5qBuJr+HyLYkjQ==" + }, + "node_modules/picomatch": { + "version": "2.3.1", + "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz", + "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==", + "devOptional": true, + "engines": { + "node": ">=8.6" + }, + "funding": { + "url": "https://github.com/sponsors/jonschlinkert" + } + }, + "node_modules/pkg-dir": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/pkg-dir/-/pkg-dir-5.0.0.tgz", + "integrity": "sha512-NPE8TDbzl/3YQYY7CSS228s3g2ollTFnc+Qi3tqmqJp9Vg2ovUpixcJEo2HJScN2Ez+kEaal6y70c0ehqJBJeA==", + "dev": true, + "dependencies": { + "find-up": "^5.0.0" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/possible-typed-array-names": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/possible-typed-array-names/-/possible-typed-array-names-1.0.0.tgz", + "integrity": "sha512-d7Uw+eZoloe0EHDIYoe+bQ5WXnGMOpmiZFTuMWCwpjzzkL2nTjcKiAk4hh8TjnGye2TwWOk3UXucZ+3rbmBa8Q==", + "dev": true, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/postcss": { + "version": "8.4.38", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.38.tgz", + "integrity": "sha512-Wglpdk03BSfXkHoQa3b/oulrotAkwrlLDRSOb9D0bN86FdRyE9lppSp33aHNPgBa0JKCoB+drFLZkQoRRYae5A==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": 
"https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "nanoid": "^3.3.7", + "picocolors": "^1.0.0", + "source-map-js": "^1.2.0" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, + "node_modules/process": { + "version": "0.11.10", + "resolved": "https://registry.npmjs.org/process/-/process-0.11.10.tgz", + "integrity": "sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==", + "dev": true, + "engines": { + "node": ">= 0.6.0" + } + }, + "node_modules/process-nextick-args": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", + "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", + "dev": true + }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" + }, + "node_modules/public-encrypt": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/public-encrypt/-/public-encrypt-4.0.3.tgz", + "integrity": "sha512-zVpa8oKZSz5bTMTFClc1fQOnyyEzpl5ozpi1B5YcvBrdohMjH2rfsBtyXcuNuwjsDIXmBYlF2N5FlJYhR29t8Q==", + "dev": true, + "dependencies": { + "bn.js": "^4.1.0", + "browserify-rsa": "^4.0.0", + "create-hash": "^1.1.0", + "parse-asn1": "^5.0.0", + "randombytes": "^2.0.1", + "safe-buffer": "^5.1.2" + } + }, + "node_modules/public-encrypt/node_modules/bn.js": { + "version": "4.12.0", + "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-4.12.0.tgz", + "integrity": "sha512-c98Bf3tPniI+scsdk237ku1Dc3ujXQTSgyiPUDEOe7tRkhrqridvh8klBv0HCEso1OLOYcHuCv/cS6DNxKH+ZA==", + "dev": true + }, + "node_modules/punycode": { + "version": "1.4.1", + "resolved": 
"https://registry.npmjs.org/punycode/-/punycode-1.4.1.tgz", + "integrity": "sha512-jmYNElW7yvO7TV33CjSmvSiE2yco3bV2czu/OzDKdMNVZQWfxCblURLhf+47syQRBntjfLdd/H0egrzIG+oaFQ==", + "dev": true + }, + "node_modules/qs": { + "version": "6.12.1", + "resolved": "https://registry.npmjs.org/qs/-/qs-6.12.1.tgz", + "integrity": "sha512-zWmv4RSuB9r2mYQw3zxQuHWeU+42aKi1wWig/j4ele4ygELZ7PEO6MM7rim9oAQH2A5MWfsAVf/jPvTPgCbvUQ==", + "dev": true, + "dependencies": { + "side-channel": "^1.0.6" + }, + "engines": { + "node": ">=0.6" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/querystring-es3": { + "version": "0.2.1", + "resolved": "https://registry.npmjs.org/querystring-es3/-/querystring-es3-0.2.1.tgz", + "integrity": "sha512-773xhDQnZBMFobEiztv8LIl70ch5MSF/jUQVlhwFyBILqq96anmoctVIYz+ZRp0qbCKATTn6ev02M3r7Ga5vqA==", + "dev": true, + "engines": { + "node": ">=0.4.x" + } + }, + "node_modules/queue-microtask": { + "version": "1.2.3", + "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", + "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, + "node_modules/randombytes": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz", + "integrity": "sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ==", + "dev": true, + "dependencies": { + "safe-buffer": "^5.1.0" + } + }, + "node_modules/randomfill": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/randomfill/-/randomfill-1.0.4.tgz", + "integrity": "sha512-87lcbR8+MhcWcUiQ+9e+Rwx8MyR2P7qnt15ynUlbm3TU/fjbgz4GsvfSUDTemtCCtVCqb4ZcEFlyPNTh9bBTLw==", + "dev": 
true, + "dependencies": { + "randombytes": "^2.0.5", + "safe-buffer": "^5.1.0" + } + }, + "node_modules/readable-stream": { + "version": "3.6.2", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz", + "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==", + "dev": true, + "dependencies": { + "inherits": "^2.0.3", + "string_decoder": "^1.1.1", + "util-deprecate": "^1.0.1" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/readdirp": { + "version": "3.6.0", + "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz", + "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==", + "devOptional": true, + "dependencies": { + "picomatch": "^2.2.1" + }, + "engines": { + "node": ">=8.10.0" + } + }, + "node_modules/regenerator-runtime": { + "version": "0.14.1", + "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.14.1.tgz", + "integrity": "sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==", + "dev": true + }, + "node_modules/require-directory": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", + "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/requires-port": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz", + "integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ==" + }, + "node_modules/resolve": { + "version": "1.22.8", + "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz", + "integrity": "sha512-oKWePCxqpd6FlLvGV1VU0x7bkPmmCNolxzjMf4NczoDnQcIWrAF+cPtZn5i6n+RfD2d9i0tzpKnG6Yk168yIyw==", + "dev": true, + 
"dependencies": { + "is-core-module": "^2.13.0", + "path-parse": "^1.0.7", + "supports-preserve-symlinks-flag": "^1.0.0" + }, + "bin": { + "resolve": "bin/resolve" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/reusify": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/reusify/-/reusify-1.0.4.tgz", + "integrity": "sha512-U9nH88a3fc/ekCF1l0/UP1IosiuIjyTh7hBvXVMHYgVcfGvt897Xguj2UOLDeI5BG2m7/uwyaLVT6fbtCwTyzw==", + "dev": true, + "engines": { + "iojs": ">=1.0.0", + "node": ">=0.10.0" + } + }, + "node_modules/ripemd160": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/ripemd160/-/ripemd160-2.0.2.tgz", + "integrity": "sha512-ii4iagi25WusVoiC4B4lq7pbXfAp3D9v5CwfkY33vffw2+pkDjY1D8GaN7spsxvCSx8dkPqOZCEZyfxcmJG2IA==", + "dev": true, + "dependencies": { + "hash-base": "^3.0.0", + "inherits": "^2.0.1" + } + }, + "node_modules/roboto-fontface": { + "version": "0.10.0", + "resolved": "https://registry.npmjs.org/roboto-fontface/-/roboto-fontface-0.10.0.tgz", + "integrity": "sha512-OlwfYEgA2RdboZohpldlvJ1xngOins5d7ejqnIBWr9KaMxsnBqotpptRXTyfNRLnFpqzX6sTDt+X+a+6udnU8g==" + }, + "node_modules/rollup": { + "version": "4.16.3", + "resolved": "https://registry.npmjs.org/rollup/-/rollup-4.16.3.tgz", + "integrity": "sha512-Ygm4fFO4usWcAG3Ud36Lmif5nudoi0X6QPLC+kRgrRjulAbmFkaTawP7fTIkRDnCNSf/4IAQzXM1T8e691kRtw==", + "devOptional": true, + "dependencies": { + "@types/estree": "1.0.5" + }, + "bin": { + "rollup": "dist/bin/rollup" + }, + "engines": { + "node": ">=18.0.0", + "npm": ">=8.0.0" + }, + "optionalDependencies": { + "@rollup/rollup-android-arm-eabi": "4.16.3", + "@rollup/rollup-android-arm64": "4.16.3", + "@rollup/rollup-darwin-arm64": "4.16.3", + "@rollup/rollup-darwin-x64": "4.16.3", + "@rollup/rollup-linux-arm-gnueabihf": "4.16.3", + "@rollup/rollup-linux-arm-musleabihf": "4.16.3", + "@rollup/rollup-linux-arm64-gnu": "4.16.3", + "@rollup/rollup-linux-arm64-musl": "4.16.3", + 
"@rollup/rollup-linux-powerpc64le-gnu": "4.16.3", + "@rollup/rollup-linux-riscv64-gnu": "4.16.3", + "@rollup/rollup-linux-s390x-gnu": "4.16.3", + "@rollup/rollup-linux-x64-gnu": "4.16.3", + "@rollup/rollup-linux-x64-musl": "4.16.3", + "@rollup/rollup-win32-arm64-msvc": "4.16.3", + "@rollup/rollup-win32-ia32-msvc": "4.16.3", + "@rollup/rollup-win32-x64-msvc": "4.16.3", + "fsevents": "~2.3.2" + } + }, + "node_modules/run-parallel": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", + "integrity": "sha512-5l4VyZR86LZ/lDxZTR6jqL8AFE2S0IFLMP26AbjsLVADxHdhB/c0GUsH+y39UfCi3dzz8OlQuPmnaJOMoDHQBA==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "dependencies": { + "queue-microtask": "^1.2.2" + } + }, + "node_modules/rxjs": { + "version": "7.8.1", + "resolved": "https://registry.npmjs.org/rxjs/-/rxjs-7.8.1.tgz", + "integrity": "sha512-AA3TVj+0A2iuIoQkWEK/tqFjBq2j+6PO6Y0zJcvzLAFhEFIO3HL0vls9hWLncZbAAbK0mar7oZ4V079I/qPMxg==", + "dev": true, + "dependencies": { + "tslib": "^2.1.0" + } + }, + "node_modules/safe-buffer": { + "version": "5.2.1", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz", + "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==", + "dev": true, + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ] + }, + "node_modules/sass": { + "version": "1.75.0", + "resolved": "https://registry.npmjs.org/sass/-/sass-1.75.0.tgz", + "integrity": "sha512-ShMYi3WkrDWxExyxSZPst4/okE9ts46xZmJDSawJQrnte7M1V9fScVB+uNXOVKRBt0PggHOwoZcn8mYX4trnBw==", + 
"devOptional": true, + "dependencies": { + "chokidar": ">=3.0.0 <4.0.0", + "immutable": "^4.0.0", + "source-map-js": ">=0.6.2 <2.0.0" + }, + "bin": { + "sass": "sass.js" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/semver": { + "version": "7.6.0", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz", + "integrity": "sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg==", + "dev": true, + "dependencies": { + "lru-cache": "^6.0.0" + }, + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, + "node_modules/set-function-length": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", + "integrity": "sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==", + "dev": true, + "dependencies": { + "define-data-property": "^1.1.4", + "es-errors": "^1.3.0", + "function-bind": "^1.1.2", + "get-intrinsic": "^1.2.4", + "gopd": "^1.0.1", + "has-property-descriptors": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + } + }, + "node_modules/setimmediate": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", + "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==", + "dev": true + }, + "node_modules/sha.js": { + "version": "2.4.11", + "resolved": "https://registry.npmjs.org/sha.js/-/sha.js-2.4.11.tgz", + "integrity": "sha512-QMEp5B7cftE7APOjk5Y6xgrbWu+WkLVQwk8JNjZ8nKRciZaByEW6MubieAiToS7+dwvrjGhH8jRXz3MVd0AYqQ==", + "dev": true, + "dependencies": { + "inherits": "^2.0.1", + "safe-buffer": "^5.0.1" + }, + "bin": { + "sha.js": "bin.js" + } + }, + "node_modules/shell-quote": { + "version": "1.8.1", + "resolved": "https://registry.npmjs.org/shell-quote/-/shell-quote-1.8.1.tgz", + "integrity": 
"sha512-6j1W9l1iAs/4xYBI1SYOVZyFcCis9b4KCLQ8fgAGG07QvzaRLVVRQvAy85yNmmZSjYjg4MWh4gNvlPujU/5LpA==", + "dev": true, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/side-channel": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/side-channel/-/side-channel-1.0.6.tgz", + "integrity": "sha512-fDW/EZ6Q9RiO8eFG8Hj+7u/oW+XrPTIChwCOM2+th2A6OblDtYYIpve9m+KvI9Z4C9qSEXlaGR6bTEYHReuglA==", + "dev": true, + "dependencies": { + "call-bind": "^1.0.7", + "es-errors": "^1.3.0", + "get-intrinsic": "^1.2.4", + "object-inspect": "^1.13.1" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/source-map-js": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.0.tgz", + "integrity": "sha512-itJW8lvSA0TXEphiRoawsCksnlf8SyvmFzIhltqAHluXd88pkCd+cXJVHTDwdCr0IzwptSm035IHQktUu1QUMg==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/spawn-command": { + "version": "0.0.2", + "resolved": "https://registry.npmjs.org/spawn-command/-/spawn-command-0.0.2.tgz", + "integrity": "sha512-zC8zGoGkmc8J9ndvml8Xksr1Amk9qBujgbF0JAIWO7kXr43w0h/0GJNM/Vustixu+YE8N/MTrQ7N31FvHUACxQ==", + "dev": true + }, + "node_modules/stream-browserify": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/stream-browserify/-/stream-browserify-3.0.0.tgz", + "integrity": "sha512-H73RAHsVBapbim0tU2JwwOiXUj+fikfiaoYAKHF3VJfA0pe2BCzkhAHBlLG6REzE+2WNZcxOXjK7lkso+9euLA==", + "dev": true, + "dependencies": { + "inherits": "~2.0.4", + "readable-stream": "^3.5.0" + } + }, + "node_modules/stream-http": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/stream-http/-/stream-http-3.2.0.tgz", + "integrity": "sha512-Oq1bLqisTyK3TSCXpPbT4sdeYNdmyZJv1LxpEm2vu1ZhK89kSE5YXwZc3cWk0MagGaKriBh9mCFbVGtO+vY29A==", + "dev": true, + "dependencies": { + "builtin-status-codes": "^3.0.0", + "inherits": "^2.0.4", + "readable-stream": 
"^3.6.0", + "xtend": "^4.0.2" + } + }, + "node_modules/string_decoder": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz", + "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==", + "dev": true, + "dependencies": { + "safe-buffer": "~5.2.0" + } + }, + "node_modules/string-width": { + "version": "4.2.3", + "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", + "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", + "dev": true, + "dependencies": { + "emoji-regex": "^8.0.0", + "is-fullwidth-code-point": "^3.0.0", + "strip-ansi": "^6.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/strip-ansi": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", + "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", + "dev": true, + "dependencies": { + "ansi-regex": "^5.0.1" + }, + "engines": { + "node": ">=8" + } + }, + "node_modules/supports-color": { + "version": "8.1.1", + "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-8.1.1.tgz", + "integrity": "sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==", + "dev": true, + "dependencies": { + "has-flag": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/supports-color?sponsor=1" + } + }, + "node_modules/supports-preserve-symlinks-flag": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/supports-preserve-symlinks-flag/-/supports-preserve-symlinks-flag-1.0.0.tgz", + "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", + "dev": true, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": 
"https://github.com/sponsors/ljharb" + } + }, + "node_modules/timers-browserify": { + "version": "2.0.12", + "resolved": "https://registry.npmjs.org/timers-browserify/-/timers-browserify-2.0.12.tgz", + "integrity": "sha512-9phl76Cqm6FhSX9Xe1ZUAMLtm1BLkKj2Qd5ApyWkXzsMRaA7dgr81kf4wJmQf/hAvg8EEyJxDo3du/0KlhPiKQ==", + "dev": true, + "dependencies": { + "setimmediate": "^1.0.4" + }, + "engines": { + "node": ">=0.6.0" + } + }, + "node_modules/to-fast-properties": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/to-fast-properties/-/to-fast-properties-2.0.0.tgz", + "integrity": "sha512-/OaKK0xYrs3DmxRYqL/yDc+FxFUVYhDlXMhRmv3z915w2HF1tnN1omB354j8VUGO/hbRzyD6Y3sA7v7GS/ceog==", + "dev": true, + "engines": { + "node": ">=4" + } + }, + "node_modules/to-regex-range": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz", + "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==", + "devOptional": true, + "dependencies": { + "is-number": "^7.0.0" + }, + "engines": { + "node": ">=8.0" + } + }, + "node_modules/tree-kill": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz", + "integrity": "sha512-L0Orpi8qGpRG//Nd+H90vFB+3iHnue1zSSGmNOOCh1GLJ7rUKVwV2HvijphGQS2UmhUZewS9VgvxYIdgr+fG1A==", + "dev": true, + "bin": { + "tree-kill": "cli.js" + } + }, + "node_modules/tslib": { + "version": "2.6.2", + "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.2.tgz", + "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==", + "dev": true + }, + "node_modules/tty-browserify": { + "version": "0.0.1", + "resolved": "https://registry.npmjs.org/tty-browserify/-/tty-browserify-0.0.1.tgz", + "integrity": "sha512-C3TaO7K81YvjCgQH9Q1S3R3P3BtN3RIM8n+OvX4il1K1zgE8ZhI0op7kClgkxtutIE8hQrcrHBXvIheqKUUCxw==", + "dev": true + }, + "node_modules/typescript": { + "version": "5.4.5", 
+ "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.4.5.tgz", + "integrity": "sha512-vcI4UpRgg81oIRUFwR0WSIHKt11nJ7SAVlYNIu+QpqeyXP+gpQJy/Z4+F0aGxSE4MqwjyXvW/TzgkLAx2AGHwQ==", + "devOptional": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "devOptional": true + }, + "node_modules/unplugin": { + "version": "1.10.1", + "resolved": "https://registry.npmjs.org/unplugin/-/unplugin-1.10.1.tgz", + "integrity": "sha512-d6Mhq8RJeGA8UfKCu54Um4lFA0eSaRa3XxdAJg8tIdxbu1ubW0hBCZUL7yI2uGyYCRndvbK8FLHzqy2XKfeMsg==", + "dev": true, + "dependencies": { + "acorn": "^8.11.3", + "chokidar": "^3.6.0", + "webpack-sources": "^3.2.3", + "webpack-virtual-modules": "^0.6.1" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/unplugin-fonts": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/unplugin-fonts/-/unplugin-fonts-1.1.1.tgz", + "integrity": "sha512-/Aw/rL9D2aslGGM0vi+2R2aG508RSwawLnnBuo+JDSqYc4cHJO1R1phllhN6GysEhBp/6a4B6+vSFPVapWyAAw==", + "dev": true, + "dependencies": { + "fast-glob": "^3.2.12", + "unplugin": "^1.3.1" + }, + "peerDependencies": { + "@nuxt/kit": "^3.0.0", + "vite": "^2.0.0 || ^3.0.0 || ^4.0.0 || ^5.0.0" + }, + "peerDependenciesMeta": { + "@nuxt/kit": { + "optional": true + } + } + }, + "node_modules/unplugin-vue-components": { + "version": "0.26.0", + "resolved": "https://registry.npmjs.org/unplugin-vue-components/-/unplugin-vue-components-0.26.0.tgz", + "integrity": "sha512-s7IdPDlnOvPamjunVxw8kNgKNK8A5KM1YpK5j/p97jEKTjlPNrA0nZBiSfAKKlK1gWZuyWXlKL5dk3EDw874LQ==", + "dev": true, + "dependencies": { + "@antfu/utils": "^0.7.6", + "@rollup/pluginutils": "^5.0.4", + "chokidar": "^3.5.3", + "debug": "^4.3.4", 
+ "fast-glob": "^3.3.1", + "local-pkg": "^0.4.3", + "magic-string": "^0.30.3", + "minimatch": "^9.0.3", + "resolve": "^1.22.4", + "unplugin": "^1.4.0" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/antfu" + }, + "peerDependencies": { + "@babel/parser": "^7.15.8", + "@nuxt/kit": "^3.2.2", + "vue": "2 || 3" + }, + "peerDependenciesMeta": { + "@babel/parser": { + "optional": true + }, + "@nuxt/kit": { + "optional": true + } + } + }, + "node_modules/upath": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/upath/-/upath-2.0.1.tgz", + "integrity": "sha512-1uEe95xksV1O0CYKXo8vQvN1JEbtJp7lb7C5U9HMsIp6IVwntkH/oNUzyVNQSd4S1sYk2FpSSW44FqMc8qee5w==", + "devOptional": true, + "engines": { + "node": ">=4", + "yarn": "*" + } + }, + "node_modules/url": { + "version": "0.11.3", + "resolved": "https://registry.npmjs.org/url/-/url-0.11.3.tgz", + "integrity": "sha512-6hxOLGfZASQK/cijlZnZJTq8OXAkt/3YGfQX45vvMYXpZoo8NdWZcY73K108Jf759lS1Bv/8wXnHDTSz17dSRw==", + "dev": true, + "dependencies": { + "punycode": "^1.4.1", + "qs": "^6.11.2" + } + }, + "node_modules/util": { + "version": "0.12.5", + "resolved": "https://registry.npmjs.org/util/-/util-0.12.5.tgz", + "integrity": "sha512-kZf/K6hEIrWHI6XqOFUiiMa+79wE/D8Q+NCNAWclkyg3b4d2k7s0QGepNjiABc+aR3N1PAyHL7p6UcLY6LmrnA==", + "dev": true, + "dependencies": { + "inherits": "^2.0.3", + "is-arguments": "^1.0.4", + "is-generator-function": "^1.0.7", + "is-typed-array": "^1.1.3", + "which-typed-array": "^1.1.2" + } + }, + "node_modules/util-deprecate": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", + "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==", + "dev": true + }, + "node_modules/vite": { + "version": "5.2.10", + "resolved": "https://registry.npmjs.org/vite/-/vite-5.2.10.tgz", + "integrity": 
"sha512-PAzgUZbP7msvQvqdSD+ErD5qGnSFiGOoWmV5yAKUEI0kdhjbH6nMWVyZQC/hSc4aXwc0oJ9aEdIiF9Oje0JFCw==", + "devOptional": true, + "dependencies": { + "esbuild": "^0.20.1", + "postcss": "^8.4.38", + "rollup": "^4.13.0" + }, + "bin": { + "vite": "bin/vite.js" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "funding": { + "url": "https://github.com/vitejs/vite?sponsor=1" + }, + "optionalDependencies": { + "fsevents": "~2.3.3" + }, + "peerDependencies": { + "@types/node": "^18.0.0 || >=20.0.0", + "less": "*", + "lightningcss": "^1.21.0", + "sass": "*", + "stylus": "*", + "sugarss": "*", + "terser": "^5.4.0" + }, + "peerDependenciesMeta": { + "@types/node": { + "optional": true + }, + "less": { + "optional": true + }, + "lightningcss": { + "optional": true + }, + "sass": { + "optional": true + }, + "stylus": { + "optional": true + }, + "sugarss": { + "optional": true + }, + "terser": { + "optional": true + } + } + }, + "node_modules/vite-plugin-node-polyfills": { + "version": "0.21.0", + "resolved": "https://registry.npmjs.org/vite-plugin-node-polyfills/-/vite-plugin-node-polyfills-0.21.0.tgz", + "integrity": "sha512-Sk4DiKnmxN8E0vhgEhzLudfJQfaT8k4/gJ25xvUPG54KjLJ6HAmDKbr4rzDD/QWEY+Lwg80KE85fGYBQihEPQA==", + "dev": true, + "dependencies": { + "@rollup/plugin-inject": "^5.0.5", + "node-stdlib-browser": "^1.2.0" + }, + "funding": { + "url": "https://github.com/sponsors/davidmyersdev" + }, + "peerDependencies": { + "vite": "^2.0.0 || ^3.0.0 || ^4.0.0 || ^5.0.0" + } + }, + "node_modules/vite-plugin-vuetify": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/vite-plugin-vuetify/-/vite-plugin-vuetify-2.0.3.tgz", + "integrity": "sha512-HbYajgGgb/noaVKNRhnnXIiQZrNXfNIeanUGAwXgOxL6h/KULS40Uf51Kyz8hNmdegF+DwjgXXI/8J1PNS83xw==", + "devOptional": true, + "dependencies": { + "@vuetify/loader-shared": "^2.0.3", + "debug": "^4.3.3", + "upath": "^2.0.1" + }, + "engines": { + "node": "^18.0.0 || >=20.0.0" + }, + "peerDependencies": { + "vite": ">=5", + "vue": 
"^3.0.0", + "vuetify": "^3.0.0" + } + }, + "node_modules/vm-browserify": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/vm-browserify/-/vm-browserify-1.1.2.tgz", + "integrity": "sha512-2ham8XPWTONajOR0ohOKOHXkm3+gaBmGut3SRuu75xLd/RRaY6vqgh8NBYYk7+RW3u5AtzPQZG8F10LHkl0lAQ==", + "dev": true + }, + "node_modules/vue": { + "version": "3.4.24", + "resolved": "https://registry.npmjs.org/vue/-/vue-3.4.24.tgz", + "integrity": "sha512-NPdx7dLGyHmKHGRRU5bMRYVE+rechR+KDU5R2tSTNG36PuMwbfAJ+amEvOAw7BPfZp5sQulNELSLm5YUkau+Sg==", + "dependencies": { + "@vue/compiler-dom": "3.4.24", + "@vue/compiler-sfc": "3.4.24", + "@vue/runtime-dom": "3.4.24", + "@vue/server-renderer": "3.4.24", + "@vue/shared": "3.4.24" + }, + "peerDependencies": { + "typescript": "*" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, + "node_modules/vue-template-compiler": { + "version": "2.7.16", + "resolved": "https://registry.npmjs.org/vue-template-compiler/-/vue-template-compiler-2.7.16.tgz", + "integrity": "sha512-AYbUWAJHLGGQM7+cNTELw+KsOG9nl2CnSv467WobS5Cv9uk3wFcnr1Etsz2sEIHEZvw1U+o9mRlEO6QbZvUPGQ==", + "dev": true, + "dependencies": { + "de-indent": "^1.0.2", + "he": "^1.2.0" + } + }, + "node_modules/vue-tsc": { + "version": "2.0.14", + "resolved": "https://registry.npmjs.org/vue-tsc/-/vue-tsc-2.0.14.tgz", + "integrity": "sha512-DgAO3U1cnCHOUO7yB35LENbkapeRsBZ7Ugq5hGz/QOHny0+1VQN8eSwSBjYbjLVPfvfw6EY7sNPjbuHHUhckcg==", + "dev": true, + "dependencies": { + "@volar/typescript": "2.2.0-alpha.10", + "@vue/language-core": "2.0.14", + "semver": "^7.5.4" + }, + "bin": { + "vue-tsc": "bin/vue-tsc.js" + }, + "peerDependencies": { + "typescript": "*" + } + }, + "node_modules/vuetify": { + "version": "3.5.16", + "resolved": "https://registry.npmjs.org/vuetify/-/vuetify-3.5.16.tgz", + "integrity": "sha512-jyApfATreFMkgjvK0bL7ntZnr+p9TU73+4E3kX6fIvUitdAP9fltG7yj+v3k14HLqZRSNhTL1GhQ95DFx631zw==", + "engines": { + "node": "^12.20 || >=14.13" + }, + "funding": { + 
"type": "github", + "url": "https://github.com/sponsors/johnleider" + }, + "peerDependencies": { + "typescript": ">=4.7", + "vite-plugin-vuetify": ">=1.0.0", + "vue": "^3.3.0", + "vue-i18n": "^9.0.0", + "webpack-plugin-vuetify": ">=2.0.0" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + }, + "vite-plugin-vuetify": { + "optional": true + }, + "vue-i18n": { + "optional": true + }, + "webpack-plugin-vuetify": { + "optional": true + } + } + }, + "node_modules/webpack-sources": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/webpack-sources/-/webpack-sources-3.2.3.tgz", + "integrity": "sha512-/DyMEOrDgLKKIG0fmvtz+4dUX/3Ghozwgm6iPp8KRhvn+eQf9+Q7GWxVNMk3+uCPWfdXYC4ExGBckIXdFEfH1w==", + "dev": true, + "engines": { + "node": ">=10.13.0" + } + }, + "node_modules/webpack-virtual-modules": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/webpack-virtual-modules/-/webpack-virtual-modules-0.6.1.tgz", + "integrity": "sha512-poXpCylU7ExuvZK8z+On3kX+S8o/2dQ/SVYueKA0D4WEMXROXgY8Ez50/bQEUmvoSMMrWcrJqCHuhAbsiwg7Dg==", + "dev": true + }, + "node_modules/which-typed-array": { + "version": "1.1.15", + "resolved": "https://registry.npmjs.org/which-typed-array/-/which-typed-array-1.1.15.tgz", + "integrity": "sha512-oV0jmFtUky6CXfkqehVvBP/LSWJ2sy4vWMioiENyJLePrBO/yKyV9OyJySfAKosh+RYkIl5zJCNZ8/4JncrpdA==", + "dev": true, + "dependencies": { + "available-typed-arrays": "^1.0.7", + "call-bind": "^1.0.7", + "for-each": "^0.3.3", + "gopd": "^1.0.1", + "has-tostringtag": "^1.0.2" + }, + "engines": { + "node": ">= 0.4" + }, + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/wrap-ansi": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", + "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", + "dev": true, + "dependencies": { + "ansi-styles": "^4.0.0", + "string-width": "^4.1.0", + "strip-ansi": 
"^6.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/chalk/wrap-ansi?sponsor=1" + } + }, + "node_modules/xtend": { + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", + "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", + "dev": true, + "engines": { + "node": ">=0.4" + } + }, + "node_modules/y18n": { + "version": "5.0.8", + "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", + "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==", + "dev": true, + "engines": { + "node": ">=10" + } + }, + "node_modules/yallist": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz", + "integrity": "sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==", + "dev": true + }, + "node_modules/yargs": { + "version": "17.7.2", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", + "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", + "dev": true, + "dependencies": { + "cliui": "^8.0.1", + "escalade": "^3.1.1", + "get-caller-file": "^2.0.5", + "require-directory": "^2.1.1", + "string-width": "^4.2.3", + "y18n": "^5.0.5", + "yargs-parser": "^21.1.1" + }, + "engines": { + "node": ">=12" + } + }, + "node_modules/yargs-parser": { + "version": "21.1.1", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", + "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==", + "dev": true, + "engines": { + "node": ">=12" + } + }, + "node_modules/yocto-queue": { + "version": "0.1.0", + "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", + "integrity": 
"sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==", + "dev": true, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + } + } +} diff --git a/archive_query_log/dashboard/package.json b/archive_query_log/dashboard/package.json new file mode 100644 index 00000000..b3e70acc --- /dev/null +++ b/archive_query_log/dashboard/package.json @@ -0,0 +1,39 @@ +{ + "name": "aql-dashboard", + "version": "0.0.0", + "scripts": { + "dev": "concurrently --kill-others \"npm run dev-vite\" \"npm run dev-proxy\"", + "dev-vite": "vite", + "dev-proxy": "node proxy.js", + "build": "vue-tsc --noEmit && vite build", + "preview": "vite preview" + }, + "dependencies": { + "@mdi/font": "6.2.95", + "axios": "^1.7.2", + "elastic-tiny-client": "^0.1.4", + "http-proxy": "^1.18.1", + "roboto-fontface": "*", + "vue": "^3.4.21", + "vuetify": "^3.5.8" + }, + "devDependencies": { + "@babel/types": "^7.24.0", + "@types/node": "^20.11.25", + "@vitejs/plugin-vue": "^5.0.4", + "concurrently": "^8.2.2", + "sass": "^1.71.1", + "typescript": "^5.4.2", + "unplugin-fonts": "^1.1.1", + "unplugin-vue-components": "^0.26.0", + "vite": "^5.1.5", + "vite-plugin-node-polyfills": "^0.21.0", + "vite-plugin-vuetify": "^2.0.3", + "vue-tsc": "^2.0.6" + }, + "description": "Monitor and manage the crawling of the [Archive Query Log](https://github.com/webis-de/archive-query-log).", + "main": "index.js", + "keywords": [], + "author": "", + "license": "ISC" +} diff --git a/archive_query_log/dashboard/src/App.vue b/archive_query_log/dashboard/src/App.vue new file mode 100644 index 00000000..fe4bbe81 --- /dev/null +++ b/archive_query_log/dashboard/src/App.vue @@ -0,0 +1,81 @@ + + + + + + diff --git a/archive_query_log/dashboard/src/client.js b/archive_query_log/dashboard/src/client.js new file mode 100644 index 00000000..efc58beb --- /dev/null +++ b/archive_query_log/dashboard/src/client.js @@ -0,0 +1,13 @@ +import axios from 
'axios'; + +const BASE_URL = 'http://localhost:8000'; + +export async function fetchData(endpoint) { + try { + const response = await axios.get(`${BASE_URL}${endpoint}`); + return response.data; + } catch (error) { + console.error(`Error fetching data from ${endpoint}:`, error.message); + throw error; + } +} \ No newline at end of file diff --git a/archive_query_log/dashboard/src/components/Footer.vue b/archive_query_log/dashboard/src/components/Footer.vue new file mode 100644 index 00000000..1e0a321a --- /dev/null +++ b/archive_query_log/dashboard/src/components/Footer.vue @@ -0,0 +1,18 @@ + + + + \ No newline at end of file diff --git a/archive_query_log/dashboard/src/components/Header.vue b/archive_query_log/dashboard/src/components/Header.vue new file mode 100644 index 00000000..a309f01c --- /dev/null +++ b/archive_query_log/dashboard/src/components/Header.vue @@ -0,0 +1,32 @@ + + + + \ No newline at end of file diff --git a/archive_query_log/dashboard/src/components/ProgressTable.vue b/archive_query_log/dashboard/src/components/ProgressTable.vue new file mode 100644 index 00000000..6273cee4 --- /dev/null +++ b/archive_query_log/dashboard/src/components/ProgressTable.vue @@ -0,0 +1,27 @@ + + + + + + \ No newline at end of file diff --git a/archive_query_log/dashboard/src/components/StatisticsTable.vue b/archive_query_log/dashboard/src/components/StatisticsTable.vue new file mode 100644 index 00000000..081ed534 --- /dev/null +++ b/archive_query_log/dashboard/src/components/StatisticsTable.vue @@ -0,0 +1,26 @@ + + + + + \ No newline at end of file diff --git a/archive_query_log/dashboard/src/main.ts b/archive_query_log/dashboard/src/main.ts new file mode 100644 index 00000000..79170d31 --- /dev/null +++ b/archive_query_log/dashboard/src/main.ts @@ -0,0 +1,20 @@ +/** + * main.ts + * + * Bootstraps Vuetify and other plugins then mounts the App` + */ + +// Plugins +import { registerPlugins } from '@/plugins' + +// Components +import App from './App.vue' + +// 
Composables +import { createApp } from 'vue' + +const app = createApp(App) + +registerPlugins(app) + +app.mount('#app') \ No newline at end of file diff --git a/archive_query_log/dashboard/src/plugins/README.md b/archive_query_log/dashboard/src/plugins/README.md new file mode 100644 index 00000000..62201c7c --- /dev/null +++ b/archive_query_log/dashboard/src/plugins/README.md @@ -0,0 +1,3 @@ +# Plugins + +Plugins are a way to extend the functionality of your Vue application. Use this folder for registering plugins that you want to use globally. diff --git a/archive_query_log/dashboard/src/plugins/index.ts b/archive_query_log/dashboard/src/plugins/index.ts new file mode 100644 index 00000000..818042a9 --- /dev/null +++ b/archive_query_log/dashboard/src/plugins/index.ts @@ -0,0 +1,15 @@ +/** + * plugins/index.ts + * + * Automatically included in `./src/main.ts` + */ + +// Plugins +import vuetify from './vuetify' + +// Types +import type { App } from 'vue' + +export function registerPlugins (app: App) { + app.use(vuetify) +} diff --git a/archive_query_log/dashboard/src/plugins/vuetify.ts b/archive_query_log/dashboard/src/plugins/vuetify.ts new file mode 100644 index 00000000..46ece964 --- /dev/null +++ b/archive_query_log/dashboard/src/plugins/vuetify.ts @@ -0,0 +1,19 @@ +/** + * plugins/vuetify.ts + * + * Framework documentation: https://vuetifyjs.com` + */ + +// Styles +import '@mdi/font/css/materialdesignicons.css' +import 'vuetify/styles' + +// Composables +import { createVuetify } from 'vuetify' + +// https://vuetifyjs.com/en/introduction/why-vuetify/#feature-guides +export default createVuetify({ + theme: { + defaultTheme: 'light', + }, +}) diff --git a/archive_query_log/dashboard/src/vite-env.d.ts b/archive_query_log/dashboard/src/vite-env.d.ts new file mode 100644 index 00000000..323c78a6 --- /dev/null +++ b/archive_query_log/dashboard/src/vite-env.d.ts @@ -0,0 +1,7 @@ +/// + +declare module '*.vue' { + import type { DefineComponent } from 'vue' + const 
component: DefineComponent<{}, {}, any> + export default component +} diff --git a/archive_query_log/dashboard/tsconfig.json b/archive_query_log/dashboard/tsconfig.json new file mode 100644 index 00000000..129f9bfe --- /dev/null +++ b/archive_query_log/dashboard/tsconfig.json @@ -0,0 +1,32 @@ +{ + "compilerOptions": { + "target": "ESNext", + "jsx": "preserve", + "lib": ["DOM", "ESNext"], + "baseUrl": ".", + "module": "ESNext", + "moduleResolution": "bundler", + "paths": { + "@/*": ["src/*"] + }, + "resolveJsonModule": true, + "types": [ + "vite/client" + ], + "allowJs": true, + "strict": true, + "strictNullChecks": true, + "noUnusedLocals": true, + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "isolatedModules": true, + "skipLibCheck": true + }, + "include": [ + "src/**/*.ts", + "src/**/*.tsx", + "src/**/*.vue" + ], + "exclude": ["dist", "node_modules", "cypress"], + "references": [{ "path": "./tsconfig.node.json" }] +} diff --git a/archive_query_log/dashboard/tsconfig.node.json b/archive_query_log/dashboard/tsconfig.node.json new file mode 100644 index 00000000..348f346c --- /dev/null +++ b/archive_query_log/dashboard/tsconfig.node.json @@ -0,0 +1,9 @@ +{ + "compilerOptions": { + "composite": true, + "module": "ESNext", + "moduleResolution": "bundler", + "allowSyntheticDefaultImports": true + }, + "include": ["vite.config.mts"] +} diff --git a/archive_query_log/dashboard/vite.config.mts b/archive_query_log/dashboard/vite.config.mts new file mode 100644 index 00000000..ad8f6c4c --- /dev/null +++ b/archive_query_log/dashboard/vite.config.mts @@ -0,0 +1,42 @@ +// Plugins +import Components from "unplugin-vue-components/vite"; +import Vue from "@vitejs/plugin-vue"; +import Vuetify, { transformAssetUrls } from "vite-plugin-vuetify"; +import ViteFonts from "unplugin-fonts/vite"; +import { nodePolyfills } from "vite-plugin-node-polyfills"; +// Utilities +import { defineConfig } from "vite"; +import {
fileURLToPath, URL } from "node:url"; + +// https://vitejs.dev/config/ +export default defineConfig({ + plugins: [ + Vue({ + template: { transformAssetUrls }, + }), + // https://github.com/vuetifyjs/vuetify-loader/tree/master/packages/vite-plugin#readme + Vuetify(), + Components(), + ViteFonts({ + google: { + families: [ + { + name: "Roboto", + styles: "wght@100;300;400;500;700;900", + }, + ], + }, + }), + nodePolyfills(), + ], + define: { "process.env": {} }, + resolve: { + alias: { + "@": fileURLToPath(new URL("./src", import.meta.url)), + }, + extensions: [".js", ".json", ".jsx", ".mjs", ".ts", ".tsx", ".vue"], + }, + server: { + port: 3000, + }, +}); diff --git a/archive_query_log/download/__init__.py b/archive_query_log/downloaders/__init__.py similarity index 100% rename from archive_query_log/download/__init__.py rename to archive_query_log/downloaders/__init__.py diff --git a/archive_query_log/downloaders/warc.py b/archive_query_log/downloaders/warc.py new file mode 100644 index 00000000..d971585a --- /dev/null +++ b/archive_query_log/downloaders/warc.py @@ -0,0 +1,301 @@ +from datetime import datetime +from itertools import chain +from typing import Iterable, Iterator, TypeVar, Generic, Type, Callable +from uuid import uuid5 +from warnings import warn + +from click import echo +from elasticsearch_dsl import Search +from elasticsearch_dsl.function import RandomScore +from elasticsearch_dsl.query import Exists, FunctionScore, Term, RankFeature +from requests import ConnectionError as RequestsConnectionError +from tqdm.auto import tqdm +from warc_s3 import WarcS3Record +from warcio.recordloader import ArcWarcRecord +from web_archive_api.cdx import CdxApi, CdxMatchType, CdxCapture +from web_archive_api.memento import MementoApi + +from archive_query_log import __version__ as app_version +from archive_query_log.config import Config +from archive_query_log.namespaces import NAMESPACE_WARC_DOWNLOADER +from archive_query_log.orm import Serp, InnerDownloader, 
WarcLocation, Result +from archive_query_log.utils.es import safe_iter_scan, update_action +from archive_query_log.utils.time import utc_now + +_T = TypeVar("_T") + + +class _WrapperArcWarcRecord(ArcWarcRecord, Generic[_T]): + wrapped: _T + + def __init__(self, wrapped: _T, record: ArcWarcRecord): + super().__init__( + record.format, + record.rec_type, + record.rec_headers, + record.raw_stream, + record.http_headers, + record.content_type, + record.length, + payload_length=record.payload_length, + digest_checker=record.digest_checker, + ) + self.wrapped = wrapped + + +def _unwrap( + warc_record: WarcS3Record, + wrapper_type: Type[_WrapperArcWarcRecord[_T]], +) -> tuple[_T, WarcLocation]: + record: ArcWarcRecord = warc_record.record + if not isinstance(record, wrapper_type): + raise TypeError(f"Expected {wrapper_type}, got {type(record)}.") + + location = WarcLocation( + file=warc_record.location.key, + offset=warc_record.location.offset, + length=warc_record.location.length, + ) + return record.wrapped, location + + +class _SerpArcWarcRecord(_WrapperArcWarcRecord[Serp]): + pass + + +def _download_serp_warc( + config: Config, + serp: Serp, +) -> Iterable[_SerpArcWarcRecord]: + if serp.capture.status_code != 200: + return + memento_api = MementoApi( + api_url=serp.archive.memento_api_url, + session=config.http.session, + ) + try: + records = memento_api.load_url_warc( + url=serp.capture.url, + timestamp=serp.capture.timestamp, + raw=True, + ) + except RequestsConnectionError: + warn(RuntimeWarning( + f"Connection error while downloading WARC " + f"for capture URL {serp.capture.url} at {serp.capture.timestamp}." 
+ )) + return + for record in records: + yield _SerpArcWarcRecord(serp, record) + + +def download_serps_warc(config: Config) -> None: + changed_serps_search: Search = ( + Serp.search(using=config.es.client) + .filter( + Term(capture__status_code=200) & + ~Term(warc_downloader__should_download=False) + ) + .query( + RankFeature(field="archive.priority", saturation={}) | + RankFeature(field="provider.priority", saturation={}) | + FunctionScore(functions=[RandomScore()]) + ) + ) + num_changed_serps = changed_serps_search.count() + + if num_changed_serps <= 0: + echo("No new/changed SERPs.") + return + + changed_serps: Iterable[Serp] = ( + changed_serps_search + # Downloading WARCs is very slow, so we keep track + # of the Elasticsearch query for a full day, assuming that + # 1000 WARCs can be downloaded in 24h. + .params(scroll="24h") + .scan() + ) + changed_serps = safe_iter_scan(changed_serps) + # noinspection PyTypeChecker + changed_serps = tqdm(changed_serps, total=num_changed_serps, + desc="Downloading WARCs", unit="SERP") + + # Download from Memento API. + serp_records = chain.from_iterable( + _download_serp_warc(config, serp) + for serp in changed_serps + ) + + # Write to S3. 
+ stored_records: Iterator[WarcS3Record] = ( + config.s3.warc_store.write(serp_records)) + stored_serps = ( + _unwrap(record, _SerpArcWarcRecord) + for record in stored_records + ) + + downloader_id_components = ( + config.s3.endpoint_url, + config.s3.bucket_name, + app_version, + ) + downloader_id = str(uuid5( + NAMESPACE_WARC_DOWNLOADER, + ":".join(downloader_id_components), + )) + actions = ( + update_action( + serp, + warc_location=location, + warc_downloader=InnerDownloader( + id=downloader_id, + should_download=False, + last_downloaded=utc_now(), + ), + ) + for serp, location in stored_serps + ) + config.es.bulk(actions) + + +class _ResultArcWarcRecord(_WrapperArcWarcRecord[Result]): + pass + + +def _capture_timestamp_distance( + timestamp: datetime) -> Callable[[CdxCapture], float]: + def _distance(capture: CdxCapture) -> float: + return abs(timestamp - capture.timestamp).total_seconds() + + return _distance + + +def _download_result_warc( + config: Config, + result: Result, +) -> Iterator[_ResultArcWarcRecord]: + if result.snippet.url is None: + return + + cdx_api = CdxApi( + api_url=result.archive.cdx_api_url, + session=config.http.session, + ) + memento_api = MementoApi( + api_url=result.archive.memento_api_url, + session=config.http.session, + ) + + capture_timestamp = result.capture.timestamp + nearest_result_capture_before_serp: CdxCapture | None = min( + cdx_api.iter_captures( + result.snippet.url, + match_type=CdxMatchType.EXACT, + to_timestamp=capture_timestamp, + ), + key=_capture_timestamp_distance(capture_timestamp), + default=None, + ) + nearest_result_capture_after_serp: CdxCapture | None = min( + cdx_api.iter_captures( + result.snippet.url, + match_type=CdxMatchType.EXACT, + from_timestamp=capture_timestamp, + ), + key=_capture_timestamp_distance(capture_timestamp), + default=None, + ) + if nearest_result_capture_before_serp is None: + result.update( + using=config.es.client, + warc_before_serp_downloader=InnerDownloader( + 
should_download=False, + last_downloaded=utc_now(), + ).to_dict(), + ) + else: + records = memento_api.load_capture_warc( + capture=nearest_result_capture_before_serp, + raw=True, + ) + for record in records: + yield _ResultArcWarcRecord(result, record) + if nearest_result_capture_after_serp is None: + result.update( + using=config.es.client, + warc_after_serp_downloader=InnerDownloader( + should_download=False, + last_downloaded=utc_now(), + ).to_dict(), + ) + else: + records = memento_api.load_capture_warc( + capture=nearest_result_capture_after_serp, + raw=True, + ) + for record in records: + yield _ResultArcWarcRecord(result, record) + + +def download_results_warc(config: Config) -> None: + changed_results_search: Search = ( + Result.search(using=config.es.client) + .filter( + Exists(field="snippet.url") & + ~Term(should_fetch_captures=False) + ) + .query( + RankFeature(field="archive.priority", saturation={}) | + RankFeature(field="provider.priority", saturation={}) | + FunctionScore(functions=[RandomScore()]) + ) + ) + num_changed_results = changed_results_search.count() + + if num_changed_results <= 0: + echo("No new/changed results.") + return + + changed_results: Iterable[Result] = changed_results_search.scan() + changed_results = safe_iter_scan(changed_results) + # noinspection PyTypeChecker + changed_results = tqdm(changed_results, total=num_changed_results, + desc="Downloading WARCs", unit="result") + + # Download from Memento API. + result_records = chain.from_iterable( + _download_result_warc(config, result) + for result in changed_results + ) + + # Write to S3. 
+ stored_records: Iterator[WarcS3Record] = ( + config.s3.warc_store.write(result_records)) + stored_results = ( + _unwrap(record, _ResultArcWarcRecord) + for record in stored_records + ) + + downloader_id_components = ( + config.s3.endpoint_url, + config.s3.bucket_name, + app_version, + ) + downloader_id = str(uuid5( + NAMESPACE_WARC_DOWNLOADER, + ":".join(downloader_id_components), + )) + actions = ( + update_action( + result, + warc_location=location, + warc_downloader=InnerDownloader( + id=downloader_id, + should_download=False, + last_downloaded=utc_now(), + ), + ) + for result, location in stored_results + ) + config.es.bulk(actions) diff --git a/archive_query_log/queries/__init__.py b/archive_query_log/imports/__init__.py similarity index 100% rename from archive_query_log/queries/__init__.py rename to archive_query_log/imports/__init__.py diff --git a/archive_query_log/imports/aql22.py b/archive_query_log/imports/aql22.py new file mode 100644 index 00000000..f72ea369 --- /dev/null +++ b/archive_query_log/imports/aql22.py @@ -0,0 +1,227 @@ +from datetime import datetime, timezone +from itertools import chain +from os.path import getmtime +from pathlib import Path +from typing import Iterable, Iterator, NamedTuple +from urllib.parse import unquote +from uuid import uuid5 + +from click import echo +from elasticsearch_dsl.query import Term +from tqdm.auto import tqdm + +from archive_query_log.config import Config +from archive_query_log.legacy.model import ArchivedUrl +from archive_query_log.legacy.urls.iterable import ArchivedUrls +from archive_query_log.namespaces import NAMESPACE_CAPTURE +from archive_query_log.orm import Capture, Archive, Provider, \ + InnerProvider, InnerArchive +from archive_query_log.utils.time import CET, UTC + + +class _ImportablePath(NamedTuple): + path: Path + archive: Archive + provider: Provider + domain: str + url_path_prefix: str + + +def _iter_captures( + config: Config, + importable_path: _ImportablePath, + last_modified: 
datetime, + archived_urls: Iterable[ArchivedUrl], + check_memento: bool = True, +) -> Iterator[Capture]: + for archived_url in archived_urls: + url = archived_url.url + timestamp = datetime.fromtimestamp( + archived_url.timestamp, + tz=timezone.utc, + ) + # Bug fix because the AQL-22 data is in CET, but the timestamps are + # not marked as such. + timestamp = timestamp.astimezone(CET) + timestamp = timestamp.replace(tzinfo=UTC) + + memento_url = ( + f"{importable_path.archive.memento_api_url}/" + f"{timestamp.astimezone(UTC).strftime('%Y%m%d%H%M%S')}/" + f"{url}") + if check_memento: + response = config.http.session_no_retry.head( + memento_url, + allow_redirects=False, + ) + if response.status_code != 200: + continue + + capture_id_components = ( + importable_path.archive.cdx_api_url, + url, + timestamp.astimezone(UTC).strftime("%Y%m%d%H%M%S"), + ) + capture_id = str(uuid5( + NAMESPACE_CAPTURE, + ":".join(capture_id_components), + )) + yield Capture( + id=capture_id, + last_modified=last_modified.replace(microsecond=0), + archive=InnerArchive( + id=importable_path.archive.id, + cdx_api_url=importable_path.archive.cdx_api_url, + memento_api_url=importable_path.archive.memento_api_url, + ), + provider=InnerProvider( + id=importable_path.provider.id, + domain=importable_path.domain, + url_path_prefix=importable_path.url_path_prefix, + ), + url=url, + timestamp=timestamp.astimezone(UTC), + url_query_parser=InnerProvider( + should_parse=True, + ), + ) + + +def _import_captures_path( + config: Config, + importable_path: _ImportablePath, + check_memento: bool = True, +) -> None: + echo(f"Importing captures from {importable_path.path} to " + f"archive {importable_path.archive.id} and " + f"provider {importable_path.provider.id}.") + + json_paths = list(importable_path.path.glob("*.jsonl.gz")) + oldest_modification_time = min( + datetime.fromtimestamp(getmtime(path)) + for path in json_paths + ).astimezone(UTC) + echo(f"Found {len(json_paths)} JSONL files " + f"(oldest 
from {oldest_modification_time.strftime('%c')}).") + + urls_iterators_list = [ArchivedUrls(path) for path in json_paths] + urls_iterators: Iterable[ArchivedUrls] = urls_iterators_list + if len(urls_iterators_list) > 50: + # noinspection PyTypeChecker + urls_iterators = tqdm( + urls_iterators, + desc="Get capture count", + unit="file", + ) + total_count = sum(len(urls) for urls in urls_iterators) + echo(f"Found {total_count} captures.") + + archived_urls: Iterable[ArchivedUrl] = chain.from_iterable( + urls_iterators_list) + # noinspection PyTypeChecker + archived_urls = tqdm( + archived_urls, + total=total_count, + desc="Importing captures", + unit="capture", + ) + captures_iter = _iter_captures( + config=config, + importable_path=importable_path, + last_modified=oldest_modification_time, + archived_urls=archived_urls, + check_memento=check_memento, + ) + actions = ( + { + **capture.to_dict(include_meta=True), + "_op_type": "create", + } + for capture in captures_iter + ) + config.es.bulk(actions) + + +def import_captures( + config: Config, + data_dir_path: Path, + check_memento: bool, + search_provider: str | None, + search_provider_index: int | None, +) -> None: + echo(f"Importing AQL-22 captures from: {data_dir_path}") + + archive_response = ( + Archive.search(using=config.es.client) + .query( + Term(cdx_api_url="https://web.archive.org/cdx/search/cdx") + ) + .execute() + ) + if archive_response.hits.total.value < 1: + echo("No AQL-22 archive found. 
Add an archive with the " + "CDX API URL 'https://web.archive.org/cdx/search/cdx' " + "first.") + exit(1) + + archive: Archive = archive_response.hits[0] + echo(f"Importing captures for archive {archive.id}: {archive.name}") + + archived_urls_path = data_dir_path / "archived-urls" + if not archived_urls_path.exists(): + echo("No captures found.") + return + + search_provider_paths = sorted(archived_urls_path.glob("*")) + if search_provider is not None: + search_provider_paths = [ + path for path in search_provider_paths + if path.name == search_provider + ] + elif search_provider_index is not None: + search_provider_paths = [search_provider_paths[search_provider_index]] + if len(search_provider_paths) == 0: + echo("No captures found.") + return + prefix_paths_list: list[Path] = list(chain.from_iterable(( + search_provider_path.glob("*") + for search_provider_path in search_provider_paths + ))) + + importable_paths = [] + prefix_paths: Iterable[Path] = prefix_paths_list + # noinspection PyTypeChecker + prefix_paths = tqdm( + prefix_paths, + desc="Checking URL prefixes", + unit="prefix", + ) + for prefix_path in prefix_paths: + prefix = unquote(prefix_path.name) + domain = prefix.split("/", maxsplit=1)[0] + url_path_prefix = prefix.removeprefix(domain) + + provider_response = ( + Provider.search(using=config.es.client) + .query( + Term(domains=domain) & + Term(url_path_prefixes=url_path_prefix) + ) + .execute() + ) + if provider_response.hits.total.value >= 1: + provider: Provider = provider_response.hits[0] + importable_paths.append(_ImportablePath( + path=prefix_path, + archive=archive, + provider=provider, + domain=domain, + url_path_prefix=url_path_prefix, + )) + + for importable_path in importable_paths: + _import_captures_path( + config=config, + importable_path=importable_path, + check_memento=check_memento, + ) diff --git a/archive_query_log/imports/archive_it.py b/archive_query_log/imports/archive_it.py new file mode 100644 index 00000000..b73fd37a --- 
/dev/null +++ b/archive_query_log/imports/archive_it.py @@ -0,0 +1,89 @@ +from urllib.parse import urljoin + +from click import echo +from tqdm.auto import tqdm + +from archive_query_log.archives import add_archive +from archive_query_log.config import Config + +_ARCHIVE_IT_METADATA_FIELDS = [ + "Title", + "Description", + "Subject", + "Coverage", + "Language", + "Collector", + "Creator", + "Publisher", + "Date", + "Identifier", + "Rights", +] + +DEFAULT_ARCHIVE_IT_API_URL: str = "https://partner.archive-it.org" +DEFAULT_ARCHIVE_IT_WAYBACK_URL: str = "https://wayback.archive-it.org/" +DEFAULT_ARCHIVE_IT_PAGE_SIZE: int = 100 + + +def import_archives( + config: Config, + api_url: str = DEFAULT_ARCHIVE_IT_API_URL, + wayback_url: str = DEFAULT_ARCHIVE_IT_WAYBACK_URL, + page_size: int = DEFAULT_ARCHIVE_IT_PAGE_SIZE, + priority: float | None = None, + no_merge: bool = False, + auto_merge: bool = False, +) -> None: + echo("Load Archive-It collections.") + collections_api_url = urljoin(api_url, "/api/collection") + response = config.http.session.get( + collections_api_url, + params=[ + ("limit", 0), + ("format", "json"), + ], + ) + num_collections = int(response.headers["Total-Row-Count"]) + echo(f"Found {num_collections} collections on Archive-It.") + + # noinspection PyTypeChecker + progress = tqdm(total=num_collections, desc="Import archives", + unit="archives", disable=not auto_merge and not no_merge) + offset_range = range(0, num_collections, page_size) + for offset in offset_range: + response = config.http.session.get( + collections_api_url, + params=[ + ("limit", page_size), + ("offset", offset), + ("format", "json"), + ], + ) + response_list = response.json() + for item in response_list: + name = f"Archive-It {item['name']}" + archive_it_id = int(item["id"]) + + description_parts = [] + metadata = item["metadata"] + for metadata_field in _ARCHIVE_IT_METADATA_FIELDS: + if metadata_field in metadata: + for title in metadata[metadata_field]: + 
description_parts.append( + f"{metadata_field}: {title['value']}") + description_parts.append(f"Archive-It ID: {archive_it_id}") + description = "\n".join(description_parts) + cdx_api_url = urljoin( + wayback_url, f"{archive_it_id}/timemap/cdx") + memento_api_url = urljoin(wayback_url, f"{archive_it_id}") + add_archive( + config=config, + name=name, + description=description, + cdx_api_url=cdx_api_url, + memento_api_url=memento_api_url, + priority=priority, + no_merge=no_merge, + auto_merge=auto_merge, + ) + progress.update(1) diff --git a/archive_query_log/imports/yaml.py b/archive_query_log/imports/yaml.py new file mode 100644 index 00000000..a3ffd0be --- /dev/null +++ b/archive_query_log/imports/yaml.py @@ -0,0 +1,481 @@ +from pathlib import Path +from typing import MutableMapping +from typing import Sequence, Iterable +from warnings import warn + +from click import echo +from click import prompt +from diskcache import Index +from elasticsearch_dsl.query import Terms +from tqdm.auto import tqdm +from whois import whois +from whois.parser import PywhoisError +from yaml import safe_load + +from archive_query_log.config import Config +from archive_query_log.orm import Provider +from archive_query_log.parsers.url_offset import add_url_offset_parser +from archive_query_log.parsers.url_page import add_url_page_parser +from archive_query_log.parsers.url_query import add_url_query_parser +from archive_query_log.parsers.warc_query import add_warc_query_parser +from archive_query_log.parsers.warc_snippets import add_warc_snippets_parser +from archive_query_log.parsers.xml import xpaths_from_css_selector, \ + text_xpath, merge_xpaths +from archive_query_log.providers import add_provider +from archive_query_log.utils.es import safe_iter_scan + + +def _provider_name( + i: int, + main_domain: str, + provider_names: MutableMapping[str, str], + review: bool, +) -> str | None: + provider_name_suggest: str | None + if main_domain in provider_names: + if not review: + return 
provider_names[main_domain] + else: + provider_name_suggest = provider_names[main_domain] + else: + try: + main_domain_info = whois(main_domain) + except PywhoisError: + main_domain_info = None + if main_domain_info is not None: + main_org: str | list[str] | None = main_domain_info.org + if isinstance(main_org, list): + main_org = main_org[0] + if main_org is not None: + for restricted_phrase in ["redacted", "privacy", + "domain protection", "not disclosed", + "identity protection", + "domains by proxy"]: + if restricted_phrase in main_org.casefold(): + main_org = None + break + if main_org is not None: + for suffix in ["Inc", "LLC", "Ltd", "LTD", "GmbH", "AG", "S.A", + "SE", "Co", "Pty", "B.V", "S.L", "S.R.L", + "S.A.S", + "SAS", "AB", "&"]: + main_org = main_org.removesuffix(f", {suffix}.") + main_org = main_org.removesuffix(f", {suffix}") + main_org = main_org.removesuffix(f" {suffix}.") + main_org = main_org.removesuffix(f" {suffix}") + provider_name_suggest = main_org + else: + provider_name_suggest = None + provider_name = prompt( + f"Please enter search provider #{i} name (https://{main_domain})", + type=str, default=provider_name_suggest, + show_default=provider_name_suggest != " ") + if provider_name.strip() == "": + return None + provider_names[main_domain] = provider_name + return provider_name + + +def import_providers( + config: Config, + services_path: Path, + cache_path: Path, + review: int | None, + no_merge: bool, + auto_merge: bool, +) -> None: + echo("Load providers from services file.") + with services_path.open("r") as file: + services_list: Sequence[dict] = safe_load(file) + echo(f"Found {len(services_list)} service definitions.") + + services: Iterable[dict] = services_list + provider_names: MutableMapping[str, str] = Index(str(cache_path)) + + if auto_merge or no_merge: + # noinspection PyTypeChecker + services = tqdm( + services, + desc="Import providers", + unit="provider", + ) + + num_services = len(services_list) + + ask_for_name = True 
+ service: dict + for i, service in enumerate(services): + if "domains" not in service: + raise ValueError(f"Service definition #{i} from {services_path} " + f"has no domains: {service}") + + if ("query_parsers" not in service or + len(service["query_parsers"]) == 0): + continue + + main_domain = service["domains"][0] + if ask_for_name: + name = _provider_name(i, main_domain, provider_names, + review is not None and review <= i) + if name is None: + ask_for_name = False + else: + name = None + + add_provider( + config=config, + name=name, + description=None, + notes=service.get("notes"), + exclusion_reason=service.get("excluded"), + domains=set(service["domains"]), + url_path_prefixes=set(service["focused_url_prefixes"]), + priority=num_services - i, + no_merge=no_merge, + auto_merge=auto_merge, + ) + + +def import_url_query_parsers(config: Config, services_path: Path) -> None: + echo("Load providers from services file.") + with services_path.open("r") as file: + services_list: Sequence[dict] = safe_load(file) + echo(f"Found {len(services_list)} service definitions.") + + services: Iterable[dict] = services_list + # noinspection PyTypeChecker + services = tqdm( + services, + desc="Import parsers for providers", + unit="provider", + ) + for i, service in enumerate(services): + if "domains" not in service or "query_parsers" not in service: + continue + + query_parsers = service["query_parsers"] + num_query_parsers = len(query_parsers) + + providers = ( + Provider.search(using=config.es.client) + .query(Terms(domains=service["domains"])) + .scan() + ) + providers = safe_iter_scan(providers) + for provider in providers: + for k, query_parser in enumerate(query_parsers): + if query_parser["type"] == "fragment_segment": + warn(UserWarning( + f"Service definition #{i} " + f"query parser #{k} is of type " + f"'fragment_segment', which is not supported.")) + continue + remove_patterns = query_parser.get("remove_patterns") + if remove_patterns is not None: + 
remove_pattern_regex = "|".join(remove_patterns) + else: + remove_pattern_regex = None + space_patterns = query_parser.get("space_patterns") + if space_patterns is not None: + space_pattern_regex = "|".join(space_patterns) + else: + space_pattern_regex = None + segment_string = query_parser.get("segment") + if segment_string is not None: + segment = int(segment_string) + else: + segment = None + add_url_query_parser( + config=config, + provider_id=provider.meta.id, + url_pattern_regex=query_parser.get("url_pattern"), + priority=num_query_parsers - k, + parser_type=query_parser["type"], + parameter=query_parser.get("parameter"), + segment=segment, + remove_pattern_regex=remove_pattern_regex, + space_pattern_regex=space_pattern_regex, + ) + + +def import_url_page_parsers(config: Config, services_path: Path) -> None: + echo("Load providers from services file.") + with services_path.open("r") as file: + services_list: Sequence[dict] = safe_load(file) + echo(f"Found {len(services_list)} service definitions.") + + services: Iterable[dict] = services_list + # noinspection PyTypeChecker + services = tqdm( + services, + desc="Import parsers for providers", + unit="provider", + ) + for i, service in enumerate(services): + if "domains" not in service or "page_parsers" not in service: + continue + + page_parsers = service["page_parsers"] + num_page_parsers = len(page_parsers) + + providers = ( + Provider.search(using=config.es.client) + .query(Terms(domains=service["domains"])) + .scan() + ) + providers = safe_iter_scan(providers) + for provider in providers: + for k, page_parser in enumerate(page_parsers): + if page_parser["type"] == "fragment_segment": + warn(UserWarning( + f"Service definition #{i} " + f"page parser #{k} is of type " + f"'fragment_segment', which is not supported.")) + continue + remove_patterns = page_parser.get("remove_patterns") + if remove_patterns is not None: + remove_pattern_regex = "|".join(remove_patterns) + else: + remove_pattern_regex = None + 
space_patterns = page_parser.get("space_patterns") + if space_patterns is not None: + space_pattern_regex = "|".join(space_patterns) + else: + space_pattern_regex = None + segment_string = page_parser.get("segment") + if segment_string is not None: + segment = int(segment_string) + else: + segment = None + add_url_page_parser( + config=config, + provider_id=provider.meta.id, + url_pattern_regex=page_parser.get("url_pattern"), + priority=num_page_parsers - k, + parser_type=page_parser["type"], + parameter=page_parser.get("parameter"), + segment=segment, + remove_pattern_regex=remove_pattern_regex, + space_pattern_regex=space_pattern_regex, + ) + + +def import_url_offset_parsers(config: Config, services_path: Path) -> None: + echo("Load providers from services file.") + with services_path.open("r") as file: + services_list: Sequence[dict] = safe_load(file) + echo(f"Found {len(services_list)} service definitions.") + + services: Iterable[dict] = services_list + # noinspection PyTypeChecker + services = tqdm( + services, + desc="Import parsers for providers", + unit="provider", + ) + for i, service in enumerate(services): + if "domains" not in service or "offset_parsers" not in service: + continue + + offset_parsers = service["offset_parsers"] + num_offset_parsers = len(offset_parsers) + + providers = ( + Provider.search(using=config.es.client) + .query(Terms(domains=service["domains"])) + .scan() + ) + providers = safe_iter_scan(providers) + for provider in providers: + for k, offset_parser in enumerate(offset_parsers): + if offset_parser["type"] == "fragment_segment": + warn(UserWarning( + f"Service definition #{i} " + f"offset parser #{k} is of type " + f"'fragment_segment', which is not supported.")) + continue + remove_patterns = offset_parser.get("remove_patterns") + if remove_patterns is not None: + remove_pattern_regex = "|".join(remove_patterns) + else: + remove_pattern_regex = None + space_patterns = offset_parser.get("space_patterns") + if space_patterns is 
not None: + space_pattern_regex = "|".join(space_patterns) + else: + space_pattern_regex = None + segment_string = offset_parser.get("segment") + if segment_string is not None: + segment = int(segment_string) + else: + segment = None + add_url_offset_parser( + config=config, + provider_id=provider.meta.id, + url_pattern_regex=offset_parser.get("url_pattern"), + priority=num_offset_parsers - k, + parser_type=offset_parser["type"], + parameter=offset_parser.get("parameter"), + segment=segment, + remove_pattern_regex=remove_pattern_regex, + space_pattern_regex=space_pattern_regex, + ) + + +def import_warc_query_parsers(config: Config, services_path: Path) -> None: + echo("Load providers from services file.") + with services_path.open("r") as file: + services_list: Sequence[dict] = safe_load(file) + echo(f"Found {len(services_list)} service definitions.") + + services: Iterable[dict] = services_list + # noinspection PyTypeChecker + services = tqdm( + services, + desc="Import parsers for providers", + unit="provider", + ) + for service in services: + if ("domains" not in service or + "interpreted_query_parsers" not in service): + continue + + interpreted_query_parsers = service["interpreted_query_parsers"] + num_interpreted_query_parsers = len(interpreted_query_parsers) + + providers = ( + Provider.search(using=config.es.client) + .query(Terms(domains=service["domains"])) + .scan() + ) + providers = safe_iter_scan(providers) + for provider in providers: + for k, interpreted_query_parser in ( + enumerate(interpreted_query_parsers)): + if interpreted_query_parser["type"] != "html_selector": + continue + remove_patterns = ( + interpreted_query_parser.get("remove_patterns")) + if remove_patterns is not None: + remove_pattern_regex = "|".join(remove_patterns) + else: + remove_pattern_regex = None + space_patterns = interpreted_query_parser.get("space_patterns") + if space_patterns is not None: + space_pattern_regex = "|".join(space_patterns) + else: + space_pattern_regex = 
None + query_selector = interpreted_query_parser["query_selector"] + + query_text = interpreted_query_parser.get("query_text", False) + query_attribute = interpreted_query_parser.get( + "query_attribute", "value" if not query_text else None) + + query_xpaths = xpaths_from_css_selector(query_selector) + query_xpaths = [ + "//" + + text_xpath( + query_xpath, + attribute=query_attribute, + text=query_text, + ) + for query_xpath in query_xpaths + ] + query_xpath = merge_xpaths(query_xpaths) + + add_warc_query_parser( + config=config, + provider_id=provider.meta.id, + url_pattern_regex=interpreted_query_parser.get( + "url_pattern"), + priority=num_interpreted_query_parsers - k, + parser_type="xpath", + xpath=query_xpath, + remove_pattern_regex=remove_pattern_regex, + space_pattern_regex=space_pattern_regex, + ) + + +def import_warc_snippets_parsers(config: Config, services_path: Path) -> None: + echo("Load providers from services file.") + with services_path.open("r") as file: + services_list: Sequence[dict] = safe_load(file) + echo(f"Found {len(services_list)} service definitions.") + + services: Iterable[dict] = services_list + # noinspection PyTypeChecker + services = tqdm( + services, + desc="Import parsers for providers", + unit="provider", + ) + for service in services: + if ("domains" not in service or "results_parsers" not in service): + continue + + results_parsers = service["results_parsers"] + num_results_parsers = len(results_parsers) + + providers = ( + Provider.search(using=config.es.client) + .query(Terms(domains=service["domains"])) + .scan() + ) + providers = safe_iter_scan(providers) + for provider in providers: + for k, results_parser in enumerate(results_parsers): + if results_parser["type"] != "html_selector": + continue + results_selector = results_parser["results_selector"] + url_selector = results_parser.get("url_selector") + title_selector = results_parser.get("title_selector") + snippet_selector = results_parser.get("snippet_selector") + + 
results_xpaths = xpaths_from_css_selector(results_selector) + results_xpaths = [ + "//" + result_xpath + for result_xpath in results_xpaths + ] + results_xpath = merge_xpaths(results_xpaths) + + if url_selector is not None: + url_xpaths = xpaths_from_css_selector(url_selector) + url_xpaths = [ + text_xpath(xpath, attribute="href") + for xpath in url_xpaths + ] + url_xpath = merge_xpaths(url_xpaths) + else: + url_xpath = None + + if title_selector is not None: + title_xpaths = xpaths_from_css_selector(title_selector) + title_xpaths = [ + text_xpath(xpath, text=True) + for xpath in title_xpaths + ] + title_xpath = merge_xpaths(title_xpaths) + else: + title_xpath = None + + if snippet_selector is not None: + snippet_xpaths = xpaths_from_css_selector(snippet_selector) + snippet_xpaths = [ + text_xpath(xpath, text=True) + for xpath in snippet_xpaths + ] + snippet_xpath = merge_xpaths(snippet_xpaths) + else: + snippet_xpath = None + + add_warc_snippets_parser( + config=config, + provider_id=provider.meta.id, + url_pattern_regex=results_parser.get("url_pattern"), + priority=num_results_parsers - k, + parser_type="xpath", + xpath=results_xpath, + url_xpath=url_xpath, + title_xpath=title_xpath, + text_xpath=snippet_xpath, + ) diff --git a/archive_query_log/legacy/__init__.py b/archive_query_log/legacy/__init__.py new file mode 100644 index 00000000..93c64e93 --- /dev/null +++ b/archive_query_log/legacy/__init__.py @@ -0,0 +1,14 @@ +from logging import getLogger, Logger +from pathlib import Path + +PROJECT_DIRECTORY_PATH: Path = Path(__file__).parent.parent.parent +DATA_DIRECTORY_PATH: Path = PROJECT_DIRECTORY_PATH / "data" + +CDX_API_URL = "https://web.archive.org/cdx/search/cdx" + +LOGGER: Logger = getLogger(__name__) + +if __name__ == "__main__": + from archive_query_log.legacy.cli import main + + main() diff --git a/archive_query_log/legacy/cli/__init__.py b/archive_query_log/legacy/cli/__init__.py new file mode 100644 index 00000000..cb4dc902 --- /dev/null +++ 
b/archive_query_log/legacy/cli/__init__.py @@ -0,0 +1,19 @@ +from click import group + +from archive_query_log.legacy.cli.alexa import alexa +from archive_query_log.legacy.cli.corpus import corpus_command +from archive_query_log.legacy.cli.external import external +from archive_query_log.legacy.cli.index import index_command +from archive_query_log.legacy.cli.make import make_group + + +@group() +def main(): + pass + + +main.add_command(alexa) +main.add_command(corpus_command) +main.add_command(external) +main.add_command(index_command) +main.add_command(make_group) diff --git a/archive_query_log/cli/alexa.py b/archive_query_log/legacy/cli/alexa.py similarity index 77% rename from archive_query_log/cli/alexa.py rename to archive_query_log/legacy/cli/alexa.py index 59888d3a..0380847b 100644 --- a/archive_query_log/cli/alexa.py +++ b/archive_query_log/legacy/cli/alexa.py @@ -1,13 +1,12 @@ from pathlib import Path -from click import option, Path as PathParam, argument, IntRange +from click import option, Path as PathParam, argument, IntRange, group -from archive_query_log import DATA_DIRECTORY_PATH, CDX_API_URL -from archive_query_log.cli.main import main -from archive_query_log.cli.util import URL +from archive_query_log.legacy import DATA_DIRECTORY_PATH, CDX_API_URL +from archive_query_log.legacy.cli.util import URL -@main.group("alexa") +@group("alexa") def alexa(): pass @@ -32,7 +31,7 @@ def alexa(): default=DATA_DIRECTORY_PATH / "alexa-top-1m-archived-urls.jsonl" ) def archived_urls(api_url: str, output_path: Path) -> None: - from archive_query_log.services.alexa import AlexaTop1MArchivedUrls + from archive_query_log.legacy.services.alexa import AlexaTop1MArchivedUrls AlexaTop1MArchivedUrls( output_path=output_path, cdx_api_url=api_url, @@ -64,7 +63,7 @@ def archived_urls(api_url: str, output_path: Path) -> None: default=1000, ) def domains(data_dir: Path, api_url: str, depth: int) -> None: - from archive_query_log.services.alexa import AlexaTop1MFusedDomains + 
from archive_query_log.legacy.services.alexa import AlexaTop1MFusedDomains AlexaTop1MFusedDomains( data_directory_path=data_dir, cdx_api_url=api_url, diff --git a/archive_query_log/cli/corpus.py b/archive_query_log/legacy/cli/corpus.py similarity index 95% rename from archive_query_log/cli/corpus.py rename to archive_query_log/legacy/cli/corpus.py index b44ffc5b..c2ef2a80 100644 --- a/archive_query_log/cli/corpus.py +++ b/archive_query_log/legacy/cli/corpus.py @@ -7,23 +7,22 @@ from typing import Collection from uuid import UUID -from click import option, BOOL +from click import option, BOOL, command from tqdm.auto import tqdm -from archive_query_log import DATA_DIRECTORY_PATH -from archive_query_log.cli import main -from archive_query_log.cli.util import PathParam -from archive_query_log.index import ArchivedRawSerpIndex, \ +from archive_query_log.legacy import DATA_DIRECTORY_PATH +from archive_query_log.legacy.cli.util import PathParam +from archive_query_log.legacy.index import ArchivedRawSerpIndex, \ ArchivedUrlIndex, ArchivedQueryUrlIndex, ArchivedParsedSerpIndex, \ ArchivedSearchResultSnippetIndex, ArchivedRawSearchResultIndex, \ LocatedRecord -from archive_query_log.model import ArchivedUrl, CorpusQueryUrl, \ +from archive_query_log.legacy.model import ArchivedUrl, CorpusQueryUrl, \ ArchivedSearchResultSnippet, CorpusDocument, CorpusJsonlLocation, \ CorpusWarcLocation, ArchivedRawSerp, \ ArchivedQueryUrl, ArchivedParsedSerp, CorpusQuery, CorpusSearchResult -@main.command( +@command( "corpus", help="Generate corpus.", ) @@ -71,10 +70,6 @@ def corpus_command( queries: bool, output_directory: Path, ) -> None: - from archive_query_log.index import ArchivedUrlIndex, \ - ArchivedQueryUrlIndex, ArchivedRawSerpIndex, ArchivedParsedSerpIndex, \ - ArchivedSearchResultSnippetIndex, ArchivedRawSearchResultIndex - output_path: Path if output_directory is not None: output_path = output_directory @@ -155,6 +150,7 @@ def corpus_command( else: archived_ids = 
set(archived_url_index) + # noinspection PyTypeChecker archived_ids = tqdm( archived_ids, desc="Build corpus", @@ -220,7 +216,7 @@ def _build_query_url( archived_parsed_serp_loc: LocatedRecord[ CorpusJsonlLocation, ArchivedParsedSerp ] | None, -) -> CorpusQueryUrl | None: +) -> CorpusQueryUrl: archived_url = archived_url_loc.record return CorpusQueryUrl( id=archived_url.id, @@ -269,8 +265,8 @@ def _build_search_result( # archived_parsed_search_result_index: ArchivedParsedSearchResultIndex, archived_search_result_snippet: ArchivedSearchResultSnippet, ) -> CorpusSearchResult: - archived_snippet_loc = archived_search_result_snippet_index \ - .get(archived_search_result_snippet.id) + archived_snippet_loc = archived_search_result_snippet_index[ + archived_search_result_snippet.id] archived_raw_search_result_loc = archived_raw_search_result_index \ .get(archived_search_result_snippet.id) # archived_parsed_search_result_loc = archived_parsed_search_result_index \ diff --git a/archive_query_log/legacy/cli/util.py b/archive_query_log/legacy/cli/util.py new file mode 100644 index 00000000..f28dafae --- /dev/null +++ b/archive_query_log/legacy/cli/util.py @@ -0,0 +1,69 @@ +from typing import Dict, Any, List +from urllib.parse import urlparse + +from click import Parameter, Context +from click.shell_completion import CompletionItem +from click.types import StringParamType, Path, Choice + + +class UrlParam(StringParamType): + name = "url" + + def convert(self, value, param, ctx): + value = super().convert(value, param, ctx) + if value is None: + return None + tokens = urlparse(value) + if not tokens.scheme or not tokens.netloc: + self.fail(f"{value} is not a valid URL", param, ctx) + return value + + +URL = UrlParam() + +PathParam = Path + + +class ServiceChoice(Choice): + + def __init__(self) -> None: + super().__init__(choices=[], case_sensitive=False) + + def _ensure_choices(self): + if len(self.choices) == 0: + from archive_query_log.legacy.config import SERVICES + 
self.choices = sorted(SERVICES.keys()) + + def to_info_dict(self) -> Dict[str, Any]: + self._ensure_choices() + return super().to_info_dict() + + def get_metavar(self, param: Parameter) -> str: + self._ensure_choices() + return super().get_metavar(param) + + def get_missing_message(self, param: Parameter) -> str: + self._ensure_choices() + return super().get_missing_message(param) + + def convert( + self, + value: Any, + param: Parameter | None, + ctx: Context | None, + ) -> Any: + self._ensure_choices() + return super().convert(value, param, ctx) + + def __repr__(self) -> str: + self._ensure_choices() + return super().__repr__() + + def shell_complete( + self, + ctx: Context, + param: Parameter, + incomplete: str, + ) -> List[CompletionItem]: + self._ensure_choices() + return super().shell_complete(ctx, param, incomplete) diff --git a/archive_query_log/legacy/config.py b/archive_query_log/legacy/config.py new file mode 100644 index 00000000..a2897620 --- /dev/null +++ b/archive_query_log/legacy/config.py @@ -0,0 +1,9 @@ +from typing import Mapping + +from archive_query_log.legacy import DATA_DIRECTORY_PATH +from archive_query_log.legacy.model import Service +from archive_query_log.legacy.services import read_services + +# Load all services that have parsers and create the services for them. 
+SERVICES_PATH = DATA_DIRECTORY_PATH / "selected-services.yaml" +SERVICES: Mapping[str, Service] = read_services(SERVICES_PATH) diff --git a/archive_query_log/conftest.py b/archive_query_log/legacy/conftest.py similarity index 100% rename from archive_query_log/conftest.py rename to archive_query_log/legacy/conftest.py diff --git a/archive_query_log/results/__init__.py b/archive_query_log/legacy/download/__init__.py similarity index 100% rename from archive_query_log/results/__init__.py rename to archive_query_log/legacy/download/__init__.py diff --git a/archive_query_log/download/iterable.py b/archive_query_log/legacy/download/iterable.py similarity index 59% rename from archive_query_log/download/iterable.py rename to archive_query_log/legacy/download/iterable.py index a858c722..430c2eb5 100644 --- a/archive_query_log/download/iterable.py +++ b/archive_query_log/legacy/download/iterable.py @@ -1,15 +1,16 @@ from dataclasses import dataclass from functools import cached_property +from gzip import open as gzip_open, GzipFile from json import JSONDecodeError from pathlib import Path from typing import Sized, Iterable, Iterator -from fastwarc import GZipStream, FileStream, ArchiveIterator, WarcRecordType, \ - WarcRecord from marshmallow import Schema +from warcio.archiveiterator import ArchiveIterator +from warcio.recordloader import ArcWarcRecord -from archive_query_log import LOGGER -from archive_query_log.model import ArchivedQueryUrl, ArchivedRawSerp +from archive_query_log.legacy import LOGGER +from archive_query_log.legacy.model import ArchivedQueryUrl, ArchivedRawSerp @dataclass(frozen=True) @@ -32,58 +33,62 @@ def _check_raw_serps_paths(self): f"Raw SERPs path must be a directory: {self.path}" ) - def _streams(self) -> Iterator[tuple[Path, GZipStream]]: + def _streams(self) -> Iterator[tuple[Path, GzipFile]]: files = self.path.glob("*.warc.gz") for file in files: - yield file, GZipStream(FileStream(str(file), "rb")) + with gzip_open(file, "rb") as stream: + 
yield file, stream def __len__(self) -> int: return sum( 1 for _, stream in self._streams() - for _ in ArchiveIterator( - stream, - record_types=WarcRecordType.response, - parse_http=False, - ) + for record in ArchiveIterator(stream, no_record_parse=True) + if record.rec_type == "response" ) @cached_property def _archived_serp_url_schema(self) -> Schema: return ArchivedQueryUrl.schema() - def _read_serp_content(self, record: WarcRecord) -> ArchivedRawSerp | None: + def _read_serp_content( + self, + record: ArcWarcRecord, + ) -> ArchivedRawSerp | None: archived_serp_url: ArchivedQueryUrl - record_url_header = record.headers["Archived-URL"] + record_url_header = record.rec_headers.get_header("Archived-URL") try: archived_serp_url = self._archived_serp_url_schema.loads( record_url_header ) except JSONDecodeError: - LOGGER.warning(f"Could not index {record_url_header} " - f"from record {record.record_id}.") + LOGGER.warning( + f"Could not index {record_url_header} from record " + f"{record.rec_headers.get_header('WARC-Record-ID')}." 
+ ) return None - content_type = record.http_charset - if content_type is None: - content_type = "utf8" + encoding = record.http_headers.get_header("Content-Type") + if encoding is None: + encoding = "" + encoding = encoding.split(";")[-1].split("=")[-1].strip().lower() + if encoding == "" or "/" in encoding: + encoding = "utf8" return ArchivedRawSerp( url=archived_serp_url.url, timestamp=archived_serp_url.timestamp, query=archived_serp_url.query, page=archived_serp_url.page, offset=archived_serp_url.offset, - content=record.reader.read(), - encoding=content_type, + content=record.content_stream().read(), + encoding=encoding, ) def __iter__(self) -> Iterator[ArchivedRawSerp]: for path, stream in self._streams(): failures = False - for record in ArchiveIterator( - stream, - record_types=WarcRecordType.response, - parse_http=True, - ): + for record in ArchiveIterator(stream): + if record.rec_type != "response": + continue serp = self._read_serp_content(record) if serp is None: failures = True diff --git a/archive_query_log/download/raw.py b/archive_query_log/legacy/download/raw.py similarity index 96% rename from archive_query_log/download/raw.py rename to archive_query_log/legacy/download/raw.py index bc649e1f..40fbc689 100644 --- a/archive_query_log/download/raw.py +++ b/archive_query_log/legacy/download/raw.py @@ -8,8 +8,8 @@ from asyncio_pool import AioPool from tqdm.auto import tqdm -from archive_query_log.model import ArchivedUrl -from archive_query_log.util.archive_http import archive_http_client +from archive_query_log.legacy.model import ArchivedUrl +from archive_query_log.legacy.util.archive_http import archive_http_client class WebArchiveRawDownloader: @@ -102,7 +102,7 @@ async def _download_single( if file_path.exists(): return file_path url = archived_url.raw_archive_url - await sleep(1.0 * random()) + await sleep(1.0 * random()) # nosec: B311 try: async with client.get(url) as response: response.raise_for_status() diff --git 
a/archive_query_log/download/warc.py b/archive_query_log/legacy/download/warc.py similarity index 86% rename from archive_query_log/download/warc.py rename to archive_query_log/legacy/download/warc.py index 110fe5ff..be6944a9 100644 --- a/archive_query_log/download/warc.py +++ b/archive_query_log/legacy/download/warc.py @@ -13,10 +13,11 @@ from tqdm.auto import tqdm from warcio import WARCWriter, StatusAndHeaders -from archive_query_log.model import ArchivedUrl, Service -from archive_query_log.queries.iterable import ArchivedQueryUrls -from archive_query_log.serps.iterable import ArchivedParsedSerps -from archive_query_log.util.archive_http import archive_http_client +from archive_query_log.legacy.model import ArchivedUrl, Service, \ + ArchivedQueryUrl +from archive_query_log.legacy.queries.iterable import ArchivedQueryUrls +from archive_query_log.legacy.serps.iterable import ArchivedParsedSerps +from archive_query_log.legacy.util.archive_http import archive_http_client class _CdxPage(NamedTuple): @@ -89,6 +90,7 @@ def _next_available_file_path( file_size = path.stat().st_size if file_size + buffer_size <= self.max_file_size: return path + raise RuntimeError("All available file paths are filled.") async def _download( self, @@ -145,8 +147,6 @@ async def _download_single( if progress is not None: progress.update() return True - # FIXME: This won't work for snippet URLs. Instead, - # we need to fetch the closest archived URL if any. 
archive_url = url.archived_url.raw_archive_url url_headers = { "Archived-URL": url.archived_url.schema().dumps(url.archived_url), @@ -174,6 +174,9 @@ async def _download_single( warc_headers_dict={**url_headers}, ) writer.write_record(request_record) + + protocol = f"HTTP/{response.version}" + reason = str(response.reason) response_record = writer.create_warc_record( uri=str(response.url), record_type="response", @@ -181,7 +184,7 @@ async def _download_single( statusline=" ".join(( protocol, str(response.status), - response.reason, + reason, )), headers=response.headers, protocol=protocol @@ -253,8 +256,12 @@ def _service_pages( ] if cdx_page is not None: - assert domain is not None - assert len(domain_paths) == 1 + if domain is None: + raise RuntimeError( + "Domain must be specified when page is specified.") + if len(domain_paths) < 1: + raise RuntimeError( + "There must be exactly one domain path.") cdx_page_paths = [domain_paths[0] / f"{cdx_page:010}.jsonl.gz"] else: cdx_page_paths = [ @@ -306,13 +313,22 @@ def _deduplicate_urls( ) -> list[_CdxUrl]: if snippets: return list(urls) + if not all( + isinstance(url.archived_url, ArchivedQueryUrl) + for url in urls + ): + return list(urls) urls = sorted( urls, - key=lambda url: url.archived_url.query + key=lambda url: ( + url.archived_url.query + if isinstance(url.archived_url, ArchivedQueryUrl) else "") ) grouped_query_urls = groupby( urls, - key=lambda url: url.archived_url.query + key=lambda url: ( + url.archived_url.query + if isinstance(url.archived_url, ArchivedQueryUrl) else "") ) return [ WebArchiveWarcDownloader._canonical_url(urls) @@ -325,6 +341,7 @@ def _page_urls( focused: bool, snippets: bool, ) -> Iterable[_CdxUrl]: + urls: Iterable[_CdxUrl] if snippets: urls = ( _CdxUrl(url, page.output_path) @@ -349,7 +366,7 @@ async def download_service( cdx_page: int | None = None, snippets: bool = False, ): - pages = self._service_pages( + pages_list: Sequence[_CdxPage] = self._service_pages( 
data_directory=data_directory, focused=focused, service=service, @@ -358,26 +375,28 @@ async def download_service( snippets=snippets, ) - if len(pages) == 0: + if len(pages_list) == 0: return + pages: Iterable[_CdxPage] = pages_list if focused: + # noinspection PyTypeChecker pages = tqdm( pages, desc="Deduplicate query URLs", unit="page", ) - archived_urls = chain.from_iterable( + cdx_urls: Sequence[_CdxUrl] = list(chain.from_iterable( self._page_urls(page, focused, snippets) for page in pages - ) + )) if focused: - archived_urls = self._deduplicate_urls(archived_urls, snippets) - archived_urls = Random(0).sample( - archived_urls, - min(len(archived_urls), 75_000) - ) + archived_urls_list = self._deduplicate_urls( + cdx_urls, snippets) + sample_size = min(len(cdx_urls), 75_000) + random = Random(0) # nosec: B311 + cdx_urls = random.sample(archived_urls_list, sample_size) - await self._download(archived_urls) + await self._download(cdx_urls) diff --git a/archive_query_log/generate_review_sample.py b/archive_query_log/legacy/generate_review_sample.py similarity index 93% rename from archive_query_log/generate_review_sample.py rename to archive_query_log/legacy/generate_review_sample.py index fb69eaba..60730eff 100644 --- a/archive_query_log/generate_review_sample.py +++ b/archive_query_log/legacy/generate_review_sample.py @@ -36,7 +36,8 @@ def main(): # continue lines.append(line) - lines = Random(0).sample(lines, REVIEW_SAMPLE_SIZE) + random = Random(0) # nosec: B311 + lines = random.sample(lines, REVIEW_SAMPLE_SIZE) with REVIEW_SAMPLE_QUERIES_PATH.open("wt") as o: for line in lines: diff --git a/archive_query_log/index/__init__.py b/archive_query_log/legacy/index/__init__.py similarity index 79% rename from archive_query_log/index/__init__.py rename to archive_query_log/legacy/index/__init__.py index 212c9b81..bb881450 100644 --- a/archive_query_log/index/__init__.py +++ b/archive_query_log/legacy/index/__init__.py @@ -3,28 +3,26 @@ from dataclasses import 
dataclass from functools import cached_property from gzip import GzipFile -from io import TextIOWrapper from json import loads, JSONDecodeError from pathlib import Path -from shelve import open as shelf_open, Shelf +from shelve import open as shelf_open, Shelf # nosec: B403 from shutil import copyfileobj -from typing import Iterator, TypeVar, Generic, Type, IO, final, ContextManager +from typing import Iterator, TypeVar, Generic, Type, final, \ + ContextManager, Iterable from uuid import UUID, uuid5, NAMESPACE_URL from dataclasses_json import DataClassJsonMixin -from fastwarc import ArchiveIterator, FileStream, WarcRecord, \ - WarcRecordType -# pylint: disable=no-name-in-module -from fastwarc.stream_io import PythonIOStreamAdapter from marshmallow import Schema from tqdm.auto import tqdm +from warcio.archiveiterator import ArchiveIterator +from warcio.recordloader import ArcWarcRecord -from archive_query_log import DATA_DIRECTORY_PATH, LOGGER -from archive_query_log.model import ArchivedUrl, ArchivedQueryUrl, \ +from archive_query_log.legacy import DATA_DIRECTORY_PATH, LOGGER +from archive_query_log.legacy.model import ArchivedUrl, ArchivedQueryUrl, \ ArchivedParsedSerp, ArchivedSearchResultSnippet, ArchivedRawSerp, \ ArchivedRawSearchResult, CorpusJsonlLocation, CorpusJsonlSnippetLocation, \ CorpusWarcLocation -from archive_query_log.util.text import count_lines +from archive_query_log.legacy.util.text import count_lines, text_io_wrapper @dataclass(frozen=True) @@ -126,9 +124,9 @@ def _index_jsonl(self, path: Path) -> None: offset = 0 index: list[tuple[str, str, str]] = [] - with GzipFile(path, mode="r") as gzip_file: - gzip_file: IO[str] - for line in gzip_file: + with (GzipFile(path, mode="rb") as gzip_file, + text_io_wrapper(gzip_file) as file): + for line in file: try: record = loads(line) except JSONDecodeError: @@ -145,12 +143,9 @@ def _index_jsonl(self, path: Path) -> None: )) offset = gzip_file.tell() - try: - with index_path.open("wt") as index_file: - 
index_writer = writer(index_file) - index_writer.writerows(index) - except Exception as e: - LOGGER.error(e) + with index_path.open("wt") as index_file: + index_writer = writer(index_file) + index_writer.writerows(index) def _index_warc(self, dir_path: Path) -> None: if not dir_path.exists(): @@ -169,39 +164,36 @@ def _index_warc(self, dir_path: Path) -> None: for path in dir_path.iterdir(): if path.name.startswith("."): continue - records = ArchiveIterator( - FileStream(str(path), "rb"), - record_types=WarcRecordType.response, - parse_http=False, - ) - for record in records: - record: WarcRecord - offset = record.stream_pos - try: - record_url = loads(record.headers["Archived-URL"]) - except JSONDecodeError: - LOGGER.error( - f"Could not index " - f"{record.headers['Archived-URL']} " - f"at {path}." + with path.open("rb") as file: + records = ArchiveIterator(file, no_record_parse=True) + record: ArcWarcRecord + for record in records: + if record.rec_type != "response": + continue + offset = record.raw_stream.tell() + try: + record_url = loads( + record.rec_headers.get_header("Archived-URL")) + except JSONDecodeError: + LOGGER.error( + f"Could not index " + f"{record.rec_headers.get_header('Archived-URL')} " + f"at {path}." 
+ ) + return + record_id = uuid5( + NAMESPACE_URL, + f"{record_url['timestamp']}:{record_url['url']}", ) - return - record_id = uuid5( - NAMESPACE_URL, - f"{record_url['timestamp']}:{record_url['url']}", - ) - index.append(( - str(record_id), - str(path.relative_to(self.data_directory)), - str(offset), - )) + index.append(( + str(record_id), + str(path.relative_to(self.data_directory)), + str(offset), + )) - try: - with index_path.open("wt") as index_file: - index_writer = writer(index_file) - index_writer.writerows(index) - except Exception as e: - LOGGER.error(e) + with index_path.open("wt") as index_file: + index_writer = writer(index_file) + index_writer.writerows(index) def _index_jsonl_snippets(self, path: Path) -> None: if not path.exists(): @@ -218,9 +210,9 @@ def _index_jsonl_snippets(self, path: Path) -> None: offset = 0 index: list[tuple[str, str, str, str]] = [] - with GzipFile(path, mode="r") as gzip_file: - gzip_file: IO[str] - for line in gzip_file: + with (GzipFile(path, mode="rb") as gzip_file, + text_io_wrapper(gzip_file) as file): + for line in file: try: record = loads(line) except JSONDecodeError: @@ -239,12 +231,9 @@ def _index_jsonl_snippets(self, path: Path) -> None: )) offset = gzip_file.tell() - try: - with index_path.open("wt") as index_file: - index_writer = writer(index_file) - index_writer.writerows(index) - except Exception as e: - LOGGER.error(e) + with index_path.open("wt") as index_file: + index_writer = writer(index_file) + index_writer.writerows(index) def _index(self, path: Path) -> None: if self.base_type == "archived-urls": @@ -289,21 +278,25 @@ def shelf_path(self) -> Path: def index(self) -> None: # Index each path individually. 
- indexed_paths = [] + indexable_paths: Iterable[Path] + # noinspection PyTypeChecker indexable_paths = tqdm( self._indexable_paths(), total=sum(1 for _ in self._indexable_paths()), desc="Index paths", unit="path", ) + indexed_paths_list: list[Path] = [] for indexable_path in indexable_paths: self._index(indexable_path) - indexed_paths.append(indexable_path) + indexed_paths_list.append(indexable_path) + indexed_paths: Iterable[Path] = indexed_paths_list # Merge all indexes into a single index. path = self.path num_lines = 0 with path.open("wb") as aggregated_index_file: + # noinspection PyTypeChecker indexed_paths = tqdm( indexed_paths, desc="Merge indices", @@ -318,21 +311,6 @@ def index(self) -> None: with index_path.open("rb") as index_file: num_lines += count_lines(index_file) - # TODO: This is too slow. - # # Create index shelf for efficient lookups. - # shelf_path = self.shelf_path - # with shelf_open(str(shelf_path), "c") as shelf: - # with path.open("rt") as file: - # lines = tqdm( - # file, - # total=num_lines, - # desc="Create index shelf", - # unit="line", - # ) - # for line in lines: - # uuid, line = line.split(",", maxsplit=1) - # shelf[uuid] = line - _CorpusLocationType = TypeVar( "_CorpusLocationType", @@ -385,7 +363,7 @@ def _index_shelf_path(self) -> Path: @cached_property def _index_shelve(self) -> Shelf: - return shelf_open(str(self._index_shelf_path), "r") + return shelf_open(str(self._index_shelf_path), "r") # nosec: B301 def index(self) -> None: self._meta_index.index() @@ -439,9 +417,8 @@ def _to_corpus_location(self, csv_line: list) -> CorpusJsonlLocation: def _read_record(self, location: CorpusJsonlLocation) -> _RecordType: path = self.data_directory / location.relative_path with GzipFile(path, "rb") as gzip_file: - gzip_file: IO[bytes] gzip_file.seek(location.byte_offset) - with TextIOWrapper(gzip_file) as text_file: + with text_io_wrapper(gzip_file) as text_file: line = text_file.readline() return self._schema.loads(line) @@ -455,16 
+432,15 @@ def _to_corpus_location(self, csv_line: list) -> CorpusWarcLocation: byte_offset=int(csv_line[2]), ) - def _read_record(self, location: CorpusJsonlLocation) -> _RecordType: + def _read_record(self, location: CorpusWarcLocation) -> _RecordType: path = self.data_directory / location.relative_path with path.open("rb") as file: file.seek(location.byte_offset) - stream = PythonIOStreamAdapter(file) - record: WarcRecord = next(ArchiveIterator(stream)) + record: ArcWarcRecord = next(ArchiveIterator(file)) return self._read_warc_record(record) @abstractmethod - def _read_warc_record(self, record: WarcRecord) -> _RecordType: + def _read_warc_record(self, record: ArcWarcRecord) -> _RecordType: pass @@ -494,11 +470,13 @@ class ArchivedRawSerpIndex(_WarcIndex[ArchivedRawSerp]): data_directory: Path = DATA_DIRECTORY_PATH focused: bool = False - def _read_warc_record(self, record: WarcRecord) -> ArchivedRawSerp: - archived_url: ArchivedQueryUrl = self.schema.loads( - record.headers["Archived-URL"] - ) - content_type = record.http_charset + def _read_warc_record(self, record: ArcWarcRecord) -> ArchivedRawSerp: + header = record.rec_headers.get_header("Archived-URL") + archived_url = self.schema.loads( + record.rec_headers.get_header("Archived-URL")) + if isinstance(archived_url, list): + raise ValueError(f"Expected one URL in the header: {header}") + content_type = record.http_headers.get_header("Content-Type") if content_type is None: content_type = "utf8" return ArchivedRawSerp( @@ -507,7 +485,7 @@ def _read_warc_record(self, record: WarcRecord) -> ArchivedRawSerp: query=archived_url.query, page=archived_url.page, offset=archived_url.offset, - content=record.reader.read(), + content=record.content_stream().read(), encoding=content_type, ) @@ -544,14 +522,15 @@ def _to_corpus_location( def _read_record( self, location: CorpusJsonlSnippetLocation - ) -> _RecordType: + ) -> ArchivedSearchResultSnippet: path = self.data_directory / location.relative_path with 
GzipFile(path, "rb") as gzip_file: - gzip_file: IO[bytes] gzip_file.seek(location.byte_offset) - with TextIOWrapper(gzip_file) as text_file: + with text_io_wrapper(gzip_file) as text_file: line = text_file.readline() - record: ArchivedParsedSerp = self.schema.loads(line) + record = self.schema.loads(line) + if isinstance(record, list): + raise ValueError(f"Expected one result per line: {line}") return record.results[location.index] @@ -563,11 +542,15 @@ class ArchivedRawSearchResultIndex(_WarcIndex[ArchivedRawSearchResult]): data_directory: Path = DATA_DIRECTORY_PATH focused: bool = False - def _read_warc_record(self, record: WarcRecord) -> ArchivedRawSearchResult: - archived_url: ArchivedSearchResultSnippet = self.schema.loads( - record.headers["Archived-URL"] - ) - content_type = record.http_charset + def _read_warc_record( + self, + record: ArcWarcRecord, + ) -> ArchivedRawSearchResult: + header = record.rec_headers.get_header("Archived-URL") + archived_url = self.schema.loads(header) + if isinstance(archived_url, list): + raise ValueError(f"Expected one URL in the header: {header}") + content_type = record.http_headers.get_header("Content-Type") if content_type is None: content_type = "utf8" return ArchivedRawSearchResult( @@ -576,6 +559,6 @@ def _read_warc_record(self, record: WarcRecord) -> ArchivedRawSearchResult: rank=archived_url.rank, title=archived_url.title, snippet=archived_url.snippet, - content=record.reader.read(), + content=record.content_stream().read(), encoding=content_type, ) diff --git a/archive_query_log/model/__init__.py b/archive_query_log/legacy/model/__init__.py similarity index 93% rename from archive_query_log/model/__init__.py rename to archive_query_log/legacy/model/__init__.py index 8ab8a111..ad8c8549 100644 --- a/archive_query_log/model/__init__.py +++ b/archive_query_log/legacy/model/__init__.py @@ -9,9 +9,10 @@ from dataclasses_json import DataClassJsonMixin, config from marshmallow.fields import List, Nested, String, Field +from 
publicsuffixlist import PublicSuffixList -from archive_query_log.model.highlight import HighlightedText -from archive_query_log.util.serialization import HighlightedTextField +from archive_query_log.legacy.model.highlight import HighlightedText +from archive_query_log.legacy.util.serialization import HighlightedTextField @dataclass(frozen=True, slots=True) @@ -60,7 +61,7 @@ def url_md5(self) -> str: """ MD5 hash of the original URL. """ - return md5(self.url.encode()).hexdigest() + return md5(self.url.encode(), usedforsecurity=False).hexdigest() @cached_property def datetime(self) -> datetime: @@ -251,16 +252,16 @@ class ArchivedParsedSearchResult(ArchivedSearchResultSnippet, """ Plaintext of the archived SERP's snapshot content. """ - # TODO - pass # flake8: noqa: E402 -from archive_query_log.model.parse import QueryParser, \ +from archive_query_log.legacy.model.parse import QueryParser, \ PageParser, OffsetParser, QueryParserField, PageOffsetParserField, \ ResultsParserField, InterpretedQueryParserField, InterpretedQueryParser, \ ResultsParser +_public_suffix_list = PublicSuffixList() + @dataclass(frozen=True, slots=True) class Service(DataClassJsonMixin): @@ -276,52 +277,32 @@ class Service(DataClassJsonMixin): Service name (corresponds to ``alexa_domain`` without the ``alexa_public_suffix``). """ - public_suffix: str - """ - Public suffix (https://publicsuffix.org/) of ``alexa_domain``. - """ - alexa_domain: str - """ - Domain as it appears in Alexa top-1M ranks. - """ - alexa_rank: int | None - """ - Rank from fused Alexa top-1M rankings. - """ - category: str | None - """ - Category of the service (manual annotation). - """ - notes: str | None - """ - Notes about the service (manual annotation). - """ - input_field: bool | None - """ - Whether the service has an input field. - """ - search_form: bool | None - """ - Whether the service has a search form element. - """ - search_div: bool | None - """ - Whether the service has a search div element. 
- """ domains: Sequence[str] = field( metadata=config( decoder=tuple, mm_field=List(String()) - ) + ), ) """ Known domains of the service, including the main domain. """ + focused_url_prefixes: Sequence[str] = field( + metadata=config( + decoder=tuple, + mm_field=List(String()) + ), + default=(), + ) + """ + URL prefixes for a more focused pipeline which might miss some queries + but executes faster. + """ query_parsers: Sequence[QueryParser] = field( metadata=config( decoder=tuple, mm_field=List(QueryParserField()) - ) + ), + default=(), ) """ Query parsers in order of precedence. @@ -330,7 +311,8 @@ class Service(DataClassJsonMixin): metadata=config( decoder=tuple, mm_field=List(PageOffsetParserField()) - ) + ), + default=(), ) """ Page number parsers in order of precedence. @@ -339,7 +321,8 @@ class Service(DataClassJsonMixin): metadata=config( decoder=tuple, mm_field=List(PageOffsetParserField()) - ) + ), + default=(), ) """ Page number parsers in order of precedence. @@ -348,7 +331,8 @@ class Service(DataClassJsonMixin): metadata=config( decoder=tuple, mm_field=List(InterpretedQueryParserField()) - ) + ), + default=(), ) """ Interpreted query parsers in order of precedence. @@ -362,21 +346,12 @@ class Service(DataClassJsonMixin): metadata=config( decoder=tuple, mm_field=List(ResultsParserField()) - ) + ), + default=(), ) """ SERP parsers in order of precedence. """ - focused_url_prefixes: Sequence[str] = field( - metadata=config( - decoder=tuple, - mm_field=List(String()) - ) - ) - """ - URL prefixes for a more focused pipeline which might miss some queries - but executes faster. 
- """ class PathField(Field): @@ -386,7 +361,7 @@ def _serialize( return str(value) def _deserialize( - self, value: str, attr: str, data: Any, **kwargs: Any + self, value: str, attr: str | None, data: Any, **kwargs: Any ) -> Path: return Path(value) diff --git a/archive_query_log/model/highlight.py b/archive_query_log/legacy/model/highlight.py similarity index 100% rename from archive_query_log/model/highlight.py rename to archive_query_log/legacy/model/highlight.py diff --git a/archive_query_log/model/parse.py b/archive_query_log/legacy/model/parse.py similarity index 70% rename from archive_query_log/model/parse.py rename to archive_query_log/legacy/model/parse.py index e258291c..77577dca 100644 --- a/archive_query_log/model/parse.py +++ b/archive_query_log/legacy/model/parse.py @@ -1,10 +1,10 @@ -from re import compile, IGNORECASE +from re import compile as pattern, IGNORECASE from typing import Sequence, Protocol, runtime_checkable, Any, Mapping, Union from marshmallow.fields import Field -from archive_query_log.model import ArchivedUrl, ArchivedSearchResultSnippet, \ - ArchivedRawSerp +from archive_query_log.legacy.model import ( + ArchivedUrl, ArchivedSearchResultSnippet, ArchivedRawSerp) @runtime_checkable @@ -27,7 +27,7 @@ def parse(self, url: "ArchivedUrl") -> int | None: @runtime_checkable class InterpretedQueryParser(Protocol): - def parse(self, content: "ArchivedRawSerp") -> str | None: + def parse(self, raw_serp: "ArchivedRawSerp") -> str | None: ... 
@@ -43,68 +43,67 @@ def parse( class QueryParserField(Field): def _deserialize( self, - value: Any, + value: Mapping[str, Any], attr: str | None, data: Mapping[str, Any] | None, **kwargs, ) -> QueryParser: - value: Mapping[str, Any] parser_type = value["type"] if parser_type == "query_parameter": - from archive_query_log.queries.parse import \ + from archive_query_log.legacy.queries.parse import \ QueryParameterQueryParser return QueryParameterQueryParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), parameter=value["parameter"], ) elif parser_type == "fragment_parameter": - from archive_query_log.queries.parse import \ + from archive_query_log.legacy.queries.parse import \ FragmentParameterQueryParser return FragmentParameterQueryParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), parameter=value["parameter"], ) elif parser_type == "path_segment": - from archive_query_log.queries.parse import \ + from archive_query_log.legacy.queries.parse import \ PathSegmentQueryParser return PathSegmentQueryParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), segment=value["segment"], remove_patterns=( [ - compile(pattern, IGNORECASE) - for pattern in value["remove_patterns"] + pattern(remove_pattern, IGNORECASE) + for remove_pattern in value["remove_patterns"] ] if "remove_patterns" in value else [] ), space_patterns=( [ - compile(pattern, IGNORECASE) - for pattern in value["space_patterns"] + pattern(space_pattern, IGNORECASE) + for space_pattern in value["space_patterns"] ] if "space_patterns" in value else [] ), ) elif parser_type == "fragment_segment": - from archive_query_log.queries.parse import \ + from archive_query_log.legacy.queries.parse import \ FragmentSegmentQueryParser return FragmentSegmentQueryParser( - url_pattern=compile(value["url_pattern"], 
IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), segment=value["segment"], remove_patterns=( [ - compile(pattern, IGNORECASE) - for pattern in value["remove_patterns"] + pattern(remove_pattern, IGNORECASE) + for remove_pattern in value["remove_patterns"] ] if "remove_patterns" in value else [] ), space_patterns=( [ - compile(pattern, IGNORECASE) - for pattern in value["space_patterns"] + pattern(space_pattern, IGNORECASE) + for space_pattern in value["space_patterns"] ] if "space_patterns" in value else [] @@ -117,54 +116,53 @@ def _deserialize( class PageOffsetParserField(Field): def _deserialize( self, - value: Any, + value: Mapping[str, Any], attr: str | None, data: Mapping[str, Any] | None, **kwargs, ) -> Union[PageParser | OffsetParser]: - value: Mapping[str, Any] parser_type = value["type"] if parser_type == "query_parameter": - from archive_query_log.queries.parse import \ + from archive_query_log.legacy.queries.parse import \ QueryParameterPageOffsetParser return QueryParameterPageOffsetParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), parameter=value["parameter"], ) elif parser_type == "fragment_parameter": - from archive_query_log.queries.parse import \ + from archive_query_log.legacy.queries.parse import \ FragmentParameterPageOffsetParser return FragmentParameterPageOffsetParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), parameter=value["parameter"], ) elif parser_type == "path_segment": - from archive_query_log.queries.parse import \ + from archive_query_log.legacy.queries.parse import \ PathSegmentPageOffsetParser return PathSegmentPageOffsetParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), segment=value["segment"], delimiter=value["delimiter"] if "delimiter" in value else "/", remove_patterns=( [ - compile(pattern, 
IGNORECASE) - for pattern in value["remove_patterns"] + pattern(remove_pattern, IGNORECASE) + for remove_pattern in value["remove_patterns"] ] if "remove_patterns" in value else [] ), ) elif parser_type == "fragment_segment": - from archive_query_log.queries.parse import \ + from archive_query_log.legacy.queries.parse import \ FragmentSegmentPageOffsetParser return FragmentSegmentPageOffsetParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), segment=value["segment"], delimiter=value["delimiter"] if "delimiter" in value else "/", remove_patterns=( [ - compile(pattern, IGNORECASE) - for pattern in value["remove_patterns"] + pattern(remove_pattern, IGNORECASE) + for remove_pattern in value["remove_patterns"] ] if "remove_patterns" in value else [] @@ -177,18 +175,17 @@ def _deserialize( class InterpretedQueryParserField(Field): def _deserialize( self, - value: Any, + value: Mapping[str, Any], attr: str | None, data: Mapping[str, Any] | None, **kwargs, ) -> InterpretedQueryParser: - value: Mapping[str, Any] parser_type = value["type"] if parser_type == "html_selector": - from archive_query_log.results.parse import \ + from archive_query_log.legacy.results.parse import \ HtmlSelectorInterpretedQueryParser return HtmlSelectorInterpretedQueryParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), query_selector=value["query_selector"], query_attribute=( value["query_attribute"] @@ -208,18 +205,17 @@ def _deserialize( class ResultsParserField(Field): def _deserialize( self, - value: Any, + value: Mapping[str, Any], attr: str | None, data: Mapping[str, Any] | None, **kwargs, ) -> ResultsParser: - value: Mapping[str, Any] parser_type = value["type"] if parser_type == "html_selector": - from archive_query_log.results.parse import \ + from archive_query_log.legacy.results.parse import \ HtmlSelectorResultsParser return 
HtmlSelectorResultsParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), results_selector=value["results_selector"], url_selector=value["url_selector"], url_attribute=( @@ -235,10 +231,10 @@ def _deserialize( ), ) elif parser_type == "chatnoir": - from archive_query_log.results.chatnoir import \ + from archive_query_log.legacy.results.chatnoir import \ ChatNoirResultsParser return ChatNoirResultsParser( - url_pattern=compile(value["url_pattern"], IGNORECASE), + url_pattern=pattern(value["url_pattern"], IGNORECASE), ) else: raise ValueError(f"Unknown parser type: {parser_type}") diff --git a/archive_query_log/results/test/__init__.py b/archive_query_log/legacy/queries/__init__.py similarity index 100% rename from archive_query_log/results/test/__init__.py rename to archive_query_log/legacy/queries/__init__.py diff --git a/archive_query_log/legacy/queries/iterable.py b/archive_query_log/legacy/queries/iterable.py new file mode 100644 index 00000000..8555ab01 --- /dev/null +++ b/archive_query_log/legacy/queries/iterable.py @@ -0,0 +1,45 @@ +from dataclasses import dataclass +from gzip import GzipFile +from pathlib import Path +from typing import Sized, Iterable, Iterator + +from archive_query_log.legacy.model import ArchivedQueryUrl +from archive_query_log.legacy.util.text import count_lines, text_io_wrapper + + +@dataclass(frozen=True) +class ArchivedQueryUrls(Sized, Iterable[ArchivedQueryUrl]): + """ + Read archived query URLs from a JSONL file. + """ + + path: Path + """ + Path where the query URLs are stored in JSONL format. 
+ """ + + def __post_init__(self): + self._check_urls_path() + + def _check_urls_path(self): + if not self.path.exists() or not self.path.is_file(): + raise ValueError( + f"URLs path must be a file: {self.path}" + ) + + def __len__(self) -> int: + with (self.path.open("rb") as file, + GzipFile(fileobj=file, mode="rb") as gzip_file): + return count_lines(gzip_file) + + def __iter__(self) -> Iterator[ArchivedQueryUrl]: + schema = ArchivedQueryUrl.schema() + with (self.path.open("rb") as file, + GzipFile(fileobj=file, mode="rb") as gzip_file, + text_io_wrapper(gzip_file) as text_file): + for line in text_file: + query_url = schema.loads(line, many=True) + if isinstance(query_url, list): + raise ValueError( + f"Expected one query URL per line: {line}") + yield query_url diff --git a/archive_query_log/queries/parse.py b/archive_query_log/legacy/queries/parse.py similarity index 87% rename from archive_query_log/queries/parse.py rename to archive_query_log/legacy/queries/parse.py index 320c63c8..42834219 100644 --- a/archive_query_log/queries/parse.py +++ b/archive_query_log/legacy/queries/parse.py @@ -1,15 +1,15 @@ from dataclasses import dataclass from gzip import GzipFile -from io import TextIOWrapper from pathlib import Path -from typing import Sequence, NamedTuple, Pattern +from typing import Sequence, NamedTuple, Pattern, Iterable from urllib.parse import parse_qsl, unquote, quote from tqdm.auto import tqdm -from archive_query_log.model import ArchivedQueryUrl, \ +from archive_query_log.legacy.model import ArchivedQueryUrl, \ ArchivedUrl, PageParser, QueryParser, OffsetParser, Service -from archive_query_log.urls.iterable import ArchivedUrls +from archive_query_log.legacy.urls.iterable import ArchivedUrls +from archive_query_log.legacy.util.text import text_io_wrapper @dataclass(frozen=True) @@ -180,27 +180,27 @@ def parse( if output_path.exists() and not self.overwrite: return output_path.parent.mkdir(parents=True, exist_ok=True) - archived_urls = 
ArchivedUrls(input_path) + archived_urls: Iterable[ArchivedUrl] = ArchivedUrls(input_path) if self.verbose: + # noinspection PyTypeChecker archived_urls = tqdm( archived_urls, desc="Parse SERP URLs", unit="URL", ) - archived_serp_urls = ( + archived_serp_urls_nullable = ( self._parse_single(archived_url, focused) for archived_url in archived_urls ) archived_serp_urls = ( archived_serp_url - for archived_serp_url in archived_serp_urls + for archived_serp_url in archived_serp_urls_nullable if archived_serp_url is not None ) output_schema = ArchivedQueryUrl.schema() - # noinspection PyTypeChecker - with output_path.open("wb") as file, \ - GzipFile(fileobj=file, mode="wb") as gzip_file, \ - TextIOWrapper(gzip_file) as text_file: + with (output_path.open("wb") as file, + GzipFile(fileobj=file, mode="wb") as gzip_file, + text_io_wrapper(gzip_file) as text_file): for archived_serp_url in archived_serp_urls: text_file.write(output_schema.dumps(archived_serp_url)) text_file.write("\n") @@ -211,8 +211,8 @@ def _parse_single( focused: bool, ) -> ArchivedQueryUrl | None: query: str | None = None - for parser in self.query_parsers: - query = parser.parse(archived_url) + for query_parser in self.query_parsers: + query = query_parser.parse(archived_url) if query is not None: break @@ -220,8 +220,8 @@ def _parse_single( return None page: int | None = None - for parser in self.page_parsers: - page = parser.parse(archived_url) + for page_parser in self.page_parsers: + page = page_parser.parse(archived_url) if page is not None: break @@ -229,8 +229,8 @@ def _parse_single( return None offset: int | None = None - for parser in self.offset_parsers: - offset = parser.parse(archived_url) + for offset_parser in self.offset_parsers: + offset = offset_parser.parse(archived_url) if offset is not None: break @@ -280,8 +280,12 @@ def _service_pages( ] if cdx_page is not None: - assert domain is not None - assert len(domain_paths) == 1 + if domain is None: + raise RuntimeError( + "Domain must be 
specified when page is specified.") + if len(domain_paths) < 1: + raise RuntimeError( + "There must be exactly one domain path.") cdx_page_paths = [domain_paths[0] / f"{cdx_page:010}.jsonl.gz"] else: cdx_page_paths = [ @@ -313,7 +317,7 @@ def parse_service( domain: str | None = None, cdx_page: int | None = None, ): - pages = self._service_pages( + pages_list: Sequence[_CdxPage] = self._service_pages( data_directory=data_directory, focused=focused, service=service, @@ -321,10 +325,12 @@ def parse_service( cdx_page=cdx_page, ) - if len(pages) == 0: + if len(pages_list) == 0: return - if len(pages) > 1: + pages: Iterable[_CdxPage] = pages_list + if len(pages_list) > 1: + # noinspection PyTypeChecker pages = tqdm( pages, desc="Parse archived SERP URLs", diff --git a/archive_query_log/serps/__init__.py b/archive_query_log/legacy/results/__init__.py similarity index 100% rename from archive_query_log/serps/__init__.py rename to archive_query_log/legacy/results/__init__.py diff --git a/archive_query_log/results/chatnoir.py b/archive_query_log/legacy/results/chatnoir.py similarity index 60% rename from archive_query_log/results/chatnoir.py rename to archive_query_log/legacy/results/chatnoir.py index 0f20b5f4..2cfc924a 100644 --- a/archive_query_log/results/chatnoir.py +++ b/archive_query_log/legacy/results/chatnoir.py @@ -4,10 +4,10 @@ from bs4 import Tag -from archive_query_log.model import ArchivedSearchResultSnippet, \ +from archive_query_log.legacy.model import ArchivedSearchResultSnippet, \ HighlightedText -from archive_query_log.results.parse import HtmlResultsParser -from archive_query_log.util.html import clean_html +from archive_query_log.legacy.results.parse import HtmlResultsParser +from archive_query_log.legacy.util.html import clean_html @dataclass(frozen=True) @@ -21,23 +21,21 @@ def parse_html( serp_url: str, ) -> Iterator[ArchivedSearchResultSnippet]: results = html.find("section", id="SearchResults") - if results is None: + if results is None or not 
isinstance(results, Tag): return results_iter = results.find_all("article", class_="search-result") for index, result in enumerate(results_iter): - header: Tag = result.find("header") - url = header.find("a", class_="link")["href"] + header = result.find("header") + url: str = header.find("a", class_="link")["href"] url = urljoin(serp_url, url) - title = clean_html(header.find("h2")) + title = HighlightedText(clean_html(header.find("h2"))) # Remove header. Only the snippet will be left. header.decompose() - snippet = clean_html(result) - if len(snippet) == 0: - snippet = None + snippet = HighlightedText(clean_html(result)) yield ArchivedSearchResultSnippet( rank=index + 1, url=url, timestamp=timestamp, - title=HighlightedText(title), - snippet=HighlightedText(snippet), + title=title, + snippet=snippet if len(snippet) > 0 else None, ) diff --git a/archive_query_log/results/parse.py b/archive_query_log/legacy/results/parse.py similarity index 82% rename from archive_query_log/results/parse.py rename to archive_query_log/legacy/results/parse.py index b685e5c7..d01bcc62 100644 --- a/archive_query_log/results/parse.py +++ b/archive_query_log/legacy/results/parse.py @@ -1,19 +1,19 @@ from abc import abstractmethod, ABC from dataclasses import dataclass from gzip import GzipFile -from io import TextIOWrapper from pathlib import Path -from typing import Sequence, NamedTuple, Iterator, Pattern +from typing import Sequence, NamedTuple, Iterator, Pattern, Iterable from urllib.parse import quote, urljoin from bs4 import Tag, BeautifulSoup from tqdm.auto import tqdm -from archive_query_log.download.iterable import ArchivedRawSerps -from archive_query_log.model import ArchivedRawSerp, \ +from archive_query_log.legacy.download.iterable import ArchivedRawSerps +from archive_query_log.legacy.model import ArchivedRawSerp, \ ArchivedSearchResultSnippet, ResultsParser, InterpretedQueryParser, \ ArchivedParsedSerp, Service, HighlightedText -from archive_query_log.util.html import 
clean_html +from archive_query_log.legacy.util.html import clean_html +from archive_query_log.legacy.util.text import text_io_wrapper class HtmlResultsParser(ResultsParser, ABC): @@ -63,6 +63,7 @@ def parse_html( serp_url: str, ) -> Iterator[ArchivedSearchResultSnippet]: for index, result in enumerate(html.select(self.results_selector)): + url_tag: Tag | None if self.url_selector == ":--self": url_tag = result else: @@ -76,13 +77,14 @@ def parse_html( continue url = urljoin(serp_url, url) + title_tag: Tag | None if self.title_selector == ":--self": title_tag = result else: title_tag = result.select_one(self.title_selector) if title_tag is None: continue - title = clean_html(title_tag) + title = HighlightedText(clean_html(title_tag)) if len(title) == 0: continue @@ -93,11 +95,12 @@ def parse_html( else: snippet_tags = result.select(self.snippet_selector) if snippet_tags is not None and snippet_tags: - for snippet_candidate in snippet_tags: - snippet_candidate = clean_html(snippet_candidate) + for snippet_candidate_tag in snippet_tags: + snippet_candidate = HighlightedText( + clean_html(snippet_candidate_tag) + ) - if (snippet_candidate and - len(snippet_candidate) > 0 and + if (len(snippet_candidate) > 0 and (not snippet or len(snippet_candidate) > len(snippet))): snippet = snippet_candidate @@ -106,8 +109,8 @@ def parse_html( rank=index + 1, url=url, timestamp=timestamp, - title=HighlightedText(title), - snippet=HighlightedText(snippet), + title=title, + snippet=snippet, ) @@ -171,27 +174,28 @@ def parse(self, input_path: Path, output_path: Path) -> None: if output_path.exists() and not self.overwrite: return output_path.parent.mkdir(parents=True, exist_ok=True) - archived_serp_contents = ArchivedRawSerps(input_path) + archived_serp_contents: Iterable[ArchivedRawSerp] = ( + ArchivedRawSerps(input_path)) if self.verbose: + # noinspection PyTypeChecker archived_serp_contents = tqdm( archived_serp_contents, desc="Parse SERP WARC records", unit="record", ) - 
archived_parsed_serps = ( + archived_parsed_serps_nullable = ( self.parse_single(archived_serp_content) for archived_serp_content in archived_serp_contents ) archived_parsed_serps = ( archived_serp - for archived_serp in archived_parsed_serps + for archived_serp in archived_parsed_serps_nullable if archived_serp is not None ) output_schema = ArchivedParsedSerp.schema() - # noinspection PyTypeChecker with output_path.open("wb") as file, \ GzipFile(fileobj=file, mode="wb") as gzip_file, \ - TextIOWrapper(gzip_file) as text_file: + text_io_wrapper(gzip_file) as text_file: for archived_parsed_serp in archived_parsed_serps: text_file.write(output_schema.dumps(archived_parsed_serp)) text_file.write("\n") @@ -201,14 +205,15 @@ def parse_single( archived_serp_content: ArchivedRawSerp ) -> ArchivedParsedSerp | None: results: Sequence[ArchivedSearchResultSnippet] | None = None - for parser in self.results_parsers: - results = parser.parse(archived_serp_content) + for results_parser in self.results_parsers: + results = results_parser.parse(archived_serp_content) if results is not None: break interpreted_query: str | None = None - for parser in self.interpreted_query_parsers: - interpreted_query = parser.parse(archived_serp_content) + for interpreted_query_parser in self.interpreted_query_parsers: + interpreted_query = interpreted_query_parser.parse( + archived_serp_content) if interpreted_query is not None: break @@ -262,8 +267,12 @@ def _service_pages( ] if cdx_page is not None: - assert domain is not None - assert len(domain_paths) == 1 + if domain is None: + raise RuntimeError( + "Domain must be specified when page is specified.") + if len(domain_paths) < 1: + raise RuntimeError( + "There must be exactly one domain path.") cdx_page_paths = [domain_paths[0] / f"{cdx_page:010}"] else: cdx_page_paths = [ @@ -295,7 +304,7 @@ def parse_service( domain: str | None = None, cdx_page: int | None = None, ): - pages = self._service_pages( + pages_list: Sequence[_CdxPage] = 
self._service_pages( data_directory=data_directory, focused=focused, service=service, @@ -303,10 +312,12 @@ def parse_service( cdx_page=cdx_page, ) - if len(pages) == 0: + if len(pages_list) == 0: return - if len(pages) > 1: + pages: Iterable[_CdxPage] = pages_list + if len(pages_list) > 1: + # noinspection PyTypeChecker pages = tqdm( pages, desc="Parse archived SERP URLs", diff --git a/archive_query_log/urls/__init__.py b/archive_query_log/legacy/results/test/__init__.py similarity index 100% rename from archive_query_log/urls/__init__.py rename to archive_query_log/legacy/results/test/__init__.py diff --git a/archive_query_log/results/test/generate_tests.py b/archive_query_log/legacy/results/test/generate_tests.py similarity index 88% rename from archive_query_log/results/test/generate_tests.py rename to archive_query_log/legacy/results/test/generate_tests.py index 823b8a4e..166747d3 100644 --- a/archive_query_log/results/test/generate_tests.py +++ b/archive_query_log/legacy/results/test/generate_tests.py @@ -3,26 +3,21 @@ from gzip import GzipFile from io import TextIOWrapper, BytesIO from json import loads -from math import inf from pathlib import Path from random import Random -from re import compile +from re import compile as pattern from textwrap import dedent -from typing import Iterable from requests import get from slugify import slugify from tqdm import tqdm from warcio import WARCWriter, StatusAndHeaders -from archive_query_log import PROJECT_DIRECTORY_PATH -from archive_query_log.config import SERVICES -from archive_query_log.model import Service, ArchivedQueryUrl +from archive_query_log.legacy import PROJECT_DIRECTORY_PATH +from archive_query_log.legacy.model import ArchivedQueryUrl NUM_SERVICES = 11 -# SERVICE_NAMES = None SERVICE_NAMES = ["google", "yahoo", "bing", "duckduckgo", "ask", "ecosia"] -# SERVICE_NAMES = ["google"] NUM_QUERIES_PER_SERVICE = 50 DATA_PATH = Path( @@ -37,7 +32,7 @@ TESTS_PATH = PROJECT_DIRECTORY_PATH / \ 
"archive_query_log/results/test/" -PATTERN_SPECIAL_CHARS = compile(r"[^0-9a-z]+") +PATTERN_SPECIAL_CHARS = pattern(r"[^0-9a-z]+") def warc_url(url: str, timestamp: float) -> str: @@ -50,16 +45,7 @@ def warc_url(url: str, timestamp: float) -> str: def main(): - if SERVICE_NAMES is None: - services: Iterable[Service] = SERVICES.values() - services = sorted( - services, - key=lambda s: s.alexa_rank if s.alexa_rank is not None else inf, - ) - services = services[:NUM_SERVICES] - service_names = [s.name for s in services] - else: - service_names = SERVICE_NAMES + service_names = SERVICE_NAMES query_urls = defaultdict(list) for path in tqdm( @@ -73,19 +59,15 @@ def main(): continue if "\"serp_warc_relative_path\": \"" not in line: continue - if not any( - f"\"search_provider_name\": \"{service_name}" in line - for service_name in service_names - ): - continue query_url = loads(line) if query_url["search_provider_name"] not in service_names: continue query_urls[query_url["search_provider_name"]].append(query_url) print(f"Found {sum(len(urls) for urls in query_urls.values())} SERPs.") + random = Random(0) # nosec: B311 query_urls = { - service_name: Random(0).sample( + service_name: random.sample( query_urls[service_name], min( NUM_QUERIES_PER_SERVICE, len(query_urls[service_name]), @@ -129,6 +111,7 @@ def main(): ) response = get( wayback_raw_url, + timeout=60 * 4, # nosec: B113 ) response.raise_for_status() diff --git a/archive_query_log/results/test/test_360_serp_parsing.py b/archive_query_log/legacy/results/test/test_360_serp_parsing.py similarity index 98% rename from archive_query_log/results/test/test_360_serp_parsing.py rename to archive_query_log/legacy/results/test/test_360_serp_parsing.py index 8f4f9e54..fafc1b84 100644 --- a/archive_query_log/results/test/test_360_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_360_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_360_an_jian_diao_cha_bi_lu_1576497545(): diff --git a/archive_query_log/results/test/test_aliexpress_serp_parsing.py b/archive_query_log/legacy/results/test/test_aliexpress_serp_parsing.py similarity index 97% rename from archive_query_log/results/test/test_aliexpress_serp_parsing.py rename to archive_query_log/legacy/results/test/test_aliexpress_serp_parsing.py index e43179e3..f95dc00f 100644 --- a/archive_query_log/results/test/test_aliexpress_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_aliexpress_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_aliexpress_barefoot_accessories_1508385234(): diff --git a/archive_query_log/results/test/test_amazon_serp_parsing.py b/archive_query_log/legacy/results/test/test_amazon_serp_parsing.py similarity index 98% rename from archive_query_log/results/test/test_amazon_serp_parsing.py rename to archive_query_log/legacy/results/test/test_amazon_serp_parsing.py index b6cc038f..858463cc 100644 --- a/archive_query_log/results/test/test_amazon_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_amazon_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_amazon_under_armour_socks_1553107494(): diff --git a/archive_query_log/results/test/test_ask_serp_parsing.py b/archive_query_log/legacy/results/test/test_ask_serp_parsing.py similarity index 99% rename from archive_query_log/results/test/test_ask_serp_parsing.py rename to archive_query_log/legacy/results/test/test_ask_serp_parsing.py index b4f2fbe5..6389c389 100644 --- a/archive_query_log/results/test/test_ask_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_ask_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_ask_peter_krogh_photographer_1184320758(): diff --git a/archive_query_log/results/test/test_baidu_serp_parsing.py b/archive_query_log/legacy/results/test/test_baidu_serp_parsing.py similarity index 98% rename from archive_query_log/results/test/test_baidu_serp_parsing.py rename to archive_query_log/legacy/results/test/test_baidu_serp_parsing.py index 10e99148..b3cad875 100644 --- a/archive_query_log/results/test/test_baidu_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_baidu_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_baidu_lian_xi_qu_dian_nao_pei_xun_1643390077(): diff --git a/archive_query_log/results/test/test_bing_serp_parsing.py b/archive_query_log/legacy/results/test/test_bing_serp_parsing.py similarity index 99% rename from archive_query_log/results/test/test_bing_serp_parsing.py rename to archive_query_log/legacy/results/test/test_bing_serp_parsing.py index 6ac38e25..e5ad1d22 100644 --- a/archive_query_log/results/test/test_bing_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_bing_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_bing_uscis_forms_400_1486690408(): diff --git a/archive_query_log/results/test/test_bongacams_serp_parsing.py b/archive_query_log/legacy/results/test/test_bongacams_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_bongacams_serp_parsing.py rename to archive_query_log/legacy/results/test/test_bongacams_serp_parsing.py index 6781e784..6cb20a87 100644 --- a/archive_query_log/results/test/test_bongacams_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_bongacams_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_bongacams_facial_1578106424(): diff --git a/archive_query_log/results/test/test_brave_serp_parsing.py b/archive_query_log/legacy/results/test/test_brave_serp_parsing.py similarity index 99% rename from archive_query_log/results/test/test_brave_serp_parsing.py rename to archive_query_log/legacy/results/test/test_brave_serp_parsing.py index c84844e1..57833f78 100644 --- a/archive_query_log/results/test/test_brave_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_brave_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_brave_chomikuj_1656776694(): diff --git a/archive_query_log/results/test/test_canva_serp_parsing.py b/archive_query_log/legacy/results/test/test_canva_serp_parsing.py similarity index 84% rename from archive_query_log/results/test/test_canva_serp_parsing.py rename to archive_query_log/legacy/results/test/test_canva_serp_parsing.py index 22542d25..9667280a 100644 --- a/archive_query_log/results/test/test_canva_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_canva_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_canva_instagram_reels_video_1607594697(): diff --git a/archive_query_log/results/test/test_chefkoch_serp_parsing.py b/archive_query_log/legacy/results/test/test_chefkoch_serp_parsing.py similarity index 99% rename from archive_query_log/results/test/test_chefkoch_serp_parsing.py rename to archive_query_log/legacy/results/test/test_chefkoch_serp_parsing.py index bcb8ab69..58411bf0 100644 --- a/archive_query_log/results/test/test_chefkoch_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_chefkoch_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_chefkoch_spaghetti_eis_torte_1342866905(): diff --git a/archive_query_log/results/test/test_cnn_serp_parsing.py b/archive_query_log/legacy/results/test/test_cnn_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_cnn_serp_parsing.py rename to archive_query_log/legacy/results/test/test_cnn_serp_parsing.py index ba92d003..c85854ee 100644 --- a/archive_query_log/results/test/test_cnn_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_cnn_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_cnn_kindly_check_1642508434(): diff --git a/archive_query_log/results/test/test_csdn_serp_parsing.py b/archive_query_log/legacy/results/test/test_csdn_serp_parsing.py similarity index 89% rename from archive_query_log/results/test/test_csdn_serp_parsing.py rename to archive_query_log/legacy/results/test/test_csdn_serp_parsing.py index 1852b5c3..e3dda9f7 100644 --- a/archive_query_log/results/test/test_csdn_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_csdn_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_csdn_fifo_1663204137(): diff --git a/archive_query_log/results/test/test_duckduckgo_serp_parsing.py b/archive_query_log/legacy/results/test/test_duckduckgo_serp_parsing.py similarity index 99% rename from archive_query_log/results/test/test_duckduckgo_serp_parsing.py rename to archive_query_log/legacy/results/test/test_duckduckgo_serp_parsing.py index a2ef92ea..b5fc82e1 100644 --- a/archive_query_log/results/test/test_duckduckgo_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_duckduckgo_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_duckduckgo_3rd_party_twitch_chat_1642095474(): diff --git a/archive_query_log/results/test/test_ebay_serp_parsing.py b/archive_query_log/legacy/results/test/test_ebay_serp_parsing.py similarity index 97% rename from archive_query_log/results/test/test_ebay_serp_parsing.py rename to archive_query_log/legacy/results/test/test_ebay_serp_parsing.py index 9ab3905e..38bd7563 100644 --- a/archive_query_log/results/test/test_ebay_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_ebay_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_ebay_se_xing_mei_jia_wei_vxin_dun35358_1544323503(): diff --git a/archive_query_log/results/test/test_ecosia_serp_parsing.py b/archive_query_log/legacy/results/test/test_ecosia_serp_parsing.py similarity index 99% rename from archive_query_log/results/test/test_ecosia_serp_parsing.py rename to archive_query_log/legacy/results/test/test_ecosia_serp_parsing.py index e97a468b..5958a557 100644 --- a/archive_query_log/results/test/test_ecosia_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_ecosia_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_ecosia_financial_risk_tolerance_quiz_1643759873(): diff --git a/archive_query_log/results/test/test_espn_serp_parsing.py b/archive_query_log/legacy/results/test/test_espn_serp_parsing.py similarity index 85% rename from archive_query_log/results/test/test_espn_serp_parsing.py rename to archive_query_log/legacy/results/test/test_espn_serp_parsing.py index 1b7b1230..4dc91e9d 100644 --- a/archive_query_log/results/test/test_espn_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_espn_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_espn_ball_state_1619440827(): diff --git a/archive_query_log/results/test/test_etsy_serp_parsing.py b/archive_query_log/legacy/results/test/test_etsy_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_etsy_serp_parsing.py rename to archive_query_log/legacy/results/test/test_etsy_serp_parsing.py index 55b8d756..084f3f08 100644 --- a/archive_query_log/results/test/test_etsy_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_etsy_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_etsy_embroidery_kit_1375805089(): diff --git a/archive_query_log/results/test/test_facebook_serp_parsing.py b/archive_query_log/legacy/results/test/test_facebook_serp_parsing.py similarity index 98% rename from archive_query_log/results/test/test_facebook_serp_parsing.py rename to archive_query_log/legacy/results/test/test_facebook_serp_parsing.py index 375da4d7..f4659cfa 100644 --- a/archive_query_log/results/test/test_facebook_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_facebook_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_facebook_vanilla_1481832838(): diff --git a/archive_query_log/results/test/test_github_serp_parsing.py b/archive_query_log/legacy/results/test/test_github_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_github_serp_parsing.py rename to archive_query_log/legacy/results/test/test_github_serp_parsing.py index fe7a0a7e..2807c3b1 100644 --- a/archive_query_log/results/test/test_github_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_github_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_github_topic_deprecated_org_bandwidth_fork_true_1634361552(): diff --git a/archive_query_log/results/test/test_google_serp_parsing.py b/archive_query_log/legacy/results/test/test_google_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_google_serp_parsing.py rename to archive_query_log/legacy/results/test/test_google_serp_parsing.py index fc653f3e..8710b36f 100644 --- a/archive_query_log/results/test/test_google_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_google_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_google_taikoo_hui_mandarin_oriental_hotel_guangzhou_1652086766(): @@ -66,10 +66,10 @@ def test_parse_query_google_zombie_apocalypse_1565114134(): ) -def test_parse_query_google_w_fan_j_li_s_ma_n_tang_and_w_yu_april_2012_towards_certain_fixes_with_editing_rules_and_master_data_the_vldb_journal_21_2_213_238_297_10_1007_s00778_011_0253_7_1614165399(): +def test_parse_query_google_scholar_w_fan_j_li_s_ma_n_tang_and_w_yu_april_2012_towards_certain_fixes_with_editing_rules_and_master_data_the_vldb_journal_21_2_213_238_297_10_1007_s00778_011_0253_7_1614165399(): verify_serp_parsing( "https://web.archive.org/web/20210224111639id_/https://scholar.google.com/scholar?hl=en&q=W.+Fan%2C+J.+Li%2C+S.+Ma%2C+N.+Tang%2C+and+W.+Yu.+April+2012.+Towards+certain+fixes+with+editing+rules+and+master+data.+The+VLDB+Journal%2C+21(2)%3A+213--238.+297+10.1007%2Fs00778-011-0253-7+", - "google", + "google-scholar", ) @@ -80,17 +80,17 @@ def test_parse_query_google_susan_boyle_make_me_a_channel_of_your_peace_16077910 ) -def 
test_parse_query_google_muhammed_rashid_1656890873(): +def test_parse_query_google_scholar_muhammed_rashid_1656890873(): verify_serp_parsing( "https://web.archive.org/web/20220703232753id_/https://scholar.google.com/scholar?q=Muhammed+Rashid", - "google", + "google-scholar", ) -def test_parse_query_google_a_tumeo_m_branca_l_camerini_a_dual_priority_realtime_multiprocessor_system_on_fpga_for_automotive_ap_1614181186(): +def test_parse_query_google_scholar_a_tumeo_m_branca_l_camerini_a_dual_priority_realtime_multiprocessor_system_on_fpga_for_automotive_ap_1614181186(): verify_serp_parsing( "https://web.archive.org/web/20210224153946id_/https://scholar.google.com/scholar?hl=en&q=A.+Tumeo%2C+M.+Branca%2C+L.+Camerini%2C+%22A+Dual-Priority+RealTime+Multiprocessor+System+on+FPGA+for+Automotive+Applications%22%2C+Design%2C+Automation+and+Test+in+Europe+(DATE)+2008%2C+pages+1039--1044.+10.1145%2F1403375.1403625+", - "google", + "google-scholar", ) @@ -437,10 +437,10 @@ def test_parse_query_google_list_of_cantons_of_france_1634951028(): ) -def test_parse_query_google_wan_y_menon_s_and_ramaprasad_a_2009_the_paradoxical_nature_of_electronic_decision_aids_on_comparison_1614187144(): +def test_parse_query_google_scholar_wan_y_menon_s_and_ramaprasad_a_2009_the_paradoxical_nature_of_electronic_decision_aids_on_comparison_1614187144(): verify_serp_parsing( "https://web.archive.org/web/20210224171904id_/https://scholar.google.com/scholar?hl=en&q=Wan%2C+Y.%2C+Menon%2C+S.%2C+and+Ramaprasad%2C+A.+2009.+The+paradoxical+nature+of+electronic+decision+aids+on+comparison-shopping%3A+The+experiments+and+analysis.+J.+Theoret.+Appl.+Electron.+Commerce+Res.%2C+80--96.+", - "google", + "google-scholar", ) diff --git a/archive_query_log/results/test/test_imdb_serp_parsing.py b/archive_query_log/legacy/results/test/test_imdb_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_imdb_serp_parsing.py rename to 
archive_query_log/legacy/results/test/test_imdb_serp_parsing.py index c8774da1..9f228c2f 100644 --- a/archive_query_log/results/test/test_imdb_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_imdb_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_imdb_pulse_1283006912(): diff --git a/archive_query_log/results/test/test_imgur_serp_parsing.py b/archive_query_log/legacy/results/test/test_imgur_serp_parsing.py similarity index 85% rename from archive_query_log/results/test/test_imgur_serp_parsing.py rename to archive_query_log/legacy/results/test/test_imgur_serp_parsing.py index 9c90a673..f6b4ce6b 100644 --- a/archive_query_log/results/test/test_imgur_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_imgur_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_imgur_search_term_string_1565643838(): diff --git a/archive_query_log/results/test/test_indeed_serp_parsing.py b/archive_query_log/legacy/results/test/test_indeed_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_indeed_serp_parsing.py rename to archive_query_log/legacy/results/test/test_indeed_serp_parsing.py index 56392464..dda8a319 100644 --- a/archive_query_log/results/test/test_indeed_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_indeed_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_indeed_origins_macy_s_1353165507(): diff --git a/archive_query_log/results/test/test_jd_serp_parsing.py b/archive_query_log/legacy/results/test/test_jd_serp_parsing.py similarity index 98% rename from archive_query_log/results/test/test_jd_serp_parsing.py rename to archive_query_log/legacy/results/test/test_jd_serp_parsing.py index cd34e21d..e9ebf0ca 100644 --- a/archive_query_log/results/test/test_jd_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_jd_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_jd_yu_diao_jiu_ning_meng_1601078153(): diff --git a/archive_query_log/results/test/test_linkedin_serp_parsing.py b/archive_query_log/legacy/results/test/test_linkedin_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_linkedin_serp_parsing.py rename to archive_query_log/legacy/results/test/test_linkedin_serp_parsing.py index 3fc141cb..8de77655 100644 --- a/archive_query_log/results/test/test_linkedin_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_linkedin_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_linkedin_vizthink_1229875736(): diff --git a/archive_query_log/results/test/test_manual_google_serp_parsing.py b/archive_query_log/legacy/results/test/test_manual_google_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_manual_google_serp_parsing.py rename to archive_query_log/legacy/results/test/test_manual_google_serp_parsing.py index 1975a2a2..8a451f10 100644 --- a/archive_query_log/results/test/test_manual_google_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_manual_google_serp_parsing.py @@ -1,5 +1,5 @@ # flake8: noqa -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_9_11_search(): diff --git a/archive_query_log/results/test/test_naver_serp_parsing.py b/archive_query_log/legacy/results/test/test_naver_serp_parsing.py similarity index 97% rename from archive_query_log/results/test/test_naver_serp_parsing.py rename to archive_query_log/legacy/results/test/test_naver_serp_parsing.py index b5e40bbb..98e0806c 100644 --- a/archive_query_log/results/test/test_naver_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_naver_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_naver_11548566_973003263(): diff --git a/archive_query_log/results/test/test_pornhub_serp_parsing.py b/archive_query_log/legacy/results/test/test_pornhub_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_pornhub_serp_parsing.py rename to archive_query_log/legacy/results/test/test_pornhub_serp_parsing.py index 2fe24ee7..6c3f48f8 100644 --- a/archive_query_log/results/test/test_pornhub_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_pornhub_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_pornhub_kardashian_1232246712(): diff --git a/archive_query_log/results/test/test_qq_serp_parsing.py b/archive_query_log/legacy/results/test/test_qq_serp_parsing.py similarity index 98% rename from archive_query_log/results/test/test_qq_serp_parsing.py rename to archive_query_log/legacy/results/test/test_qq_serp_parsing.py index 389a98ec..db3082d1 100644 --- a/archive_query_log/results/test/test_qq_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_qq_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_qq_danil_kozlovsky_1360453772(): diff --git a/archive_query_log/results/test/test_qwant_serp_parsing.py b/archive_query_log/legacy/results/test/test_qwant_serp_parsing.py similarity index 99% rename from archive_query_log/results/test/test_qwant_serp_parsing.py rename to archive_query_log/legacy/results/test/test_qwant_serp_parsing.py index b3593ea5..ca9f1e90 100644 --- a/archive_query_log/results/test/test_qwant_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_qwant_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_qwant_administrateur_general_du_cnam_1619206522(): diff --git a/archive_query_log/results/test/test_reddit_serp_parsing.py b/archive_query_log/legacy/results/test/test_reddit_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_reddit_serp_parsing.py rename to archive_query_log/legacy/results/test/test_reddit_serp_parsing.py index c854b7b2..ba17ec2b 100644 --- a/archive_query_log/results/test/test_reddit_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_reddit_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_reddit_teleperformance_1260472045(): diff --git a/archive_query_log/results/test/test_roblox_serp_parsing.py b/archive_query_log/legacy/results/test/test_roblox_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_roblox_serp_parsing.py rename to archive_query_log/legacy/results/test/test_roblox_serp_parsing.py index ddf2b780..96792530 100644 --- a/archive_query_log/results/test/test_roblox_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_roblox_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_roblox_survival_1656759229(): diff --git a/archive_query_log/results/test/test_sogou_serp_parsing.py b/archive_query_log/legacy/results/test/test_sogou_serp_parsing.py similarity index 97% rename from archive_query_log/results/test/test_sogou_serp_parsing.py rename to archive_query_log/legacy/results/test/test_sogou_serp_parsing.py index 6b63237f..ff907c07 100644 --- a/archive_query_log/results/test/test_sogou_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_sogou_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_sogou_lofke_yi_kua_shi_chang_mai_mai_1333092705(): diff --git a/archive_query_log/results/test/test_stackoverflow_serp_parsing.py b/archive_query_log/legacy/results/test/test_stackoverflow_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_stackoverflow_serp_parsing.py rename to archive_query_log/legacy/results/test/test_stackoverflow_serp_parsing.py index 2a70dbd1..75400120 100644 --- a/archive_query_log/results/test/test_stackoverflow_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_stackoverflow_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_stackoverflow_objective_c_1354546520(): diff --git a/archive_query_log/results/test/test_tribunnews_serp_parsing.py b/archive_query_log/legacy/results/test/test_tribunnews_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_tribunnews_serp_parsing.py rename to archive_query_log/legacy/results/test/test_tribunnews_serp_parsing.py index 22f228d7..89a03f91 100644 --- a/archive_query_log/results/test/test_tribunnews_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_tribunnews_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_tribunnews_peringatan_dini_bmkg_kamis_18_februari_2021_1632679908(): diff --git a/archive_query_log/results/test/test_twitch_serp_parsing.py b/archive_query_log/legacy/results/test/test_twitch_serp_parsing.py similarity index 95% rename from archive_query_log/results/test/test_twitch_serp_parsing.py rename to archive_query_log/legacy/results/test/test_twitch_serp_parsing.py index 950c3c18..4f68dc6a 100644 --- a/archive_query_log/results/test/test_twitch_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_twitch_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_twitch_juliversal_1629750531(): diff --git a/archive_query_log/results/test/test_twitter_serp_parsing.py b/archive_query_log/legacy/results/test/test_twitter_serp_parsing.py similarity index 96% rename from archive_query_log/results/test/test_twitter_serp_parsing.py rename to archive_query_log/legacy/results/test/test_twitter_serp_parsing.py index 39500735..e8cac267 100644 --- a/archive_query_log/results/test/test_twitter_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_twitter_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_twitter_rabble_ca_lang_ar_1442999332(): diff --git a/archive_query_log/results/test/test_utils.py b/archive_query_log/legacy/results/test/test_utils.py similarity index 84% rename from archive_query_log/results/test/test_utils.py rename to archive_query_log/legacy/results/test/test_utils.py index df7ccd3b..57f6fe38 100644 --- a/archive_query_log/results/test/test_utils.py +++ b/archive_query_log/legacy/results/test/test_utils.py @@ -8,11 +8,12 @@ from slugify import slugify from tqdm.auto import tqdm -from archive_query_log import PROJECT_DIRECTORY_PATH -from archive_query_log.config import SERVICES -from archive_query_log.download.iterable import ArchivedRawSerps -from archive_query_log.model import ArchivedParsedSerp, ArchivedRawSerp -from archive_query_log.results.parse import ArchivedParsedSerpParser +from archive_query_log.legacy import PROJECT_DIRECTORY_PATH +from archive_query_log.legacy.config import SERVICES +from archive_query_log.legacy.download.iterable import ArchivedRawSerps +from archive_query_log.legacy.model import ArchivedParsedSerp, \ + ArchivedRawSerp, ResultsParser, InterpretedQueryParser, Service +from archive_query_log.legacy.results.parse import ArchivedParsedSerpParser _expected_dir = PROJECT_DIRECTORY_PATH / \ "data/manual-annotations/" \ @@ -26,13 +27,14 @@ def verify_serp_parsing( wayback_raw_url: str, service_name: str | None = None, ) -> None: + services: Iterable[Service] if service_name is None: services = SERVICES.values() else: services = [SERVICES[service_name]] - result_parsers = [] - interpreted_query_parsers = [] + result_parsers: list[ResultsParser] = [] + interpreted_query_parsers: list[InterpretedQueryParser] = [] for service in services: result_parsers += service.results_parsers interpreted_query_parsers += service.interpreted_query_parsers diff 
--git a/archive_query_log/results/test/test_vk_serp_parsing.py b/archive_query_log/legacy/results/test/test_vk_serp_parsing.py similarity index 97% rename from archive_query_log/results/test/test_vk_serp_parsing.py rename to archive_query_log/legacy/results/test/test_vk_serp_parsing.py index 7648d654..26230ba2 100644 --- a/archive_query_log/results/test/test_vk_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_vk_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_vk_technology_1383988940(): diff --git a/archive_query_log/results/test/test_weibo_serp_parsing.py b/archive_query_log/legacy/results/test/test_weibo_serp_parsing.py similarity index 97% rename from archive_query_log/results/test/test_weibo_serp_parsing.py rename to archive_query_log/legacy/results/test/test_weibo_serp_parsing.py index e4accb3e..5a606fd5 100644 --- a/archive_query_log/results/test/test_weibo_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_weibo_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_weibo_sheng_fu_lang_xi_si_1603546157(): diff --git a/archive_query_log/results/test/test_wikimedia_serp_parsing.py b/archive_query_log/legacy/results/test/test_wikimedia_serp_parsing.py similarity index 98% rename from archive_query_log/results/test/test_wikimedia_serp_parsing.py rename to archive_query_log/legacy/results/test/test_wikimedia_serp_parsing.py index 25bbc8ab..f4b473a8 100644 --- a/archive_query_log/results/test/test_wikimedia_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_wikimedia_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_wikimedia_figures_in_theatrical_costumes_claude_gillot_1673_1722_class_photo_description_french_painter_drawer_1632572254(): diff --git a/archive_query_log/results/test/test_xvideos_serp_parsing.py b/archive_query_log/legacy/results/test/test_xvideos_serp_parsing.py similarity index 97% rename from archive_query_log/results/test/test_xvideos_serp_parsing.py rename to archive_query_log/legacy/results/test/test_xvideos_serp_parsing.py index 9ec0f464..41f32bd3 100644 --- a/archive_query_log/results/test/test_xvideos_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_xvideos_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_xvideos_drunk_1325431509(): diff --git a/archive_query_log/results/test/test_yahoo_serp_parsing.py b/archive_query_log/legacy/results/test/test_yahoo_serp_parsing.py similarity index 99% rename from archive_query_log/results/test/test_yahoo_serp_parsing.py rename to archive_query_log/legacy/results/test/test_yahoo_serp_parsing.py index e43dd30d..c6e628b1 100644 --- a/archive_query_log/results/test/test_yahoo_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_yahoo_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_yahoo_diver_lg_u8180_1620023310(): diff --git a/archive_query_log/results/test/test_yandex_serp_parsing.py b/archive_query_log/legacy/results/test/test_yandex_serp_parsing.py similarity index 97% rename from archive_query_log/results/test/test_yandex_serp_parsing.py rename to archive_query_log/legacy/results/test/test_yandex_serp_parsing.py index d242f6c7..b81c37ff 100644 --- a/archive_query_log/results/test/test_yandex_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_yandex_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. 
-from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_yandex_speed_force_1535973684(): diff --git a/archive_query_log/results/test/test_youtube_serp_parsing.py b/archive_query_log/legacy/results/test/test_youtube_serp_parsing.py similarity index 98% rename from archive_query_log/results/test/test_youtube_serp_parsing.py rename to archive_query_log/legacy/results/test/test_youtube_serp_parsing.py index 482bdee9..e50cd555 100644 --- a/archive_query_log/results/test/test_youtube_serp_parsing.py +++ b/archive_query_log/legacy/results/test/test_youtube_serp_parsing.py @@ -1,6 +1,6 @@ # flake8: noqa # This file is auto-generated by generate_tests.py. -from archive_query_log.results.test.test_utils import verify_serp_parsing +from archive_query_log.legacy.results.test.test_utils import verify_serp_parsing def test_parse_query_youtube_pudding_1563068696(): diff --git a/archive_query_log/util/__init__.py b/archive_query_log/legacy/serps/__init__.py similarity index 100% rename from archive_query_log/util/__init__.py rename to archive_query_log/legacy/serps/__init__.py diff --git a/archive_query_log/legacy/serps/iterable.py b/archive_query_log/legacy/serps/iterable.py new file mode 100644 index 00000000..38186bce --- /dev/null +++ b/archive_query_log/legacy/serps/iterable.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass +from gzip import GzipFile +from pathlib import Path +from typing import Sized, Iterable, Iterator + +from archive_query_log.legacy.model import ArchivedParsedSerp +from archive_query_log.legacy.util.text import count_lines, text_io_wrapper + + +@dataclass(frozen=True) +class ArchivedParsedSerps(Sized, Iterable[ArchivedParsedSerp]): + """ + Read archived parsed SERPs from a JSONL file. + """ + + path: Path + """ + Path where the parsed SERPs are stored in JSONL format. 
+ """ + + def __post_init__(self): + self._check_urls_path() + + def _check_urls_path(self): + if not self.path.exists() or not self.path.is_file(): + raise ValueError( + f"URLs path must be a file: {self.path}" + ) + + def __len__(self) -> int: + with (self.path.open("rb") as file, + GzipFile(fileobj=file, mode="rb") as gzip_file): + return count_lines(gzip_file) + + def __iter__(self) -> Iterator[ArchivedParsedSerp]: + schema = ArchivedParsedSerp.schema() + with (self.path.open("rb") as file, + GzipFile(fileobj=file, mode="rb") as gzip_file, + text_io_wrapper(gzip_file) as text_file): + for line in text_file: + serp = schema.loads(line, many=True) + if isinstance(serp, list): + raise ValueError(f"Expected one SERP per line: {line}") + yield serp diff --git a/archive_query_log/service_stats.py b/archive_query_log/legacy/service_stats.py similarity index 95% rename from archive_query_log/service_stats.py rename to archive_query_log/legacy/service_stats.py index 1e259e93..cdabc26a 100644 --- a/archive_query_log/service_stats.py +++ b/archive_query_log/legacy/service_stats.py @@ -1,4 +1,4 @@ -from archive_query_log.config import SERVICES +from archive_query_log.legacy.config import SERVICES if __name__ == '__main__': num_url_prefixes = sum( diff --git a/archive_query_log/services/__init__.py b/archive_query_log/legacy/services/__init__.py similarity index 74% rename from archive_query_log/services/__init__.py rename to archive_query_log/legacy/services/__init__.py index aa972665..8ef3bfef 100644 --- a/archive_query_log/services/__init__.py +++ b/archive_query_log/legacy/services/__init__.py @@ -1,9 +1,10 @@ from pathlib import Path from typing import Mapping +from marshmallow import ValidationError from yaml import safe_load -from archive_query_log.model import Service +from archive_query_log.legacy.model import Service def read_services( @@ -15,14 +16,12 @@ def read_services( for service_dict in services_dict: try: - service = Service.schema().load(service_dict) + 
service = Service.schema(unknown="exclude").load(service_dict) services += [(service.name, service)] - except Exception as exception: + except ValidationError as e: if not ignore_parsing_errors: raise ValueError( - f"Could not parse service {service_dict['name']}", - exception - ) + f"Could not parse service {service_dict}") from e if not ignore_parsing_errors: service_names = set() for name, service in services: diff --git a/archive_query_log/services/aggregate_services.py b/archive_query_log/legacy/services/aggregate_services.py similarity index 100% rename from archive_query_log/services/aggregate_services.py rename to archive_query_log/legacy/services/aggregate_services.py diff --git a/archive_query_log/services/alexa.py b/archive_query_log/legacy/services/alexa.py similarity index 70% rename from archive_query_log/services/alexa.py rename to archive_query_log/legacy/services/alexa.py index 956d9c2e..216e698d 100644 --- a/archive_query_log/services/alexa.py +++ b/archive_query_log/legacy/services/alexa.py @@ -8,7 +8,7 @@ from math import floor, log10 from pathlib import Path from tempfile import gettempdir -from typing import Sized, Iterable, Any, Iterator, Mapping, Set +from typing import Sized, Iterable, Any, Iterator, Mapping, Set, NamedTuple from zipfile import ZipFile from publicsuffixlist import PublicSuffixList @@ -17,9 +17,9 @@ from requests.exceptions import ChunkedEncodingError from tqdm.auto import tqdm -from archive_query_log.model import ArchivedUrl -from archive_query_log.download.raw import WebArchiveRawDownloader -from archive_query_log.util.http_session import backoff_session +from archive_query_log.legacy.model import ArchivedUrl +from archive_query_log.legacy.download.raw import WebArchiveRawDownloader +from archive_query_log.legacy.util.http_session import backoff_session @dataclass(frozen=True) @@ -57,6 +57,8 @@ def num_pages(self) -> int: *self._params, ("showNumPages", True), ], + # 10 minutes + timeout=10 * 60 # nosec: B113 ) return 
int(num_pages_response.text) @@ -64,12 +66,13 @@ def _page_cache_path(self, page: int) -> Path: num_digits = floor(log10(self.num_pages)) + 1 return self._cache_path / f"page_{page:{num_digits}}.jsonl" - def _fetch_page(self, page: int) -> Path | None: + def _fetch_page(self, page: int) -> None: path = self._page_cache_path(page) if path.exists(): # Page was already downloaded, skip it. - assert path.is_file() - return path + if not path.is_file(): + raise RuntimeError(f"Path must be a file: {path}") + return session = backoff_session() try: @@ -103,11 +106,14 @@ def _fetch_pages(self) -> None: """ Fetch queries from each individual page. """ - for page in tqdm( - range(self.num_pages), - desc="Fetch urls", - unit="page", - ): + pages: Iterable[int] = range(self.num_pages) + # noinspection PyTypeChecker + pages = tqdm( + pages, + desc="Fetch urls", + unit="page", + ) + for page in pages: self._fetch_page(page) def _missing_pages(self) -> set[int]: @@ -126,12 +132,15 @@ def _merge_cached_pages(self) -> None: """ Merge queries from all pages. 
""" - with self.output_path.open("wt") as file: - for page in tqdm( - range(self.num_pages), - desc="Merge urls", - unit="page", - ): + with self.output_path.open("wt", encoding="utf8") as file: + pages: Iterable[int] = range(self.num_pages) + # noinspection PyTypeChecker + pages = tqdm( + pages, + desc="Merge urls", + unit="page", + ) + for page in pages: path = self._page_cache_path(page) with path.open("rt") as page_file: lines = page_file @@ -140,7 +149,8 @@ def _merge_cached_pages(self) -> None: def fetch(self) -> None: if self.output_path.exists(): - assert self.output_path.is_file() + if not self.output_path.is_file(): + raise RuntimeError(f"Path must be a file: {self.output_path}") return print(f"Storing temporary files at: {self._cache_path}") self._fetch_pages() @@ -158,19 +168,45 @@ def fetch(self) -> None: def __len__(self) -> int: self.fetch() - with self.output_path.open("rt") as file: + with self.output_path.open("rt", encoding="utf8") as file: return sum(1 for _ in file) def __iter__(self) -> Iterator[ArchivedUrl]: self.fetch() schema = ArchivedUrl.schema() - with self.output_path.open("rt") as file: + with self.output_path.open("rt", encoding="utf8") as file: for line in file: - yield schema.loads(line) + url = schema.loads(line, many=True) + if isinstance(url, list): + raise ValueError(f"Expected one URL per line: {line}") + yield url + + +def _iter_deduplicated(domains: Iterable[str]) -> Iterator[str]: + public_suffix_list = PublicSuffixList() + second_level_domains = set() + for domain in domains: + public_suffix = public_suffix_list.publicsuffix(domain) + second_level_domain = public_suffix_list.subdomain(domain, 0) + if second_level_domain is None: + second_level_domain = public_suffix + second_level_domain = second_level_domain.removesuffix( + f".{public_suffix}" + ) + if second_level_domain in second_level_domains: + continue + second_level_domains.add(second_level_domain) + yield domain + + +class AlexaTop1MDomain(NamedTuple): + rank: int + 
domain: str + public_suffix: str @dataclass(frozen=True) -class AlexaTop1MFusedDomains(Sized, Iterable[Path]): +class AlexaTop1MFusedDomains(Sized, Iterable[AlexaTop1MDomain]): """ Fuse the rop-1000 of all archived Alexa top-1M rankings. """ @@ -214,38 +250,28 @@ def _fetch_rankings(self) -> Iterable[Path]: raise RuntimeError("Some downloads were unsuccessful. Try again.") return paths.values() - def _iter_deduplicated(self, domains: Iterable[str]) -> Iterator[str]: - public_suffix_list = PublicSuffixList() - second_level_domains = set() - for domain in domains: - public_suffix = public_suffix_list.publicsuffix(domain) - second_level_domain = public_suffix_list.subdomain(domain, 0) - if second_level_domain is None: - second_level_domain = public_suffix - second_level_domain = second_level_domain.removesuffix( - f".{public_suffix}" - ) - if second_level_domain in second_level_domains: - continue - second_level_domains.add(second_level_domain) - yield domain - def _fuse_cached_rankings(self) -> None: runs: list[Run] = [] num_runs = sum(1 for _ in self._cache_path.iterdir()) - for path in tqdm( - self._cache_path.iterdir(), - total=num_runs, - desc="Read ranking", - unit="ranking", - ): + paths: Iterable[Path] = self._cache_path.iterdir() + # noinspection PyTypeChecker + paths = tqdm( + paths, + total=num_runs, + desc="Fuse rankings", + unit="ranking", + ) + for path in paths: with path.open("rb") as file: with ZipFile(file) as zip_file: with zip_file.open("top-1m.csv", "r") as csv_file: with TextIOWrapper(csv_file) as lines: - domains = (line[1] for line in reader(lines)) + domains: Iterable[str] = ( + line[1] + for line in reader(lines) + ) if self.deduplicate_per_ranking: - domains = self._iter_deduplicated(domains) + domains = _iter_deduplicated(domains) if self.max_domains_per_ranking is not None: domains = islice( domains, @@ -255,8 +281,8 @@ def _fuse_cached_rankings(self) -> None: domain: 1_000_000 - index for index, domain in enumerate(domains) } - run = 
Run({"_": scores}) - runs.append(run) + alexa_run = Run({"_": scores}) + runs.append(alexa_run) print(f"Fusing {len(runs)} rankings.") combined_run = fuse( runs=runs, @@ -268,18 +294,19 @@ def _fuse_cached_rankings(self) -> None: key=lambda item: item[1], reverse=True, ) - domains = (domain for domain, _ in items) + fused_domains: Iterable[str] = (domain for domain, _ in items) if self.deduplicate_fused_ranking and not self.deduplicate_per_ranking: - domains = self._iter_deduplicated(domains) + fused_domains = _iter_deduplicated(fused_domains) public_suffix_list = PublicSuffixList(only_icann=True) with self._result_path.open("wt") as file: - for index, domain in enumerate(domains): + for index, domain in enumerate(fused_domains): public_suffix = public_suffix_list.publicsuffix(domain) file.write(f"{index + 1},{domain},{public_suffix}\n") def fetch(self) -> None: if self._result_path.exists(): - assert self._result_path.is_file() + if not self._result_path.is_file(): + raise RuntimeError(f"Path must be a file: {self._result_path}") return print(f"Storing temporary files at: {self._cache_path}") self._fetch_rankings() @@ -293,9 +320,9 @@ def __len__(self) -> int: with self._result_path.open("rt") as file: return sum(1 for _ in file) - def __iter__(self) -> Iterator[ArchivedUrl]: + def __iter__(self) -> Iterator[AlexaTop1MDomain]: self.fetch() - schema = ArchivedUrl.schema() with self._result_path.open("rt") as file: for line in file: - yield schema.loads(line) + index, domain, public_suffix = line.split(",") + yield AlexaTop1MDomain(int(index), domain, public_suffix) diff --git a/archive_query_log/services/search_forms.py b/archive_query_log/legacy/services/search_forms.py similarity index 99% rename from archive_query_log/services/search_forms.py rename to archive_query_log/legacy/services/search_forms.py index 534fd747..e707e4a7 100644 --- a/archive_query_log/services/search_forms.py +++ b/archive_query_log/legacy/services/search_forms.py @@ -132,9 +132,9 @@ def 
get_internet_archive_html(self, url: str, year=2022, byte_digits=4): # Request the corresponding HTML ia_url = f'https://web.archive.org/web/{timestamp}/{url}/' return self.session.get(ia_url, timeout=10) - except Exception: + except Exception as e: raise RuntimeError( - 'Failed to request an internet archive snapshot') + 'Failed to request an internet archive snapshot') from e def services_no_search(self): return self.out_df[(self.out_df['input'] is False) & ( @@ -172,7 +172,7 @@ def find_search_tag(soup: BeautifulSoup, tag='form'): return found, snippet_list -if __name__ == "__main__": +def main(): # Parse input parser = argparse.ArgumentParser( prog='Search form identification', @@ -197,3 +197,7 @@ def find_search_tag(soup: BeautifulSoup, tag='form'): start_row=start_row, end_row=end_row ) identifier.process_services() + + +if __name__ == "__main__": + main() diff --git a/archive_query_log/services/test_services.py b/archive_query_log/legacy/services/test_services.py similarity index 54% rename from archive_query_log/services/test_services.py rename to archive_query_log/legacy/services/test_services.py index c969bb11..af8cb622 100644 --- a/archive_query_log/services/test_services.py +++ b/archive_query_log/legacy/services/test_services.py @@ -1,5 +1,5 @@ -from archive_query_log.config import SERVICES_PATH -from archive_query_log.services import read_services +from archive_query_log.legacy.config import SERVICES_PATH +from archive_query_log.legacy.services import read_services def test_services_can_be_parsed(): diff --git a/archive_query_log/services/update_yaml.py b/archive_query_log/legacy/services/update_yaml.py similarity index 92% rename from archive_query_log/services/update_yaml.py rename to archive_query_log/legacy/services/update_yaml.py index f9cd5179..5d40a3d2 100644 --- a/archive_query_log/services/update_yaml.py +++ b/archive_query_log/legacy/services/update_yaml.py @@ -4,8 +4,9 @@ import yaml from pandas import concat, DataFrame -from 
archive_query_log import DATA_DIRECTORY_PATH -from archive_query_log.cli.external import load_services, load_domains, \ +from archive_query_log.legacy import DATA_DIRECTORY_PATH +from archive_query_log.legacy.cli.external import \ + load_services, load_domains, \ service_domains, load_url_prefixes, \ load_query_parsers, query_parser, load_page_offset_parsers, \ page_offset_parser_series @@ -25,7 +26,7 @@ def get_spreadsheet_data( services = load_services() idx_first = services["name"].ne(first_service).idxmin() idx_last = services["name"].ne(last_service).idxmin() - services = services.loc[idx_first:idx_last, :] + services = services.loc[idx_first:idx_last, :] # type: ignore domains = load_domains() services["domains"] = [ @@ -77,7 +78,7 @@ def update_yaml_file( """ services = get_spreadsheet_data( first_service=first_service, last_service=last_service) - with open(services_file, "r") as stream: + with open(services_file, "r", encoding="utf8") as stream: yaml_list = yaml.safe_load(stream) update_func = overwrite_parsers if overwrite else update_empty_parsers i = 0 @@ -134,9 +135,10 @@ def set_query_parsers(service_elem: dict, services: DataFrame) -> None: def update_ranks(df: pd.DataFrame, yaml_list: Sequence[dict]): for i, elem in enumerate(yaml_list): name = elem["name"] - try: - rank = int(df.loc[df["service"] == name, "rank"].values[0]) - except Exception: + rank_df = df.loc[df["service"] == name, "rank"] + if len(rank_df) > 0: + rank = int(rank_df.values[0]) + else: rank = 999999 yaml_list[i]["alexa_rank"] = rank diff --git a/data/manual-annotations/archived-raw-serps/warcs/duckduckgo-espanol-incluso-site-cnnespanol-cnn-com-1612897539.warc.gz b/archive_query_log/legacy/urls/__init__.py similarity index 100% rename from data/manual-annotations/archived-raw-serps/warcs/duckduckgo-espanol-incluso-site-cnnespanol-cnn-com-1612897539.warc.gz rename to archive_query_log/legacy/urls/__init__.py diff --git a/archive_query_log/urls/fetch.py 
b/archive_query_log/legacy/urls/fetch.py similarity index 94% rename from archive_query_log/urls/fetch.py rename to archive_query_log/legacy/urls/fetch.py index 87981634..27b17515 100644 --- a/archive_query_log/urls/fetch.py +++ b/archive_query_log/legacy/urls/fetch.py @@ -3,7 +3,6 @@ from enum import Enum from functools import cached_property from gzip import GzipFile -from io import TextIOWrapper from itertools import chain from pathlib import Path from typing import AbstractSet, Sequence, Any, Iterable, Iterator, NamedTuple @@ -16,9 +15,10 @@ from marshmallow import Schema from tqdm.auto import tqdm -from archive_query_log import CDX_API_URL -from archive_query_log.model import ArchivedUrl, Service -from archive_query_log.util.archive_http import archive_http_client +from archive_query_log.legacy import CDX_API_URL +from archive_query_log.legacy.model import ArchivedUrl, Service +from archive_query_log.legacy.util.archive_http import archive_http_client +from archive_query_log.legacy.util.text import text_io_wrapper class UrlMatchScope(Enum): @@ -92,6 +92,7 @@ async def _num_pages( url = f"{self.cdx_api_url}?{urlencode(num_pages_params)}" async with client.get(url) as response: text = await response.text() + # noinspection PyBroadException try: num_pages = int(text) except Exception: @@ -123,7 +124,7 @@ async def _fetch_page( client: RetryClient, progress: tqdm | None = None, ) -> None: - if page.path.exists() and not self.overwrite: + if page.path.exists() and not self.overwrite and progress is not None: progress.update() return params = [ @@ -141,10 +142,9 @@ async def _fetch_page( schema, ) page.path.parent.mkdir(parents=True, exist_ok=True) - # noinspection PyTypeChecker with page.path.open("wb") as file, \ GzipFile(fileobj=file, mode="wb") as gzip_file, \ - TextIOWrapper(gzip_file) as text_file: + text_io_wrapper(gzip_file) as text_file: for line in lines: text_file.write(line) text_file.write("\n") @@ -178,7 +178,9 @@ async def _service_pages( 
output_format_path = data_directory / "archived-urls" output_format_path.mkdir(parents=True, exist_ok=True) if cdx_page is not None: - assert domain is not None + if domain is None: + raise RuntimeError( + "Domain must be specified when page is specified.") service_path = output_format_path / service.name domain_path = service_path / quote(domain, safe="") cdx_page_path = domain_path / f"{cdx_page:010}.jsonl.gz" @@ -224,7 +226,7 @@ async def cdx_page_pages(_cdx_page: int) -> Sequence[_CdxPage]: for url_prefix in service.focused_url_prefixes ] else: - suffix_free_domains = [] + suffix_free_domains: list[str] = [] for domain in sorted(domains, key=len): if not any( domain.endswith(suffix) diff --git a/archive_query_log/legacy/urls/iterable.py b/archive_query_log/legacy/urls/iterable.py new file mode 100644 index 00000000..01558282 --- /dev/null +++ b/archive_query_log/legacy/urls/iterable.py @@ -0,0 +1,44 @@ +from dataclasses import dataclass +from gzip import GzipFile +from pathlib import Path +from typing import Sized, Iterable, Iterator + +from archive_query_log.legacy.model import ArchivedUrl +from archive_query_log.legacy.util.text import count_lines, text_io_wrapper + + +@dataclass(frozen=True) +class ArchivedUrls(Sized, Iterable[ArchivedUrl]): + """ + Read archived URLs from a JSONL file. + """ + + path: Path + """ + Path where the URLs are stored in JSONL format. 
+ """ + + def __post_init__(self): + self._check_urls_path() + + def _check_urls_path(self): + if not self.path.exists() or not self.path.is_file(): + raise ValueError( + f"URLs path must be a file: {self.path}" + ) + + def __len__(self) -> int: + with (self.path.open("rb") as file, + GzipFile(fileobj=file, mode="rb") as gzip_file): + return count_lines(gzip_file) + + def __iter__(self) -> Iterator[ArchivedUrl]: + schema = ArchivedUrl.schema() + with (self.path.open("rb") as file, + GzipFile(fileobj=file, mode="rb") as gzip_file, + text_io_wrapper(gzip_file) as text_file): + for line in text_file: + url = schema.loads(line) + if isinstance(url, list): + raise ValueError(f"Expected one URL per line: {line}") + yield url diff --git a/archive_query_log/legacy/util/__init__.py b/archive_query_log/legacy/util/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archive_query_log/util/archive_http.py b/archive_query_log/legacy/util/archive_http.py similarity index 86% rename from archive_query_log/util/archive_http.py rename to archive_query_log/legacy/util/archive_http.py index 1f299071..4c4f1eaf 100644 --- a/archive_query_log/util/archive_http.py +++ b/archive_query_log/legacy/util/archive_http.py @@ -1,4 +1,5 @@ from contextlib import asynccontextmanager +from typing import AsyncIterator from aiohttp import ClientSession, TCPConnector, ClientTimeout, \ ClientConnectorError, ServerTimeoutError, ClientPayloadError @@ -6,7 +7,9 @@ @asynccontextmanager -async def archive_http_session(limit: int = 10) -> ClientSession: +async def archive_http_session( + limit: int = 10, +) -> AsyncIterator[ClientSession]: # The Wayback Machine doesn't seem to support more than 10 # parallel connections from the same IP. 
connector = TCPConnector( @@ -26,7 +29,7 @@ async def archive_http_session(limit: int = 10) -> ClientSession: @asynccontextmanager -async def archive_http_client(limit: int = 10) -> RetryClient: +async def archive_http_client(limit: int = 10) -> AsyncIterator[RetryClient]: retry_options = JitterRetry( attempts=10, start_timeout=10, # 10 seconds diff --git a/archive_query_log/util/html.py b/archive_query_log/legacy/util/html.py similarity index 100% rename from archive_query_log/util/html.py rename to archive_query_log/legacy/util/html.py diff --git a/archive_query_log/util/http_session.py b/archive_query_log/legacy/util/http_session.py similarity index 100% rename from archive_query_log/util/http_session.py rename to archive_query_log/legacy/util/http_session.py diff --git a/archive_query_log/util/iterable.py b/archive_query_log/legacy/util/iterable.py similarity index 100% rename from archive_query_log/util/iterable.py rename to archive_query_log/legacy/util/iterable.py diff --git a/archive_query_log/util/serialization.py b/archive_query_log/legacy/util/serialization.py similarity index 89% rename from archive_query_log/util/serialization.py rename to archive_query_log/legacy/util/serialization.py index db9ecc92..c042fa77 100644 --- a/archive_query_log/util/serialization.py +++ b/archive_query_log/legacy/util/serialization.py @@ -1,6 +1,6 @@ from marshmallow.fields import Field -from archive_query_log.model import HighlightedText +from archive_query_log.legacy.model import HighlightedText class HighlightedTextField(Field): diff --git a/archive_query_log/util/text.py b/archive_query_log/legacy/util/text.py similarity index 52% rename from archive_query_log/util/text.py rename to archive_query_log/legacy/util/text.py index 5f96d5b4..2ae6886c 100644 --- a/archive_query_log/util/text.py +++ b/archive_query_log/legacy/util/text.py @@ -1,14 +1,19 @@ +from io import IOBase, TextIOWrapper from typing import IO, Iterator _LINE_COUNT_BUFFER_SIZE = 1024 * 1024 -def 
_chunks(reader: IO[bytes]) -> Iterator[bytes]: +def _chunks(reader: IO[bytes] | IOBase) -> Iterator[bytes]: buffer = reader.read(_LINE_COUNT_BUFFER_SIZE) while buffer: yield buffer buffer = reader.read(_LINE_COUNT_BUFFER_SIZE) -def count_lines(file: IO[bytes]) -> int: +def count_lines(file: IO[bytes] | IOBase) -> int: return sum(buffer.count(b"\n") for buffer in _chunks(file)) + + +def text_io_wrapper(file: IO[bytes] | IOBase) -> IO[str]: + return TextIOWrapper(file) # type: ignore diff --git a/archive_query_log/util/urls.py b/archive_query_log/legacy/util/urls.py similarity index 100% rename from archive_query_log/util/urls.py rename to archive_query_log/legacy/util/urls.py diff --git a/archive_query_log/monitoring/__init__.py b/archive_query_log/monitoring/__init__.py new file mode 100644 index 00000000..db5b498a --- /dev/null +++ b/archive_query_log/monitoring/__init__.py @@ -0,0 +1,37 @@ +from pathlib import Path + +from flask import Flask +from werkzeug import run_simple + +from archive_query_log import __name__ as app_name +from archive_query_log.config import Config +from archive_query_log.monitoring.home import home + + +def monitoring_app(config: Config) -> Flask: + app = Flask(app_name) + app.add_url_rule("/", "home", lambda: home(config)) + return app + + +def run_monitoring(config: Config, host: str, port: int) -> None: + app = monitoring_app(config) + if app.template_folder is None: + template_file_names = [] + else: + template_dir_path: Path = Path(app.root_path) / app.template_folder + template_file_paths = [ + template_dir_path / template + for template in app.jinja_env.list_templates() + ] + template_file_names = [ + str(template) for template in template_file_paths + ] + run_simple( + hostname=host, + port=port, + application=app, + use_reloader=True, + use_debugger=True, + extra_files=template_file_names, + ) diff --git a/archive_query_log/monitoring/home.py b/archive_query_log/monitoring/home.py new file mode 100644 index 00000000..5b7ee94c --- 
/dev/null +++ b/archive_query_log/monitoring/home.py @@ -0,0 +1,365 @@ +from datetime import datetime +from typing import NamedTuple, Type + +from elasticsearch_dsl.query import Exists, Query, Term +from expiringdict import ExpiringDict +from flask import render_template, Response, make_response + +from archive_query_log.config import Config +from archive_query_log.orm import Archive, Provider, Source, Capture, \ + BaseDocument, Serp, Result, UrlQueryParser, UrlPageParser, \ + UrlOffsetParser, WarcQueryParser, WarcSnippetsParser +from archive_query_log.utils.time import utc_now + +_CACHE_SECONDS_STATISTICS = 60 * 5 # 5 minutes +_CACHE_SECONDS_PROGRESS = 60 * 10 # 10 minutes + + +class Statistics(NamedTuple): + name: str + description: str + total: int + disk_size: str | None + last_modified: datetime | None + + +class Progress(NamedTuple): + input_name: str + output_name: str + description: str + total: int + current: int + + +DocumentType = Type[BaseDocument] + +_statistics_cache: dict[ + tuple[DocumentType, str], + Statistics, +] = ExpiringDict( + max_len=100, + max_age_seconds=_CACHE_SECONDS_STATISTICS, +) + + +def _convert_bytes(bytes_count: int) -> str: + step_unit = 1000.0 + bytes_count_decimal: float = bytes_count + for unit in ["B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "RB"]: + if bytes_count_decimal < step_unit: + return f"{bytes_count_decimal:3.1f}\u202f{unit}" + bytes_count_decimal /= step_unit + return f"{bytes_count_decimal:3.1f}\u202fQB" + + +def _get_statistics( + config: Config, + name: str, + description: str, + document: DocumentType, + filter_query: Query | None = None, +) -> Statistics: + key = (document, repr(filter_query)) + if key in _statistics_cache: + return _statistics_cache[key] + + document.index().refresh(using=config.es.client) + stats = document.index().stats(using=config.es.client) + search = document.search(using=config.es.client) + if filter_query is not None: + search = search.filter(filter_query) + total = search.count() +
last_modified_response = ( + search + .query(Exists(field="last_modified")) + .sort("-last_modified") + .extra(size=1) + .execute() + ) + if last_modified_response.hits.total.value == 0: + last_modified = None + else: + last_modified = last_modified_response.hits[0].last_modified + + statistics = Statistics( + name=name, + description=description, + total=total, + disk_size=( + _convert_bytes(stats["_all"]["total"]["store"]["size_in_bytes"]) + if filter_query is None else None + ), + last_modified=last_modified, + ) + _statistics_cache[key] = statistics + return statistics + + +_progress_cache: dict[ + tuple[DocumentType, str, str], + Progress, +] = ExpiringDict( + max_len=100, + max_age_seconds=_CACHE_SECONDS_PROGRESS, +) + + +def _get_processed_progress( + config: Config, + input_name: str, + output_name: str, + description: str, + document: DocumentType, + status_field: str, + filter_query: Query | None = None, +) -> Progress: + key = (document, repr(filter_query), status_field) + if key in _progress_cache: + return _progress_cache[key] + + document.index().refresh(using=config.es.client) + search = document.search(using=config.es.client) + if filter_query is not None: + search = search.filter(filter_query) + total = search.count() + search_processed = search.filter(Term(**{status_field: False})) + total_processed = search_processed.count() + progress = Progress( + input_name=input_name, + output_name=output_name, + description=description, + total=total, + current=total_processed, + ) + _progress_cache[key] = progress + return progress + + +def home(config: Config) -> str | Response: + statistics_list: list[Statistics] = [ + _get_statistics( + config=config, + name="Archives", + description="Web archiving services that offer CDX " + "and Memento APIs.", + document=Archive, + ), + _get_statistics( + config=config, + name="Providers", + description="Search providers, i.e., websites that offer " + "a search functionality.", + document=Provider, + ), + 
_get_statistics( + config=config, + name="Sources", + description="The cross product of all archives and " + "the provider's domains and URL prefixes.", + document=Source, + ), + _get_statistics( + config=config, + name="Captures", + description="Captures matching from the archives " + "that match domain and URL prefixes.", + document=Capture, + ), + _get_statistics( + config=config, + name="SERPs", + description="Search engine result pages that have been " + "identified among the captures.", + document=Serp, + ), + _get_statistics( + config=config, + name="+ URL query", + description="SERPs for which the query has been parsed " + "from the URL.", + document=Serp, + filter_query=Exists(field="url_query"), + ), + _get_statistics( + config=config, + name="+ URL page", + description="SERPs for which the page has been parsed " + "from the URL.", + document=Serp, + filter_query=Exists(field="url_page"), + ), + _get_statistics( + config=config, + name="+ URL offset", + description="SERPs for which the offset has been parsed " + "from the URL.", + document=Serp, + filter_query=Exists(field="url_offset"), + ), + _get_statistics( + config=config, + name="+ WARC", + description="SERPs for which the WARC has been downloaded.", + document=Serp, + filter_query=Exists(field="warc_location"), + ), + _get_statistics( + config=config, + name="+ WARC query", + description="SERPs for which the query has been parsed " + "from the WARC.", + document=Serp, + filter_query=Exists(field="warc_query"), + ), + _get_statistics( + config=config, + name="+ WARC snippets", + description="SERPs for which the snippets have been parsed " + "from the WARC.", + document=Serp, + filter_query=Exists(field="warc_snippets_parser.id"), + ), + _get_statistics( + config=config, + name="Results", + description="Search result from the SERPs.", + document=Result, + ), + _get_statistics( + config=config, + name="URL query parsers", + description="Parser to get the query from a SERP's URL.", + 
document=UrlQueryParser, + ), + _get_statistics( + config=config, + name="URL page parsers", + description="Parser to get the page from a SERP's URL.", + document=UrlPageParser, + ), + _get_statistics( + config=config, + name="URL offset parsers", + description="Parser to get the offset from a SERP's URL.", + document=UrlOffsetParser, + ), + _get_statistics( + config=config, + name="WARC query parsers", + description="Parser to get the query from a SERP's WARC contents.", + document=WarcQueryParser, + ), + _get_statistics( + config=config, + name="WARC snippets parsers", + description="Parser to get the snippets from a SERP's " + "WARC contents.", + document=WarcSnippetsParser, + ), + ] + + progress_list: list[Progress] = [ + _get_processed_progress( + config=config, + input_name="Archives", + output_name="Sources", + description="Build sources for all archives.", + document=Archive, + filter_query=~Exists(field="exclusion_reason"), + status_field="should_build_sources", + ), + _get_processed_progress( + config=config, + input_name="Providers", + output_name="Sources", + description="Build sources for all search providers.", + document=Provider, + filter_query=~Exists(field="exclusion_reason"), + status_field="should_build_sources", + ), + _get_processed_progress( + config=config, + input_name="Sources", + output_name="Captures", + description="Fetch CDX captures for all domains and " + "prefixes in the sources.", + document=Source, + status_field="should_fetch_captures", + ), + _get_processed_progress( + config=config, + input_name="Captures", + output_name="SERPs", + description="Parse queries from capture URLs.", + document=Capture, + status_field="url_query_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + output_name="SERPs", + description="Parse page from SERP URLs.", + document=Serp, + status_field="url_page_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + 
output_name="SERPs", + description="Parse offset from SERP URLs.", + document=Serp, + status_field="url_offset_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + output_name="SERPs", + description="Download WARCs.", + document=Serp, + filter_query=Term(capture__status_code=200), + status_field="warc_downloader.should_download", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + output_name="SERPs", + description="Parse query from WARC contents.", + document=Serp, + filter_query=Exists(field="warc_location"), + status_field="warc_query_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="SERPs", + output_name="SERPs", + description="Parse snippets from WARC contents.", + document=Serp, + filter_query=Exists(field="warc_location"), + status_field="warc_snippets_parser.should_parse", + ), + _get_processed_progress( + config=config, + input_name="Results", + output_name="Results", + description="Download WARCs.", + document=Result, + filter_query=Exists(field="snippet.url"), + status_field="warc_downloader.should_download", + ), + ] + + etag = str(hash(( + tuple(statistics_list), + tuple(progress_list), + ))) + + response = make_response( + render_template( + "home.html", + statistics_list=statistics_list, + progress_list=progress_list, + year=utc_now().year, + ) + ) + response.headers.add("ETag", etag) + return response diff --git a/archive_query_log/namespaces.py b/archive_query_log/namespaces.py new file mode 100644 index 00000000..b94d23ff --- /dev/null +++ b/archive_query_log/namespaces.py @@ -0,0 +1,20 @@ +from uuid import uuid5, NAMESPACE_URL + +NAMESPACE_AQL = uuid5(NAMESPACE_URL, "aql") +NAMESPACE_SOURCE = uuid5(NAMESPACE_AQL, "filter") +NAMESPACE_CAPTURE = uuid5(NAMESPACE_AQL, "capture") +NAMESPACE_SERP = uuid5(NAMESPACE_AQL, "serp") +NAMESPACE_RESULT = uuid5(NAMESPACE_AQL, "result") +NAMESPACE_URL_QUERY_PARSER = uuid5(NAMESPACE_AQL, "url_query_parser") 
+NAMESPACE_URL_PAGE_PARSER = uuid5(NAMESPACE_AQL, "url_page_parser") +NAMESPACE_URL_OFFSET_PARSER = uuid5(NAMESPACE_AQL, "url_offset_parser") +NAMESPACE_URL_LANGUAGE_PARSER = uuid5( + NAMESPACE_AQL, "url_language_parser") +NAMESPACE_WARC_QUERY_PARSER = uuid5(NAMESPACE_AQL, "warc_query_parser") +NAMESPACE_WARC_SNIPPETS_PARSER = uuid5( + NAMESPACE_AQL, "warc_snippets_parser") +NAMESPACE_WARC_MAIN_CONTENT_PARSER = uuid5( + NAMESPACE_AQL, "warc_main_content_parser") +NAMESPACE_WARC_DIRECT_ANSWERS_PARSER = uuid5( + NAMESPACE_AQL, "warc_direct_answers_parser") +NAMESPACE_WARC_DOWNLOADER = uuid5(NAMESPACE_AQL, "warc_downloader") diff --git a/archive_query_log/orm.py b/archive_query_log/orm.py new file mode 100644 index 00000000..2c7b23ac --- /dev/null +++ b/archive_query_log/orm.py @@ -0,0 +1,502 @@ +from datetime import datetime +from functools import cached_property +from re import Pattern, compile as pattern +from typing import Literal + +from elasticsearch_dsl import Document, Keyword, Text, Date, RankFeature, \ + InnerDoc as InnerDocument, Object, Index, Integer, Nested, Long, Boolean + + +class BaseDocument(Document): + last_modified: datetime = Date( + default_timezone="UTC", + format="strict_date_time_no_millis", + ) + + # TODO: At the moment, this is used more as a creation date. + # We could use a different field for that and use this one for the last + # modified date. 
+ + # noinspection PyShadowingBuiltins + def __init__(self, id: str | None = None, **kwargs): + if id is not None: + if "meta" not in kwargs: + kwargs["meta"] = {} + kwargs["meta"]["id"] = id + super().__init__(**kwargs) + + @classmethod + def index(cls) -> Index: + return cls._index + + @property + def id(self) -> str: + return self.meta.id + + @id.setter + def id(self, value: str): + self.meta.id = value + + +class Archive(BaseDocument): + name: str = Text() + description: str = Text() + cdx_api_url: str = Keyword() + memento_api_url: str = Keyword() + priority: float | None = RankFeature(positive_score_impact=True) + should_build_sources: bool = Boolean() + last_built_sources: datetime = Date( + default_timezone="UTC", + format="strict_date_time_no_millis", + ) + + class Index: + name = "aql_archives" + settings = { + "number_of_shards": 1, + "number_of_replicas": 2, + } + + +class Provider(BaseDocument): + name: str = Text() + description: str = Text() + exclusion_reason: str = Text() + notes: str = Text() + domains: list[str] = Keyword() + url_path_prefixes: list[str] = Keyword() + priority: float | None = RankFeature(positive_score_impact=True) + should_build_sources: bool = Boolean() + last_built_sources: datetime = Date( + default_timezone="UTC", + format="strict_date_time_no_millis", + ) + + class Index: + name = "aql_providers" + settings = { + "number_of_shards": 1, + "number_of_replicas": 2, + } + + +class InnerArchive(InnerDocument): + id: str = Keyword() + cdx_api_url: str = Keyword() + memento_api_url: str = Keyword() + priority: int | None = RankFeature(positive_score_impact=True) + + +class InnerProvider(InnerDocument): + id: str = Keyword() + domain: str = Keyword() + url_path_prefix: str = Keyword() + priority: int | None = RankFeature(positive_score_impact=True) + + +class Source(BaseDocument): + archive: InnerArchive = Object(InnerArchive) + provider: InnerProvider = Object(InnerProvider) + should_fetch_captures: bool = Boolean() + 
last_fetched_captures: datetime = Date( + default_timezone="UTC", + format="strict_date_time_no_millis", + ) + + class Index: + name = "aql_sources" + settings = { + "number_of_shards": 5, + "number_of_replicas": 2, + } + + +class InnerParser(InnerDocument): + id: str = Keyword() + should_parse: bool = Boolean() + last_parsed: datetime = Date( + default_timezone="UTC", + format="strict_date_time_no_millis", + ) + + +class Capture(BaseDocument): + archive: InnerArchive = Object(InnerArchive) + provider: InnerProvider = Object(InnerProvider) + url: str = Keyword() + url_key: str = Keyword() + timestamp: datetime = Date( + default_timezone="UTC", + format="strict_date_time_no_millis", + ) + status_code: int = Integer() + digest: str = Keyword() + mimetype: str | None = Keyword() + filename: str | None = Keyword() + offset: int | None = Integer() + length: int | None = Integer() + access: str | None = Keyword() + redirect_url: str | None = Keyword() + flags: list[str] | None = Keyword() + collection: str | None = Keyword() + source: str | None = Keyword() + source_collection: str | None = Keyword() + url_query_parser: InnerParser | None = Object(InnerParser) + + class Index: + name = "aql_captures" + settings = { + "number_of_shards": 40, + "number_of_replicas": 2, + } + + +class InnerCapture(InnerDocument): + id: str = Keyword() + url: str = Keyword() + timestamp: datetime = Date( + default_timezone="UTC", + format="strict_date_time_no_millis", + ) + status_code: int = Integer() + digest: str = Keyword() + mimetype: str | None = Keyword() + + +class InnerDownloader(InnerDocument): + id: str = Keyword() + should_download: bool = Boolean() + last_downloaded: datetime = Date( + default_timezone="UTC", + format="strict_date_time_no_millis", + ) + + +class WarcLocation(InnerDocument): + file: str = Keyword() + offset: int = Long() + length: int = Long() + + +class SnippetId(InnerDocument): + id: str = Keyword() + rank: int = Integer() + + +class Snippet(SnippetId): + 
content: str = Text() + url: str | None = Keyword() + title: str | None = Text() + text: str | None = Text() + + +class DirectAnswerId(InnerDocument): + id: str = Keyword() + + +class DirectAnswer(DirectAnswerId): + content: str = Text() + url: str | None = Keyword() + text: str | None = Text() + + +class Serp(BaseDocument): + archive: InnerArchive = Object(InnerArchive) + provider: InnerProvider = Object(InnerProvider) + capture: InnerCapture = Object(InnerCapture) + url_query: str = Text() + url_query_parser: InnerParser | None = Object(InnerParser) + url_page: int | None = Integer() + url_page_parser: InnerParser | None = Object(InnerParser) + url_offset: int | None = Integer() + url_offset_parser: InnerParser | None = Object(InnerParser) + # url_language: str | None = Keyword() + # url_language_parser: InnerParser | None = Object(InnerParser) + warc_location: WarcLocation | None = Object(WarcLocation) + warc_downloader: InnerDownloader | None = Object(InnerDownloader) + warc_query: str | None = Text() + warc_query_parser: InnerParser | None = Object(InnerParser) + warc_snippets: list[SnippetId] | None = Nested(SnippetId) + warc_snippets_parser: InnerParser | None = Object(InnerParser) + warc_direct_answers: list[DirectAnswerId] | None = Nested(DirectAnswerId) + warc_direct_answers_parser: InnerParser | None = Object(InnerParser) + + # rendered_warc_location: WarcLocation | None = Object(WarcLocation) + # rendered_warc_downloader: InnerDownloader | None = ( + # Object(InnerDownloader)) + + class Index: + name = "aql_serps" + settings = { + "number_of_shards": 40, + "number_of_replicas": 2, + } + + +class InnerSerp(InnerDocument): + id: str = Keyword() + + +class Result(BaseDocument): + archive: InnerArchive = Object(InnerArchive) + provider: InnerProvider = Object(InnerProvider) + capture: InnerCapture = Object(InnerCapture) + serp: InnerSerp = Object(InnerSerp) + snippet: Snippet = Object(Snippet) + snippet_parser: InnerParser | None = Object(InnerParser) + 
warc_before_serp_location: WarcLocation | None = Object(WarcLocation) + warc_before_serp_downloader: InnerDownloader | None = ( + Object(InnerDownloader)) + warc_after_serp_location: WarcLocation | None = Object(WarcLocation) + warc_after_serp_downloader: InnerDownloader | None = ( + Object(InnerDownloader)) + + class Index: + name = "aql_results" + settings = { + "number_of_shards": 20, + "number_of_replicas": 2, + } + + +class InnerProviderId(InnerDocument): + id: str = Keyword() + + +UrlQueryParserType = Literal[ + "query_parameter", + "fragment_parameter", + "path_segment", +] + + +class UrlQueryParser(BaseDocument): + provider: InnerProviderId | None = Object(InnerProviderId) + url_pattern_regex: str | None = Keyword() + priority: float | None = RankFeature(positive_score_impact=True) + parser_type: UrlQueryParserType = Keyword() + parameter: str | None = Keyword() + segment: int | None = Keyword() + remove_pattern_regex: str | None = Keyword() + space_pattern_regex: str | None = Keyword() + + @cached_property + def url_pattern(self) -> Pattern | None: + if self.url_pattern_regex is None: + raise ValueError("No URL pattern regex.") + return pattern(self.url_pattern_regex) + + @cached_property + def remove_pattern(self) -> Pattern | None: + if self.remove_pattern_regex is None: + return None + return pattern(self.remove_pattern_regex) + + @cached_property + def space_pattern(self) -> Pattern | None: + if self.space_pattern_regex is None: + return None + return pattern(self.space_pattern_regex) + + class Index: + name = "aql_url_query_parsers" + settings = { + "number_of_shards": 1, + "number_of_replicas": 2, + } + + +UrlPageParserType = Literal[ + "query_parameter", + "fragment_parameter", + "path_segment", +] + + +class UrlPageParser(BaseDocument): + provider: InnerProviderId | None = Object(InnerProviderId) + url_pattern_regex: str | None = Keyword() + priority: float | None = RankFeature(positive_score_impact=True) + parser_type: UrlPageParserType = 
Keyword() + parameter: str | None = Keyword() + segment: int | None = Keyword() + remove_pattern_regex: str | None = Keyword() + space_pattern_regex: str | None = Keyword() + + @cached_property + def url_pattern(self) -> Pattern | None: + if self.url_pattern_regex is None: + raise ValueError("No URL pattern regex.") + return pattern(self.url_pattern_regex) + + @cached_property + def remove_pattern(self) -> Pattern | None: + if self.remove_pattern_regex is None: + return None + return pattern(self.remove_pattern_regex) + + class Index: + name = "aql_url_page_parsers" + settings = { + "number_of_shards": 1, + "number_of_replicas": 2, + } + + +UrlOffsetParserType = Literal[ + "query_parameter", + "fragment_parameter", + "path_segment", +] + + +class UrlOffsetParser(BaseDocument): + provider: InnerProviderId | None = Object(InnerProviderId) + url_pattern_regex: str | None = Keyword() + priority: float | None = RankFeature(positive_score_impact=True) + parser_type: UrlOffsetParserType = Keyword() + parameter: str | None = Keyword() + segment: int | None = Keyword() + remove_pattern_regex: str | None = Keyword() + space_pattern_regex: str | None = Keyword() + + @cached_property + def url_pattern(self) -> Pattern | None: + if self.url_pattern_regex is None: + raise ValueError("No URL pattern regex.") + return pattern(self.url_pattern_regex) + + @cached_property + def remove_pattern(self) -> Pattern | None: + if self.remove_pattern_regex is None: + return None + return pattern(self.remove_pattern_regex) + + class Index: + name = "aql_url_offset_parsers" + settings = { + "number_of_shards": 1, + "number_of_replicas": 2, + } + + +WarcQueryParserType = Literal[ + "xpath", +] + + +class WarcQueryParser(BaseDocument): + provider: InnerProviderId | None = Object(InnerProviderId) + url_pattern_regex: str | None = Keyword() + priority: float | None = RankFeature(positive_score_impact=True) + parser_type: WarcQueryParserType = Keyword() + xpath: str | None = Keyword() + 
remove_pattern_regex: str | None = Keyword() + space_pattern_regex: str | None = Keyword() + + @cached_property + def url_pattern(self) -> Pattern | None: + if self.url_pattern_regex is None: + raise ValueError("No URL pattern regex.") + return pattern(self.url_pattern_regex) + + @cached_property + def remove_pattern(self) -> Pattern | None: + if self.remove_pattern_regex is None: + return None + return pattern(self.remove_pattern_regex) + + @cached_property + def space_pattern(self) -> Pattern | None: + if self.space_pattern_regex is None: + return None + return pattern(self.space_pattern_regex) + + class Index: + name = "aql_warc_query_parsers" + settings = { + "number_of_shards": 1, + "number_of_replicas": 2, + } + + +WarcSnippetsParserType = Literal[ + "xpath", +] + + +class WarcSnippetsParser(BaseDocument): + provider: InnerProviderId | None = Object(InnerProviderId) + url_pattern_regex: str | None = Keyword() + priority: float | None = RankFeature(positive_score_impact=True) + parser_type: WarcSnippetsParserType = Keyword() + xpath: str | None = Keyword() + url_xpath: str | None = Keyword() + title_xpath: str | None = Keyword() + text_xpath: str | None = Keyword() + + @cached_property + def url_pattern(self) -> Pattern | None: + if self.url_pattern_regex is None: + raise ValueError("No URL pattern regex.") + return pattern(self.url_pattern_regex) + + class Index: + name = "aql_warc_snippets_parsers" + settings = { + "number_of_shards": 1, + "number_of_replicas": 2, + } + + +WarcDirectAnswersParserType = Literal[ + "xpath", +] + + +class WarcDirectAnswersParser(BaseDocument): + provider: InnerProviderId | None = Object(InnerProviderId) + url_pattern_regex: str | None = Keyword() + priority: float | None = RankFeature(positive_score_impact=True) + parser_type: WarcDirectAnswersParserType = Keyword() + xpath: str | None = Keyword() + url_xpath: str | None = Keyword() + text_xpath: str | None = Keyword() + + @cached_property + def url_pattern(self) -> Pattern | 
None: + if self.url_pattern_regex is None: + raise ValueError("No URL pattern regex.") + return pattern(self.url_pattern_regex) + + class Index: + name = "aql_warc_direct_answers_parsers" + settings = { + "number_of_shards": 1, + "number_of_replicas": 2, + } + + +WarcMainContentParserType = Literal[ + "resiliparse", +] + + +class WarcMainContentParser(BaseDocument): + provider: InnerProviderId | None = Object(InnerProviderId) + url_pattern_regex: str | None = Keyword() + priority: float | None = RankFeature(positive_score_impact=True) + parser_type: WarcMainContentParserType = Keyword() + + @cached_property + def url_pattern(self) -> Pattern | None: + if self.url_pattern_regex is None: + raise ValueError("No URL pattern regex.") + return pattern(self.url_pattern_regex) + + class Index: + name = "aql_warc_main_content_parsers" + settings = { + "number_of_shards": 1, + "number_of_replicas": 2, + } diff --git a/archive_query_log/parsers/__init__.py b/archive_query_log/parsers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archive_query_log/parsers/html.py b/archive_query_log/parsers/html.py new file mode 100644 index 00000000..ca8dcd5b --- /dev/null +++ b/archive_query_log/parsers/html.py @@ -0,0 +1,18 @@ +from io import BytesIO +from shutil import copyfileobj +from warnings import warn + +from warcio.recordloader import ArcWarcRecord + + +def read_html_string(record: ArcWarcRecord) -> str | None: + mime_type: str | None = record.http_headers.get_header("Content-Type") + if mime_type is None: + warn(UserWarning("No MIME type given.")) + return None + mime_type = mime_type.split(";", maxsplit=1)[0] + if mime_type != "text/xml": + return None + with BytesIO() as content_buffer: + copyfileobj(record.content_stream(), content_buffer) + return content_buffer.getvalue().decode("utf-8") diff --git a/archive_query_log/parsers/url.py b/archive_query_log/parsers/url.py new file mode 100644 index 00000000..e20a8351 --- /dev/null +++ 
b/archive_query_log/parsers/url.py @@ -0,0 +1,25 @@ +from urllib.parse import parse_qsl, urlsplit, unquote + + +def parse_url_query_parameter( + parameter: str, url: str) -> str | None: + for key, value in parse_qsl(urlsplit(url).query): + if key == parameter: + return value + return None + + +def parse_url_fragment_parameter( + parameter: str, url: str) -> str | None: + for key, value in parse_qsl(urlsplit(url).fragment): + if key == parameter: + return value + return None + + +def parse_url_path_segment(segment: int, url: str) -> str | None: + path_segments = urlsplit(url).path.split("/") + if len(path_segments) <= segment: + return None + path_segment = path_segments[segment] + return unquote(path_segment) diff --git a/archive_query_log/parsers/url_offset.py b/archive_query_log/parsers/url_offset.py new file mode 100644 index 00000000..10cefdb0 --- /dev/null +++ b/archive_query_log/parsers/url_offset.py @@ -0,0 +1,204 @@ +from functools import cache +from itertools import chain +from typing import Iterable, Iterator +from uuid import uuid5 +from warnings import warn + +from click import echo +from elasticsearch_dsl import Search +from elasticsearch_dsl.function import RandomScore +from elasticsearch_dsl.query import FunctionScore, Term, RankFeature, Exists +from tqdm.auto import tqdm + +from archive_query_log.config import Config +from archive_query_log.namespaces import NAMESPACE_URL_OFFSET_PARSER +from archive_query_log.orm import InnerProviderId, UrlOffsetParserType +from archive_query_log.orm import Serp, InnerParser, UrlOffsetParser +from archive_query_log.parsers.url import parse_url_query_parameter, \ + parse_url_fragment_parameter, parse_url_path_segment +from archive_query_log.parsers.util import clean_int +from archive_query_log.utils.es import safe_iter_scan, update_action +from archive_query_log.utils.time import utc_now + + +def add_url_offset_parser( + config: Config, + provider_id: str | None, + url_pattern_regex: str | None, + priority: float | 
None, + parser_type: UrlOffsetParserType, + parameter: str | None, + segment: int | None, + remove_pattern_regex: str | None, + space_pattern_regex: str | None, +) -> None: + if priority is not None and priority <= 0: + raise ValueError("Priority must be strictly positive.") + if parser_type == "query_parameter": + if parameter is None: + raise ValueError("No query parameter given.") + elif parser_type == "fragment_parameter": + if parameter is None: + raise ValueError("No fragment parameter given.") + elif parser_type == "path_segment": + if segment is None: + raise ValueError("No path segment given.") + else: + raise ValueError(f"Invalid parser type: {parser_type}") + parser_id_components = ( + provider_id if provider_id is not None else "", + url_pattern_regex if url_pattern_regex is not None else "", + str(priority) if priority is not None else "", + ) + parser_id = str(uuid5( + NAMESPACE_URL_OFFSET_PARSER, + ":".join(parser_id_components), + )) + parser = UrlOffsetParser( + id=parser_id, + last_modified=utc_now(), + provider=InnerProviderId(id=provider_id) if provider_id else None, + url_pattern_regex=url_pattern_regex, + priority=priority, + parser_type=parser_type, + parameter=parameter, + segment=segment, + remove_pattern_regex=remove_pattern_regex, + space_pattern_regex=space_pattern_regex, + ) + parser.save(using=config.es.client) + + +def _parse_url_offset(parser: UrlOffsetParser, capture_url: str) -> int | None: + # Check if URL matches pattern. + if (parser.url_pattern is not None and + not parser.url_pattern.match(capture_url)): + return None + + # Parse offset. 
+ if parser.parser_type == "query_parameter": + if parser.parameter is None: + raise ValueError("No offset parameter given.") + offset_string = parse_url_query_parameter( + parser.parameter, capture_url) + if offset_string is None: + return None + return clean_int( + text=offset_string, + remove_pattern=parser.remove_pattern, + ) + elif parser.parser_type == "fragment_parameter": + if parser.parameter is None: + raise ValueError("No fragment parameter given.") + offset_string = parse_url_fragment_parameter( + parser.parameter, capture_url) + if offset_string is None: + return None + return clean_int( + text=offset_string, + remove_pattern=parser.remove_pattern, + ) + elif parser.parser_type == "path_segment": + if parser.segment is None: + raise ValueError("No path segment given.") + offset_string = parse_url_path_segment(parser.segment, capture_url) + if offset_string is None: + return None + return clean_int( + text=offset_string, + remove_pattern=parser.remove_pattern, + ) + else: + raise ValueError(f"Unknown parser type: {parser.parser_type}") + + +@cache +def _url_offset_parsers( + config: Config, + provider_id: str, +) -> list[UrlOffsetParser]: + parsers: Iterable[UrlOffsetParser] = ( + UrlOffsetParser.search(using=config.es.client) + .filter( + ~Exists(field="provider.id") | + Term(provider__id=provider_id) + ) + .query(RankFeature(field="priority", saturation={})) + .scan() + ) + parsers = safe_iter_scan(parsers) + return list(parsers) + + +def _parse_serp_url_offset_action( + config: Config, + serp: Serp, +) -> Iterator[dict]: + # Re-check if parsing is necessary. + if (serp.url_offset_parser is not None and + serp.url_offset_parser.should_parse is not None and + not serp.url_offset_parser.should_parse): + return + + for parser in _url_offset_parsers(config, serp.provider.id): + # Try to parse the query. + url_offset = _parse_url_offset(parser, serp.capture.url) + if url_offset is None: + # Parsing was not successful, e.g., URL pattern did not match. 
+ continue + if url_offset > 2147483647: + warn(RuntimeWarning( + f"URL offset {url_offset} parsed from URL {serp.capture.url} " + f"is too large for a signed 32-bit integer." + )) + continue + yield update_action( + serp, + url_offset=url_offset, + url_offset_parser=InnerParser( + id=parser.id, + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + yield update_action( + serp, + url_offset_parser=InnerParser( + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + + +def parse_serps_url_offset(config: Config) -> None: + Serp.index().refresh(using=config.es.client) + changed_serps_search: Search = ( + Serp.search(using=config.es.client) + .filter(~Term(url_offset_parser__should_parse=False)) + .query( + RankFeature(field="archive.priority", saturation={}) | + RankFeature(field="provider.priority", saturation={}) | + FunctionScore(functions=[RandomScore()]) + ) + ) + num_changed_serps = changed_serps_search.count() + if num_changed_serps > 0: + changed_serps: Iterable[Serp] = ( + changed_serps_search + .params(preserve_order=True) + .scan() + ) + changed_serps = safe_iter_scan(changed_serps) + # noinspection PyTypeChecker + changed_serps = tqdm( + changed_serps, total=num_changed_serps, + desc="Parsing URL offset", unit="SERP") + actions = chain.from_iterable( + _parse_serp_url_offset_action(config, serp) + for serp in changed_serps + ) + config.es.bulk(actions) + else: + echo("No new/changed SERPs.") diff --git a/archive_query_log/parsers/url_page.py b/archive_query_log/parsers/url_page.py new file mode 100644 index 00000000..48d67e5b --- /dev/null +++ b/archive_query_log/parsers/url_page.py @@ -0,0 +1,196 @@ +from functools import cache +from itertools import chain +from typing import Iterable, Iterator +from uuid import uuid5 + +from click import echo +from elasticsearch_dsl import Search +from elasticsearch_dsl.function import RandomScore +from elasticsearch_dsl.query import FunctionScore, Term, RankFeature, Exists +from tqdm.auto import 
tqdm + +from archive_query_log.config import Config +from archive_query_log.namespaces import NAMESPACE_URL_PAGE_PARSER +from archive_query_log.orm import InnerProviderId, UrlPageParserType +from archive_query_log.orm import Serp, InnerParser, UrlPageParser +from archive_query_log.parsers.url import parse_url_query_parameter, \ + parse_url_fragment_parameter, parse_url_path_segment +from archive_query_log.parsers.util import clean_int +from archive_query_log.utils.es import safe_iter_scan, update_action +from archive_query_log.utils.time import utc_now + + +def add_url_page_parser( + config: Config, + provider_id: str | None, + url_pattern_regex: str | None, + priority: float | None, + parser_type: UrlPageParserType, + parameter: str | None, + segment: int | None, + remove_pattern_regex: str | None, + space_pattern_regex: str | None, +) -> None: + if priority is not None and priority <= 0: + raise ValueError("Priority must be strictly positive.") + if parser_type == "query_parameter": + if parameter is None: + raise ValueError("No query parameter given.") + elif parser_type == "fragment_parameter": + if parameter is None: + raise ValueError("No fragment parameter given.") + elif parser_type == "path_segment": + if segment is None: + raise ValueError("No path segment given.") + else: + raise ValueError(f"Invalid parser type: {parser_type}") + parser_id_components = ( + provider_id if provider_id is not None else "", + url_pattern_regex if url_pattern_regex is not None else "", + str(priority) if priority is not None else "", + ) + parser_id = str(uuid5( + NAMESPACE_URL_PAGE_PARSER, + ":".join(parser_id_components), + )) + parser = UrlPageParser( + id=parser_id, + last_modified=utc_now(), + provider=InnerProviderId(id=provider_id) if provider_id else None, + url_pattern_regex=url_pattern_regex, + priority=priority, + parser_type=parser_type, + parameter=parameter, + segment=segment, + remove_pattern_regex=remove_pattern_regex, + 
space_pattern_regex=space_pattern_regex, + ) + parser.save(using=config.es.client) + + +def _parse_url_page(parser: UrlPageParser, capture_url: str) -> int | None: + # Check if URL matches pattern. + if (parser.url_pattern is not None and + not parser.url_pattern.match(capture_url)): + return None + + # Parse page. + if parser.parser_type == "query_parameter": + if parser.parameter is None: + raise ValueError("No page parameter given.") + page_string = parse_url_query_parameter(parser.parameter, capture_url) + if page_string is None: + return None + return clean_int( + text=page_string, + remove_pattern=parser.remove_pattern, + ) + elif parser.parser_type == "fragment_parameter": + if parser.parameter is None: + raise ValueError("No fragment parameter given.") + page_string = parse_url_fragment_parameter( + parser.parameter, capture_url) + if page_string is None: + return None + return clean_int( + text=page_string, + remove_pattern=parser.remove_pattern, + ) + elif parser.parser_type == "path_segment": + if parser.segment is None: + raise ValueError("No path segment given.") + page_string = parse_url_path_segment(parser.segment, capture_url) + if page_string is None: + return None + return clean_int( + text=page_string, + remove_pattern=parser.remove_pattern, + ) + else: + raise ValueError(f"Unknown parser type: {parser.parser_type}") + + +@cache +def _url_page_parsers( + config: Config, + provider_id: str, +) -> list[UrlPageParser]: + parsers: Iterable[UrlPageParser] = ( + UrlPageParser.search(using=config.es.client) + .filter( + ~Exists(field="provider.id") | + Term(provider__id=provider_id) + ) + .query(RankFeature(field="priority", saturation={})) + .scan() + ) + parsers = safe_iter_scan(parsers) + return list(parsers) + + +def _parse_serp_url_page_action( + config: Config, + serp: Serp, +) -> Iterator[dict]: + # Re-check if parsing is necessary. 
+ if (serp.url_page_parser is not None and + serp.url_page_parser.should_parse is not None and + not serp.url_page_parser.should_parse): + return + + for parser in _url_page_parsers(config, serp.provider.id): + # Try to parse the query. + url_page = _parse_url_page(parser, serp.capture.url) + if url_page is None: + # Parsing was not successful, e.g., URL pattern did not match. + continue + yield update_action( + serp, + url_page=url_page, + url_page_parser=InnerParser( + id=parser.id, + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + yield update_action( + serp, + url_page_parser=InnerParser( + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + + +def parse_serps_url_page(config: Config) -> None: + Serp.index().refresh(using=config.es.client) + changed_serps_search: Search = ( + Serp.search(using=config.es.client) + .filter(~Term(url_page_parser__should_parse=False)) + .query( + RankFeature(field="archive.priority", saturation={}) | + RankFeature(field="provider.priority", saturation={}) | + FunctionScore(functions=[RandomScore()]) + ) + ) + num_changed_serps = changed_serps_search.count() + if num_changed_serps > 0: + changed_serps: Iterable[Serp] = ( + changed_serps_search + .params(preserve_order=True) + .scan() + ) + changed_serps = safe_iter_scan(changed_serps) + # noinspection PyTypeChecker + changed_serps = tqdm( + changed_serps, total=num_changed_serps, + desc="Parsing URL page", unit="SERP") + actions = chain.from_iterable( + _parse_serp_url_page_action(config, serp) + for serp in changed_serps + ) + config.es.bulk(actions) + else: + echo("No new/changed SERPs.") diff --git a/archive_query_log/parsers/url_query.py b/archive_query_log/parsers/url_query.py new file mode 100644 index 00000000..0fc11cc6 --- /dev/null +++ b/archive_query_log/parsers/url_query.py @@ -0,0 +1,231 @@ +from functools import cache +from itertools import chain +from typing import Iterable, Iterator +from uuid import uuid5 + +from click import echo +from 
elasticsearch_dsl import Search +from elasticsearch_dsl.function import RandomScore +from elasticsearch_dsl.query import FunctionScore, Term, RankFeature, Exists +from tqdm.auto import tqdm + +from archive_query_log.config import Config +from archive_query_log.namespaces import NAMESPACE_URL_QUERY_PARSER +from archive_query_log.orm import Capture, Serp, InnerCapture, InnerParser, \ + UrlQueryParser +from archive_query_log.orm import UrlQueryParserType, InnerProviderId +from archive_query_log.parsers.url import parse_url_query_parameter, \ + parse_url_fragment_parameter, parse_url_path_segment +from archive_query_log.parsers.util import clean_text +from archive_query_log.utils.es import safe_iter_scan, update_action +from archive_query_log.utils.time import utc_now + + +def add_url_query_parser( + config: Config, + provider_id: str | None, + url_pattern_regex: str | None, + priority: float | None, + parser_type: UrlQueryParserType, + parameter: str | None, + segment: int | None, + remove_pattern_regex: str | None, + space_pattern_regex: str | None, +) -> None: + if priority is not None and priority <= 0: + raise ValueError("Priority must be strictly positive.") + if parser_type == "query_parameter": + if parameter is None: + raise ValueError("No query parameter given.") + elif parser_type == "fragment_parameter": + if parameter is None: + raise ValueError("No fragment parameter given.") + elif parser_type == "path_segment": + if segment is None: + raise ValueError("No path segment given.") + else: + raise ValueError(f"Invalid parser type: {parser_type}") + parser_id_components = ( + provider_id if provider_id is not None else "", + url_pattern_regex if url_pattern_regex is not None else "", + str(priority) if priority is not None else "", + ) + parser_id = str(uuid5( + NAMESPACE_URL_QUERY_PARSER, + ":".join(parser_id_components), + )) + parser = UrlQueryParser( + id=parser_id, + last_modified=utc_now(), + provider=InnerProviderId(id=provider_id) if provider_id else 
None, + url_pattern_regex=url_pattern_regex, + priority=priority, + parser_type=parser_type, + parameter=parameter, + segment=segment, + remove_pattern_regex=remove_pattern_regex, + space_pattern_regex=space_pattern_regex, + ) + parser.save(using=config.es.client) + + +def _parse_url_query(parser: UrlQueryParser, capture_url: str) -> str | None: + # Check if URL matches pattern. + if (parser.url_pattern is not None and + not parser.url_pattern.match(capture_url)): + return None + + # Parse query. + if parser.parser_type == "query_parameter": + if parser.parameter is None: + raise ValueError("No query parameter given.") + query = parse_url_query_parameter(parser.parameter, capture_url) + if query is None: + return None + return clean_text( + text=query, + remove_pattern=parser.remove_pattern, + space_pattern=parser.space_pattern, + ) + elif parser.parser_type == "fragment_parameter": + if parser.parameter is None: + raise ValueError("No fragment parameter given.") + query = parse_url_fragment_parameter(parser.parameter, capture_url) + if query is None: + return None + return clean_text( + text=query, + remove_pattern=parser.remove_pattern, + space_pattern=parser.space_pattern, + ) + elif parser.parser_type == "path_segment": + if parser.segment is None: + raise ValueError("No path segment given.") + query = parse_url_path_segment(parser.segment, capture_url) + if query is None: + return None + return clean_text( + text=query, + remove_pattern=parser.remove_pattern, + space_pattern=parser.space_pattern, + ) + else: + raise ValueError(f"Unknown parser type: {parser.parser_type}") + + +@cache +def _url_query_parsers( + config: Config, + provider_id: str, +) -> list[UrlQueryParser]: + parsers: Iterable[UrlQueryParser] = ( + UrlQueryParser.search(using=config.es.client) + .filter( + ~Exists(field="provider.id") | + Term(provider__id=provider_id) + ) + .query(RankFeature(field="priority", saturation={})) + .scan() + ) + parsers = safe_iter_scan(parsers) + return 
list(parsers) + + +def _parse_serp_url_query_action( + config: Config, + capture: Capture, +) -> Iterator[dict]: + # Re-check if parsing is necessary. + if (capture.url_query_parser is not None and + capture.url_query_parser.should_parse is not None and + not capture.url_query_parser.should_parse): + return + + for parser in _url_query_parsers(config, capture.provider.id): + # Try to parse the query. + url_query = _parse_url_query(parser, capture.url) + if url_query is None: + # Parsing was not successful, e.g., URL pattern did not match. + continue + serp = Serp( + id=capture.id, + last_modified=utc_now(), + archive=capture.archive, + provider=capture.provider, + capture=InnerCapture( + id=capture.id, + url=capture.url, + timestamp=capture.timestamp, + status_code=capture.status_code, + digest=capture.digest, + mimetype=capture.mimetype, + ), + url_query=url_query, + url_query_parser=InnerParser( + id=parser.id, + should_parse=False, + last_parsed=utc_now(), + ), + url_page_parser=InnerParser( + should_parse=True, + ), + url_offset_parser=InnerParser( + should_parse=True, + ), + warc_query_parser=InnerParser( + should_parse=True, + ), + warc_snippets_parser=InnerParser( + should_parse=True, + ), + ) + yield serp.to_dict(include_meta=True) + yield update_action( + capture, + url_query_parser=InnerParser( + id=parser.id, + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + yield update_action( + capture, + url_query_parser=InnerParser( + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + + +def parse_serps_url_query(config: Config) -> None: + Capture.index().refresh(using=config.es.client) + changed_captures_search: Search = ( + Capture.search(using=config.es.client) + .filter(~Term(url_query_parser__should_parse=False)) + .query( + RankFeature(field="archive.priority", saturation={}) | + RankFeature(field="provider.priority", saturation={}) | + FunctionScore(functions=[RandomScore()]) + ) + ) + num_changed_captures = 
changed_captures_search.count() + if num_changed_captures > 0: + changed_captures: Iterable[Capture] = ( + changed_captures_search + .params(preserve_order=True) + .scan() + ) + changed_captures = safe_iter_scan(changed_captures) + # noinspection PyTypeChecker + changed_captures = tqdm( + changed_captures, total=num_changed_captures, + desc="Parsing URL query", unit="capture") + actions = chain.from_iterable( + _parse_serp_url_query_action(config, capture) + for capture in changed_captures + ) + config.es.bulk(actions) + else: + echo("No new/changed captures.") diff --git a/archive_query_log/parsers/util.py b/archive_query_log/parsers/util.py new file mode 100644 index 00000000..afbb5ce5 --- /dev/null +++ b/archive_query_log/parsers/util.py @@ -0,0 +1,33 @@ +from re import Pattern +from warnings import warn + + +def clean_text( + text: str, + remove_pattern: Pattern | None, + space_pattern: Pattern | None, +) -> str | None: + if remove_pattern is not None: + text = remove_pattern.sub("", text) + if space_pattern is not None: + text = space_pattern.sub(" ", text) + text = text.strip() + text = " ".join(text.split()) + if text == "": + return None + return text + + +def clean_int( + text: str, + remove_pattern: Pattern | None, +) -> int | None: + if remove_pattern is not None: + text = remove_pattern.sub("", text) + text = text.strip() + try: + parsed = int(text) + except ValueError: + warn(RuntimeWarning(f"Could not parse int: {text}")) + return None + return parsed diff --git a/archive_query_log/parsers/warc.py b/archive_query_log/parsers/warc.py new file mode 100644 index 00000000..676ae8cb --- /dev/null +++ b/archive_query_log/parsers/warc.py @@ -0,0 +1,20 @@ +from contextlib import contextmanager +from typing import Iterator + +from warc_s3 import WarcS3Store, WarcS3Location +from warcio.recordloader import ArcWarcRecord + +from archive_query_log.orm import WarcLocation + + +@contextmanager +def open_warc( + warc_store: WarcS3Store, + warc_location: 
WarcLocation, +) -> Iterator[ArcWarcRecord]: + with warc_store.read(WarcS3Location( + key=warc_location.file, + offset=warc_location.offset, + length=warc_location.length, + )) as record: + yield record diff --git a/archive_query_log/parsers/warc_direct_answers.py b/archive_query_log/parsers/warc_direct_answers.py new file mode 100644 index 00000000..b50d995b --- /dev/null +++ b/archive_query_log/parsers/warc_direct_answers.py @@ -0,0 +1,263 @@ +from functools import cache +from itertools import chain +from typing import Iterable, Iterator +from urllib.parse import urljoin +from uuid import uuid5 + +from click import echo +from elasticsearch_dsl import Search +from elasticsearch_dsl.function import RandomScore +from elasticsearch_dsl.query import FunctionScore, Term, RankFeature, Exists +# noinspection PyProtectedMember +from lxml.etree import _Element, tostring # nosec: B410 +from tqdm.auto import tqdm +from warc_s3 import WarcS3Store + +from archive_query_log.config import Config +from archive_query_log.namespaces import NAMESPACE_WARC_DIRECT_ANSWERS_PARSER, \ + NAMESPACE_RESULT +from archive_query_log.orm import Serp, InnerParser, InnerProviderId, \ + WarcDirectAnswersParserType, WarcDirectAnswersParser, WarcLocation, DirectAnswer, \ + Result, InnerSerp, DirectAnswerId, InnerDownloader +from archive_query_log.parsers.warc import open_warc +from archive_query_log.parsers.xml import parse_xml_tree, safe_xpath +from archive_query_log.utils.es import safe_iter_scan, update_action +from archive_query_log.utils.time import utc_now + + +def add_warc_direct_answers_parser( + config: Config, + provider_id: str | None, + url_pattern_regex: str | None, + priority: float | None, + parser_type: WarcDirectAnswersParserType, + xpath: str | None, + url_xpath: str | None, + text_xpath: str | None, +) -> None: + if priority is not None and priority <= 0: + raise ValueError("Priority must be strictly positive.") + if parser_type == "xpath": + if xpath is None: + raise 
ValueError("No XPath given.") + else: + raise ValueError(f"Invalid parser type: {parser_type}") + parser_id_components = ( + provider_id if provider_id is not None else "", + url_pattern_regex if url_pattern_regex is not None else "", + str(priority) if priority is not None else "", + ) + parser_id = str(uuid5( + NAMESPACE_WARC_DIRECT_ANSWERS_PARSER, + ":".join(parser_id_components), + )) + parser = WarcDirectAnswersParser( + id=parser_id, + last_modified=utc_now(), + provider=InnerProviderId(id=provider_id) if provider_id else None, + url_pattern_regex=url_pattern_regex, + priority=priority, + parser_type=parser_type, + xpath=xpath, + url_xpath=url_xpath, + text_xpath=text_xpath, + ) + parser.save(using=config.es.client) + + +def _parse_warc_direct_answers( + parser: WarcDirectAnswersParser, + serp_id: str, + capture_url: str, + warc_store: WarcS3Store, + warc_location: WarcLocation, +) -> list[DirectAnswer] | None: + # Check if URL matches pattern. + if (parser.url_pattern is not None and + not parser.url_pattern.match(capture_url)): + return None + + # Parse direct answers. 
+ if parser.parser_type == "xpath": + if parser.xpath is None: + raise ValueError("No XPath given.") + with open_warc(warc_store, warc_location) as record: + tree = parse_xml_tree(record) + if tree is None: + return None + + elements = safe_xpath(tree, parser.xpath, _Element) + if len(elements) == 0: + return None + + direct_answers = [] + element: _Element + for i, element in enumerate(elements): + url: str | None = None + if parser.url_xpath is not None: + urls = safe_xpath(element, parser.url_xpath, str) + if len(urls) > 0: + url = urls[0].strip() + url = urljoin(capture_url, url) + text: str | None = None + if parser.text_xpath is not None: + texts = safe_xpath(element, parser.text_xpath, str) + if len(texts) > 0: + text = texts[0].strip() + + content: str = tostring( + element, + encoding=str, + method="xml", + pretty_print=False, + with_tail=True, + ) + direct_answer_id_components = ( + serp_id, + parser.id, + str(hash(content)), + str(i), + ) + direct_answer_id = str(uuid5( + NAMESPACE_RESULT, + ":".join(direct_answer_id_components), + )) + direct_answers.append(DirectAnswer( + id=direct_answer_id, + content=content, + url=url, + text=text, + )) + return direct_answers + else: + raise ValueError(f"Unknown parser type: {parser.parser_type}") + + +@cache +def _warc_direct_answers_parsers( + config: Config, + provider_id: str, +) -> list[WarcDirectAnswersParser]: + parsers: Iterable[WarcDirectAnswersParser] = ( + WarcDirectAnswersParser.search(using=config.es.client) + .filter( + ~Exists(field="provider.id") | + Term(provider__id=provider_id) + ) + .query(RankFeature(field="priority", saturation={})) + .scan() + ) + parsers = safe_iter_scan(parsers) + return list(parsers) + + +def _parse_serp_warc_direct_answers_action( + config: Config, + serp: Serp, +) -> Iterator[dict]: + # Re-check if it can be parsed. 
+ if (serp.warc_location is None or + serp.warc_location.file is None or + serp.warc_location.offset is None or + serp.warc_location.length is None): + return + + # Re-check if parsing is necessary. + if (serp.warc_direct_answers_parser is not None and + serp.warc_direct_answers_parser.should_parse is not None and + not serp.warc_direct_answers_parser.should_parse): + return + + for parser in _warc_direct_answers_parsers(config, serp.provider.id): + # Try to parse the direct answers. + warc_direct_answers = _parse_warc_direct_answers( + parser=parser, + serp_id=serp.id, + capture_url=serp.capture.url, + warc_store=config.s3.warc_store, + warc_location=serp.warc_location, + ) + if warc_direct_answers is None: + # Parsing was not successful, e.g., URL pattern did not match. + continue + for direct_answer in warc_direct_answers: + yield Result( + id=direct_answer.id, + last_modified=utc_now(), + archive=serp.archive, + provider=serp.provider, + capture=serp.capture, + serp=InnerSerp( + id=serp.id, + ).to_dict(), + direct_answer=direct_answer, + direct_answer_parser=InnerParser( + id=parser.id, + should_parse=False, + last_parsed=utc_now(), + ).to_dict(), + warc_before_serp_downloader=InnerDownloader( + should_download=True, + ).to_dict(), + warc_after_serp_downloader=InnerDownloader( + should_download=True, + ).to_dict(), + ).to_dict(include_meta=True) + yield update_action( + serp, + warc_direct_answers=[ + DirectAnswerId( + id=direct_answer.id, + ) + for direct_answer in warc_direct_answers + ], + warc_direct_answers_parser=InnerParser( + id=parser.id, + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + yield update_action( + serp, + warc_direct_answers_parser=InnerParser( + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + + +def parse_serps_warc_direct_answers(config: Config) -> None: + Serp.index().refresh(using=config.es.client) + changed_serps_search: Search = ( + Serp.search(using=config.es.client) + .filter( + 
Exists(field="warc_location") & + ~Term(warc_direct_answers_parser__should_parse=False) + ) + .query( + RankFeature(field="archive.priority", saturation={}) | + RankFeature(field="provider.priority", saturation={}) | + FunctionScore(functions=[RandomScore()]) + ) + ) + num_changed_serps = changed_serps_search.count() + if num_changed_serps > 0: + changed_serps: Iterable[Serp] = ( + changed_serps_search + .params(preserve_order=True) + .scan() + ) + changed_serps = safe_iter_scan(changed_serps) + # noinspection PyTypeChecker + changed_serps = tqdm( + changed_serps, total=num_changed_serps, + desc="Parsing WARC direct answers", unit="SERP") + actions = chain.from_iterable( + _parse_serp_warc_direct_answers_action(config, serp) + for serp in changed_serps + ) + config.es.bulk(actions) + else: + echo("No new/changed SERPs.") \ No newline at end of file diff --git a/archive_query_log/parsers/warc_query.py b/archive_query_log/parsers/warc_query.py new file mode 100644 index 00000000..dcdefc53 --- /dev/null +++ b/archive_query_log/parsers/warc_query.py @@ -0,0 +1,192 @@ +from functools import cache +from itertools import chain +from typing import Iterable, Iterator +from uuid import uuid5 + +from click import echo +from elasticsearch_dsl import Search +from elasticsearch_dsl.function import RandomScore +from elasticsearch_dsl.query import FunctionScore, Term, RankFeature, Exists +from tqdm.auto import tqdm +from warc_s3 import WarcS3Store + +from archive_query_log.config import Config +from archive_query_log.namespaces import NAMESPACE_WARC_QUERY_PARSER +from archive_query_log.orm import Serp, InnerParser, InnerProviderId, \ + WarcQueryParserType, WarcQueryParser, WarcLocation +from archive_query_log.parsers.util import clean_text +from archive_query_log.parsers.warc import open_warc +from archive_query_log.parsers.xml import parse_xml_tree, safe_xpath +from archive_query_log.utils.es import safe_iter_scan, update_action +from archive_query_log.utils.time import utc_now + 
+ +def add_warc_query_parser( + config: Config, + provider_id: str | None, + url_pattern_regex: str | None, + priority: float | None, + parser_type: WarcQueryParserType, + xpath: str | None, + remove_pattern_regex: str | None, + space_pattern_regex: str | None, +) -> None: + if priority is not None and priority <= 0: + raise ValueError("Priority must be strictly positive.") + if parser_type == "xpath": + if xpath is None: + raise ValueError("No XPath given.") + else: + raise ValueError(f"Invalid parser type: {parser_type}") + parser_id_components = ( + provider_id if provider_id is not None else "", + url_pattern_regex if url_pattern_regex is not None else "", + str(priority) if priority is not None else "", + ) + parser_id = str(uuid5( + NAMESPACE_WARC_QUERY_PARSER, + ":".join(parser_id_components), + )) + parser = WarcQueryParser( + id=parser_id, + last_modified=utc_now(), + provider=InnerProviderId(id=provider_id) if provider_id else None, + url_pattern_regex=url_pattern_regex, + priority=priority, + parser_type=parser_type, + xpath=xpath, + remove_pattern_regex=remove_pattern_regex, + space_pattern_regex=space_pattern_regex, + ) + parser.save(using=config.es.client) + + +def _parse_warc_query( + parser: WarcQueryParser, + capture_url: str, + warc_store: WarcS3Store, + warc_location: WarcLocation, +) -> str | None: + # Check if URL matches pattern. + if (parser.url_pattern is not None and + not parser.url_pattern.match(capture_url)): + return None + + # Parse query. 
+ if parser.parser_type == "xpath": + if parser.xpath is None: + raise ValueError("No XPath given.") + with open_warc(warc_store, warc_location) as record: + tree = parse_xml_tree(record) + if tree is None: + return None + + queries = safe_xpath(tree, parser.xpath, str) + for query in queries: + query_cleaned = clean_text( + text=query, + remove_pattern=parser.remove_pattern, + space_pattern=parser.space_pattern, + ) + if query_cleaned is not None: + return query_cleaned + return None + else: + raise ValueError(f"Unknown parser type: {parser.parser_type}") + + +@cache +def _warc_query_parsers( + config: Config, + provider_id: str, +) -> list[WarcQueryParser]: + parsers: Iterable[WarcQueryParser] = ( + WarcQueryParser.search(using=config.es.client) + .filter( + ~Exists(field="provider.id") | + Term(provider__id=provider_id) + ) + .query(RankFeature(field="priority", saturation={})) + .scan() + ) + parsers = safe_iter_scan(parsers) + return list(parsers) + + +def _parse_serp_warc_query_action( + config: Config, + serp: Serp, +) -> Iterator[dict]: + # Re-check if it can be parsed. + if (serp.warc_location is None or + serp.warc_location.file is None or + serp.warc_location.offset is None or + serp.warc_location.length is None): + return + + # Re-check if parsing is necessary. + if (serp.warc_query_parser is not None and + serp.warc_query_parser.should_parse is not None and + not serp.warc_query_parser.should_parse): + return + + for parser in _warc_query_parsers(config, serp.provider.id): + # Try to parse the query. + warc_query = _parse_warc_query( + parser, serp.capture.url, config.s3.warc_store, serp.warc_location) + if warc_query is None: + # Parsing was not successful, e.g., URL pattern did not match. 
+ continue + yield update_action( + serp, + warc_query=warc_query, + warc_query_parser=InnerParser( + id=parser.id, + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + yield update_action( + serp, + warc_query_parser=InnerParser( + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + + +def parse_serps_warc_query(config: Config) -> None: + Serp.index().refresh(using=config.es.client) + changed_serps_search: Search = ( + Serp.search(using=config.es.client) + .filter( + Exists(field="warc_location") & + ~Term(warc_query_parser__should_parse=False) + ) + .query( + RankFeature(field="archive.priority", saturation={}) | + RankFeature(field="provider.priority", saturation={}) | + FunctionScore(functions=[RandomScore()]) + ) + ) + num_changed_serps = changed_serps_search.count() + if num_changed_serps > 0: + changed_serps: Iterable[Serp] = ( + changed_serps_search + .params(preserve_order=True) + .scan() + ) + changed_serps = safe_iter_scan(changed_serps) + # noinspection PyTypeChecker + changed_serps = tqdm( + changed_serps, total=num_changed_serps, + desc="Parsing WARC query", unit="SERP") + actions = chain.from_iterable( + _parse_serp_warc_query_action(config, serp) + for serp in changed_serps + ) + config.es.bulk(actions) + else: + echo("No new/changed SERPs.") diff --git a/archive_query_log/parsers/warc_snippets.py b/archive_query_log/parsers/warc_snippets.py new file mode 100644 index 00000000..4ef9e5e1 --- /dev/null +++ b/archive_query_log/parsers/warc_snippets.py @@ -0,0 +1,273 @@ +from functools import cache +from itertools import chain +from typing import Iterable, Iterator +from urllib.parse import urljoin +from uuid import uuid5 + +from click import echo +from elasticsearch_dsl import Search +from elasticsearch_dsl.function import RandomScore +from elasticsearch_dsl.query import FunctionScore, Term, RankFeature, Exists +# noinspection PyProtectedMember +from lxml.etree import _Element, tostring # nosec: B410 +from tqdm.auto import 
tqdm +from warc_s3 import WarcS3Store + +from archive_query_log.config import Config +from archive_query_log.namespaces import NAMESPACE_WARC_SNIPPETS_PARSER, \ + NAMESPACE_RESULT +from archive_query_log.orm import Serp, InnerParser, InnerProviderId, \ + WarcSnippetsParserType, WarcSnippetsParser, WarcLocation, Snippet, \ + Result, InnerSerp, SnippetId, InnerDownloader +from archive_query_log.parsers.warc import open_warc +from archive_query_log.parsers.xml import parse_xml_tree, safe_xpath +from archive_query_log.utils.es import safe_iter_scan, update_action +from archive_query_log.utils.time import utc_now + + +def add_warc_snippets_parser( + config: Config, + provider_id: str | None, + url_pattern_regex: str | None, + priority: float | None, + parser_type: WarcSnippetsParserType, + xpath: str | None, + url_xpath: str | None, + title_xpath: str | None, + text_xpath: str | None, +) -> None: + if priority is not None and priority <= 0: + raise ValueError("Priority must be strictly positive.") + if parser_type == "xpath": + if xpath is None: + raise ValueError("No XPath given.") + else: + raise ValueError(f"Invalid parser type: {parser_type}") + parser_id_components = ( + provider_id if provider_id is not None else "", + url_pattern_regex if url_pattern_regex is not None else "", + str(priority) if priority is not None else "", + ) + parser_id = str(uuid5( + NAMESPACE_WARC_SNIPPETS_PARSER, + ":".join(parser_id_components), + )) + parser = WarcSnippetsParser( + id=parser_id, + last_modified=utc_now(), + provider=InnerProviderId(id=provider_id) if provider_id else None, + url_pattern_regex=url_pattern_regex, + priority=priority, + parser_type=parser_type, + xpath=xpath, + url_xpath=url_xpath, + title_xpath=title_xpath, + text_xpath=text_xpath, + ) + parser.save(using=config.es.client) + + +def _parse_warc_snippets( + parser: WarcSnippetsParser, + serp_id: str, + capture_url: str, + warc_store: WarcS3Store, + warc_location: WarcLocation, +) -> list[Snippet] | None: + # 
Check if URL matches pattern. + if (parser.url_pattern is not None and + not parser.url_pattern.match(capture_url)): + return None + + # Parse snippets. + if parser.parser_type == "xpath": + if parser.xpath is None: + raise ValueError("No XPath given.") + with open_warc(warc_store, warc_location) as record: + tree = parse_xml_tree(record) + if tree is None: + return None + + elements = safe_xpath(tree, parser.xpath, _Element) + if len(elements) == 0: + return None + + snippets = [] + element: _Element + for i, element in enumerate(elements): + url: str | None = None + if parser.url_xpath is not None: + urls = safe_xpath(element, parser.url_xpath, str) + if len(urls) > 0: + url = urls[0].strip() + url = urljoin(capture_url, url) + title: str | None = None + if parser.title_xpath is not None: + titles = safe_xpath(element, parser.title_xpath, str) + if len(titles) > 0: + title = titles[0].strip() + text: str | None = None + if parser.text_xpath is not None: + texts = safe_xpath(element, parser.text_xpath, str) + if len(texts) > 0: + text = texts[0].strip() + + content: str = tostring( + element, + encoding=str, + method="xml", + pretty_print=False, + with_tail=True, + ) + snippet_id_components = ( + serp_id, + parser.id, + str(hash(content)), + str(i), + ) + snippet_id = str(uuid5( + NAMESPACE_RESULT, + ":".join(snippet_id_components), + )) + snippets.append(Snippet( + id=snippet_id, + rank=i, + content=content, + url=url, + title=title, + text=text, + )) + return snippets + else: + raise ValueError(f"Unknown parser type: {parser.parser_type}") + + +@cache +def _warc_snippets_parsers( + config: Config, + provider_id: str, +) -> list[WarcSnippetsParser]: + parsers: Iterable[WarcSnippetsParser] = ( + WarcSnippetsParser.search(using=config.es.client) + .filter( + ~Exists(field="provider.id") | + Term(provider__id=provider_id) + ) + .query(RankFeature(field="priority", saturation={})) + .scan() + ) + parsers = safe_iter_scan(parsers) + return list(parsers) + + +def 
_parse_serp_warc_snippets_action( + config: Config, + serp: Serp, +) -> Iterator[dict]: + # Re-check if it can be parsed. + if (serp.warc_location is None or + serp.warc_location.file is None or + serp.warc_location.offset is None or + serp.warc_location.length is None): + return + + # Re-check if parsing is necessary. + if (serp.warc_snippets_parser is not None and + serp.warc_snippets_parser.should_parse is not None and + not serp.warc_snippets_parser.should_parse): + return + + for parser in _warc_snippets_parsers(config, serp.provider.id): + # Try to parse the snippets. + warc_snippets = _parse_warc_snippets( + parser=parser, + serp_id=serp.id, + capture_url=serp.capture.url, + warc_store=config.s3.warc_store, + warc_location=serp.warc_location, + ) + if warc_snippets is None: + # Parsing was not successful, e.g., URL pattern did not match. + continue + for snippet in warc_snippets: + yield Result( + id=snippet.id, + last_modified=utc_now(), + archive=serp.archive, + provider=serp.provider, + capture=serp.capture, + serp=InnerSerp( + id=serp.id, + ).to_dict(), + snippet=snippet, + snippet_parser=InnerParser( + id=parser.id, + should_parse=False, + last_parsed=utc_now(), + ).to_dict(), + warc_before_serp_downloader=InnerDownloader( + should_download=True, + ).to_dict(), + warc_after_serp_downloader=InnerDownloader( + should_download=True, + ).to_dict(), + ).to_dict(include_meta=True) + yield update_action( + serp, + warc_snippets=[ + SnippetId( + id=snippet.id, + rank=snippet.rank, + ) + for snippet in warc_snippets + ], + warc_snippets_parser=InnerParser( + id=parser.id, + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + yield update_action( + serp, + warc_snippets_parser=InnerParser( + should_parse=False, + last_parsed=utc_now(), + ), + ) + return + + +def parse_serps_warc_snippets(config: Config) -> None: + Serp.index().refresh(using=config.es.client) + changed_serps_search: Search = ( + Serp.search(using=config.es.client) + .filter( + 
Exists(field="warc_location") & + ~Term(warc_snippets_parser__should_parse=False) + ) + .query( + RankFeature(field="archive.priority", saturation={}) | + RankFeature(field="provider.priority", saturation={}) | + FunctionScore(functions=[RandomScore()]) + ) + ) + num_changed_serps = changed_serps_search.count() + if num_changed_serps > 0: + changed_serps: Iterable[Serp] = ( + changed_serps_search + .params(preserve_order=True) + .scan() + ) + changed_serps = safe_iter_scan(changed_serps) + # noinspection PyTypeChecker + changed_serps = tqdm( + changed_serps, total=num_changed_serps, + desc="Parsing WARC snippets", unit="SERP") + actions = chain.from_iterable( + _parse_serp_warc_snippets_action(config, serp) + for serp in changed_serps + ) + config.es.bulk(actions) + else: + echo("No new/changed SERPs.") diff --git a/archive_query_log/parsers/xml.py b/archive_query_log/parsers/xml.py new file mode 100644 index 00000000..36af76e7 --- /dev/null +++ b/archive_query_log/parsers/xml.py @@ -0,0 +1,90 @@ +from typing import Literal, Type, TypeVar, Iterable +from warnings import warn + +from cssselect import GenericTranslator +from cssselect.parser import parse as cssselect_parse +from lxml.etree import parse as etree_parse, XMLParser, HTMLParser # nosec: B410 +# noinspection PyProtectedMember +from lxml.etree import _ElementTree, _Element # nosec: B410 +from warcio.recordloader import ArcWarcRecord + +XmlParserType = Literal[ + "xml", + "html", +] + + +def parse_xml_tree(record: ArcWarcRecord) -> _ElementTree | None: + mime_type: str | None = record.http_headers.get_header("Content-Type") + if mime_type is None: + warn(UserWarning("No MIME type given.")) + return None + mime_type = mime_type.split(";", maxsplit=1)[0] + parser: XMLParser | HTMLParser + if mime_type == "text/xml": + parser = XMLParser() + elif mime_type == "text/html": + parser = HTMLParser() + else: + warn(UserWarning(f"Cannot find XML parser for MIME type: {mime_type}")) + return None + return etree_parse( 
# nosec: B320 + source=record.content_stream(), + parser=parser, + ) + + +_T = TypeVar("_T") + + +def safe_xpath( + tree: _ElementTree | _Element, + xpath: str, + item_type: Type[_T], +) -> list[_T]: + results = tree.xpath(xpath, smart_strings=False) + if not isinstance(results, list): + results = [results] + if not all(isinstance(result, item_type) for result in results): + types = ", ".join({str(type(result)) for result in results}) + raise ValueError( + f"All results of the XPath '{xpath}' results " + f"must be of type {item_type}, found: {types}") + return results + + +_translator = GenericTranslator() + + +def xpaths_from_css_selector(css_selector: str) -> list[str]: + if css_selector == ":--self": + return ["."] + selectors = cssselect_parse(css_selector) + return [ + _translator.selector_to_xpath( + selector, + prefix="", + translate_pseudo_elements=True, + ).replace("/descendant-or-self::*/", "//") + for selector in selectors + ] + + +def merge_xpaths(xpaths: Iterable[str]) -> str: + return " | ".join(xpaths) + + +def text_xpath( + xpath: str, + attribute: str | None = None, + text: bool = False, +) -> str: + if attribute is None and not text: + raise ValueError("Either an attribute or text=True must be given.") + if attribute is not None and text: + raise ValueError( + "An attribute and text=True are not allowed at the same time.") + if text: + return f"{xpath}//text()" + else: + return f"{xpath}/@{attribute}" diff --git a/archive_query_log/providers/__init__.py b/archive_query_log/providers/__init__.py new file mode 100644 index 00000000..f01a546a --- /dev/null +++ b/archive_query_log/providers/__init__.py @@ -0,0 +1,102 @@ +from uuid import uuid4 + +from click import echo, prompt +from elasticsearch_dsl import Search +from elasticsearch_dsl.query import Terms +from elasticsearch_dsl.response import Response + +from archive_query_log.config import Config +from archive_query_log.orm import Provider +from archive_query_log.utils.time import utc_now + + +def 
add_provider( + config: Config, + name: str | None, + description: str | None, + notes: str | None, + exclusion_reason: str | None, + domains: set[str], + url_path_prefixes: set[str], + priority: float | None, + no_merge: bool = False, + auto_merge: bool = False, +) -> None: + if priority is not None and priority <= 0: + raise ValueError("Priority must be strictly positive.") + Provider.index().refresh(using=config.es.client) + last_modified = utc_now() + should_build_sources = True + existing_provider_search: Search = ( + Provider.search(using=config.es.client) + .query(Terms(domains=list(domains))) + ) + existing_provider_response: Response = existing_provider_search.execute() + if existing_provider_response.hits.total.value > 0: + if no_merge: + return + existing_provider: Provider = existing_provider_response[0] + existing_domains = set(existing_provider.domains) + existing_url_path_prefixes = set( + existing_provider.url_path_prefixes) + provider_id = existing_provider.id + if auto_merge: + should_merge = True + else: + intersecting_domains = existing_domains & domains + first_intersecting_domains = sorted(intersecting_domains)[:5] + intersecting_domains_text = ", ".join(first_intersecting_domains) + num_more_intersecting_domains = (len(intersecting_domains) - + len(first_intersecting_domains)) + if num_more_intersecting_domains > 0: + intersecting_domains_text += \ + f" (+{num_more_intersecting_domains} more)" + echo(f"Provider {provider_id} already exists with " + f"conflicting domains: {intersecting_domains_text}") + add_to_existing = prompt("Merge with existing provider? 
" + "[y/N]", type=str, default="n", + show_default=False) + should_merge = add_to_existing.lower() == "y" + if not should_merge: + return + + if name is None: + name = existing_provider.name + if description is None: + description = existing_provider.description + if notes is None: + notes = existing_provider.notes + if exclusion_reason is None: + exclusion_reason = existing_provider.exclusion_reason + if priority is None: + priority = existing_provider.priority + + if (domains | existing_domains == existing_domains and + url_path_prefixes | existing_url_path_prefixes == + existing_url_path_prefixes): + last_modified = existing_provider.last_modified + should_build_sources = existing_provider.should_build_sources + + domains = domains | existing_domains + url_path_prefixes = url_path_prefixes | existing_url_path_prefixes + + if not auto_merge: + echo(f"Update provider {provider_id}.") + else: + provider_id = str(uuid4()) + if not no_merge and not auto_merge: + echo(f"Add new provider {provider_id}.") + + provider = Provider( + id=provider_id, + last_modified=last_modified, + name=name, + description=description, + notes=notes, + exclusion_reason=exclusion_reason, + domains=list(domains), + url_path_prefixes=list(url_path_prefixes), + priority=priority, + should_build_sources=should_build_sources, + ) + provider.save(using=config.es.client) diff --git a/archive_query_log/queries/iterable.py b/archive_query_log/queries/iterable.py deleted file mode 100644 index df582ef3..00000000 --- a/archive_query_log/queries/iterable.py +++ /dev/null @@ -1,44 +0,0 @@ -from dataclasses import dataclass -from gzip import GzipFile -from io import TextIOWrapper -from pathlib import Path -from typing import Sized, Iterable, Iterator, IO - -from archive_query_log.model import ArchivedQueryUrl -from archive_query_log.util.text import count_lines - - -@dataclass(frozen=True) -class ArchivedQueryUrls(Sized, Iterable[ArchivedQueryUrl]): - """ - Read archived query URLs from a JSONL file. 
- """ - - path: Path - """ - Path where the query URLs are stored in JSONL format. - """ - - def __post_init__(self): - self._check_urls_path() - - def _check_urls_path(self): - if not self.path.exists() or not self.path.is_file(): - raise ValueError( - f"URLs path must be a file: {self.path}" - ) - - def __len__(self) -> int: - with self.path.open("rb") as file: - with GzipFile(fileobj=file, mode="rb") as gzip_file: - gzip_file: IO[bytes] - return count_lines(gzip_file) - - def __iter__(self) -> Iterator[ArchivedQueryUrl]: - schema = ArchivedQueryUrl.schema() - with self.path.open("rb") as file: - with GzipFile(fileobj=file, mode="rb") as gzip_file: - gzip_file: IO[bytes] - with TextIOWrapper(gzip_file) as text_file: - for line in text_file: - yield schema.loads(line) diff --git a/archive_query_log/results/test/test_manual_facebook_serp_parsing.py b/archive_query_log/results/test/test_manual_facebook_serp_parsing.py deleted file mode 100644 index 4a65af96..00000000 --- a/archive_query_log/results/test/test_manual_facebook_serp_parsing.py +++ /dev/null @@ -1,72 +0,0 @@ -# flake8: noqa -from archive_query_log.results.test.test_utils import verify_serp_parsing - - -def test_jam_of_the_day_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20140917021101id_/https://www.facebook.com/search.php?q=%22Jam+of+the+Day&init=quick&tas=0.8517628074453497&search_first_focus=1302687872720', - 'facebook' - ) - - -def test_victoria_pynchon_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20110110152620id_/http://www.facebook.com/search.php?q=%22Victoria+Pynchon%22&init=q', - 'facebook' - ) - - -def test_anthony_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20130817124019id_/http://www.facebook.com/search.php?init=srp&sfxp&q=ANTHONY', - 'facebook' - ) - - -def test_7_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20210506042113id_/http://www.facebook.com/search.php?q=7', - 'facebook' - ) - - -def test_aj_duca_search(): 
- verify_serp_parsing( - 'https://web.archive.org/web/20131226205922id_/http://www.facebook.com/search.php?q=AJ+Duca', - 'facebook' - ) - - -def test_noam_chomsky_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20210125134751id_/http://www.facebook.com/search.php?q=3DNoam%20Chomsky&init=3Dquick=', - 'facebook' - ) - - -def test_taylor_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20210126125305id_/http://www.facebook.com/search.php?q=3Dtaylor+company&am=', - 'facebook' - ) - - -def test_5_orsz_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20210410211649id_/http://www.facebook.com/search.php?q=5+orsz%C3%A1gos', - 'facebook' - ) - - -def test_1_million_cards_orsz_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20210304074906id_/http://www.facebook.com/search.php?q=1+million+cards&init=quick&tas=0.7974472279549098', - 'facebook' - ) - - -def test_abvie_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20210610122030id_/http://www.facebook.com/search.php?q=Abbvie&type=users&init=srp', - 'facebook' - ) diff --git a/archive_query_log/results/test/test_manual_youtube_serp_parsing.py b/archive_query_log/results/test/test_manual_youtube_serp_parsing.py deleted file mode 100644 index 957d07aa..00000000 --- a/archive_query_log/results/test/test_manual_youtube_serp_parsing.py +++ /dev/null @@ -1,9 +0,0 @@ -# flake8: noqa -from archive_query_log.results.test.test_utils import verify_serp_parsing - - -def test_chaoz_time_search(): - verify_serp_parsing( - 'https://web.archive.org/web/20220510040811id_/https://www.youtube.com/results?search_query=%21%21%21Chaoz+time%21%21%21', - 'youtube' - ) diff --git a/archive_query_log/schema.py b/archive_query_log/schema.py deleted file mode 100644 index 353aa034..00000000 --- a/archive_query_log/schema.py +++ /dev/null @@ -1,706 +0,0 @@ -from pyarrow import field, schema, string, timestamp, uint16, uint8, \ - dictionary, int8, list_, struct, uint32 -from 
pyarrow.dataset import partitioning - -SERP_SCHEMA = schema( - fields=[ - field( - "serp_id", - string(), - False, - metadata={ - "description": - "Unique SERP ID (based on a hash of the URL and timestamp " - "of the SERP).", - }, - ), - field( - "serp_url", - string(), - False, - metadata={ - "description": "Full URL of the SERP.", - }, - ), - field( - "serp_domain", - string(), - False, - metadata={ - "description": "Domain of the SERP URL.", - }, - ), - field( - "serp_domain_public_suffix", - string(), - False, - metadata={ - "description": - "Public suffix (https://publicsuffix.org/) of the SERP " - "domain.", - }, - ), - field( - "serp_timestamp", - timestamp("s"), - False, - metadata={ - "description": - "Timestamp of the archived snapshot in the Wayback " - "Machine.", - }, - ), - field( - "serp_year", - uint16(), - False, - metadata={ - "description": - "Year of the archived snapshot in the Wayback Machine.", - }, - ), - field( - "serp_month", - uint8(), - False, - metadata={ - "description": - "Month of the archived snapshot in the Wayback Machine.", - }, - ), - field( - "serp_wayback_url", - string(), - False, - metadata={ - "description": - "URL of the archived snapshot's contents in the Wayback " - "Machine.", - }, - ), - field( - "serp_wayback_raw_url", - string(), - False, - metadata={ - "description": - "URL of the archived snapshot's raw contents in the " - "Wayback Machine.", - }, - ), - field( - "serp_page", - uint8(), - True, - metadata={ - "description": - "SERP page number as parsed from the URL, e.g., 1, 2, " - "3 (zero-indexed).", - }, - ), - field( - "serp_offset", - uint16(), - True, - metadata={ - "description": - "SERP results offset (start position) as parsed from the " - "URL, e.g., 10, 20 (zero-indexed).", - }, - ), - field( - "serp_query_text_url", - string(), - True, - metadata={ - "description": "The SERP's query as parsed from the URL.", - }, - ), - field( - "serp_query_text_url_language", - dictionary(int8(), string()), - True, - 
metadata={ - "description": - "Language identified in the query as parsed from the URL. " - "(Google's cld3; min threshold for 'hr' or 'bs': 0.5, for " - "others: 0.7.)", - }, - ), - field( - "serp_query_text_html", - string(), - True, - metadata={ - "description": - "The SERP's query as parsed from the HTML contents. " - "(Can be different from the query parsed from the URL due " - "to spelling correction etc.)", - }, - ), - field( - "serp_warc_relative_path", - string(), - True, - metadata={ - "description": - "Path of the SERP's WARC file relative to the corpus root " - "path.", - }, - ), - field( - "serp_warc_byte_offset", - uint32(), - True, - metadata={ - "description": - "Position of the SERP's WARC record's first byte in the " - "compressed WARC file.", - }, - ), - field( - "serp_results", - list_(struct([ - field( - "result_id", - string(), - False, - metadata={ - "description": - "Unique document ID (based on a hash of the URL " - "and timestamp of the SERP and the result snippet " - "rank).", - }, - ), - field( - "result_url", - string(), - False, - metadata={ - "description": "Full URL of the document.", - }, - ), - field( - "result_domain", - string(), - False, - metadata={ - "description": "Domain of the document URL.", - }, - ), - field( - "result_domain_public_suffix", - string(), - False, - metadata={ - "description": - "Public suffix (https://publicsuffix.org/) of the " - "document domain.", - }, - ), - field( - "result_wayback_url", - string(), - False, - metadata={ - "description": - "URL of the document's nearest archived " - "snapshot's contents in the Wayback Machine. " - "Note that there might not be a snapshot for the " - "exact timestamp, but the Wayback Machine instead " - "redirects to the nearest available snapshot.", - }, - ), - field( - "result_wayback_raw_url", - string(), - False, - metadata={ - "description": - "URL of the document's nearest archived " - "snapshot's raw contents in the Wayback Machine. 
" - "Note that there might not be a snapshot for the " - "exact timestamp, but the Wayback Machine instead " - "redirects to the nearest available snapshot.", - }, - ), - field( - "result_snippet_rank", - uint8(), - False, - metadata={ - "description": - "Rank of the document's snippet on the SERP.", - }, - ), - field( - "result_snippet_title", - string(), - False, - metadata={ - "description": - "Snippet title of the search result with optional " - "highlighting (normalized to ```` tags, other " - "tags removed).", - }, - ), - field( - "result_snippet_text", - string(), - True, - metadata={ - "description": - "Snippet text of the search result with optional " - "highlighting (normalized to ```` tags, other " - "tags removed).", - }, - ), - field( - "result_warc_relative_path", - string(), - True, - metadata={ - "description": - "Path of the SERP's WARC file relative to the " - "corpus root path.", - }, - ), - field( - "result_warc_byte_offset", - uint32(), - True, - metadata={ - "description": - "Position of the SERP's WARC record's first byte " - "in the compressed WARC file.", - }, - ), - ])), - True, - metadata={ - "description": - "Retrieved results from the SERP in the same order as " - "they appear.", - }, - ), - field( - "search_provider_name", - string(), - False, - metadata={ - "description": - "Search provider name (domain without the Public Suffix).", - }, - ), - field( - "search_provider_alexa_domain", - string(), - False, - metadata={ - "description": - "Main domain of the search provider as it appears in " - "Alexa top-1M ranks.", - }, - ), - field( - "search_provider_alexa_domain_public_suffix", - string(), - False, - metadata={ - "description": - "Public Suffix (https://publicsuffix.org/) of the search " - "provider's main domain.", - }, - ), - field( - "search_provider_alexa_rank", - uint32(), - True, - metadata={ - "description": - "Rank of the search provider's main domain in fused Alexa " - "top-1M rankings.", - }, - ), - field( - 
"search_provider_category", - dictionary(uint8(), string()), - True, - metadata={ - "description": - "Category of the search provider (manual annotation).", - }, - ), - ], - metadata={ - "description": "A single search engine result page.", - }, -) - -RESULT_SCHEMA = schema( - fields=[ - field( - "result_id", - string(), - False, - metadata={ - "description": - "Unique document ID (based on a hash of the URL and " - "timestamp of the SERP and the result snippet rank).", - }, - ), - field( - "result_url", - string(), - False, - metadata={ - "description": "Full URL of the document.", - }, - ), - field( - "result_domain", - string(), - False, - metadata={ - "description": "Domain of the document URL.", - }, - ), - field( - "result_domain_public_suffix", - string(), - False, - metadata={ - "description": - "Public suffix (https://publicsuffix.org/) of the " - "document domain.", - }, - ), - field( - "result_wayback_url", - string(), - False, - metadata={ - "description": - "URL of the document's nearest archived snapshot's " - "contents in the Wayback Machine. Note that there might " - "not be a snapshot for the exact timestamp, but the " - "Wayback Machine instead redirects to the nearest " - "available snapshot.", - }, - ), - field( - "result_wayback_raw_url", - string(), - False, - metadata={ - "description": - "URL of the document's nearest archived snapshot's raw " - "contents in the Wayback Machine. 
Note that there might " - "not be a snapshot for the exact timestamp, but the " - "Wayback Machine instead redirects to the nearest " - "available snapshot.", - }, - ), - field( - "result_snippet_rank", - uint8(), - False, - metadata={ - "description": "Rank of the document's snippet on the SERP.", - }, - ), - field( - "result_snippet_title", - string(), - False, - metadata={ - "description": - "Snippet title of the search result with optional " - "highlighting (normalized to ```` tags, other tags " - "removed).", - }, - ), - field( - "result_snippet_text", - string(), - True, - metadata={ - "description": - "Snippet text of the search result with optional " - "highlighting (normalized to ```` tags, other tags " - "removed).", - }, - ), - field( - "result_warc_relative_path", - string(), - True, - metadata={ - "description": - "Path of the SERP's WARC file relative to the corpus root " - "path.", - }, - ), - field( - "result_warc_byte_offset", - uint32(), - True, - metadata={ - "description": - "Position of the SERP's WARC record's first byte in the " - "compressed WARC file.", - }, - ), - field( - "serp_id", - string(), - False, - metadata={ - "description": - "Unique SERP ID (based on a hash of the URL and timestamp " - "of the SERP).", - }, - ), - field( - "serp_url", - string(), - False, - metadata={ - "description": "Full URL of the SERP.", - }, - ), - field( - "serp_domain", - string(), - False, - metadata={ - "description": "Domain of the SERP URL.", - }, - ), - field( - "serp_domain_public_suffix", - string(), - False, - metadata={ - "description": - "Public suffix (https://publicsuffix.org/) of the SERP " - "domain.", - }, - ), - field( - "serp_timestamp", - timestamp("s"), - False, - metadata={ - "description": - "Timestamp of the archived snapshot in the Wayback " - "Machine.", - }, - ), - field( - "serp_year", - uint16(), - False, - metadata={ - "description": - "Year of the archived snapshot in the Wayback Machine.", - }, - ), - field( - "serp_month", 
- uint8(), - False, - metadata={ - "description": - "Month of the archived snapshot in the Wayback Machine.", - }, - ), - field( - "serp_wayback_url", - string(), - False, - metadata={ - "description": - "URL of the archived snapshot's contents in the Wayback " - "Machine.", - }, - ), - field( - "serp_wayback_raw_url", - string(), - False, - metadata={ - "description": - "URL of the archived snapshot's raw contents in the " - "Wayback Machine.", - }, - ), - field( - "serp_page", - uint8(), - True, - metadata={ - "description": - "SERP page number as parsed from the URL, e.g., 1, 2, " - "3 (zero-indexed).", - }, - ), - field( - "serp_offset", - uint16(), - True, - metadata={ - "description": - "SERP results offset (start position) as parsed from the " - "URL, e.g., 10, 20 (zero-indexed).", - }, - ), - field( - "serp_query_text_url", - string(), - True, - metadata={ - "description": "The SERP's query as parsed from the URL.", - }, - ), - field( - "serp_query_text_url_language", - dictionary(int8(), string()), - True, - metadata={ - "description": - "Language identified in the query as parsed from the URL. 
" - "(Google's cld3; min threshold for 'hr' or 'bs': 0.5, for " - "others: 0.7.)", - }, - ), - field( - "serp_query_text_html", - string(), - True, - metadata={ - "description": - "The SERP's query as parsed from the HTML contents.", - }, - ), - field( - "serp_warc_relative_path", - string(), - True, - metadata={ - "description": - "Path of the SERP's WARC file relative to the corpus root " - "path.", - }, - ), - field( - "serp_warc_byte_offset", - uint32(), - True, - metadata={ - "description": - "Position of the SERP's WARC record's first byte in the " - "compressed WARC file.", - }, - ), - field( - "search_provider_name", - string(), - False, - metadata={ - "description": - "Search provider name (domain without the Public Suffix).", - }, - ), - field( - "search_provider_alexa_domain", - string(), - False, - metadata={ - "description": - "Main domain of the search provider as it appears in " - "Alexa top-1M ranks.", - }, - ), - field( - "search_provider_alexa_domain_public_suffix", - string(), - False, - metadata={ - "description": - "Public Suffix (https://publicsuffix.org/) of the search " - "provider's main domain.", - }, - ), - field( - "search_provider_alexa_rank", - uint32(), - True, - metadata={ - "description": - "Rank of the search provider's main domain in fused Alexa " - "top-1M rankings.", - }, - ), - field( - "search_provider_category", - dictionary(uint8(), string()), - True, - metadata={ - "description": - "Category of the search provider (manual annotation).", - }, - ), - ], - metadata={ - "description": "A single result from a SERP.", - }, -) - -SERP_PARTITIONING = partitioning( - schema=schema( - fields=[ - field( - "serp_domain_public_suffix", - string(), - False, - metadata={ - "description": - "Public suffix (https://publicsuffix.org/) of the " - "SERP domain.", - }, - ), - field( - "serp_domain", - string(), - False, - metadata={ - "description": "Domain of the SERP URL.", - }, - ), - field( - "serp_year", - uint16(), - False, - metadata={ - 
"description": - "Year of the archived snapshot in the Wayback " - "Machine.", - }, - ), - field( - "serp_month", - uint8(), - False, - metadata={ - "description": - "Month of the archived snapshot in the Wayback " - "Machine.", - }, - ), - ], - ), - flavor="hive", -) - -RESULT_PARTITIONING = SERP_PARTITIONING diff --git a/archive_query_log/serps/iterable.py b/archive_query_log/serps/iterable.py deleted file mode 100644 index 787e1a88..00000000 --- a/archive_query_log/serps/iterable.py +++ /dev/null @@ -1,44 +0,0 @@ -from dataclasses import dataclass -from gzip import GzipFile -from io import TextIOWrapper -from pathlib import Path -from typing import Sized, Iterable, Iterator, IO - -from archive_query_log.model import ArchivedParsedSerp -from archive_query_log.util.text import count_lines - - -@dataclass(frozen=True) -class ArchivedParsedSerps(Sized, Iterable[ArchivedParsedSerp]): - """ - Read archived parsed SERPs from a JSONL file. - """ - - path: Path - """ - Path where the parsed SERPs are stored in JSONL format. 
- """ - - def __post_init__(self): - self._check_urls_path() - - def _check_urls_path(self): - if not self.path.exists() or not self.path.is_file(): - raise ValueError( - f"URLs path must be a file: {self.path}" - ) - - def __len__(self) -> int: - with self.path.open("rb") as file: - with GzipFile(fileobj=file, mode="rb") as gzip_file: - gzip_file: IO[bytes] - return count_lines(gzip_file) - - def __iter__(self) -> Iterator[ArchivedParsedSerp]: - schema = ArchivedParsedSerp.schema() - with self.path.open("rb") as file: - with GzipFile(fileobj=file, mode="rb") as gzip_file: - gzip_file: IO[bytes] - with TextIOWrapper(gzip_file) as text_file: - for line in text_file: - yield schema.loads(line) diff --git a/archive_query_log/sources/__init__.py b/archive_query_log/sources/__init__.py new file mode 100644 index 00000000..7797307c --- /dev/null +++ b/archive_query_log/sources/__init__.py @@ -0,0 +1,189 @@ +from itertools import chain +from typing import Iterable, Iterator +from uuid import uuid5 +from warnings import warn + +from click import echo +from elasticsearch_dsl import Search +from elasticsearch_dsl.function import RandomScore +from elasticsearch_dsl.query import FunctionScore, Exists, Term +from tqdm.auto import tqdm + +from archive_query_log.config import Config +from archive_query_log.namespaces import NAMESPACE_SOURCE +from archive_query_log.orm import ( + Archive, Provider, Source, InnerArchive, InnerProvider) +from archive_query_log.utils.es import safe_iter_scan, update_action +from archive_query_log.utils.time import utc_now + + +def _sources_batch(archive: Archive, provider: Provider) -> list[dict]: + if provider.exclusion_reason is not None: + warn( + f"Skipping provider {provider.id} " + f"because it is excluded: {provider.exclusion_reason}" + ) + return [] + + batch = [] + for domain in provider.domains: + for url_path_prefix in provider.url_path_prefixes: + source_id_components = ( + archive.cdx_api_url, + archive.memento_api_url, + domain, + 
url_path_prefix, + ) + source_id = str(uuid5( + NAMESPACE_SOURCE, + ":".join(source_id_components), + )) + source = Source( + id=source_id, + last_modified=utc_now(), + archive=InnerArchive( + id=archive.id, + cdx_api_url=archive.cdx_api_url, + memento_api_url=archive.memento_api_url, + priority=archive.priority, + ), + provider=InnerProvider( + id=provider.id, + domain=domain, + url_path_prefix=url_path_prefix, + priority=provider.priority, + ), + should_fetch_captures=True, + ) + batch.append(source.to_dict(include_meta=True)) + return batch + + +def _iter_sources_batches_changed_archives( + changed_archives_search: Search, + all_providers_search: Search, +) -> Iterator[list[dict]]: + archive: Archive + provider: Provider + changed_archives = changed_archives_search.scan() + changed_archives = safe_iter_scan(changed_archives) + for archive in changed_archives: + all_providers = all_providers_search.scan() + all_providers = safe_iter_scan(all_providers) + for provider in all_providers: + yield _sources_batch( + archive, + provider, + ) + yield [update_action( + archive, + should_build_sources=False, + last_built_sources=utc_now(), + )] + + +def _iter_sources_batches_changed_providers( + changed_providers_search: Search, + all_archives_search: Search, +) -> Iterator[list[dict]]: + archive: Archive + provider: Provider + changed_providers = changed_providers_search.scan() + changed_providers = safe_iter_scan(changed_providers) + for provider in changed_providers: + all_archives = all_archives_search.scan() + all_archives = safe_iter_scan(all_archives) + for archive in all_archives: + yield _sources_batch( + archive, + provider, + ) + yield [update_action( + provider, + should_build_sources=False, + last_built_sources=utc_now(), + )] + + +def _build_archive_sources(config: Config) -> None: + Archive.index().refresh(using=config.es.client) + changed_archives_search = ( + Archive.search(using=config.es.client) + .filter(~Term(should_build_sources=False)) + 
.query(FunctionScore(functions=[RandomScore()])) + ) + num_changed_archives = changed_archives_search.count() + all_providers_search = ( + Provider.search(using=config.es.client) + .filter(~Exists(field="exclusion_reason"))) + num_all_providers = all_providers_search.count() + num_batches_archives = ( + (num_changed_archives * num_all_providers) + + num_changed_archives + ) + if num_batches_archives > 0: + echo(f"Building sources for {num_changed_archives} " + f"new/changed archives.") + action_batches_archives: Iterable[list[dict]] = ( + _iter_sources_batches_changed_archives( + changed_archives_search=changed_archives_search, + all_providers_search=all_providers_search, + )) + # noinspection PyTypeChecker + action_batches_archives = tqdm( + action_batches_archives, + total=num_batches_archives, + desc="Build sources", + unit="batch", + ) + actions_archives = chain.from_iterable(action_batches_archives) + config.es.bulk(actions_archives) + else: + echo("No new/changed archives.") + + +def _build_provider_sources(config: Config) -> None: + Provider.index().refresh(using=config.es.client) + changed_providers_search = ( + Provider.search(using=config.es.client) + .filter(~Term(should_build_sources=False)) + .query(FunctionScore(functions=[RandomScore()])) + ) + num_changed_providers = changed_providers_search.count() + all_archives_search = Archive.search(using=config.es.client) + num_all_archives = all_archives_search.count() + num_batches_providers = ( + (num_changed_providers * num_all_archives) + + num_changed_providers + ) + if num_batches_providers > 0: + echo( + f"Building sources for {num_changed_providers} " + f"new/changed providers.") + action_batches_providers: Iterable[list[dict]] = ( + _iter_sources_batches_changed_providers( + changed_providers_search=changed_providers_search, + all_archives_search=all_archives_search, + )) + # noinspection PyTypeChecker + action_batches_providers = tqdm( + action_batches_providers, + total=num_batches_providers, + 
desc="Build sources", + unit="batch", + ) + actions_providers = chain.from_iterable(action_batches_providers) + config.es.bulk(actions_providers) + else: + echo("No new/changed providers.") + + +def build_sources( + config: Config, + skip_archives: bool, + skip_providers: bool, +) -> None: + if not skip_archives: + _build_archive_sources(config) + if not skip_providers: + _build_provider_sources(config) diff --git a/archive_query_log/templates/home.html b/archive_query_log/templates/home.html new file mode 100644 index 00000000..47a41543 --- /dev/null +++ b/archive_query_log/templates/home.html @@ -0,0 +1,146 @@ + + + Archive Query Log + + + + + + +
+

Archive Query Log

+

The Archive Query Log monitoring interface.

+ +
+
+
+ Monitor the crawling and parsing of the Archive Query Log. + Directly go to the detailed statistics or check the progress. +
+
+

Statistics

+
+ + + + + + + + + + + + {% for statistics in statistics_list %} + + + + + + + + {% endfor %} + +
TypeDescriptionCountSizeLast modified
{{ statistics.name }}{% if statistics.description %}{{ statistics.description }}{% else %}—{% endif %}{{ "{:,.0f}".format(statistics.total) }} + {% if statistics.disk_size %} + {{ statistics.disk_size }} + {% else %} + — + {% endif %} + + {% if statistics.last_modified %} + + {% else %} + — + {% endif %} +
+
+
+
+

Progress

+
+ + + + + + + + + + + + + {% for progress in progress_list %} + + + + + + + + + {% endfor %} + +
InputOutputDescriptionUnprocessedProcessedProgress
{{ progress.input_name }}{{ progress.output_name }}{{ progress.description }}{{ "{:,.0f}".format(progress.total - progress.current) }}{{ "{:,.0f}".format(progress.current) }} + + {{ "{:,.0f}".format(progress.current) }}/{{ "{:,.0f}".format(progress.total) }} + + {% if progress.total != 0 %} + {{ "{:.0f}".format(progress.current / progress.total * 100) }}% + {% else %} + 0% + {% endif %} +
+
+
+
+ + diff --git a/archive_query_log/test_fastwarc.py b/archive_query_log/test_fastwarc.py deleted file mode 100644 index 13c26d24..00000000 --- a/archive_query_log/test_fastwarc.py +++ /dev/null @@ -1,22 +0,0 @@ -def test_fastwarc_installed(): - import fastwarc - assert fastwarc - - from fastwarc import GZipStream - assert GZipStream - - from fastwarc import FileStream - assert FileStream - - from fastwarc import ArchiveIterator - assert ArchiveIterator - - from fastwarc import WarcRecordType - assert WarcRecordType - - from fastwarc import WarcRecord - assert WarcRecord - - # pylint: disable=no-name-in-module - from fastwarc.stream_io import PythonIOStreamAdapter - assert PythonIOStreamAdapter diff --git a/archive_query_log/urls/iterable.py b/archive_query_log/urls/iterable.py deleted file mode 100644 index 81523dbf..00000000 --- a/archive_query_log/urls/iterable.py +++ /dev/null @@ -1,44 +0,0 @@ -from dataclasses import dataclass -from gzip import GzipFile -from io import TextIOWrapper -from pathlib import Path -from typing import Sized, Iterable, Iterator, IO - -from archive_query_log.model import ArchivedUrl -from archive_query_log.util.text import count_lines - - -@dataclass(frozen=True) -class ArchivedUrls(Sized, Iterable[ArchivedUrl]): - """ - Read archived URLs from a JSONL file. - """ - - path: Path - """ - Path where the URLs are stored in JSONL format. 
- """ - - def __post_init__(self): - self._check_urls_path() - - def _check_urls_path(self): - if not self.path.exists() or not self.path.is_file(): - raise ValueError( - f"URLs path must be a file: {self.path}" - ) - - def __len__(self) -> int: - with self.path.open("rb") as file: - with GzipFile(fileobj=file, mode="rb") as gzip_file: - gzip_file: IO[bytes] - return count_lines(gzip_file) - - def __iter__(self) -> Iterator[ArchivedUrl]: - schema = ArchivedUrl.schema() - with self.path.open("rb") as file: - with GzipFile(fileobj=file, mode="rb") as gzip_file: - gzip_file: IO[bytes] - with TextIOWrapper(gzip_file) as text_file: - for line in text_file: - yield schema.loads(line) diff --git a/archive_query_log/utils/__init__.py b/archive_query_log/utils/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archive_query_log/utils/es.py b/archive_query_log/utils/es.py new file mode 100644 index 00000000..4a6c027f --- /dev/null +++ b/archive_query_log/utils/es.py @@ -0,0 +1,58 @@ +from typing import Iterator, TypeVar, Iterable, Any +from warnings import warn + +from elasticsearch import NotFoundError +from elasticsearch_dsl import Document, InnerDoc +from elasticsearch_dsl.utils import META_FIELDS + +DocumentType = TypeVar("DocumentType", bound=Document) + + +def safe_iter_scan(it: Iterable[DocumentType]) -> Iterator[DocumentType]: + try: + for doc in it: + yield doc + except NotFoundError as e: + if (e.info is not None and isinstance(e.info, dict) and + "error" in e.info and + isinstance(e.info["error"], dict) and + "root_cause" in e.info["error"] and + isinstance(e.info["error"]["root_cause"], list) and + len(e.info["error"]["root_cause"]) > 0 and + isinstance(e.info["error"]["root_cause"][0], dict) and + "resource.id" in e.info["error"]["root_cause"][0] and + e.info["error"]["root_cause"][0]["resource.id"] == + "search_phase_execution_exception"): + warn(RuntimeWarning("Scan expired. 
Stopping iteration.")) + return  # PEP 479: raising StopIteration inside a generator becomes RuntimeError; return ends the iteration cleanly + else: + raise e + + +def _to_dict_if_needed(value: Any) -> Any: + if isinstance(value, InnerDoc): + return value.to_dict() + return value + + +def update_action( + document: Document, + retry_on_conflict: int | None = 3, + **fields, +) -> dict: + action = { + f"_{key}": document.meta[key] + for key in META_FIELDS + if key not in ("score",)  # one-element tuple: a bare ("score") is a str and would do a substring test + if key in document.meta and document.meta[key] is not None + } + action["_op_type"] = "update" + # Create a partial document by instantiating a new object of the + # document type and ignoring the meta fields (e.g., the document ID). + action["doc"] = type(document)(**fields).to_dict( + include_meta=False, + skip_empty=True, + ) + if retry_on_conflict is not None: + action["_retry_on_conflict"] = retry_on_conflict + return action diff --git a/archive_query_log/utils/time.py b/archive_query_log/utils/time.py new file mode 100644 index 00000000..22bbac16 --- /dev/null +++ b/archive_query_log/utils/time.py @@ -0,0 +1,14 @@ +from datetime import datetime, timezone, timedelta + +EPOCH = datetime.fromtimestamp(0) + +UTC = timezone.utc + +CET = timezone(timedelta(hours=1)) +""" +Central European Time (CET) +""" + + +def utc_now() -> datetime: + return datetime.now(tz=UTC).replace(microsecond=0) diff --git a/config.yml b/config.yml new file mode 100644 index 00000000..8728a8ea --- /dev/null +++ b/config.yml @@ -0,0 +1,8 @@ +es: + host: elasticsearch.srv.webis.de + port: 9200 + username: null + password: null +s3: + endpoint_url: https://s3.dw.webis.de + bucket_name: archive-query-log diff --git a/data/.gitignore b/data/.gitignore index 7389b24b..d1232636 100644 --- a/data/.gitignore +++ b/data/.gitignore @@ -11,3 +11,4 @@ examples/** !examples/serps.zip !examples/results.jsonl !examples/results.zip +cache diff --git a/data/manual-annotations/archived-raw-serps/expected/amazon-guang-huaiha-hawa-me-ta-1632443655.approved.txt 
b/data/manual-annotations/archived-raw-serps/expected/amazon-guang-huaiha-hawa-me-ta-1632443655.approved.txt new file mode 100644 index 00000000..233b693a --- /dev/null +++ b/data/manual-annotations/archived-raw-serps/expected/amazon-guang-huaiha-hawa-me-ta-1632443655.approved.txt @@ -0,0 +1,395 @@ +{ + "interpreted_query": "\u5149\u30d5\u30a1\u30a4\u30d0\u30fc \u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc", + "offset": null, + "page": null, + "query": "\u5149\u30d5\u30a1\u30a4\u30d0\u30fc \u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc", + "results": [ + { + "rank": 2, + "snippet": null, + "timestamp": 1632443655, + "title": "Hi-FIELD Optical Power Meter HF-610A Fiber Optic Checker (10 km), Integrated Type, Self-Calibration Function, Set of 8, Japanese Instruction Manual & Japanese Battery Included FC/ST/SC", + "url": "https://www.amazon.co.jp/-/en/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A15W1T8C8HGANC&url=%2FHi-FIELD-Integrated-Self-Calibration-Function-Instruction%2Fdp%2FB07CRR497N%2Fref%3Dsr_1_1_sspa%3Fdchild%3D1%26keywords%3D%25E5%2585%2589%25E3%2583%2595%25E3%2582%25A1%25E3%2582%25A4%25E3%2583%2590%25E3%2583%25BC%25E3%2583%2591%25E3%2583%25AF%25E3%2583%25BC%25E3%2583%25A1%25E3%2583%25BC%25E3%2582%25BF%25E3%2583%25BC%26qid%3D1632450856%26sr%3D8-1-spons%26psc%3D1&qualifier=1632450856&id=3198468320158844&widgetName=sp_atf" + }, + { + "rank": 3, + "snippet": null, + "timestamp": 1632443655, + "title": "KKnoon Fiber Optic Tool Set 5km Red Fiber Optic Cleaver FTTH Kit Optical Power Meter Tool Kit", + "url": 
"https://www.amazon.co.jp/-/en/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A2XW5534FXO3YB&url=%2FKKnoon-Fiber-Optic-Cleaver-Optical%2Fdp%2FB099ZYB3QV%2Fref%3Dsr_1_2_sspa%3Fdchild%3D1%26keywords%3D%25E5%2585%2589%25E3%2583%2595%25E3%2582%25A1%25E3%2582%25A4%25E3%2583%2590%25E3%2583%25BC%25E3%2583%2591%25E3%2583%25AF%25E3%2583%25BC%25E3%2583%25A1%25E3%2583%25BC%25E3%2582%25BF%25E3%2583%25BC%26qid%3D1632450856%26sr%3D8-2-spons%26psc%3D1%26smid%3DA2THBSOMLUNAGS&qualifier=1632450856&id=3198468320158844&widgetName=sp_atf" + }, + { + "rank": 4, + "snippet": null, + "timestamp": 1632443655, + "title": "Hi\u00a0\u2013\u00a0Field Light Power Meter TL510\u00a0\u00b0C + Fiber Optic Checker (5\u00a0km) Set of all St/FC/SC Connector with Exclusive Case Made in Japan batteries", + "url": "https://www.amazon.co.jp/-/en/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A2VHRT9INYR132&url=%2FHi-%25E2%2580%2593-Field-TL510-%25C2%25B0C-Connector-Exclusive-batteries%2Fdp%2FB079KBF4MV%2Fref%3Dsr_1_3_sspa%3Fdchild%3D1%26keywords%3D%25E5%2585%2589%25E3%2583%2595%25E3%2582%25A1%25E3%2582%25A4%25E3%2583%2590%25E3%2583%25BC%25E3%2583%2591%25E3%2583%25AF%25E3%2583%25BC%25E3%2583%25A1%25E3%2583%25BC%25E3%2582%25BF%25E3%2583%25BC%26qid%3D1632450856%26sr%3D8-3-spons%26psc%3D1&qualifier=1632450856&id=3198468320158844&widgetName=sp_atf" + }, + { + "rank": 5, + "snippet": null, + "timestamp": 1632443655, + "title": "Sc and Fc in all -701 Fiber Optic Cable Tester for Connector Fiber Tester and 10 dBm Optical Power Meter and Visual Disorder Locator Less than 10mW", + "url": 
"https://www.amazon.co.jp/-/en/gp/slredirect/picassoRedirect.html/ref=pa_sp_atf_aps_sr_pg1_1?ie=UTF8&adId=A302PKYXTY0QZ9&url=%2FTester-Connector-Optical-Disorder-Locator%2Fdp%2FB016WKI46S%2Fref%3Dsr_1_4_sspa%3Fdchild%3D1%26keywords%3D%25E5%2585%2589%25E3%2583%2595%25E3%2582%25A1%25E3%2582%25A4%25E3%2583%2590%25E3%2583%25BC%25E3%2583%2591%25E3%2583%25AF%25E3%2583%25BC%25E3%2583%25A1%25E3%2583%25BC%25E3%2582%25BF%25E3%2583%25BC%26qid%3D1632450856%26sr%3D8-4-spons%26psc%3D1&qualifier=1632450856&id=3198468320158844&widgetName=sp_atf" + }, + { + "rank": 6, + "snippet": null, + "timestamp": 1632443655, + "title": "Hi-FIELD Optical Power Meter HF-610A Fiber Optic Checker (10 km), Integrated Type, Self-Calibration Function, Set of 8, Japanese Instruction Manual & Japanese Battery Included FC/ST/SC", + "url": "https://www.amazon.co.jp/-/en/Hi-FIELD-Integrated-Self-Calibration-Function-Instruction/dp/B07CRR497N/ref=sr_1_5?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-5" + }, + { + "rank": 7, + "snippet": null, + "timestamp": 1632443655, + "title": "Hi\u00a0\u2013\u00a0Field Light Power Meter TL510\u00a0\u00b0C + Fiber Optic Checker (5\u00a0km) Set of all St/FC/SC Connector with Exclusive Case Made in Japan batteries", + "url": "https://www.amazon.co.jp/-/en/Hi-%E2%80%93-Field-TL510-%C2%B0C-Connector-Exclusive-batteries/dp/B079K96YYN/ref=sr_1_6?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-6" + }, + { + "rank": 8, + "snippet": null, + "timestamp": 1632443655, + "title": "KK moon Optical Power Meter Mini Fiber 8 Wavelength LED Light Network Cable Tester FTTH Fiber Optic Cable Tester", + "url": 
"https://www.amazon.co.jp/-/en/Optical-Power-Wavelength-Network-Tester/dp/B07Z9GMDMB/ref=sr_1_7?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-7" + }, + { + "rank": 9, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optical Power Meter FC/SC/ST Interface Fiber Optical Attenuation Tester, Wavelength Memory, Wavelength Range 800-1700nm", + "url": "https://www.amazon.co.jp/-/en/Optical-Interface-Attenuation-Wavelength-800-1700nm/dp/B08GYW74PV/ref=sr_1_8?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-8" + }, + { + "rank": 10, + "snippet": null, + "timestamp": 1632443655, + "title": "KLS-35-S Fiber Optic Light Source Optic Tool Fiber Maintenance Dual Wavelength 1310+1550nm Handheld Fiber Optic Power Meter", + "url": "https://www.amazon.co.jp/-/en/KLS-35-S-Source-Maintenance-Wavelength-Handheld/dp/B08XTZ2QY2/ref=sr_1_9?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-9" + }, + { + "rank": 11, + "snippet": null, + "timestamp": 1632443655, + "title": "Pudibei Fiber Optic Tool Set [Optical Power Meter + Fiber Optic Checker (10 km) + Cutter for Fiber Optic + Wire Stripper] with Self Calibration Function (PDB-520A)", + "url": "https://www.amazon.co.jp/-/en/Pudibei-Stripper-Calibration-Function-PDB-520A/dp/B08Y96VD25/ref=sr_1_10?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-10" + }, + { + "rank": 12, + "snippet": null, + "timestamp": 1632443655, + "title": "Portable Optical Power Meter Fiber Tester SC/FC TL-510A (-70 to +10 dBm) for Telecommunications Cable TV Optical Fiber Experiments", + "url": 
"https://www.amazon.co.jp/-/en/Portable-Optical-TL-510A-Telecommunications-Experiments/dp/B00QPWQ11Y/ref=sr_1_11?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-11" + }, + { + "rank": 13, + "snippet": null, + "timestamp": 1632443655, + "title": "Optical power meter, stable KLS-25M handheld fiber light source, durable fiber optic checker for stable output.", + "url": "https://www.amazon.co.jp/-/en/Optical-KLS-25M-handheld-durable-checker/dp/B08HSK6T2F/ref=sr_1_12?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-12" + }, + { + "rank": 14, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Cable Tester, FC/SC/ST Wide Wavelength HDTV, SDI for SDI Analog MATV Systems", + "url": "https://www.amazon.co.jp/-/en/Fiber-Tester-Wavelength-Analog-Systems/dp/B09BG4QMBK/ref=sr_1_13?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-13" + }, + { + "rank": 15, + "snippet": null, + "timestamp": 1632443655, + "title": "15km Fiber Optic Power Multi-Tester Meter High Precision Visual Fault Rotator Fiber Optic Tool Connector Optical Testing Equipment", + "url": "https://www.amazon.co.jp/-/en/Multi-Tester-Precision-Rotator-Connector-Equipment/dp/B08L983PGG/ref=sr_1_14?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-14" + }, + { + "rank": 16, + "snippet": null, + "timestamp": 1632443655, + "title": "M 
16\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fcSC\u30a2\u30c0\u30d7\u30bf\u30fc\u30aa\u30d7\u30c6\u30a3\u30ab\u30eb\u30c6\u30b9\u30bf\u30fc\u30a2\u30af\u30bb\u30b5\u30ea\u30fc\u3001\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fcSC\u30a2\u30c0\u30d7\u30bf\u30fcSC\u30a2\u30c0\u30d7\u30bf\u30fc", + "url": "https://www.amazon.co.jp/-/en/dp/B09CTM51SV/ref=sr_1_15?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-15" + }, + { + "rank": 17, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Tester Tool High Network Cable Tester Fiber Optic Meter Mini Fiber Optic Power Meter Fiber Optic Power Meter for Fiber", + "url": "https://www.amazon.co.jp/-/en/Fiber-Optic-Tester-Network-Cable/dp/B08ZSKF2QT/ref=sr_1_16?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-16" + }, + { + "rank": 18, + "snippet": null, + "timestamp": 1632443655, + "title": "\u96fb\u6c17\u30c6\u30b9\u30bf\u30fc\u3001\u30dd\u30fc\u30bf\u30d6\u30eb\u5e83\u6ce2\u9577\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30c4\u30fc\u30eb\u3001\u30cd\u30c3\u30c8\u30ef\u30fc\u30af\u30c6\u30b9\u30bf\u30fc\u3001\u30cf\u30f3\u30c9\u30d8\u30eb\u30c9\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u3001HDTV\u3001SDI\u7528", + "url": "https://www.amazon.co.jp/-/en/dp/B08ZJL49T2/ref=sr_1_17?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-17" + }, + { + "rank": 19, + "snippet": null, + "timestamp": 1632443655, + "title": "FTTH Optical Tools Fiber Connection Tool Kit FTTH Optic Kit Assembly Termination Tool Lost Fiber Power Meter Light Source Industrial Sensor", + "url": 
"https://www.amazon.co.jp/-/en/Optical-Connection-Assembly-Termination-Industrial/dp/B08W3C725Z/ref=sr_1_18?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-18" + }, + { + "rank": 20, + "snippet": null, + "timestamp": 1632443655, + "title": "Laser light source optical power meter handheld fiber optic tester fiber test tool", + "url": "https://www.amazon.co.jp/-/en/Laser-source-optical-handheld-tester/dp/B08WWTZ8YN/ref=sr_1_19?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-19" + }, + { + "rank": 21, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Power Meter Laser/R Light Source Optical Power Meter Handheld Fiber Optic Tester Fiber Tester for Electrical Communication Engineering Maintenance", + "url": "https://www.amazon.co.jp/-/en/Handheld-Electrical-Communication-Engineering-Maintenance/dp/B09DPR6KFR/ref=sr_1_20?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-20" + }, + { + "rank": 22, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Power Meter Fiber Optic Test Pen 15km Test Fault Distance 800-1700nm High Precision Visual Disorder Locator Fiber Optic Tool", + "url": "https://www.amazon.co.jp/-/en/Distance-800-1700nm-Precision-Disorder-Locator/dp/B08GYXM7Q5/ref=sr_1_21?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-21" + }, + { + "rank": 23, + "snippet": null, + "timestamp": 1632443655, + "title": "KKnoon Fiber Optic Tool Set 5km Red Fiber Optic Cleaver FTTH Kit Optical Power Meter Tool Kit", + "url": 
"https://www.amazon.co.jp/-/en/KKnoon-Fiber-Optic-Cleaver-Optical/dp/B099ZYB3QV/ref=sr_1_22?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-22" + }, + { + "rank": 24, + "snippet": null, + "timestamp": 1632443655, + "title": "Optical Tool Handheld Light Source, Fiber Optic Measuring Instrument KLS-25M Stable Hand Light Source for FC + SC + 2.5mm Universal with 2 Wavelengths, Adapter Interface", + "url": "https://www.amazon.co.jp/-/en/Measuring-Instrument-Universal-Wavelengths-Interface/dp/B082TXJ4RC/ref=sr_1_23?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-23" + }, + { + "rank": 25, + "snippet": null, + "timestamp": 1632443655, + "title": "\u5149 \u6e2c\u5b9a \u30c1\u30a7\u30c3\u30af \u6a5f\u5668 \u5149\u30d1\u30ef\u30fc \u30e1\u30fc\u30bf \u30b9\u30ea\u30e0\u30bf\u30a4\u30d7 \uff08New!) 
\u5149 \u30d5\u30a1\u30a4\u30d0 \u306e \u65bd\u5de5\u3001\u4fdd\u5b88\u7528 \u3010Lavert\u3011", + "url": "https://www.amazon.co.jp/-/en/dp/B00RKG5UB6/ref=sr_1_24?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-24" + }, + { + "rank": 26, + "snippet": null, + "timestamp": 1632443655, + "title": "hobbyant skit MT-7618 3-in-1 \u30df\u30cb LED \u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u8996\u899a\u969c\u5bb3\u30ed\u30b1\u30fc\u30bf\u30fc 1mW \u30ec\u30fc\u30b6\u30fc\u51fa\u529b\u81ea\u5df1\u6821\u6b63", + "url": "https://www.amazon.co.jp/-/en/dp/B09GV9B8H5/ref=sr_1_25?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-25" + }, + { + "rank": 27, + "snippet": null, + "timestamp": 1632443655, + "title": "hobbyant \u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u5149\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u30b1\u30fc\u30d6\u30eb\u30c6\u30b9\u30bf\u30fc\u5149\u30c6\u30b9\u30bf\u30fc+ 6dBm -70dBm", + "url": "https://www.amazon.co.jp/-/en/dp/B09GV3SPY2/ref=sr_1_26?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-26" + }, + { + "rank": 28, + "snippet": null, + "timestamp": 1632443655, + "title": "Amazon Basics TOSLINK Optical Audio Digital Cable", + "url": "https://www.amazon.co.jp/-/en/Amazon-Basics-TOSLINK-Optical-Digital/dp/B00L3KO5WK/ref=sr_1_27?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-27" + }, + { + "rank": 29, + "snippet": null, + "timestamp": 1632443655, + "title": "hobbyant NOYAFA 
NF-909Optica\u5149\u6e90\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30c6\u30b9\u30bf\u30fc\u5149\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u30c6\u30b9\u30bf\u30fc\u8996\u899a\u969c\u5bb3\u30ed\u30b1\u30fc\u30bf\u30fc\u30cd\u30c3\u30c8\u30ef\u30fc\u30af\u6a5f\u5668", + "url": "https://www.amazon.co.jp/-/en/dp/B09GV9RCDY/ref=sr_1_28?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-28" + }, + { + "rank": 30, + "snippet": null, + "timestamp": 1632443655, + "title": "FLYPROFiber 10dBLC Fiber Attenator 10dB-2 Pack | dB Options: 3dB, 5dB, 7dB, 10dB, 15dB, 20dB | LC/UPC SM Singlemode Fixed Fiber Attenuator, Male to Female", + "url": "https://www.amazon.co.jp/-/en/gp/slredirect/picassoRedirect.html/ref=pa_sp_mtf_aps_sr_pg1_1?ie=UTF8&adId=A13Y1NK1MNRJHR&url=%2FFLYPROFiber-10dBLC-Attenator-10dB-2-Options%2Fdp%2FB095JY1RCN%2Fref%3Dsr_1_29_sspa%3Fdchild%3D1%26keywords%3D%25E5%2585%2589%25E3%2583%2595%25E3%2582%25A1%25E3%2582%25A4%25E3%2583%2590%25E3%2583%25BC%25E3%2583%2591%25E3%2583%25AF%25E3%2583%25BC%25E3%2583%25A1%25E3%2583%25BC%25E3%2582%25BF%25E3%2583%25BC%26qid%3D1632450856%26sr%3D8-29-spons%26psc%3D1&qualifier=1632450856&id=3198468320158844&widgetName=sp_mtf" + }, + { + "rank": 31, + "snippet": null, + "timestamp": 1632443655, + "title": "FLYPROFiber 5dB 2pcs SC Attenator;Options: 3dB,5dB,7dB,10dB,SC/UPC Singlemode Fixed Fiber Optic Attenator,SC Male to Female,5dB-2 Pack", + "url": 
"https://www.amazon.co.jp/-/en/gp/slredirect/picassoRedirect.html/ref=pa_sp_mtf_aps_sr_pg1_1?ie=UTF8&adId=A2VIIWLSMY6EIT&url=%2FFLYPROFiber-5dB-2pcs-Attenator-Options%2Fdp%2FB095JZG1S2%2Fref%3Dsr_1_30_sspa%3Fdchild%3D1%26keywords%3D%25E5%2585%2589%25E3%2583%2595%25E3%2582%25A1%25E3%2582%25A4%25E3%2583%2590%25E3%2583%25BC%25E3%2583%2591%25E3%2583%25AF%25E3%2583%25BC%25E3%2583%25A1%25E3%2583%25BC%25E3%2582%25BF%25E3%2583%25BC%26qid%3D1632450856%26sr%3D8-30-spons%26psc%3D1&qualifier=1632450856&id=3198468320158844&widgetName=sp_mtf" + }, + { + "rank": 32, + "snippet": null, + "timestamp": 1632443655, + "title": "S & K.LIFE Optical Fiber Cable Connector sc Relay Connector Set of 10 Optical Adapter SC Connector Relay Adapter Single Multimode Relay Adapter", + "url": "https://www.amazon.co.jp/-/en/gp/slredirect/picassoRedirect.html/ref=pa_sp_mtf_aps_sr_pg1_1?ie=UTF8&adId=A6I8N4OX64XLJ&url=%2FK-LIFE-Optical-Connector-Adapter-Multimode%2Fdp%2FB093GP5WQL%2Fref%3Dsr_1_31_sspa%3Fdchild%3D1%26keywords%3D%25E5%2585%2589%25E3%2583%2595%25E3%2582%25A1%25E3%2582%25A4%25E3%2583%2590%25E3%2583%25BC%25E3%2583%2591%25E3%2583%25AF%25E3%2583%25BC%25E3%2583%25A1%25E3%2583%25BC%25E3%2582%25BF%25E3%2583%25BC%26qid%3D1632450856%26sr%3D8-31-spons%26psc%3D1&qualifier=1632450856&id=3198468320158844&widgetName=sp_mtf" + }, + { + "rank": 33, + "snippet": null, + "timestamp": 1632443655, + "title": "hobbyant JW3208A\u30dd\u30fc\u30bf\u30d6\u30eb-70 + 6dBm\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30c6\u30b9\u30bf\u30fc\u5149\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc", + "url": "https://www.amazon.co.jp/-/en/dp/B09GV68KJV/ref=sr_1_32?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-32" + }, + { + "rank": 34, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Extension Cable with SC-SC Extension Adapter (3m)", + "url": 
"https://www.amazon.co.jp/-/en/Fiber-Optic-Extension-Cable-Adapter/dp/B00KJQG3FG/ref=sr_1_33?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-33" + }, + { + "rank": 35, + "snippet": null, + "timestamp": 1632443655, + "title": "Visual Obstacle Locator 30mw30km Optical Cable Tester Equipment Red Optical VFL, with 2.5mm Universal Connector, FC/SC/ST Transfer Pen for Cable TV Communication Engineering Maintenance", + "url": "https://www.amazon.co.jp/-/en/Equipment-Universal-Communication-Engineering-Maintenance/dp/B08YJZGGDZ/ref=sr_1_34?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-34" + }, + { + "rank": 36, + "snippet": null, + "timestamp": 1632443655, + "title": "NTT Fletzo ONU Router Compatible Optical Cable Shutter Type SC Connector Compatible Manufacturer: NTT Docomo Soft Bank au NURO Light and More (9.8 ft (3 m)", + "url": "https://www.amazon.co.jp/-/en/Compatible-Optical-Shutter-Connector-Manufacturer/dp/B07GZLF71R/ref=sr_1_35?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-35" + }, + { + "rank": 37, + "snippet": null, + "timestamp": 1632443655, + "title": "Domestic Optical Fiber Diameter 0.03 inches (0.75 mm) Length 3.3 ft (1 m) 20 Pieces", + "url": "https://www.amazon.co.jp/-/en/Domestic-Optical-Diameter-inches-Length/dp/B08H18T422/ref=sr_1_36?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-36" + }, + { + "rank": 38, + "snippet": null, + "timestamp": 1632443655, + "title": "Visual Fault Locator 30mW30KM, VFL Tester Kit includes Single Mode 9/125um FC Plug-LC Plug Adapter and Red Light Pen for Fiber Optic Network Cable Testing", + 
"url": "https://www.amazon.co.jp/-/en/Locator-30mW30KM-Plug-LC-Adapter-Network/dp/B08YJ8BVSS/ref=sr_1_37?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-37" + }, + { + "rank": 39, + "snippet": null, + "timestamp": 1632443655, + "title": "\u30d5\u30a1\u30a4\u30d0\u30fc\u5149\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u3001\u5c4b\u5916\u7528\u30af\u30a4\u30c3\u30af\u30a4\u30f3\u30b9\u30c8\u30fc\u30eb(\u8efd\u3044\u30d1\u30ef\u30fc)", + "url": "https://www.amazon.co.jp/-/en/dp/B09GNMWHHR/ref=sr_1_38?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-38" + }, + { + "rank": 40, + "snippet": null, + "timestamp": 1632443655, + "title": "Domestic Fiber Optic Diameter 0.5\u00a0mm Length 1\u00a0m Pack of 20", + "url": "https://www.amazon.co.jp/-/en/Domestic-Fiber-Optic-Diameter-Length/dp/B06WV9ZFWZ/ref=sr_1_39?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-39" + }, + { + "rank": 41, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic ST Adapter, Durable Fiber Optic Power Meter ST Adapter for Optical Fiber Power Meter and Light Sources", + "url": "https://www.amazon.co.jp/-/en/Fiber-Adapter-Durable-Optical-Sources/dp/B09B1L25L9/ref=sr_1_40?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-40" + }, + { + "rank": 42, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Power Meter ST Adapter Optical Tester Accessories, Fiber Optic Power Meter ST Adapter Adapter", + "url": 
"https://www.amazon.co.jp/-/en/Fiber-Adapter-Optical-Tester-Accessories/dp/B09CTNS21P/ref=sr_1_41?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-41" + }, + { + "rank": 43, + "snippet": null, + "timestamp": 1632443655, + "title": "MXBAOHENG Portable Optical Power Meter Fiber Tester Mini Fiber Cable Tester 6 Wavelength FC/SC/ST Telecommunications LED Display Cable TV Fiber Optic Test Telecommunication Instrument Measurement (TL563A-v30)", + "url": "https://www.amazon.co.jp/-/en/Wavelength-Telecommunications-Telecommunication-Instrument-Measurement/dp/B099K3DFK7/ref=sr_1_42?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-42" + }, + { + "rank": 44, + "snippet": null, + "timestamp": 1632443655, + "title": "\u91d1\u5c5e\u88fd\u306e\u5149\u30d5\u30a1\u30a4\u30d0\u30fcST\u30a2\u30c0\u30d7\u30bf\u30fc\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u3068\u5149\u6e90\u7528\u306eST\u30a2\u30c0\u30d7\u30bf\u30fc", + "url": "https://www.amazon.co.jp/-/en/dp/B098JKQHDX/ref=sr_1_43?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-43" + }, + { + "rank": 45, + "snippet": null, + "timestamp": 1632443655, + "title": "\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u3001\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30c4\u30fc\u30eb\u3001\u30dd\u30fc\u30bf\u30d6\u30eb\u96fb\u6c17\u30c6\u30b9\u30bf\u30fc\u3001SDI\u30a2\u30ca\u30ed\u30b0MATV\u30b7\u30b9\u30c6\u30e0\u7528FC/SC/ST HDTV\u3001SDI", + "url": "https://www.amazon.co.jp/-/en/dp/B099ZVQWDN/ref=sr_1_44?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-44" + 
}, + { + "rank": 46, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Power Meter, Fiber Optic Cable Tester, Fiber Tester, 15km Wide Wavelength Network Tester, Handheld Portable for HDTV SDI", + "url": "https://www.amazon.co.jp/-/en/Tester-Wavelength-Network-Handheld-Portable/dp/B099Z6MYFN/ref=sr_1_45?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-45" + }, + { + "rank": 47, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Power Meter, Fiber Tester, 15km FC/SC/ST Fiber Optic Cable Tester, Fiber Optic Tool, HDTV and SDI", + "url": "https://www.amazon.co.jp/-/en/Fiber-Optic-Power-Meter-Tester/dp/B099Y7XZWF/ref=sr_1_46?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-46" + }, + { + "rank": 48, + "snippet": null, + "timestamp": 1632443655, + "title": "\u96fb\u6c17\u30c6\u30b9\u30bf\u30fc\u3001\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30c4\u30fc\u30eb\u3001FC/SC/ST\u30d3\u30b8\u30e5\u30a2\u30eb\u30d5\u30a9\u30fc\u30eb\u30c8\u30ed\u30b1\u30fc\u30bf\u30fc\u3001\u30d5\u30a1\u30a4\u30d0\u30fc\u30c6\u30b9\u30bf\u30fc\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u3001HDTV\u3001SDI\u7528", + "url": "https://www.amazon.co.jp/-/en/dp/B099XN1H3K/ref=sr_1_47?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-47" + }, + { + "rank": 49, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Power Meter,Fiber Tester,15km Fiber Optic Tool,Wide Wavelength FC/SC/ST Fiber Optic Cable Tester,Portable for HDTV SDI", + "url": 
"https://www.amazon.co.jp/-/en/Fiber-Optic-Tester-Wavelength-Portable/dp/B098VZFC9G/ref=sr_1_48?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-48" + }, + { + "rank": 50, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Power Meter Tester, High Precision Optical Power Meter, Handheld Optical Power Tester, FC + SC + 2.5mm Universal (LC Option 1.25mm) & 70\u00b0 + 10dbm", + "url": "https://www.amazon.co.jp/-/en/Tester-Precision-Optical-Handheld-Universal/dp/B08XYRN6YH/ref=sr_1_49?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-49" + }, + { + "rank": 51, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Power Meter, 1310/1550nm Dual Wavelength Handheld Fiber Optic Light Source Meter for Fiber Optic Test, FC + SC + 2.5mm Universal Optical Connector", + "url": "https://www.amazon.co.jp/-/en/Wavelength-Handheld-Universal-Optical-Connector/dp/B08MW3YVVX/ref=sr_1_50?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-50" + }, + { + "rank": 52, + "snippet": null, + "timestamp": 1632443655, + "title": "\u5149\u30d1\u30ef\u30fc\u30c6\u30b9\u30bf\u30fc\u5149\u30b1\u30fc\u30d6\u30eb\u30c6\u30b9\u30bf\u30fcJW3218C-US\u30df\u30cb\u9ad8\u7cbe\u5ea6\u5149\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30b1\u30fc\u30d6\u30eb\u30c6\u30b9\u30bf\u30fc\u30c4\u30fc\u30eb800-1700nm", + "url": "https://www.amazon.co.jp/-/en/dp/B09FLPBXXT/ref=sr_1_51?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-51" + }, + { + "rank": 53, + "snippet": null, + "timestamp": 1632443655, + "title": "Funien Fiber Optic 
Power Meter, Rechargeable Optical Power Meter G750 Portable Color LCD Screen Fiber Optic Power Meter with Flash Light 7 Wavelength Compatible", + "url": "https://www.amazon.co.jp/-/en/Rechargeable-Optical-Portable-Wavelength-Compatible/dp/B091CKJLT7/ref=sr_1_52?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-52" + }, + { + "rank": 54, + "snippet": null, + "timestamp": 1632443655, + "title": "OKAYOU Practical Handheld FTTH Optical Fiber Optic Power Meter Fiber Optic Cable Tester Network FC/SC Connector-70~ +30dBm", + "url": "https://www.amazon.co.jp/-/en/OKAYOU-Practical-Handheld-Optical-Connector-70/dp/B08P35Z7LG/ref=sr_1_53?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-53" + }, + { + "rank": 55, + "snippet": null, + "timestamp": 1632443655, + "title": "Fiber Optic Cable Checker for SC 2 Piece Set Bare Fiber Conduction Checker Adapter Plug Measurement Check", + "url": "https://www.amazon.co.jp/-/en/Fiber-Checker-Conduction-Adapter-Measurement/dp/B09BK7JXBL/ref=sr_1_54?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-54" + }, + { + "rank": 56, + "snippet": null, + "timestamp": 1632443655, + "title": "\u30aa\u30fc\u30eb\u30a4\u30f3\u30ef\u30f3\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u3001\u5149\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u3001\u30dd\u30fc\u30bf\u30d6\u30eb\u30aa\u30fc\u30eb\u30a4\u30f3\u30aa\u30fc\u30eb\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30d1\u30ef\u30fc\u30e1\u30fc\u30bf\u30fc\u30d3\u30b8\u30e5\u30a2\u30eb\u30d5\u30a9\u30fc\u30eb\u30c8\u30ed\u30b1\u30fc\u30bf\u30fc\u5149\u30d5\u30a1\u30a4\u30d0\u30fc\u30b1\u30fc\u30d6\u30eb\u30c6\u30b9\u30bf\u30fc\u3001\u30b5\u30a4\u30ba\uff1a\u7d047.1 * 32.3 
* 1.1in", + "url": "https://www.amazon.co.jp/-/en/dp/B09GFLDMJY/ref=sr_1_55?dchild=1&keywords=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&qid=1632450856&sr=8-55" + } + ], + "timestamp": 1632443655, + "url": "https://www.amazon.co.jp/s?k=%E5%85%89%E3%83%95%E3%82%A1%E3%82%A4%E3%83%90%E3%83%BC+%E3%83%91%E3%83%AF%E3%83%BC%E3%83%A1%E3%83%BC%E3%82%BF%E3%83%BC&ref=nb_sb_noss_2" +} diff --git a/data/manual-annotations/archived-raw-serps/expected/amazon-qian-lian-mo-hua-orishinarusauntotoratsuku-1657685343.approved.txt b/data/manual-annotations/archived-raw-serps/expected/amazon-qian-lian-mo-hua-orishinarusauntotoratsuku-1657685343.approved.txt new file mode 100644 index 00000000..10a2597c --- /dev/null +++ b/data/manual-annotations/archived-raw-serps/expected/amazon-qian-lian-mo-hua-orishinarusauntotoratsuku-1657685343.approved.txt @@ -0,0 +1,346 @@ +{ + "interpreted_query": "\u5343\u604b\uff0a\u4e07\u82b1 \u30aa\u30ea\u30b8\u30ca\u30eb\u30b5\u30a6\u30f3\u30c9\u30c8\u30e9\u30c3\u30af", + "offset": null, + "page": null, + "query": "\u5343\u604b\uff0a\u4e07\u82b1 \u30aa\u30ea\u30b8\u30ca\u30eb\u30b5\u30a6\u30f3\u30c9\u30c8\u30e9\u30c3\u30af", + "results": [ + { + "rank": 3, + "snippet": null, + "timestamp": 1657685343, + "title": "\u5343\u604b*\u4e07\u82b1 \u30aa\u30ea\u30b8\u30ca\u30eb\u30fb\u30b5\u30a6\u30f3\u30c9\u30c8\u30e9\u30c3\u30af", + "url": "https://www.amazon.co.jp/-/en/dp/B01N9FNNO0/ref=sr_1_1?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&crid=BFBUJUC89AS1&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&qid=1657692543&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&sr=8-1" + }, + { + "rank": 4, + "snippet": null, + "timestamp": 1657685343, + "title": "\u5343\u604b\uff0a\u4e07\u82b1 
\u30aa\u30ea\u30b8\u30ca\u30eb\u30b5\u30a6\u30f3\u30c9\u30c8\u30e9\u30c3\u30af", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?ref=sr_1_2&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-2" + }, + { + "rank": 5, + "snippet": null, + "timestamp": 1657685343, + "title": "\u5343\u604b*\u4e07\u82b1 \u30ad\u30e3\u30e9\u30af\u30bf\u30fc\u30bd\u30f3\u30b0 Vol.4 \u300cBlue sky\u300d", + "url": "https://www.amazon.co.jp/-/en/dp/B01CSC1J7A/ref=sr_1_3?__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&crid=BFBUJUC89AS1&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&qid=1657692543&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&sr=8-3" + }, + { + "rank": 7, + "snippet": null, + "timestamp": 1657685343, + "title": "\u604b\u3072\u604b\u3075\u7e01", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92SGLJ&ref=sr_1_4&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-4" + }, + { + "rank": 8, + "snippet": null, + "timestamp": 1657685343, + "title": "\u604b\u3072\u604b\u3075\u7e01 <Koto Version>", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92ZH3P&ref=sr_1_5&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-5" + }, + { + "rank": 9, + "snippet": null, + "timestamp": 1657685343, + "title": "\u3075\u305f\u3064\u306e\u5f71", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D91689Y&ref=sr_1_6&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-6" + }, + { + "rank": 10, + "snippet": null, + "timestamp": 1657685343, + "title": "\u30aa\u30ea\u30b8\u30ca\u30eb\u30df\u30cb\u30c9\u30e9\u30de\uff5e\u305d\u306e4\uff5e\u300c\u521d\u3081\u3066\u306e\u4f53\u9a13\u300d", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08CN4GRQX?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08CNN7ZZ7&ref=sr_1_7&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-7" + }, + { + "rank": 11, + "snippet": null, + "timestamp": 1657685343, + "title": 
"\u30aa\u30ea\u30b8\u30ca\u30eb\u30df\u30cb\u30c9\u30e9\u30de\uff5e\u305d\u306e2\uff5e\u300c\u30ef\u30bf\u30b7\u3001\u30cb\u30f3\u30b8\u30e3\u3067\u3059\u300d", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08CNKMYYR?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08CNHQ4CD&ref=sr_1_8&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-8" + }, + { + "rank": 12, + "snippet": null, + "timestamp": 1657685343, + "title": "\u30aa\u30ea\u30b8\u30ca\u30eb\u30df\u30cb\u30c9\u30e9\u30de\uff5e\u305d\u306e3\uff5e\u300c\u543e\u8f29\u306e\u4e00\u65e5\u300d", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08CNJJTFG?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08CNJXSYR&ref=sr_1_9&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-9" + }, + { + "rank": 13, + "snippet": null, + "timestamp": 1657685343, + "title": "\u604b\u3072\u604b\u3075\u7e01 (Game Size)", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92F7L2&ref=sr_1_10&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-10" + }, + { + "rank": 15, + 
"snippet": null, + "timestamp": 1657685343, + "title": "\u604b\u3072\u604b\u3075\u7e01 (Karaoke Version)", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92C3WY&ref=sr_1_11&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-11" + }, + { + "rank": 16, + "snippet": null, + "timestamp": 1657685343, + "title": "\u30aa\u30ea\u30b8\u30ca\u30eb\u30df\u30cb\u30c9\u30e9\u30de\uff5e\u305d\u306e1\uff5e\u300c\u5deb\u5973\u306e\u304a\u4ed5\u4e8b\u300d", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08CNJC32F?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08CNK79RH&ref=sr_1_12&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-12" + }, + { + "rank": 17, + "snippet": null, + "timestamp": 1657685343, + "title": "\u604b\u3072\u604b\u3075\u7e01 (Title Version)", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92CM7C&ref=sr_1_13&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-13" + }, + { + "rank": 18, + "snippet": null, + "timestamp": 1657685343, 
+ "title": "\u3075\u305f\u308a\u3067", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D8ZBN1Z&ref=sr_1_14&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-14" + }, + { + "rank": 19, + "snippet": null, + "timestamp": 1657685343, + "title": "\u611b\u3057\u3055\u3068\u611f\u8b1d\u306e\u6c17\u6301\u3061", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D8ZSLNT&ref=sr_1_15&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-15" + }, + { + "rank": 20, + "snippet": null, + "timestamp": 1657685343, + "title": "GIFT", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92C495&ref=sr_1_16&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-16" + }, + { + "rank": 21, + "snippet": null, + "timestamp": 1657685343, + "title": "\u30ad\u30df\u306e\u3068\u306a\u308a", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D91YL4V&ref=sr_1_17&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-17" + }, + { + "rank": 22, + "snippet": null, + "timestamp": 1657685343, + "title": "Love flower", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D91JZWJ&ref=sr_1_18&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-18" + }, + { + "rank": 23, + "snippet": null, + "timestamp": 1657685343, + "title": "\u82b1\u9ce5\u98a8\u6708", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92BMCB&ref=sr_1_19&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-19" + }, + { + "rank": 24, + "snippet": null, + "timestamp": 1657685343, + "title": "\u8309\u5b50\u306e\u65e5\u5e38 <Instrument Version>", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D91HWW4&ref=sr_1_20&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-20" + }, + { + "rank": 25, + "snippet": null, + "timestamp": 1657685343, + "title": "\u935b\u932c", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D918X2N&ref=sr_1_21&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-21" + }, + { + "rank": 26, + "snippet": null, + "timestamp": 1657685343, + "title": "\u4eca\u6614\u306e\u8857", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D8ZZH1N&ref=sr_1_22&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-22" + }, + { + "rank": 27, + "snippet": null, + "timestamp": 1657685343, + "title": "\u3068\u304a\u308a\u3083\u3093\u305b\uff5e\u7518\u7f8e\u98a8\u6765 <Instrument Version>", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D8ZYXTC&ref=sr_1_23&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-23" + }, + { + "rank": 28, + "snippet": null, + "timestamp": 1657685343, + "title": "\u3068\u304a\u308a\u3083\u3093\u305b\uff5e\u7518\u7f8e\u98a8\u6765 <Quiet Version>", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92ZQRD&ref=sr_1_24&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-24" + }, + { + "rank": 29, + "snippet": null, + "timestamp": 1657685343, + "title": "\u614e\u307e\u3057\u304d\u671d", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92RZ33&ref=sr_1_25&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-25" + }, + { + "rank": 30, + "snippet": null, + "timestamp": 1657685343, + "title": "\u546a\u8a5b", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92N4Q8&ref=sr_1_26&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-26" + }, + { + "rank": 31, + "snippet": null, + "timestamp": 1657685343, + "title": "\u4f1d\u7d71\u3068\u683c\u5f0f", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92M44T&ref=sr_1_27&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-27" + }, + { + "rank": 32, + "snippet": null, + "timestamp": 1657685343, + "title": "\u904b\u547d", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92LSQQ&ref=sr_1_28&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-28" + }, + { + "rank": 33, + "snippet": null, + "timestamp": 1657685343, + "title": "Blue sky <Quiet Version>", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92HW1V&ref=sr_1_29&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-29" + }, + { + "rank": 34, + "snippet": null, + "timestamp": 1657685343, + "title": "GIFT (Game Size)", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92GPH9&ref=sr_1_30&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-30" + }, + { + "rank": 35, + "snippet": null, + "timestamp": 1657685343, + "title": "\u795e\u69d8\u306e\u795d\u798f", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92FGJ4&ref=sr_1_31&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-31" + }, + { + "rank": 36, + "snippet": null, + "timestamp": 1657685343, + "title": "\u30ad\u30df\u306e\u3068\u306a\u308a (Karaoke Version)", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92BR8X&ref=sr_1_32&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-32" + }, + { + "rank": 37, + "snippet": null, + "timestamp": 1657685343, + "title": "\u3075\u305f\u308a\u3067 (Game Size)", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92BMQS&ref=sr_1_33&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-33" + }, + { + "rank": 38, + "snippet": null, + "timestamp": 1657685343, + "title": "\u30ad\u30ba\u30ca\u30d2\u30c8\u30c4 <Quiet Version>", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D92B5N3&ref=sr_1_34&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-34" + }, + { + "rank": 39, + "snippet": null, + "timestamp": 1657685343, + "title": "\u5fcc\u307e\u308f\u3057\u304d\u7d99\u627f", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D929J33&ref=sr_1_35&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-35" + }, + { + "rank": 40, + "snippet": null, + "timestamp": 1657685343, + "title": "\u604b\u3072\u604b\u3075\u7e01 <Piano Version>", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D9299N5&ref=sr_1_36&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-36" + }, + { + "rank": 41, + "snippet": null, + "timestamp": 1657685343, + "title": "\u6c7a\u610f", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D929144&ref=sr_1_37&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-37" + }, + { + "rank": 42, + "snippet": null, + "timestamp": 1657685343, + "title": "\u9811\u5f35\u308b\u304a\u59c9\u3055\u3093\u306f\u597d\u304d\u3067\u3059\u304b\uff1f <Quiet Version>", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D928RZ9&ref=sr_1_38&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-38" + }, + { + "rank": 43, + "snippet": null, + "timestamp": 1657685343, + "title": "\u5f35\u308a\u8a70\u3081\u305f\u6642", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D928LTP&ref=sr_1_39&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-39" + }, + { + "rank": 44, + "snippet": null, + "timestamp": 1657685343, + "title": "\u660e\u93e1\u6b62\u6c34", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D928K1N&ref=sr_1_40&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-40" + }, + { + "rank": 45, + "snippet": null, + "timestamp": 1657685343, + "title": "\u30ad\u30ba\u30ca\u30d2\u30c8\u30c4 <Instrument Version>", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D9249J9&ref=sr_1_41&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-41" + }, + { + "rank": 46, + "snippet": null, + "timestamp": 1657685343, + "title": "\u8eab\u3082\u5fc3\u3082\u2026", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D923FP1&ref=sr_1_42&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-42" + }, + { + "rank": 47, + "snippet": null, + "timestamp": 1657685343, + "title": "\u6727", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D922V2Y&ref=sr_1_43&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-43" + }, + { + "rank": 48, + "snippet": null, + "timestamp": 1657685343, + "title": "Blue sky <Instrument Version>", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D91ZZ4K&ref=sr_1_44&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-44" + }, + { + "rank": 49, + "snippet": null, + "timestamp": 1657685343, + "title": "\u6c17\u306b\u306a\u308b\u5f93\u59b9", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D91ZB5M&ref=sr_1_45&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-45" + }, + { + "rank": 50, + "snippet": null, + "timestamp": 1657685343, + "title": "\u7530\u5fc3\u5c4b", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D91YL4S&ref=sr_1_46&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-46" + }, + { + "rank": 51, + "snippet": null, + "timestamp": 1657685343, + "title": "\u306a\u305c\u306b\u4f55\u6545\uff1f", + "url": 
"https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D91XGC6&ref=sr_1_47&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-47" + }, + { + "rank": 52, + "snippet": null, + "timestamp": 1657685343, + "title": "\u4f55\u3082\u898b\u3048\u306a\u3044", + "url": "https://www.amazon.co.jp/-/en/music/player/albums/B08D9164JT?marketplaceId=A1VC38T7YXB528&musicTerritory=JP&trackAsin=B08D91WSYX&ref=sr_1_48&keywords=%E5%8D%83%E6%81%8B*%E4%B8%87%E8%8A%B1%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&qid=1657692543&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&sr=8-48" + } + ], + "timestamp": 1657685343, + "url": "https://www.amazon.co.jp/s?k=%E5%8D%83%E6%81%8B%EF%BC%8A%E4%B8%87%E8%8A%B1+%E3%82%AA%E3%83%AA%E3%82%B8%E3%83%8A%E3%83%AB%E3%82%B5%E3%82%A6%E3%83%B3%E3%83%89%E3%83%88%E3%83%A9%E3%83%83%E3%82%AF&__mk_zh_CN=%E4%BA%9A%E9%A9%AC%E9%80%8A%E7%BD%91%E7%AB%99&crid=BFBUJUC89AS1&sprefix=%E5%8D%83%E6%81%8B+%E4%B8%87%E8%8A%B1+original+soundtrack%2Caps%2C546&ref=nb_sb_noss" +} diff --git a/data/manual-annotations/archived-raw-serps/expected/amazon-yankumakashin-yanmakasa-to-1625727694.approved.txt b/data/manual-annotations/archived-raw-serps/expected/amazon-yankumakashin-yanmakasa-to-1625727694.approved.txt new file mode 100644 index 00000000..744600bf --- /dev/null +++ b/data/manual-annotations/archived-raw-serps/expected/amazon-yankumakashin-yanmakasa-to-1625727694.approved.txt @@ -0,0 +1,346 @@ +{ + "interpreted_query": 
"\"\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\"|\"\u30e4\u30f3\u30de\u30ac\u30b5\u30fc\u30c9\"", + "offset": null, + "page": null, + "query": "\"\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\"|\"\u30e4\u30f3\u30de\u30ac\u30b5\u30fc\u30c9\"", + "results": [ + { + "rank": 1, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3 2021\u5e74 8/2 \u53f7 [\u96d1\u8a8c]", + "url": "https://www.amazon.co.jp/%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3-2021%E5%B9%B4-%E5%8F%B7-%E9%9B%91%E8%AA%8C/dp/B098JHYF1K/ref=sr_1_1?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-1&x=0&y=0" + }, + { + "rank": 2, + "snippet": null, + "timestamp": 1625727694, + "title": "\u307f\u306a\u307f\u3051(22) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E3%81%BF%E3%81%AA%E3%81%BF%E3%81%91-22-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E6%A1%9C%E5%A0%B4-%E3%82%B3%E3%83%8F%E3%83%AB/dp/4065239885/ref=sr_1_2?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-2&x=0&y=0" + }, + { + "rank": 3, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3 2021\u5e7433\u53f7 [2021\u5e747\u670812\u65e5\u767a\u58f2] [\u96d1\u8a8c]", + "url": 
"https://www.amazon.co.jp/%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3-2021%E5%B9%B433%E5%8F%B7-2021%E5%B9%B47%E6%9C%8812%E6%97%A5%E7%99%BA%E5%A3%B2-%E9%9B%91%E8%AA%8C-%E7%A6%8F%E6%9C%AC%E4%BC%B8%E8%A1%8C-ebook/dp/B098SZYY4F/ref=sr_1_3?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-3&x=0&y=0" + }, + { + "rank": 4, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3 2021\u5e7432\u53f7 [2021\u5e747\u67085\u65e5\u767a\u58f2] [\u96d1\u8a8c]", + "url": "https://www.amazon.co.jp/%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3-2021%E5%B9%B432%E5%8F%B7-2021%E5%B9%B47%E6%9C%885%E6%97%A5%E7%99%BA%E5%A3%B2-%E9%9B%91%E8%AA%8C-%E8%97%A4%E6%B2%A2%E3%81%A8%E3%81%8A%E3%82%8B-ebook/dp/B0989SN8KK/ref=sr_1_4?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-4&x=0&y=0" + }, + { + "rank": 5, + "snippet": null, + "timestamp": 1625727694, + "title": "GTO \u30d1\u30e9\u30c0\u30a4\u30b9\u30fb\u30ed\u30b9\u30c8(16) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/GTO-%E3%83%91%E3%83%A9%E3%83%80%E3%82%A4%E3%82%B9%E3%83%BB%E3%83%AD%E3%82%B9%E3%83%88-16-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E3%81%A8%E3%81%8A%E3%82%8B/dp/4065236762/ref=sr_1_5?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-5&x=0&y=0" + }, + { + "rank": 6, + "snippet": null, + "timestamp": 1625727694, + 
"title": "\u4f55\u5ea6\u3001\u6642\u3092\u304f\u308a\u304b\u3048\u3057\u3066\u3082\u672c\u80fd\u5bfa\u304c\u71c3\u3048\u308b\u3093\u3058\u3083\u304c!?(3) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E4%BD%95%E5%BA%A6%E3%80%81%E6%99%82%E3%82%92%E3%81%8F%E3%82%8A%E3%81%8B%E3%81%88%E3%81%97%E3%81%A6%E3%82%82%E6%9C%AC%E8%83%BD%E5%AF%BA%E3%81%8C%E7%87%83%E3%81%88%E3%82%8B%E3%82%93%E3%81%98%E3%82%83%E3%81%8C-3-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E8%97%A4%E6%9C%AC-%E3%82%B1%E3%83%B3%E3%82%B7/dp/4065243440/ref=sr_1_6?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-6&x=0&y=0" + }, + { + "rank": 7, + "snippet": null, + "timestamp": 1625727694, + "title": "\u5229\u53e3\u306b\u306a\u308b\u306b\u306f\u9752\u3059\u304e\u308b\uff08\uff11\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E5%88%A9%E5%8F%A3%E3%81%AB%E3%81%AA%E3%82%8B%E3%81%AB%E3%81%AF%E9%9D%92%E3%81%99%E3%81%8E%E3%82%8B%EF%BC%88%EF%BC%91%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%A4%A7%E6%B2%BC%E9%9A%86%E6%8F%AE-ebook/dp/B0983674RH/ref=sr_1_7?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-7&x=0&y=0" + }, + { + "rank": 8, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30d0\u30a4\u30aa\u30ec\u30f3\u30b9\u30b8\u30e3\u30c3\u30af20XX(1) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": 
"https://www.amazon.co.jp/%E3%83%90%E3%82%A4%E3%82%AA%E3%83%AC%E3%83%B3%E3%82%B9%E3%82%B8%E3%83%A3%E3%83%83%E3%82%AF20XX-1-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E6%B0%B8%E4%BA%95-%E8%B1%AA/dp/406524353X/ref=sr_1_8?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-8&x=0&y=0" + }, + { + "rank": 9, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30aa\u30da\u770b(3) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E3%82%AA%E3%83%9A%E7%9C%8B-3-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E3%83%9F%E3%82%B5%E3%83%B2/dp/4065239079/ref=sr_1_9?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-9&x=0&y=0" + }, + { + "rank": 10, + "snippet": null, + "timestamp": 1625727694, + "title": "\u5f7c\u5cb8\u5cf6 48\u65e5\u5f8c\u2026(30) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E5%BD%BC%E5%B2%B8%E5%B3%B6-48%E6%97%A5%E5%BE%8C%E2%80%A6-30-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E6%9D%BE%E6%9C%AC/dp/4065243416/ref=sr_1_10?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-10&x=0&y=0" + }, + { + "rank": 11, + "snippet": null, + "timestamp": 1625727694, + "title": 
"\u89e3\u96c7\u3055\u308c\u305f\u6697\u9ed2\u5175\u58eb\uff08\uff13\uff10\u4ee3\uff09\u306e\u30b9\u30ed\u30fc\u306a\u30bb\u30ab\u30f3\u30c9\u30e9\u30a4\u30d5\uff08\uff14\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E8%A7%A3%E9%9B%87%E3%81%95%E3%82%8C%E3%81%9F%E6%9A%97%E9%BB%92%E5%85%B5%E5%A3%AB%EF%BC%88%EF%BC%93%EF%BC%90%E4%BB%A3%EF%BC%89%E3%81%AE%E3%82%B9%E3%83%AD%E3%83%BC%E3%81%AA%E3%82%BB%E3%82%AB%E3%83%B3%E3%83%89%E3%83%A9%E3%82%A4%E3%83%95%EF%BC%88%EF%BC%94%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%B2%A1%E6%B2%A2%E5%85%AD%E5%8D%81%E5%9B%9B-ebook/dp/B096VVM2XT/ref=sr_1_11?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-11&x=0&y=0" + }, + { + "rank": 12, + "snippet": null, + "timestamp": 1625727694, + "title": "\u6e21\u304f\u3093\u306e\u00d7\u00d7\u304c\u5d29\u58ca\u5bf8\u524d(11) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E6%B8%A1%E3%81%8F%E3%82%93%E3%81%AE%C3%97%C3%97%E3%81%8C%E5%B4%A9%E5%A3%8A%E5%AF%B8%E5%89%8D-11-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E9%B3%B4%E8%A6%8B-%E3%81%AA%E3%82%8B/dp/4065243432/ref=sr_1_12?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-12&x=0&y=0" + }, + { + "rank": 13, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30a2\u30eb\u30ad\u30e1\u30c7\u30b9\u306e\u5927\u6226(25) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": 
"https://www.amazon.co.jp/%E3%82%A2%E3%83%AB%E3%82%AD%E3%83%A1%E3%83%87%E3%82%B9%E3%81%AE%E5%A4%A7%E6%88%A6-25-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E4%B8%89%E7%94%B0-%E7%B4%80%E6%88%BF/dp/4065243378/ref=sr_1_13?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-13&x=0&y=0" + }, + { + "rank": 14, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30d1\u30e9\u30ec\u30eb\u30d1\u30e9\u30c0\u30a4\u30b9(1) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E3%83%91%E3%83%A9%E3%83%AC%E3%83%AB%E3%83%91%E3%83%A9%E3%83%80%E3%82%A4%E3%82%B9-1-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E5%B2%A1%E6%9C%AC-%E5%80%AB/dp/406510095X/ref=sr_1_14?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-14&x=0&y=0" + }, + { + "rank": 15, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30bb\u30f3\u30b4\u30af\u6a29\u5175\u885b(24) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E3%82%BB%E3%83%B3%E3%82%B4%E3%82%AF%E6%A8%A9%E5%85%B5%E8%A1%9B-24-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E5%AE%AE%E4%B8%8B-%E8%8B%B1%E6%A8%B9/dp/4065243424/ref=sr_1_15?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-15&x=0&y=0" + }, + { + "rank": 16, + "snippet": null, + "timestamp": 1625727694, + "title": 
"\u6065\u3058\u3089\u3046\u541b\u304c\u898b\u305f\u3044\u3093\u3060(1) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E6%81%A5%E3%81%98%E3%82%89%E3%81%86%E5%90%9B%E3%81%8C%E8%A6%8B%E3%81%9F%E3%81%84%E3%82%93%E3%81%A0-1-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E7%94%9C%E7%B1%B3-%E3%82%89%E3%81%8F%E3%82%8C/dp/4065236916/ref=sr_1_16?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-16&x=0&y=0" + }, + { + "rank": 17, + "snippet": null, + "timestamp": 1625727694, + "title": "\u6e80\u5dde\u30a2\u30d8\u30f3\u30b9\u30af\u30ef\u30c3\u30c9(5) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E6%BA%80%E5%B7%9E%E3%82%A2%E3%83%98%E3%83%B3%E3%82%B9%E3%82%AF%E3%83%AF%E3%83%83%E3%83%89-5-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E9%B9%BF%E5%AD%90/dp/4065243394/ref=sr_1_17?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-17&x=0&y=0" + }, + { + "rank": 18, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3 2021\u5e7431\u53f7 [2021\u5e746\u670828\u65e5\u767a\u58f2] [\u96d1\u8a8c]", + "url": 
"https://www.amazon.co.jp/%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3-2021%E5%B9%B431%E5%8F%B7-2021%E5%B9%B46%E6%9C%8828%E6%97%A5%E7%99%BA%E5%A3%B2-%E9%9B%91%E8%AA%8C-%E5%AE%AE%E4%B8%8B%E8%8B%B1%E6%A8%B9-ebook/dp/B097R6CC6N/ref=sr_1_18?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-18&x=0&y=0" + }, + { + "rank": 19, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30a2\u30ab\u30a4\u30ea\u30f3\u30b4(3) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E3%82%A2%E3%82%AB%E3%82%A4%E3%83%AA%E3%83%B3%E3%82%B4-3-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E3%83%A0%E3%83%A9%E3%82%BF-%E3%82%B3%E3%82%A6%E3%82%B8/dp/4065243467/ref=sr_1_19?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-19&x=0&y=0" + }, + { + "rank": 20, + "snippet": null, + "timestamp": 1625727694, + "title": "\u4e9c\u4eba\u3061\u3083\u3093\u306f\u8a9e\u308a\u305f\u3044\uff08\uff12\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E4%BA%9C%E4%BA%BA%E3%81%A1%E3%82%83%E3%82%93%E3%81%AF%E8%AA%9E%E3%82%8A%E3%81%9F%E3%81%84%EF%BC%88%EF%BC%92%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E3%83%9A%E3%83%88%E3%82%B9-ebook/dp/B014INFX8I/ref=sr_1_20?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-20&x=0&y=0" + }, + { 
+ "rank": 21, + "snippet": null, + "timestamp": 1625727694, + "title": "\u624b\u54c1\u5148\u8f29 \u5148\u8f29 \u30d0\u30cb\u30fcVer. 1/4\u30b9\u30b1\u30fc\u30eb PVC\u88fd \u5857\u88c5\u6e08\u307f\u5b8c\u6210\u54c1\u30d5\u30a3\u30ae\u30e5\u30a2", + "url": "https://www.amazon.co.jp/%E6%89%8B%E5%93%81%E5%85%88%E8%BC%A9-%E3%83%90%E3%83%8B%E3%83%BCVer-4%E3%82%B9%E3%82%B1%E3%83%BC%E3%83%AB-PVC%E8%A3%BD-%E5%A1%97%E8%A3%85%E6%B8%88%E3%81%BF%E5%AE%8C%E6%88%90%E5%93%81%E3%83%95%E3%82%A3%E3%82%AE%E3%83%A5%E3%82%A2/dp/B08YYNFPCN/ref=sr_1_21?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-21&x=0&y=0" + }, + { + "rank": 22, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b5\u30bf\u30ce\u30d5\u30a1\u30cb(18) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E3%82%B5%E3%82%BF%E3%83%8E%E3%83%95%E3%82%A1%E3%83%8B-18-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E5%B1%B1%E7%94%B0-%E6%81%B5%E5%BA%B8/dp/4065243459/ref=sr_1_22?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-22&x=0&y=0" + }, + { + "rank": 23, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3 2021\u5e7430\u53f7 [2021\u5e746\u670821\u65e5\u767a\u58f2] [\u96d1\u8a8c]", + "url": 
"https://www.amazon.co.jp/%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3-2021%E5%B9%B430%E5%8F%B7-2021%E5%B9%B46%E6%9C%8821%E6%97%A5%E7%99%BA%E5%A3%B2-%E9%9B%91%E8%AA%8C-%E6%96%B0%E4%BA%95%E6%98%A5%E5%B7%BB-ebook/dp/B097BH291X/ref=sr_1_23?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-23&x=0&y=0" + }, + { + "rank": 24, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff11\uff13\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%91%EF%BC%93%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B079YN7N5W/ref=sr_1_24?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-24&x=0&y=0" + }, + { + "rank": 25, + "snippet": null, + "timestamp": 1625727694, + "title": "\uff2b\uff49\uff53\uff53\u00d7\uff53\uff49\uff53\u3000\u5f1f\u306b\u30ad\u30b9\u3057\u3061\u3083\u30c0\u30e1\u3067\u3059\u304b\uff1f\uff08\uff12\uff14\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": 
"https://www.amazon.co.jp/%EF%BC%AB%EF%BD%89%EF%BD%93%EF%BD%93%C3%97%EF%BD%93%EF%BD%89%EF%BD%93-%E5%BC%9F%E3%81%AB%E3%82%AD%E3%82%B9%E3%81%97%E3%81%A1%E3%82%83%E3%83%80%E3%83%A1%E3%81%A7%E3%81%99%E3%81%8B%EF%BC%9F%EF%BC%88%EF%BC%92%EF%BC%94%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E3%81%A2%E3%81%9F%E3%81%BE%E6%9F%90-ebook/dp/B096VX2Y12/ref=sr_1_25?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-25&x=0&y=0" + }, + { + "rank": 26, + "snippet": null, + "timestamp": 1625727694, + "title": "\u864e\u9dab \u3068\u3089\u3064\u3050\u307f -TSUGUMI PROJECT-(2) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E8%99%8E%E9%B6%AB-%E3%81%A8%E3%82%89%E3%81%A4%E3%81%90%E3%81%BF-TSUGUMI-PROJECT-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB/dp/4065238420/ref=sr_1_26?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-26&x=0&y=0" + }, + { + "rank": 27, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30c6\u30f3\u30ab\u30a4\u30c1\u3000\u65e5\u672c\u6700\u5f37\u6b66\u82b8\u8005\u6c7a\u5b9a\u6226\uff08\uff11\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": 
"https://www.amazon.co.jp/%E3%83%86%E3%83%B3%E3%82%AB%E3%82%A4%E3%83%81-%E6%97%A5%E6%9C%AC%E6%9C%80%E5%BC%B7%E6%AD%A6%E8%8A%B8%E8%80%85%E6%B1%BA%E5%AE%9A%E6%88%A6%EF%BC%88%EF%BC%91%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E3%81%82%E3%81%9A%E3%81%BE%E4%BA%AC%E5%A4%AA%E9%83%8E-ebook/dp/B096VYC2VQ/ref=sr_1_27?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-27&x=0&y=0" + }, + { + "rank": 28, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30bb\u30d6\u30f3\u2606\u30b9\u30bf\u30fcJT(3) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E3%82%BB%E3%83%96%E3%83%B3%E2%98%86%E3%82%B9%E3%82%BF%E3%83%BCJT-3-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E6%9F%B3%E5%86%85-%E5%A4%A7%E6%A8%B9/dp/406524336X/ref=sr_1_28?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-28&x=0&y=0" + }, + { + "rank": 29, + "snippet": null, + "timestamp": 1625727694, + "title": "\u5927\u304d\u304f\u306a\u3063\u305f\u3089\u7d50\u5a5a\u3059\u308b\uff01\uff08\uff13\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": 
"https://www.amazon.co.jp/%E5%A4%A7%E3%81%8D%E3%81%8F%E3%81%AA%E3%81%A3%E3%81%9F%E3%82%89%E7%B5%90%E5%A9%9A%E3%81%99%E3%82%8B%EF%BC%81%EF%BC%88%EF%BC%93%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E6%B7%B1%E5%86%AC%E3%81%B5%E3%81%BF-ebook/dp/B096VVGKDM/ref=sr_1_29?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-29&x=0&y=0" + }, + { + "rank": 30, + "snippet": null, + "timestamp": 1625727694, + "title": "\u53cc\u751f\u904a\u622f(1) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E5%8F%8C%E7%94%9F%E9%81%8A%E6%88%AF-1-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E5%B2%A1%E7%94%B0-%E6%B7%B3%E5%8F%B8/dp/4065250935/ref=sr_1_30?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-30&x=0&y=0" + }, + { + "rank": 31, + "snippet": null, + "timestamp": 1625727694, + "title": "\u5973\u795e\u306e\u30b9\u30d7\u30ea\u30f3\u30bf\u30fc\uff08\uff16\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": 
"https://www.amazon.co.jp/%E5%A5%B3%E7%A5%9E%E3%81%AE%E3%82%B9%E3%83%97%E3%83%AA%E3%83%B3%E3%82%BF%E3%83%BC%EF%BC%88%EF%BC%96%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8E%9F%E7%94%B0%E9%87%8D%E5%85%89-ebook/dp/B096M5JXHD/ref=sr_1_31?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-31&x=0&y=0" + }, + { + "rank": 32, + "snippet": null, + "timestamp": 1625727694, + "title": "\u864e\u9dab\u3000\u3068\u3089\u3064\u3050\u307f\u3000\uff0d\uff34\uff33\uff35\uff27\uff35\uff2d\uff29\u3000\uff30\uff32\uff2f\uff2a\uff25\uff23\uff34\uff0d\uff08\uff11\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E8%99%8E%E9%B6%AB-%E3%81%A8%E3%82%89%E3%81%A4%E3%81%90%E3%81%BF-%EF%BC%8D%EF%BC%B4%EF%BC%B3%EF%BC%B5%EF%BC%A7%EF%BC%B5%EF%BC%AD%EF%BC%A9-%EF%BC%B0%EF%BC%B2%EF%BC%AF%EF%BC%AA%EF%BC%A5%EF%BC%A3%EF%BC%B4%EF%BC%8D%EF%BC%88%EF%BC%91%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%EF%BD%89%EF%BD%90%EF%BD%90%EF%BD%81%EF%BD%94%EF%BD%95-ebook/dp/B095WCBJXZ/ref=sr_1_32?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-32&x=0&y=0" + }, + { + "rank": 33, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff11\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": 
"https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%91%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B00URE8W0E/ref=sr_1_33?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-33&x=0&y=0" + }, + { + "rank": 34, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff12\uff12\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%92%EF%BC%92%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B0895NRVTN/ref=sr_1_34?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-34&x=0&y=0" + }, + { + "rank": 35, + "snippet": null, + "timestamp": 1625727694, + "title": "\u4e9c\u4eba\u3061\u3083\u3093\u306f\u8a9e\u308a\u305f\u3044\uff08\uff11\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": 
"https://www.amazon.co.jp/%E4%BA%9C%E4%BA%BA%E3%81%A1%E3%82%83%E3%82%93%E3%81%AF%E8%AA%9E%E3%82%8A%E3%81%9F%E3%81%84%EF%BC%88%EF%BC%91%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E3%83%9A%E3%83%88%E3%82%B9-ebook/dp/B00U23WV3E/ref=sr_1_35?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-35&x=0&y=0" + }, + { + "rank": 36, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff13\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%93%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B015QZP3A0/ref=sr_1_36?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-36&x=0&y=0" + }, + { + "rank": 37, + "snippet": null, + "timestamp": 1625727694, + "title": "\u6708\u520a\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3 2021\u5e74 8/6 \u53f7 [\u96d1\u8a8c]: \u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3 \u5897\u520a", + "url": 
"https://www.amazon.co.jp/%E6%9C%88%E5%88%8A%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3-2021%E5%B9%B4-%E5%8F%B7-%E9%9B%91%E8%AA%8C-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3/dp/B098JVZLNJ/ref=sr_1_37?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-37&x=0&y=0" + }, + { + "rank": 38, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff12\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%92%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B010ABFLRU/ref=sr_1_38?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-38&x=0&y=0" + }, + { + "rank": 39, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff12\uff11\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%92%EF%BC%91%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B08518YY3Z/ref=sr_1_39?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-39&x=0&y=0" + }, + { + 
"rank": 40, + "snippet": null, + "timestamp": 1625727694, + "title": "\u6708\u520a\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3 2021\u5e74No.7 [2021\u5e746\u670817\u65e5\u767a\u58f2] [\u96d1\u8a8c]", + "url": "https://www.amazon.co.jp/%E6%9C%88%E5%88%8A%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3-2021%E5%B9%B4No-7-2021%E5%B9%B46%E6%9C%8817%E6%97%A5%E7%99%BA%E5%A3%B2-%E9%9B%91%E8%AA%8C-%E5%A4%A9%E9%87%8E%E9%9B%80-ebook/dp/B0976YP3T9/ref=sr_1_40?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-40&x=0&y=0" + }, + { + "rank": 41, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30d6\u30af\u30ed\u30ad\u30c3\u30af\u30b9(5) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": "https://www.amazon.co.jp/%E3%83%96%E3%82%AF%E3%83%AD%E3%82%AD%E3%83%83%E3%82%AF%E3%82%B9-5-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-%E6%9D%BE%E6%9C%A8-%E3%81%84%E3%81%A3%E3%81%8B/dp/4065243351/ref=sr_1_41?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-41&x=0&y=0" + }, + { + "rank": 42, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff11\uff16\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": 
"https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%91%EF%BC%96%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B07KY1HRNH/ref=sr_1_42?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-42&x=0&y=0" + }, + { + "rank": 43, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff14\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%94%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B018FSH5PG/ref=sr_1_43?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-43&x=0&y=0" + }, + { + "rank": 44, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30d5\u30b8\u30df\u6a21\u578b 1/24 \u982d\u6587\u5b57D\u30b7\u30ea\u30fc\u30ba No.7 180SX \u5065\u4e8c \u30d7\u30e9\u30e2\u30c7\u30eb ISD7", + "url": "https://www.amazon.co.jp/%E3%83%95%E3%82%B8%E3%83%9F%E6%A8%A1%E5%9E%8B-%E9%A0%AD%E6%96%87%E5%AD%97D%E3%82%B7%E3%83%AA%E3%83%BC%E3%82%BA-No-7-180SX-%E3%83%97%E3%83%A9%E3%83%A2%E3%83%87%E3%83%AB/dp/B07CCNB3QX/ref=sr_1_44?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-44&x=0&y=0" + }, + { + "rank": 45, + 
"snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff11\uff15\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%91%EF%BC%95%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B07GSW259C/ref=sr_1_45?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-45&x=0&y=0" + }, + { + "rank": 46, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30d1\u30e9\u30ec\u30eb\u30d1\u30e9\u30c0\u30a4\u30b9\uff08\uff11\uff14\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E3%83%91%E3%83%A9%E3%83%AC%E3%83%AB%E3%83%91%E3%83%A9%E3%83%80%E3%82%A4%E3%82%B9%EF%BC%88%EF%BC%91%EF%BC%94%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%B2%A1%E6%9C%AC%E5%80%AB-ebook/dp/B095W9DHX9/ref=sr_1_46?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-46&x=0&y=0" + }, + { + "rank": 47, + "snippet": null, + "timestamp": 1625727694, + "title": "\u7ae5\u8c9e\u567a(3) (\u30e4\u30f3\u30de\u30acKC\u30b9\u30da\u30b7\u30e3\u30eb)", + "url": 
"https://www.amazon.co.jp/%E7%AB%A5%E8%B2%9E%E5%99%BA-3-%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%ACKC%E3%82%B9%E3%83%9A%E3%82%B7%E3%83%A3%E3%83%AB-Gino0808/dp/4065243548/ref=sr_1_47?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-47&x=0&y=0" + }, + { + "rank": 48, + "snippet": null, + "timestamp": 1625727694, + "title": "\u30b6\u30fb\u30d5\u30a1\u30d6\u30eb\uff08\uff11\uff11\uff09 (\u30e4\u30f3\u30b0\u30de\u30ac\u30b8\u30f3\u30b3\u30df\u30c3\u30af\u30b9)", + "url": "https://www.amazon.co.jp/%E3%82%B6%E3%83%BB%E3%83%95%E3%82%A1%E3%83%96%E3%83%AB%EF%BC%88%EF%BC%91%EF%BC%91%EF%BC%89-%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%E3%82%B3%E3%83%9F%E3%83%83%E3%82%AF%E3%82%B9-%E5%8D%97%E5%8B%9D%E4%B9%85-ebook/dp/B075676NB3/ref=sr_1_48?dchild=1&keywords=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&linkCode=ll2&linkId=76c8bdecef23a7536b9c5ec384e267da&qid=1625734894&sr=8-48&x=0&y=0" + } + ], + "timestamp": 1625727694, + "url": "https://www.amazon.co.jp/s?k=%22%E3%83%A4%E3%83%B3%E3%82%B0%E3%83%9E%E3%82%AC%E3%82%B8%E3%83%B3%22%7C%22%E3%83%A4%E3%83%B3%E3%83%9E%E3%82%AC%E3%82%B5%E3%83%BC%E3%83%89%22&x=0&y=0&linkCode=ll2&tag=rapfoodnewe0c-22&linkId=76c8bdecef23a7536b9c5ec384e267da&language=ja_JP&ref_=as_li_ss_tl" +} diff --git a/data/manual-annotations/archived-raw-serps/expected/baidu-sexinsex1-2-1213858525.approved.txt b/data/manual-annotations/archived-raw-serps/expected/baidu-sexinsex1-2-1213858525.approved.txt new file mode 100644 index 00000000..3a2f4e25 --- /dev/null +++ b/data/manual-annotations/archived-raw-serps/expected/baidu-sexinsex1-2-1213858525.approved.txt @@ -0,0 +1,45 @@ +{ + "interpreted_query": "sexinsex\u5927\u9646\u514d\u8d39\u5165\u53e3", + "offset": null, + "page": 
null, + "query": "sexinsex\ufffd\ufffd\u00bd\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", + "results": [ + { + "rank": 1, + "snippet": null, + "timestamp": 1213858525, + "title": "sexinsex \u8272\u4e2d\u8272\u8bba\u575b\u7f51\u7ad9,\u8272\u4e2d\u8272\u8bba\u575b\u5927\u9646\u5165\u53e3,\u8272\u4e2d\u8272\u8bba\u575b\u5e10\u53f7\u514d\u8d39..", + "url": "http://seinse.net.cn/" + }, + { + "rank": 2, + "snippet": null, + "timestamp": 1213858525, + "title": "\u2261 \u90a3\u65f6\u82b1\u5f00\u2261 - \u6d3b\u529b\u57ce\u5e02 (\u6d3b\u529b\u57ce\u5e02\u793e\u533a&\u6d3b\u529b\u57ce\u5e02\u8bba\u575b)\uff0d\u514d\u8d39\u804a...", + "url": "http://www.chenji.org/index.php?gid=25" + }, + { + "rank": 3, + "snippet": null, + "timestamp": 1213858525, + "title": "\u6e29\u5dde\u5a77\u5a77\u4e94\u6708\u5929-----11sss\u827a\u672f\u5929\u7a7a\u793e\u533a", + "url": "http://www.134qq.cn/" + }, + { + "rank": 4, + "snippet": null, + "timestamp": 1213858525, + "title": "StartAid Sitemap Page 1707 of 1708", + "url": "http://www.startaid.com/sitemaps/page-1707.htm" + }, + { + "rank": 5, + "snippet": null, + "timestamp": 1213858525, + "title": "sexinsex \u8272\u4e2d\u8272\u8bba\u575b\u7f51\u7ad9,\u8272\u4e2d\u8272\u8bba\u575b\u5927\u9646\u5165\u53e3,\u8272\u4e2d\u8272\u8bba\u575b\u5e10\u53f7\u514d\u8d39..", + "url": "http://www.seinse.net.cn/" + } + ], + "timestamp": 1213858525, + "url": "http://www.baidu.com:80/s?wd=sexinsex%B4%F3%C2%BD%C3%E2%B7%D1%C8%EB%BF%DA&lm=0&si=&rn=10&ie=gb2312&ct=0&cl=3&f=1&rsp=9" +} diff --git a/data/manual-annotations/archived-raw-serps/expected/google-a-tumeo-m-branca-l-camerini-a-dual-priority-realtime-multiprocessor-system-on-fpga-for-automotive-ap-1614181186.approved.txt b/data/manual-annotations/archived-raw-serps/expected/google-scholar-a-tumeo-m-branca-l-camerini-a-dual-priority-realtime-multiprocessor-system-on-fpga-for-automotive-ap-1614181186.approved.txt similarity index 100% rename from 
data/manual-annotations/archived-raw-serps/expected/google-a-tumeo-m-branca-l-camerini-a-dual-priority-realtime-multiprocessor-system-on-fpga-for-automotive-ap-1614181186.approved.txt rename to data/manual-annotations/archived-raw-serps/expected/google-scholar-a-tumeo-m-branca-l-camerini-a-dual-priority-realtime-multiprocessor-system-on-fpga-for-automotive-ap-1614181186.approved.txt diff --git a/data/manual-annotations/archived-raw-serps/expected/google-muhammed-rashid-1656890873.approved.txt b/data/manual-annotations/archived-raw-serps/expected/google-scholar-muhammed-rashid-1656890873.approved.txt similarity index 100% rename from data/manual-annotations/archived-raw-serps/expected/google-muhammed-rashid-1656890873.approved.txt rename to data/manual-annotations/archived-raw-serps/expected/google-scholar-muhammed-rashid-1656890873.approved.txt diff --git a/data/manual-annotations/archived-raw-serps/expected/google-w-fan-j-li-s-ma-n-tang-and-w-yu-april-2012-towards-certain-fixes-with-editing-rules-and-master-data-1614165399.approved.txt b/data/manual-annotations/archived-raw-serps/expected/google-scholar-w-fan-j-li-s-ma-n-tang-and-w-yu-april-2012-towards-certain-fixes-with-editing-rules-and-master-data-1614165399.approved.txt similarity index 100% rename from data/manual-annotations/archived-raw-serps/expected/google-w-fan-j-li-s-ma-n-tang-and-w-yu-april-2012-towards-certain-fixes-with-editing-rules-and-master-data-1614165399.approved.txt rename to data/manual-annotations/archived-raw-serps/expected/google-scholar-w-fan-j-li-s-ma-n-tang-and-w-yu-april-2012-towards-certain-fixes-with-editing-rules-and-master-data-1614165399.approved.txt diff --git a/data/manual-annotations/archived-raw-serps/expected/google-wan-y-menon-s-and-ramaprasad-a-2009-the-paradoxical-nature-of-electronic-decision-aids-on-comparison-1614187144.approved.txt 
b/data/manual-annotations/archived-raw-serps/expected/google-scholar-wan-y-menon-s-and-ramaprasad-a-2009-the-paradoxical-nature-of-electronic-decision-aids-on-comparison-1614187144.approved.txt similarity index 100% rename from data/manual-annotations/archived-raw-serps/expected/google-wan-y-menon-s-and-ramaprasad-a-2009-the-paradoxical-nature-of-electronic-decision-aids-on-comparison-1614187144.approved.txt rename to data/manual-annotations/archived-raw-serps/expected/google-scholar-wan-y-menon-s-and-ramaprasad-a-2009-the-paradoxical-nature-of-electronic-decision-aids-on-comparison-1614187144.approved.txt diff --git a/data/manual-annotations/archived-raw-serps/expected/google-w-fan-j-li-s-ma-n-tang-and-w-yu-april-2012-towards-certain-fixes-with-editing-rules-and-master-data-the-vldb-journal-21-2-213-238-297-10-1007-s00778-011-0253-7-1614165399.approved.txt b/data/manual-annotations/archived-raw-serps/expected/google-w-fan-j-li-s-ma-n-tang-and-w-yu-april-2012-towards-certain-fixes-with-editing-rules-and-master-data-the-vldb-journal-21-2-213-238-297-10-1007-s00778-011-0253-7-1614165399.approved.txt deleted file mode 100644 index 88d1c884..00000000 --- a/data/manual-annotations/archived-raw-serps/expected/google-w-fan-j-li-s-ma-n-tang-and-w-yu-april-2012-towards-certain-fixes-with-editing-rules-and-master-data-the-vldb-journal-21-2-213-238-297-10-1007-s00778-011-0253-7-1614165399.approved.txt +++ /dev/null @@ -1,9 +0,0 @@ -{ - "interpreted_query": "W. Fan, J. Li, S. Ma, N. Tang, and W. Yu. April 2012. Towards certain fixes with editing rules and master data. The VLDB Journal, 21(2): 213--238. 297 10.1007/s00778-011-0253-7 ", - "offset": null, - "page": null, - "query": "W. Fan, J. Li, S. Ma, N. Tang, and W. Yu. April 2012. Towards certain fixes with editing rules and master data. The VLDB Journal, 21(2): 213--238. 
297 10.1007/s00778-011-0253-7", - "results": [], - "timestamp": 1614165399, - "url": "https://scholar.google.com/scholar?hl=en&q=W.+Fan%2C+J.+Li%2C+S.+Ma%2C+N.+Tang%2C+and+W.+Yu.+April+2012.+Towards+certain+fixes+with+editing+rules+and+master+data.+The+VLDB+Journal%2C+21(2)%3A+213--238.+297+10.1007%2Fs00778-011-0253-7+" -} diff --git a/data/manual-annotations/archived-raw-serps/expected/imdb-angelina-jolie-1452700725.approved.txt b/data/manual-annotations/archived-raw-serps/expected/imdb-angelina-jolie-1452700725.approved.txt index 251c09a8..b004992f 100644 --- a/data/manual-annotations/archived-raw-serps/expected/imdb-angelina-jolie-1452700725.approved.txt +++ b/data/manual-annotations/archived-raw-serps/expected/imdb-angelina-jolie-1452700725.approved.txt @@ -1,5 +1,5 @@ { - "interpreted_query": "nv_sr_fn", + "interpreted_query": "Angelina Jolie", "offset": null, "page": null, "query": "Angelina Jolie", diff --git a/data/manual-annotations/archived-raw-serps/expected/imdb-hunger-games-1518585690.approved.txt b/data/manual-annotations/archived-raw-serps/expected/imdb-hunger-games-1518585690.approved.txt index eefb8c06..f1b623f5 100644 --- a/data/manual-annotations/archived-raw-serps/expected/imdb-hunger-games-1518585690.approved.txt +++ b/data/manual-annotations/archived-raw-serps/expected/imdb-hunger-games-1518585690.approved.txt @@ -1,5 +1,5 @@ { - "interpreted_query": "nv_sr_fn", + "interpreted_query": "hunger games", "offset": null, "page": null, "query": "hunger games", diff --git a/data/manual-annotations/archived-raw-serps/expected/imdb-sam-claflin-1472223834.approved.txt b/data/manual-annotations/archived-raw-serps/expected/imdb-sam-claflin-1472223834.approved.txt index d76d7b72..28cb20d3 100644 --- a/data/manual-annotations/archived-raw-serps/expected/imdb-sam-claflin-1472223834.approved.txt +++ b/data/manual-annotations/archived-raw-serps/expected/imdb-sam-claflin-1472223834.approved.txt @@ -1,5 +1,5 @@ { - "interpreted_query": "nv_sr_fn", + 
"interpreted_query": "Sam Claflin", "offset": null, "page": null, "query": "Sam Claflin", diff --git a/data/manual-annotations/archived-raw-serps/expected/imdb-the-expanse-1521743964.approved.txt b/data/manual-annotations/archived-raw-serps/expected/imdb-the-expanse-1521743964.approved.txt index 1f9984f0..42cd92c7 100644 --- a/data/manual-annotations/archived-raw-serps/expected/imdb-the-expanse-1521743964.approved.txt +++ b/data/manual-annotations/archived-raw-serps/expected/imdb-the-expanse-1521743964.approved.txt @@ -1,5 +1,5 @@ { - "interpreted_query": "nv_sr_fn", + "interpreted_query": "the expanse", "offset": null, "page": null, "query": "the expanse", diff --git a/data/manual-annotations/archived-raw-serps/expected/sogou-eed1-4a1-a-1332923188.approved.txt b/data/manual-annotations/archived-raw-serps/expected/sogou-eed1-4a1-a-1332923188.approved.txt new file mode 100644 index 00000000..a51883cf --- /dev/null +++ b/data/manual-annotations/archived-raw-serps/expected/sogou-eed1-4a1-a-1332923188.approved.txt @@ -0,0 +1,80 @@ +{ + "interpreted_query": "\u5364\u8120\u8120\u5192\u5f55\u964b\u9e7f\u8def\u8115\u8d42", + "offset": null, + "page": null, + "query": "\u00b1\u00c8\u00c8\u00f0\u00bc\u00aa\u00b9\u00b7\u00c1\u00b8", + "results": [ + { + "rank": 1, + "snippet": "\u8130\u813a\u8120\u5a92A\u9e7f\u8121\u8122\u8128\u9c81\u9686\u8124\u9646\u788c\u8133\u7984\u8134\u8121\u5a92\uff0c\u8130\u6885\u812a\u964b\u9e7f\u8121\u8130\u8d42\u8def\u8133\u8def\u8133\u8122\u812e\u812e\u811f\u9686\u62e2\u811c... 
\u9732\u813f\u8def\u9646\u812d\u8137\u8126\u83bd\u6f5e\u8d38\u8125\u7984\u8120\u7984\u5364\u5362\u8def\u5784\uff0c\u8122\u8128\u9c81\u9686\u8120\u8120\u788c\u832b\u8129\u8d42\u5507\u8135\u810c\u6f0f\u9e93\u8d38\uff0c\u8130\u8128\u8128...", + "timestamp": 1332923188, + "title": "\u7984\u5a04\u8130\u8d42\u788c\u8134\u8115\u9a74\u622e\u964b\u8124\u77db\u80eb\u5fd9\u8133\u964b \u5364\u5362\u8def\u5784or\u5507\u810c\u8125\u679a\u8119\u6885\u8120\u812e\u7984...", + "url": "http://biz.cn.yahoo.com/10-06-/202/xdyr.html" + }, + { + "rank": 2, + "snippet": "\u7ecf\u8425\u6a21\u5f0f\uff1a \u751f\u4ea7\u5546\u6240\u5728\u5730\uff1a \u6c5f\u82cf\u7701\u4f01\u4e1a\u4ecb\u7ecd\u4f01\u4e1a\u6027\u8d28\uff1a -- \u4f01\u4e1a\u7c7b\u578b\uff1a \u751f\u4ea7\u5546\u5e74\u8425\u4e1a\u989d\uff1a -- \u6cd5\u5b9a\u4ee3\u8868\u4eba/\u8d1f\u8d23\u4eba\uff1a \u8128\u77db\u63b3\u5e90\u8119\u5e3d\u5364\u5a04\u8131\u5a04\u7743\u8134\u6f5e\u6bdb\u8129\u8302\u9e7f\u9647\u9c81\u8124\u5507\u813a\u9646\u6f5e\u8131\u8128\u7743...", + "timestamp": 1332923188, + "title": "\u5507\u82a6\u8117\u813b\u8d42\u5507\u5f55\u8def\u9c81\u679a\u7984\u7164\u9686\u5784\u9646\u6f5e\u9e7f\u73ab\u9686\u5784\u9c81\u8302\u8117\u6bdb\u8115\u7743\u788c\u8120,\u8d42\u6885...", + "url": "http://www.net114.com/765576531/" + }, + { + "rank": 3, + "snippet": "\u7ecf\u8425\u6a21\u5f0f\uff1a \u751f\u4ea7\u5546\u6240\u5728\u5730\uff1a \u6c5f\u82cf\u7701\u626c\u5dde\u5e02\u4f01\u4e1a\u4ecb\u7ecd\u4f01\u4e1a\u6027\u8d28\uff1a -- \u4f01\u4e1a\u7c7b\u578b\uff1a \u751f\u4ea7\u5546\u5e74\u8425\u4e1a\u989d\uff1a -- \u6cd5\u5b9a\u4ee3\u8868\u4eba/\u8d1f\u8d23\u4eba\uff1a \u7743\u80eb\u5364\u9709\u5364\u622e\u9e7f\u82a6\u5507\u622e\u9c81\u8121\u8115\u5784\u8131\u81372005\u80eb\u951a\u9c81\u73ab\u62e2...", + "timestamp": 1332923188, + "title": "\u626c\u5dde \u622e\u80c0\u5364\u6ca1\u7743\u6f0f\u9e7f\u5a04\u80eb\u813a\u80eb\u8d42\u8115\u7743,\u8133\u732b\u8120\u5f55\u622e\u80c0\u5364\u6ca1\u7743\u6f0f,\u8d42\u80eb...", + "url": 
"http://www.net114.com/765576754/" + }, + { + "rank": 4, + "snippet": "\u5364\u8120\u8121\u536f\u80c0\u8137\u80eb\u9a74\u811f\u63b3\u8133\u536f\u788c\u8125\u9e7f\u9647\u8133\u8122\u5364\u951a\u8133\u5f55\uff08\u8126\u964b\u8124\u8134\u811f\u9176\u80eb\u8137\uff091000\u812d\u964b/\u812d\u8117\u7984\u9e7f\u812a\u964b\u8d42\u813d\u9c81\u679a200\u812d\u964b\u9686\u62e2 \u622e\u813b\u8115\u5507\u9646\u8292\uff0c\u8d42\u7984\u8122\u9a74\u9a74\u788c\u8d42\u812e\u8120\u6bdb\u8130\u63b3\u788c\u80eb\u812d\u5364...", + "timestamp": 1332923188, + "title": "\u8d42\u7984\u8122\u9a74\u9a74\u788c\u812d\u5364\u9e7f\u9647\u788c\u8133\u8128\u9646\u8124\u8c29\u8121\u5a92\u8120\u5a92\u9c81\u8121 \u788c\u6885\u8128\u9646\u8def\u9732\u8126\u6402...", + "url": "http://biz.cn.yahoo.com/10-06-/127/xe20.html" + }, + { + "rank": 5, + "snippet": "\u5364\u622e\u8122\u8128\u9c81\u9686\u8131\u8130\u7743\u811d\u811d\u5192\u8115\u5507\u812a\u7984\u9c81\u9686\u9e7f\u6f5e\u7984\u7164\u9a74\u5e3d\u9c81\u5364\u62e2\u5362\u7743\u813f\u5364\u8120\u5507\u8125\u788c\u80eb\u812a\u7984\u9e93\u8d38\u9732\u8129\u864f...\u6f5e\u8125\u9646\u7bd3\u812a\u8305\u62e2\u5362\u7984\u8c8c\u812e\u813d\u8131\u8128\u9e7f\u8134\u8131\u8137\u5364\u8122... 
\u8d38\u788c\u83bd\u80eb\u812d\u8125\u9176\u9e7f\u832b\u8130\u813b\u8129\u9732\u62e2...", + "timestamp": 1332923188, + "title": "\u9e93\u8d38\u6f5e\u8125\u8120\u812e\u5364\u622e\u8115\u7743\u810c\u94c6\uff08\u9646\u5192\u8d42\u7984\u7984\u964b\u622e\u811d\u788c\u951a\u5364\u5364\uff09 \u6df1\u5733", + "url": "http://map.sina.com.cn/poi.php?id=0755_8564" + }, + { + "rank": 6, + "snippet": "\u7984\u9c81\u9732\u6885\u6f5e\u810c\u5507\u9e7f\u9e93\u8120\u8121\u811d\u7984\u9709\u9646\u5192\u7984\u8c29\u9c81\u8121\u8115\u5784 \u8130\u7164\u80eb\u951a\u811f\u8c29\u8120\u5507\u63b3\u8137\u8125\u8129\u812a\u6f0f\u811d\u8def\u810c\u80eb\u8131\u8119\uff08\u8125\u5f55\uff09 \u7984\u9c81\u9732\u6885\u6f5e\u810c\u5507\u9e7f\u788c\u80eb\u8d42\u8d42\u811f\u8133\u812d\u8137\u812d\u8c29\u810c\u5e3d\u8121\u7743\u80eb\u8129\u8129\u8137\u63b3\u6402\u8121\u5507\u8130\u5e90...", + "timestamp": 1332923188, + "title": "\u7984\u9c81\u9732\u6885\u6f5e\u810c\u5507\u9e7f\u9e93\u8120\u8121\u811d\u7984\u9709\u9646\u5192\u7984\u8c29\u9c81\u8121\u8115\u5784 \u8130\u7164\u80eb\u951a\u811f\u8c29\u8120\u5507...", + "url": "http://real.67.com/index.php/Index/news/id/36583" + }, + { + "rank": 7, + "snippet": "4\u812d\u811713\u8120\u812e\u8126\u76f2\u6f5e\u6f5e\u8125\u7984\u8120\u7984\u9646\u788c\u8126\u8117\uff0c\u812e\u811c\u811c\u5e90\u8122\u9a74\u788c\u62e2\u8128\u80eb\u8120\u697c\u8121\u7743\u8129\u6402\u788c\u80eb\u6f5e\u5784\u8133\u8131\u812a\u8117\u8133\u811c\u788c\u697c\u5364\u9686\uff0c\u5364\u832b\u8120\u697c\u8129\u6402\u8128\u62e2\u8d42\u9176\u6f5e\u5784\u8133\u8131\u5507\u8125\u812a\u8117\u8def\u9541\u9686\u62e2\u812d\u8137\u5507\u8125\u8125\u951a\u812a...", + "timestamp": 1332923188, + "title": "\u80eb\u8d42\u811f\u8133\u8126\u964b\u9732\u9709\u5507\u8125\u812a\u8117\u8117\u8c8c\u864f\u8122\u8130\u8128737\u8125\u8c8c \u9e7f\u6f5e\u864f\u81221\u80eb\u951a\u9732\u813f...", + "url": "http://sports.cn.yahoo.com/10-06-/346/2bff1.html" + }, + { + "rank": 8, + "snippet": 
"\u67e5\u627e\u8def\u7ebf\u53ca\u5468\u8fb9\u4fe1\u606f \u9a74\u9885\u8120\u679a\u9e7f\u7164\u8117\u8119\uff08\u5364\u5364\u8120\u5a92\u7984\u8def\u8126\u6885\u8117\u8def\u8d42\u7bd3\u8117\u8def\uff09 \u5230\u8fd9\u91cc\u53bb \u4ece\u8fd9\u91cc\u51fa\u53d1 \u5728\u9644\u8fd1\u627e \u51fa\u53d1 \u5230\u8fbe \u9a74\u9885\u8120\u679a\u9e7f\u7164\u8117\u8119\uff08\u5364\u5364\u8120\u5a92\u7984\u8def\u8126\u6885\u8117\u8def\u8d42\u7bd3\u8117\u8def\uff09 ...", + "timestamp": 1332923188, + "title": "\u9a74\u9885\u8120\u679a\u9e7f\u7164\u8117\u8119\uff08\u5364\u5364\u8120\u5a92\u7984\u8def\u8126\u6885\u8117\u8def\u8d42\u7bd3\u8117\u8def\uff09 \u5317\u4eac", + "url": "http://map.sina.com.cn/poi.php?id=0010_20904" + }, + { + "rank": 9, + "snippet": "\u8119\u8def\u9732\u6ca1\uff1f\u5f55\u964b\u864f\u5f55\u8121\u9885\u8131\u6bdb\u8122\u8def\u6885\u77db\u810c\u9885\uff1f\u80eb\u964b\u8133\u8120\u812d\u5f55\u7984\u8c29\u5364\u7984\u811c\u80eb\u62e2\u7bd3\u8125\u5f55\u810c\u9e93\u8133... \u8137\u7743\u62e2\u810c\u63b3\u8133\u813d\u8128\u812d\u8d42\u8128\u9686\u5784\u6f5e\u810c\u8def\u811c\u8117\u8def\u7743\u813d\u62e2\u5362\u812d\u8117\u9c81\u73ab\u8131\u6bdb\u8119\u8def\u9732\u6ca1\u812d\u8137\u5364\u8120\u8def\u5192...", + "timestamp": 1332923188, + "title": "55\u5507\u951a\u8119\u8def\u9732\u6ca1\u9686\u9647\u5f55\u964b\u864f\u5f55\u8121\u9885\u8115\u788c\u8121\u7743\u7743\u62e2\u810c\u63b3\u80eb\u80c0\u80eb\u62e2 \u8115\u9646\u8120...", + "url": "http://real.67.com/index.php/Index/news/id/30064" + }, + { + "rank": 10, + "snippet": "\u812e\u9e7f\u8131\u8128\u7743\u813c\u9e7f\u82a6\u5507\u622e\u9686\u62e2\u9686\u63b3\u812d\u8137\u9c81\u679a\u8120\u8119\u8115\u5507\u622e\u9885\u8131\u964b\u9e7f\u813a\u810c\u94c6\u8120\u7bd3\u6f5e\u8d38\u62e2\u5362\u9646\u5192\u8121... 
\u812d\u9686\u5364\u9686\u62e21500\u8131\u813f\u8119\u6ca1\u8120\u8120\u8130\u812d\u8131\u8137\u9e93\u5507\u788c\u80eb\u8130\u8128\u8125\u8292\u811f\u813f\u80eb\u951a\u812d\u8137\u9c81\u9647\u9c81\u811f\u8121\u7743...", + "timestamp": 1332923188, + "title": "\u9e7f\u7164\u80eb\u8137\u8117\u8119\u8131\u8126\u8128\u811c\u7743\u5784 - 860\u8125\u8c8c\u812d\u964b\u8def\u811f\u8def\u7bd3\u9c81\u679a\u8120\u8119\u9646\u5192\u8121...", + "url": "http://www.yoyocn.cn/info/china/12461.html" + } + ], + "timestamp": 1332923188, + "url": "http://www.sogou.com/web?query=%C2%B1%C3%88%C3%88%C3%B0%C2%BC%C2%AA%C2%B9%C2%B7%C3%81%C2%B8&p=03210100" +} diff --git a/data/manual-annotations/archived-raw-serps/expected/youtube-shang-yue-xian-ru-hu-1582390054.approved.txt b/data/manual-annotations/archived-raw-serps/expected/youtube-shang-yue-xian-ru-hu-1582390054.approved.txt new file mode 100644 index 00000000..de19af3e --- /dev/null +++ b/data/manual-annotations/archived-raw-serps/expected/youtube-shang-yue-xian-ru-hu-1582390054.approved.txt @@ -0,0 +1,220 @@ +{ + "interpreted_query": "\u4e0a\u8d8a\u7dda \u30eb\u30fc\u30d7", + "offset": null, + "page": null, + "query": "\u4e0a\u8d8a\u7dda \u30eb\u30fc\u30d7", + "results": [ + { + "rank": 1, + "snippet": "\uff16\u6708\uff19\u65e5\u64ae\u5f71 \u571f\u5408\u30fb\u6e6f\u6a9c\u66fd\u306e\u8a2a\u554f\u306b\u306f\u81e8\u6642\u5217\u8eca\u306e\u3054\u5229\u7528\u304c\u4fbf\u5229\u3067\u3059\u3002", + "timestamp": 1582390054, + "title": "\u5217\u8eca\u304c\uff12\u56de\u901a\u904e\u3059\u308b\uff01\u4e0a\u8d8a\u7dda\u30fb\u6e6f\u6a9c\u66fd\u99c5\u3010201806\u7fa4\u99ac4\u3011 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 14:32", + "url": "https://www.youtube.com/watch?v=biS6Tqmriq0" + }, + { + "rank": 2, + "snippet": "\u4e0a\u8d8a\u7dda\u306e\u81e8\u6642\u5feb\u901f\u300cNODOKA\u30eb\u30fc\u30d7\u300d\u5c55\u671b\u5e2d\u304b\u3089\u306e\u524d\u9762\u5c55\u671b\u3067\u3059\u3002 
\u9014\u4e2d\u3001\u7d76\u666f\u533a\u9593\u3067\u5f90\u884c\u3057\u307e\u3059\u3002", + "timestamp": 1582390054, + "title": "\u30eb\u30fc\u30d7\u7dda \u524d\u9762\u5c55\u671b\u3000\u571f\u5408\u2192\u6e6f\u6a9c\u66fd\u3000\u81e8\u6642\u5feb\u901f\u300cNODOKA\u30eb\u30fc\u30d7\u300d - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 13:54", + "url": "https://www.youtube.com/watch?v=Gq1ezCrFMq4" + }, + { + "rank": 3, + "snippet": "2014\u5e745\u670817\u65e5\u904b\u8ee2\u306e\u5feb\u901f\u300cNO. DO. KA\u30eb\u30fc\u30d7\u300d \u8d8a\u5f8c\u6e6f\u6ca2\u99c5\u304b\u3089\u30eb\u30fc\u30d7\u7dda\u3092\u901a\u3063\u3066\u3001\u6c34\u4e0a\u99c5\u307e\u30671\u6642\u959308\u5206\u306e\u524d\u9762\u5c55\u671b\u306b\u306a\u308a\u307e\u3059\u3002", + "timestamp": 1582390054, + "title": "\u5feb\u901f\u300cNO. DO. KA \u30eb\u30fc\u30d7\u300d\u8d8a\u5f8c\u6e6f\u6ca2\uff5e\u6c34\u4e0a\u3000\u4e0a\u8d8a\u7dda\u3000\u524d\u9762\u5c55\u671b - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 1:10:42", + "url": "https://www.youtube.com/watch?v=H5jhs6Z4yLg" + }, + { + "rank": 4, + "snippet": "\u9577\u5927\u30c8\u30f3\u30cd\u30eb\u3092\u6398\u308b\u3053\u3068\u304c\u96e3\u3057\u304b\u3063\u305f\u6642\u4ee3\u3001\u30c8\u30f3\u30cd\u30eb\u3092\u77ed\u304f\u3059\u308b\u305f\u3081\u306b\u4e21\u30b5\u30a4\u30c9\u306b\u30eb\u30fc\u30d7\u7dda\u3092\u8a2d\u7f6e\u3057\u307e\u3057\u305f\u3002\u4e00\u5ea6\u5c71\u8179\u3067\u898b\u3048\u305f\u5217\u8eca\u304c\u3001\u3059\u3050...", + "timestamp": 1582390054, + "title": "\u3010\u30eb\u30fc\u30d7\u3067\u30ef\u30fc\u30d7 !?\u3011\u3000\u4e0a\u8d8a\u7dda\u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7\u3000\u30ce\u30fc\u30ab\u30c3\u30c8\u7248\u3000\u9577\u3044\uff1f\u77ed\u3044\uff1f\u611f\u3058\u65b9\u306f\u5341\u4eba\u5341\u8272 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 3:36", + "url": 
"https://www.youtube.com/watch?v=2VuCakIENtA" + }, + { + "rank": 5, + "snippet": "JR\u4e0a\u8d8a\u7dda \u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7\u7dda\uff5e\u6e6f\u6a9c\u66fd\u99c5 2019\u5e7410\u67082\u65e5 \u64ae\u5f71 \u4ea4\u6d41\u76f4\u6d41\u4e21\u7528\u96fb\u6c17\u6a5f\u95a2\u8ecaEF81 134\uff0bE231\u7cfb\u96fb\u8eca MU38\u7de8\u6210 8\u4e21 \u914d\u7d66\u00a0...", + "timestamp": 1582390054, + "title": "2019\u5e7410\u67082\u65e5 JR\u4e0a\u8d8a\u7dda \u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7\u7dda\uff5e\u6e6f\u6a9c\u66fd\u99c5\u3092\u901a\u904e\u3059\u308bEF81 134\uff0bE231\u7cfb MU38\u7de8\u6210 \u914d\u7d66\u5217\u8eca - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 1:52", + "url": "https://www.youtube.com/watch?v=S9Q1j7eeVfE" + }, + { + "rank": 6, + "snippet": "\u65e5\u672c\u306b\u306f\u6570\u5c11\u306a\u3044\u306a\u304c\u3089\u3082\u5c71\u5cb3\u5730\u5e2f\u306e\u6025\u3053\u3046\u914d\u3092\u4e0a\u308b\uff08\u4e0b\u308b\uff09\u5e7e\u3064\u304b\u306eJR\u30eb\u30fc\u30d7\u7dda\u304c\u3042\u308a\u307e\u3059\u304c\u3001\u305d\u306e\u4e2d\u3067\u3082\u73cd\u3057\u3044\u898b\u3048\u308b\u30eb\u30fc\u30d7\u7dda\u306e\u4e00...", + "timestamp": 1582390054, + "title": "\u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7\u7dda \u3010\u65e5\u672c\u306b\u6570\u5c11\u306a\u3044\u898b\u3048\u308b\u30eb\u30fc\u30d7\u7dda\u3011\u3000\u4e0a\u8d8a\u7dda \u4e0a\u308a\u3000\u571f\u5408\uff5e\u6e6f\u6a9c\u66fd\u3000115\u7cfb - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 2:28", + "url": "https://www.youtube.com/watch?v=hcj4dlFg46Q" + }, + { + "rank": 7, + "snippet": 
"\u4e0a\u8d8a\u7dda\u306b\u306f\u3001\u65e5\u672c\u3067\u306f\u73cd\u3057\u3044\u30eb\u30fc\u30d7\u7dda\u304c\uff12\u304b\u6240\u3042\u308a\u307e\u3059\u3002\u4e00\u3064\u306f\u8d8a\u5f8c\u4e2d\u91cc\u99c5\uff0d\u571f\u6a3d\u99c5\u9593\u306e\u300c\u677e\u5ddd\u30eb\u30fc\u30d7\u300d\u3001\u305d\u3057\u3066\u3082\u3046\u4e00\u3064\u304c\u571f\u5408\u99c5\uff0d\u6e6f\u6a9c\u66fd\u99c5\u9593...", + "timestamp": 1582390054, + "title": "\u3010\u73cd\u3057\u3044\u201c\u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7\u201d\u3092\u4f53\u611f\u3057\u3066\u307f\u3088\u3046!\u3011\u571f\u5408\u99c5\uff0d\u6e6f\u6a9c\u66fd\u99c5\uff0d\u6c34\u4e0a\u99c5\u9593\u306e\u524d\u9762\u5c55\u671b - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 12:19", + "url": "https://www.youtube.com/watch?v=cUkfbPf5JqU" + }, + { + "rank": 8, + "snippet": "\u30eb\u30fc\u30d7\u7dda\u306e\u7dda\u5f62\u3092\u8a18\u9332\u3057\u305f\u304f\u3066\u3001\u53ce\u9332\u306b\u884c\u304d\u307e\u3057\u305f\u3002\u30c8\u30f3\u30cd\u30eb\u5185\u3067\u306fGPS\u306e\u30c7\u30fc\u30bf\u304c\u6b20\u843d\u3059\u308b\u306e\u3067\u3001\u88dc\u9593\u3057\u305f\u4f4d\u7f6e\u60c5\u5831\u30c7\u30fc\u30bf\u3092\u4f5c\u6210\u3057\u3066\u901f\u5ea6\u3092\u8868\u793a...", + "timestamp": 1582390054, + "title": "\u3010\u672c\u5f53\u306b\u30eb\u30fc\u30d7\u3059\u308b?\u3011\u65b0\u5feb\u901f223\u7cfb \u524d\u9762\u5c55\u671b\u3000\u65b0\u758b\u7530\u30eb\u30fc\u30d7\u7dda(1) - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 9:25", + "url": "https://www.youtube.com/watch?v=errLGwaPkO8" + }, + { + "rank": 9, + "snippet": 
"\u6e6f\u6a9c\u66fd\u99c5\u3067\u30eb\u30fc\u30d7\u7dda\u3092\u4e0b\u3063\u3066\u304f\u308b\u5217\u8eca\u3092\u64ae\u5f71\u3057\u305f\u3002\u5b9f\u969b\u306b\u306f\u5c71\u8179\u3092\u8d70\u308b\u96fb\u8eca\u304c\u99c5\u306b\u5165\u3063\u3066\u304f\u308b\u307e\u3067\u3082\u3046\u5c11\u3057\u6642\u9593\u304c\u304b\u304b\u3063\u305f\u304c\u3001\u30ab\u30c3\u30c8\u3057\u3066\u3044...", + "timestamp": 1582390054, + "title": "\u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7\u7dda - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 0:57", + "url": "https://www.youtube.com/watch?v=29vYZXWPemM" + }, + { + "rank": 10, + "snippet": "2019/03/25\u64ae\u5f71\u3002EH200\u5f62+\u6771\u60252020\u7cfb10B\u306e\u300cJ-TREC\u65b0\u6d25\u51fa\u5834\u7532\u7a2e\u8f38\u9001\u300d\u3067\u3059\u3002\u6700\u521d\u306b\u8b1d\u3063\u3066\u304a\u304d\u307e\u3059\u3002EH200\u5f62\u304c\u901a\u904e\u3057\u305f\u77ac\u9593\u306b\u30ab\u30e1\u30e9\u304c\u624b\u3059\u308a\u304b\u3089\u6ed1\u308a ...", + "timestamp": 1582390054, + "title": "\u30144K UHD|cc\u3015JR\u8ca8\u7269\u30fb\u4e0a\u8d8a\u7dda\uff1a\u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7\uff5e\u6e6f\u6a9c\u66fd\u99c5\u3001EH200\u5f62+\u6771\u60252020\u7cfb10B/\u300c\u7532\u7a2e\u8f38\u9001\u300d\u8d70\u884c\u30fb\u901a\u904e\u30b7\u30fc\u30f3\u3002\u300a9788\u30ec\u300b - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 2:07", + "url": "https://www.youtube.com/watch?v=eCl7p3TmnWw" + }, + { + "rank": 11, + "snippet": "\u30eb\u30fc\u30d7\u7dda\u3067\u77e5\u3089\u308c\u308bJR\u4e0a\u8d8a\u7dda\u6e6f\u6a9c\u66fd\uff5e\u571f\u5408\u9593\u306e\u4e0a\u308a\u7dda\u3067\u3059\u3002\u6e6f\u6a9c\u66fd\u99c5\u4e0a\u308a\u30db\u30fc\u30e0\u3084\u6e29\u6cc9\u8857\u5165\u53e3\u304b\u3089\u773a\u3081\u308b\u3053\u3068\u304c\u3067\u304d\u307e\u3059\u3002", + "timestamp": 1582390054, + "title": 
"\u4e0a\u8d8a\u7dda\u571f\u5408\u2192\u6e6f\u6a9c\u66fd\u9593\u306e\u30eb\u30fc\u30d7\u7dda - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 3:50", + "url": "https://www.youtube.com/watch?v=8cqGUu7SvLQ" + }, + { + "rank": 12, + "snippet": "\u73cd\u3057\u3044\u96fb\u8eca\u30c8\u30f3\u30cd\u30eb\u5185\u30eb\u30fc\u30d7\u3067\u3059\u3002 \u590f\u3068\u79cb\u306b\u5229\u7528\u3057\u305f\u4e0a\u8d8a\u7dda\u3001\u4eca\u307e\u3067\u30eb\u30fc\u30d7 \u3057\u3066\u3044\u308b\u4e8b\u306b\u6c17\u3065\u304d\u307e\u305b\u3093\u3067\u3057\u305f\u3002 \u884c\u3063\u305f\u3053\u3068\u306e\u306a...", + "timestamp": 1582390054, + "title": "2017\u5317\u6d77\u9053\u65c5\u884c\u2463 JR\u4e0a\u8d8a\u7dda\u30eb\u30fc\u30d7\uff08\u571f\u5408\uff5e\u6e6f\u6a9c\u66fd\uff09 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 5:54", + "url": "https://www.youtube.com/watch?v=N0ev9Np3CWA" + }, + { + "rank": 13, + "snippet": "2016\u5e7407\u670812\u65e5 \u5e73\u65e5 \u706b\u66dc\u65e5 14\u664250\u520600\u79d2 \u64ae\u5f71\u3002 \u9014\u4e2d\u99c5\u307e\u3067\u4e07\u5168\u306e\u614b\u52e2\u3067\u64ae\u5f71\u3067\u304d\u3066\u3044\u307e\u305b\u3093\u3002 \u4e00\u811a\u3092\u5c55\u958b\u3057\u305f\u3089\u3082\u3046\u51fa\u767a\u6642\u9593\u3067\u3001\u00a0...", + "timestamp": 1582390054, + "title": "\u3010\u7bb1\u6839\u767b\u5c71\u7dda\u3011\u3000\u3010\u7bb1\u6839\u767b\u5c71\u9244\u9053\u3011\u3000\u3010\u524d\u9762\u5c55\u671b\u3011\u3000\u30103000\u5f62\u3000\u30a2\u30ec\u30b0\u30e9\u53f7\u3011\u3000\u3010\u7bb1\u6839\u6e6f\u672c\u99c5\u2192\u5f37\u7f85\u99c5\u3011 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 44:40", + "url": "https://www.youtube.com/watch?v=pi5-dn5NZvk" + }, + { + "rank": 14, + "snippet": 
"\u7279\u6025\u3057\u306a\u306e\u53f7\u306e\u901a\u904e\u3059\u308b\u7be0\u30ce\u4e95\u7dda\u3067\u306f\u3001\u7be0\u30ce\u4e95\uff5e\u51a0\u7740\u9593\u306b\u5927\u5909\u306a\u6025\u52fe\u914d\u533a\u9593\u3092\u6709\u3059\u308b\u3053\u3068\u304b\u3089\u3001\u30b9\u30a4\u30c3\u30c1\u30d0\u30c3\u30af\u3092\u4f7f\u3063\u3066\u505c\u8eca\u3059\u308b\u4fe1\u53f7\u5834\u3068\u00a0...", + "timestamp": 1582390054, + "title": "\u65e5\u672c\u4e09\u5927\u8eca\u7a93\u306e\u30b9\u30a4\u30c3\u30c1\u30d0\u30c3\u30af\u99c5\u3001\u7be0\u30ce\u4e95\u7dda\u30fb\u59e8\u6368\u99c5\u30101904\u9577\u91ce\uff13\u3011\u9577\u91ce\u99c5\u2192\u677e\u672c\u99c5 4/26-02 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 36:16", + "url": "https://www.youtube.com/watch?v=RRzf5YbWRG8" + }, + { + "rank": 15, + "snippet": "\u3075\u3060\u3093\u306e\u30d1\u30ce\u30e9\u30de\u30ab\u30fc\u5c55\u671b\u5e2d\u306f\u30ac\u30e9\u30ac\u30e9\u3089\u3057\u3044\u3067\u3059\u3002\u3053\u308c\u306b\u4e57\u3063\u3066\u307f\u3066\u30b9\u30fc\u30d1\u30fc\u30d3\u30e5\u30fc\u306e\u5c55\u671b\u5e2d\u306f\u5927\u3057\u305f\u3053\u3068\u306a\u3044\u3068\u308f\u304b\u308a\u307e\u3057\u305f\u3002", + "timestamp": 1582390054, + "title": "\u3010\u3069\u3051\u3088\u3011\u7279\u6025\u30d1\u30ce\u30e9\u30de\u30fc\u30b9\u30fc\u30d1\u30fc\u53f7\u306b\u4e57\u308a\u307e\u3057\u305f - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 16:49", + "url": "https://www.youtube.com/watch?v=0jnvLO-cp1g" + }, + { + "rank": 16, + "snippet": "6\u67084\u65e5\u64ae\u5f71 \u4e00\u5ea6\u306f\u884c\u304f\u3079\u304d\u99c5\u3060\u3068\u601d\u3044\u307e\u3059\u3002", + "timestamp": 1582390054, + "title": "\u51fa\u5834\u4e0d\u53ef\u80fd\u30fb\u64ae\u5f71\u6ce8\u610f\u30fb\u6771\u829d\u5c02\u7528 \u6d77\u829d\u6d66\u99c5\u3010201806\u9db4\u898b\u7dda5\u3011 - 
\u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 15:15", + "url": "https://www.youtube.com/watch?v=16o7bMfBx8s" + }, + { + "rank": 17, + "snippet": "\u591c\u884c\u5217\u8eca\u306e\u9045\u5ef6\u7387\u306f\u6848\u5916\u9ad8\u3044\u3088\u3046\u306a\u6c17\u304c\u3057\u307e\u3059\u3002\u51ac\u5b63\u306f\u305f\u307e\u306b\u3068\u3093\u3067\u3082\u306a\u3044\u3053\u3068\u306b\u306a\u308a\u307e\u3059\u3002\u3042\u3051\u307c\u306e\u306f\u9031\u306b1\u56de\u3050\u3089\u30442\u6642\u9593\u9045\u5ef6\u3057\u3066\u307e\u3057\u00a0...", + "timestamp": 1582390054, + "title": "\u76ee\u899a\u3081\u305f\u3089\u30b5\u30f3\u30e9\u30a4\u30ba\u51fa\u96f2\u304c\u9045\u5ef6\u3057\u3066\u3044\u305f - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 21:17", + "url": "https://www.youtube.com/watch?v=Q4zWO4PRwwg" + }, + { + "rank": 18, + "snippet": "\u5c0f\u5e4c\u99c5\u306b\u8abf\u67fb\u7528\u30ab\u30e1\u30e9\u3092\u53d6\u308a\u4ed8\u3051\u305f\u3068\u3053\u308d\u30018\u6708\u5f8c\u534a\u3067138\u4eba\u304c\u8a2a\u308c\u3001\u5229\u7528\u8005\u306f\u6bce\u65e5\u304a\u308a\u30011\u65e5\u306e\u6700\u591a\u306f22\u4eba\u3060\u3063\u305f\u305d\u3046\u3067\u3059\u3002", + "timestamp": 1582390054, + "title": "\u3010\u30cf\u30a4\u30d1\u30fc\u79d8\u5883\u99c5\u3011\u591c\u306e\u5c0f\u5e4c\u99c5\u306f\u884c\u304f\u3082\u3093\u3058\u3083\u306a\u3044 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 17:15", + "url": "https://www.youtube.com/watch?v=AJo-FA_1n7Y" + }, + { + "rank": 19, + "snippet": "\uff16\u6708\uff19\u65e5\u64ae\u5f71.", + "timestamp": 1582390054, + "title": "\u6539\u672d\u53e3\u307e\u3067\u5f92\u6b6910\u5206 \u571f\u5408\u99c5\u30fb\u5730\u7344\u306e\u968e\u6bb5\u30c0\u30c3\u30b7\u30e5\u3010201806\u7fa4\u99ac3\u3011 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 
19:07", + "url": "https://www.youtube.com/watch?v=tCjTKdDtSwY" + }, + { + "rank": 20, + "snippet": "\u8ff7\u5217\u8eca\u30b7\u30ea\u30fc\u30ba\u3068\u805e\u3044\u3066\u771f\u3063\u5148\u306b\u601d\u3044\u6d6e\u304b\u3079\u308b\u306e\u306f\u30b5\u30f3\u30d1\u30c1\u541b\u3084\u98df\u30d1\u30f3\u96fb\u8eca\u3067\u306f\u306a\u304f215\u7cfb\u3067\u3059\u3002\u306a\u304a\u3001\u52d5\u753b\u5185\u3067\u9593\u9055\u3063\u305f\u3053\u3068\u3092\u8a00\u3063\u305f\u304b\u3082\u00a0...", + "timestamp": 1582390054, + "title": "\u3010\u30b0\u30ea\u30fc\u30f3\u8eca\u3011\u30aa\u30fc\u30eb\uff12\u968e\u5efa\u3066 \u30db\u30ea\u30c7\u30fc\u5feb\u901f\u30d3\u30e5\u30fc\u3084\u307e\u306a\u3057\u53f7 \u5c0f\u6df5\u6ca2\u2192\u65b0\u5bbf\u4e57\u8eca\u8a18 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 28:26", + "url": "https://www.youtube.com/watch?v=ZmY-Q3htLPc" + }, + { + "rank": 21, + "snippet": "\u3055\u308b\u3073\u3042\u4e38\u306e\u52d5\u753b\u306b\u306f\u4e0d\u5177\u5408\u304c\u898b\u3064\u304b\u3063\u305f\u306e\u3067\u3001\u660e\u65e5\u4ee5\u964d\u51fa\u3057\u76f4\u3057\u307e\u3059\u3002 \uff17\u670814\u65e5\u64ae\u5f71 \u6771\u6d77\u6c7d\u8239\u306e\u904b\u884c\u3059\u308b\u30b8\u30a7\u30c3\u30c8\u8239\u306f\u30bb\u30d6\u30f3\u30a2\u30a4\u30e9\u30f3\u30c9\u306e\u00a0...", + "timestamp": 1582390054, + "title": "\u3010\u6771\u6d77\u6c7d\u8239\u30b8\u30a7\u30c3\u30c8\u3011\u6d77\u306e\u98db\u884c\u6a5f \u30dc\u30fc\u30a4\u30f3\u30b0929\u306b\u4e57\u8239\u30101807\u65b0\u5cf63\u30117/14-02 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 19:34", + "url": "https://www.youtube.com/watch?v=DRKvf5L1qhg" + }, + { + "rank": 22, + "snippet": "\uff15\u6708\uff11\uff13\u65e5\u64ae\u5f71 \u3010\u304a\u53ec\u5217\u8eca\u306e\u6b74\u53f2\u2460\u3011 
E655\u7cfb\u306f2007\u5e74\u306b\u30c7\u30d3\u30e5\u30fc\u3057\u305f\u7687\u5ba4\u304a\u53ec\u5217\u8eca\u5bfe\u5fdc\u306e\u30b8\u30e7\u30a4\u30d5\u30eb\u30c8\u30ec\u30a4\u30f3\u3067\u3059\u3002\u305d\u308c\u307e\u3067\u306f1932\u5e74\u306b\u00a0...", + "timestamp": 1582390054, + "title": "\u7687\u5ba4\u304a\u53ec E655\u7cfb\u306a\u3054\u307f\u4e57\u8eca\u8a18\uff08\u524d\u7de8\uff09 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 11:29", + "url": "https://www.youtube.com/watch?v=cVoRY3e2Rdc" + }, + { + "rank": 23, + "snippet": "2016/5/28\u306b\u53ce\u9332 \u4e0a\u8d8a\u7dda\u306e\u8d8a\u5f8c\u6e6f\u6ca2\u301c\u6c34\u4e0a\u99c5\u9593\u3067\u4e0d\u5b9a\u671f\u306b\u904b\u8ee2\u3055\u308c\u308b\u300c\u4e0a\u8d8a\u56fd\u5883\u8d8a\u3048\u300d\u306e\u81e8\u6642\u5217\u8eca\u3067\u3059\u304c\u3001\u305d\u306e\u4e2d\u306e\u4e00\u3064\u3067\u3042\u308b\u30ad\u30cf48\u00a0...", + "timestamp": 1582390054, + "title": "\u4e0a\u8d8a\u7dda\u300e\u98a8\u3063\u3053\u30eb\u30fc\u30d7\u300f\u3067\u4e0a\u8d8a\u56fd\u5883\u8d8a\u3048\uff01 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 11:21", + "url": "https://www.youtube.com/watch?v=z5kNa1CGcys" + }, + { + "rank": 24, + "snippet": "\u300c\u4e57\u3063\u3066\u304d\u305f\u300d\u30b7\u30ea\u30fc\u30ba\u7b2c25\u5f3e\uff01 \u4e0a\u8d8a\u7dda\u306e\u4e0a\u308a\u7dda\u306b\u306f\u3001\u30eb\u30fc\u30d7\u7dda\u304c\u5b58\u5728\u3057\u307e\u3059\u3002\u7fa4\u99ac\u770c\u6700\u5317\u306e\u571f\u5408\u99c5\u3068\u6e6f\u6a9c\u66fd\u99c5\u306e\u9593\u306e\u8ddd\u96e2\u306f6.6km\u3067\u3059\u00a0...", + "timestamp": 1582390054, + "title": "\u4e0a\u8d8a\u7dda \u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7\u7dda\u3092\u4f53\u9a13\u3057\u3066\u304d\u305f JR Joetsu Line Yubiso Loop in Gumma - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 2:21", + "url": 
"https://www.youtube.com/watch?v=MhPuw_ustqE" + }, + { + "rank": 25, + "snippet": "\u5217\u8eca\u304c\u898b\u3048\u3066\u304b\u3089\u304a\u3088\u305d3\u5206\u5f8c\u3001\u76ee\u306e\u524d\u306e\u7dda\u8def\u306b\u5217\u8eca\u304c\u3084\u3063\u3066\u6765\u307e\u3059\u3002\u7d05\u8449\u306e\u6642\u5b63\u306b\u8a2a\u308c\u307e\u3057\u305f\u3002 2015.11.03\u64ae\u5f71.", + "timestamp": 1582390054, + "title": "\u4e0a\u8d8a\u7dda 115\u7cfb\u3068\u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7(\u30ce\u30fc\u30ab\u30c3\u30c8) - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 2:58", + "url": "https://www.youtube.com/watch?v=Ih1fswUThuc" + }, + { + "rank": 26, + "snippet": "\u590f\u3082\u51ac\u3082\u4e0a\u8d8a\u7dda\u306f\u3044\u3044\u3067\u3059\u306d \u30c4\u30a4\u30c3\u30bf\u30fc\u2192@wakabaJT1.", + "timestamp": 1582390054, + "title": "\u4e0a\u8d8a\u7dda\u4e57\u3063\u3066\u30eb\u30fc\u30d7\u7dda\u3092\u898b\u308b\u65c5 \u8d8a\u5f8c\u6e6f\u6ca2\u2192\u6c34\u4e0a - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 6:36", + "url": "https://www.youtube.com/watch?v=M9Nt4gSKwVw" + }, + { + "rank": 27, + "snippet": "\u64ae\u5f71\u65e52018\u5e7410\u6708\u64ae\u5f71\u6642\u9593\u4e0b\u8a18 \u203b\u30bf\u30a4\u30c8\u30eb\u5217\u8eca\u306f3:27\u304b\u3089 \u6b66\u8535\u91ce\u7dda\u306b\u8ee2\u5c5e\u3059\u308b209\u7cfb\u306eAT\u5165\u5834\u914d\u7d66\u6e96\u5099\u306e\u65b9\u8ee2\u56de\u9001\u304a\u3088\u3073\u3001\u6c34\u4e0a\u99c5\u00a0...", + "timestamp": 1582390054, + "title": "\u65b0\u6e05\u6c34\u30c8\u30f3\u30cd\u30eb\u306b\u5165\u308b209\u7cfb\u4e09\u9df9\u8eca\u6700\u5f8c\u306e\u83f1\u5f62\u30d1\u30f3\u30bf\u8eca\u914d\u7d66\u5217\u8eca\u3000\u914d\u7d66\u6e96\u5099\u56de\u9001 - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 4:20", + "url": "https://www.youtube.com/watch?v=CYq9k57h670" + }, + { + "rank": 28, + "snippet": 
"\u3053\u306e\u65e5\u306f\u65b0\u6f5f\u770c\u5185\u306e\u4e0a\u8d8a\u7dda\u306e\u30c8\u30e9\u30d6\u30eb\u306b\u3088\u308a\u3001\uff11\u6642\u9593\u7a0b\u5ea6\u306e\u9045\u308c\u3067\u6e6f\u6a9c\u66fd\u99c5\u3092\u901a\u904e\u3057\u3066\u3044\u304d\u307e\u3057\u305f\u3002 0:40\u3042\u305f\u308a\u3067\u5217\u8eca\u304c\u30c8\u30f3\u30cd\u30eb\u306e\u4e0a\u3092\u901a\u904e\u00a0...", + "timestamp": 1582390054, + "title": "\u3010\u305f\u304b\u306e\u53f0\u7dcf\u5408\u904b\u8ee2\u6240\u30112022\u30ec\u5bdd\u53f0\u7279\u6025\u3042\u3051\u307c\u306e \u6e6f\u6a9c\u66fd\u30eb\u30fc\u30d7\u3092\u964d\u308a\u308b - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 5:22", + "url": "https://www.youtube.com/watch?v=saYCqKIxDCs" + }, + { + "rank": 29, + "snippet": "\u4e0a\u8d8a\u7dda\u3001\u8d8a\u5f8c\u4e2d\u91cc~\u571f\u6a3d\u9593\u3001\u571f\u5408~\u6e6f\u6a9c\u66fd\u9593\u306b\u306f\u3001\u30b0\u30f3\u30de\u30fc\u5e1d\u56fd\u3068\u65b0\u6f5f\u3092\u7d50\u3076\u3001\u65e5\u672c\u3067\u3082\u5927\u5909\u73cd\u3057\u3044\u30eb\u30fc\u30d7\u7dda\u304c\u3042\u308a\u307e\u3059\u3002\u8d8a\u5f8c\u4e2d\u91cc~\u571f\u6a3d\u9593...", + "timestamp": 1582390054, + "title": "\u3010\u4e0a\u8d8a\u56fd\u5883\u30eb\u30fc\u30d7\u7dda\u3011E129\u3067\u8d70\u884c\u3001\u8eca\u7a93(\u8d8a\u5f8c\u4e2d\u91cc~\u571f\u6a3d\u3001\u571f\u5408~\u6e6f\u6a9c\u66fd) - \u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 13:27", + "url": "https://www.youtube.com/watch?v=1FcxIokB8FI" + }, + { + "rank": 30, + "snippet": "\u4e0a\u8d8a\u7dda\u306e\u81e8\u6642\u5feb\u901f\u300cNODOKA\u30eb\u30fc\u30d7\u300d\u5c55\u671b\u5e2d\u304b\u3089\u306e\u524d\u9762\u5c55\u671b\u3067\u3059\u3002", + "timestamp": 1582390054, + "title": "\u4e0a\u8d8a\u7dda \u524d\u9762\u5c55\u671b\u3000\u6e6f\u6a9c\u66fd\u2192\u6c34\u4e0a\u3000\u81e8\u6642\u5feb\u901f\u300cNODOKA\u30eb\u30fc\u30d7\u300d - 
\u041f\u0440\u043e\u0434\u043e\u043b\u0436\u0438\u0442\u0435\u043b\u044c\u043d\u043e\u0441\u0442\u044c: 5:44", + "url": "https://www.youtube.com/watch?v=ds9_dy0zD_8" + } + ], + "timestamp": 1582390054, + "url": "https://www.youtube.com/results?search_query=%E4%B8%8A%E8%B6%8A%E7%B7%9A+%E3%83%AB%E3%83%BC%E3%83%97" +} diff --git a/data/manual-annotations/archived-raw-serps/warcs/manual-facebook.warc.gz b/data/manual-annotations/archived-raw-serps/warcs/manual-facebook.warc.gz deleted file mode 100644 index 539cd90d..00000000 Binary files a/data/manual-annotations/archived-raw-serps/warcs/manual-facebook.warc.gz and /dev/null differ diff --git a/data/manual-annotations/archived-raw-serps/warcs/manual-youtube.warc.gz b/data/manual-annotations/archived-raw-serps/warcs/manual-youtube.warc.gz deleted file mode 100644 index dbab3a36..00000000 Binary files a/data/manual-annotations/archived-raw-serps/warcs/manual-youtube.warc.gz and /dev/null differ diff --git a/data/selected-services.yaml b/data/selected-services.yaml index aa7aee56..78d4660f 100644 --- a/data/selected-services.yaml +++ b/data/selected-services.yaml @@ -1,38 +1,9 @@ - name: google - public_suffix: com - alexa_domain: google.com - alexa_rank: 1 - category: search-engine - notes: 'Domains from https://www.google.com/supported_domains + notes: 'Domains from https://google.com/supported_domains and from https://github.com/JamieFarrelly/Popular-Site-Subdomains and from https://infogalactic.com/info/List_of_Google_domains.' 
- input_field: false - search_form: false - search_div: false domains: - - answers.google.com - - asia.google.com - - blog.google.com - - blogsearch.google.com - - books.google.com - - calendar.google.com - - cloud.google.com - - code.google.com - - contacts.google.com - - design.google.com - - developers.google.com - - ditu.google.com - - doc.google.com - - docs.google.com - - documents.google.com - - drive.google.com - - earth.google.com - - encrypted.google.com - - europe.google.com - - finance.google.com - - firebase.google.com - - fonts.google.com - - forms.google.com + - google.com - g.cn - google.ac - google.ad @@ -86,7 +57,6 @@ - google.co.za - google.co.zm - google.co.zw - - google.com - google.com.af - google.com.ag - google.com.ai @@ -232,73 +202,21 @@ - google.vg - google.vu - google.ws - - groups.google.com - - hangouts.google.com - - image.google.com - - images.google.com - - issuetracker.google.com - - keep.google.com - - local.google.com - m.google.com - - mail.google.com - - map.google.com - - maps.google.com - - music.google.com - - news.google.com - - on.google.com - - patents.google.com - - photos.google.com - - picasa.google.com - - picasaweb.google.com - - play.google.com - - plus.google.com - - print.google.com - - productforums.google.com - - research.google.com - - sb.google.com - - scholar.google.com - search.google.com - - sites.google.com - - spreadsheets.google.com - - store.google.com - - support.google.com - - talkgadget.google.com - - translate.google.com - - trends.google.com - - video.google.com - - wave.google.com - ww.google.com query_parsers: - - url_pattern: ^https?://[^/]+/scholar\? - type: query_parameter - parameter: q - - url_pattern: ^https?://(www\.)?google[^/]+/search\? + - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - - url_pattern: ^https?://(www\.)?google[^/]+/search\? + - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: for - - url_pattern: ^https?://[^/]+/citations\? - type: query_parameter - parameter: mauthors - page_parsers: [] offset_parsers: - - url_pattern: ^https?://[^/]+/scholar\? - type: query_parameter - parameter: start - - url_pattern: ^https?://(www\.)?google[^/]+/search\? + - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: start - - url_pattern: ^https?://[^/]+/citations\? - type: query_parameter - parameter: astart interpreted_query_parsers: - - url_pattern: ^https?://[^/]+/scholar\? - type: html_selector - query_selector: input#gs_hdr_tsi - - url_pattern: ^https?://[^/]+/scholar\? - type: html_selector - query_selector: input#sbhost - url_pattern: ^https?://[^/]+/search\? type: html_selector query_selector: form#tsf input[name="q"], form#sf input[name="q"] @@ -344,20 +262,99 @@ title_selector: a focused_url_prefixes: - /search? +- name: google-scholar + domains: + - scholar.google.com + query_parsers: + - url_pattern: ^https?://[^/]+/scholar\? + type: query_parameter + parameter: q + - url_pattern: ^https?://[^/]+/citations\? + type: query_parameter + parameter: mauthors + offset_parsers: + - url_pattern: ^https?://[^/]+/scholar\? + type: query_parameter + parameter: start + - url_pattern: ^https?://[^/]+/citations\? + type: query_parameter + parameter: astart + interpreted_query_parsers: + - url_pattern: ^https?://[^/]+/scholar\? + type: html_selector + query_selector: input#gs_hdr_tsi + - url_pattern: ^https?://[^/]+/scholar\? + type: html_selector + query_selector: input#sbhost + focused_url_prefixes: - /scholar? - /citations? +- name: google-translate + domains: + - translate.google.com + focused_url_prefixes: + - /? +- name: google-other + notes: 'Domains from https://github.com/JamieFarrelly/Popular-Site-Subdomains + and from https://infogalactic.com/info/List_of_Google_domains.' 
+ domains: + - answers.google.com + - asia.google.com + - blog.google.com + - blogsearch.google.com + - books.google.com + - calendar.google.com + - cloud.google.com + - code.google.com + - contacts.google.com + - design.google.com + - developers.google.com + - ditu.google.com + - doc.google.com + - docs.google.com + - documents.google.com + - drive.google.com + - earth.google.com + - encrypted.google.com + - europe.google.com + - finance.google.com + - firebase.google.com + - fonts.google.com + - forms.google.com + - groups.google.com + - hangouts.google.com + - image.google.com + - images.google.com + - issuetracker.google.com + - keep.google.com + - local.google.com + - mail.google.com + - map.google.com + - maps.google.com + - music.google.com + - news.google.com + - on.google.com + - patents.google.com + - photos.google.com + - picasa.google.com + - picasaweb.google.com + - play.google.com + - plus.google.com + - print.google.com + - productforums.google.com + - research.google.com + - sb.google.com + - sites.google.com + - spreadsheets.google.com + - store.google.com + - support.google.com + - talkgadget.google.com - name: youtube - public_suffix: com - alexa_domain: youtube.com - alexa_rank: 2 - category: media-sharing notes: 'Domains from https://stackoverflow.com/a/65375894 and from https://github.com/JamieFarrelly/Popular-Site-Subdomains. SERPs rendered using JavaScript.' - input_field: false - search_form: false - search_div: true domains: + - youtube.com - au.youtube.com - br.youtube.com - cms.youtube.com @@ -417,7 +414,6 @@ - youtube.co.ve - youtube.co.za - youtube.co.zw - - youtube.com - youtube.com.ar - youtube.com.au - youtube.com.az @@ -538,8 +534,6 @@ - url_pattern: ^https?://[^/]+/[^/]+/search\? type: query_parameter parameter: search_query - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/results\? type: html_selector @@ -554,14 +548,7 @@ focused_url_prefixes: - /results? 
- name: baidu - public_suffix: com - alexa_domain: baidu.com - alexa_rank: 3 - category: search-engine notes: 'Domains from manually exploring https://baidu.com.' - input_field: false - search_form: false - search_div: true domains: - baidu.com - ai.baidu.com @@ -589,7 +576,6 @@ - url_pattern: ^https?://[^/]+/cse/site\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/s\? type: query_parameter @@ -638,14 +624,7 @@ - /f? - /cse/site? - name: qq - public_suffix: com - alexa_domain: qq.com - alexa_rank: 4 - category: web-portal notes: 'SERPs from qq.com rendered using JavaScript.' - input_field: null - search_form: null - search_div: null domains: - qq.com - v.qq.com @@ -664,7 +643,6 @@ - url_pattern: ^https?://[^/]+/search\.htm\? type: query_parameter parameter: page - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/x/search/\? type: html_selector @@ -686,15 +664,9 @@ - //search.html? - /x/search/? - name: facebook - public_suffix: com - alexa_domain: facebook.com - alexa_rank: 5 - category: social-media notes: 'Domains from https://github.com/JamieFarrelly/Popular-Site-Subdomains.' 
- input_field: false - search_form: false - search_div: false domains: + - facebook.com - about.facebook.com - about.meta.com - ads.facebook.com @@ -722,7 +694,6 @@ - es.facebook.com - et-ee.facebook.com - fa-ir.facebook.com - - facebook.com - fb-lt.facebook.com - fi-fi.facebook.com - fr-ca.facebook.com @@ -775,8 +746,6 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search type: html_selector @@ -814,14 +783,6 @@ focused_url_prefixes: - /search - name: tmall - public_suffix: com - alexa_domain: tmall.com - alexa_rank: 6 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - tmall.com - list.tmall.com @@ -829,43 +790,21 @@ - url_pattern: ^https?://[^/]+/search_product type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search_product - name: taobao - public_suffix: com - alexa_domain: taobao.com - alexa_rank: 7 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - taobao.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: yahoo - public_suffix: com - alexa_domain: yahoo.com - alexa_rank: 9 - category: web-portal notes: 'Domains from https://github.com/JamieFarrelly/Popular-Site-Subdomains.' - input_field: false - search_form: false - search_div: false domains: + - yahoo.com - about.yahoo.com - abuse.yahoo.com - adtech.yahooinc.com @@ -1011,8 +950,6 @@ - view.yahoo.com - wap.yahoo.com - ww.yahoo.com - - wwww.yahoo.com - - yahoo.com - yahoo.net - yahooinc.com - za.search.yahoo.com @@ -1021,7 +958,6 @@ - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: p - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter @@ -1069,23 +1005,16 @@ focused_url_prefixes: - /search - name: amazon - public_suffix: com - alexa_domain: amazon.com - alexa_rank: 10 - category: e-commerce notes: 'Domains from https://amazon.de/customer-preferences/country (dropdown) and https://github.com/JamieFarrelly/Popular-Site-Subdomains.' - input_field: true - search_form: true - search_div: true domains: + - amazon.com - advertising.amazon.com - amazon.ae - amazon.ca - amazon.cn - amazon.co.jp - amazon.co.uk - - amazon.com - amazon.com.au - amazon.com.be - amazon.com.br @@ -1105,6 +1034,7 @@ - ams.amazon.com - authorcentral.amazon.com - aws.amazon.com + - chime.aws - cloud.amazon.com - developer.amazon.com - fba.amazon.com @@ -1156,7 +1086,6 @@ - url_pattern: ^https?://[^/]+/s\? type: query_parameter parameter: page - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/s\? type: html_selector @@ -1171,14 +1100,6 @@ - /s? - /search - name: jd - public_suffix: com - alexa_domain: jd.com - alexa_rank: 12 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - jd.com - search.jd.com @@ -1190,7 +1111,6 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search\? type: html_selector @@ -1209,14 +1129,6 @@ focused_url_prefixes: - /search? - name: '360' - public_suffix: cn - alexa_domain: 360.cn - alexa_rank: 13 - category: search-engine - notes: null - input_field: true - search_form: true - search_div: true domains: - 360.cn - so.com @@ -1242,7 +1154,6 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: pn - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/s\? type: html_selector @@ -1266,14 +1177,6 @@ - /search - /i? 
- name: weibo - public_suffix: com - alexa_domain: weibo.com - alexa_rank: 15 - category: social-media - notes: null - input_field: false - search_form: false - search_div: true domains: - weibo.com - s.weibo.com @@ -1299,8 +1202,6 @@ - url_pattern: ^https?://[^/]+/weibo\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/weibo/[^/]+ type: html_selector @@ -1308,7 +1209,6 @@ - url_pattern: ^https?://[^/]+/weibo\? type: html_selector query_selector: div.search-input input[type="text"] - results_parsers: [] focused_url_prefixes: - /weibo? - /weibo @@ -1318,16 +1218,10 @@ - /pic? - /topic? - name: reddit - public_suffix: com - alexa_domain: reddit.com - alexa_rank: 17 - category: forum notes: 'Domains from https://old.reddit.com/prefs/ (dropdown) and https://github.com/JamieFarrelly/Popular-Site-Subdomains.' - input_field: false - search_form: false - search_div: true domains: + - reddit.com - af.reddit.com - ar.reddit.com - be.reddit.com @@ -1392,7 +1286,6 @@ - pt.reddit.com - pt_BR.reddit.com - redd.it - - reddit.com - redditmedia.com - ro.reddit.com - ru.reddit.com @@ -1413,15 +1306,13 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search type: html_selector query_selector: input#header-search-bar - url_pattern: ^https?://[^/]+/search type: html_selector - query_selector: form#search input + query_selector: form#search input[name="q"] results_parsers: - url_pattern: ^https?://[^/]+/search type: html_selector @@ -1447,62 +1338,27 @@ focused_url_prefixes: - /search - name: netflix - public_suffix: com - alexa_domain: netflix.com - alexa_rank: 18 - category: service - notes: null - input_field: null - search_form: null - search_div: null domains: - netflix.com query_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: instagram - public_suffix: com - alexa_domain: instagram.com - alexa_rank: 19 - category: social-media notes: 'Domains from https://github.com/JamieFarrelly/Popular-Site-Subdomains.' - input_field: null - search_form: null - search_div: null domains: + - instagram.com - developers.instagram.com - help.instagram.com - - instagram.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: vk - public_suffix: com - alexa_domain: vk.com - alexa_rank: 20 - category: social-media - notes: null - input_field: null - search_form: null - search_div: null domains: - vk.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: c[q] - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search\? type: html_selector @@ -1517,15 +1373,9 @@ focused_url_prefixes: - /search? - name: microsoft - public_suffix: com - alexa_domain: microsoft.com - alexa_rank: 21 - category: corporate notes: 'Domains from https://github.com/JamieFarrelly/Popular-Site-Subdomains.' 
- input_field: true - search_form: true - search_div: true domains: + - microsoft.com - advertising.microsoft.com - answers.microsoft.com - api.microsoft.com @@ -1555,7 +1405,6 @@ - info.microsoft.com - learn.microsoft.com - messenger.microsoft.com - - microsoft.com - mran.microsoft.com - msdn.microsoft.com - mva.microsoft.com @@ -1576,28 +1425,18 @@ - testconnectivity.microsoft.com - webgallery.microsoft.com - windows.microsoft.com + - windows.com query_parsers: - url_pattern: ^https?://[^/]+/[a-z]+-[a-z]+/search type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/[a-z]+-[a-z]+/search type: query_parameter parameter: skip - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - ^\/[a-z]+-[a-z]+\/search - name: csdn - public_suffix: net - alexa_domain: csdn.net - alexa_rank: 22 - category: social-media - notes: null - input_field: null - search_form: null - search_div: null domains: - csdn.net - so.csdn.net @@ -1605,24 +1444,13 @@ - url_pattern: ^https?://[^/]+/.*/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /so/search? - name: bing - public_suffix: com - alexa_domain: bing.com - alexa_rank: 25 - category: search-engine notes: 'Domains from https://github.com/JamieFarrelly/Popular-Site-Subdomains.' - input_field: false - search_form: false - search_div: false domains: - - be.bing.com - bing.com + - be.bing.com - blogs.bing.com - br.bing.com - cn.bing.com @@ -1698,15 +1526,9 @@ - /images/search? - /videos/search? - name: twitter - public_suffix: com - alexa_domain: twitter.com - alexa_rank: 26 - category: social-media notes: 'Domains from https://github.com/JamieFarrelly/Popular-Site-Subdomains.' 
- input_field: null - search_form: null - search_div: null domains: + - twitter.com - business.twitter.com - cards.twitter.com - dev.twitter.com @@ -1718,15 +1540,11 @@ - search.twitter.com - support.twitter.com - td.twitter.com - - twitter.com - - twitter.com - video.twitter.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search\? type: html_selector @@ -1747,35 +1565,15 @@ focused_url_prefixes: - /search? - name: twitch - public_suffix: tv - alexa_domain: twitch.tv - alexa_rank: 28 - category: streaming - notes: null - input_field: null - search_form: null - search_div: null domains: - twitch.tv query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: term - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: zoom - public_suffix: us - alexa_domain: zoom.us - alexa_rank: 30 - category: corporate - notes: null - input_field: null - search_form: null - search_div: null domains: - zoom.us - explore.zoom.us @@ -1783,26 +1581,17 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search type: fragment_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/search type: query_parameter parameter: first - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search - name: ebay - public_suffix: com - alexa_domain: ebay.com - alexa_rank: 31 - category: e-commerce notes: 'Domains from https://ebay.com/ (dropdown) and from https://github.com/JamieFarrelly/Popular-Site-Subdomains.' 
- input_field: false - search_form: false - search_div: false domains: + - ebay.com - adn.ebay.com - anywhere.ebay.com - applications.ebay.com @@ -1826,8 +1615,6 @@ - ebay.cn - ebay.co.jp - ebay.co.uk - - ebay.com - - ebay.com - ebay.com.au - ebay.com.hk - ebay.com.my @@ -1886,7 +1673,6 @@ - url_pattern: ^https?://[^/]+/([^?]+/)?i\.html\? type: query_parameter parameter: _pgn - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/([^?]+/)?i\.html\? type: html_selector @@ -1914,14 +1700,6 @@ - /sch/i.html? - /i.html? - name: naver - public_suffix: com - alexa_domain: naver.com - alexa_rank: 32 - category: search-engine - notes: null - input_field: false - search_form: false - search_div: true domains: - naver.com - search.naver.com @@ -1929,7 +1707,6 @@ - url_pattern: ^https?://[^/]+/search\.naver\? type: query_parameter parameter: query - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\.naver\? type: query_parameter @@ -1968,14 +1745,6 @@ focused_url_prefixes: - /search.naver? 
- name: aliexpress - public_suffix: com - alexa_domain: aliexpress.com - alexa_rank: 33 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - aliexpress.com query_parsers: @@ -1992,7 +1761,6 @@ - url_pattern: ^https?://[^/]+/wholesale type: query_parameter parameter: page - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/w/wholesale type: html_selector @@ -2017,14 +1785,6 @@ - /w/wholesale - /wholesale - name: yandex - public_suffix: ru - alexa_domain: yandex.ru - alexa_rank: 34 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - yandex.ru query_parsers: @@ -2041,7 +1801,6 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: p - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/images/search type: html_selector @@ -2049,20 +1808,11 @@ - url_pattern: ^https?://[^/]+/video/search type: html_selector query_selector: form.search2 input[name="text"] - results_parsers: [] focused_url_prefixes: - /search - /images/search - /video/search - name: linkedin - public_suffix: com - alexa_domain: linkedin.com - alexa_rank: 36 - category: social-media - notes: null - input_field: null - search_form: null - search_div: null domains: - linkedin.com query_parsers: @@ -2073,20 +1823,9 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: bongacams - public_suffix: com - alexa_domain: bongacams.com - alexa_rank: 37 - category: pornography - notes: null - input_field: null - search_form: null - search_div: null domains: - bongacams.com - en.bongacams.com @@ -2094,9 +1833,6 @@ - url_pattern: ^https?://[^/]+/(female|male|couples|trans|new-models)/tags/[^/]+ type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] 
results_parsers: - url_pattern: ^https?://[^/]+/(female|male|couples|trans|new-models)/tags/[^/]+ type: html_selector @@ -2111,17 +1847,10 @@ - /trans/tags/ - /new-models/tags/ - name: apple - public_suffix: com - alexa_domain: apple.com - alexa_rank: 39 - category: corporate notes: 'Domains from https://github.com/JamieFarrelly/Popular-Site-Subdomains.' - input_field: true - search_form: true - search_div: false domains: - - advertising.apple.com - apple.com + - advertising.apple.com - appleid.apple.com - asia.apple.com - asw.apple.com @@ -2170,20 +1899,9 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: pornhub - public_suffix: com - alexa_domain: pornhub.com - alexa_rank: 41 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - pornhub.com query_parsers: @@ -2194,7 +1912,6 @@ - url_pattern: ^https?://[^/]+/video/search\? type: query_parameter parameter: page - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/video/search\? type: html_selector @@ -2221,35 +1938,15 @@ focused_url_prefixes: - /video/search? - name: mail-ru - public_suffix: ru - alexa_domain: mail.ru - alexa_rank: 42 - category: web-portal - notes: null - input_field: null - search_form: null - search_div: null domains: - mail.ru query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: text - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: stackoverflow - public_suffix: com - alexa_domain: stackoverflow.com - alexa_rank: 43 - category: question-and-answer - notes: null - input_field: true - search_form: true - search_div: true domains: - stackoverflow.com query_parsers: @@ -2263,7 +1960,6 @@ - url_pattern: ^https?://[^/]+/questions/tagged type: query_parameter parameter: page - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/questions/tagged type: html_selector @@ -2285,14 +1981,6 @@ - /search? - /questions/tagged - name: tribunnews - public_suffix: com - alexa_domain: tribunnews.com - alexa_rank: 50 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - tribunnews.com query_parsers: @@ -2303,32 +1991,19 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: imdb - public_suffix: com - alexa_domain: imdb.com - alexa_rank: 51 - category: database - notes: null - input_field: true - search_form: true - search_div: true domains: - imdb.com query_parsers: - url_pattern: ^https?://[^/]+/find\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/find\? type: html_selector - query_selector: div#nb_search input + query_selector: div#nb_search input[name="q"] - url_pattern: ^https?://[^/]+/find\? type: html_selector query_selector: input#navbar-query @@ -2343,42 +2018,22 @@ title_selector: td.result_text a - url_pattern: ^https?://[^/]+/find\? 
type: html_selector - results_selector: div#main table tr:has(td[valign="top"]) - url_selector: td[valign="top"]:nth-child(3) a - title_selector: td[valign="top"]:nth-child(3) a + results_selector: div#main table tr:has(td:not(.media_strip_header) a:not(:has(img))) + url_selector: td:not(.media_strip_header) a:not(:has(img)) + title_selector: td:not(.media_strip_header) a:not(:has(img)) focused_url_prefixes: - /find - name: livejasmin - public_suffix: com - alexa_domain: livejasmin.com - alexa_rank: 52 - category: pornography - notes: null - input_field: null - search_form: null - search_div: null domains: - livejasmin.com query_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/(girls|boys)/[^/]+/.*Search type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/girls/ - /en/boys/ - name: chaturbate - public_suffix: com - alexa_domain: chaturbate.com - alexa_rank: 53 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - chaturbate.com query_parsers: @@ -2389,20 +2044,9 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - name: ok - public_suffix: ru - alexa_domain: ok.ru - alexa_rank: 56 - category: social-media - notes: null - input_field: null - search_form: null - search_div: null domains: - ok.ru query_parsers: @@ -2412,24 +2056,13 @@ - url_pattern: ^https?://[^/]+/music/search/tracks/[^/]+ type: path_segment segment: 4 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /dk? 
- /music/search/tracks/ - name: xvideos - public_suffix: com - alexa_domain: xvideos.com - alexa_rank: 57 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - xvideos.com + - xvideos2.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter @@ -2438,7 +2071,6 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: p - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/\? type: html_selector @@ -2465,14 +2097,6 @@ focused_url_prefixes: - / - name: github - public_suffix: com - alexa_domain: github.com - alexa_rank: 58 - category: service - notes: null - input_field: false - search_form: false - search_div: false domains: - github.com query_parsers: @@ -2483,7 +2107,6 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: p - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search\? type: html_selector @@ -2510,14 +2133,6 @@ focused_url_prefixes: - /search? - name: cnn - public_suffix: com - alexa_domain: cnn.com - alexa_rank: 64 - category: news-and-boulevard - notes: null - input_field: null - search_form: null - search_div: null domains: - cnn.com - edition.cnn.com @@ -2529,20 +2144,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: etsy - public_suffix: com - alexa_domain: etsy.com - alexa_rank: 67 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - etsy.com query_parsers: @@ -2553,7 +2157,6 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search\? type: html_selector @@ -2580,14 +2183,6 @@ focused_url_prefixes: - /search? 
- name: xhamster - public_suffix: com - alexa_domain: xhamster.com - alexa_rank: 70 - category: pornography - notes: null - input_field: null - search_form: null - search_div: null domains: - xhamster.com query_parsers: @@ -2598,20 +2193,9 @@ - url_pattern: ^https?://[^/]+/search/[^/]+page type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: sogou - public_suffix: com - alexa_domain: sogou.com - alexa_rank: 73 - category: search-engine - notes: null - input_field: false - search_form: false - search_div: true domains: - sogou.com query_parsers: @@ -2646,7 +2230,6 @@ - url_pattern: ^https?://[^/]+/weixin\? type: query_parameter parameter: page - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/sogou\? type: html_selector @@ -2675,77 +2258,33 @@ - /weixin? - /v? - name: canva - public_suffix: com - alexa_domain: canva.com - alexa_rank: 74 - category: service - notes: null - input_field: null - search_form: null - search_div: null domains: - canva.com query_parsers: - url_pattern: ^https?://[^/]+/design/play\? type: query_parameter parameter: layoutQuery - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /design/play? 
- name: tumblr - public_suffix: com - alexa_domain: tumblr.com - alexa_rank: 75 - category: social-media - notes: null - input_field: null - search_form: null - search_div: null domains: - tumblr.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: espn - public_suffix: com - alexa_domain: espn.com - alexa_rank: 76 - category: sports - notes: null - input_field: false - search_form: false - search_div: true domains: - espn.com query_parsers: - url_pattern: ^https?://[^/]+/search/_/q/[^/]+ type: path_segment segment: 4 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: instructure - public_suffix: com - alexa_domain: instructure.com - alexa_rank: 78 - category: corporate - notes: null - input_field: null - search_form: null - search_div: null domains: - instructure.com query_parsers: @@ -2756,20 +2295,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: indeed - public_suffix: com - alexa_domain: indeed.com - alexa_rank: 79 - category: career-jobs - notes: null - input_field: null - search_form: null - search_div: null domains: - indeed.com - de.indeed.com @@ -2777,7 +2305,6 @@ - url_pattern: ^https?://[^/]+/jobs\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/jobs\? type: query_parameter @@ -2811,22 +2338,12 @@ focused_url_prefixes: - /jobs? 
- name: roblox - public_suffix: com - alexa_domain: roblox.com - alexa_rank: 80 - category: gaming - notes: null - input_field: null - search_form: null - search_div: null domains: - roblox.com query_parsers: - url_pattern: ^https?://[^/]+/(catalog\/browse.aspx\?|discover) type: query_parameter parameter: Keyword - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/(catalog\/browse.aspx\?|discover) type: html_selector @@ -2841,39 +2358,20 @@ - /catalog/browse.aspx? - /discover - name: imgur - public_suffix: com - alexa_domain: imgur.com - alexa_rank: 81 - category: media-sharing - notes: null - input_field: null - search_form: null - search_div: null domains: - imgur.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search\? type: html_selector query_selector: span.search-term-text query_text: true - results_parsers: [] focused_url_prefixes: - /search? - name: flipkart - public_suffix: com - alexa_domain: flipkart.com - alexa_rank: 82 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - flipkart.com query_parsers: @@ -2884,20 +2382,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: fandom - public_suffix: com - alexa_domain: fandom.com - alexa_rank: 84 - category: wiki - notes: null - input_field: null - search_form: null - search_div: null domains: - fandom.com query_parsers: @@ -2911,21 +2398,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: bbc - public_suffix: co.uk - alexa_domain: bbc.co.uk - alexa_rank: 85 - category: news-and-boulevard - notes: null - input_field: null - search_form: null - search_div: null domains: - bbc.co.uk query_parsers: @@ -2936,20 +2412,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: detik - public_suffix: com - alexa_domain: detik.com - alexa_rank: 87 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - detik.com query_parsers: @@ -2960,44 +2425,22 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: booking - public_suffix: com - alexa_domain: booking.com - alexa_rank: 88 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - booking.com query_parsers: - url_pattern: ^https?://[^/]+/searchresults type: query_parameter parameter: ss - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/searchresults type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /searchresults - name: cnblogs - public_suffix: com - alexa_domain: cnblogs.com - alexa_rank: 89 - category: blog - notes: null - input_field: true - search_form: true - search_div: false domains: - cnblogs.com - zzk.cnblogs.com @@ -3033,9 +2476,6 @@ - url_pattern: ^https?://[^/]+/s\? type: query_parameter parameter: pageindex - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s? - /s/news? @@ -3043,14 +2483,6 @@ - /s/kb? - /s/blogpost? 
- name: walmart - public_suffix: com - alexa_domain: walmart.com - alexa_rank: 93 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - walmart.com query_parsers: @@ -3061,20 +2493,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: alibaba - public_suffix: com - alexa_domain: alibaba.com - alexa_rank: 95 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - alibaba.com query_parsers: @@ -3085,20 +2506,9 @@ - url_pattern: ^https?://[^/]+/trade/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /trade/search? - name: freepik - public_suffix: com - alexa_domain: freepik.com - alexa_rank: 98 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: false domains: - freepik.com query_parsers: @@ -3109,20 +2519,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: nih - public_suffix: gov - alexa_domain: nih.gov - alexa_rank: 100 - category: governmental - notes: null - input_field: true - search_form: true - search_div: false domains: - nih.gov - search.nih.gov @@ -3134,20 +2533,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: salesforce - public_suffix: com - alexa_domain: salesforce.com - alexa_rank: 102 - category: corporate - notes: null - input_field: null - search_form: null - search_div: null domains: - salesforce.com - force.com @@ -3155,25 +2543,16 @@ - url_pattern: ^https?://[^/]+/search type: fragment_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: first - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: stackexchange - public_suffix: com - alexa_domain: stackexchange.com - alexa_rank: 103 - category: question-and-answer notes: 'Domains from https://stackexchange.com/sites.' - input_field: true - search_form: true - search_div: true domains: + - stackexchange.com - 3dprinting.stackexchange.com - academia.stackexchange.com - ai.stackexchange.com @@ -3327,7 +2706,6 @@ - sports.stackexchange.com - sqa.stackexchange.com - stackapps.com - - stackexchange.com - stats.stackexchange.com - stellar.stackexchange.com - substrate.stackexchange.com @@ -3360,23 +2738,12 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: daum - public_suffix: net - alexa_domain: daum.net - alexa_rank: 104 - category: search-engine - notes: null - input_field: true - search_form: true - search_div: true domains: - - 100.daum.net - daum.net + - 100.daum.net - search.daum.net query_parsers: - url_pattern: ^https?://[^/]+/search @@ -3386,20 +2753,9 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: udemy - public_suffix: com - alexa_domain: udemy.com - alexa_rank: 105 - category: education - notes: null - input_field: null - search_form: null - search_div: null domains: - udemy.com query_parsers: @@ -3410,20 +2766,10 @@ - url_pattern: ^https?://[^/]+/courses/.*search\-query type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /courses - name: craigslist - public_suffix: org - alexa_domain: craigslist.org - alexa_rank: 115 - category: e-commerce - notes: 'Domains from https://www.craigslist.org/about/sites.' - input_field: true - search_form: true - search_div: true + notes: 'Domains from https://craigslist.org/about/sites.' domains: - craigslist.org - auburn.craigslist.org @@ -4144,24 +3490,13 @@ - url_pattern: ^https?://[^/]+/search/[^/]+ type: query_parameter parameter: query - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: query_parameter parameter: s - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: avito - public_suffix: ru - alexa_domain: avito.ru - alexa_rank: 120 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - avito.ru query_parsers: @@ -4172,50 +3507,30 @@ - url_pattern: ^https?://[^/]+/[^/]+\? 
type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - name: grid - public_suffix: id - alexa_domain: grid.id - alexa_rank: 122 - category: news-and-boulevard - notes: Uses Google - input_field: null - search_form: null - search_div: null + notes: Uses Google search domains: - grid.id query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: duckduckgo - public_suffix: com - alexa_domain: duckduckgo.com - alexa_rank: 124 - category: search-engine - notes: null - input_field: true - search_form: true - search_div: true domains: - duckduckgo.com - html.duckduckgo.com + - links.duckduckgo.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] + - url_pattern: ^https?://[^/]+/d\.js\? + type: query_parameter + parameter: q interpreted_query_parsers: - url_pattern: ^https?://[^/]+/\? type: html_selector @@ -4229,14 +3544,6 @@ focused_url_prefixes: - / - name: aliyun - public_suffix: com - alexa_domain: aliyun.com - alexa_rank: 125 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - aliyun.com query_parsers: @@ -4247,41 +3554,18 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: tiktok - public_suffix: com - alexa_domain: tiktok.com - alexa_rank: 126 - category: media-sharing - notes: null - input_field: null - search_form: null - search_div: null domains: - tiktok.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: shutterstock - public_suffix: com - alexa_domain: shutterstock.com - alexa_rank: 128 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - shutterstock.com query_parsers: @@ -4291,66 +3575,30 @@ - url_pattern: ^https?://[^/]+/(editorial|video|music)/search/[^/]+ type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - /editorial/search/ - /video/search/ - /music/search/ - name: xnxx - public_suffix: com - alexa_domain: xnxx.com - alexa_rank: 131 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - xnxx.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: gome - public_suffix: com.cn - alexa_domain: gome.com.cn - alexa_rank: 132 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: - gome.com.cn query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: question - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: w3schools - public_suffix: com - alexa_domain: w3schools.com - alexa_rank: 134 - category: education - notes: null - input_field: false - search_form: false - search_div: true domains: - w3schools.com query_parsers: @@ -4361,44 +3609,22 @@ - url_pattern: ^https?://[^/]+/.*#gsc type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - name: researchgate - public_suffix: net - alexa_domain: researchgate.net - alexa_rank: 135 - category: database - notes: null - input_field: null - search_form: null - search_div: null domains: - researchgate.net query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: query - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: tokopedia - public_suffix: com - alexa_domain: tokopedia.com - alexa_rank: 141 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - tokopedia.com query_parsers: @@ -4409,20 +3635,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: youm7 - public_suffix: com - alexa_domain: youm7.com - alexa_rank: 143 - category: news-and-boulevard - notes: null - input_field: null - search_form: null - search_div: null domains: - youm7.com query_parsers: @@ -4435,42 +3650,19 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /home/Search? - /Home/Search? 
- name: globo - public_suffix: com - alexa_domain: globo.com - alexa_rank: 146 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - globo.com query_parsers: - url_pattern: ^https?://[^/]+/busca type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /busca - name: slideshare - public_suffix: net - alexa_domain: slideshare.net - alexa_rank: 147 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: true domains: - slideshare.net query_parsers: @@ -4481,37 +3673,13 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: tistory - public_suffix: com - alexa_domain: tistory.com - alexa_rank: 151 - category: blog - notes: exluded; Uses daum search (https://daum.net/). 
- input_field: false - search_form: false - search_div: true + excluded: Redirects to Daum search domains: - tistory.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: china - public_suffix: com.cn - alexa_domain: china.com.cn - alexa_rank: 153 - category: governmental - notes: null - input_field: null - search_form: null - search_div: null domains: - china.com.cn - query.china.com.cn @@ -4526,83 +3694,36 @@ - url_pattern: ^https?://[^/]+/news/query type: query_parameter parameter: startPage - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /query - /news/query - name: varzesh3 - public_suffix: com - alexa_domain: varzesh3.com - alexa_rank: 155 - category: sports - notes: null - input_field: true - search_form: true - search_div: true domains: - varzesh3.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: wikihow - public_suffix: com - alexa_domain: wikihow.com - alexa_rank: 163 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - wikihow.com query_parsers: - url_pattern: ^https?://[^/]+/wikiHowTo\? type: query_parameter parameter: search - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/wikiHowTo\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /wikiHowTo? - name: quora - public_suffix: com - alexa_domain: quora.com - alexa_rank: 168 - category: question-and-answer - notes: 'excluded; Excluded from Internet Archive.' 
- input_field: null - search_form: null - search_div: null + excluded: Excluded from Internet Archive domains: - quora.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: bukalapak - public_suffix: com - alexa_domain: bukalapak.com - alexa_rank: 183 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - bukalapak.com query_parsers: @@ -4619,21 +3740,10 @@ - url_pattern: ^https?://[^/]+/products\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /products? - /products/ - name: ask - public_suffix: com - alexa_domain: ask.com - alexa_rank: 189 - category: question-and-answer - notes: null - input_field: false - search_form: false - search_div: false domains: - ask.com query_parsers: @@ -4644,7 +3754,6 @@ - url_pattern: ^https?://[^/]+/web\? type: query_parameter parameter: page - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/web\? type: html_selector @@ -4670,7 +3779,7 @@ results_selector: div.l-web-results div.web-result url_selector: a.sa_headline, h2.web-result-title a, div.web-result-title a title_selector: a.sa_headline, h2.web-result-title, div.web-result-title - snippet_selector: div.sa_abstract, p.web-result-description, div.web-result-description + snippet_selector: div.sa_abstract, p.web-result-description, div.web-result-description - url_pattern: ^https?://[^/]+/web\? type: html_selector results_selector: div#teoma-results div.pad.pl10.pr10, div#webr div.mb16, div#webr div.m10_0_16 @@ -4692,35 +3801,15 @@ focused_url_prefixes: - /web? 
- name: intuit - public_suffix: com - alexa_domain: intuit.com - alexa_rank: 190 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - intuit.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: search_term - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: usps - public_suffix: com - alexa_domain: usps.com - alexa_rank: 197 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - usps.com query_parsers: @@ -4731,98 +3820,46 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: PNO - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search -- name: steamcommunity - public_suffix: com - alexa_domain: steamcommunity.com - alexa_rank: 206 - category: gaming - notes: 'excluded; Only user search; Hubs are not free text.' 
- input_field: true - search_form: true - search_div: false +- name: steam + excluded: Only user search, Hubs are not free text domains: - steamcommunity.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] + - steampowered.com + query_parsers: + - url_pattern: ^https?://[^/]+/search + type: query_parameter + parameter: term + focused_url_prefixes: + - /search - name: deepl - public_suffix: com - alexa_domain: deepl.com - alexa_rank: 224 - category: service - notes: 'exluded; No search' - input_field: false - search_form: false - search_div: true + excluded: No search domains: - deepl.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: airbnb - public_suffix: com - alexa_domain: airbnb.com - alexa_rank: 241 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - airbnb.com query_parsers: - url_pattern: ^https?://[^/]+/s/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s - name: bankofamerica - public_suffix: com - alexa_domain: bankofamerica.com - alexa_rank: 243 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - bankofamerica.com query_parsers: - url_pattern: ^https?://[^/]+/global-search-public type: query_parameter parameter: state - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /global-search-public - name: wikimedia - public_suffix: org - alexa_domain: wikipedia.org - alexa_rank: 247 - category: wiki notes: 'Domains from https://commons.wikimedia.org/wiki/Data:Wikipedia_statistics/data.tab and from https://github.com/JamieFarrelly/Popular-Site-Subdomains.' 
- input_field: true - search_form: true - search_div: true domains: + - wikimedia.org - aa.wikibooks.org - aa.wikipedia.org - aa.wiktionary.org @@ -5678,8 +4715,8 @@ - wa.wikisource.org - wa.wiktionary.org - war.wikipedia.org + - wikidata.org - wikimania.wikimedia.org - - wikimedia.org - wikimediafoundation.org - wikipedia.org - wikitech.wikimedia.org @@ -5687,7 +4724,6 @@ - wo.wikiquote.org - wo.wiktionary.org - wuu.wikipedia.org - - www.wikidata.org - xal.wikipedia.org - xh.wikibooks.org - xh.wikipedia.org @@ -5727,7 +4763,6 @@ - url_pattern: ^https?://[^/]+/w/index.php\? type: query_parameter parameter: search - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/w/index.php\? type: query_parameter @@ -5752,41 +4787,22 @@ focused_url_prefixes: - /w/index.php? - name: blackboard - public_suffix: com - alexa_domain: blackboard.com - alexa_rank: 248 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - blackboard.com query_parsers: - url_pattern: ^https?://[^/]+/site-search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/site-search\? type: query_parameter parameter: first - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /site-search? - name: rambler - public_suffix: ru - alexa_domain: rambler.ru - alexa_rank: 261 - category: web-portal - notes: null - input_field: null - search_form: null - search_div: null domains: - - nova.rambler.ru - rambler.ru + - nova.rambler.ru - images.rambler.ru - rabota.rambler.ru query_parsers: @@ -5797,86 +4813,31 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
-- name: steampowered - public_suffix: com - alexa_domain: steampowered.com - alexa_rank: 274 - category: gaming - notes: null - input_field: true - search_form: true - search_div: true - domains: - - steampowered.com - query_parsers: - - url_pattern: ^https?://[^/]+/search - type: query_parameter - parameter: term - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: - - /search - name: investopedia - public_suffix: com - alexa_domain: investopedia.com - alexa_rank: 286 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - investopedia.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: 9gag - public_suffix: com - alexa_domain: 9gag.com - alexa_rank: 299 - category: media-sharing - notes: null - input_field: null - search_form: null - search_div: null domains: - 9gag.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: chegg - public_suffix: com - alexa_domain: chegg.com - alexa_rank: 300 - category: education - notes: null - input_field: null - search_form: null - search_div: null domains: - chegg.com query_parsers: @@ -5887,20 +4848,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search\? 
- name: kakao - public_suffix: com - alexa_domain: kakao.com - alexa_rank: 317 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - kakao.com query_parsers: @@ -5913,20 +4863,9 @@ segment: 3 remove_patterns: - 'page:' - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: glassdoor - public_suffix: com - alexa_domain: glassdoor.com - alexa_rank: 329 - category: career-jobs - notes: null - input_field: null - search_form: null - search_div: null domains: - glassdoor.com query_parsers: @@ -5943,23 +4882,11 @@ segment: 2 remove_patterns: - \.htm$ - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /Search - /Job - /Salaries - name: naukri - public_suffix: com - alexa_domain: naukri.com - alexa_rank: 338 - category: career-jobs - notes: null - input_field: null - search_form: null - search_div: null domains: - naukri.com query_parsers: @@ -5972,20 +4899,9 @@ segment: 1 remove_patterns: - ^[^/]+-jobs- - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - name: sourceforge - public_suffix: net - alexa_domain: sourceforge.net - alexa_rank: 357 - category: download - notes: null - input_field: null - search_form: null - search_div: null domains: - sourceforge.net query_parsers: @@ -5996,21 +4912,10 @@ - url_pattern: ^https?://[^/]+/(directory|software) type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /directory - /software - name: webmd - public_suffix: com - alexa_domain: webmd.com - alexa_rank: 367 - category: blog - notes: null - input_field: null - search_form: null - search_div: null domains: - webmd.com query_parsers: @@ -6021,20 +4926,9 @@ - url_pattern: ^https?://[^/]+/search/search_results type: query_parameter parameter: page - offset_parsers: [] - 
interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/search_results - name: youdao - public_suffix: com - alexa_domain: youdao.com - alexa_rank: 407 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - youdao.com - ke.youdao.com @@ -6045,92 +4939,45 @@ - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /result\? - /search - name: dbs - public_suffix: com.sg - alexa_domain: dbs.com.sg - alexa_rank: 425 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - dbs.com.sg query_parsers: - url_pattern: ^https?://[^/]+/searchresults\.page type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /searchresults.page - name: seznam - public_suffix: cz - alexa_domain: seznam.cz - alexa_rank: 430 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - - search.seznam.cz - seznam.cz + - search.seznam.cz query_parsers: - url_pattern: ^https?://[^/]+/?(obrazky\/)?\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/?((obrazky|videa|clanky)\/)?\? type: query_parameter parameter: from - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /obrazky/? - /videa/? - /clanky/? - name: chinaz - public_suffix: com - alexa_domain: chinaz.com - alexa_rank: 431 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - chinaz.com query_parsers: - url_pattern: ^https?://[^/]+/search\.aspx\? 
type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.aspx? - name: ecosia - public_suffix: org - alexa_domain: ecosia.org - alexa_rank: 438 - category: search-engine - notes: null - input_field: false - search_form: false - search_div: false domains: - ecosia.org query_parsers: @@ -6156,7 +5003,6 @@ - url_pattern: ^https?://[^/]+/videos\? type: query_parameter parameter: p - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search\? type: html_selector @@ -6183,18 +5029,11 @@ - /news? - /videos? - name: rediff - public_suffix: com - alexa_domain: rediff.com - alexa_rank: 449 - category: e-commerce notes: 'TODO: Set correct query parser for https://shopping.rediff.com/?sc_cid=shopping_ushomesrch#!bracelet/30-60 Bracelet is the query term Currently no parser supports these queries and/or offsets: - https://shopping.rediff.com/product/test?sc_cid=shopping_ushomesrch#!design/30-60 - https://shopping.rediff.com/#!design/30-60' - input_field: null - search_form: null - search_div: null domains: - rediff.com - shopping.rediff.com @@ -6205,27 +5044,16 @@ - url_pattern: ^https?://[^/]+/product/[^/]+ type: path_segment segment: 2 - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+/[0-9]+-[^/]+ type: path_segment segment: 3 remove_patterns: - -[^/]+$ - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /product - name: goo - public_suffix: ne.jp - alexa_domain: goo.ne.jp - alexa_rank: 464 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - goo.ne.jp - search.goo.ne.jp @@ -6237,7 +5065,6 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: MT - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/web\.jsp\? 
type: query_parameter @@ -6245,61 +5072,27 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: FR - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /web.jsp? - /search.php? - name: turkiye - public_suffix: gov.tr - alexa_domain: turkiye.gov.tr - alexa_rank: 477 - category: governmental - notes: null - input_field: true - search_form: true - search_div: false domains: - turkiye.gov.tr query_parsers: - url_pattern: ^https?://[^/]+/arama\? type: query_parameter parameter: aranan - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/arama\? type: query_parameter parameter: sf - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /arama? - name: europa - public_suffix: eu - alexa_domain: europa.eu - alexa_rank: 504 - category: governmental - notes: 'excluded; Query not in URL.' - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - europa.eu - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: dcinside - public_suffix: com - alexa_domain: dcinside.com - alexa_rank: 505 - category: forum - notes: null - input_field: true - search_form: true - search_div: true domains: - dcinside.com - search.dcinside.com @@ -6317,38 +5110,14 @@ - url_pattern: ^https?://[^/]+/post/p/[^/]+/sort/[^/]+/q/[^/]+ type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /combine - /post - name: news18 - public_suffix: com - alexa_domain: news18.com - alexa_rank: 688 - category: news-and-boulevard - notes: 'excluded; Query not in URL; Uses Google search (https://cse.google.com/).' 
- input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - news18.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: gov - public_suffix: gov.uk - alexa_domain: gov.gov.uk - alexa_rank: 700 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - gov.gov.uk query_parsers: @@ -6359,20 +5128,9 @@ - url_pattern: ^https?://[^/]+/search/all\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/all? - name: elfagr - public_suffix: com - alexa_domain: elfagr.com - alexa_rank: 704 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - elfagr.com - elfagr.org @@ -6384,20 +5142,9 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: bandcamp - public_suffix: com - alexa_domain: bandcamp.com - alexa_rank: 791 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - bandcamp.com query_parsers: @@ -6408,44 +5155,22 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: allrecipes - public_suffix: com - alexa_domain: allrecipes.com - alexa_rank: 858 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - allrecipes.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: 123rf - public_suffix: com - alexa_domain: 123rf.com - alexa_rank: 865 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - 123rf.com query_parsers: @@ -6466,21 +5191,10 @@ - url_pattern: ^https?://[^/]+/lizenzfreie-bilder/[^/]\.html\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /stock-photo - /lizenzfreie-bilder - name: fivethirtyeight - public_suffix: com - alexa_domain: fivethirtyeight.com - alexa_rank: 902 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - fivethirtyeight.com query_parsers: @@ -6491,20 +5205,9 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - name: stanford - public_suffix: edu - alexa_domain: stanford.edu - alexa_rank: 934 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - stanford.edu - google.stanford.edu @@ -6525,168 +5228,72 @@ - url_pattern: ^https?://[^/]+/linux\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - /? - /service/websearch? - /linux? - name: commbank - public_suffix: com.au - alexa_domain: commbank.com.au - alexa_rank: 954 - category: corporate - notes: 'excluded; Query not in URL.' 
- input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - commbank.com.au - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: smartsheet - public_suffix: com - alexa_domain: smartsheet.com - alexa_rank: 957 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - smartsheet.com query_parsers: - url_pattern: ^https?://[^/]+/search type: fragment_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search type: fragment_parameter parameter: first - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: caf - public_suffix: fr - alexa_domain: caf.fr - alexa_rank: 1060 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: - caf.fr query_parsers: - url_pattern: ^https?://[^/]+/allocataires/recherche\? type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /allocataires/recherche? - name: 4shared - public_suffix: com - alexa_domain: 4shared.com - alexa_rank: 1065 - category: torrent - notes: null - input_field: true - search_form: true - search_div: false domains: - 4shared.com query_parsers: - url_pattern: ^https?://[^/]+/web/q type: fragment_parameter parameter: query - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/web/q type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /web/q - name: lifewire - public_suffix: com - alexa_domain: lifewire.com - alexa_rank: 1117 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - lifewire.com query_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: 24h - public_suffix: com.vn - alexa_domain: 24h.com.vn - alexa_rank: 1122 - category: news-and-boulevard - notes: 'excluded; Redirects to Google; Uses Google search (https://cse.google.com/).' - input_field: false - search_form: false - search_div: true + excluded: Redirects to Google search domains: - 24h.com.vn - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: baskino - public_suffix: me - alexa_domain: baskino.me - alexa_rank: 1150 - category: streaming - notes: 'excluded; Query not in URL.' - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - baskino.me - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: tdameritrade - public_suffix: com - alexa_domain: tdameritrade.com - alexa_rank: 1166 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - tdameritrade.com query_parsers: @@ -6697,20 +5304,10 @@ - url_pattern: ^https?://[^/]+/search-results\.html\? type: query_parameter parameter: pageNumber - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search-results.html? 
- name: sfgate - public_suffix: com - alexa_domain: sfgate.com - alexa_rank: 1224 - category: news-and-boulevard - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - sfgate.com query_parsers: @@ -6721,23 +5318,11 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: gob - public_suffix: gob - alexa_domain: www.gob.mx - alexa_rank: 1255 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - gob.mx - - www.gob.mx query_parsers: - url_pattern: ^https?://[^/]+/busqueda\? type: query_parameter @@ -6746,41 +5331,18 @@ - url_pattern: ^https?://[^/]+/busqueda\? type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /busqueda? - name: etrade - public_suffix: com - alexa_domain: etrade.com - alexa_rank: 1262 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - etrade.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: vectorstock - public_suffix: com - alexa_domain: vectorstock.com - alexa_rank: 1279 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: false domains: - vectorstock.com query_parsers: @@ -6798,38 +5360,14 @@ segment: 2 remove_patterns: - ^[^/]+-vectors-page_ - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /royalty-free-vectors - /free-vectors - name: csod - public_suffix: com - alexa_domain: csod.com - alexa_rank: 1305 - category: corporate - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - csod.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: fmovies - public_suffix: to - alexa_domain: fmovies.to - alexa_rank: 1311 - category: streaming - notes: null - input_field: true - search_form: true - search_div: true domains: - fmovies.to - fmovies.wtf @@ -6847,21 +5385,10 @@ - url_pattern: ^https?://[^/]+/ajax/film/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - /ajax/film/search? 
- name: biggo - public_suffix: com.tw - alexa_domain: biggo.com.tw - alexa_rank: 1316 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - biggo.com.tw query_parsers: @@ -6872,20 +5399,9 @@ - url_pattern: ^https?://[^/]+/s type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s - name: sagepub - public_suffix: com - alexa_domain: sagepub.com - alexa_rank: 1317 - category: education - notes: null - input_field: true - search_form: true - search_div: false domains: - sagepub.com query_parsers: @@ -6896,39 +5412,15 @@ - url_pattern: ^https?://[^/]+/[a-z]+-[a-z]+/[a-z]+/(content|events|product)/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /product - /event - /content - name: pochta - public_suffix: ru - alexa_domain: pochta.ru - alexa_rank: 1339 - category: governmental - notes: excluded; Excluded from the web archive - input_field: true - search_form: true - search_div: true + excluded: Excluded from the Internet Archive domains: - pochta.ru - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: tasnimnews - public_suffix: com - alexa_domain: tasnimnews.com - alexa_rank: 1345 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - tasnimnews.com query_parsers: @@ -6939,20 +5431,9 @@ - url_pattern: ^https?://[^/]+/fa/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /fa/search? 
- name: cyberleninka - public_suffix: ru - alexa_domain: cyberleninka.ru - alexa_rank: 1352 - category: education - notes: null - input_field: false - search_form: false - search_div: true domains: - cyberleninka.ru query_parsers: @@ -6963,43 +5444,20 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: hangseng - public_suffix: com - alexa_domain: hangseng.com - alexa_rank: 1357 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - hangseng.com query_parsers: - url_pattern: ^https?://[^/]+/[a-z]+-[a-z]+/search type: query_parameter parameter: searchString - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en-hk/search - /zh-hk/search - /zh-cn/search - name: lg - public_suffix: com - alexa_domain: lg.com - alexa_rank: 1363 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - lg.com query_parsers: @@ -7033,10 +5491,6 @@ - url_pattern: ^https?://[^/]+/us/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /de/search - /uk/search @@ -7049,14 +5503,6 @@ - /br/search - /cl/search - name: semanticscholar - public_suffix: org - alexa_domain: semanticscholar.org - alexa_rank: 1369 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - semanticscholar.org query_parsers: @@ -7067,38 +5513,14 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: listindiario - public_suffix: com - alexa_domain: listindiario.com - alexa_rank: 1380 - category: news-and-boulevard - notes: 'Uses Google search (https://cse.google.com/). - Query not in URL.' - input_field: false - search_form: false - search_div: true + excluded: Query not in URL + notes: Uses Google search domains: - listindiario.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: nsw - public_suffix: gov.au - alexa_domain: nsw.gov.au - alexa_rank: 1383 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - nsw.gov.au query_parsers: @@ -7109,121 +5531,49 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: startpage - public_suffix: com - alexa_domain: startpage.com - alexa_rank: 1394 - category: search-engine - notes: 'excluded; Query not in URL.' - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - startpage.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: zdf - public_suffix: de - alexa_domain: zdf.de - alexa_rank: 1418 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - zdf.de query_parsers: - url_pattern: ^https?://[^/]+/suche\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /suche? 
- name: postermywall - public_suffix: com - alexa_domain: postermywall.com - alexa_rank: 1427 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: true domains: - postermywall.com query_parsers: - url_pattern: ^https?://[^/]+/index\.php/posters/search\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /index.php/posters/search? - name: jagranjosh - public_suffix: com - alexa_domain: jagranjosh.com - alexa_rank: 1437 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - jagranjosh.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: berkeley - public_suffix: edu - alexa_domain: berkeley.edu - alexa_rank: 1445 - category: education - notes: null - input_field: true - search_form: true - search_div: false domains: - berkeley.edu query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: gotporn - public_suffix: com - alexa_domain: gotporn.com - alexa_rank: 1455 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - gotporn.com query_parsers: @@ -7240,41 +5590,18 @@ - url_pattern: ^https?://[^/]+/search/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: dailypost - public_suffix: ng - alexa_domain: dailypost.ng - alexa_rank: 1457 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - dailypost.ng query_parsers: - url_pattern: 
^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: psu - public_suffix: edu - alexa_domain: psu.edu - alexa_rank: 1464 - category: education - notes: null - input_field: true - search_form: true - search_div: false domains: - psu.edu query_parsers: @@ -7285,41 +5612,18 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: akhbarelyom - public_suffix: com - alexa_domain: akhbarelyom.com - alexa_rank: 1471 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - akhbarelyom.com query_parsers: - url_pattern: ^https?://[^/]+/News/Search type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /News/Search - name: prensalibre - public_suffix: com - alexa_domain: prensalibre.com - alexa_rank: 1496 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - prensalibre.com query_parsers: @@ -7330,66 +5634,32 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: indosport - public_suffix: com - alexa_domain: indosport.com - alexa_rank: 1498 - category: sports - notes: null - input_field: false - search_form: false - search_div: true domains: - indosport.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: iz - public_suffix: ru - alexa_domain: iz.ru - alexa_rank: 1504 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - iz.ru query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: text - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: from - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: worldstarhiphop - public_suffix: com - alexa_domain: worldstarhiphop.com - alexa_rank: 1523 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: false domains: - worldstarhiphop.com query_parsers: @@ -7400,92 +5670,31 @@ - url_pattern: ^https?://[^/]+/videos/search\.php type: query_parameter parameter: start - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /videos/search.php - name: guru99 - public_suffix: com - alexa_domain: guru99.com - alexa_rank: 1529 - category: education - notes: excluded; Query no in URL - input_field: false - search_form: false - search_div: true + excluded: Query no in URL domains: - guru99.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: suumo - public_suffix: jp - alexa_domain: suumo.jp - alexa_rank: 1536 - category: corporate - notes: excluded; No search - input_field: true - search_form: true - search_div: false + excluded: No search domains: - suumo.jp - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: calculator - public_suffix: net - alexa_domain: calculator.net - alexa_rank: 1541 - category: service - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - calculator.net - 
query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: virgilio - public_suffix: it - alexa_domain: virgilio.it - alexa_rank: 1558 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - virgilio.it query_parsers: - url_pattern: ^https?://[^/]+/ricerca\? type: query_parameter parameter: qs - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /ricerca? - name: sapo - public_suffix: pt - alexa_domain: sapo.pt - alexa_rank: 1576 - category: web-portal notes: Uses Google - input_field: null - search_form: null - search_div: null domains: - sapo.pt query_parsers: @@ -7496,27 +5705,15 @@ - url_pattern: ^https?://[^/]+/pesquisa type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /pesquisa - name: idealo - public_suffix: de - alexa_domain: idealo.de - alexa_rank: 1579 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - idealo.de query_parsers: - url_pattern: ^https?://[^/]+/preisvergleich/MainSearchProductCategory(/100I16-[0-9]+)?\.html\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/preisvergleich/MainSearchProductCategory/100I16-[0-9]+\.html\? type: path_segment @@ -7524,43 +5721,22 @@ remove_patterns: - ^100I16- - \.html$ - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /preisvergleich/MainSearchProductCategory.html? - name: thebalancecareers - public_suffix: com - alexa_domain: thebalancecareers.com - alexa_rank: 1587 - category: career-jobs - notes: null - input_field: false - search_form: false - search_div: true domains: - thebalancecareers.com query_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: dpreview - public_suffix: com - alexa_domain: dpreview.com - alexa_rank: 1610 - category: review - notes: null - input_field: false - search_form: false - search_div: true domains: - dpreview.com query_parsers: @@ -7577,44 +5753,16 @@ - url_pattern: ^https?://[^/]+/products/search/ type: fragment_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - /products/search - /videos - name: tim - public_suffix: it - alexa_domain: tim.it - alexa_rank: 1642 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - tim.it - query_parsers: - - url_pattern: ^https?://[^/]+/\#searchPage\| - type: fragment_segment - segment: 1 - delimiter: '|' - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /#searchPage - name: tnaflix - public_suffix: com - alexa_domain: tnaflix.com - alexa_rank: 1653 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - tnaflix.com query_parsers: @@ -7625,46 +5773,27 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? 
- name: las2orillas - public_suffix: co - alexa_domain: las2orillas.co - alexa_rank: 1654 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - las2orillas.co query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: excite - public_suffix: co.jp - alexa_domain: excite.co.jp - alexa_rank: 1675 - category: search-engine - notes: null - input_field: true - search_form: true - search_div: true domains: + - excite.com - excite.co.jp - results.excite.com - websearch.excite.co.jp query_parsers: + - url_pattern: ^https?://[^/]+/\? + type: query_parameter + parameter: q - url_pattern: ^https?://[^/]+/serp\? type: query_parameter parameter: q @@ -7675,22 +5804,12 @@ - url_pattern: ^https?://[^/]+/serp\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /serp? - /? - name: 9anime - public_suffix: id - alexa_domain: 9anime.id - alexa_rank: 1687 - category: manga-anime - notes: null - input_field: false - search_form: false - search_div: true domains: + - aniwave.to - 9anime.gs - 9anime.id - 9anime.to @@ -7702,79 +5821,31 @@ - url_pattern: ^https?://[^/]+/filter\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /filter? 
- name: windy - public_suffix: com - alexa_domain: windy.com - alexa_rank: 1692 - category: service - notes: excluded; Only autocomplete - input_field: false - search_form: false - search_div: true + excluded: Only autocomplete search domains: - windy.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: wetter - public_suffix: com - alexa_domain: wetter.com - alexa_rank: 1711 - category: service - notes: null - input_field: true - search_form: true - search_div: false domains: - wetter.com query_parsers: - url_pattern: ^https?://[^/]+/suche type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /suche - name: techtudo - public_suffix: com.br - alexa_domain: techtudo.com.br - alexa_rank: 1718 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - techtudo.com.br query_parsers: - url_pattern: ^https?://[^/]+/busca type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /busca - name: books - public_suffix: com.tw - alexa_domain: books.com.tw - alexa_rank: 1728 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - books.com.tw query_parsers: @@ -7785,20 +5856,9 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/cat/[^/]+/sort/[^/]+/[^/]+/[^/]+/ovs/[^/]+/spell/[^/]+/[^/]+/[^/]+/page/[0-9]+ type: path_segment segment: 16 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: kaiserpermanente - public_suffix: org - alexa_domain: kaiserpermanente.org - alexa_rank: 1737 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - kaiserpermanente.org - 
healthy.kaiserpermanente.org @@ -7806,21 +5866,9 @@ - url_pattern: ^https?://[^/]+/pages/search\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /pages/search? - name: le360 - public_suffix: ma - alexa_domain: le360.ma - alexa_rank: 1743 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - le360.ma query_parsers: @@ -7831,20 +5879,9 @@ - url_pattern: ^https?://[^/]+/recherche/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /recherche - name: euronews - public_suffix: com - alexa_domain: euronews.com - alexa_rank: 1745 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - euronews.com query_parsers: @@ -7855,54 +5892,17 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: itau - public_suffix: com.br - alexa_domain: itau.com.br - alexa_rank: 1749 - category: corporate notes: No search - input_field: true - search_form: true - search_div: false domains: - itau.com.br - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: wiocha - public_suffix: pl - alexa_domain: wiocha.pl - alexa_rank: 1753 - category: media-sharing - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: true + excluded: Query not in URL domains: - wiocha.pl - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: utoronto - public_suffix: ca - alexa_domain: utoronto.ca - alexa_rank: 1755 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - utoronto.ca query_parsers: @@ -7919,38 +5919,14 @@ - url_pattern: ^https?://[^/]+/news/searchnews\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - /news/searchnews? - name: gitlab - public_suffix: com - alexa_domain: gitlab.com - alexa_rank: 1786 - category: service - notes: 'excluded; Query not in URL.' 
- input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - gitlab.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: commentcamarche - public_suffix: net - alexa_domain: commentcamarche.net - alexa_rank: 1797 - category: question-and-answer - notes: null - input_field: false - search_form: false - search_div: true domains: - commentcamarche.net query_parsers: @@ -7961,20 +5937,9 @@ - url_pattern: ^https?://[^/]+/s/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s - name: biglobe - public_suffix: ne.jp - alexa_domain: biglobe.ne.jp - alexa_rank: 1801 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - biglobe.ne.jp - cgi.search.biglobe.ne.jp @@ -7982,79 +5947,29 @@ - url_pattern: ^https?://[^/]+/cgi-bin/search type: query_parameter parameter: search - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/cgi-bin/search type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /cgi-bin/search - name: jf71qh5v14 - public_suffix: com - alexa_domain: jf71qh5v14.com - alexa_rank: 1807 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - jf71qh5v14.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: glosbe - public_suffix: com - alexa_domain: glosbe.com - alexa_rank: 1810 - category: service - notes: excluded - input_field: true - search_form: true - search_div: true domains: - glosbe.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: 
[] - name: eluniversal - public_suffix: com.mx - alexa_domain: eluniversal.com.mx - alexa_rank: 1811 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - eluniversal.com.mx query_parsers: - url_pattern: ^https?://[^/]+/resultados-busqueda/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /resultados-busqueda/ - name: akurat - public_suffix: co - alexa_domain: akurat.co - alexa_rank: 1813 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - akurat.co query_parsers: @@ -8065,20 +5980,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: monster - public_suffix: com - alexa_domain: monster.com - alexa_rank: 1820 - category: career-jobs - notes: null - input_field: null - search_form: null - search_div: null domains: - monster.com query_parsers: @@ -8089,22 +5993,11 @@ - url_pattern: ^https?://[^/]+/jobs/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /jobs/search\? - name: sportzwiki - public_suffix: com - alexa_domain: sportzwiki.com - alexa_rank: 1824 - category: spam-malware - notes: null - input_field: false - search_form: false - search_div: true domains: - - /? + - sportzwiki.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter @@ -8113,21 +6006,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - /page - name: arabi21 - public_suffix: com - alexa_domain: arabi21.com - alexa_rank: 1834 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - arabi21.com query_parsers: @@ -8138,38 +6020,14 @@ - url_pattern: ^https?://[^/]+/[A-z]+\/*.Search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /Stories/Search - /Story/AdvancedSearch - name: xm - public_suffix: com - alexa_domain: xm.com - alexa_rank: 1835 - category: '-' - notes: null - input_field: false - search_form: false - search_div: true domains: - xm.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] + - xmtrading.com - name: rei - public_suffix: com - alexa_domain: rei.com - alexa_rank: 1839 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - rei.com query_parsers: @@ -8180,20 +6038,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: ci123 - public_suffix: com - alexa_domain: ci123.com - alexa_rank: 1841 - category: search-engine - notes: null - input_field: true - search_form: true - search_div: false domains: - ci123.com query_parsers: @@ -8246,9 +6093,6 @@ - url_pattern: ^https?://[^/]+/ping/ type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /all - /ask @@ -8259,93 +6103,36 @@ - /zhishi - /ping - name: adyen - public_suffix: com - alexa_domain: adyen.com - alexa_rank: 1845 - category: corporate - notes: excluded; Search blocked (Requires login?) 
- input_field: false - search_form: false - search_div: true + excluded: Search blocked (Requires login?) domains: - adyen.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: b2b168 - public_suffix: com - alexa_domain: b2b168.com - alexa_rank: 1862 - category: e-commerce - notes: exlcuded - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - b2b168.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: thoughtco - public_suffix: com - alexa_domain: thoughtco.com - alexa_rank: 1871 - category: question-and-answer - notes: null - input_field: false - search_form: false - search_div: true domains: - thoughtco.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: iefimerida - public_suffix: gr - alexa_domain: iefimerida.gr - alexa_rank: 1872 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - iefimerida.gr query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: wa - public_suffix: gov - alexa_domain: wa.gov - alexa_rank: 1879 - category: governmental - notes: null - input_field: true - search_form: true - search_div: false domains: - wa.gov query_parsers: @@ -8359,37 +6146,13 @@ - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: republicworld - public_suffix: com - alexa_domain: republicworld.com - alexa_rank: 1913 - category: news-and-boulevard - notes: 'excluded; Query not in URL.' - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - republicworld.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mostaghelonline - public_suffix: com - alexa_domain: mostaghelonline.com - alexa_rank: 1917 - category: news - notes: null - input_field: true - search_form: true - search_div: false domains: - mostaghelonline.com query_parsers: @@ -8400,20 +6163,9 @@ - url_pattern: ^https?://[^/]+/newsstudios/archive/ type: query_parameter parameter: curp - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /newsstudios/archive/ - name: elyamnelaraby - public_suffix: com - alexa_domain: elyamnelaraby.com - alexa_rank: 1939 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - elyamnelaraby.com query_parsers: @@ -8424,58 +6176,22 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: beauty321 - public_suffix: com - alexa_domain: beauty321.com - alexa_rank: 1944 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - beauty321.com query_parsers: - 
url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: popbela - public_suffix: com - alexa_domain: popbela.com - alexa_rank: 1945 - category: news-and-boulevard - notes: exclude; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - popbela.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: finn - public_suffix: 'no' - alexa_domain: finn.no - alexa_rank: 1950 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - finn.no query_parsers: @@ -8486,20 +6202,9 @@ - url_pattern: ^https?://[^/]+/bap/forsale/search\.html\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /bap/forsale/search.html? - name: acfun - public_suffix: cn - alexa_domain: acfun.cn - alexa_rank: 1952 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - acfun.cn query_parsers: @@ -8510,20 +6215,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: pCursor - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: arzdigital - public_suffix: com - alexa_domain: arzdigital.com - alexa_rank: 1960 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - arzdigital.com query_parsers: @@ -8534,104 +6228,45 @@ - url_pattern: ^https?://[^/]+/search/page/[0-9]+/\? 
type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: bd-pratidin - public_suffix: com - alexa_domain: bd-pratidin.com - alexa_rank: 1963 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - bd-pratidin.com query_parsers: - url_pattern: ^https?://[^/]+/home/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /home/search? - name: doc88 - public_suffix: com - alexa_domain: doc88.com - alexa_rank: 1986 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: false domains: - doc88.com query_parsers: - url_pattern: ^https?://[^/]+/tag/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /tag - name: masrawy - public_suffix: com - alexa_domain: masrawy.com - alexa_rank: 1993 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - masrawy.com query_parsers: - url_pattern: ^https?://[^/]+/search/0/[^/]+ type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: php - public_suffix: net - alexa_domain: php.net - alexa_rank: 1995 - category: service - notes: null - input_field: true - search_form: true - search_div: false domains: - php.net query_parsers: - url_pattern: ^https?://[^/]+/manual-lookup\.php\? type: query_parameter parameter: pattern - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /manual-lookup.php? 
- name: taroot-rangi - public_suffix: com - alexa_domain: taroot-rangi.com - alexa_rank: 1996 - category: religious - notes: null - input_field: true - search_form: true - search_div: false domains: - taroot-rangi.com query_parsers: @@ -8645,21 +6280,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: amarujala - public_suffix: com - alexa_domain: amarujala.com - alexa_rank: 2000 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - amarujala.com query_parsers: @@ -8670,68 +6294,28 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: yixiin - public_suffix: com - alexa_domain: yixiin.com - alexa_rank: 2002 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - yixiin.com query_parsers: - url_pattern: ^https?://[^/]+/sell/search\.php\? type: query_parameter parameter: kw - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /sell/search.php? 
- name: xataka - public_suffix: com - alexa_domain: xataka.com - alexa_rank: 2008 - category: blog - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - xataka.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: julian-fashion - public_suffix: com - alexa_domain: julian-fashion.com - alexa_rank: 2014 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - julian-fashion.com query_parsers: - url_pattern: https?://[^/]+/[a-z]+-[a-z]+/products/search\? type: query_parameter parameter: searchKey - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /it-IT/products/search? - /de-DE/products/search? @@ -8742,31 +6326,10 @@ - /en-CN/products/search? - /en-IN/products/search? - name: onliner - public_suffix: by - alexa_domain: onliner.by - alexa_rank: 2016 - category: news-and-boulevard - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - onliner.by - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: licindia - public_suffix: in - alexa_domain: licindia.in - alexa_rank: 2023 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - licindia.in query_parsers: @@ -8777,20 +6340,9 @@ - url_pattern: ^https?://[^/]+/Search-Results\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /Search-Results - name: to10 - public_suffix: gr - alexa_domain: to10.gr - alexa_rank: 2042 - category: sports - notes: null - input_field: true - search_form: true - search_div: true domains: - to10.gr query_parsers: @@ -8801,168 +6353,74 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: nbg - public_suffix: gr - alexa_domain: nbg.gr - alexa_rank: 2045 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - nbg.gr query_parsers: - url_pattern: ^https?://[^/]+/el/idiwtes/search-results\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /el/idiwtes/search-results? - name: acs - public_suffix: org - alexa_domain: acs.org - alexa_rank: 2052 - category: career-jobs - notes: null - input_field: false - search_form: false - search_div: true domains: - acs.org query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: filmibeat - public_suffix: com - alexa_domain: filmibeat.com - alexa_rank: 2053 - category: blog - notes: excluded; Only autocomplete search - input_field: false - search_form: false - search_div: true + excluded: Only autocomplete search domains: - filmibeat.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: usembassy - public_suffix: gov - alexa_domain: usembassy.gov - alexa_rank: 2060 - category: governmental - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: true + excluded: Query not in URL domains: - usembassy.gov - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: nifty - public_suffix: com - alexa_domain: nifty.com - alexa_rank: 2074 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - nifty.com query_parsers: - url_pattern: ^https?://[^/]+/websearch/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/websearch/search\? type: query_parameter parameter: stpos - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /websearch/search? - name: skynewsarabia - public_suffix: com - alexa_domain: skynewsarabia.com - alexa_rank: 2076 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - skynewsarabia.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: agenziaentrate - public_suffix: gov.it - alexa_domain: agenziaentrate.gov.it - alexa_rank: 2089 - category: governmental - notes: null - input_field: true - search_form: true - search_div: false domains: - agenziaentrate.gov.it query_parsers: - url_pattern: ^https?://[^/]+/portale/ricerca\? type: query_parameter parameter: _it_smc_sogei_search_web_SogeiAdvancedSearchPortlet_keywords - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /portale/ricerca? - name: knowyourmeme - public_suffix: com - alexa_domain: knowyourmeme.com - alexa_rank: 2096 - category: database - notes: null - input_field: true - search_form: true - search_div: true domains: - knowyourmeme.com query_parsers: @@ -8973,58 +6431,22 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: filgoal - public_suffix: com - alexa_domain: filgoal.com - alexa_rank: 2106 - category: sports - notes: null - input_field: false - search_form: false - search_div: true domains: - filgoal.com query_parsers: - url_pattern: ^https?://[^/]+/search/filter\? type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/filter? 
- name: donanimhaber - public_suffix: com - alexa_domain: donanimhaber.com - alexa_rank: 2107 - category: blog - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - donanimhaber.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ashemaletube - public_suffix: com - alexa_domain: ashemaletube.com - alexa_rank: 2119 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - ashemaletube.com query_parsers: @@ -9035,20 +6457,9 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/[0-9]+ type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: ixxx - public_suffix: com - alexa_domain: ixxx.com - alexa_rank: 2125 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - ixxx.com query_parsers: @@ -9062,41 +6473,18 @@ - url_pattern: ^https?://[^/]+/search/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: programiz - public_suffix: com - alexa_domain: programiz.com - alexa_rank: 2132 - category: education - notes: null - input_field: true - search_form: true - search_div: false domains: - programiz.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: careers360 - public_suffix: com - alexa_domain: careers360.com - alexa_rank: 2139 - category: career-jobs - notes: null - input_field: false - search_form: false - search_div: true domains: - careers360.com query_parsers: @@ -9107,20 +6495,9 @@ - url_pattern: ^https?://[^/]+/qna\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /qna? - name: myfonts - public_suffix: com - alexa_domain: myfonts.com - alexa_rank: 2141 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - myfonts.com query_parsers: @@ -9134,41 +6511,18 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: product_data[page] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: monografias - public_suffix: com - alexa_domain: monografias.com - alexa_rank: 2164 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - monografias.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: worldbank - public_suffix: org - alexa_domain: worldbank.org - alexa_rank: 2178 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - worldbank.org query_parsers: @@ -9179,41 +6533,18 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search\? type: query_parameter parameter: currentTab - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search? - name: zulily - public_suffix: com - alexa_domain: zulily.com - alexa_rank: 2186 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - zulily.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: klix - public_suffix: ba - alexa_domain: klix.ba - alexa_rank: 2212 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - klix.ba query_parsers: @@ -9224,62 +6555,27 @@ - url_pattern: ^https?://[^/]+/pretraga\? type: query_parameter parameter: str - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /pretraga? - name: ew - public_suffix: com - alexa_domain: ew.com - alexa_rank: 2220 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - ew.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: warcraftlogs - public_suffix: com - alexa_domain: warcraftlogs.com - alexa_rank: 2227 - category: gaming - notes: null - input_field: false - search_form: false - search_div: true domains: - warcraftlogs.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: term - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: yellowpages - public_suffix: com - alexa_domain: yellowpages.com - alexa_rank: 2232 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - yellowpages.com query_parsers: @@ -9290,100 +6586,40 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: dy2018 - public_suffix: com - alexa_domain: dy2018.com - alexa_rank: 2239 - category: streaming - notes: 'excluded; Query not in URL.' 
- input_field: true - search_form: true - search_div: true + excluded: Query not in URL domains: - dy2018.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: vlive - public_suffix: tv - alexa_domain: vlive.tv - alexa_rank: 2253 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - vlive.tv query_parsers: - url_pattern: ^https?://[^/]+/vstore/search\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /vstore/search? - name: answers - public_suffix: com - alexa_domain: answers.com - alexa_rank: 2275 - category: question-and-answer - notes: null - input_field: null - search_form: null - search_div: null domains: - answers.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: ilfattoquotidiano - public_suffix: it - alexa_domain: ilfattoquotidiano.it - alexa_rank: 2292 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - ilfattoquotidiano.it query_parsers: - url_pattern: ^https?://[^/]+/risultati-di-ricerca type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /risultati-di-ricerca - name: thumbzilla - public_suffix: com - alexa_domain: thumbzilla.com - alexa_rank: 2294 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - thumbzilla.com query_parsers: @@ -9394,20 +6630,9 @@ - url_pattern: ^https?://[^/]+/tags/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /tags - name: arynews - public_suffix: tv - alexa_domain: arynews.tv - alexa_rank: 2303 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - arynews.tv query_parsers: @@ -9418,37 +6643,13 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: wetteronline - public_suffix: de - alexa_domain: wetteronline.de - alexa_rank: 2307 - category: service - notes: excluded; No SERP - input_field: true - search_form: true - search_div: true + excluded: No SERP domains: - wetteronline.de - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: sportbox - public_suffix: ru - alexa_domain: sportbox.ru - alexa_rank: 2312 - category: sports - notes: null - input_field: true - search_form: true - search_div: true domains: - sportbox.ru - news.sportbox.ru @@ -9456,21 +6657,9 @@ - url_pattern: ^https?://[^/]+/reports/search-content\? type: query_parameter parameter: keys - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search-content? - name: '19888' - public_suffix: tv - alexa_domain: 19888.tv - alexa_rank: 2313 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - 19888.tv query_parsers: @@ -9495,21 +6684,10 @@ segment: 2 remove_patterns: - ^p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /chanpin - /provide - name: info - public_suffix: com - alexa_domain: info.com - alexa_rank: 2330 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - info.com query_parsers: @@ -9520,20 +6698,9 @@ - url_pattern: ^https?://[^/]+/serp\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /serp? - name: igg-games - public_suffix: com - alexa_domain: igg-games.com - alexa_rank: 2341 - category: download - notes: null - input_field: true - search_form: true - search_div: true domains: - igg-games.com query_parsers: @@ -9544,128 +6711,53 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? -- name: xmtrading - public_suffix: com - alexa_domain: xmtrading.com - alexa_rank: 2343 - category: '-' - notes: excluded; No search - input_field: false - search_form: false - search_div: true - domains: - - xmtrading.com - - xm.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ubuntu - public_suffix: com - alexa_domain: ubuntu.com - alexa_rank: 2346 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - ubuntu.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: nyc - public_suffix: gov - alexa_domain: nyc.gov - alexa_rank: 2355 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - nyc.gov query_parsers: - url_pattern: ^https?://[^/]+/search/index\.page\? type: query_parameter parameter: search-terms - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/index.page? - name: nikkansports - public_suffix: com - alexa_domain: nikkansports.com - alexa_rank: 2366 - category: sports - notes: null - input_field: true - search_form: true - search_div: true domains: - nikkansports.com query_parsers: - url_pattern: ^https?://[^/]+/search/index\.html\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: navyfederal - public_suffix: org - alexa_domain: navyfederal.org - alexa_rank: 2382 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - navyfederal.org query_parsers: - url_pattern: ^https?://[^/]+/search\.html\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\.html\? type: query_parameter parameter: skipFrom - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.html - name: filmweb - public_suffix: pl - alexa_domain: filmweb.pl - alexa_rank: 2399 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - filmweb.pl query_parsers: @@ -9679,21 +6771,10 @@ - url_pattern: ^https?://[^/]+/films/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /films/search? - /search? 
- name: docker - public_suffix: com - alexa_domain: docker.com - alexa_rank: 2400 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - docker.com query_parsers: @@ -9704,37 +6785,13 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: sf_paged - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: doramatv - public_suffix: live - alexa_domain: doramatv.live - alexa_rank: 2401 - category: streaming - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - doramatv.live - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: watanserb - public_suffix: com - alexa_domain: watanserb.com - alexa_rank: 2418 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - watanserb.com query_parsers: @@ -9745,121 +6802,49 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: movieweb - public_suffix: com - alexa_domain: movieweb.com - alexa_rank: 2435 - category: blog - notes: null - input_field: true - search_form: true - search_div: true domains: - movieweb.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: leagueofgraphs - public_suffix: com - alexa_domain: leagueofgraphs.com - alexa_rank: 2448 - category: gaming - notes: null - input_field: false - search_form: false - search_div: true domains: - leagueofgraphs.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: channelmyanmar - public_suffix: org - alexa_domain: channelmyanmar.org - alexa_rank: 2451 - category: '-' - notes: exclude; Page not loading - input_field: true - search_form: true - search_div: false + excluded: Page not loading domains: - channelmyanmar.org - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: besoccer - public_suffix: com - alexa_domain: besoccer.com - alexa_rank: 2459 - category: sports - notes: null - input_field: false - search_form: false - search_div: true domains: - besoccer.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: razer - public_suffix: com - alexa_domain: razer.com - alexa_rank: 2462 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - razer.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - 
offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: royanews - public_suffix: tv - alexa_domain: royanews.tv - alexa_rank: 2465 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - royanews.tv query_parsers: @@ -9870,20 +6855,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: ulta - public_suffix: com - alexa_domain: ulta.com - alexa_rank: 2467 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - ulta.com query_parsers: @@ -9900,64 +6874,23 @@ segment: 2 space_patterns: - '-' - page_parsers: [] - offset_parsers: - - url_pattern: ^https?://[^/]+/ulta/a/_/Ntt- - type: query_parameter - parameter: false - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /shop - /ulta/a/_/ - /brand - name: meteofrance - public_suffix: com - alexa_domain: meteofrance.com - alexa_rank: 2469 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - meteofrance.com query_parsers: - url_pattern: ^https?://[^/]+/recherche/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /recherche - name: khabarfarsi - public_suffix: com - alexa_domain: khabarfarsi.com - alexa_rank: 2474 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - khabarfarsi.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: pc6 - public_suffix: com - alexa_domain: pc6.com - alexa_rank: 2475 - category: download - notes: null - input_field: true - search_form: true - 
search_div: true domains: - pc6.com query_parsers: @@ -9968,20 +6901,9 @@ - url_pattern: ^https?://[^/]+/cse/search\? type: query_parameter parameter: entry - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /cse/search? - name: haibunda - public_suffix: com - alexa_domain: haibunda.com - alexa_rank: 2476 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - haibunda.com query_parsers: @@ -9992,20 +6914,9 @@ - url_pattern: ^https?://[^/]+/search/[0-9]+ type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: opensubtitles - public_suffix: org - alexa_domain: opensubtitles.org - alexa_rank: 2483 - category: download - notes: null - input_field: true - search_form: true - search_div: true domains: - opensubtitles.org query_parsers: @@ -10014,78 +6925,28 @@ segment: 4 remove_patterns: - moviename- - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/search2/sublanguageid-[a-z]+/moviename-[^/]+/offset-[0-9]+ type: path_segment segment: 5 remove_patterns: - offset- - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search2 - name: ampproject - public_suffix: org - alexa_domain: ampproject.org - alexa_rank: 2484 - category: corporate - notes: excluded; No search - input_field: true - search_form: true - search_div: true + excluded: No search domains: - ampproject.org - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: citibank - public_suffix: co.in - alexa_domain: citibank.co.in - alexa_rank: 2495 - category: corporate - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - citibank.co.in - online.citibank.co.in - query_parsers: [] - page_parsers: [] 
- offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: leo - public_suffix: org - alexa_domain: leo.org - alexa_rank: 2501 - category: service - notes: excluded; Only autocomplete - input_field: false - search_form: false - search_div: true + excluded: Only autocomplete domains: - leo.org - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: thomann - public_suffix: de - alexa_domain: thomann.de - alexa_rank: 2514 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - thomann.de query_parsers: @@ -10096,37 +6957,14 @@ - url_pattern: ^https?://[^/]+/[a-z+]/search_dir\.html\? type: query_parameter parameter: pg - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search_dir.html? - name: alarab - public_suffix: com - alexa_domain: alarab.com - alexa_rank: 2534 - category: news-and-boulevard - notes: excluded; Query not in URL; Uses Google - input_field: false - search_form: false - search_div: true + excluded: Query not in URL + notes: Uses Google search domains: - alarab.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: corporatefinanceinstitute - public_suffix: com - alexa_domain: corporatefinanceinstitute.com - alexa_rank: 2535 - category: education - notes: null - input_field: false - search_form: false - search_div: true domains: - corporatefinanceinstitute.com query_parsers: @@ -10140,21 +6978,10 @@ - url_pattern: ^https?://[^/]+/resources/\? type: query_parameter parameter: page_number - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /resources - name: 01net - public_suffix: com - alexa_domain: 01net.com - alexa_rank: 2550 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - 01net.com query_parsers: @@ -10168,21 +6995,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: all-free-download - public_suffix: com - alexa_domain: all-free-download.com - alexa_rank: 2556 - category: download - notes: null - input_field: true - search_form: true - search_div: true domains: - all-free-download.com query_parsers: @@ -10193,82 +7009,33 @@ - \.html$ space_patterns: - '-' - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /free-vector - /free-photos - /font - name: mydailymagazine - public_suffix: com - alexa_domain: mydailymagazine.com - alexa_rank: 2572 - category: news-and-boulevard notes: No search - input_field: false - search_form: false - search_div: true domains: - mydailymagazine.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: sworld - public_suffix: co.uk - alexa_domain: sworld.co.uk - alexa_rank: 2596 - category: blog - notes: null - input_field: true - search_form: true - search_div: false domains: - sworld.co.uk query_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: alanba - public_suffix: com.kw - alexa_domain: alanba.com.kw - alexa_rank: 2598 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - alanba.com.kw query_parsers: - url_pattern: ^https?://[^/]+/newspaper/search type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /newspaper/search - name: pornhd - public_suffix: com - alexa_domain: pornhd.com - alexa_rank: 2612 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - pornhd.com query_parsers: @@ -10282,20 +7049,9 @@ - url_pattern: ^https?://[^/]+/search/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: naijaloaded - public_suffix: com.ng - alexa_domain: naijaloaded.com.ng - alexa_rank: 2613 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - naijaloaded.com.ng query_parsers: @@ -10309,21 +7065,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: wenxuecity - public_suffix: com - alexa_domain: wenxuecity.com - alexa_rank: 2624 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - wenxuecity.com query_parsers: @@ -10334,20 +7079,9 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: storyblocks - public_suffix: com - alexa_domain: storyblocks.com - alexa_rank: 2625 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - storyblocks.com query_parsers: @@ -10358,23 +7092,12 @@ - url_pattern: ^https?://[^/]+/[^/]+/search/ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /all-video/search - /video/search - /audio/search - /images/search - name: sbnation - public_suffix: com - alexa_domain: sbnation.com - alexa_rank: 2627 - category: sports - notes: null - input_field: true - search_form: true - search_div: false domains: - sbnation.com query_parsers: @@ -10385,20 +7108,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: milenio - public_suffix: com - alexa_domain: milenio.com - alexa_rank: 2641 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - milenio.com query_parsers: @@ -10409,80 +7121,32 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: fajar - public_suffix: co.id - alexa_domain: fajar.co.id - alexa_rank: 2661 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - fajar.co.id query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: walla - public_suffix: co.il - alexa_domain: walla.co.il - alexa_rank: 2663 - category: news-and-boulevard - notes: null - input_field: null - search_form: null - search_div: null domains: - - search.walla.co.il - walla.co.il + - search.walla.co.il query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: win-rar - public_suffix: com - alexa_domain: win-rar.com - alexa_rank: 2673 - category: service - notes: 'excluded; Query not in URL.' 
- input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - win-rar.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: islcollective - public_suffix: com - alexa_domain: islcollective.com - alexa_rank: 2695 - category: education - notes: null - input_field: false - search_form: false - search_div: true domains: - islcollective.com - en.islcollective.com @@ -10494,119 +7158,46 @@ - url_pattern: ^https?://[^/]+/[^/]+/search/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /english-esl-worksheets/search - /english-esl-powerpoints/search - /english-esl-video-lessons/search - name: thebalance - public_suffix: com - alexa_domain: thebalance.com - alexa_rank: 2701 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - thebalance.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: usmagazine - public_suffix: com - alexa_domain: usmagazine.com - alexa_rank: 2704 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - usmagazine.com query_parsers: - url_pattern: ^https?://[^/]+/search-results/\? 
type: query_parameter parameter: _s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search-results - name: weheartit - public_suffix: com - alexa_domain: weheartit.com - alexa_rank: 2712 - category: news-and-boulevard - notes: excluded; Excluded from web archive - input_field: true - search_form: true - search_div: false + excluded: Excluded from the Internet Archive domains: - weheartit.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: fitbit - public_suffix: com - alexa_domain: fitbit.com - alexa_rank: 2716 - category: corporate - notes: exclude; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - fitbit.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: domestika - public_suffix: org - alexa_domain: domestika.org - alexa_rank: 2720 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - domestika.org query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: careerbuilder - public_suffix: com - alexa_domain: careerbuilder.com - alexa_rank: 2728 - category: career-jobs - notes: null - input_field: true - search_form: true - search_div: false domains: - careerbuilder.com query_parsers: @@ -10617,145 +7208,62 @@ - url_pattern: ^https?://[^/]+/jobs\? 
type: query_parameter parameter: page_number - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /jobs - name: financialexpress - public_suffix: com - alexa_domain: financialexpress.com - alexa_rank: 2730 - category: news-and-boulevard - notes: excluded; Redirects to Google; Uses Google - input_field: false - search_form: false - search_div: true + excluded: Redirects to Google search domains: - financialexpress.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: colorado - public_suffix: edu - alexa_domain: colorado.edu - alexa_rank: 2768 - category: education - notes: null - input_field: false - search_form: false - search_div: true domains: - colorado.edu query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: cse - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: clubic - public_suffix: com - alexa_domain: clubic.com - alexa_rank: 2786 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - clubic.com query_parsers: - url_pattern: ^https?://[^/]+/rechercher/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /rechercher/? - name: searchmulty - public_suffix: com - alexa_domain: searchmulty.com - alexa_rank: 2795 - category: search-engine - notes: null - input_field: true - search_form: true - search_div: false domains: - searchmulty.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: 20minutes - public_suffix: fr - alexa_domain: 20minutes.fr - alexa_rank: 2798 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - 20minutes.fr query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: thespruce - public_suffix: com - alexa_domain: thespruce.com - alexa_rank: 2806 - category: blog - notes: null - input_field: true - search_form: true - search_div: false domains: - thespruce.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: npmjs - public_suffix: com - alexa_domain: npmjs.com - alexa_rank: 2809 - category: service - notes: null - input_field: true - search_form: true - search_div: false domains: - npmjs.com query_parsers: @@ -10766,92 +7274,31 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: arstechnica - public_suffix: com - alexa_domain: arstechnica.com - alexa_rank: 2822 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - arstechnica.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: whois - public_suffix: com - alexa_domain: whois.com - alexa_rank: 2824 - category: search-engine - notes: excluded; No SERP - input_field: true - search_form: true - search_div: false + excluded: No SERP domains: - whois.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ufreegames - public_suffix: com - alexa_domain: ufreegames.com - alexa_rank: 2825 - category: gaming - notes: exclude; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - ufreegames.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: bseindia - public_suffix: com - alexa_domain: bseindia.com - alexa_rank: 2832 - category: '-' - notes: excluded; Only autocomplete search - input_field: false - search_form: false - search_div: true + excluded: Only autocomplete search domains: - bseindia.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: proquest - public_suffix: com - alexa_domain: proquest.com - alexa_rank: 2843 - category: search-engine notes: Query is encoded - input_field: true - search_form: true - search_div: true domains: - proquest.com query_parsers: @@ -10862,42 +7309,19 @@ - url_pattern: ^https?://[^/]+/results/[^/]+/[0-9]+ type: path_segment segment: 3 - offset_parsers: [] - 
interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /results - name: balenciaga - public_suffix: com - alexa_domain: balenciaga.com - alexa_rank: 2854 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - balenciaga.com query_parsers: - url_pattern: ^https?://[^/]+/[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en-us/search? - /de-de/search? - name: boyfriendtv - public_suffix: com - alexa_domain: boyfriendtv.com - alexa_rank: 2863 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - boyfriendtv.com query_parsers: @@ -10908,41 +7332,18 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/[0-9]+ type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: deutsche-bank - public_suffix: de - alexa_domain: deutsche-bank.de - alexa_rank: 2882 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - deutsche-bank.de query_parsers: - url_pattern: ^https?://[^/]+/pk/service-und-kontakt/kontakt/suche\.html\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /pk/service-und-kontakt/kontakt/suche.html? - name: slidesgo - public_suffix: com - alexa_domain: slidesgo.com - alexa_rank: 2887 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - slidesgo.com query_parsers: @@ -10953,20 +7354,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: puma - public_suffix: com - alexa_domain: puma.com - alexa_rank: 2894 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - puma.com - us.puma.com @@ -10975,91 +7365,45 @@ - url_pattern: ^https?://[^/]+/[a-z]+/[a-z]+/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/[a-z]+/search\? type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /us/en/search? - /de/de/search? - name: nudevista - public_suffix: com - alexa_domain: nudevista.com - alexa_rank: 2901 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - nudevista.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: uspto - public_suffix: gov - alexa_domain: uspto.gov - alexa_rank: 2907 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - uspto.gov query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: sketchfab - public_suffix: com - alexa_domain: sketchfab.com - alexa_rank: 2913 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - sketchfab.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: vodafone - public_suffix: de - alexa_domain: vodafone.de - alexa_rank: 2943 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - vodafone.de query_parsers: @@ -11069,39 +7413,14 @@ - url_pattern: ^https?://[^/]+/global-search-results\? type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /privat/suche.html? - /global-search-results? - name: uptvs - public_suffix: com - alexa_domain: uptvs.com - alexa_rank: 2947 - category: streaming - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - uptvs.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: pornhat - public_suffix: com - alexa_domain: pornhat.com - alexa_rank: 2951 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - pornhat.com query_parsers: @@ -11112,58 +7431,22 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/[0-9]+ type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: societegenerale - public_suffix: fr - alexa_domain: societegenerale.fr - alexa_rank: 2965 - category: corporate - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - societegenerale.fr - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: furaffinity - public_suffix: net - alexa_domain: furaffinity.net - alexa_rank: 2967 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: false domains: - furaffinity.net query_parsers: - url_pattern: 
^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: netacad - public_suffix: com - alexa_domain: netacad.com - alexa_rank: 2989 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - netacad.com query_parsers: @@ -11174,20 +7457,9 @@ - url_pattern: ^https?://[^/]+/search/node/ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - search/node/ - name: nipic - public_suffix: com - alexa_domain: nipic.com - alexa_rank: 3007 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - nipic.com - soso.nipic.com @@ -11199,20 +7471,9 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: check24 - public_suffix: de - alexa_domain: check24.de - alexa_rank: 3013 - category: comparison - notes: null - input_field: true - search_form: true - search_div: true domains: - check24.de - kredit.check24.de @@ -11230,22 +7491,11 @@ - url_pattern: ^https?://[^/]+/suche\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /suche? - /vergleich? 
- /strom/vergleich - name: fashionnova - public_suffix: com - alexa_domain: fashionnova.com - alexa_rank: 3033 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - fashionnova.com query_parsers: @@ -11256,79 +7506,31 @@ - url_pattern: ^https?://[^/]+/pages/search-results/ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /pages/search-results - name: playground - public_suffix: ru - alexa_domain: playground.ru - alexa_rank: 3045 - category: gaming - notes: null - input_field: false - search_form: false - search_div: true domains: - playground.ru query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: virgool - public_suffix: io - alexa_domain: virgool.io - alexa_rank: 3052 - category: blog - notes: null - input_field: true - search_form: true - search_div: true domains: - virgool.io query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: marieclaire - public_suffix: com.tw - alexa_domain: marieclaire.com.tw - alexa_rank: 3055 - category: blog - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - marieclaire.com.tw - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: joemonster - public_suffix: org - alexa_domain: joemonster.org - alexa_rank: 3072 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - joemonster.org query_parsers: @@ -11339,29 +7541,16 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: pageID - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? - name: qwant - public_suffix: com - alexa_domain: qwant.com - alexa_rank: 3093 - category: search-engine - notes: null - input_field: true - search_form: true - search_div: false domains: - qwant.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: + interpreted_query_parsers: - url_pattern: ^https?://[^/]+/\? type: html_selector query_selector: div[class^=SearchBar-module] input[type=search] @@ -11414,35 +7603,15 @@ focused_url_prefixes: - /? 
- name: thairath - public_suffix: co.th - alexa_domain: thairath.co.th - alexa_rank: 3095 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - thairath.co.th query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: powerschool - public_suffix: com - alexa_domain: powerschool.com - alexa_rank: 3097 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - powerschool.com query_parsers: @@ -11456,21 +7625,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: pearsonvue - public_suffix: com - alexa_domain: pearsonvue.com - alexa_rank: 3104 - category: education - notes: null - input_field: false - search_form: false - search_div: true domains: - pearsonvue.com - home.pearsonvue.com @@ -11482,58 +7640,22 @@ - url_pattern: ^https?://[^/]+/Search\.aspx\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /Search.aspx? - name: denverpost - public_suffix: com - alexa_domain: denverpost.com - alexa_rank: 3106 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - denverpost.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: uefa - public_suffix: com - alexa_domain: uefa.com - alexa_rank: 3117 - category: sports - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - uefa.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: celine - public_suffix: com - alexa_domain: celine.com - alexa_rank: 3124 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - celine.com - celine.cn @@ -11544,10 +7666,6 @@ - url_pattern: ^https?://[^/]+/[a-z]+-[a-z]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - /en-us/search? @@ -11556,49 +7674,15 @@ - /es-mx/search? - /en-ng/search? - name: tcs - public_suffix: com - alexa_domain: tcs.com - alexa_rank: 3130 - category: corporate - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - tcs.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: cima4u - public_suffix: tv - alexa_domain: cima4u.tv - alexa_rank: 3135 - category: streaming - notes: 'excluded; No search.' 
- input_field: false - search_form: false - search_div: true + excluded: No search domains: - - cima4u.life - cima4u.tv - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] + - cima4u.life - name: sbi - public_suffix: co.in - alexa_domain: sbi.co.in - alexa_rank: 3140 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: - sbi.co.in query_parsers: @@ -11609,54 +7693,17 @@ - url_pattern: ^https?://[^/]+/web/personal-banking type: query_parameter parameter: _com_liferay_portal_search_web_portlet_SearchPortlet_cur - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /web/personal-banking - name: w3 - public_suffix: org - alexa_domain: w3.org - alexa_rank: 3144 - category: corporate - notes: excluded; Uses duckduckgo; Redirects to duckduckgo - input_field: false - search_form: false - search_div: true + excluded: Redirects to DuckDuckGo search domains: - w3.org - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: shippingchina - public_suffix: com - alexa_domain: shippingchina.com - alexa_rank: 3147 - category: service - notes: 'excluded; No standard SERP: Search for ships.' - input_field: true - search_form: true - search_div: false + excluded: No search domains: - shippingchina.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: secnews - public_suffix: gr - alexa_domain: secnews.gr - alexa_rank: 3156 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - secnews.gr query_parsers: @@ -11670,38 +7717,14 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: tsite - public_suffix: jp - alexa_domain: tsite.jp - alexa_rank: 3168 - category: corporate - notes: exclude; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - tsite.jp - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: interieur - public_suffix: gouv.fr - alexa_domain: interieur.gouv.fr - alexa_rank: 3170 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - interieur.gouv.fr query_parsers: @@ -11712,20 +7735,9 @@ - url_pattern: ^https?://[^/]+/recherche\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /recherche? - name: gib - public_suffix: gov.tr - alexa_domain: gib.gov.tr - alexa_rank: 3204 - category: governmental - notes: null - input_field: true - search_form: true - search_div: false domains: - gib.gov.tr query_parsers: @@ -11736,37 +7748,14 @@ - url_pattern: ^https?://[^/]+/search/node/ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/node/ - name: aglasem - public_suffix: com - alexa_domain: aglasem.com - alexa_rank: 3215 - category: education - notes: excluded; Query not in URL; Uses Google - input_field: false - search_form: false - search_div: true + excluded: Query not in URL + notes: Uses Google search domains: - aglasem.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: zalando - public_suffix: de - alexa_domain: zalando.de - alexa_rank: 3232 - category: e-commerce - notes: null - input_field: null - search_form: null 
- search_div: null domains: - zalando.de - en.zalando.de @@ -11774,10 +7763,6 @@ - url_pattern: ^https?://[^/]+/.*/.*q type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /catalogue - /women @@ -11790,14 +7775,6 @@ - /sports-men - /sports-kids - name: jbhifi - public_suffix: com.au - alexa_domain: jbhifi.com.au - alexa_rank: 3240 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - jbhifi.com.au query_parsers: @@ -11808,20 +7785,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: chefkoch - public_suffix: de - alexa_domain: chefkoch.de - alexa_rank: 3244 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: true domains: - chefkoch.de query_parsers: @@ -11834,7 +7800,6 @@ segment: 2 remove_patterns: - s - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/rs/s[0-9]+/[^/]+/ type: html_selector @@ -11856,27 +7821,13 @@ title_selector: a.search-result-title - url_pattern: ^https?://[^/]+/rs/s[0-9]+/[^/]+/ type: html_selector - results_selector: table.result tr[onclick]:has(td.name > div > span.s) - url_selector: td.name > a - title_selector: td.name > a - snippet_selector: td.name > div > span.s - - url_pattern: ^https?://[^/]+/rs/s[0-9]+/[^/]+/ - type: html_selector - results_selector: table.result tr[onclick]:has(td.name > span.s) - url_selector: td.name > a - title_selector: td.name > a - snippet_selector: td.name > span.s + results_selector: table.result tr[bgcolor] + url_selector: td.name a + title_selector: td.name a + snippet_selector: td.name span.s:last-child focused_url_prefixes: - /rs/s - name: microcenter - public_suffix: com - alexa_domain: microcenter.com - alexa_rank: 3248 - category: 
e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - microcenter.com query_parsers: @@ -11887,37 +7838,13 @@ - url_pattern: ^https?://[^/]+/search/search_results\.aspx type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/search_results.aspx - name: yourdailysportfix - public_suffix: com - alexa_domain: yourdailysportfix.com - alexa_rank: 3251 - category: sports - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - yourdailysportfix.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: sec - public_suffix: gov - alexa_domain: sec.gov - alexa_rank: 3260 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - sec.gov - secsearch.sec.gov @@ -11929,20 +7856,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: dim - public_suffix: gov.az - alexa_domain: dim.gov.az - alexa_rank: 3283 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - dim.gov.az query_parsers: @@ -11953,41 +7869,18 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: PAGEN_1 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: tv9telugu - public_suffix: com - alexa_domain: tv9telugu.com - alexa_rank: 3288 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - tv9telugu.com query_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: 3isk - public_suffix: tv - alexa_domain: 3isk.tv - alexa_rank: 3322 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - 3isk.tv - 3isk.video @@ -11997,24 +7890,12 @@ segment: 2 space_patterns: - \+ - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: sciencemag - public_suffix: org - alexa_domain: sciencemag.org - alexa_rank: 3325 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - - sciencemag.org - science.org + - sciencemag.org query_parsers: - url_pattern: ^https?://[^/]+/action/doSearch\? type: query_parameter @@ -12023,41 +7904,18 @@ - url_pattern: ^https?://[^/]+/action/doSearch\? type: query_parameter parameter: startPage - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /action/doSearch? - name: dicio - public_suffix: com.br - alexa_domain: dicio.com.br - alexa_rank: 3334 - category: service - notes: null - input_field: true - search_form: true - search_div: true domains: - dicio.com.br query_parsers: - url_pattern: ^https?://[^/]+/pesquisa\.php\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /pesquisa.php? - name: bunshun - public_suffix: jp - alexa_domain: bunshun.jp - alexa_rank: 3344 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - bunshun.jp query_parsers: @@ -12068,41 +7926,18 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: sedo - public_suffix: com - alexa_domain: sedo.com - alexa_rank: 3347 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - sedo.com query_parsers: - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: fzg360 - public_suffix: com - alexa_domain: fzg360.com - alexa_rank: 3436 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - fzg360.com - gz.fzg360.com @@ -12119,62 +7954,27 @@ segment: 6 remove_patterns: - \.html$ - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /news/lists - name: simplilearn - public_suffix: com - alexa_domain: simplilearn.com - alexa_rank: 3446 - category: education - notes: null - input_field: false - search_form: false - search_div: true domains: - simplilearn.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: tag - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: tovima - public_suffix: gr - alexa_domain: tovima.gr - alexa_rank: 3463 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - tovima.gr query_parsers: - url_pattern: ^https?://[^/]+/search/\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: redwap - public_suffix: me - alexa_domain: redwap.me - alexa_rank: 3469 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - redwap.me query_parsers: @@ -12183,21 +7983,9 @@ segment: 2 space_patterns: - '-' - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /to - name: manualslib - public_suffix: com - alexa_domain: manualslib.com - alexa_rank: 3471 - category: search-engine - notes: null - input_field: true - search_form: true - search_div: false domains: - manualslib.com query_parsers: @@ -12208,62 +7996,27 @@ - url_pattern: ^https?://[^/]+/[a-z]+/[^/]+.html\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /brand - name: groww - public_suffix: in - alexa_domain: groww.in - alexa_rank: 3477 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - groww.in query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: fosshub - public_suffix: com - alexa_domain: fosshub.com - alexa_rank: 3527 - category: download - notes: null - input_field: true - search_form: true - search_div: false domains: - fosshub.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: search-query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: tineye - public_suffix: com - alexa_domain: tineye.com - alexa_rank: 3533 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - tineye.com query_parsers: @@ -12274,62 +8027,27 @@ - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: ixl - public_suffix: com - alexa_domain: ixl.com - alexa_rank: 3555 - category: education - notes: null - input_field: true - search_form: true - search_div: false domains: - ixl.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: investors - public_suffix: com - alexa_domain: investors.com - alexa_rank: 3568 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - investors.com query_parsers: - url_pattern: ^https?://[^/]+/search-results/ type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search-results - name: javbus - public_suffix: com - alexa_domain: javbus.com - alexa_rank: 3573 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - javbus.com query_parsers: @@ -12340,41 +8058,19 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/[0-9]+ type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: ilna - public_suffix: news - alexa_domain: ilna.news - alexa_rank: 3581 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - ilna.news query_parsers: - url_pattern: ^https?://[^/]+/fa/newsstudios/search type: 
query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /fa/newsstudios/search - name: loc - public_suffix: gov - alexa_domain: loc.gov - alexa_rank: 3591 - category: governmental notes: 'Library of Congress.' - input_field: null - search_form: null - search_div: null domains: - loc.gov query_parsers: @@ -12385,9 +8081,6 @@ - url_pattern: ^https?://[^/]+/(search|audio|books|film-and-videos|manuscripts|maps|notated-music|newspapers|photos|web-archives) type: query_parameter parameter: sp - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - /audio @@ -12400,14 +8093,6 @@ - /photos - /web-archives - name: bradesco - public_suffix: com.br - alexa_domain: bradesco.com.br - alexa_rank: 3631 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - bradesco.com.br - banco.bradesco @@ -12415,42 +8100,18 @@ - url_pattern: ^https?://[^/]+/html/classic/resultado-busca/ type: query_parameter parameter: termsearched - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /html/classic/resultado-busca/ - name: thefappeningblog - public_suffix: com - alexa_domain: thefappeningblog.com - alexa_rank: 3639 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - thefappeningblog.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: msdmanuals - public_suffix: com - alexa_domain: msdmanuals.com - alexa_rank: 3640 - category: education - notes: null - input_field: false - search_form: false - search_div: true domains: - msdmanuals.com query_parsers: @@ -12461,22 +8122,11 @@ - url_pattern: 
^https?://[^/]+/.*SearchResults\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /SearchResults? - /professional/SearchResults? - /home/SearchResults? - name: fuq - public_suffix: com - alexa_domain: fuq.com - alexa_rank: 3665 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - fuq.com query_parsers: @@ -12490,20 +8140,9 @@ - url_pattern: ^https?://[^/]+/search/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: sarvgyan - public_suffix: com - alexa_domain: sarvgyan.com - alexa_rank: 3693 - category: education - notes: null - input_field: true - search_form: true - search_div: false domains: - sarvgyan.com query_parsers: @@ -12517,42 +8156,19 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+ type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: anyporn - public_suffix: com - alexa_domain: anyporn.com - alexa_rank: 3702 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - anyporn.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: simplyhired - public_suffix: com - alexa_domain: simplyhired.com - alexa_rank: 3747 - category: career-jobs - notes: null - input_field: false - search_form: false - search_div: true domains: - simplyhired.com query_parsers: @@ -12563,20 +8179,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: pn - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: xhamsterpremium - public_suffix: com - alexa_domain: xhamsterpremium.com - alexa_rank: 3759 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - xhamsterpremium.com query_parsers: @@ -12586,99 +8191,37 @@ - url_pattern: ^https?://[^/]+/categories\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /c/amateur/videos? - /categories? - /p/xhamster-category/click? - name: diretta - public_suffix: it - alexa_domain: diretta.it - alexa_rank: 3760 - category: gambling - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - diretta.it - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: coolmathgames - public_suffix: com - alexa_domain: coolmathgames.com - alexa_rank: 3767 - category: gaming - notes: excluded; Only autocomplete search - input_field: false - search_form: false - search_div: true + excluded: Only autocomplete search domains: - coolmathgames.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: podbean - public_suffix: com - alexa_domain: podbean.com - alexa_rank: 3790 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: false domains: - podbean.com query_parsers: - url_pattern: ^https?://[^/]+/site/search/index\? type: query_parameter parameter: v - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /site/search/index? 
- name: newgrounds - public_suffix: com - alexa_domain: newgrounds.com - alexa_rank: 3798 - category: gaming - notes: null - input_field: true - search_form: true - search_div: false domains: - newgrounds.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: terms - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: tebyan - public_suffix: net - alexa_domain: tebyan.net - alexa_rank: 3811 - category: religious - notes: null - input_field: false - search_form: false - search_div: true domains: - tebyan.net query_parsers: @@ -12689,92 +8232,30 @@ - url_pattern: ^https?://[^/]+/newindex\.aspx\? type: query_parameter parameter: pi - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /newindex.aspx? - name: paheal - public_suffix: net - alexa_domain: paheal.net - alexa_rank: 3832 - category: pornography - notes: exclude; Excluded from web archive - input_field: true - search_form: true - search_div: true + excluded: Excluded from the Internet Archive domains: - paheal.net - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: sportzbonanza - public_suffix: com - alexa_domain: sportzbonanza.com - alexa_rank: 3838 - category: sports - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - sportzbonanza.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: dailypakistan - public_suffix: com.pk - alexa_domain: dailypakistan.com.pk - alexa_rank: 3865 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - dailypakistan.com.pk query_parsers: - url_pattern: ^https?://[^/]+/\?cx type: query_parameter 
parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /?cx - name: edf - public_suffix: fr - alexa_domain: edf.fr - alexa_rank: 3911 - category: governmental - notes: exclude; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - edf.fr - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: bigbasket - public_suffix: com - alexa_domain: bigbasket.com - alexa_rank: 3922 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - bigbasket.com query_parsers: @@ -12785,20 +8266,9 @@ - url_pattern: ^https?://[^/]+/ps/\? type: fragment_parameter parameter: '!page' - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /ps - name: iwank - public_suffix: tv - alexa_domain: iwank.tv - alexa_rank: 3930 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - iwank.tv query_parsers: @@ -12809,37 +8279,13 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/[0-9]+/ type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: coolors - public_suffix: co - alexa_domain: coolors.co - alexa_rank: 3947 - category: service - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - coolors.co - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: instant-gaming - public_suffix: com - alexa_domain: instant-gaming.com - alexa_rank: 3949 - category: gaming - notes: null - input_field: true - search_form: true - search_div: true domains: - instant-gaming.com query_parsers: @@ 
-12850,54 +8296,17 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search/\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search - name: ovh - public_suffix: com - alexa_domain: ovh.com - alexa_rank: 3960 - category: corporate - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: true + excluded: Query not in URL domains: - ovh.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: jiayuan - public_suffix: com - alexa_domain: jiayuan.com - alexa_rank: 3980 - category: dating - notes: excluded; No search - input_field: true - search_form: true - search_div: true + excluded: No search domains: - jiayuan.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: kongfz - public_suffix: com - alexa_domain: kongfz.com - alexa_rank: 3982 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - kongfz.com - search.kongfz.com @@ -12909,20 +8318,9 @@ - url_pattern: ^https?://[^/]+/product_result/\? type: query_parameter parameter: pagenum - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /product_result? - name: babyshop - public_suffix: com - alexa_domain: babyshop.com - alexa_rank: 4048 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - babyshop.com query_parsers: @@ -12933,54 +8331,17 @@ - url_pattern: ^https?://[^/]+/search/searchbytext\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/searchbytext? 
- name: '18183' - public_suffix: com - alexa_domain: 18183.com - alexa_rank: 4053 - category: gaming - notes: excluded; Redirects to baidu; Uses baidu - input_field: true - search_form: true - search_div: false + excluded: Redirects to Baidu search domains: - 18183.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: 3bmeteo - public_suffix: com - alexa_domain: 3bmeteo.com - alexa_rank: 4060 - category: service - notes: excluded; Only autocomplete search - input_field: true - search_form: true - search_div: true + excluded: Only autocomplete search domains: - 3bmeteo.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: e621 - public_suffix: net - alexa_domain: e621.net - alexa_rank: 4069 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - e621.net query_parsers: @@ -12991,20 +8352,9 @@ - url_pattern: ^https?://[^/]+/posts\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /posts? - name: interactivebrokers - public_suffix: com - alexa_domain: interactivebrokers.com - alexa_rank: 4074 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - interactivebrokers.com query_parsers: @@ -13015,20 +8365,9 @@ - url_pattern: ^https?://[^/]+/en/search/index\.php\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search/index.php? 
- name: bottegaveneta - public_suffix: com - alexa_domain: bottegaveneta.com - alexa_rank: 4076 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - bottegaveneta.com - bottegaveneta.cn @@ -13039,41 +8378,16 @@ - url_pattern: ^https?://[^/]+/[a-z]+-[a-z]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - search? - /en-en/search? - /en-us/search? - /de-de/search? - name: arvancloud - public_suffix: com - alexa_domain: arvancloud.com - alexa_rank: 4160 - category: corporate - notes: exluded; No search - input_field: true - search_form: true - search_div: true + excluded: No search domains: - arvancloud.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: movistarplus - public_suffix: es - alexa_domain: movistarplus.es - alexa_rank: 4171 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - movistarplus.es - atencionalcliente.movistar.es @@ -13081,79 +8395,30 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: club-k - public_suffix: net - alexa_domain: club-k.net - alexa_rank: 4177 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - club-k.net query_parsers: - url_pattern: ^https?://[^/]+/index type: query_parameter parameter: searchword - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/index type: query_parameter parameter: limitstart - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /index.php - name: bina - public_suffix: az - alexa_domain: bina.az - alexa_rank: 4188 - category: e-commerce - notes: excluded; Only autocomplete search - input_field: false - search_form: false - search_div: true + excluded: Only autocomplete search domains: - bina.az - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: upmedia - public_suffix: mg - alexa_domain: upmedia.mg - alexa_rank: 4191 - category: news-and-boulevard - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - upmedia.mg - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: travelermaster - public_suffix: com - alexa_domain: travelermaster.com - alexa_rank: 4197 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - travelermaster.com query_parsers: @@ -13167,21 +8432,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: elecfans - public_suffix: com - alexa_domain: elecfans.com - alexa_rank: 4202 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - elecfans.com - s.elecfans.com @@ -13193,20 +8447,9 @@ - url_pattern: ^https?://[^/]+/s\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s? - name: tailorbrands - public_suffix: com - alexa_domain: tailorbrands.com - alexa_rank: 4208 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - tailorbrands.com - support.tailorbrands.com @@ -13218,20 +8461,9 @@ - url_pattern: ^https?://[^/]+/hc/[a-z]+-[a-z]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /hc/en-us/search? - name: russianfood - public_suffix: com - alexa_domain: russianfood.com - alexa_rank: 4226 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - russianfood.com query_parsers: @@ -13242,83 +8474,36 @@ - url_pattern: ^https?://[^/]+/search/simple/index\.php\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/simple/index.php? - name: life - public_suffix: ru - alexa_domain: life.ru - alexa_rank: 4269 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - life.ru query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: tuasaude - public_suffix: com - alexa_domain: tuasaude.com - alexa_rank: 4291 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - tuasaude.com query_parsers: - url_pattern: ^https?://[^/]+/busca/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /busca - name: iconfinder - public_suffix: com - alexa_domain: iconfinder.com - alexa_rank: 4296 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - iconfinder.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: themoviedb - public_suffix: org - alexa_domain: themoviedb.org - alexa_rank: 4320 - category: database - notes: null - input_field: true - search_form: true - search_div: false domains: - themoviedb.org query_parsers: @@ -13329,107 +8514,37 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: jiomart - public_suffix: com - alexa_domain: jiomart.com - alexa_rank: 4329 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - jiomart.com query_parsers: - url_pattern: ^https?://[^/]+/catalogsearch/result\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /catalogsearch/result? -- name: xvideos2 - public_suffix: com - alexa_domain: xvideos2.com - alexa_rank: 4445 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true - domains: - - xvideos2.com - query_parsers: - - url_pattern: ^https?://[^/]+/\? 
- type: query_parameter - parameter: k - page_parsers: - - url_pattern: ^https?://[^/]+/\? - type: query_parameter - parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: - - /? - name: dhl - public_suffix: de - alexa_domain: dhl.de - alexa_rank: 4452 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: + - dhl.com - dhl.de query_parsers: - url_pattern: ^https?://[^/]+/de/privatkunden/suche\.html\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /de/privatkunden/suche.html? - name: duosecurity - public_suffix: com - alexa_domain: duosecurity.com - alexa_rank: 4456 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: - duosecurity.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: pcworld - public_suffix: com - alexa_domain: pcworld.com - alexa_rank: 4471 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - pcworld.com query_parsers: @@ -13440,20 +8555,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: jiwu - public_suffix: com - alexa_domain: jiwu.com - alexa_rank: 4510 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - jiwu.com - foshan.jiwu.com @@ -13464,63 +8568,27 @@ remove_patterns: - ^list-key - \.html$ - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /loupan - name: afternic - public_suffix: com - alexa_domain: afternic.com - alexa_rank: 4531 - category: service - notes: null - input_field: true - search_form: true - search_div: true domains: - afternic.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: k - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: notebookcheck - public_suffix: net - alexa_domain: notebookcheck.net - alexa_rank: 4542 - category: review - notes: null - input_field: true - search_form: true - search_div: true domains: - notebookcheck.net query_parsers: - url_pattern: ^https?://[^/]+/Google-Search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /Google-Search - name: altibbi - public_suffix: com - alexa_domain: altibbi.com - alexa_rank: 4543 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - altibbi.com query_parsers: @@ -13531,37 +8599,13 @@ - url_pattern: ^https?://[^/]+/search/questions\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/questions - name: planetsuzy - public_suffix: org - alexa_domain: planetsuzy.org - alexa_rank: 4553 - category: pornography - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - planetsuzy.org - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: zaful - public_suffix: com - alexa_domain: zaful.com - alexa_rank: 4560 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - zaful.com query_parsers: @@ -13572,37 +8616,13 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: pn - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: po-kaki-to - public_suffix: com - alexa_domain: po-kaki-to.com - alexa_rank: 4575 - category: pornography - notes: excluded; No search - input_field: true - search_form: true - search_div: false + excluded: No search domains: - po-kaki-to.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ig - public_suffix: com.br - alexa_domain: ig.com.br - alexa_rank: 4588 - category: web-portal - notes: null - input_field: false - search_form: false - search_div: true domains: - ig.com.br query_parsers: @@ -13613,64 +8633,30 @@ - url_pattern: ^https?://[^/]+/buscar type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /buscar - name: focus - public_suffix: de - alexa_domain: focus.de - alexa_rank: 4604 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - focus.de query_parsers: - 
url_pattern: ^https?://[^/]+/suche type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /suche - name: latam - public_suffix: com - alexa_domain: latam.com - alexa_rank: 4676 - category: e-commerce - notes: Search results are flight connections; Query is destination - input_field: false - search_form: false - search_div: true + notes: Search results are flight connections, query is the destination domains: - latam.com query_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/[a-z]+/[^/]+\? type: query_parameter parameter: destination - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /gb/en/flight-offers? - /de/de/flugangebote? - /mx/es/ofertas-vuelos? - name: anysex - public_suffix: com - alexa_domain: anysex.com - alexa_rank: 4681 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - anysex.com query_parsers: @@ -13684,41 +8670,18 @@ - url_pattern: ^https?://[^/]+/search/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: geihui - public_suffix: com - alexa_domain: geihui.com - alexa_rank: 4694 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - geihui.com query_parsers: - url_pattern: ^https?://[^/]+/searchlog\? type: query_parameter parameter: k - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /searchlog? - name: patagonia - public_suffix: com - alexa_domain: patagonia.com - alexa_rank: 4728 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - patagonia.com - eu.patagonia.com @@ -13737,85 +8700,38 @@ - url_pattern: ^https?://[^/]+/search/\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/? - /de/de/search/? - /gb/en/search/? - name: verycd - public_suffix: com - alexa_domain: verycd.com - alexa_rank: 4749 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - verycd.com query_parsers: - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: kw - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: steamdb - public_suffix: info - alexa_domain: steamdb.info - alexa_rank: 4760 - category: gaming - notes: null - input_field: false - search_form: false - search_div: true domains: - steamdb.info query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: animixplay - public_suffix: to - alexa_domain: animixplay.to - alexa_rank: 4768 - category: manga-anime - notes: null - input_field: false - search_form: false - search_div: true domains: - animixplay.to query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: bonanza - public_suffix: com - alexa_domain: bonanza.com - alexa_rank: 4778 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - bonanza.com query_parsers: @@ -13826,20 +8742,9 @@ - url_pattern: ^https?://[^/]+/items/search\? type: query_parameter parameter: q[page] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /items/search? 
- name: mql5 - public_suffix: com - alexa_domain: mql5.com - alexa_rank: 4791 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: - mql5.com query_parsers: @@ -13850,41 +8755,18 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /it/search - name: keezmovies - public_suffix: com - alexa_domain: keezmovies.com - alexa_rank: 4795 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - keezmovies.com query_parsers: - url_pattern: ^https?://[^/]+/video\? type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /video? - name: 9to5mac - public_suffix: com - alexa_domain: 9to5mac.com - alexa_rank: 4809 - category: blog - notes: null - input_field: true - search_form: true - search_div: false domains: - 9to5mac.com query_parsers: @@ -13898,21 +8780,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: usa - public_suffix: gov - alexa_domain: usa.gov - alexa_rank: 4820 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - usa.gov query_parsers: @@ -13923,41 +8794,18 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: etudiant - public_suffix: gouv.fr - alexa_domain: etudiant.gouv.fr - alexa_rank: 4847 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - etudiant.gouv.fr query_parsers: - url_pattern: ^https?://[a-z]+/recherche\? 
type: query_parameter parameter: keywords - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /fr/recherche? - name: al-maraabimedias - public_suffix: net - alexa_domain: al-maraabimedias.net - alexa_rank: 4848 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - al-maraabimedias.net query_parsers: @@ -13970,20 +8818,9 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: paged - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: justice - public_suffix: gov - alexa_domain: justice.gov - alexa_rank: 4894 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - justice.gov - search.justice.gov @@ -13995,20 +8832,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: alison - public_suffix: com - alexa_domain: alison.com - alexa_rank: 4904 - category: education - notes: null - input_field: true - search_form: true - search_div: false domains: - alison.com query_parsers: @@ -14019,21 +8845,10 @@ - url_pattern: ^https?://[^/]+/(courses|careers-search)\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /courses? - /careers-search? - name: darty - public_suffix: com - alexa_domain: darty.com - alexa_rank: 4906 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - darty.com query_parsers: @@ -14045,24 +8860,13 @@ - url_pattern: ^https?://[^/]+/nav/recherche\? type: query_parameter parameter: text - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/nav/recherche\? 
type: query_parameter parameter: o - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /nav/recherche - name: theringer - public_suffix: com - alexa_domain: theringer.com - alexa_rank: 4907 - category: blog - notes: null - input_field: true - search_form: true - search_div: false domains: - theringer.com query_parsers: @@ -14073,20 +8877,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: tueren-fachhandel - public_suffix: de - alexa_domain: tueren-fachhandel.de - alexa_rank: 4919 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - tueren-fachhandel.de query_parsers: @@ -14097,41 +8890,18 @@ - url_pattern: ^https?://[^/]+/catalogsearch/result\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /catalogsearch/result\? - name: nesn - public_suffix: com - alexa_domain: nesn.com - alexa_rank: 4945 - category: sports - notes: null - input_field: true - search_form: true - search_div: false domains: - nesn.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: zimbio - public_suffix: com - alexa_domain: zimbio.com - alexa_rank: 4973 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - zimbio.com query_parsers: @@ -14142,41 +8912,18 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: niksalehi - public_suffix: com - alexa_domain: niksalehi.com - alexa_rank: 4975 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - niksalehi.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: ehow - public_suffix: com - alexa_domain: ehow.com - alexa_rank: 4977 - category: question-and-answer - notes: null - input_field: null - search_form: null - search_div: null domains: - ehow.com query_parsers: @@ -14187,20 +8934,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: pornmd - public_suffix: com - alexa_domain: pornmd.com - alexa_rank: 4985 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - pornmd.com query_parsers: @@ -14209,23 +8945,11 @@ segment: 2 space_patterns: - \+ - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /straight - /gay - /tranny - name: 2ch-c - public_suffix: net - alexa_domain: 2ch-c.net - alexa_rank: 4998 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - 2ch-c.net query_parsers: @@ -14236,100 +8960,40 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: pn - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: mtv - public_suffix: com - alexa_domain: mtv.com - alexa_rank: 5015 - category: corporate - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - mtv.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: setareparsi - public_suffix: com - alexa_domain: setareparsi.com - alexa_rank: 5030 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - setareparsi.com query_parsers: - url_pattern: ^https?://[^/]+/newsstudios/archive/\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /newsstudios/archive - name: tweaktown - public_suffix: com - alexa_domain: tweaktown.com - alexa_rank: 5052 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - tweaktown.com query_parsers: - url_pattern: ^https?://[^/]+/cse/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /cse/? - name: softfamous - public_suffix: com - alexa_domain: softfamous.com - alexa_rank: 5084 - category: gaming - notes: null - input_field: true - search_form: true - search_div: false domains: - softfamous.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /?s - name: hsbianma - public_suffix: com - alexa_domain: hsbianma.com - alexa_rank: 5120 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - hsbianma.com query_parsers: @@ -14340,21 +9004,10 @@ - url_pattern: ^https?://[^/]+/Search/[0-9]+\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - /Search/ - name: zaobao - public_suffix: com - alexa_domain: zaobao.com - alexa_rank: 5156 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - zaobao.com - zaobao.com.sg @@ -14366,20 +9019,10 @@ - url_pattern: ^https?://[^/]+/search/site/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: givemesport - public_suffix: com - alexa_domain: givemesport.com - alexa_rank: 5201 - category: sports - notes: excluded; No search - input_field: true - search_form: true - search_div: false + excluded: No search domains: - givemesport.com query_parsers: @@ -14393,21 +9036,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+ type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: internshala - public_suffix: com - alexa_domain: internshala.com - alexa_rank: 5309 - category: career-jobs - notes: null - input_field: true - search_form: true - search_div: true domains: - internshala.com query_parsers: @@ -14422,21 +9054,10 @@ segment: 3 remove_patterns: - ^page- - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /internships - /jobs - name: jogos360 - public_suffix: com.br - alexa_domain: jogos360.com.br - alexa_rank: 5314 - category: gaming - notes: null - input_field: true - search_form: true - search_div: false domains: - jogos360.com.br query_parsers: @@ -14447,22 +9068,12 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? 
- name: belgium - public_suffix: be - alexa_domain: belgium.be - alexa_rank: 5367 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - belgium.be + - fgov.be query_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/search\? type: query_parameter @@ -14471,23 +9082,12 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search? - /nl/search? - /fr/search? - /de/search? - name: todayhumor - public_suffix: co.kr - alexa_domain: todayhumor.co.kr - alexa_rank: 5372 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - todayhumor.co.kr query_parsers: @@ -14498,20 +9098,9 @@ - url_pattern: ^https?://[^/]+/board/list\.php\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /board/list.php? - name: dice - public_suffix: com - alexa_domain: dice.com - alexa_rank: 5424 - category: career-jobs - notes: null - input_field: null - search_form: null - search_div: null domains: - dice.com query_parsers: @@ -14522,121 +9111,50 @@ - url_pattern: ^https?://[^/]+/jobs\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /jobs? 
- name: tubegalore - public_suffix: com - alexa_domain: tubegalore.com - alexa_rank: 5435 - category: pornography - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - tubegalore.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: nationwide - public_suffix: co.uk - alexa_domain: nationwide.co.uk - alexa_rank: 5440 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - nationwide.co.uk query_parsers: - url_pattern: ^https?://[^/]+/search/\? type: query_parameter parameter: term - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: sunporno - public_suffix: com - alexa_domain: sunporno.com - alexa_rank: 5514 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - sunporno.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: serebii - public_suffix: net - alexa_domain: serebii.net - alexa_rank: 5540 - category: wiki - notes: null - input_field: true - search_form: true - search_div: false domains: - serebii.net query_parsers: - url_pattern: ^https?://[^/]+/search\.shtml\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - search.shtml? - name: irrawaddy - public_suffix: com - alexa_domain: irrawaddy.com - alexa_rank: 5542 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - irrawaddy.com query_parsers: - url_pattern: ^https?://[^/]+/\? 
type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: khabarvarzeshi - public_suffix: com - alexa_domain: khabarvarzeshi.com - alexa_rank: 5592 - category: sports - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - khabarvarzeshi.com query_parsers: @@ -14647,20 +9165,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: pi - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: arca - public_suffix: live - alexa_domain: arca.live - alexa_rank: 5597 - category: forum - notes: null - input_field: true - search_form: true - search_div: false domains: - arca.live query_parsers: @@ -14671,37 +9178,13 @@ - url_pattern: ^https?://[^/]+/b/breaking\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /b/breaking? 
- name: soccer24 - public_suffix: com - alexa_domain: soccer24.com - alexa_rank: 5616 - category: sports - notes: exclude; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - soccer24.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: javhdporn - public_suffix: net - alexa_domain: javhdporn.net - alexa_rank: 5683 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - javhdporn.net query_parsers: @@ -14712,20 +9195,9 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/page/[0-9]+ type: path_segment segment: 4 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: letpub - public_suffix: com.cn - alexa_domain: letpub.com.cn - alexa_rank: 5699 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - letpub.com.cn query_parsers: @@ -14736,20 +9208,9 @@ - url_pattern: ^https?://[^/]+/index\.php\? type: query_parameter parameter: currentsearchpage - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /index.php? - name: areavip - public_suffix: com.br - alexa_domain: areavip.com.br - alexa_rank: 5740 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - areavip.com.br query_parsers: @@ -14760,41 +9221,18 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: eztv - public_suffix: re - alexa_domain: eztv.re - alexa_rank: 5785 - category: torrent - notes: null - input_field: true - search_form: true - search_div: false domains: - eztv.re query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q1 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: opendns - public_suffix: com - alexa_domain: opendns.com - alexa_rank: 5807 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: - opendns.com query_parsers: @@ -14805,20 +9243,9 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: cludopage - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: brave - public_suffix: com - alexa_domain: brave.com - alexa_rank: 5823 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - brave.com - search.brave.com @@ -14830,8 +9257,7 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: offset - offset_parsers: [] - interpreted_query_parsers: + interpreted_query_parsers: - url_pattern: ^https?://[^/]+/search\? type: html_selector query_selector: form#searchform input#searchbox @@ -14845,52 +9271,19 @@ focused_url_prefixes: - /search? 
- name: thenextweb - public_suffix: com - alexa_domain: thenextweb.com - alexa_rank: 5833 - category: blog - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - thenextweb.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: dreamhost - public_suffix: com - alexa_domain: dreamhost.com - alexa_rank: 5901 - category: service - notes: null - input_field: true - search_form: true - search_div: false domains: - dreamhost.com query_parsers: - url_pattern: ^https?://[^/]+/domains/\? type: query_parameter parameter: domain - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /domains/? - name: ck365 - public_suffix: cn - alexa_domain: ck365.cn - alexa_rank: 5919 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - ck365.cn query_parsers: @@ -14907,62 +9300,27 @@ remove_patterns: - ^search(-xzg-)?kw-[^-]-(-fields-[0-9])? - (-page-[0-9])?-?\.html$ - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /news/search - name: thegospelcoalition - public_suffix: org - alexa_domain: thegospelcoalition.org - alexa_rank: 5926 - category: religious - notes: null - input_field: true - search_form: true - search_div: false domains: - thegospelcoalition.org query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: iq - public_suffix: com - alexa_domain: iq.com - alexa_rank: 5949 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - iq.com query_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: wgu - public_suffix: edu - alexa_domain: wgu.edu - alexa_rank: 5952 - category: education - notes: null - input_field: true - search_form: true - search_div: false domains: - wgu.edu query_parsers: @@ -14973,20 +9331,9 @@ - url_pattern: ^https?://[^/]+/search\.html type: query_parameter parameter: cludopage - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.html - name: levi - public_suffix: com - alexa_domain: levi.com - alexa_rank: 5960 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - levi.com - levi.com.cn @@ -15009,39 +9356,16 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /US/en_US/search - /DE/de_DE/search - /search/result? - /search? - name: hochi - public_suffix: news - alexa_domain: hochi.news - alexa_rank: 5979 - category: sports - notes: excluded; No search - input_field: true - search_form: true - search_div: true + excluded: No search domains: - hochi.news - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: yalla-shoot-new - public_suffix: com - alexa_domain: yalla-shoot-new.com - alexa_rank: 6008 - category: sports - notes: null - input_field: true - search_form: true - search_div: false domains: - yalla-shoot-new.com - cup.yalla-shoot-new.com @@ -15049,24 +9373,13 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: radiotimes - public_suffix: com - alexa_domain: radiotimes.com - alexa_rank: 6015 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - radiotimes.com query_parsers: @@ -15080,42 +9393,19 @@ - url_pattern: ^https?://[^/]+/search/news/page/[0-9]+/\? type: path_segment segment: 4 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/? - /search/news/page - name: dev - public_suffix: to - alexa_domain: dev.to - alexa_rank: 6026 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - dev.to query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: brstej - public_suffix: com - alexa_domain: brstej.com - alexa_rank: 6065 - category: streaming - notes: null - input_field: true - search_form: true - search_div: true domains: - brstej.com - s.brstej.net @@ -15127,37 +9417,13 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? 
- name: thisav - public_suffix: com - alexa_domain: thisav.com - alexa_rank: 6078 - category: pornography - notes: exclude; Page not loading - input_field: true - search_form: true - search_div: true + excluded: Page not loading domains: - thisav.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: olevod - public_suffix: com - alexa_domain: olevod.com - alexa_rank: 6105 - category: streaming - notes: null - input_field: true - search_form: true - search_div: false domains: - olevod.com query_parsers: @@ -15173,20 +9439,9 @@ - url_pattern: ^https?://[^/]+/index\.php/[^/]+/search/page/[0-9]+/wd/[^/]+\.html type: path_segment segment: 5 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /index.php/ - name: kinsta - public_suffix: com - alexa_domain: kinsta.com - alexa_rank: 6122 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - kinsta.com query_parsers: @@ -15197,20 +9452,9 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: aftodioikisi - public_suffix: gr - alexa_domain: aftodioikisi.gr - alexa_rank: 6147 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - aftodioikisi.gr query_parsers: @@ -15224,42 +9468,19 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: stardewvalleywiki - public_suffix: com - alexa_domain: stardewvalleywiki.com - alexa_rank: 6153 - category: wiki - notes: null - input_field: true - search_form: true - search_div: true domains: - stardewvalleywiki.com query_parsers: - url_pattern: ^https?://[^/]+/mediawiki/index\.php\? type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /mediawiki/index.php? - name: thomasnet - public_suffix: com - alexa_domain: thomasnet.com - alexa_rank: 6172 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - thomasnet.com - news.thomasnet.com @@ -15274,20 +9495,9 @@ - url_pattern: ^https?://[^/]+/search/(industry-insights|white-paper-guides|product-news|company-news)/[0-9]+ type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: dafiti - public_suffix: com.br - alexa_domain: dafiti.com.br - alexa_rank: 6224 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - dafiti.com.br query_parsers: @@ -15304,38 +9514,14 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /catalog/? - /? 
- name: letyshops - public_suffix: com - alexa_domain: letyshops.com - alexa_rank: 6285 - category: e-commerce - notes: excluded; Only autocomplete search - input_field: false - search_form: false - search_div: true + excluded: Only autocomplete search domains: - letyshops.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: linkvertise - public_suffix: com - alexa_domain: linkvertise.com - alexa_rank: 6310 - category: gaming - notes: null - input_field: false - search_form: false - search_div: true domains: - linkvertise.com query_parsers: @@ -15346,20 +9532,9 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/[0-9]+/\? type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: '999' - public_suffix: md - alexa_domain: 999.md - alexa_rank: 6314 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - 999.md query_parsers: @@ -15370,20 +9545,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: ac-versailles - public_suffix: fr - alexa_domain: ac-versailles.fr - alexa_rank: 6357 - category: education - notes: null - input_field: true - search_form: true - search_div: false domains: - ac-versailles.fr query_parsers: @@ -15394,38 +9558,13 @@ - url_pattern: ^https?://[^/]+/recherche type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /recherche? - /recherche/type/actualites? 
- name: exey - public_suffix: io - alexa_domain: exey.io - alexa_rank: 6359 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true domains: - exey.io - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: manganato - public_suffix: com - alexa_domain: manganato.com - alexa_rank: 6395 - category: manga-anime - notes: null - input_field: false - search_form: false - search_div: true domains: - manganato.com query_parsers: @@ -15436,44 +9575,22 @@ - url_pattern: ^https?://[^/]+/search/story/[^/]+\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: hrblock - public_suffix: com - alexa_domain: hrblock.com - alexa_rank: 6415 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - hrblock.com query_parsers: - url_pattern: ^https?://[^/]+/search type: fragment_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: firstResult - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: kizi - public_suffix: com - alexa_domain: kizi.com - alexa_rank: 6418 - category: gaming - notes: null - input_field: false - search_form: false - search_div: true domains: - kizi.com query_parsers: @@ -15484,20 +9601,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: jobartis - public_suffix: com - alexa_domain: jobartis.com - alexa_rank: 6426 - category: career-jobs - notes: null - input_field: true - search_form: true - search_div: false domains: - jobartis.com query_parsers: @@ -15511,41 +9617,18 @@ - url_pattern: ^https?://[^/]+/vagas-emprego\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /vagas-emprego - name: ap - public_suffix: org - alexa_domain: ap.org - alexa_rank: 6460 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - ap.org query_parsers: - url_pattern: ^https?://[^/]+/search\.wz\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: photokade - public_suffix: com - alexa_domain: photokade.com - alexa_rank: 6536 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - photokade.com query_parsers: @@ -15559,21 +9642,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - /page - name: hdrezka - public_suffix: ag - alexa_domain: hdrezka.ag - alexa_rank: 6544 - category: streaming - notes: null - input_field: true - search_form: true - search_div: true domains: - hdrezka.ag - hdrezka.me @@ -15585,20 +9657,9 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: actblue - public_suffix: com - alexa_domain: actblue.com - alexa_rank: 6646 - category: political - notes: null - input_field: false - search_form: false - search_div: true domains: - actblue.com - secure.actblue.com @@ -15610,20 +9671,9 @@ - url_pattern: ^https?://[^/]+/directory\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /directory - name: letmejerk - public_suffix: com - alexa_domain: letmejerk.com - alexa_rank: 6683 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - letmejerk.com query_parsers: @@ -15634,37 +9684,13 @@ - url_pattern: ^https?://[^/]+/se/ type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /se - name: pardot - public_suffix: com - alexa_domain: pardot.com - alexa_rank: 6689 - category: '-' - notes: exclude; Page not loading - input_field: false - search_form: false - search_div: true + excluded: Page not loading domains: - pardot.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: cjdropshipping - public_suffix: com - alexa_domain: cjdropshipping.com - alexa_rank: 6709 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - cjdropshipping.com query_parsers: @@ -15677,118 +9703,45 @@ - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: pageNum - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: chittorgarh - public_suffix: com - alexa_domain: chittorgarh.com - alexa_rank: 6723 - category: web-portal - notes: null - input_field: false - search_form: false - search_div: true domains: - chittorgarh.com query_parsers: - url_pattern: ^https?://[^/]+/search\.asp\? type: query_parameter parameter: text - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.asp? 
- name: gidonline - public_suffix: io - alexa_domain: gidonline.io - alexa_rank: 6739 - category: streaming - notes: null - input_field: true - search_form: true - search_div: false domains: - gidonline.io query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /?s - name: xitongcheng - public_suffix: com - alexa_domain: xitongcheng.com - alexa_rank: 6773 - category: download - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - xitongcheng.com - s.xitongcheng.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: subf2m - public_suffix: co - alexa_domain: subf2m.co - alexa_rank: 6812 - category: streaming - notes: null - input_field: true - search_form: true - search_div: true domains: - subf2m.co query_parsers: - url_pattern: ^https?://[^/]+/subtitles/searchbytitle\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /subtitles/searchbytitle? 
- name: plesk - public_suffix: com - alexa_domain: plesk.com - alexa_rank: 6846 - category: corporate - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - plesk.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: tigerdirect - public_suffix: com - alexa_domain: tigerdirect.com - alexa_rank: 6848 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - tigerdirect.com query_parsers: @@ -15802,85 +9755,39 @@ - url_pattern: ^https?://[^/]+/applications/SearchTools type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /applications/category - /applications/Category - /applications/SearchTools - name: fatosdesconhecidos - public_suffix: com.br - alexa_domain: fatosdesconhecidos.com.br - alexa_rank: 6855 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - fatosdesconhecidos.com.br query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: dmhy - public_suffix: org - alexa_domain: dmhy.org - alexa_rank: 6876 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - dmhy.org query_parsers: - url_pattern: ^https?://[^/]+/topics/list\? type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /topics/list? 
- name: ants - public_suffix: gouv.fr - alexa_domain: ants.gouv.fr - alexa_rank: 6880 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - ants.gouv.fr query_parsers: - url_pattern: ^https?://[^/]+/rechercher\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /rechercher - name: moniban - public_suffix: news - alexa_domain: moniban.news - alexa_rank: 6885 - category: news-and-boulevard - notes: excluded; No search - input_field: true - search_form: true - search_div: false + excluded: No search domains: - moniban.news query_parsers: @@ -15891,20 +9798,9 @@ - url_pattern: ^https?://[^/]+/newsstudios/archive/ type: query_parameter parameter: curp - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /newsstudios/archive/ - name: letour - public_suffix: fr - alexa_domain: letour.fr - alexa_rank: 6893 - category: sports - notes: null - input_field: false - search_form: false - search_div: true domains: - letour.fr query_parsers: @@ -15920,24 +9816,12 @@ - url_pattern: ^https?://[^/]+/es/busqueda/[^/]+ type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search - /fr/rechercher - /de/suche - /es/busqueda - name: empflix - public_suffix: com - alexa_domain: empflix.com - alexa_rank: 6910 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - empflix.com query_parsers: @@ -15948,23 +9832,12 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: asurascans - public_suffix: com - alexa_domain: asurascans.com - alexa_rank: 6912 - category: manga-anime - notes: null - input_field: false - search_form: false - search_div: true domains: - - asura.gg - asurascans.com + - asura.gg query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter @@ -15973,37 +9846,13 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: luxuretv - public_suffix: com - alexa_domain: luxuretv.com - alexa_rank: 6925 - category: '-' - notes: excluded; Excluded from the Internet Archive - input_field: false - search_form: false - search_div: true + excluded: Excluded from the Internet Archive domains: - luxuretv.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: hi - public_suffix: gt - alexa_domain: hi.gt - alexa_rank: 6947 - category: web-portal - notes: null - input_field: true - search_form: true - search_div: false domains: - hi.gt query_parsers: @@ -16013,22 +9862,10 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /search - name: kuai8 - public_suffix: com - alexa_domain: kuai8.com - alexa_rank: 6994 - category: gaming - notes: null - input_field: true - search_form: true - search_div: false domains: - kuai8.com query_parsers: @@ -16039,41 +9876,19 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: nsfc - public_suffix: gov.cn - alexa_domain: nsfc.gov.cn - alexa_rank: 7053 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - nsfc.gov.cn query_parsers: - url_pattern: ^https?://[^/]+/search\.htm\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.htm? - name: ftc - public_suffix: gov - alexa_domain: ftc.gov - alexa_rank: 7054 - category: governmental - notes: excluded; No valid snapshot - input_field: true - search_form: true - search_div: true + excluded: No valid snapshot domains: - ftc.gov - search.ftc.gov @@ -16085,210 +9900,73 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: websiteseochecker - public_suffix: com - alexa_domain: websiteseochecker.com - alexa_rank: 7059 - category: service - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - websiteseochecker.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: shiprocket - public_suffix: in - alexa_domain: shiprocket.in - alexa_rank: 7073 - category: corporate - notes: excluded; No search - input_field: true - search_form: true - search_div: true + excluded: No search domains: - shiprocket.in - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: powerapp - public_suffix: download - alexa_domain: powerapp.download - alexa_rank: 7074 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: No search domains: - powerapp.download - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: al-marsd - public_suffix: com - alexa_domain: al-marsd.com - alexa_rank: 7101 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - al-marsd.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: peardeck - public_suffix: com - alexa_domain: peardeck.com - alexa_rank: 7104 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - peardeck.com query_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: wikimapia - public_suffix: org - alexa_domain: wikimapia.org - alexa_rank: 7109 - category: service - notes: null - input_field: true - search_form: true - search_div: false domains: - wikimapia.org query_parsers: - url_pattern: ^https?://[^/]+/ type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - name: discovercard - public_suffix: com - alexa_domain: discovercard.com - alexa_rank: 7178 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - discovercard.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: aldar - public_suffix: ma - alexa_domain: aldar.ma - alexa_rank: 7189 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - aldar.ma query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: 5i8xkqjmqubv - public_suffix: top - alexa_domain: 5i8xkqjmqubv.top - alexa_rank: 7220 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - 5i8xkqjmqubv.top - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: searchdimension - public_suffix: com - alexa_domain: searchdimension.com - alexa_rank: 7255 - category: spam-malware - notes: excluded - input_field: true - search_form: true - search_div: false domains: - searchdimension.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: gimy - public_suffix: tv - alexa_domain: gimy.tv - alexa_rank: 7264 - category: streaming - notes: null - input_field: true - search_form: true - search_div: false domains: - gimy.tv query_parsers: @@ -16306,54 +9984,17 @@ segment: 2 remove_patterns: - ([^0-9.]) - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: 0123movie - public_suffix: net - alexa_domain: 0123movie.net - alexa_rank: 7277 - category: streaming - notes: exclude; Page not loading - input_field: false - search_form: false - search_div: true + excluded: Page not loading domains: - 0123movie.net - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: influencermarketinghub - public_suffix: com - alexa_domain: influencermarketinghub.com - alexa_rank: 7287 - category: blog - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - influencermarketinghub.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: jra - 
public_suffix: go.jp - alexa_domain: jra.go.jp - alexa_rank: 7379 - category: sports - notes: null - input_field: true - search_form: true - search_div: false domains: - jra.go.jp - jra.jp @@ -16365,82 +10006,35 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: 2checkout - public_suffix: com - alexa_domain: 2checkout.com - alexa_rank: 7404 - category: corporate - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - 2checkout.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: imna - public_suffix: ir - alexa_domain: imna.ir - alexa_rank: 7420 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - imna.ir query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: bauhaus - public_suffix: info - alexa_domain: bauhaus.info - alexa_rank: 7504 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - bauhaus.info query_parsers: - url_pattern: ^https?://[^/]+/suche/produkte\? type: query_parameter parameter: text - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/suche/produkte\? type: query_parameter parameter: shownProducts - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /suche/produkte? 
- name: jamaran - public_suffix: news - alexa_domain: jamaran.news - alexa_rank: 7517 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - jamaran.news query_parsers: @@ -16449,66 +10043,32 @@ parameter: q remove_patterns: - \+ - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /newsstudios/search? - name: senate - public_suffix: gov - alexa_domain: senate.gov - alexa_rank: 7531 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - senate.gov query_parsers: - url_pattern: ^https?://[^/]+/[^/]+/search type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/[^/]+/search type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /general/search - name: hunter - public_suffix: io - alexa_domain: hunter.io - alexa_rank: 7538 - category: service - notes: null - input_field: true - search_form: true - search_div: true domains: - hunter.io query_parsers: - url_pattern: ^https?://[^/]+/try/search/[^/]+ type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /try/search - name: educacao - public_suffix: mg.gov.br - alexa_domain: educacao.mg.gov.br - alexa_rank: 7681 - category: governmental notes: exclude - input_field: false - search_form: false - search_div: true domains: - educacao.mg.gov.br - www2.educacao.mg.gov.br @@ -16516,21 +10076,9 @@ - url_pattern: ^https?://[^/]+/component/search/\? 
type: query_parameter parameter: all - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /component/search - name: webcrawler - public_suffix: com - alexa_domain: webcrawler.com - alexa_rank: 7706 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - webcrawler.com query_parsers: @@ -16541,58 +10089,22 @@ - url_pattern: ^https?://[^/]+/serp\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /serp? - name: kooora365 - public_suffix: com - alexa_domain: kooora365.com - alexa_rank: 7736 - category: sports - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - kooora365.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: moviesjoy - public_suffix: to - alexa_domain: moviesjoy.to - alexa_rank: 7749 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - moviesjoy.to query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: 17zwd - public_suffix: com - alexa_domain: 17zwd.com - alexa_rank: 7754 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - 17zwd.com - gz.17zwd.com @@ -16604,20 +10116,9 @@ - url_pattern: ^https?://[^/]+/sks\.htm\? type: query_parameter parameter: spage - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /sks.htm? 
- name: prweb - public_suffix: com - alexa_domain: prweb.com - alexa_rank: 7760 - category: blog - notes: null - input_field: true - search_form: true - search_div: false domains: - prweb.com query_parsers: @@ -16631,87 +10132,42 @@ - url_pattern: ^https?://[^/]+/Search\.aspx\? type: query_parameter parameter: start - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.aspx? - /Search.aspx? - name: almashhad-alyemeni - public_suffix: com - alexa_domain: almashhad-alyemeni.com - alexa_rank: 7765 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - almashhad-alyemeni.com query_parsers: - url_pattern: ^https?://[^/]+/section type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /section - name: fortinet - public_suffix: com - alexa_domain: fortinet.com - alexa_rank: 7807 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: - fortinet.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: dygang - public_suffix: com - alexa_domain: dygang.com - alexa_rank: 7817 - category: streaming notes: Search ID instead of Query - input_field: true - search_form: true - search_div: false domains: - - dygang.cc - dygang.com + - dygang.cc + - dygang.net - so.dygang.com - dygod.net query_parsers: - url_pattern: ^https?://[^/]+/e/search/result/\? type: query_parameter parameter: searchid - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /e/search/result? 
- name: yts-subs - public_suffix: com - alexa_domain: yts-subs.com - alexa_rank: 7825 - category: download - notes: null - input_field: true - search_form: true - search_div: false domains: - yts-subs.com query_parsers: @@ -16722,100 +10178,41 @@ - url_pattern: ^https?://[^/]+/search/[^/]+\?page type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: activecampaign - public_suffix: com - alexa_domain: activecampaign.com - alexa_rank: 7835 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - activecampaign.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: loveplanet - public_suffix: ru - alexa_domain: loveplanet.ru - alexa_rank: 7850 - category: dating - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - loveplanet.ru - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: upworthy - public_suffix: com - alexa_domain: upworthy.com - alexa_rank: 7875 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - upworthy.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: diagrams - public_suffix: net - alexa_domain: diagrams.net - alexa_rank: 7878 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - diagrams.net + - drawio.com query_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: povar - public_suffix: ru - alexa_domain: povar.ru - alexa_rank: 7899 - category: blog - notes: null - input_field: true - search_form: true - search_div: true domains: - povar.ru query_parsers: @@ -16826,37 +10223,13 @@ - url_pattern: ^https?://[^/]+/xmlsearch\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /xmlsearch? - name: shayanews - public_suffix: com - alexa_domain: shayanews.com - alexa_rank: 7973 - category: news-and-boulevard - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - shayanews.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: santander - public_suffix: co.uk - alexa_domain: santander.co.uk - alexa_rank: 8001 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - santander.co.uk - search.santander.co.uk @@ -16864,24 +10237,13 @@ - url_pattern: ^https?://[^/]+/s/search\.html\? type: query_parameter parameter: query - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/s/search\.html\? type: query_parameter parameter: start_rank - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s/search.html? 
- name: pingone - public_suffix: com - alexa_domain: pingone.com - alexa_rank: 8074 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - pingone.com - pingidentity.com @@ -16889,24 +10251,13 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search-results\.html type: fragment_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/search-results\.html type: query_parameter parameter: first - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search-results.html - name: ntv - public_suffix: ru - alexa_domain: ntv.ru - alexa_rank: 8127 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - ntv.ru query_parsers: @@ -16917,54 +10268,13 @@ - url_pattern: ^https?://[^/]+/finder/\? type: query_parameter parameter: pn - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /finder/ - name: ssl2anyone3 - public_suffix: com - alexa_domain: ssl2anyone3.com - alexa_rank: 8155 - category: spam-malware notes: exclude - input_field: false - search_form: false - search_div: true domains: - ssl2anyone3.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] -- name: chime - public_suffix: aws - alexa_domain: chime.aws - alexa_rank: 8163 - category: corporate - notes: excluded; Redirects to amazon - input_field: false - search_form: false - search_div: true - domains: - - chime.aws - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: investorplace - public_suffix: com - alexa_domain: investorplace.com - alexa_rank: 8174 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - investorplace.com query_parsers: @@ -16975,20 
+10285,9 @@ - url_pattern: ^https?://[^/]+/search&\? type: query_parameter parameter: pg - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: freeones - public_suffix: com - alexa_domain: freeones.com - alexa_rank: 8229 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - freeones.com query_parsers: @@ -17005,9 +10304,6 @@ - url_pattern: ^https?://[^/]+/(photos|babes|videos|cams)\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /de/suche? - /photos? @@ -17015,31 +10311,10 @@ - /videos? - /cams? - name: limestart - public_suffix: cn - alexa_domain: limestart.cn - alexa_rank: 8239 - category: search-engine - notes: 'excluded; Redirects to Baidu; Uses Baidu search.' - input_field: false - search_form: false - search_div: true + excluded: Redirects to Baidu search domains: - limestart.cn - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: agemys - public_suffix: com - alexa_domain: agemys.com - alexa_rank: 8264 - category: manga-anime - notes: null - input_field: true - search_form: true - search_div: false domains: - agemys.com query_parsers: @@ -17050,27 +10325,16 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: gogoanime - public_suffix: sk - alexa_domain: gogoanime.sk - alexa_rank: 8291 - category: manga-anime - notes: null - input_field: true - search_form: true - search_div: true domains: + - gogoanime.sk - gogoanime.ai - gogoanime.ar - gogoanime.bid - gogoanime.fi - gogoanime.pe - - gogoanime.sk query_parsers: - url_pattern: ^https?://[^/]+/search\.html\? 
type: query_parameter @@ -17079,20 +10343,9 @@ - url_pattern: ^https?://[^/]+/search\.html\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.html? - name: legifrance - public_suffix: gouv.fr - alexa_domain: legifrance.gouv.fr - alexa_rank: 8365 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - legifrance.gouv.fr query_parsers: @@ -17103,44 +10356,22 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/ - name: rozee - public_suffix: pk - alexa_domain: rozee.pk - alexa_rank: 8381 - category: career-jobs - notes: null - input_field: null - search_form: null - search_div: null domains: - rozee.pk query_parsers: - url_pattern: ^https?://[^/]+/job/jsearch/q/[^/]+ type: path_segment segment: 4 - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/job/jsearch/q/[^/]+/fpn/[0-9]+ type: path_segment segment: 6 - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /job/jsearch/q - name: gaymaletube - public_suffix: com - alexa_domain: gaymaletube.com - alexa_rank: 8422 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - gaymaletube.com query_parsers: @@ -17151,41 +10382,18 @@ - url_pattern: ^https?://[^/]+/search/a/[^/]+\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/a - name: sciencealert - public_suffix: com - alexa_domain: sciencealert.com - alexa_rank: 8478 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - sciencealert.com query_parsers: - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: vavel - public_suffix: com - alexa_domain: vavel.com - alexa_rank: 8491 - category: sports - notes: null - input_field: true - search_form: true - search_div: false domains: - vavel.com query_parsers: @@ -17196,20 +10404,9 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: bestjavporn - public_suffix: com - alexa_domain: bestjavporn.com - alexa_rank: 8499 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - bestjavporn.com query_parsers: @@ -17220,99 +10417,40 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/page/[0-9]+ type: path_segment segment: 4 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: vijesti - public_suffix: ba - alexa_domain: vijesti.ba - alexa_rank: 8539 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - vijesti.ba query_parsers: - url_pattern: ^https?://[^/]+/pretraga\? type: query_parameter parameter: keyword - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/pretraga\? type: query_parameter parameter: od_ - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /pretraga? 
- name: everydaykoala - public_suffix: com - alexa_domain: everydaykoala.com - alexa_rank: 8585 - category: news-and-boulevard - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - everydaykoala.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: codelist - public_suffix: cc - alexa_domain: codelist.cc - alexa_rank: 8591 - category: blog - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - codelist.cc - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: joq - public_suffix: al - alexa_domain: joq.al - alexa_rank: 8616 - category: streaming - notes: null - input_field: true - search_form: true - search_div: false domains: - joq.al + - joq-albania.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: wowprogress - public_suffix: com - alexa_domain: wowprogress.com - alexa_rank: 8644 - category: gaming - notes: null - input_field: false - search_form: false - search_div: true domains: - wowprogress.com query_parsers: @@ -17323,53 +10461,14 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: yoo7 - public_suffix: com - alexa_domain: yoo7.com - alexa_rank: 8685 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - yoo7.com - query_parsers: - - url_pattern: ^https?://[^/]+/tag/.+ - type: path_segment - segment: 0 - delimiter: ' ' - remove_patterns: - - ^/tag/ - - /p[0-9]+$ - space_patterns: - - / - page_parsers: - - url_pattern: ^https?://[^/]+/tag/.+/p[0-9]+ - type: path_segment - segment: 0 - delimiter: ' ' - remove_patterns: - - ^/tag/.+/p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /tag - name: elmwatin - public_suffix: com - alexa_domain: elmwatin.com - alexa_rank: 8706 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - elmwatin.com query_parsers: @@ -17380,79 +10479,32 @@ - url_pattern: ^https?://[^/]+/list\.aspx\? type: query_parameter parameter: Page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /list.aspx? - name: decolar - public_suffix: com - alexa_domain: decolar.com - alexa_rank: 8730 - category: e-commerce - notes: Search results are flight connections; Query is destination - input_field: false - search_form: false - search_div: true + notes: Search results are flight connections, query is the destination domains: - decolar.com query_parsers: - url_pattern: ^https?://[^/]+/shop/flights/results/roundtrip/[^/]+/[^/]+ type: path_segment segment: 6 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /shop/flights/results/roundtrip - name: yummly - public_suffix: com - alexa_domain: yummly.com - alexa_rank: 8750 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - yummly.com query_parsers: - url_pattern: ^https?://[^/]+/recipes\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /recipes? - name: yt5s - public_suffix: com - alexa_domain: yt5s.com - alexa_rank: 8781 - category: spam-malware notes: exclude - input_field: true - search_form: true - search_div: true domains: - yt5s.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: hongkiat - public_suffix: com - alexa_domain: hongkiat.com - alexa_rank: 8783 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - hongkiat.com query_parsers: @@ -17466,59 +10518,23 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: divxtotal3 - public_suffix: net - alexa_domain: divxtotal3.net - alexa_rank: 8853 - category: '-' - notes: exclude; Page not loading - input_field: false - search_form: false - search_div: true + excluded: Page not loading domains: - divxtotal3.net - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mybookie - public_suffix: ag - alexa_domain: mybookie.ag - alexa_rank: 8941 - category: gambling - notes: null - input_field: true - search_form: true - search_div: true domains: - mybookie.ag query_parsers: - url_pattern: ^https?://[^/]+/sportsbook/\? type: query_parameter parameter: sportsbook_search_term - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /sportsbook/? 
- name: mangaraw - public_suffix: co - alexa_domain: mangaraw.co - alexa_rank: 9020 - category: manga-anime - notes: null - input_field: true - search_form: true - search_div: false domains: - mangaraw.co query_parsers: @@ -17529,20 +10545,9 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: gazetadopovo - public_suffix: com.br - alexa_domain: gazetadopovo.com.br - alexa_rank: 9154 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - gazetadopovo.com.br query_parsers: @@ -17553,37 +10558,13 @@ - url_pattern: ^https?://[^/]+/busca type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /busca - name: tax - public_suffix: gov.uk - alexa_domain: tax.gov.uk - alexa_rank: 9179 - category: '-' - notes: excluded; Not archived - input_field: true - search_form: true - search_div: true + excluded: Not archived domains: - tax.gov.uk - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: 6park - public_suffix: com - alexa_domain: 6park.com - alexa_rank: 9180 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - 6park.com query_parsers: @@ -17594,20 +10575,9 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? 
- name: perezhilton - public_suffix: com - alexa_domain: perezhilton.com - alexa_rank: 9218 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - perezhilton.com query_parsers: @@ -17621,42 +10591,19 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: smashingmagazine - public_suffix: com - alexa_domain: smashingmagazine.com - alexa_rank: 9227 - category: blog - notes: null - input_field: true - search_form: true - search_div: false domains: - smashingmagazine.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: ero-video - public_suffix: net - alexa_domain: ero-video.net - alexa_rank: 9250 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - ero-video.net - en.ero-video.net @@ -17668,37 +10615,13 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: y2meta - public_suffix: com - alexa_domain: y2meta.com - alexa_rank: 9266 - category: spam-malware notes: exclude - input_field: true - search_form: true - search_div: false domains: - y2meta.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: iflscience - public_suffix: com - alexa_domain: iflscience.com - alexa_rank: 9321 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - iflscience.com query_parsers: @@ -17709,20 +10632,9 @@ - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: porntube - public_suffix: com - alexa_domain: porntube.com - alexa_rank: 9476 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - porntube.com query_parsers: @@ -17733,58 +10645,21 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: spyfu - public_suffix: com - alexa_domain: spyfu.com - alexa_rank: 9482 - category: service - notes: null - input_field: true - search_form: true - search_div: false domains: - spyfu.com query_parsers: - url_pattern: ^https?://[^/]+/overview/domain\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /overview/domain? - name: launchpage - public_suffix: org - alexa_domain: launchpage.org - alexa_rank: 9513 - category: spam-malware - notes: excluded - input_field: true - search_form: true - search_div: false domains: - launchpage.org - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: aps - public_suffix: org - alexa_domain: aps.org - alexa_rank: 9516 - category: education - notes: null - input_field: false - search_form: false - search_div: true domains: - aps.org - search.aps.org @@ -17792,21 +10667,9 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: americanmilitarynews - public_suffix: com - alexa_domain: americanmilitarynews.com - alexa_rank: 9625 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - americanmilitarynews.com query_parsers: @@ -17820,38 +10683,14 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: btctrademart - public_suffix: com - alexa_domain: btctrademart.com - alexa_rank: 9647 - category: corporate - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - btctrademart.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: alot - public_suffix: com - alexa_domain: alot.com - alexa_rank: 9678 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - alot.com query_parsers: @@ -17862,20 +10701,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: voachinese - public_suffix: com - alexa_domain: voachinese.com - alexa_rank: 9686 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - voachinese.com query_parsers: @@ -17886,41 +10714,18 @@ - url_pattern: ^https?://[^/]+/s\? type: query_parameter parameter: pp - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s? 
- name: eanswers - public_suffix: com - alexa_domain: eanswers.com - alexa_rank: 9747 - category: question-and-answer - notes: null - input_field: false - search_form: false - search_div: true domains: - eanswers.com query_parsers: - url_pattern: ^https?://[^/]+/results/v0/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /results/v0/search? - name: buedemusica - public_suffix: com - alexa_domain: buedemusica.com - alexa_rank: 9920 - category: download - notes: null - input_field: true - search_form: true - search_div: true domains: - buedemusica.com query_parsers: @@ -17934,21 +10739,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: archlinux - public_suffix: org - alexa_domain: archlinux.org - alexa_rank: 9935 - category: service - notes: null - input_field: true - search_form: true - search_div: true domains: - archlinux.org query_parsers: @@ -17959,79 +10753,31 @@ - url_pattern: ^https?://[^/]+/packages type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /packages - name: insideevs - public_suffix: com - alexa_domain: insideevs.com - alexa_rank: 9950 - category: review - notes: null - input_field: true - search_form: true - search_div: false domains: - insideevs.com query_parsers: - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: tvsubtitles - public_suffix: net - alexa_domain: tvsubtitles.net - alexa_rank: 10183 - category: download - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: 
- tvsubtitles.net - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: pornhdvideos - public_suffix: net - alexa_domain: pornhdvideos.net - alexa_rank: 10186 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - pornhdvideos.net query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: seoclerk - public_suffix: com - alexa_domain: seoclerk.com - alexa_rank: 10189 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - seoclerk.com query_parsers: @@ -18045,97 +10791,36 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/page/[0-9]+ type: path_segment segment: 4 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /marketplace? - /search - name: bkrs - public_suffix: info - alexa_domain: bkrs.info - alexa_rank: 10300 - category: forum - notes: null - input_field: false - search_form: false - search_div: true domains: - bkrs.info query_parsers: - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? 
- name: openai - public_suffix: com - alexa_domain: openai.com - alexa_rank: 10322 - category: corporate - notes: exclude; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - openai.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: natro - public_suffix: com - alexa_domain: natro.com - alexa_rank: 10330 - category: service - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - natro.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: rvcj - public_suffix: com - alexa_domain: rvcj.com - alexa_rank: 10387 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - rvcj.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: ccn - public_suffix: com - alexa_domain: ccn.com - alexa_rank: 10455 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - ccn.com query_parsers: @@ -18149,59 +10834,31 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: 18comic - public_suffix: vip - alexa_domain: 18comic.vip - alexa_rank: 10626 - category: pornography - notes: excluded; No search - input_field: true - search_form: true - search_div: true + excluded: No search domains: - 18comic.vip - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] + - jmcomic.asia + - jmcomic.mobi + query_parsers: + - url_pattern: ^https?://[^/]+/search + type: query_parameter + parameter: search_query + focused_url_prefixes: + - /search - name: moretify - public_suffix: com - alexa_domain: moretify.com - alexa_rank: 10718 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - moretify.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: pornez - public_suffix: net - alexa_domain: pornez.net - alexa_rank: 10772 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - pornez.net query_parsers: @@ -18212,99 +10869,26 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: yyets - public_suffix: com - alexa_domain: yyets.com - alexa_rank: 10869 - category: '-' - notes: excluded; No valid snapshot - input_field: true - search_form: true - search_div: false + excluded: No valid snapshot domains: - yyets.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: cashstar - public_suffix: com - alexa_domain: cashstar.com - alexa_rank: 11028 - category: '-' - notes: excluded; No search - input_field: true - search_form: true - search_div: true + excluded: No search domains: - cashstar.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: yupptv - public_suffix: com - alexa_domain: yupptv.com - alexa_rank: 11051 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - yupptv.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search -- name: fgov - public_suffix: be - alexa_domain: fgov.be - alexa_rank: 11153 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true - domains: - - fgov.be - query_parsers: - - url_pattern: ^https?://[^/]+/[a-z]+/search\? - type: query_parameter - parameter: keywords - page_parsers: - - url_pattern: ^https?://[^/]+/[a-z]+/search\? - type: query_parameter - parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: - - /en/search? 
- name: searchengineland - public_suffix: com - alexa_domain: searchengineland.com - alexa_rank: 11155 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - searchengineland.com query_parsers: @@ -18318,21 +10902,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: pronto - public_suffix: com - alexa_domain: pronto.com - alexa_rank: 11178 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - pronto.com query_parsers: @@ -18343,41 +10916,18 @@ - url_pattern: ^https?://[^/]+/shopping\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /shopping? - name: ucloud - public_suffix: cn - alexa_domain: ucloud.cn - alexa_rank: 11329 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - ucloud.cn query_parsers: - url_pattern: ^https?://[^/]+/site/search\.html\? type: query_parameter parameter: k - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /site/search.html? - name: rolloid - public_suffix: net - alexa_domain: rolloid.net - alexa_rank: 11364 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - rolloid.net query_parsers: @@ -18388,26 +10938,16 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: 4movierulz - public_suffix: pw - alexa_domain: 4movierulz.pw - alexa_rank: 11383 - category: streaming - notes: null - input_field: true - search_form: true - search_div: true domains: + - 4movierulz.pw - 1movierulz.com - 3movierulz.sx - 4movierulz.pl - - 4movierulz.pw - 4movierulz.tv + - 5movierulz.ac - 5movierulz.tc - 5movierulz.tv - 6movierulz.com @@ -18424,82 +10964,34 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: playjunkie - public_suffix: com - alexa_domain: playjunkie.com - alexa_rank: 11397 - category: news-and-boulevard - notes: exclude; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - playjunkie.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: goodporn - public_suffix: to - alexa_domain: goodporn.to - alexa_rank: 11405 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - goodporn.to query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: wkzf - public_suffix: com - alexa_domain: wkzf.com - alexa_rank: 11442 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - wkzf.com query_parsers: - url_pattern: ^https?://[^/]+/[^/]+/esf/[^/]+$ type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /shanghai/esf - /chengdu/esf - /dezhou/esf - /guangzhou/esf - name: movies2watch - public_suffix: tv - alexa_domain: movies2watch.tv - alexa_rank: 11455 - category: streaming - notes: null - input_field: 
false - search_form: false - search_div: true domains: - movies2watch.tv query_parsers: @@ -18510,20 +11002,9 @@ - url_pattern: ^https?://[^/]+/search/[^/]+ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: auchan - public_suffix: fr - alexa_domain: auchan.fr - alexa_rank: 11562 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - auchan.fr query_parsers: @@ -18533,79 +11014,29 @@ - url_pattern: ^https?://[^/]+/.*\?redirect_keywords type: query_parameter parameter: redirect_keywords - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /recherche? - /boissons-sans-alcool/ - /oeufs-produits-laitiers - /epicerie-sucree - name: pianshen - public_suffix: com - alexa_domain: pianshen.com - alexa_rank: 11652 - category: blog - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - pianshen.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: weakstreams - public_suffix: com - alexa_domain: weakstreams.com - alexa_rank: 11721 - category: streaming - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - weakstreams.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: futebolplayhd - public_suffix: com - alexa_domain: futebolplayhd.com - alexa_rank: 11754 - category: sports - notes: null - input_field: true - search_form: true - search_div: false domains: - futebolplayhd.com query_parsers: - url_pattern: ^https?://[^/]+/buscar/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - 
interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /buscar - name: yingjiesheng - public_suffix: com - alexa_domain: yingjiesheng.com - alexa_rank: 11835 - category: career-jobs - notes: null - input_field: true - search_form: true - search_div: false domains: - yingjiesheng.com - s.yingjiesheng.com @@ -18613,130 +11044,59 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: word - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? - name: kat - public_suffix: cr - alexa_domain: kat.cr - alexa_rank: 11838 - category: torrent - notes: null - input_field: false - search_form: false - search_div: true domains: - kat.cr query_parsers: - url_pattern: ^https?://[^/]+/usearch/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /usearch - name: bank-zone - public_suffix: com - alexa_domain: bank-zone.com - alexa_rank: 11846 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - bank-zone.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: ic - public_suffix: net.cn - alexa_domain: ic.net.cn - alexa_rank: 11916 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - ic.net.cn query_parsers: - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: key - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? 
- name: shahed4u - public_suffix: onl - alexa_domain: shahed4u.onl - alexa_rank: 11931 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - - shahed4u.land - shahed4u.onl + - shahed4u.land query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: rule34video - public_suffix: com - alexa_domain: rule34video.com - alexa_rank: 11964 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - rule34video.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: gaoding - public_suffix: com - alexa_domain: gaoding.com - alexa_rank: 11979 - category: media-sharing - notes: null - input_field: false - search_form: false - search_div: true domains: - gaoding.com query_parsers: @@ -18751,37 +11111,12 @@ segment: 2 remove_patterns: - ^.*_pn - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /contents - name: watchfreejavonline - public_suffix: co - alexa_domain: watchfreejavonline.co - alexa_rank: 12014 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - watchfreejavonline.co - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: thevintagenews - public_suffix: com - alexa_domain: thevintagenews.com - alexa_rank: 12202 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - thevintagenews.com query_parsers: @@ -18795,21 +11130,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: orient - public_suffix: tm - alexa_domain: orient.tm - alexa_rank: 12214 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - orient.tm query_parsers: @@ -18820,58 +11144,22 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: folkd - public_suffix: com - alexa_domain: folkd.com - alexa_rank: 12266 - category: search-engine - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: true + excluded: Query not in URL domains: - folkd.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: salesloft - public_suffix: com - alexa_domain: salesloft.com - alexa_rank: 12295 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - salesloft.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: kbhgames - public_suffix: com - alexa_domain: kbhgames.com - alexa_rank: 12405 - category: gaming - notes: null - input_field: true - search_form: true - search_div: false domains: - kbhgames.com query_parsers: @@ -18885,104 +11173,45 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: schema - public_suffix: org - alexa_domain: schema.org - alexa_rank: 12713 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - schema.org query_parsers: - url_pattern: ^https?://[^/]+/docs/search_results\.html\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /docs/search_results.html? - name: etheplatinum - public_suffix: com - alexa_domain: etheplatinum.com - alexa_rank: 12769 - category: gambling - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - etheplatinum.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: kmart - public_suffix: com - alexa_domain: kmart.com - alexa_rank: 12799 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - kmart.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: search - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: infocert - public_suffix: it - alexa_domain: infocert.it - alexa_rank: 12917 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - infocert.it query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: fanpop - public_suffix: com - alexa_domain: fanpop.com - alexa_rank: 12930 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - fanpop.com query_parsers: @@ -18993,41 +11222,18 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page_num - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /seach? - name: sexbjcam - public_suffix: com - alexa_domain: sexbjcam.com - alexa_rank: 12985 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - sexbjcam.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: hearthstonetopdecks - public_suffix: com - alexa_domain: hearthstonetopdecks.com - alexa_rank: 13091 - category: gaming - notes: null - input_field: false - search_form: false - search_div: true domains: - hearthstonetopdecks.com query_parsers: @@ -19044,43 +11250,20 @@ - url_pattern: ^https?://[^/]+/(cards|decks)/page/[0-9]+/\? type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /cards/? - /decks/? - name: nios - public_suffix: ac.in - alexa_domain: nios.ac.in - alexa_rank: 13102 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - nios.ac.in query_parsers: - url_pattern: ^https?://[^/]+/search\.aspx\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.aspx? 
- name: mostkbal - public_suffix: com - alexa_domain: mostkbal.com - alexa_rank: 13161 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - mostkbal.com query_parsers: @@ -19091,20 +11274,9 @@ - url_pattern: ^https?://[^/]+/Search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /Search? - name: searchlock - public_suffix: com - alexa_domain: searchlock.com - alexa_rank: 13296 - category: search-engine - notes: null - input_field: true - search_form: true - search_div: false domains: - searchlock.com - results.searchlock.com @@ -19116,37 +11288,13 @@ - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: apgn - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: news19 - public_suffix: biz - alexa_domain: news19.biz - alexa_rank: 13337 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - news19.biz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: tagdiv - public_suffix: com - alexa_domain: tagdiv.com - alexa_rank: 13373 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - tagdiv.com query_parsers: @@ -19160,21 +11308,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: livingsocial - public_suffix: com - alexa_domain: livingsocial.com - alexa_rank: 13388 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - livingsocial.com query_parsers: @@ -19185,37 +11322,13 @@ - url_pattern: ^https?://[^/]+/browse/[^/]+\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /browse - name: bestmining - public_suffix: top - alexa_domain: bestmining.top - alexa_rank: 13534 - category: '-' - notes: exclude; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - bestmining.top - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: znzmo - public_suffix: com - alexa_domain: znzmo.com - alexa_rank: 13653 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - znzmo.com - 3d.znzmo.com @@ -19232,10 +11345,6 @@ - url_pattern: ^https?://[^/]+/searchCase/[^/]+\? type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /general - /3dmoxing @@ -19246,14 +11355,6 @@ - /xgt - /searchCase - name: unza - public_suffix: zm - alexa_domain: unza.zm - alexa_rank: 13743 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - unza.zm query_parsers: @@ -19264,37 +11365,13 @@ - url_pattern: ^https?://[^/]+/search/node\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/node? 
- name: newsd - public_suffix: co - alexa_domain: newsd.co - alexa_rank: 13784 - category: news-and-boulevard - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - newsd.co - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: futemax - public_suffix: live - alexa_domain: futemax.live - alexa_rank: 13900 - category: sports - notes: null - input_field: true - search_form: true - search_div: false domains: - futemax.live - futemax.app @@ -19302,21 +11379,9 @@ - url_pattern: ^https?://[^/]+/buscar/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /buscar - name: sexkbj - public_suffix: com - alexa_domain: sexkbj.com - alexa_rank: 13927 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - sexkbj.com query_parsers: @@ -19330,45 +11395,23 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: angola24horas - public_suffix: com - alexa_domain: angola24horas.com - alexa_rank: 14017 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - angola24horas.com query_parsers: - url_pattern: ^https?://[^/]+/mais/[^/]+/pesquisar\? type: query_parameter parameter: searchword - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/mais/[^/]+/pesquisar\? 
type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /mais - name: 51pla - public_suffix: com - alexa_domain: 51pla.com - alexa_rank: 14080 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - 51pla.com query_parsers: @@ -19403,9 +11446,6 @@ - url_pattern: ^https?://[^/]+/spec/search\? type: query_parameter parameter: pageNo - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /product/search? - /company/search? @@ -19413,35 +11453,15 @@ - /price/search? - /spec/search? - name: mcls - public_suffix: xyz - alexa_domain: mcls.xyz - alexa_rank: 14208 - category: spam-malware - notes: null - input_field: false - search_form: false - search_div: true domains: - mcls.xyz query_parsers: - url_pattern: ^https?://[^/]+/results\.php\? type: query_parameter parameter: wd - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /results.php? - name: jacquielawson - public_suffix: com - alexa_domain: jacquielawson.com - alexa_rank: 14235 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - jacquielawson.com query_parsers: @@ -19452,214 +11472,80 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: higherperspectives - public_suffix: com - alexa_domain: higherperspectives.com - alexa_rank: 14261 - category: '-' - notes: exclude; Page not loading - input_field: true - search_form: true - search_div: false + excluded: Page not loading domains: - higherperspectives.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: news-sphere - public_suffix: com - alexa_domain: news-sphere.com - alexa_rank: 14271 - category: news-and-boulevard - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - news-sphere.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: beautifultrendstoday - public_suffix: com - alexa_domain: beautifultrendstoday.com - alexa_rank: 14322 - category: news-and-boulevard - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - beautifultrendstoday.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: asianembed - public_suffix: io - alexa_domain: asianembed.io - alexa_rank: 14338 - category: streaming - notes: null - input_field: true - search_form: true - search_div: true domains: - asianembed.io + - asianhdplay.pro query_parsers: - url_pattern: ^https?://[^/]+/search\.html\? type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.html\? - name: notjustok - public_suffix: com - alexa_domain: notjustok.com - alexa_rank: 14462 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - notjustok.com query_parsers: - url_pattern: ^https?://[^/]+/\? 
type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: pptvhd36 - public_suffix: com - alexa_domain: pptvhd36.com - alexa_rank: 14480 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - pptvhd36.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: bestdealfor28 - public_suffix: life - alexa_domain: bestdealfor28.life - alexa_rank: 14505 - category: '-' - notes: excluded; No valid snaphshot - input_field: false - search_form: false - search_div: true + excluded: No valid snaphshot domains: - bestdealfor28.life - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: sidereel - public_suffix: com - alexa_domain: sidereel.com - alexa_rank: 14508 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - sidereel.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+/[^/]+ type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: sopitas - public_suffix: com - alexa_domain: sopitas.com - alexa_rank: 14610 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - sopitas.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: searchalgo - public_suffix: com - alexa_domain: searchalgo.com - alexa_rank: 14750 - category: search-engine - notes: null - input_field: false - search_form: false - search_div: true domains: - searchalgo.com query_parsers: - url_pattern: ^https?://[^/]+/search\.html\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.html? - name: healthdata - public_suffix: org - alexa_domain: healthdata.org - alexa_rank: 14800 - category: education - notes: null - input_field: true - search_form: true - search_div: true domains: - healthdata.org query_parsers: @@ -19670,75 +11556,26 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: limetorrents - public_suffix: pro - alexa_domain: limetorrents.pro - alexa_rank: 14823 - category: torrent - notes: exlcluded; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - limetorrents.pro - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: imovie-hd - public_suffix: com - alexa_domain: imovie-hd.com - alexa_rank: 14849 - category: streaming - notes: null - input_field: true - search_form: true - search_div: true domains: - imovie-hd.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: nqrkzcd7ixwr - public_suffix: com - alexa_domain: nqrkzcd7ixwr.com - alexa_rank: 14898 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - nqrkzcd7ixwr.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: pishnahadevizheh - public_suffix: com - alexa_domain: pishnahadevizheh.com - alexa_rank: 14919 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - pishnahadevizheh.com query_parsers: @@ -19752,38 +11589,13 @@ - url_pattern: ^https?://[^/]+/newsstudios/archive/\? type: query_parameter parameter: curp - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /fa - /newsstudios/ - name: largeporntube - public_suffix: com - alexa_domain: largeporntube.com - alexa_rank: 14992 - category: pornography - notes: excluded; Covered by porntube.com - input_field: false - search_form: false - search_div: true domains: - largeporntube.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mojnews - public_suffix: com - alexa_domain: mojnews.com - alexa_rank: 15092 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - mojnews.com query_parsers: @@ -19794,41 +11606,18 @@ - url_pattern: ^https?://[^/]+/fa/newsstudios/search\? type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /fa/newsstudios/search? 
- name: workercn - public_suffix: cn - alexa_domain: workercn.cn - alexa_rank: 15109 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - workercn.cn query_parsers: - url_pattern: ^https?://[^/]+/search/result.shtml\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: weaplay - public_suffix: com - alexa_domain: weaplay.com - alexa_rank: 15215 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - weaplay.com - weadown.com @@ -19844,42 +11633,20 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: hentaihaven - public_suffix: xxx - alexa_domain: hentaihaven.xxx - alexa_rank: 15340 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - hentaihaven.xxx query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: webhostingtalk - public_suffix: com - alexa_domain: webhostingtalk.com - alexa_rank: 15433 - category: forum notes: No query put searchid saved - input_field: true - search_form: true - search_div: true domains: - webhostingtalk.com query_parsers: @@ -19890,46 +11657,24 @@ - url_pattern: ^https?://[^/]+/search\.php\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php - name: dontorrent - public_suffix: net - alexa_domain: dontorrent.net - alexa_rank: 15525 - category: torrent - notes: null - input_field: true - search_form: true - search_div: false domains: + - dontorrent.net - dontorrent.fun - dontorrent.lol - - dontorrent.net query_parsers: - url_pattern: ^https?://[^/]+/buscar/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /buscar - name: zk - public_suffix: fm - alexa_domain: zk.fm - alexa_rank: 15599 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: true domains: - zk.fm - z2.fm + - z3.fm query_parsers: - url_pattern: ^https?://[^/]+/mp3/search\? type: query_parameter @@ -19938,20 +11683,9 @@ - url_pattern: ^https?://[^/]+/mp3/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: her-news - public_suffix: com - alexa_domain: her-news.com - alexa_rank: 15893 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - her-news.com query_parsers: @@ -19962,37 +11696,13 @@ - url_pattern: ^https?://[^/]+/list\.aspx\? type: query_parameter parameter: Page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /list.aspx? 
- name: programmersought - public_suffix: com - alexa_domain: programmersought.com - alexa_rank: 15989 - category: blog - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - programmersought.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: khoondanionline - public_suffix: com - alexa_domain: khoondanionline.com - alexa_rank: 16058 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - khoondanionline.com query_parsers: @@ -20008,56 +11718,18 @@ - url_pattern: ^https?://[^/]+/newsstudios/archive/\? type: query_parameter parameter: curp - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /fa/newsstudios/archive - /newsstudios/archive - name: torrent9 - public_suffix: blue - alexa_domain: torrent9.blue - alexa_rank: 16097 - category: torrent - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - torrent9.blue - torrent9.nz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: diguver - public_suffix: com - alexa_domain: diguver.com - alexa_rank: 16204 - category: '-' - notes: null - input_field: false - search_form: false - search_div: true domains: - diguver.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: pirateproxy - public_suffix: lat - alexa_domain: pirateproxy.lat - alexa_rank: 16259 - category: torrent - notes: null - input_field: false - search_form: false - search_div: true domains: - pirateproxy.lat query_parsers: @@ -20068,20 +11740,9 @@ - url_pattern: ^https?://[^/]+/s type: query_parameter 
parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s - name: aufeminin - public_suffix: com - alexa_domain: aufeminin.com - alexa_rank: 16273 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - aufeminin.com query_parsers: @@ -20092,71 +11753,21 @@ - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: hotpoptoday - public_suffix: com - alexa_domain: hotpoptoday.com - alexa_rank: 16374 - category: blog - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - hotpoptoday.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: 64p3am9x95ct - public_suffix: com - alexa_domain: 64p3am9x95ct.com - alexa_rank: 16442 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - 64p3am9x95ct.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: quicklisti - public_suffix: com - alexa_domain: quicklisti.com - alexa_rank: 16516 - category: '-' - notes: excluded; No valid snaphshot - input_field: false - search_form: false - search_div: true + excluded: No valid snaphshot domains: - quicklisti.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: alriyadh - public_suffix: news - alexa_domain: alriyadh.news - alexa_rank: 16664 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - alriyadh.news - elriyadh.news @@ -20164,81 +11775,33 @@ - 
url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: resumersvo - public_suffix: fun - alexa_domain: resumersvo.fun - alexa_rank: 16745 - category: '-' - notes: excluded; No valid snaphshot - input_field: false - search_form: false - search_div: true + excluded: No valid snaphshot domains: - resumersvo.fun - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: lassuranceretraite - public_suffix: fr - alexa_domain: lassuranceretraite.fr - alexa_rank: 16746 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - lassuranceretraite.fr query_parsers: - url_pattern: ^https?://[^/]+/.*/resultat-de-recherche\.html\? type: query_parameter parameter: searchedText - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /portail-info/hors-menu/recherche/resultat-de-recherche.html? - /portail-info/sites/pub/hors-menu/recherche/resultat-de-recherche.html? - name: in - public_suffix: com - alexa_domain: in.com - alexa_rank: 16767 - category: news-and-boulevard - notes: excluded; No valid snaphshot - input_field: false - search_form: false - search_div: true + excluded: No valid snaphshot domains: - in.com query_parsers: - url_pattern: ^https?://[^/]+/search/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: jp - public_suffix: sharp - alexa_domain: jp.sharp - alexa_rank: 16926 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - jp.sharp query_parsers: @@ -20249,21 +11812,12 @@ - url_pattern: ^https?://[^/]+/search/index\.html type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/index.html - name: javdb39 - public_suffix: com - alexa_domain: javdb39.com - alexa_rank: 17035 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: + - javdb.com + - javdb36.com - javdb39.com query_parsers: - url_pattern: ^https?://[^/]+/search\? @@ -20273,20 +11827,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: pornerbros - public_suffix: com - alexa_domain: pornerbros.com - alexa_rank: 17183 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - pornerbros.com query_parsers: @@ -20297,62 +11840,27 @@ - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: 1plus1 - public_suffix: video - alexa_domain: 1plus1.video - alexa_rank: 17200 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - 1plus1.video query_parsers: - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: pornmedium - public_suffix: com - alexa_domain: pornmedium.com - alexa_rank: 17254 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - pornmedium.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: search-ch - public_suffix: ch - alexa_domain: search.ch - alexa_rank: 17257 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - search.ch - web.search.ch @@ -20364,20 +11872,9 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: pages - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: bjeea - public_suffix: cn - alexa_domain: bjeea.cn - alexa_rank: 17267 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - bjeea.cn - search.bjeea.cn @@ -20391,41 +11888,18 @@ - url_pattern: ^https?://[^/]+/plus/search\.php\? type: query_parameter parameter: PageNo - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /plus/search.php? 
- name: agadirinfo - public_suffix: ma - alexa_domain: agadirinfo.ma - alexa_rank: 17565 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - agadirinfo.ma query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /?s - name: hibasport - public_suffix: com - alexa_domain: hibasport.com - alexa_rank: 17588 - category: sports - notes: null - input_field: false - search_form: false - search_div: true domains: - hibasport.com query_parsers: @@ -20434,21 +11908,9 @@ parameter: s remove_patterns: - \+ - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: dogpile - public_suffix: com - alexa_domain: dogpile.com - alexa_rank: 17613 - category: child-safe-search - notes: null - input_field: null - search_form: null - search_div: null domains: - dogpile.com query_parsers: @@ -20459,41 +11921,18 @@ - url_pattern: ^https?://[^/]+/serp\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /serp? 
- name: amateur8 - public_suffix: com - alexa_domain: amateur8.com - alexa_rank: 17695 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - amateur8.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: pron - public_suffix: tv - alexa_domain: pron.tv - alexa_rank: 17805 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - pron.tv - mypron.tv @@ -20501,21 +11940,9 @@ - url_pattern: ^https?://[^/]+/videos/search/[^/]+ type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /videos/search - name: justporno - public_suffix: tv - alexa_domain: justporno.tv - alexa_rank: 17863 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - justporno.tv - cams.justporno.tv @@ -20523,138 +11950,51 @@ - url_pattern: ^https?://[^/]+/[a-z]+/(girls|boys|tranny)/[^/] type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/girls - /en/boys - /en/tranny - name: mybeautyland - public_suffix: shop - alexa_domain: mybeautyland.shop - alexa_rank: 17911 - category: e-commerce - notes: exclude; No search - input_field: true - search_form: true - search_div: true + excluded: No search domains: - mybeautyland.shop - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: d72gb9oukw3j - public_suffix: com - alexa_domain: d72gb9oukw3j.com - alexa_rank: 18048 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking 
domains: - d72gb9oukw3j.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: bmony - public_suffix: space - alexa_domain: bmony.space - alexa_rank: 18148 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: No search domains: - bmony.space - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ligueimovel - public_suffix: ao - alexa_domain: ligueimovel.ao - alexa_rank: 18213 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - ligueimovel.ao query_parsers: - url_pattern: ^https?://[^/]+/pesqusiar-imovel/\? type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /pesqusiar-imovel - name: 123movies - public_suffix: fun - alexa_domain: 123movies.fun - alexa_rank: 18227 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - - 123movies.co - 123movies.fun + - 123movies.co query_parsers: - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: maturetubehere - public_suffix: com - alexa_domain: maturetubehere.com - alexa_rank: 18246 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - maturetubehere.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+/\? 
type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: porntop - public_suffix: com - alexa_domain: porntop.com - alexa_rank: 18281 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - porntop.com query_parsers: @@ -20665,75 +12005,26 @@ - url_pattern: ^https?://[^/]+/search/[^/]+ type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: khabarnama - public_suffix: net - alexa_domain: khabarnama.net - alexa_rank: 18404 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - khabarnama.net query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s - name: mxss - public_suffix: xyz - alexa_domain: mxss.xyz - alexa_rank: 18420 - category: spam-malware - notes: exclude - input_field: false - search_form: false - search_div: true + notes: exclude # TODO domains: - mxss.xyz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: freesion - public_suffix: com - alexa_domain: freesion.com - alexa_rank: 18561 - category: '-' - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - freesion.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: naijanews - public_suffix: com - alexa_domain: naijanews.com - alexa_rank: 18650 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - naijanews.com 
query_parsers: @@ -20747,38 +12038,14 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: ws1xr1u2b4 - public_suffix: top - alexa_domain: ws1xr1u2b4.top - alexa_rank: 18883 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - ws1xr1u2b4.top - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: sexseq - public_suffix: com - alexa_domain: sexseq.com - alexa_rank: 19010 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - sexseq.com query_parsers: @@ -20789,41 +12056,18 @@ - url_pattern: ^https?://[^/]+/trends/[^/]+/[0-9]+/ type: path_segment segment: 3 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /trends - name: gamingwonderland - public_suffix: com - alexa_domain: gamingwonderland.com - alexa_rank: 19017 - category: gaming - notes: null - input_field: true - search_form: true - search_div: false domains: - gamingwonderland.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: jomys - public_suffix: xyz - alexa_domain: jomys.xyz - alexa_rank: 19130 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - jomys.xyz query_parsers: @@ -20837,38 +12081,14 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - /index.php? 
- name: maktoob - public_suffix: com - alexa_domain: maktoob.com - alexa_rank: 19139 - category: search-engine - notes: excluded; Redirects to yahoo; Uses yahoo - input_field: false - search_form: false - search_div: true + excluded: Redirects to Yahoo! search domains: - maktoob.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: chinaso - public_suffix: com - alexa_domain: chinaso.com - alexa_rank: 19396 - category: search-engine - notes: null - input_field: false - search_form: false - search_div: true domains: - chinaso.com query_parsers: @@ -20918,9 +12138,6 @@ - url_pattern: ^https?://[^/]+/newssearch/young\? type: query_parameter parameter: pn - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /newssearch/all - /newssearch/social @@ -20931,14 +12148,6 @@ - /newssearch/block - /newssearch/game - name: comicat - public_suffix: org - alexa_domain: comicat.org - alexa_rank: 19505 - category: manga-anime - notes: null - input_field: false - search_form: false - search_div: true domains: - comicat.org query_parsers: @@ -20949,71 +12158,21 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? 
- name: faithtap - public_suffix: com - alexa_domain: faithtap.com - alexa_rank: 19554 - category: news-and-boulevard - notes: excluded; Website offline - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - faithtap.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mmbang - public_suffix: com - alexa_domain: mmbang.com - alexa_rank: 19613 - category: corporate - notes: excluded; Redirects to baidu; Uses baidu - input_field: true - search_form: true - search_div: true + excluded: Redirects to Baidu search domains: - mmbang.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: vid - public_suffix: me - alexa_domain: vid.me - alexa_rank: 19623 - category: streaming - notes: excluded; No valid snaphshot - input_field: true - search_form: true - search_div: false + excluded: No valid snapshot domains: - vid.me - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mangafox - public_suffix: me - alexa_domain: mangafox.me - alexa_rank: 19642 - category: manga-anime - notes: null - input_field: true - search_form: true - search_div: false domains: - mangafox.me - fanfox.net @@ -21025,20 +12184,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: hellporno - public_suffix: com - alexa_domain: hellporno.com - alexa_rank: 19674 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - hellporno.com query_parsers: @@ -21052,37 +12200,13 @@ - url_pattern: ^https?://[^/]+/search/[0-9]+/\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: falafelandcaviar - public_suffix: com - alexa_domain: falafelandcaviar.com - alexa_rank: 19749 - category: blog - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - falafelandcaviar.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: sarcasm - public_suffix: co - alexa_domain: sarcasm.co - alexa_rank: 19789 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - sarcasm.co query_parsers: @@ -21093,20 +12217,9 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: oreno-erohon - public_suffix: com - alexa_domain: oreno-erohon.com - alexa_rank: 19854 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - oreno-erohon.com query_parsers: @@ -21120,21 +12233,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: putlockers - public_suffix: co - alexa_domain: putlockers.co - alexa_rank: 19922 - category: streaming - notes: null - input_field: true - search_form: true - search_div: true domains: - putlockers.co - w2.putlockers.co @@ -21142,55 +12244,17 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: freefilefillableforms - public_suffix: com - alexa_domain: freefilefillableforms.com - alexa_rank: 19949 - category: governmental - notes: excluded; No valid snapshot - input_field: true - search_form: true - search_div: false + excluded: No valid snapshot domains: - freefilefillableforms.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: metbugat - public_suffix: gov.tm - alexa_domain: metbugat.gov.tm - alexa_rank: 19977 - category: governmental - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - metbugat.gov.tm - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: moe - public_suffix: gov.cn - alexa_domain: moe.gov.cn - alexa_rank: 20419 - category: governmental - notes: null - input_field: true - search_form: true - search_div: true domains: - moe.gov.cn - so.moe.gov.cn @@ -21198,55 +12262,17 @@ - url_pattern: ^https?://[^/]+/s\? type: query_parameter parameter: qt - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s? 
- name: slashgear - public_suffix: com - alexa_domain: slashgear.com - alexa_rank: 20426 - category: blog - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - slashgear.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: dilidili - public_suffix: name - alexa_domain: dilidili.name - alexa_rank: 20461 - category: manga-anime - notes: excluded; Redirects to baidu; Uses baidu; - input_field: false - search_form: false - search_div: true + excluded: Redirects to Baidu search domains: - dilidili.name - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ibtimes - public_suffix: co.uk - alexa_domain: ibtimes.co.uk - alexa_rank: 20571 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - ibtimes.co.uk - ibtimes.com @@ -21261,42 +12287,9 @@ - url_pattern: ^https?://[^/]+/search/site/ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/site/ -- name: jmcomic - public_suffix: asia - alexa_domain: jmcomic.asia - alexa_rank: 20700 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true - domains: - - jmcomic.asia - - jmcomic.mobi - query_parsers: - - url_pattern: ^https?://[^/]+/search - type: query_parameter - parameter: search_query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: - - /search - name: bestblackfriday - public_suffix: com - alexa_domain: bestblackfriday.com - alexa_rank: 20827 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - bestblackfriday.com - blackfriday.com @@ -21308,78 
+12301,13 @@ - url_pattern: ^https?://[^/]+/search-results\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search-results? -- name: javdb36 - public_suffix: com - alexa_domain: javdb36.com - alexa_rank: 20847 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true - domains: - - javdb36.com - query_parsers: - - url_pattern: ^https?://[^/]+/search\? - type: query_parameter - parameter: q - page_parsers: - - url_pattern: ^https?://[^/]+/search\? - type: query_parameter - parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: - - /search? - name: download-adblock - public_suffix: com - alexa_domain: download-adblock.com - alexa_rank: 20879 - category: service - notes: exclude; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - download-adblock.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] -- name: windows - public_suffix: com - alexa_domain: windows.com - alexa_rank: 20927 - category: corporate - notes: excluded; Covered by microsoft - input_field: true - search_form: true - search_div: true - domains: - - windows.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: fanli - public_suffix: com - alexa_domain: fanli.com - alexa_rank: 21091 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - fanli.com - fun.fanli.com @@ -21387,80 +12315,31 @@ - url_pattern: ^https?://[^/]+/client/search type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /client/search - name: 
covid-19-testingkit - public_suffix: com - alexa_domain: covid-19-testingkit.com - alexa_rank: 21145 - category: '-' - notes: exclude; Page not loading - input_field: false - search_form: false - search_div: true + excluded: Page not loading domains: - covid-19-testingkit.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ces - public_suffix: tech - alexa_domain: ces.tech - alexa_rank: 21188 - category: '-' - notes: null - input_field: false - search_form: false - search_div: true domains: - ces.tech query_parsers: - url_pattern: ^https?://[^/]+/search-results\.apsx\? type: query_parameter parameter: searchtext - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search-results.apsx? - name: espnfc - public_suffix: com - alexa_domain: espnfc.com - alexa_rank: 21285 - category: sports - notes: null - input_field: false - search_form: false - search_div: true domains: - espnfc.com query_parsers: - url_pattern: ^https?://[^/]+/search/_/q/[^/]+ type: path_segment segment: 4 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: westernjournalism - public_suffix: com - alexa_domain: westernjournalism.com - alexa_rank: 21340 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - westernjournalism.com query_parsers: @@ -21471,42 +12350,19 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /?s - /page - name: aswaqinformation - public_suffix: com - alexa_domain: aswaqinformation.com - alexa_rank: 21411 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - aswaqinformation.com query_parsers: - url_pattern: ^https?://[^/]+/section type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /section - name: fanpage - public_suffix: gr - alexa_domain: fanpage.gr - alexa_rank: 21532 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - fanpage.gr query_parsers: @@ -21517,121 +12373,48 @@ - url_pattern: ^https?://[^/]+/search/[^/]+/page/[0-9]+ type: path_segment segment: 4 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: elitetorrent - public_suffix: io - alexa_domain: elitetorrent.io - alexa_rank: 21590 - category: torrent - notes: null - input_field: true - search_form: true - search_div: false domains: - elitetorrent.io query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: trovi - public_suffix: com - alexa_domain: trovi.com - alexa_rank: 21680 - category: spam-malware - notes: excluded - input_field: true - search_form: true - search_div: false domains: - trovi.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: hostgator - public_suffix: com.br - alexa_domain: hostgator.com.br - alexa_rank: 21743 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: - hostgator.com.br query_parsers: - url_pattern: ^https?://[^/]+/busca\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /busca - name: stagram - public_suffix: com - alexa_domain: stagram.com - alexa_rank: 21855 - category: corporate - notes: null - input_field: true - search_form: true - search_div: true domains: - stagram.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: sfwaa - public_suffix: com - alexa_domain: sfwaa.com - alexa_rank: 22060 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - sfwaa.com query_parsers: - url_pattern: ^https?://[^/]+/section type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /section - name: mmamania - public_suffix: com - alexa_domain: mmamania.com - alexa_rank: 22077 - category: sports - notes: null - input_field: true - search_form: true - search_div: false domains: - mmamania.com query_parsers: @@ -21642,37 +12425,13 @@ - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: q6idnawboy7g - public_suffix: com - alexa_domain: q6idnawboy7g.com - alexa_rank: 22145 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - q6idnawboy7g.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: asg - public_suffix: to - alexa_domain: asg.to - alexa_rank: 22212 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - asg.to query_parsers: @@ -21683,47 +12442,25 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: blackberry - public_suffix: com - alexa_domain: blackberry.com - alexa_rank: 22272 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - blackberry.com query_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/[a-z]+/search#q type: fragment_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/[a-z]+/search#q type: query_parameter parameter: first - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /us/en/search - /ja/jp/search - /la/es/search - /fr/fr/search - name: bloodyelbow - public_suffix: com - alexa_domain: bloodyelbow.com - alexa_rank: 22427 - category: sports - notes: null - input_field: true - search_form: true - search_div: false domains: - bloodyelbow.com query_parsers: @@ -21734,20 +12471,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: plugrush - public_suffix: com - alexa_domain: plugrush.com - alexa_rank: 22509 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - plugrush.com query_parsers: @@ -21761,42 +12487,19 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: seobook - public_suffix: com - alexa_domain: seobook.com - alexa_rank: 22533 - category: service - notes: null - input_field: true - search_form: true - search_div: false domains: - seobook.com query_parsers: - url_pattern: ^https?://[^/]+/sitesearch/ type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /sitesearch - name: byrutor - public_suffix: com - alexa_domain: byrutor.com - alexa_rank: 22621 - category: download - notes: null - input_field: true - search_form: true - search_div: false domains: - byrutor.com - s1.byrutor.com @@ -21804,21 +12507,9 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: story - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: justindianporn - public_suffix: me - alexa_domain: justindianporn.me - alexa_rank: 22704 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - justindianporn.me query_parsers: @@ -21829,62 +12520,27 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: musicasparamissa - public_suffix: com.br - alexa_domain: musicasparamissa.com.br - alexa_rank: 22718 - category: religious - notes: null - input_field: true - search_form: true - search_div: false domains: - musicasparamissa.com.br query_parsers: - url_pattern: ^https?://[^/]+/search/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/? - name: recommendationletters - public_suffix: pro - alexa_domain: recommendationletters.pro - alexa_rank: 22941 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - recommendationletters.pro query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: 123greetings - public_suffix: com - alexa_domain: 123greetings.com - alexa_rank: 22998 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: false domains: - 123greetings.com - search.123greetings.com @@ -21892,38 +12548,13 @@ - url_pattern: ^https?://[^/]+/[^/]+/search/search\.pl\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /cgi-bin/search/search.pl? 
- name: betterex - public_suffix: xyz - alexa_domain: betterex.xyz - alexa_rank: 23276 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Empty page domains: - betterex.xyz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: kwork - public_suffix: com - alexa_domain: kwork.com - alexa_rank: 23392 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - kwork.com query_parsers: @@ -21934,20 +12565,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: er - public_suffix: ru - alexa_domain: er.ru - alexa_rank: 23632 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - er.ru query_parsers: @@ -21961,56 +12581,19 @@ - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: unblocked - public_suffix: llc - alexa_domain: unblocked.llc - alexa_rank: 23705 - category: torrent - notes: exclude; Page not loading - input_field: false - search_form: false - search_div: true + excluded: Page not loading domains: - - unblocked.dk - unblocked.llc + - unblocked.dk - unblocked.nz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: nflstreams - public_suffix: to - alexa_domain: nflstreams.to - alexa_rank: 23824 - category: streaming - notes: excluded; No archived SERP - input_field: false - search_form: false - search_div: true + excluded: No archived SERP domains: - nflstreams.to - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: apherald - public_suffix: com - alexa_domain: apherald.com - alexa_rank: 23994 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - apherald.com - indiaherald.com @@ -22018,21 +12601,9 @@ - url_pattern: ^https?://[^/]+/search/[a-z]+/[^/]+ type: path_segment segment: 3 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: teamliquid - public_suffix: net - alexa_domain: teamliquid.net - alexa_rank: 24355 - category: gaming - notes: null - input_field: true - search_form: true - search_div: false domains: - teamliquid.net - tl.net @@ -22040,38 +12611,13 @@ - url_pattern: ^https?://[^/]+/[^/]+/search\.php\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /forum/search.php? 
- name: browserquote - public_suffix: com - alexa_domain: browserquote.com - alexa_rank: 24398 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: No search domains: - browserquote.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: jobstreet - public_suffix: com.sg - alexa_domain: jobstreet.com.sg - alexa_rank: 24637 - category: career-jobs - notes: null - input_field: null - search_form: null - search_div: null domains: - jobstreet.com.sg query_parsers: @@ -22091,22 +12637,11 @@ - url_pattern: ^https?://[^/]+/career-resources/search\? type: query_parameter parameter: pages - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/job-search - /en/companies/browse-reviews? - /career-resources/search - name: angovagas - public_suffix: net - alexa_domain: angovagas.net - alexa_rank: 24690 - category: career-jobs - notes: null - input_field: true - search_form: true - search_div: true domains: - angovagas.net query_parsers: @@ -22120,61 +12655,25 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: covidvisualizer - public_suffix: com - alexa_domain: covidvisualizer.com - alexa_rank: 24960 - category: service - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - covidvisualizer.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: searchengines - public_suffix: ru - alexa_domain: searchengines.ru - alexa_rank: 24989 - category: search-engine - notes: null - input_field: false - search_form: false - search_div: true domains: - - searchengines.guru - searchengines.ru + - searchengines.guru query_parsers: - url_pattern: ^https?://[^/]+/(en|ru)/search type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search - /ru/search - name: cooch - public_suffix: tv - alexa_domain: cooch.tv - alexa_rank: 25040 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - cooch.tv query_parsers: @@ -22187,20 +12686,9 @@ segment: 4 remove_patterns: - \.html$ - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search - name: canaltutorial - public_suffix: com - alexa_domain: canaltutorial.com - alexa_rank: 25100 - category: blog - notes: null - input_field: true - search_form: true - search_div: false domains: - canaltutorial.com query_parsers: @@ -22214,55 +12702,18 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: ilividnewtab - public_suffix: com - alexa_domain: ilividnewtab.com - alexa_rank: 25309 - category: spam-malware notes: exclude - input_field: false - search_form: false - search_div: true domains: - ilividnewtab.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: gostream - public_suffix: is - alexa_domain: gostream.is - alexa_rank: 25444 - category: streaming - notes: exclude; Page not loading - input_field: false - search_form: false - search_div: true + excluded: Page not loading domains: - gostream.is - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: hentai-time - public_suffix: com - alexa_domain: hentai-time.com - alexa_rank: 25512 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - hentai-time.com query_parsers: @@ -22276,55 +12727,18 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: pudding - public_suffix: cool - alexa_domain: pudding.cool - alexa_rank: 26270 - category: blog - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - pudding.cool - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: dyndns - public_suffix: org - alexa_domain: dyndns.org - alexa_rank: 26501 - category: '-' - notes: excluded; No valid snapshot - input_field: true - search_form: true - search_div: true + excluded: No valid snapshot domains: - dyndns.org - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: wn - public_suffix: com - alexa_domain: wn.com - alexa_rank: 26541 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - wn.com - search.wn.com @@ -22336,20 +12750,9 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: pagenum - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: shodan - public_suffix: io - alexa_domain: shodan.io - alexa_rank: 26550 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - shodan.io query_parsers: @@ -22360,22 +12763,12 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: woothemes - public_suffix: com - alexa_domain: woothemes.com - alexa_rank: 26562 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - woothemes.com + - woocommerce.com query_parsers: - url_pattern: ^https?://[^/]+/search/\? 
type: query_parameter @@ -22384,54 +12777,17 @@ - url_pattern: ^https?://[^/]+/search/\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: filesonic - public_suffix: com - alexa_domain: filesonic.com - alexa_rank: 26622 - category: torrent - notes: exclude; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - filesonic.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: tennessean - public_suffix: com - alexa_domain: tennessean.com - alexa_rank: 26933 - category: news-and-boulevard - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - tennessean.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: opensiteexplorer - public_suffix: org - alexa_domain: opensiteexplorer.org - alexa_rank: 27162 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - opensiteexplorer.org query_parsers: @@ -22442,41 +12798,18 @@ - url_pattern: ^https?://[^/]+/links type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /links - name: xxxhdvideo - public_suffix: mobi - alexa_domain: xxxhdvideo.mobi - alexa_rank: 27398 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - xxxhdvideo.mobi query_parsers: - url_pattern: ^https?://[^/]+/sex/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /sex - name: ctitv - public_suffix: com.tw - alexa_domain: ctitv.com.tw - alexa_rank: 
27415 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - ctitv.com.tw query_parsers: @@ -22490,42 +12823,19 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: turkmenistanhabar - public_suffix: com - alexa_domain: turkmenistanhabar.com - alexa_rank: 27437 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - turkmenistanhabar.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: liftable - public_suffix: com - alexa_domain: liftable.com - alexa_rank: 27531 - category: religious - notes: null - input_field: false - search_form: false - search_div: true domains: - liftable.com query_parsers: @@ -22539,21 +12849,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: imzog - public_suffix: com - alexa_domain: imzog.com - alexa_rank: 27653 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - imzog.com query_parsers: @@ -22564,153 +12863,54 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search/[^/]+/[0-9]+ type: path_segment segment: 4 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search - name: thoughclassifiedjeff - public_suffix: com - alexa_domain: thoughclassifiedjeff.com - alexa_rank: 27667 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - thoughclassifiedjeff.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: 4udear - public_suffix: com - alexa_domain: 4udear.com - alexa_rank: 27687 - category: '-' - notes: null - input_field: false - search_form: false - search_div: true domains: - 4udear.com query_parsers: - url_pattern: ^https?://[^/]+/se/search\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /se/search? - name: dunia21 - public_suffix: tv - alexa_domain: dunia21.tv - alexa_rank: 27771 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - - dunia21.com - dunia21.tv + - dunia21.com + - dunia21.net - lk21official.info query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: probux - public_suffix: com - alexa_domain: probux.com - alexa_rank: 27785 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - probux.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: inis - public_suffix: ao - alexa_domain: inis.ao - alexa_rank: 27898 - category: governmental - notes: excluded; Query not in URL - input_field: false - search_form: false - search_div: true + excluded: Query not in URL domains: - inis.ao - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: majesticseo - public_suffix: com - alexa_domain: majesticseo.com - alexa_rank: 27914 - category: service - notes: null - input_field: false - search_form: false - search_div: true domains: - majesticseo.com query_parsers: - url_pattern: ^https?://[^/]+/reports/site-explorer\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /reports/site-explorer? - name: guj5 - public_suffix: xyz - alexa_domain: guj5.xyz - alexa_rank: 28076 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - guj5.xyz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: viralnova - public_suffix: com - alexa_domain: viralnova.com - alexa_rank: 28281 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - viralnova.com query_parsers: @@ -22724,21 +12924,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - page - name: metacrawler - public_suffix: com - alexa_domain: metacrawler.com - alexa_rank: 28924 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - metacrawler.com query_parsers: @@ -22749,109 +12938,34 @@ - url_pattern: ^https?://[^/]+/serp\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /serp? - name: mycricketlive - public_suffix: live - alexa_domain: mycricketlive.live - alexa_rank: 29199 - category: sports - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - mycricketlive.live - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ukcontentdelivery - public_suffix: info - alexa_domain: ukcontentdelivery.info - alexa_rank: 29513 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Empty page domains: - ukcontentdelivery.info - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: tickld - public_suffix: com - alexa_domain: tickld.com - alexa_rank: 29536 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - tickld.com query_parsers: - url_pattern: ^https?://[^/]+/search/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: injuredcandy - public_suffix: com - alexa_domain: injuredcandy.com - alexa_rank: 29665 - category: '-' - notes: excluded; No valid snaphshot - input_field: false - 
search_form: false - search_div: true + excluded: No valid snaphshot domains: - injuredcandy.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: podregarddomicile - public_suffix: com - alexa_domain: podregarddomicile.com - alexa_rank: 29702 - category: '-' - notes: excluded; Not archived - input_field: true - search_form: true - search_div: false + excluded: Not archived domains: - podregarddomicile.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: websta - public_suffix: me - alexa_domain: websta.me - alexa_rank: 29774 - category: question-and-answer - notes: null - input_field: false - search_form: false - search_div: true domains: - websta.me query_parsers: @@ -22865,21 +12979,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: xgap - public_suffix: tv - alexa_domain: xgap.tv - alexa_rank: 30067 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - xgap.tv query_parsers: @@ -22892,58 +12995,22 @@ segment: 4 remove_patterns: - \.html$ - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search - name: anahwa - public_suffix: com - alexa_domain: anahwa.com - alexa_rank: 30112 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - anahwa.com query_parsers: - url_pattern: ^https?://[^/]+/section type: query_parameter parameter: keyword - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /section - name: pretagteam - public_suffix: com - alexa_domain: pretagteam.com - alexa_rank: 30116 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Redirects to Google search domains: - pretagteam.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mbank - public_suffix: com.pl - alexa_domain: mbank.com.pl - alexa_rank: 30298 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - mbank.com.pl query_parsers: @@ -22954,41 +13021,18 @@ - url_pattern: ^https?://[^/]+/szukaj type: query_parameter parameter: pag - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /szukaj - name: sergey-mavrodi - public_suffix: com - alexa_domain: sergey-mavrodi.com - alexa_rank: 30410 - category: '-' - notes: null - input_field: true - search_form: true - search_div: true domains: - sergey-mavrodi.com query_parsers: - url_pattern: ^https?://[^/]+/spage/\? 
type: query_parameter parameter: search_q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /spage/? - name: nhadatso - public_suffix: com - alexa_domain: nhadatso.com - alexa_rank: 30436 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - nhadatso.com query_parsers: @@ -22999,20 +13043,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: skat - public_suffix: dk - alexa_domain: skat.dk - alexa_rank: 30725 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - skat.dk query_parsers: @@ -23023,99 +13056,38 @@ - url_pattern: ^https?://[^/]+/data\.aspx\? type: query_parameter parameter: cludopage - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /data.aspx? 
- name: yeus - public_suffix: xyz - alexa_domain: yeus.xyz - alexa_rank: 30753 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true domains: - yeus.xyz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: cr447 - public_suffix: xyz - alexa_domain: cr447.xyz - alexa_rank: 30841 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - cr447.xyz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: forosdelweb - public_suffix: com - alexa_domain: forosdelweb.com - alexa_rank: 30949 - category: forum - notes: null - input_field: false - search_form: false - search_div: true domains: - forosdelweb.com query_parsers: - url_pattern: ^https?://[^/]+/misc\.php\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /misc.php? - name: seb - public_suffix: se - alexa_domain: seb.se - alexa_rank: 30963 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - seb.se query_parsers: - url_pattern: ^https?://[^/]+/systemsidor/sok\? type: query_parameter parameter: s - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/systemsidor/sok\? type: query_parameter parameter: o - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /systemsidor/sok? - name: kiddle - public_suffix: co - alexa_domain: kiddle.co - alexa_rank: 31029 - category: child-safe-search - notes: null - input_field: null - search_form: null - search_div: null domains: - kiddle.co query_parsers: @@ -23126,37 +13098,13 @@ - url_pattern: ^https?://[^/]+/s\.php\? 
type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s.php? - name: pussl18 - public_suffix: com - alexa_domain: pussl18.com - alexa_rank: 31162 - category: '-' - notes: excluded; No valid snaphshot - input_field: false - search_form: false - search_div: true + excluded: No valid snaphshot domains: - pussl18.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ecrater - public_suffix: com - alexa_domain: ecrater.com - alexa_rank: 31220 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - ecrater.com query_parsers: @@ -23167,37 +13115,13 @@ - url_pattern: ^https?://[^/]+/filter\.php\? type: query_parameter parameter: srn - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /filter.php? - name: localgirldating - public_suffix: com - alexa_domain: localgirldating.com - alexa_rank: 31303 - category: '-' - notes: excluded; No valid snapshot - input_field: true - search_form: true - search_div: false + excluded: No valid snapshot domains: - localgirldating.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: eurovision - public_suffix: tv - alexa_domain: eurovision.tv - alexa_rank: 31372 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - eurovision.tv query_parsers: @@ -23208,20 +13132,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: sportsala - public_suffix: com - alexa_domain: sportsala.com - alexa_rank: 31449 - category: sports - notes: null - input_field: false - search_form: false - search_div: true domains: - sportsala.com query_parsers: @@ -23235,42 +13148,19 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: egotastic - public_suffix: com - alexa_domain: egotastic.com - alexa_rank: 31585 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - egotastic.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: ah-me - public_suffix: com - alexa_domain: ah-me.com - alexa_rank: 31683 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - ah-me.com query_parsers: @@ -23290,93 +13180,31 @@ - url_pattern: ^https?://[^/]+/pics/search/[^/]+/[0-9]+ type: path_segment segment: 4 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - /pics/search - name: exceptingincludedelivering - public_suffix: com - alexa_domain: exceptingincludedelivering.com - alexa_rank: 31689 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - exceptingincludedelivering.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: onlyfreetoonporn - public_suffix: com - alexa_domain: onlyfreetoonporn.com - alexa_rank: 31712 - category: pornography - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot 
domains: - onlyfreetoonporn.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: fillsitsy - public_suffix: cam - alexa_domain: fillsitsy.cam - alexa_rank: 31786 - category: '-' - notes: excluded; Only one snapshot - input_field: false - search_form: false - search_div: true + excluded: Only one snapshot domains: - fillsitsy.cam - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: uploading - public_suffix: com - alexa_domain: uploading.com - alexa_rank: 31993 - category: '-' - notes: null - input_field: true - search_form: true - search_div: true domains: - uploading.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: sxc - public_suffix: hu - alexa_domain: sxc.hu - alexa_rank: 32242 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: true domains: - sxc.hu query_parsers: @@ -23387,75 +13215,26 @@ - url_pattern: ^https?://[^/]+/browse\.phtml\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /browse.phtml? - name: ellechina - public_suffix: com - alexa_domain: ellechina.com - alexa_rank: 32458 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - ellechina.com query_parsers: - url_pattern: ^https?://[^/]+/search/\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: masslistener - public_suffix: com - alexa_domain: masslistener.com - alexa_rank: 32643 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - masslistener.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: tamilrockers - public_suffix: tel - alexa_domain: tamilrockers.tel - alexa_rank: 33303 - category: streaming - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - tamilrockers.tel - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: lycos - public_suffix: com - alexa_domain: lycos.com - alexa_rank: 33395 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - lycos.com - search.lycos.com @@ -23466,60 +13245,22 @@ - url_pattern: ^https?://[^/]+/web/\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /web/? 
- name: anightsregalia - public_suffix: cam - alexa_domain: anightsregalia.cam - alexa_rank: 33791 - category: '-' - notes: null - input_field: false - search_form: false - search_div: true domains: - anightsregalia.cam - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ebi - public_suffix: ac.uk - alexa_domain: ebi.ac.uk - alexa_rank: 33964 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - ebi.ac.uk query_parsers: - url_pattern: ^https?://[^/]+/ebisearch/search type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /ebisearch/search - name: mayoclinic - public_suffix: com - alexa_domain: mayoclinic.com - alexa_rank: 34128 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - mayoclinic.com query_parsers: @@ -23530,71 +13271,20 @@ - url_pattern: ^https?://[^/]+/search/search-results\? 
type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: grella - public_suffix: click - alexa_domain: grella.click - alexa_rank: 34142 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - grella.click - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: technorati - public_suffix: com - alexa_domain: technorati.com - alexa_rank: 34151 - category: blog - notes: null - input_field: null - search_form: null - search_div: null domains: - technorati.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: understandingharmoniousillegal - public_suffix: com - alexa_domain: understandingharmoniousillegal.com - alexa_rank: 34313 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - understandingharmoniousillegal.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: shopping - public_suffix: com - alexa_domain: shopping.com - alexa_rank: 34467 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - shopping.com query_parsers: @@ -23605,79 +13295,31 @@ - url_pattern: ^https?://[^/]+/search\.html\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.html? 
- name: manga9 - public_suffix: co - alexa_domain: manga9.co - alexa_rank: 35305 - category: manga-anime - notes: null - input_field: true - search_form: true - search_div: false domains: - manga9.co query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /?s - name: lauriegrowingdrops - public_suffix: com - alexa_domain: lauriegrowingdrops.com - alexa_rank: 35358 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - lauriegrowingdrops.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: hyves - public_suffix: nl - alexa_domain: hyves.nl - alexa_rank: 35543 - category: gaming - notes: null - input_field: true - search_form: true - search_div: false domains: - hyves.nl query_parsers: - url_pattern: ^https?://[^/]+/zoeken/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /zoeken - name: citeab - public_suffix: com - alexa_domain: citeab.com - alexa_rank: 35582 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - citeab.com query_parsers: @@ -23696,10 +13338,6 @@ - url_pattern: ^https?://[^/]+/proteins/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /antibodies - /biochemicals @@ -23707,14 +13345,6 @@ - /kits/search? - /proteins/search? 
- name: porn24 - public_suffix: tv - alexa_domain: porn24.tv - alexa_rank: 35927 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - porn24.tv query_parsers: @@ -23727,71 +13357,20 @@ segment: 4 remove_patterns: - \.html$ - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: admtpmp127 - public_suffix: com - alexa_domain: admtpmp127.com - alexa_rank: 36636 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - admtpmp127.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: default-search - public_suffix: net - alexa_domain: default-search.net - alexa_rank: 36790 - category: spam-malware - notes: excluded - input_field: true - search_form: true - search_div: false domains: - default-search.net - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: reptilefightearn - public_suffix: com - alexa_domain: reptilefightearn.com - alexa_rank: 36971 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - reptilefightearn.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: pornhost - public_suffix: com - alexa_domain: pornhost.com - alexa_rank: 37241 - category: pornography - notes: null - input_field: true - search_form: true - search_div: true domains: - pornhost.com query_parsers: @@ -23802,20 +13381,9 @@ - url_pattern: ^https?://[^/]+/search\.html\? 
type: query_parameter parameter: start - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.html - name: tuberel - public_suffix: com - alexa_domain: tuberel.com - alexa_rank: 37358 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - tuberel.com query_parsers: @@ -23826,20 +13394,9 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search/[^/]+/[0-9]+ type: path_segment segment: 4 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/search - name: alsbbora - public_suffix: com - alexa_domain: alsbbora.com - alexa_rank: 37412 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: true domains: - alsbbora.com - alsbbora.info @@ -23851,159 +13408,62 @@ - url_pattern: ^https?://[^/]+/search/ type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: ryzex - public_suffix: net - alexa_domain: ryzex.net - alexa_rank: 37429 - category: '-' - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - ryzex.net - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: nyaa - public_suffix: eu - alexa_domain: nyaa.eu - alexa_rank: 37627 - category: torrent - notes: null - input_field: false - search_form: false - search_div: true domains: - nyaa.eu query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: term - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: govome - public_suffix: com - alexa_domain: govome.com - alexa_rank: 37692 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - govome.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: forexmetatradexdx - public_suffix: info - alexa_domain: forexmetatradexdx.info - alexa_rank: 37833 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - forexmetatradexdx.info query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: yhdmw - public_suffix: com - alexa_domain: yhdmw.com - alexa_rank: 38064 - category: manga-anime - notes: null - input_field: true - search_form: true - search_div: false domains: - yhdmw.com query_parsers: - url_pattern: ^https?://[^/]+/[^/]+search type: query_parameter parameter: wd - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /comicsearch - name: mobfactory - public_suffix: info - alexa_domain: mobfactory.info - alexa_rank: 38095 - category: '-' - notes: exluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - mobfactory.info - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: 100ppi - public_suffix: com - alexa_domain: 100ppi.com - alexa_rank: 39064 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - 100ppi.com query_parsers: - url_pattern: ^https?://[^/]+/mprice/\? 
type: query_parameter parameter: terms - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /mprice/? - name: filmstreaming2 - public_suffix: com - alexa_domain: filmstreaming2.com - alexa_rank: 39135 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - filmstreaming2.com query_parsers: @@ -24013,22 +13473,10 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.json? - /? - name: ct10000 - public_suffix: com - alexa_domain: ct10000.com - alexa_rank: 39294 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - ct10000.com query_parsers: @@ -24039,231 +13487,83 @@ - url_pattern: ^https?://[^/]+/s\.html\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s - name: pleasedbeginparish - public_suffix: com - alexa_domain: pleasedbeginparish.com - alexa_rank: 39557 - category: spam-malware - notes: null - input_field: false - search_form: false - search_div: true domains: - pleasedbeginparish.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: wapwon - public_suffix: com - alexa_domain: wapwon.com - alexa_rank: 40114 - category: streaming notes: "The query pattern seems to be too broad." 
- input_field: false - search_form: false - search_div: true domains: - wapwon.com query_parsers: - url_pattern: ^https?://[^/]+/[^/]+/ type: path_segment segment: 1 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - name: twindiversity - public_suffix: com - alexa_domain: twindiversity.com - alexa_rank: 40239 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - twindiversity.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: avaxhome - public_suffix: ws - alexa_domain: avaxhome.ws - alexa_rank: 40490 - category: torrent - notes: null - input_field: false - search_form: false - search_div: true domains: - avaxhome.ws query_parsers: - url_pattern: ^https?://[^/]+/avaxhome_search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /avaxhome_search? - name: royal - public_suffix: uk - alexa_domain: royal.uk - alexa_rank: 40621 - category: governmental - notes: null - input_field: true - search_form: true - search_div: false domains: - royal.uk query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: tags[] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: cryptoplace - public_suffix: pro - alexa_domain: cryptoplace.pro - alexa_rank: 40716 - category: '-' - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - cryptoplace.pro - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: snickgainfulmuch - public_suffix: com - alexa_domain: snickgainfulmuch.com - alexa_rank: 40840 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - snickgainfulmuch.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: how01 - public_suffix: com - alexa_domain: how01.com - alexa_rank: 40896 - category: question-and-answer - notes: null - input_field: true - search_form: true - search_div: false domains: - how01.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: basketball24 - public_suffix: com - alexa_domain: basketball24.com - alexa_rank: 40922 - category: gambling - notes: excluded; Only autocomplete search - input_field: false - search_form: false - search_div: true + excluded: Only autocomplete search domains: - basketball24.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: jsoftj - public_suffix: com - alexa_domain: jsoftj.com - alexa_rank: 40975 - category: download - notes: null - input_field: false - search_form: false - search_div: true domains: - jsoftj.com query_parsers: - url_pattern: ^https?://[^/]+/\? 
type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: tv-links - public_suffix: eu - alexa_domain: tv-links.eu - alexa_rank: 41084 - category: blog - notes: null - input_field: true - search_form: true - search_div: false domains: - tv-links.eu query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: slideplayer - public_suffix: com.br - alexa_domain: slideplayer.com.br - alexa_rank: 41165 - category: service - notes: null - input_field: true - search_form: true - search_div: true domains: - slideplayer.com.br query_parsers: @@ -24274,43 +13574,44 @@ - url_pattern: ^https?://[^/]+/search/\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: leninetudo - public_suffix: com - alexa_domain: leninetudo.com - alexa_rank: 41548 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - leninetudo.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: zooplus - public_suffix: it - alexa_domain: zooplus.it - alexa_rank: 41714 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: + - zooplus.com + - zoohit.cz + - zoohit.si + - zoohit.sk + - zooplus.at + - zooplus.be + - zooplus.bg + - zooplus.ch + - zooplus.co.uk + - zooplus.de + - zooplus.dk + - zooplus.es + - zooplus.fi + - zooplus.fr + - zooplus.gr + - zooplus.hr + - zooplus.hu + - zooplus.ie - zooplus.it + - zooplus.nl + - zooplus.no + - zooplus.pl + - zooplus.pt + - zooplus.ro + - zooplus.se query_parsers: - url_pattern: ^https?://[^/]+/search/results\? type: query_parameter @@ -24319,20 +13620,9 @@ - url_pattern: ^https?://[^/]+/search/results\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/results? - name: autoproyecto - public_suffix: com - alexa_domain: autoproyecto.com - alexa_rank: 42047 - category: review - notes: null - input_field: true - search_form: true - search_div: false domains: - autoproyecto.com query_parsers: @@ -24346,21 +13636,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: markosweb - public_suffix: com - alexa_domain: markosweb.com - alexa_rank: 42110 - category: corporate - notes: null - input_field: true - search_form: true - search_div: false domains: - markosweb.com query_parsers: @@ -24371,20 +13650,9 @@ - url_pattern: ^https?://[^/]+/s type: query_parameter parameter: qsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /s - name: parsijoo - public_suffix: ir - alexa_domain: parsijoo.ir - alexa_rank: 42267 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - parsijoo.ir query_parsers: @@ -24395,23 +13663,12 @@ - url_pattern: ^https?://[^/]+/*.\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /web? - /? - /bazaar? - /autocomplete/Search? - name: m5zn - public_suffix: com - alexa_domain: m5zn.com - alexa_rank: 42402 - category: wiki - notes: null - input_field: true - search_form: true - search_div: false domains: - m5zn.com query_parsers: @@ -24425,21 +13682,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: oomall - public_suffix: com - alexa_domain: oomall.com - alexa_rank: 42942 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - oomall.com - s.oomall.com @@ -24447,41 +13693,16 @@ - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? 
- name: bdpcc - public_suffix: com - alexa_domain: bdpcc.com - alexa_rank: 42978 - category: streaming - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - bdpcc.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: kelkoo - public_suffix: com - alexa_domain: kelkoo.com - alexa_rank: 43061 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - - kelkoo.co.uk - kelkoo.com + - kelkoo.co.uk - kelkoo.de query_parsers: - url_pattern: ^https?://[^/]+/search\? @@ -24494,55 +13715,18 @@ - url_pattern: ^https?://[^/]+/(search|suche)\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - /suche? - name: molodgytot - public_suffix: biz - alexa_domain: molodgytot.biz - alexa_rank: 43076 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - molodgytot.biz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: bhj1 - public_suffix: xyz - alexa_domain: bhj1.xyz - alexa_rank: 44199 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - bhj1.xyz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: shentai - public_suffix: org - alexa_domain: shentai.org - alexa_rank: 44242 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - shentai.org query_parsers: @@ -24556,21 +13740,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: hotsale - public_suffix: com.ar - alexa_domain: hotsale.com.ar - alexa_rank: 44964 - category: e-commerce - notes: null - input_field: false - search_form: false - search_div: true domains: - hotsale.com.ar query_parsers: @@ -24581,41 +13754,18 @@ - url_pattern: ^https?://[^/]+/ofertas/[0-9]+\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /ofertas - name: misrjournal - public_suffix: com - alexa_domain: misrjournal.com - alexa_rank: 44984 - category: blog - notes: null - input_field: true - search_form: true - search_div: true domains: - misrjournal.com query_parsers: - url_pattern: ^https?://[^/]+/search/node\? type: query_parameter parameter: keys - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/node? - name: shopzilla - public_suffix: com - alexa_domain: shopzilla.com - alexa_rank: 44988 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - shopzilla.com query_parsers: @@ -24629,24 +13779,13 @@ segment: 2 space_patterns: - '-' - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/.*/products/\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - name: pricegrabber - public_suffix: com - alexa_domain: pricegrabber.com - alexa_rank: 45560 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - pricegrabber.com query_parsers: @@ -24656,69 +13795,35 @@ - url_pattern: ^https?://[^/]+/classify\? 
type: query_parameter parameter: keyword - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/[^/]+/products/ type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - / - name: blinkx - public_suffix: com - alexa_domain: blinkx.com - alexa_rank: 45616 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - blinkx.com query_parsers: - url_pattern: ^https?://[^/]+/videos/[^/]+ type: path_segment segment: 2 - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/videos/[^/]+/[0-9]+ type: path_segment segment: 3 - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /videos - name: tu - public_suffix: tv - alexa_domain: tu.tv - alexa_rank: 46902 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - tu.tv query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: globalspec - public_suffix: com - alexa_domain: globalspec.com - alexa_rank: 47501 - category: news-and-boulevard - notes: null - input_field: null - search_form: null - search_div: null domains: - globalspec.com - insights.globalspec.com @@ -24743,22 +13848,11 @@ - url_pattern: ^https?://[^/]+/search/reference type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /article/search - /search/all - /search/reference - name: petalsearch - public_suffix: com - alexa_domain: petalsearch.com - alexa_rank: 47596 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - petalsearch.com query_parsers: @@ -24769,75 +13863,26 @@ - url_pattern: ^https?://[^/]+/search\? 
type: query_parameter parameter: pn - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: lollipop-network - public_suffix: com - alexa_domain: lollipop-network.com - alexa_rank: 47716 - category: corporate - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - lollipop-network.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: suite101 - public_suffix: com - alexa_domain: suite101.com - alexa_rank: 48262 - category: blog - notes: null - input_field: false - search_form: false - search_div: true domains: - suite101.com query_parsers: - url_pattern: ^https?://[^/]+/search\.cfm\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.cfm? - name: 3file - public_suffix: info - alexa_domain: 3file.info - alexa_rank: 48652 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - 3file.info - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: watchseries - public_suffix: cr - alexa_domain: watchseries.cr - alexa_rank: 48915 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - watchseries.cr - watchseries.sk @@ -24845,21 +13890,9 @@ - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: cp - public_suffix: pt - alexa_domain: cp.pt - alexa_rank: 48994 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - cp.pt query_parsers: @@ -24870,54 +13903,17 @@ - url_pattern: ^https?://[^/]+/passageiros/pt/resultados-pesquisa\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /passageiros/pt/resultados-pesquisa? - name: molebeagleheadless - public_suffix: com - alexa_domain: molebeagleheadless.com - alexa_rank: 49161 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - molebeagleheadless.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: surveyflsh - public_suffix: click - alexa_domain: surveyflsh.click - alexa_rank: 50187 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - surveyflsh.click - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: badjojo - public_suffix: com - alexa_domain: badjojo.com - alexa_rank: 50210 - category: pornography - notes: null - input_field: false - search_form: false - search_div: true domains: - badjojo.com query_parsers: @@ -24928,92 +13924,30 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: astercom - public_suffix: top - alexa_domain: astercom.top - alexa_rank: 50892 - category: gaming - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - astercom.top - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mocah - public_suffix: org - alexa_domain: mocah.org - alexa_rank: 51185 - category: media-sharing - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: false + excluded: Query not in URL domains: - mocah.org - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: statmyweb - public_suffix: com - alexa_domain: statmyweb.com - alexa_rank: 51366 - category: service - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - statmyweb.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: 1001jogos - public_suffix: pt - alexa_domain: 1001jogos.pt - alexa_rank: 51809 - category: gaming - notes: null - input_field: true - search_form: true - search_div: true domains: - 1001jogos.pt query_parsers: - url_pattern: ^https?://[^/]+/procurar type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /procurar - name: icij - public_suffix: org - alexa_domain: icij.org - alexa_rank: 52010 - category: news-and-boulevard - notes: null - input_field: true - search_form: true - search_div: false domains: - icij.org query_parsers: @@ -25027,63 +13961,28 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? 
type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: textile - public_suffix: gov.tm - alexa_domain: textile.gov.tm - alexa_rank: 52284 - category: governmental - notes: null - input_field: false - search_form: false - search_div: true domains: - textile.gov.tm query_parsers: - url_pattern: ^https?://[^/]+/site/search\? type: query_parameter parameter: SearchForm[title] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /site/search? - name: earthday - public_suffix: org - alexa_domain: earthday.org - alexa_rank: 52606 - category: ngo - notes: null - input_field: true - search_form: true - search_div: false domains: - earthday.org query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: adzuna - public_suffix: co.uk - alexa_domain: adzuna.co.uk - alexa_rank: 53845 - category: career-jobs - notes: null - input_field: null - search_form: null - search_div: null domains: - adzuna.co.uk query_parsers: @@ -25094,299 +13993,99 @@ - url_pattern: ^https?://[^/]+/jobs/search\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /jobs/search? 
- name: foxcdn - public_suffix: life - alexa_domain: foxcdn.life - alexa_rank: 54007 - category: '-' - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - foxcdn.life - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: deferscoop - public_suffix: com - alexa_domain: deferscoop.com - alexa_rank: 54409 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - deferscoop.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mediaoffers - public_suffix: click - alexa_domain: mediaoffers.click - alexa_rank: 55191 - category: '-' - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - mediaoffers.click - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: grumbleoh - public_suffix: com - alexa_domain: grumbleoh.com - alexa_rank: 57798 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - grumbleoh.com query_parsers: - url_pattern: ^https?://[^/]+/watch.[0-9]+\.js\? 
type: query_parameter parameter: kw - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /watch - name: 1xbet-164837 - public_suffix: top - alexa_domain: 1xbet-164837.top - alexa_rank: 58272 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - 1xbet-164837.top - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mybeautylands - public_suffix: com - alexa_domain: mybeautylands.com - alexa_rank: 60548 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: true domains: - mybeautylands.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: search_query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: pricerunner - public_suffix: com - alexa_domain: pricerunner.com - alexa_rank: 60743 - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - pricerunner.com query_parsers: - url_pattern: ^https?://[^/]+/results\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /results? 
- name: torrentwiz1 - public_suffix: com - alexa_domain: torrentwiz1.com - alexa_rank: 60759 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - torrentwiz1.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: you - public_suffix: com - alexa_domain: you.com - alexa_rank: 60812 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - you.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: trafficunit - public_suffix: in - alexa_domain: trafficunit.in - alexa_rank: 61242 - category: '-' - notes: excluded; No archived SERP - input_field: false - search_form: false - search_div: true + excluded: No archived SERP domains: - trafficunit.in - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: business - public_suffix: com - alexa_domain: business.com - alexa_rank: 61993 - category: question-and-answer - notes: null - input_field: null - search_form: null - search_div: null domains: - business.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: neosportek8 - public_suffix: com - alexa_domain: neosportek8.com - alexa_rank: 62014 - category: '-' - notes: excluded; Not archived - input_field: false - search_form: false - search_div: true + excluded: Not archived domains: - neosportek8.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - 
results_parsers: [] - focused_url_prefixes: [] - name: andcareyo - public_suffix: biz - alexa_domain: andcareyo.biz - alexa_rank: 62869 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - andcareyo.biz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: onemanga - public_suffix: com - alexa_domain: onemanga.com - alexa_rank: 63555 - category: manga-anime - notes: excluded; Query not in URL - input_field: true - search_form: true - search_div: true + excluded: Query not in URL domains: - onemanga.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: vietxx - public_suffix: org - alexa_domain: vietxx.org - alexa_rank: 64529 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - vietxx.org query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - name: thegudda - public_suffix: com - alexa_domain: thegudda.com - alexa_rank: 65436 - category: media-sharing - notes: null - input_field: true - search_form: true - search_div: false domains: - thegudda.com query_parsers: @@ -25400,338 +14099,111 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: coulderlyy - public_suffix: website - alexa_domain: coulderlyy.website - alexa_rank: 67531 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: No search domains: - coulderlyy.website - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: velsoftwa - public_suffix: club - alexa_domain: velsoftwa.club - alexa_rank: 67726 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - velsoftwa.club - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: letassigne - public_suffix: club - alexa_domain: letassigne.club - alexa_rank: 68420 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - letassigne.club - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: lo - public_suffix: st - alexa_domain: lo.st - alexa_rank: 68521 - category: download - notes: null - input_field: true - search_form: true - search_div: true domains: - lo.st query_parsers: - url_pattern: ^https?://[^/]+/cgi-bin/eolost\.cgi\? type: query_parameter parameter: x_query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /cgi-bin/eolost.cgi? 
- name: eadingenered - public_suffix: pro - alexa_domain: eadingenered.pro - alexa_rank: 69874 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - eadingenered.pro - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: sufficulumcu - public_suffix: top - alexa_domain: sufficulumcu.top - alexa_rank: 71011 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - sufficulumcu.top - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: giggsyunited - public_suffix: com - alexa_domain: giggsyunited.com - alexa_rank: 71150 - category: '-' - notes: excluded; Not archived - input_field: false - search_form: false - search_div: true + excluded: Not archived domains: - giggsyunited.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: heroturko - public_suffix: org - alexa_domain: heroturko.org - alexa_rank: 72258 - category: download - notes: null - input_field: false - search_form: false - search_div: true domains: - heroturko.org - - heroturko.org:80 query_parsers: - url_pattern: ^https?://[^/]+/[0-9]+/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /012 - /031 - /047 - /057 - name: kino - public_suffix: to - alexa_domain: kino.to - alexa_rank: 72424 - category: streaming - notes: null - input_field: false - search_form: false - search_div: true domains: - kino.to query_parsers: - url_pattern: ^https?://[^/]+/Search\.html\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /Search.html? - name: sourcecodester - public_suffix: com - alexa_domain: sourcecodester.com - alexa_rank: 73077 - category: question-and-answer - notes: null - input_field: false - search_form: false - search_div: true domains: - sourcecodester.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: anentryle - public_suffix: club - alexa_domain: anentryle.club - alexa_rank: 77213 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - anentryle.club - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: football3 - public_suffix: ir - alexa_domain: football3.ir - alexa_rank: 77361 - category: '-' - notes: excluded; Excluded from the Internet Archive - input_field: true - search_form: true - search_div: true + excluded: Excluded from the Internet Archive domains: - football3.ir - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: xxxcomics - public_suffix: org - alexa_domain: xxxcomics.org - alexa_rank: 77391 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - xxxcomics.org query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: messyster - public_suffix: biz - alexa_domain: messyster.biz - alexa_rank: 77804 - category: spam-malware - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - messyster.biz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: gobookee - public_suffix: net - alexa_domain: gobookee.net - alexa_rank: 79441 - category: corporate - notes: excluded; No search - input_field: false - search_form: false - search_div: true + excluded: No search domains: - gobookee.net - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: btcmanager - public_suffix: com - alexa_domain: btcmanager.com - alexa_rank: 82103 - category: news-and-boulevard - notes: null - input_field: false - search_form: false - search_div: true domains: - btcmanager.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: oureviewi - public_suffix: club - alexa_domain: oureviewi.club - alexa_rank: 83224 - category: spam-malware - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - oureviewi.club - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: jetboobs - public_suffix: com - alexa_domain: jetboobs.com - alexa_rank: 84813 - category: pornography - notes: null - input_field: true - search_form: true - search_div: false domains: - jetboobs.com query_parsers: @@ -25744,23 +14216,12 @@ segment: 4 remove_patterns: - \.html$ - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /en/ - name: wazap - public_suffix: com - alexa_domain: wazap.com - alexa_rank: 86520 - category: review - notes: null - input_field: null - search_form: null - search_div: null domains: - - jp.wazap.com - wazap.com + - jp.wazap.com query_parsers: - url_pattern: ^https?://[^/]+/search\.wz\? type: query_parameter @@ -25769,126 +14230,38 @@ - url_pattern: ^https?://[^/]+/search\.wz\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.wz? - name: trendymalldeals - public_suffix: com - alexa_domain: trendymalldeals.com - alexa_rank: 86918 - category: e-commerce - notes: null - input_field: true - search_form: true - search_div: false domains: - trendymalldeals.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: query - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: maandhave - public_suffix: biz - alexa_domain: maandhave.biz - alexa_rank: 89488 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - maandhave.biz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: eyourcom - public_suffix: fun - alexa_domain: eyourcom.fun - alexa_rank: 89797 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - eyourcom.fun - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: taskthesa - public_suffix: club - alexa_domain: taskthesa.club - alexa_rank: 90006 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: Domain parking domains: - taskthesa.club - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: lenglishiam - public_suffix: biz - alexa_domain: lenglishiam.biz - alexa_rank: 90306 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - lenglishiam.biz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: fishcod - public_suffix: com - alexa_domain: fishcod.com - alexa_rank: 92065 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - fishcod.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: 
mychicconsulting - public_suffix: es - alexa_domain: mychicconsulting.es - alexa_rank: 92175 - category: corporate - notes: null - input_field: false - search_form: false - search_div: true domains: - mychicconsulting.es query_parsers: @@ -25902,127 +14275,39 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: sandmyre - public_suffix: club - alexa_domain: sandmyre.club - alexa_rank: 92497 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - sandmyre.club - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: ficanportio - public_suffix: biz - alexa_domain: ficanportio.biz - alexa_rank: 92638 - category: spam-malware - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - ficanportio.biz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: soverthlaest - public_suffix: space - alexa_domain: soverthlaest.space - alexa_rank: 93337 - category: spam-malware - notes: excluded - input_field: false - search_form: false - search_div: true + excluded: No search domains: - soverthlaest.space - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: picsearch - public_suffix: com - alexa_domain: picsearch.com - alexa_rank: 93540 - category: media-sharing - notes: null - input_field: null - search_form: null - search_div: null domains: - picsearch.com query_parsers: - url_pattern: ^https?://[^/]+/index\.cgi\? 
type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /index.cgi? - name: distemshu - public_suffix: biz - alexa_domain: distemshu.biz - alexa_rank: 94432 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - distemshu.biz - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: tronglyre - public_suffix: club - alexa_domain: tronglyre.club - alexa_rank: 98347 - category: '-' - notes: excluded; No valid snapshot - input_field: false - search_form: false - search_div: true + excluded: No valid snapshot domains: - tronglyre.club - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: kidzsearch - public_suffix: com - alexa_domain: kidzsearch.com - alexa_rank: 132747 - category: child-safe-search - notes: null - input_field: null - search_form: null - search_div: null domains: - kidzsearch.com - search.kidzsearch.com @@ -26037,9 +14322,6 @@ - url_pattern: ^https?://[^/]+/kz(?:image|video|facts|wiki|news|game|app)?search\.php\? type: query_parameter parameter: gsc.page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /kzsearch.php? - /kzimagesearch.php? @@ -26050,35 +14332,15 @@ - /kzgamesearch.php? - /kzappsearch.php? 
- name: najdi - public_suffix: si - alexa_domain: najdi.si - alexa_rank: 142763 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - najdi.si query_parsers: - url_pattern: ^https?://[^/]+/najdi/[^/]+ type: path_segment segment: 2 - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /najdi - name: metager - public_suffix: de - alexa_domain: metager.de - alexa_rank: 147629 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - metager.de - metager.org @@ -26086,110 +14348,51 @@ - url_pattern: ^https?://[^/]+/meta/meta\.ger3\? type: query_parameter parameter: eingabe - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/meta/meta\.ger3\? type: query_parameter parameter: next - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /meta.ger3? - name: omgili - public_suffix: com - alexa_domain: omgili.com - alexa_rank: 244640 - category: forum - notes: null - input_field: null - search_form: null - search_div: null domains: - omgili.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: gigablast - public_suffix: com - alexa_domain: gigablast.com - alexa_rank: 256158 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - gigablast.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: s - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? 
- name: mojeek - public_suffix: com - alexa_domain: mojeek.com - alexa_rank: 258431 - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - mojeek.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: s - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: evi - public_suffix: com - alexa_domain: evi.com - alexa_rank: 284749 - category: question-and-answer - notes: 'excluded; Discontinued.' - input_field: null - search_form: null - search_div: null domains: - evi.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: args - public_suffix: me - alexa_domain: args.me - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - args.me query_parsers: @@ -26200,37 +14403,13 @@ - url_pattern: ^https?://[^/]+/search\.html\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.html? 
- name: askmenow - public_suffix: com - alexa_domain: askmenow.com - alexa_rank: null - category: service - notes: excluded; No valid snapshot - input_field: null - search_form: null - search_div: null + excluded: No valid snapshot domains: - askmenow.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: base-search - public_suffix: net - alexa_domain: base-search.net - alexa_rank: null - category: search-engine - notes: 'Bielefeld Academic Search Engine (BASE)' - input_field: null - search_form: null - search_div: null domains: - base-search.net query_parsers: @@ -26241,20 +14420,9 @@ - url_pattern: ^https?://[^/]+/Search/Results\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /Search/Results? - name: chatnoir - public_suffix: eu - alexa_domain: chatnoir.eu - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - chatnoir.eu query_parsers: @@ -26265,7 +14433,6 @@ - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: p - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/video/search\? type: html_selector @@ -26279,75 +14446,30 @@ focused_url_prefixes: - /? - name: chemrefer - public_suffix: com - alexa_domain: chemrefer.com - alexa_rank: null - category: blog - notes: null - input_field: null - search_form: null - search_div: null domains: - chemrefer.com query_parsers: - url_pattern: ^https?://[^/]+/\? type: query_parameter parameter: s - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- name: egerin - public_suffix: com - alexa_domain: egerin.com - alexa_rank: null - category: web-portal - notes: null - input_field: null - search_form: null - search_div: null domains: - egerin.com query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search/? - /search-images/? - /search-videos/? - name: findsounds - public_suffix: com - alexa_domain: findsounds.com - alexa_rank: null - category: search-engine - notes: 'excluded; Discontinued.' - input_field: null - search_form: null - search_div: null + excluded: Query not in URL domains: - findsounds.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: fireball - public_suffix: de - alexa_domain: fireball.de - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - fireball.de query_parsers: @@ -26357,22 +14479,10 @@ - url_pattern: ^https?://[^/]+/[a-z]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - /de/search? - name: genieknows - public_suffix: com - alexa_domain: genieknows.com - alexa_rank: null - category: question-and-answer - notes: null - input_field: null - search_form: null - search_div: null domains: - genieknows.com query_parsers: @@ -26386,21 +14496,10 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? 
- /page - name: leit - public_suffix: is - alexa_domain: leit.is - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - leit.is query_parsers: @@ -26411,21 +14510,10 @@ - url_pattern: ^https?://[^/]+/(leita|company_search)\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /leita? - /company_search? - name: miner - public_suffix: hu - alexa_domain: miner.hu - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - miner.hu query_parsers: @@ -26439,38 +14527,15 @@ - url_pattern: ^https?://[^/]+/page/[0-9]+/\? type: path_segment segment: 2 - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /? - /page - name: munax - public_suffix: com - alexa_domain: munax.com - alexa_rank: null - category: '-' - notes: 'excluded; Discontinued.' - input_field: null - search_form: null - search_div: null + excluded: No search domains: - munax.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: mysimon - public_suffix: com - alexa_domain: mysimon.com - alexa_rank: null - category: e-commerce notes: none - input_field: null - search_form: null - search_div: null domains: - mysimon.com query_parsers: @@ -26481,68 +14546,36 @@ - url_pattern: ^https?://[^/]+/shopping\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /shopping? 
- name: ncbi - public_suffix: nlm.nih.gov - alexa_domain: ncbi.nlm.nih.gov - alexa_rank: null - category: governmental notes: National Library of Medicine - input_field: null - search_form: null - search_div: null domains: - ncbi.nlm.nih.gov query_parsers: - url_pattern: ^https?://[^/]+/.*term= type: query_parameter parameter: term - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - /books - /gene - /protein - name: newsandmoods - public_suffix: com - alexa_domain: newsandmoods.com - alexa_rank: null - category: news-and-boulevard notes: News&Moods - input_field: null - search_form: null - search_div: null domains: - newsandmoods.com query_parsers: - url_pattern: ^https?://[^/]+/news\/search\? type: query_parameter parameter: sstring - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/news\/search\? type: query_parameter parameter: start - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /news/search? - name: newslookup - public_suffix: com - alexa_domain: newslookup.com - alexa_rank: null - category: news-and-boulevard - notes: null - input_field: null - search_form: null - search_div: null domains: - newslookup.com query_parsers: @@ -26553,20 +14586,9 @@ - url_pattern: ^https?://[^/]+/results\? type: query_parameter parameter: p - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /results? - name: nextag - public_suffix: de - alexa_domain: nextag.de - alexa_rank: null - category: e-commerce - notes: null - input_field: null - search_form: null - search_div: null domains: - nextag.de query_parsers: @@ -26577,79 +14599,31 @@ - url_pattern: ^https?://[^/]+/shopping\/products\? type: query_parameter parameter: page - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /shopping/products? 
- name: pixsta - public_suffix: com - alexa_domain: pixsta.com - alexa_rank: null - category: search-engine - notes: 'excluded; Reverse image search.' - input_field: null - search_form: null - search_div: null + excluded: Reverse image search domains: - pixsta.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: podscope - public_suffix: com - alexa_domain: podscope.com - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - podscope.com query_parsers: - url_pattern: ^https?://[^/]+/search\.php\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search.php? - name: qmamu - public_suffix: com - alexa_domain: qmamu.com - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - qmamu.com query_parsers: - url_pattern: ^https?://[^/]+/search\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search? - name: recipebridge - public_suffix: com - alexa_domain: recipebridge.com - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - recipebridge.com query_parsers: @@ -26661,107 +14635,53 @@ - url_pattern: ^https?://[^/]+/recipes\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: - - url_pattern: ^https?://[^/]+/recipes\? - type: query_parameter - parameter: false - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /r - /recipes? - name: seeqpod - public_suffix: com - alexa_domain: seeqpod.com - alexa_rank: null - category: search-engine - notes: excluded; Query not in URL. 
- input_field: null - search_form: null - search_div: null + excluded: Query not in URL. domains: - seeqpod.com - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] - name: songza - public_suffix: fm - alexa_domain: songza.fm - alexa_rank: null - category: media-sharing - notes: null - input_field: null - search_form: null - search_div: null domains: - songza.fm query_parsers: - url_pattern: ^https?://[^/]+/search type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /search - name: swisscows - public_suffix: com - alexa_domain: swisscows.com - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - swisscows.com query_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/(web|news|video|music)\? type: query_parameter parameter: query - page_parsers: [] offset_parsers: - url_pattern: ^https?://[^/]+/[a-z]+/(web|news|video|music)\? type: query_parameter parameter: offset - interpreted_query_parsers: [] - results_parsers: [] focused_url_prefixes: - /web? - /news? - /video? - /music? - name: veveo - public_suffix: net - alexa_domain: veveo.net - alexa_rank: null - category: search-engine - notes: excluded; Discontinued. - input_field: null - search_form: null - search_div: null + excluded: No search domains: - veveo.net - query_parsers: [] - page_parsers: [] - offset_parsers: [] - interpreted_query_parsers: [] - results_parsers: [] - focused_url_prefixes: [] +- name: dblp + domains: + - dblp.org + - dblp.uni-trier.de + query_parsers: + - url_pattern: ^https?://[^/]+/search\? + type: query_parameter + parameter: q + focused_url_prefixes: + - /search? 
- name: tripadvisor - public_suffix: net - alexa_domain: tripadvisor.com - alexa_rank: null - category: search-engine - notes: null - input_field: null - search_form: null - search_div: null domains: - tripadvisor.com query_parsers: @@ -26771,8 +14691,6 @@ - url_pattern: ^https?://[^/]+/SearchForums\? type: query_parameter parameter: q - page_parsers: [] - offset_parsers: [] interpreted_query_parsers: - url_pattern: ^https?://[^/]+/Search\? type: html_selector diff --git a/docs/favicon.png b/docs/favicon.png new file mode 100644 index 00000000..2ea253b7 Binary files /dev/null and b/docs/favicon.png differ diff --git a/docs/index.md b/docs/index.md new file mode 100644 index 00000000..2bda3b44 --- /dev/null +++ b/docs/index.md @@ -0,0 +1 @@ +Foo bar diff --git a/docs/test.md b/docs/test.md new file mode 100644 index 00000000..234fb4b5 --- /dev/null +++ b/docs/test.md @@ -0,0 +1,16 @@ +--- +title: Lorem ipsum dolor sit amet +description: Nullam urna elit, malesuada eget finibus ut, ac tortor. +icon: material/emoticon-happy +--- + +## Page title + +```mermaid +graph LR + A[Start] --> B{Error?}; + B -->|Yes| C[Hmm...]; + C --> D[Debug]; + D --> B; + B ---->|No| E[Yay!]; +``` \ No newline at end of file diff --git a/docs/webis-logo-white.svg b/docs/webis-logo-white.svg new file mode 100644 index 00000000..02c8fe93 --- /dev/null +++ b/docs/webis-logo-white.svg @@ -0,0 +1,54 @@ + + + +]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/helm/.gitignore b/helm/.gitignore new file mode 100644 index 00000000..a8c97947 --- /dev/null +++ b/helm/.gitignore @@ -0,0 +1,4 @@ +**/requirements.lock +**/requirements.yaml.bak +**/*.tgz +values.override.yaml diff --git a/helm/.helmignore b/helm/.helmignore new file mode 100644 index 00000000..0e8a0eb3 --- /dev/null +++ b/helm/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. 
+# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm/Chart.yaml b/helm/Chart.yaml new file mode 100644 index 00000000..6deef090 --- /dev/null +++ b/helm/Chart.yaml @@ -0,0 +1,19 @@ +apiVersion: v2 +name: archive-query-log +description: "The Archive Query Log: Mining Millions of Search Result Pages of Hundreds of Search Engines from 25 Years of Web Archives." +type: application +home: https://github.com/webis-de/archive-query-log +sources: +- https://github.com/webis-de/archive-query-log +icon: https://assets.webis.de/img/webis-logo-gray.svg +maintainers: +- name: Jan Heinrich Merker + email: heinrich.merker@uni-jena.de + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. +appVersion: 0.1.33 + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +version: 0.1.40 diff --git a/helm/templates/NOTES.txt b/helm/templates/NOTES.txt new file mode 100644 index 00000000..914eb576 --- /dev/null +++ b/helm/templates/NOTES.txt @@ -0,0 +1,51 @@ +Installed {{ $.Chart.Name }} (release name: {{ $.Release.Name }}{{ with $.Values.namespace }}, namespace: {{ . }}{{ end }}). + +Accessing Elasticsearch {{ $.Values.elasticsearch.host }}:{{ $.Values.elasticsearch.port }} (authenticating with user {{ $.Values.elasticsearch.username }}). + +Started cron jobs: +{{- if $.Values.sourcesBuild.enabled }} +- Build sources: + Scheduled at {{ $.Values.sourcesBuild.schedule }}. 
+ Running {{ $.Values.sourcesBuild.completions }}x (parallelism: {{ $.Values.sourcesBuild.parallelism }}, backoff limit: {{ $.Values.sourcesBuild.backoffLimit }}). +{{- end }} +{{- if $.Values.capturesFetch.enabled }} +- Fetch captures: + Scheduled at {{ $.Values.capturesFetch.schedule }}. + Running {{ $.Values.capturesFetch.completions }}x (parallelism: {{ $.Values.capturesFetch.parallelism }}, backoff limit: {{ $.Values.capturesFetch.backoffLimit }}). +{{- end }} +{{- if $.Values.capturesImportAql22.enabled }} +- Import captures from AQL-22: + Running {{ $.Values.capturesImportAql22.completions }}x (search providers: {{ $.Values.capturesImportAql22.completions }}, parallelism: {{ $.Values.capturesImportAql22.parallelism }}, backoff limit: {{ $.Values.capturesImportAql22.backoffLimit }}). +{{- end }} +{{- if $.Values.serpsParseUrlQuery.enabled }} +- Parse SERP URL queries: + Scheduled at {{ $.Values.serpsParseUrlQuery.schedule }}. + Running {{ $.Values.serpsParseUrlQuery.completions }}x (parallelism: {{ $.Values.serpsParseUrlQuery.parallelism }}, backoff limit: {{ $.Values.serpsParseUrlQuery.backoffLimit }}). +{{- end }} +{{- if $.Values.serpsParseUrlPage.enabled }} +- Parse SERP URL pages: + Scheduled at {{ $.Values.serpsParseUrlPage.schedule }}. + Running {{ $.Values.serpsParseUrlPage.completions }}x (parallelism: {{ $.Values.serpsParseUrlPage.parallelism }}, backoff limit: {{ $.Values.serpsParseUrlPage.backoffLimit }}). +{{- end }} +{{- if $.Values.serpsParseUrlOffset.enabled }} +- Parse SERP URL offsets: + Scheduled at {{ $.Values.serpsParseUrlOffset.schedule }}. + Running {{ $.Values.serpsParseUrlOffset.completions }}x (parallelism: {{ $.Values.serpsParseUrlOffset.parallelism }}, backoff limit: {{ $.Values.serpsParseUrlOffset.backoffLimit }}). +{{- end }} +{{- if $.Values.serpsDownloadWarc.enabled }} +- Download SERP WARCs: + Scheduled at {{ $.Values.serpsDownloadWarc.schedule }}. 
+ Running {{ $.Values.serpsDownloadWarc.completions }}x (parallelism: {{ $.Values.serpsDownloadWarc.parallelism }}, backoff limit: {{ $.Values.serpsDownloadWarc.backoffLimit }}). +{{- end }} +{{- if $.Values.serpsParseWarcQuery.enabled }} +- Parse SERP WARC queries: + Scheduled at {{ $.Values.serpsParseWarcQuery.schedule }}. + Running {{ $.Values.serpsParseWarcQuery.completions }}x (parallelism: {{ $.Values.serpsParseWarcQuery.parallelism }}, backoff limit: {{ $.Values.serpsParseWarcQuery.backoffLimit }}). +{{- end }} +{{- if $.Values.serpsParseWarcSnippets.enabled }} +- Parse SERP WARC snippets: + Scheduled at {{ $.Values.serpsParseWarcSnippets.schedule }}. + Running {{ $.Values.serpsParseWarcSnippets.completions }}x (parallelism: {{ $.Values.serpsParseWarcSnippets.parallelism }}, backoff limit: {{ $.Values.serpsParseWarcSnippets.backoffLimit }}). +{{- end }} + +Serving monitoring web interface at {{ if $.Values.monitoring.forceSslRedirect }}https{{ else }}http{{ end }}://{{ $.Values.monitoring.host }}{{ $.Values.monitoring.path }}. 
diff --git a/helm/templates/archive-query-log-config-map.yml b/helm/templates/archive-query-log-config-map.yml new file mode 100644 index 00000000..dcef07ac --- /dev/null +++ b/helm/templates/archive-query-log-config-map.yml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ $.Release.Name }}-config-map + namespace: {{ $.Release.Namespace }} +data: + config.yml: | + es: + host: {{ $.Values.elasticsearch.host }} + port: {{ $.Values.elasticsearch.port }} + s3: + endpoint_url: {{ $.Values.s3.endpoint_url }} + bucket_name: {{ $.Values.s3.bucket_name }} diff --git a/helm/templates/archive-query-log-cron-job-captures-fetch.yml b/helm/templates/archive-query-log-cron-job-captures-fetch.yml new file mode 100644 index 00000000..d9ab6d6a --- /dev/null +++ b/helm/templates/archive-query-log-cron-job-captures-fetch.yml @@ -0,0 +1,54 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ $.Release.Name }}-captures-fetch + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . 
| sha256sum }} +spec: + suspend: {{ not $.Values.capturesFetch.enabled }} + schedule: "{{ $.Values.capturesFetch.schedule }}" + concurrencyPolicy: Forbid + jobTemplate: + spec: + completions: {{ $.Values.capturesFetch.completions }} + parallelism: {{ $.Values.capturesFetch.parallelism }} + backoffLimit: {{ $.Values.capturesFetch.backoffLimit }} + ttlSecondsAfterFinished: {{ mul 60 $.Values.capturesFetch.ttlMinutesAfterFinished }} + template: + spec: + containers: + - name: {{ $.Release.Name }}-captures-fetch + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 4Gi + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - captures + - fetch + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + restartPolicy: OnFailure + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret diff --git a/helm/templates/archive-query-log-cron-job-captures-import-aql-22.yml b/helm/templates/archive-query-log-cron-job-captures-import-aql-22.yml new file mode 100644 index 00000000..4a1230fb --- /dev/null +++ b/helm/templates/archive-query-log-cron-job-captures-import-aql-22.yml @@ -0,0 +1,70 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ $.Release.Name }}-captures-import-aql-22 + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . 
| sha256sum }} +spec: + suspend: {{ not $.Values.capturesImportAql22.enabled }} + schedule: "{{ $.Values.capturesImportAql22.schedule }}" + concurrencyPolicy: Forbid + jobTemplate: + spec: + completions: {{ $.Values.capturesImportAql22.completions }} + parallelism: {{ $.Values.capturesImportAql22.parallelism }} + backoffLimit: {{ $.Values.capturesImportAql22.backoffLimit }} + completionMode: Indexed + template: + spec: + containers: + - name: {{ $.Release.Name }}-captures-import-aql-22 + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 4Gi + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - captures + - import + - aql-22 + - --no-check-memento + - /workspace/data/ + env: + - name: SEARCH_PROVIDER_INDEX + valueFrom: + fieldRef: + fieldPath: metadata.annotations['batch.kubernetes.io/job-completion-index'] + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-data + mountPath: /workspace/data/ + readOnly: true + restartPolicy: OnFailure + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret + - name: {{ $.Release.Name }}-data + hostPath: + path: {{ $.Values.capturesImportAql22.dataDir }} + type: Directory + diff --git a/helm/templates/archive-query-log-cron-job-serps-download-warc.yml b/helm/templates/archive-query-log-cron-job-serps-download-warc.yml new file mode 100644 index 00000000..49f80912 --- /dev/null +++ b/helm/templates/archive-query-log-cron-job-serps-download-warc.yml @@ -0,0 +1,55 @@ +apiVersion: batch/v1 +kind: 
CronJob +metadata: + name: {{ $.Release.Name }}-serps-download-warc + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . | sha256sum }} +spec: + suspend: {{ not $.Values.serpsDownloadWarc.enabled }} + schedule: "{{ $.Values.serpsDownloadWarc.schedule }}" + concurrencyPolicy: Forbid + jobTemplate: + spec: + completions: {{ $.Values.serpsDownloadWarc.completions }} + parallelism: {{ $.Values.serpsDownloadWarc.parallelism }} + backoffLimit: {{ $.Values.serpsDownloadWarc.backoffLimit }} + ttlSecondsAfterFinished: {{ mul 60 $.Values.serpsDownloadWarc.ttlMinutesAfterFinished }} + template: + spec: + containers: + - name: {{ $.Release.Name }}-serps-download-warc + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 4Gi + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - serps + - download + - warc + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + restartPolicy: OnFailure + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret diff --git a/helm/templates/archive-query-log-cron-job-serps-parse-url-offset.yml b/helm/templates/archive-query-log-cron-job-serps-parse-url-offset.yml new file mode 100644 index 00000000..c1f75339 --- /dev/null +++ b/helm/templates/archive-query-log-cron-job-serps-parse-url-offset.yml @@ -0,0 +1,55 @@ +apiVersion: batch/v1 +kind: 
CronJob +metadata: + name: {{ $.Release.Name }}-serps-parse-url-offset + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . | sha256sum }} +spec: + suspend: {{ not $.Values.serpsParseUrlOffset.enabled }} + schedule: "{{ $.Values.serpsParseUrlOffset.schedule }}" + concurrencyPolicy: Forbid + jobTemplate: + spec: + completions: {{ $.Values.serpsParseUrlOffset.completions }} + parallelism: {{ $.Values.serpsParseUrlOffset.parallelism }} + backoffLimit: {{ $.Values.serpsParseUrlOffset.backoffLimit }} + ttlSecondsAfterFinished: {{ mul 60 $.Values.serpsParseUrlOffset.ttlMinutesAfterFinished }} + template: + spec: + containers: + - name: {{ $.Release.Name }}-serps-parse-url-offset + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 4Gi + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - serps + - parse + - url-offset + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + restartPolicy: OnFailure + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret diff --git a/helm/templates/archive-query-log-cron-job-serps-parse-url-page.yml b/helm/templates/archive-query-log-cron-job-serps-parse-url-page.yml new file mode 100644 index 00000000..f5de7dbb --- /dev/null +++ b/helm/templates/archive-query-log-cron-job-serps-parse-url-page.yml @@ -0,0 +1,55 @@ +apiVersion: 
batch/v1 +kind: CronJob +metadata: + name: {{ $.Release.Name }}-serps-parse-url-page + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . | sha256sum }} +spec: + suspend: {{ not $.Values.serpsParseUrlPage.enabled }} + schedule: "{{ $.Values.serpsParseUrlPage.schedule }}" + concurrencyPolicy: Forbid + jobTemplate: + spec: + completions: {{ $.Values.serpsParseUrlPage.completions }} + parallelism: {{ $.Values.serpsParseUrlPage.parallelism }} + backoffLimit: {{ $.Values.serpsParseUrlPage.backoffLimit }} + ttlSecondsAfterFinished: {{ mul 60 $.Values.serpsParseUrlPage.ttlMinutesAfterFinished }} + template: + spec: + containers: + - name: {{ $.Release.Name }}-serps-parse-url-page + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 4Gi + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - serps + - parse + - url-page + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + restartPolicy: OnFailure + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret diff --git a/helm/templates/archive-query-log-cron-job-serps-parse-url-query.yml b/helm/templates/archive-query-log-cron-job-serps-parse-url-query.yml new file mode 100644 index 00000000..505e472a --- /dev/null +++ b/helm/templates/archive-query-log-cron-job-serps-parse-url-query.yml @@ -0,0 +1,55 @@ +apiVersion: 
batch/v1 +kind: CronJob +metadata: + name: {{ $.Release.Name }}-serps-parse-url-query + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . | sha256sum }} +spec: + suspend: {{ not $.Values.serpsParseUrlQuery.enabled }} + schedule: "{{ $.Values.serpsParseUrlQuery.schedule }}" + concurrencyPolicy: Forbid + jobTemplate: + spec: + completions: {{ $.Values.serpsParseUrlQuery.completions }} + parallelism: {{ $.Values.serpsParseUrlQuery.parallelism }} + backoffLimit: {{ $.Values.serpsParseUrlQuery.backoffLimit }} + ttlSecondsAfterFinished: {{ mul 60 $.Values.serpsParseUrlQuery.ttlMinutesAfterFinished }} + template: + spec: + containers: + - name: {{ $.Release.Name }}-serps-parse-url-query + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 4Gi + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - serps + - parse + - url-query + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + restartPolicy: OnFailure + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret diff --git a/helm/templates/archive-query-log-cron-job-serps-parse-warc-query.yml b/helm/templates/archive-query-log-cron-job-serps-parse-warc-query.yml new file mode 100644 index 00000000..dcfc71dc --- /dev/null +++ b/helm/templates/archive-query-log-cron-job-serps-parse-warc-query.yml @@ -0,0 +1,55 @@ 
+apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ $.Release.Name }}-serps-parse-warc-query + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . | sha256sum }} +spec: + suspend: {{ not $.Values.serpsParseWarcQuery.enabled }} + schedule: "{{ $.Values.serpsParseWarcQuery.schedule }}" + concurrencyPolicy: Forbid + jobTemplate: + spec: + completions: {{ $.Values.serpsParseWarcQuery.completions }} + parallelism: {{ $.Values.serpsParseWarcQuery.parallelism }} + backoffLimit: {{ $.Values.serpsParseWarcQuery.backoffLimit }} + ttlSecondsAfterFinished: {{ mul 60 $.Values.serpsParseWarcQuery.ttlMinutesAfterFinished }} + template: + spec: + containers: + - name: {{ $.Release.Name }}-serps-parse-warc-query + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 4Gi + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - serps + - parse + - warc-query + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + restartPolicy: OnFailure + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret diff --git a/helm/templates/archive-query-log-cron-job-serps-parse-warc-snippets.yml b/helm/templates/archive-query-log-cron-job-serps-parse-warc-snippets.yml new file mode 100644 index 00000000..8d7f757f --- /dev/null +++ 
b/helm/templates/archive-query-log-cron-job-serps-parse-warc-snippets.yml @@ -0,0 +1,55 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ $.Release.Name }}-serps-parse-warc-snippets + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . | sha256sum }} +spec: + suspend: {{ not $.Values.serpsParseWarcSnippets.enabled }} + schedule: "{{ $.Values.serpsParseWarcSnippets.schedule }}" + concurrencyPolicy: Forbid + jobTemplate: + spec: + completions: {{ $.Values.serpsParseWarcSnippets.completions }} + parallelism: {{ $.Values.serpsParseWarcSnippets.parallelism }} + backoffLimit: {{ $.Values.serpsParseWarcSnippets.backoffLimit }} + ttlSecondsAfterFinished: {{ mul 60 $.Values.serpsParseWarcSnippets.ttlMinutesAfterFinished }} + template: + spec: + containers: + - name: {{ $.Release.Name }}-serps-parse-warc-snippets + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 4Gi + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - serps + - parse + - warc-snippets + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + restartPolicy: OnFailure + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret diff --git a/helm/templates/archive-query-log-cron-job-sources-build.yml b/helm/templates/archive-query-log-cron-job-sources-build.yml new file mode 100644 index 
00000000..15dca863 --- /dev/null +++ b/helm/templates/archive-query-log-cron-job-sources-build.yml @@ -0,0 +1,54 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ $.Release.Name }}-sources-build + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . | sha256sum }} +spec: + suspend: {{ not $.Values.sourcesBuild.enabled }} + schedule: "{{ $.Values.sourcesBuild.schedule }}" + concurrencyPolicy: Forbid + jobTemplate: + spec: + completions: {{ $.Values.sourcesBuild.completions }} + parallelism: {{ $.Values.sourcesBuild.parallelism }} + backoffLimit: {{ $.Values.sourcesBuild.backoffLimit }} + ttlSecondsAfterFinished: {{ mul 60 $.Values.sourcesBuild.ttlMinutesAfterFinished }} + template: + spec: + containers: + - name: {{ $.Release.Name }}-sources-build + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + imagePullPolicy: IfNotPresent + resources: + requests: + memory: 4Gi + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - sources + - build + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + restartPolicy: OnFailure + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret diff --git a/helm/templates/archive-query-log-deployment-monitoring.yml b/helm/templates/archive-query-log-deployment-monitoring.yml new file mode 100644 index 00000000..634c0798 --- /dev/null +++ 
b/helm/templates/archive-query-log-deployment-monitoring.yml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: {{ $.Release.Name }}-monitoring + namespace: {{ $.Release.Namespace }} + annotations: + checksum/config-map: {{ include (print $.Template.BasePath "/archive-query-log-config-map.yml") . | sha256sum }} + checksum/secret: {{ include (print $.Template.BasePath "/archive-query-log-secret.yaml") . | sha256sum }} +spec: + selector: + matchLabels: + app: {{ $.Release.Name }}-monitoring + replicas: {{ $.Values.monitoring.replicas }} + revisionHistoryLimit: 2 + template: + metadata: + labels: + app: {{ $.Release.Name }}-monitoring + spec: + restartPolicy: Always + containers: + - name: {{ $.Release.Name }} + image: "{{ .Values.image }}:{{ .Chart.AppVersion }}" + command: + - /venv/bin/python + - -m + - archive_query_log + - -f + - /workspace/config.config-map.yml + - -f + - /workspace/config.secret.yml + - monitoring + - run + - --host + - 0.0.0.0 + - --port + - "5000" + ports: + - name: http + containerPort: 5000 + volumeMounts: + - name: {{ $.Release.Name }}-config-map + mountPath: /workspace/config.config-map.yml + readOnly: true + subPath: config.yml + - name: {{ $.Release.Name }}-secret + mountPath: /workspace/config.secret.yml + readOnly: true + subPath: config.yml + volumes: + - name: {{ $.Release.Name }}-config-map + configMap: + name: {{ $.Release.Name }}-config-map + - name: {{ $.Release.Name }}-secret + secret: + secretName: {{ $.Release.Name }}-secret diff --git a/helm/templates/archive-query-log-ingress-monitoring.yml b/helm/templates/archive-query-log-ingress-monitoring.yml new file mode 100644 index 00000000..6c023d85 --- /dev/null +++ b/helm/templates/archive-query-log-ingress-monitoring.yml @@ -0,0 +1,23 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: {{ $.Release.Name }}-monitoring + namespace: {{ $.Release.Namespace }} + annotations: + nginx.ingress.kubernetes.io/force-ssl-redirect: "{{ printf "%t" 
$.Values.monitoring.forceSslRedirect }}" + nginx.ingress.kubernetes.io/proxy-connect-timeout: "3600" {{- /* ingress-nginx expects these timeouts as unitless seconds, passed as strings */}} + nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" + nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" +spec: + ingressClassName: {{ $.Values.monitoring.ingressClassName }} + rules: + - host: {{ $.Values.monitoring.host }} + http: + paths: + - path: {{ $.Values.monitoring.path }} + pathType: Prefix + backend: + service: + name: {{ $.Release.Name }}-monitoring + port: + name: http diff --git a/helm/templates/archive-query-log-secret.yaml b/helm/templates/archive-query-log-secret.yaml new file mode 100644 index 00000000..f545d2a0 --- /dev/null +++ b/helm/templates/archive-query-log-secret.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Secret +metadata: + name: {{ $.Release.Name }}-secret + namespace: {{ $.Release.Namespace }} +type: Opaque +stringData: {{- /* credentials are quoted so values containing YAML-special characters stay plain strings in the embedded config */}} + config.yml: | + es: + username: {{ $.Values.elasticsearch.username | quote }} + password: {{ $.Values.elasticsearch.password | quote }} + s3: + access_key: {{ $.Values.s3.access_key | quote }} + secret_key: {{ $.Values.s3.secret_key | quote }} diff --git a/helm/templates/archive-query-log-service-monitoring.yml b/helm/templates/archive-query-log-service-monitoring.yml new file mode 100644 index 00000000..9883b577 --- /dev/null +++ b/helm/templates/archive-query-log-service-monitoring.yml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: {{ $.Release.Name }}-monitoring + namespace: {{ $.Release.Namespace }} + labels: + app: {{ $.Release.Name }}-monitoring +spec: + selector: + app: {{ $.Release.Name }}-monitoring + ports: + - name: http + port: 4000 + targetPort: http + protocol: TCP diff --git a/helm/values.yaml b/helm/values.yaml new file mode 100644 index 00000000..be49fe2d --- /dev/null +++ b/helm/values.yaml @@ -0,0 +1,94 @@ +image: ghcr.io/webis-de/archive-query-log + +sourcesBuild: + enabled: true + schedule: "0 6 * * 2,4" + completions: 10 + parallelism: 10 + backoffLimit: 25 + ttlMinutesAfterFinished: 30 + 
+capturesFetch: + enabled: true + schedule: "0,15,30,45 * * * *" + completions: 15 + parallelism: 15 + backoffLimit: 25 + ttlMinutesAfterFinished: 5 + +capturesImportAql22: + enabled: false + schedule: "0 6 * * 3" + dataDir: /mnt/ceph/storage/data-in-progress/data-research/web-search/archive-query-log/focused + completions: 729 # The number of search providers. + parallelism: 10 + backoffLimit: 25 + ttlMinutesAfterFinished: 30 + +serpsParseUrlQuery: + enabled: true + schedule: "0,15,30,45 * * * *" + completions: 15 + parallelism: 15 + backoffLimit: 25 + ttlMinutesAfterFinished: 5 + +serpsParseUrlPage: + enabled: true + schedule: "0,15,30,45 * * * *" + completions: 5 + parallelism: 5 + backoffLimit: 25 + ttlMinutesAfterFinished: 5 + +serpsParseUrlOffset: + enabled: true + schedule: "0,15,30,45 * * * *" + completions: 5 + parallelism: 5 + backoffLimit: 25 + ttlMinutesAfterFinished: 5 + +serpsDownloadWarc: + enabled: true + schedule: "0,15,30,45 * * * *" + completions: 20 + parallelism: 20 + backoffLimit: 25 + ttlMinutesAfterFinished: 5 + +serpsParseWarcQuery: + enabled: true + schedule: "0,15,30,45 * * * *" + completions: 5 + parallelism: 5 + backoffLimit: 25 + ttlMinutesAfterFinished: 5 + +serpsParseWarcSnippets: + enabled: true + schedule: "0,15,30,45 * * * *" + completions: 5 + parallelism: 5 + backoffLimit: 25 + ttlMinutesAfterFinished: 5 + +monitoring: + ingressClassName: nginx + host: aql-monitoring.srv.webis.de + path: / + replicas: 1 + forceSslRedirect: true + +elasticsearch: + host: elasticsearch.srv.webis.de + port: 9200 + username: null # Override with `--set elasticsearch.username=EXAMPLE` Helm option. + password: null # Override with `--set elasticsearch.password=EXAMPLE` Helm option. + + +s3: + endpoint_url: https://s3.dw.webis.de + bucket_name: archive-query-log + access_key: null # Override with `--set s3.access_key=EXAMPLE` Helm option. + secret_key: null # Override with `--set s3.secret_key=EXAMPLE` Helm option. 
diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 00000000..2368b806 --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,61 @@ +site_name: Archive Query Log +site_description: Mining Millions of Search Result Pages of Hundreds of Search Engines from 25 Years of Web Archives. +site_author: Webis group +copyright: Copyright © 2024 Webis +site_url: https://webis-de.github.io/archive-query-log +repo_url: https://github.com/webis-de/archive-query-log +repo_name: webis-de/archive-query-log +edit_uri: edit/elastic/docs/ +strict: true +nav: + - Introduction: index.md + - Foo: test.md + - Issue Tracker: https://github.com/webis-de/archive-query-log/issues +plugins: + - search + - offline +extra: + social: + - name: GitHub + icon: fontawesome/brands/github + link: https://github.com/webis-de + - name: Twitter / X + icon: fontawesome/brands/x-twitter + link: https://twitter.com/webis_de + generator: false +theme: + name: material + locale: en + language: en + logo: webis-logo-white.svg + favicon: favicon.png + features: + - content.action.edit + icon: + repo: fontawesome/brands/github + edit: material/pencil + palette: + - media: "(prefers-color-scheme)" + toggle: + icon: material/brightness-auto + name: Switch to system preference + - media: "(prefers-color-scheme: light)" + scheme: default + primary: black + accent: indigo + toggle: + icon: material/brightness-7 + name: Switch to dark mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: indigo + accent: indigo + toggle: + icon: material/brightness-4 + name: Switch to light mode +markdown_extensions: + - pymdownx.superfences: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:pymdownx.superfences.fence_code_format diff --git a/pyproject.toml b/pyproject.toml index 2cbefe6e..ddc9be35 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,21 +1,23 @@ [project] name = "archive-query-log" authors = [ - {name = "Jan Heinrich Reimer", email = "heinrich.reimer@uni-jena.de"}, - 
{name = "Sebastian Schmidt", email = "s.schmidt@uni-leipzig.de"}, - {name = "Maik Fröbe", email = "maik.froebe@uni-jena.de"}, - {name = "Lukas Gienapp", email = "lukas.gienapp@uni-leipzig.de"}, - {name = "Harrisen Scells", email = "harry.scells@uni-leipzig.de"}, - {name = "Benno Stein", email = "benno.stein@uni-weimar.de"}, - {name = "Matthias Hagen", email = "matthias.hagen@uni-jena.de"}, - {name = "Martin Potthast", email = "martin.potthast@uni-leipzig.de"}, + { name = "Jan Heinrich Merker", email = "heinrich.merker@uni-jena.de" }, + { name = "Sebastian Schmidt", email = "s.schmidt@uni-leipzig.de" }, + { name = "Maik Fröbe", email = "maik.froebe@uni-jena.de" }, + { name = "Lukas Gienapp", email = "lukas.gienapp@uni-leipzig.de" }, + { name = "Harrisen Scells", email = "harry.scells@uni-leipzig.de" }, + { name = "Benno Stein", email = "benno.stein@uni-weimar.de" }, + { name = "Matthias Hagen", email = "matthias.hagen@uni-jena.de" }, + { name = "Martin Potthast", email = "martin.potthast@uni-leipzig.de" }, ] -description = "Mining Millions of Search Result Pages of Hundreds of Search Engines from 25 Years of Web Archives" +description = "Mining Millions of Search Result Pages of Hundreds of Search Engines from 25 Years of Web Archives." 
+readme = "README.md" requires-python = ">=3.10" classifiers = [ "Programming Language :: Python :: 3", "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Operating System :: OS Independent", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering", @@ -24,66 +26,147 @@ dependencies = [ "aiohttp~=3.8", "aiohttp-retry~=2.8", "asyncio-pool~=0.6.0", - "beautifulsoup4~=4.11", + "beautifulsoup4~=4.12", "bleach~=6.0", + "boto3~=1.28", + "certifi~=2023.7", "click~=8.1", + "cssselect~=1.2", "dataclasses-json~=0.6.0", - "diskcache~=5.4", - "fasttext-langdetect~=1.0", - "fastwarc~=0.14.0", - "flake8~=7.0", - "fasttext @ git+https://github.com/cfculhane/fastText.git@4a44513", - "joblib~=1.2", - "jupyter~=1.0", - "marshmallow~=3.19", - "matplotlib~=3.7", + "diskcache~=5.6", + "elasticsearch~=7.0", + "elasticsearch-dsl~=7.0", + "expiringdict~=1.2", + "flasgger~=0.9.7", + "flask~=2.3", + "flask-restful~=0.3.10", + "importlib-metadata~=6.0", + "joblib~=1.3", + "lxml~=4.9", + "marshmallow~=3.20", + "mergedeep~=1.3", + "more-itertools~=10.1", "pandas~=2.0", - "pex~=2.1", - "publicsuffixlist~=0.9.3", - "pyarrow~=15.0.0", + "publicsuffixlist~=0.10.0", "pycld3~=0.22.0", - "pyspark~=3.5", "python-slugify~=8.0", + "python-whois~=0.8.0", "pyyaml~=6.0", - "ranx~=0.3.6", - "requests~=2.28", + "ranx~=0.3.16", + "requests~=2.31", "requests-html~=0.10.0", - "seaborn~=0.13.0", - "tqdm~=4.64", + "requests-ratelimiter~=0.4.0", + "resiliparse~=0.14.5", + "tqdm~=4.66", "unidecode~=1.3", - "urllib3~=2.1", + "urllib3~=1.26", "warcio~=1.7", + "warc-s3~=0.1.1", + "web-archive-api~=0.1.7", ] dynamic = ["version"] [project.optional-dependencies] tests = [ "approvaltests~=11.0", - "flake8~=7.0", - "pylint~=3.1", + "bandit[toml]~=1.7", + "boto3-stubs[s3]~=1.28", + "mypy~=1.5", + "pandas-stubs~=2.0", "pytest~=8.0", "pytest-cov~=4.0", + "ruff~=0.2.1", + "types-beautifulsoup4~=4.12", + "types-bleach~=6.0", + 
"types-lxml~=2023.10", + "types-python-dateutil~=2.8", + "types-python-slugify~=8.0", + "types-pyyaml~=6.0", + "types-requests~=2.31", + "types-tqdm~=4.66", +] +examples = [ + "jupyter~=1.0", + "matplotlib~=3.7", + "seaborn~=0.13.0", +] +documentation = [ + "mkdocs~=1.5", + "mkdocs-material~=9.5", ] +[project.urls] +"Homepage" = "https://github.com/webis-de/archive-query-log" +"Bug Tracker" = "https://github.com/webis-de/archive-query-log/issues" + +[project.scripts] +archive_query_log = "archive_query_log.cli:cli" +archive-query-log = "archive_query_log.cli:cli" +aql = "archive_query_log.cli:cli" + [build-system] -requires = ["setuptools>=50", "wheel"] +requires = ["setuptools>=50", "setuptools_scm[toml]>=6.2", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools] include-package-data = true -[tool.setuptools.dynamic] -version = {attr = "archive_query_log.__version__"} - [tool.setuptools.packages.find] -include = ["archive_query_log"] -namespaces = false +exclude = [ + "build", + "venv", + "notebooks", + "integrations", + "scripts", + "data", + "docs", + "helm", +] + +[tool.setuptools_scm] + +[tool.ruff] +exclude = [ + "build", + "venv", + "notebooks", + "integrations", + "scripts", + "data", + "docs", + "helm", +] + +[tool.mypy] +ignore_missing_imports = true +exclude = [ + "build", + "venv", + "notebooks", + "integrations", + "scripts", + "data", + "docs", + "helm", +] + +[tool.bandit] +exclude_dirs = [ + "build", + "venv", + "notebooks", + "integrations", + "scripts", + "data", + "docs", + "helm", +] -[tool.setuptools.package-data] -mypkg = ["*.txt", "*.md", "*.rst"] +[tool.bandit.assert_used] +skips = ["**/test_*.py"] [tool.pytest.ini_options] log_cli = "True" filterwarnings = [ - 'ignore::DeprecationWarning', + "ignore::marshmallow.warnings.RemovedInMarshmallow4Warning", ] diff --git a/run_cdx_batch.sh b/run_cdx_batch.sh deleted file mode 100755 index c276b42b..00000000 --- a/run_cdx_batch.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/usr/bin/bash - 
-while IFS= read -r p; do - echo "Crawl archived-urls: $p" - sleep 1s - - srun --cpus-per-task 4 --input /dev/null --ntasks-per-node 1 --mem 40G --container-writable --container-image python:3.10 --container-name web-archive-query-log-python-3.10 --container-mounts "$PWD":/workspace --chdir "$PWD" sh -c "cd /workspace && python -m pipenv run python -m archive_query_log make archived-urls -d /mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/ -f ${p}" - wait - echo "Parse archived-urls: $p" - sleep 1s - - srun --cpus-per-task 4 --input /dev/null --ntasks-per-node 1 --mem 40G --container-writable --container-image python:3.10 --container-name web-archive-query-log-python-3.10 --container-mounts "$PWD":/workspace --chdir "$PWD" sh -c "cd /workspace && python -m pipenv run python -m archive_query_log make archived-query-urls -d /mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/ -f ${p}" - wait - -done dict: - jsonl_path = _DATA_DIR / base_type / path.with_suffix(".jsonl.gz") - if not jsonl_path.exists(): - return {} - index = {} - try: - with GzipFile(jsonl_path, "r") as gzip_file: - # noinspection PyTypeChecker - for line in tqdm(gzip_file, desc="Index JSONL"): - try: - # noinspection PyTypeChecker - record = loads(line) - except: - print(f"Could not index {line} at {path}.") - continue - record_id = uuid5( - NAMESPACE_URL, - f"{record['timestamp']}:{record['url']}", - ) - index[record_id] = record - return index - except: - print(f"Could not read JSONL file at {path}.") - return {} - - -def _index_warc(path: Path, base_type: str) -> dict: - warc_path = _DATA_DIR / base_type / path - if not warc_path.exists(): - return {} - index = {} - for warc_child_path in warc_path.iterdir(): - if warc_child_path.name.startswith("."): - continue - try: - stream = FileStream(str(warc_child_path.absolute())) - records = ArchiveIterator( - stream, - record_types=WarcRecordType.response, - parse_http=False, - ) - # 
noinspection PyTypeChecker - for record in tqdm(records, desc="Index WARC"): - record: WarcRecord - offset = record.stream_pos - record_url_header = record.headers["Archived-URL"] - try: - record_url = loads(record_url_header) - except JSONDecodeError: - print(f"Could not index {record_url_header} at {path}.") - continue - record_id = uuid5( - NAMESPACE_URL, - f"{record_url['timestamp']}:{record_url['url']}", - ) - index[record_id] = ( - warc_child_path, - offset, - ) - except: - print(f"Could not read WARC file at {warc_child_path}.") - continue - return index - - -def _iter_relative_path_records(relative_path: Path) -> Iterator[tuple]: - print("Finished reading archived URLs.") - archived_query_urls_index = _read_jsonl(relative_path, - "archived-query-urls") - print("Finished reading archived query URLs.") - archived_raw_serps_index = _index_warc(relative_path, "archived-raw-serps") - print("Finished reading archived raw SERPs (pointers).") - - for record_id, archived_url in archived_query_urls_index.items(): - archived_query_url = archived_query_urls_index[record_id] - archived_raw_serp_location = archived_raw_serps_index.get(record_id, - None) - - yield relative_path, record_id, archived_query_url, \ - archived_raw_serp_location - - - -def _record_to_query(relative_path_record: tuple) -> Optional[str]: - relative_path, record_id, archived_query_url, archived_raw_serp_location \ - = relative_path_record - - if archived_raw_serp_location is not None: - print("SERP was already downloaded.") - - url = archived_query_url["url"] - domain = urlparse(url).hostname - timestamp = archived_query_url["timestamp"] - wayback_timestamp = \ - datetime.fromtimestamp(timestamp).strftime("%Y%m%d%H%M%S") - wayback_raw_url = \ - f"https://web.archive.org/web/{wayback_timestamp}id_/{url}" - - - task = { - "download_url": wayback_raw_url, - "output_path": str(_GLOBAL_DATA_DIR / "archived-raw-serps" / relative_path / "*.warc.gz"), - } - return dumps(task) - - - -def main(): - session 
= SparkSession.builder.getOrCreate() - - sc = session.sparkContext - - relative_paths = [ - path - .relative_to(_DATA_DIR / "archived-urls") - .with_name(path.name[:-len(".jsonl.gz")]) - for path in _DATA_DIR.glob("archived-urls/*/*/*.jsonl.gz") - ] - print(f"Found {len(relative_paths)} paths.") - shuffle(relative_paths) - print(f"Selected {len(relative_paths)} paths " - f"for finding downloadable SERP URLs.") - - print("Export downloadable SERP URL list at archive-query-log-urls/.") - sc.parallelize(relative_paths, 100) \ - .flatMap(_iter_relative_path_records) \ - .map(_record_to_query) \ - .filter(lambda json: json is not None) \ - .repartition(1) \ - .saveAsTextFile(f"archive-query-log-urls/", - compressionCodecClass= - "org.apache.hadoop.io.compress.GzipCodec") - - print("Done.") - - -if __name__ == "__main__": - main() diff --git a/scripts/create_url_list.sh b/scripts/create_url_list.sh deleted file mode 100755 index 133ba7b1..00000000 --- a/scripts/create_url_list.sh +++ /dev/null @@ -1,17 +0,0 @@ -#!/usr/bin/env bash - -hdfs dfs -rm -r -f archive-query-log-urls/ - -spark-submit \ - --conf spark.yarn.submit.waitAppCompletion=false \ - --name archive-query-log-urls \ - --master yarn \ - --deploy-mode cluster \ - --num-executors 4 \ - --executor-cores 8 \ - --executor-memory 4g \ - --driver-memory 32g \ - --conf spark.executor.memoryOverhead=8000 \ - --conf spark.driver.memoryOverhead=10000 \ - --conf spark.network.timeout=600 \ - create_url_list.py diff --git a/slurm-cli b/slurm-cli deleted file mode 100755 index e2269849..00000000 --- a/slurm-cli +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -e - -srun \ - --cpus-per-task 10 \ - --ntasks-per-node 1 \ - --mem 100G \ - --container-writable \ - --container-image python:3.10 \ - --container-name web-archive-query-log-python-3.10 \ - --container-mounts "$PWD":/workspace \ - --chdir "$PWD" \ - --pty \ - sh -c " - cd /workspace - venv/bin/python -m archive_query_log $* - " diff --git a/slurm-setup b/slurm-setup 
deleted file mode 100755 index 77ca5913..00000000 --- a/slurm-setup +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -e - -srun \ - --cpus-per-task 4 \ - --ntasks-per-node 1 \ - --mem 4G \ - --container-writable \ - --container-image python:3.10 \ - --container-name web-archive-query-log-python-3.10 \ - --container-mounts "$PWD":/workspace \ - --container-remap-root \ - --chdir "$PWD" \ - --pty \ - sh -c " - cd /workspace && - apt-get install -y protobuf-compiler && - pip install --upgrade pip cython && - (test -d venv/ || python3.10 -m venv venv/) && - venv/bin/python -m pip install -e . - "