Skip to content

Commit

Permalink
Merge pull request #8 from nlnwa/feat/simplify-indexing
Browse files Browse the repository at this point in the history
feat: simpler indexing
  • Loading branch information
trym-b authored Aug 16, 2023
2 parents 8cd954b + e0ccc29 commit 2f2a0a9
Show file tree
Hide file tree
Showing 2 changed files with 168 additions and 3 deletions.
12 changes: 9 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,12 @@
# http://localhost:8983/solr/#/

# If you have some WARC files you want to index, you can index them with the following commands:
# WARC_FILES=$(find /warc-collection/ -type f)
# ./unpacked-bundle/solrwayback_package_$SOLRWAYBACK_VERSION/indexing/warc-indexer.sh $WARC_FILES
# python3 index_it.py --collection <collection> --number-of-threads <threads> \
# --warc-file-directory /warc-collection/<path/to/collection>
# <path/to/collection> must contain WARC files that all belong to the same collection
# Note: all of the WARC files in the directory will be indexed with the given collection
# Note: neither solr nor tomcat should be running when you call this command
# Note: index_it.py will stop both solr and tomcat on exit

FROM ubuntu:22.04

Expand All @@ -28,7 +32,7 @@ ENV APACHE_TOMCAT_VERSION 8.5.60
ENV SOLR_VERSION 7.7.3

RUN apt-get update --assume-yes --quiet
RUN apt-get install wget unzip --assume-yes --quiet
RUN apt-get install wget unzip python3 --assume-yes --quiet

# Install dependencies
RUN apt-get install default-jre lsof curl --assume-yes --quiet
Expand Down Expand Up @@ -59,3 +63,5 @@ RUN unpacked-bundle/solrwayback_package_${SOLRWAYBACK_VERSION}/apache-tomcat-${A
# Verify that solr works
RUN unpacked-bundle/solrwayback_package_${SOLRWAYBACK_VERSION}/solr-${SOLR_VERSION}/bin/solr start
RUN unpacked-bundle/solrwayback_package_${SOLRWAYBACK_VERSION}/solr-${SOLR_VERSION}/bin/solr stop -all

COPY index_it.py .
159 changes: 159 additions & 0 deletions index_it.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
from argparse import ArgumentParser, Namespace
from contextlib import contextmanager
from pathlib import Path
from subprocess import check_call
from typing import Generator


def _args() -> Namespace:
    """Parse the command-line options for an indexing run.

    All three options are mandatory:

    * ``--collection``: name of the collection (``str``).
    * ``--warc-file-directory``: directory holding the collection's WARC
      files (converted to ``Path``).
    * ``--number-of-threads``: thread count for indexing (``int``).

    Returns:
        The parsed arguments, read from ``sys.argv``.
    """
    # Declarative table of (flag, add_argument keyword arguments).
    option_table = (
        (
            "--collection",
            {"required": True, "type": str, "help": "Name of collection"},
        ),
        (
            "--warc-file-directory",
            {
                "required": True,
                "type": Path,
                "help": "Directory containing WARC files belonging to a collection",
            },
        ),
        (
            "--number-of-threads",
            {
                "required": True,
                "type": int,
                "help": "Number of threads to use for indexing",
            },
        ),
    )

    cli = ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli.parse_args()


def _main() -> None:
    """Entry point: start Solr and Tomcat, then index the requested WARC files.

    The component versions are fixed here and select the directories inside
    the unpacked SolrWayback bundle. Solr is brought up first, Tomcat second,
    and indexing runs while both context managers are active.
    """
    solrwayback_version = "4.4.2"
    solr_version = "7.7.3"
    apache_tomcat_version = "8.5.60"

    cli_options = _args()

    with _solr(solr_version=solr_version, solrwayback_version=solrwayback_version):
        with _tomcat(
            apache_tomcat_version=apache_tomcat_version,
            solrwayback_version=solrwayback_version,
        ):
            _index(
                cli_options.collection,
                number_of_threads=cli_options.number_of_threads,
                warc_file_directory=cli_options.warc_file_directory,
                solrwayback_version=solrwayback_version,
            )


@contextmanager
def _solr(solr_version: str, solrwayback_version: str) -> Generator[None, None, None]:
    """Context manager that keeps Solr running for the duration of the block.

    Solr is started on entry and stopped on exit. The stop runs in a
    ``finally`` clause so that Solr is also shut down when the managed block
    raises (for example when the indexer exits with a non-zero status);
    without it a failed run would leave Solr running.

    Args:
        solr_version: Solr version, used to locate the Solr directory.
        solrwayback_version: SolrWayback bundle version, used to locate the
            unpacked bundle.

    Raises:
        Whatever ``_start_solr`` / ``_stop_solr`` raise; exceptions from the
        managed block propagate after Solr has been stopped.
    """
    _start_solr(solr_version=solr_version, solrwayback_version=solrwayback_version)
    try:
        yield
    finally:
        # Always stop Solr, even if the body of the ``with`` block failed.
        _stop_solr(solr_version=solr_version, solrwayback_version=solrwayback_version)


@contextmanager
def _tomcat(
    apache_tomcat_version: str, solrwayback_version: str
) -> Generator[None, None, None]:
    """Context manager that keeps Tomcat running for the duration of the block.

    Tomcat is started on entry and shut down on exit. The shutdown runs in a
    ``finally`` clause so that Tomcat is also stopped when the managed block
    raises; without it a failed indexing run would leave Tomcat running.

    Args:
        apache_tomcat_version: Tomcat version, used to locate the Tomcat
            directory.
        solrwayback_version: SolrWayback bundle version, used to locate the
            unpacked bundle.

    Raises:
        Whatever ``_start_tomcat`` / ``_stop_tomcat`` raise; exceptions from
        the managed block propagate after Tomcat has been stopped.
    """
    _start_tomcat(
        apache_tomcat_version=apache_tomcat_version,
        solrwayback_version=solrwayback_version,
    )
    try:
        yield
    finally:
        # Always shut Tomcat down, even if the body of the ``with`` block failed.
        _stop_tomcat(
            apache_tomcat_version=apache_tomcat_version,
            solrwayback_version=solrwayback_version,
        )


def _stop_tomcat(apache_tomcat_version: str, solrwayback_version: str) -> None:
    """Run Tomcat's ``shutdown.sh`` from the unpacked SolrWayback bundle.

    The bundle is located relative to the current working directory.

    Args:
        apache_tomcat_version: Version suffix of the ``apache-tomcat-*`` directory.
        solrwayback_version: Version suffix of the ``solrwayback_package_*`` directory.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
    """
    bundle_dir = (
        Path().resolve() / "unpacked-bundle" / f"solrwayback_package_{solrwayback_version}"
    )
    shutdown_script = bundle_dir.joinpath(
        f"apache-tomcat-{apache_tomcat_version}", "bin", "shutdown.sh"
    )
    command = [str(shutdown_script)]
    check_call(command)


def _stop_solr(solr_version: str, solrwayback_version: str) -> None:
    """Invoke ``bin/solr stop -all`` from the unpacked SolrWayback bundle.

    The bundle is located relative to the current working directory.

    Args:
        solr_version: Version suffix of the ``solr-*`` directory.
        solrwayback_version: Version suffix of the ``solrwayback_package_*`` directory.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    path_segments = (
        "unpacked-bundle",
        f"solrwayback_package_{solrwayback_version}",
        f"solr-{solr_version}",
        "bin",
        "solr",
    )
    solr_executable = Path().resolve().joinpath(*path_segments)
    check_call([str(solr_executable), "stop", "-all"])


def _start_solr(solr_version: str, solrwayback_version: str) -> None:
    """Invoke ``bin/solr start`` from the unpacked SolrWayback bundle.

    The bundle is located relative to the current working directory.

    Args:
        solr_version: Version suffix of the ``solr-*`` directory.
        solrwayback_version: Version suffix of the ``solrwayback_package_*`` directory.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero status.
    """
    working_dir = Path().resolve()
    package_dir = working_dir / "unpacked-bundle" / f"solrwayback_package_{solrwayback_version}"
    solr_executable = package_dir / f"solr-{solr_version}" / "bin" / "solr"

    start_command = [str(solr_executable), "start"]
    check_call(start_command)


def _start_tomcat(apache_tomcat_version: str, solrwayback_version: str) -> None:
    """Run Tomcat's ``startup.sh`` from the unpacked SolrWayback bundle.

    The bundle is located relative to the current working directory.

    Args:
        apache_tomcat_version: Version suffix of the ``apache-tomcat-*`` directory.
        solrwayback_version: Version suffix of the ``solrwayback_package_*`` directory.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero status.
    """
    tomcat_bin_dir = Path().resolve().joinpath(
        "unpacked-bundle",
        f"solrwayback_package_{solrwayback_version}",
        f"apache-tomcat-{apache_tomcat_version}",
        "bin",
    )
    startup_script = tomcat_bin_dir / "startup.sh"
    check_call([str(startup_script)])


def _index(
    collection: str,
    number_of_threads: int,
    warc_file_directory: Path,
    solrwayback_version: str,
) -> None:
    """Index every ``*.warc.gz`` file under a directory with ``warc-indexer.sh``.

    The directory is searched recursively and all matching files are passed
    to the indexer script from the unpacked SolrWayback bundle (located
    relative to the current working directory) in a single invocation.

    Args:
        collection: Collection name, forwarded as ``--collection=<name>`` via
            the ``INDEXER_CUSTOM`` environment variable.
        number_of_threads: Thread count, forwarded via ``THREADS``.
        warc_file_directory: Directory searched recursively for ``*.warc.gz``.
        solrwayback_version: Version suffix of the ``solrwayback_package_*``
            directory.

    Raises:
        subprocess.CalledProcessError: If the indexer exits with a non-zero
            status.
    """
    # Function-scope import: the module header does not import ``os``.
    import os

    home_dir = Path().resolve()

    # Sorted so the argument order does not depend on directory listing order.
    all_warc_files = sorted(warc_file_directory.rglob("*.warc.gz"))
    warc_indexer_path = (
        home_dir
        / "unpacked-bundle"
        / f"solrwayback_package_{solrwayback_version}"
        / "indexing"
        / "warc-indexer.sh"
    )

    # ``env=`` replaces the child's environment wholesale, so start from the
    # current environment (PATH, JAVA_HOME, ...) and layer the indexer
    # settings on top instead of passing only these three variables.
    indexer_env = {
        **os.environ,
        "THREADS": str(number_of_threads),
        "INDEXER_CUSTOM": f"--collection={collection}",
        "JAVA_TOOL_OPTIONS": "-Dfile.encoding=UTF8",  # Remove when `solrwayback` version greater than 4.4.2 is out.
    }

    check_call(
        [str(warc_indexer_path), *map(str, all_warc_files)],
        env=indexer_env,
    )


# Run the indexing workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    _main()

0 comments on commit 2f2a0a9

Please sign in to comment.