diff --git a/archive_query_log/cli/parsers.py b/archive_query_log/cli/parsers.py index b0107929..ff457594 100644 --- a/archive_query_log/cli/parsers.py +++ b/archive_query_log/cli/parsers.py @@ -307,4 +307,3 @@ def warc_query_import(config: Config, services_path: Path) -> None: from archive_query_log.imports.yaml import import_warc_query_parsers WarcQueryParser.init(using=config.es.client) import_warc_query_parsers(config, services_path) - diff --git a/archive_query_log/imports/yaml.py b/archive_query_log/imports/yaml.py index 01b11d97..1fbc45c0 100644 --- a/archive_query_log/imports/yaml.py +++ b/archive_query_log/imports/yaml.py @@ -5,7 +5,6 @@ from click import echo from click import prompt -from cssselect import HTMLTranslator from diskcache import Index from elasticsearch_dsl.query import Terms from tqdm.auto import tqdm @@ -339,8 +338,6 @@ def import_warc_query_parsers(config: Config, services_path: Path) -> None: services_list: Sequence[dict] = safe_load(file) echo(f"Found {len(services_list)} service definitions.") - translator = HTMLTranslator() - services: Iterable[dict] = services_list # noinspection PyTypeChecker services = tqdm( @@ -348,7 +345,7 @@ def import_warc_query_parsers(config: Config, services_path: Path) -> None: desc="Import parsers for providers", unit="provider", ) - for i, service in enumerate(services): + for service in services: if ("domains" not in service or "interpreted_query_parsers" not in service): continue diff --git a/archive_query_log/monitoring/home.py b/archive_query_log/monitoring/home.py index 68e5f01b..2fadecc5 100644 --- a/archive_query_log/monitoring/home.py +++ b/archive_query_log/monitoring/home.py @@ -30,8 +30,8 @@ class Progress(NamedTuple): DocumentType = Type[BaseDocument] _statistics_cache: dict[ - tuple[DocumentType, - tuple[str, ...]], Statistics, + tuple[DocumentType, tuple[str, ...]], + Statistics, ] = ExpiringDict( max_len=100, max_age_seconds=30, diff --git a/archive_query_log/parsers/xml.py b/archive_query_log/parsers/xml.py index 4d7835e7..6c1c6d6e 100644 --- a/archive_query_log/parsers/xml.py +++ b/archive_query_log/parsers/xml.py @@ -3,8 +3,10 @@ from cssselect import GenericTranslator from cssselect.parser import parse as cssselect_parse +# pylint: disable=no-name-in-module from lxml.etree import parse as etree_parse, XMLParser, HTMLParser # noinspection PyProtectedMember +# pylint: disable=no-name-in-module from lxml.etree import _ElementTree from warcio.recordloader import ArcWarcRecord @@ -20,6 +22,7 @@ def parse_xml_tree(record: ArcWarcRecord) -> _ElementTree | None: warn(UserWarning("No MIME type given.")) return None mime_type = mime_type.split(";", maxsplit=1)[0] + parser: XMLParser | HTMLParser if mime_type == "text/xml": parser = XMLParser() elif mime_type == "text/html": @@ -42,7 +45,7 @@ def get_xml_xpath_non_empty_string( raise ValueError( f"XPath {xpath} did not return a list, was: {type(results)}") if not all(isinstance(result, str) for result in results): - types = ", ".join(type(result) for result in results) + types = ", ".join(str(type(result)) for result in results) raise ValueError( f"XPath {xpath} did not return a list of strings, found: {types}") results = (result.strip() for result in results) @@ -61,7 +64,7 @@ def get_xml_xpath_non_empty_string( f"XPath {xpath} did not return a string, was: {type(result)}") -translator = GenericTranslator() +_translator = GenericTranslator() def text_xpath_from_css_selector( @@ -78,7 +81,7 @@ def text_xpath_from_css_selector( selectors = cssselect_parse(css_selector) xpaths = ( - "//" + translator.selector_to_xpath( + "//" + _translator.selector_to_xpath( selector, prefix="", translate_pseudo_elements=True, diff --git a/pyproject.toml b/pyproject.toml index c286b61f..43691b58 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -125,6 +125,9 @@ disable = [ "logging-fstring-interpolation" ] +[tool.bandit] +skips = ["B320", "B410"] + [tool.bandit.assert_used] skips = ["**/test_*.py"]