Skip to content

Commit

Permalink
Merge pull request #11259 from sbidoul/drop-html5lib
Browse files: browse the repository at this point in the history
Drop html5lib
  • Loading branch information
sbidoul authored Jul 16, 2022
2 parents 6d02fe2 + d3a318f commit 909be0d
Show file tree
Hide file tree
Showing 53 changed files with 22 additions and 13,420 deletions.
1 change: 1 addition & 0 deletions news/10825.removal.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove the ``html5lib`` deprecated feature flag.
1 change: 1 addition & 0 deletions news/html5lib.vendor.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Remove vendored html5lib.
1 change: 0 additions & 1 deletion src/pip/_internal/cli/cmdoptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1013,7 +1013,6 @@ def check_list_path_option(options: Values) -> None:
default=[],
choices=[
"legacy-resolver",
"html5lib",
],
help=("Enable deprecated functionality, that will be removed in the future."),
)
Expand Down
1 change: 0 additions & 1 deletion src/pip/_internal/cli/req_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -499,5 +499,4 @@ def _build_package_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)
1 change: 0 additions & 1 deletion src/pip/_internal/commands/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ def _build_package_finder(
link_collector=link_collector,
selection_prefs=selection_prefs,
target_python=target_python,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)

def get_available_package_versions(self, options: Values, args: List[Any]) -> None:
Expand Down
1 change: 0 additions & 1 deletion src/pip/_internal/commands/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,6 @@ def _build_package_finder(
return PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib="html5lib" in options.deprecated_features_enabled,
)

def run(self, options: Values, args: List[str]) -> int:
Expand Down
72 changes: 8 additions & 64 deletions src/pip/_internal/index/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
Union,
)

from pip._vendor import html5lib, requests
from pip._vendor import requests
from pip._vendor.requests import Response
from pip._vendor.requests.exceptions import RetryError, SSLError

Expand Down Expand Up @@ -191,27 +191,6 @@ def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
return None


def _determine_base_url(document: HTMLElement, page_url: str) -> str:
"""Determine the HTML document's base URL.
This looks for a ``<base>`` tag in the HTML document. If present, its href
attribute denotes the base URL of anchor tags in the document. If there is
no such tag (or if it does not have a valid href attribute), the HTML
file's URL is used as the base URL.
:param document: An HTML document representation. The current
implementation expects the result of ``html5lib.parse()``.
:param page_url: The URL of the HTML document.
TODO: Remove when `html5lib` is dropped.
"""
for base in document.findall(".//base"):
href = base.get("href")
if href is not None:
return href
return page_url


def _clean_url_path_part(part: str) -> str:
"""
Clean a "part" of a URL path (i.e. after splitting on "@" characters).
Expand Down Expand Up @@ -313,9 +292,7 @@ def __hash__(self) -> int:


class ParseLinks(Protocol):
def __call__(
self, page: "IndexContent", use_deprecated_html5lib: bool
) -> Iterable[Link]:
def __call__(self, page: "IndexContent") -> Iterable[Link]:
...


Expand All @@ -327,49 +304,20 @@ def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
"""

@functools.lru_cache(maxsize=None)
def wrapper(
cacheable_page: CacheablePageContent, use_deprecated_html5lib: bool
) -> List[Link]:
return list(fn(cacheable_page.page, use_deprecated_html5lib))
def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
return list(fn(cacheable_page.page))

@functools.wraps(fn)
def wrapper_wrapper(
page: "IndexContent", use_deprecated_html5lib: bool
) -> List[Link]:
def wrapper_wrapper(page: "IndexContent") -> List[Link]:
if page.cache_link_parsing:
return wrapper(CacheablePageContent(page), use_deprecated_html5lib)
return list(fn(page, use_deprecated_html5lib))
return wrapper(CacheablePageContent(page))
return list(fn(page))

return wrapper_wrapper


def _parse_links_html5lib(page: "IndexContent") -> Iterable[Link]:
"""
Parse an HTML document, and yield its anchor elements as Link objects.
TODO: Remove when `html5lib` is dropped.
"""
document = html5lib.parse(
page.content,
transport_encoding=page.encoding,
namespaceHTMLElements=False,
)

url = page.url
base_url = _determine_base_url(document, url)
for anchor in document.findall(".//a"):
link = _create_link_from_element(
anchor.attrib,
page_url=url,
base_url=base_url,
)
if link is None:
continue
yield link


@with_cached_index_content
def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable[Link]:
def parse_links(page: "IndexContent") -> Iterable[Link]:
"""
Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
"""
Expand Down Expand Up @@ -398,10 +346,6 @@ def parse_links(page: "IndexContent", use_deprecated_html5lib: bool) -> Iterable
hashes=file.get("hashes", {}),
)

if use_deprecated_html5lib:
yield from _parse_links_html5lib(page)
return

parser = HTMLLinkParser(page.url)
encoding = page.encoding or "utf-8"
parser.feed(page.content.decode(encoding))
Expand Down
7 changes: 1 addition & 6 deletions src/pip/_internal/index/package_finder.py
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,6 @@ def __init__(
link_collector: LinkCollector,
target_python: TargetPython,
allow_yanked: bool,
use_deprecated_html5lib: bool,
format_control: Optional[FormatControl] = None,
candidate_prefs: Optional[CandidatePreferences] = None,
ignore_requires_python: Optional[bool] = None,
Expand All @@ -623,7 +622,6 @@ def __init__(
self._ignore_requires_python = ignore_requires_python
self._link_collector = link_collector
self._target_python = target_python
self._use_deprecated_html5lib = use_deprecated_html5lib

self.format_control = format_control

Expand All @@ -640,8 +638,6 @@ def create(
link_collector: LinkCollector,
selection_prefs: SelectionPreferences,
target_python: Optional[TargetPython] = None,
*,
use_deprecated_html5lib: bool,
) -> "PackageFinder":
"""Create a PackageFinder.
Expand All @@ -666,7 +662,6 @@ def create(
allow_yanked=selection_prefs.allow_yanked,
format_control=selection_prefs.format_control,
ignore_requires_python=selection_prefs.ignore_requires_python,
use_deprecated_html5lib=use_deprecated_html5lib,
)

@property
Expand Down Expand Up @@ -796,7 +791,7 @@ def process_project_url(
if index_response is None:
return []

page_links = list(parse_links(index_response, self._use_deprecated_html5lib))
page_links = list(parse_links(index_response))

with indent_log():
package_links = self.evaluate_links(
Expand Down
1 change: 0 additions & 1 deletion src/pip/_internal/self_outdated_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ def _get_current_remote_pip_version(
finder = PackageFinder.create(
link_collector=link_collector,
selection_prefs=selection_prefs,
use_deprecated_html5lib=("html5lib" in options.deprecated_features_enabled),
)
best_candidate = finder.find_best_candidate("pip").best_candidate
if best_candidate is None:
Expand Down
3 changes: 0 additions & 3 deletions src/pip/_vendor/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,6 @@ Modifications
rather than ``appdirs``.
* ``packaging`` has been modified to import its dependencies from
``pip._vendor``.
* ``html5lib`` has been modified to import six from ``pip._vendor``, to prefer
importing from ``collections.abc`` instead of ``collections`` and does not
import ``xml.etree.cElementTree`` on Python 3.
* ``CacheControl`` has been modified to import its dependencies from
``pip._vendor``.
* ``requests`` has been modified to import its other dependencies from
Expand Down
1 change: 0 additions & 1 deletion src/pip/_vendor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ def vendored(modulename):
vendored("colorama")
vendored("distlib")
vendored("distro")
vendored("html5lib")
vendored("six")
vendored("six.moves")
vendored("six.moves.urllib")
Expand Down
1 change: 0 additions & 1 deletion src/pip/_vendor/html5lib.pyi

This file was deleted.

20 changes: 0 additions & 20 deletions src/pip/_vendor/html5lib/LICENSE

This file was deleted.

35 changes: 0 additions & 35 deletions src/pip/_vendor/html5lib/__init__.py

This file was deleted.

Loading

0 comments on commit 909be0d

Please sign in to comment.