def pull_through_package_simple(self, package, path, remote):
    """
    Gets the package's simple page from remote.

    The remote's include/exclude filters are applied both to the project
    itself and to every release listed on the upstream page.

    Args:
        package: name of the project whose simple page is requested.
        path: distribution base path, used to build the redirect links.
        remote: the remote to pull the upstream simple page from.

    Returns:
        A Response wrapping the rendered simple detail page.

    Raises:
        Http404: if the project is rejected by the remote's filters or the
            upstream page could not be downloaded.
    """

    def parse_package(release_package):
        # Drop the query and fragment from the upstream URL so that only
        # scheme/netloc/path end up in the ``redirect`` parameter.
        parsed = urlparse(release_package.url)
        stripped_url = urlunsplit(chain(parsed[:3], ("", "")))
        redirect_path = f'{path}/{release_package.filename}?redirect={stripped_url}'
        d_url = urljoin(self.base_content_url, redirect_path)
        return release_package.filename, d_url, release_package.digests.get("sha256", "")

    rfilter = get_remote_package_filter(remote)
    if not rfilter.filter_project(package):
        # Http404 is an exception class: it must be raised. Returning it
        # hands the framework an exception instance instead of a response.
        raise Http404(f"{package} does not exist.")

    url = remote.get_remote_artifact_url(f'simple/{package}/')
    # Rebind instead of appending in place: ``headers`` may be None, and
    # the existing list object should not be mutated.
    remote.headers = [*(remote.headers or []), {"Accept": ACCEPT_JSON_PREFERRED}]
    downloader = remote.get_downloader(url=url, max_retries=1)
    try:
        d = downloader.fetch()
    except Exception as exc:
        # Any download failure is reported to the client as "not found";
        # keep the cause in the log so it is not silently lost.
        logging.getLogger(__name__).info("Failed to fetch %s: %s", url, exc)
        raise Http404(f"Could not find {package}.") from exc

    # Compare only the media type: the header may carry parameters
    # (e.g. "; charset=utf-8") or differ in letter case.
    content_type = (d.headers or {}).get("content-type", "")
    media_type = content_type.partition(";")[0].strip().lower()
    # Relative file links on the page are relative to the page itself, so
    # the fetched URL (not the remote's root URL) is the base for joining.
    if media_type == "application/vnd.pypi.simple.v1+json":
        with open(d.path, "rb") as fp:
            page = ProjectPage.from_json_data(json.load(fp), base_url=url)
    else:
        with open(d.path, "rb") as fp:
            page = ProjectPage.from_html(package, fp.read(), base_url=url)

    packages = [
        parse_package(p) for p in page.packages if rfilter.filter_release(package, p.version)
    ]
    return Response(write_simple_detail(package, packages))
class PackageIncludeFilter:
    """A special class to help filter Package's based on a remote's include/exclude"""

    def __init__(self, remote):
        """Parse the remote's ``includes`` and ``excludes`` into lookup tables."""
        self.remote = remote
        self._filter_includes = self._parse_packages(self.remote.includes)
        self._filter_excludes = self._parse_packages(self.remote.excludes)

    def _parse_packages(self, packages):
        """
        Group requirement strings by canonical project name.

        Returns a two-level mapping: ``"range"`` holds requirements that carry
        a version specifier, ``"full"`` holds bare project names. Each maps
        canonical project name -> list of Requirement objects.
        """
        config = defaultdict(lambda: defaultdict(list))
        for value in packages or ():
            requirement = Requirement(value)
            requirement.name = canonicalize_name(requirement.name)
            if requirement.specifier:
                # Let pre-release versions match the specifier as well.
                requirement.specifier.prereleases = True
                config["range"][requirement.name].append(requirement)
            else:
                config["full"][requirement.name].append(requirement)
        return config

    def filter_project(self, project_name):
        """Return true/false if project_name would be allowed through remote's filters."""
        project_name = canonicalize_name(project_name)
        include_full = self._filter_includes.get("full", {})
        include_range = self._filter_includes.get("range", {})
        # With any include configured, the project has to be named by one of
        # them. Membership is tested on the mappings directly:
        # set(*a.keys(), *b.keys()) raises TypeError for two or more names and
        # splits a single name into its characters.
        if (include_full or include_range) and not (
            project_name in include_full or project_name in include_range
        ):
            return False

        # Only specifier-less excludes remove a whole project; ranged excludes
        # are evaluated per release in filter_release().
        return project_name not in self._filter_excludes.get("full", {})

    def filter_release(self, project_name, version):
        """Returns true/false if release would be allowed through remote's filters."""
        project_name = canonicalize_name(project_name)
        if not self.filter_project(project_name):
            return False

        try:
            version = parse(version)
        except (InvalidVersion, TypeError):
            # TypeError covers a missing version (None), which parse() does
            # not report as InvalidVersion.
            return False

        # When ranged includes exist for the project, at least one must match.
        ranged_includes = self._filter_includes.get("range", {}).get(project_name)
        if ranged_includes and not any(version in req.specifier for req in ranged_includes):
            return False

        # Any matching ranged exclude rejects the release.
        ranged_excludes = self._filter_excludes.get("range", {}).get(project_name, ())
        return not any(version in req.specifier for req in ranged_excludes)
_remote_filters = {}


def get_remote_package_filter(remote):
    """
    Return the PackageIncludeFilter for ``remote``, reusing a cached one.

    Filters are cached per ``remote.pulp_id`` together with the remote's
    ``pulp_last_updated`` value; a cached filter is reused only while that
    value still equals the remote's current one, otherwise a new filter is
    built and stored in its place.
    """
    cached = _remote_filters.get(remote.pulp_id)
    if cached and cached[0] == remote.pulp_last_updated:
        return cached[1]

    fresh = PackageIncludeFilter(remote)
    _remote_filters[remote.pulp_id] = (remote.pulp_last_updated, fresh)
    return fresh