diff --git a/caching/frontend/crawler.py b/caching/frontend/crawler.py index 903d32893..bbb891b1d 100644 --- a/caching/frontend/crawler.py +++ b/caching/frontend/crawler.py @@ -1,7 +1,8 @@ import logging -from enum import Enum +from collections.abc import Iterator import requests +from defusedxml import ElementTree from rest_framework.response import Response from caching.common.geographies_crawler import ( @@ -15,29 +16,16 @@ from cms.topic.models import TopicPage DEFAULT_REQUEST_TIMEOUT = 60 +PAGE_XML_LOCATOR = ".//ns:loc" logger = logging.getLogger(__name__) -class CMSPageTypes(Enum): - home_page = "home.HomePage" - topic_page = "topic.TopicPage" - common_page = "common.CommonPage" - whats_new_parent_page = "whats_new.WhatsNewParentPage" - whats_new_child_entry = "whats_new.WhatsNewChildEntry" - metrics_documentation_parent_page = ( - "metrics_documentation.MetricsDocumentationParentPage" - ) - metrics_documentation_child_entry = ( - "metrics_documentation.MetricsDocumentationChildEntry" - ) - - class FrontEndCrawler: """This is used to traverse the front end and send GET requests to all relevant pages Notes: - Under the hood, this uses the `InternalAPIClient` to get a list of all pages from the CMS. + Under the hood, this gathers all the URLs in the front end from the associated sitemap.xml From this point, a simple GET request is made to each page. The CDN auth key for the rule on the front end should also be provided. If not 403 Forbidden errors will be returned and the cache will not be hydrated. 
@@ -64,23 +52,49 @@ def __init__( or GeographiesAPICrawler(internal_api_client=self._internal_api_client) ) - @classmethod - def create_crawler_for_cache_refresh( - cls, *, frontend_base_url: str, cdn_auth_key: str - ) -> "FrontEndCrawler": - return cls(frontend_base_url=frontend_base_url, cdn_auth_key=cdn_auth_key) - - # Private API/headless CMS API + @property + def sitemap_url(self) -> str: + return self._url_builder.build_url_for_sitemap() + + def _hit_sitemap_url(self) -> Response: + url: str = self.sitemap_url + return requests.get(url=url, timeout=DEFAULT_REQUEST_TIMEOUT) + + def _parse_sitemap(self): + response: Response = self._hit_sitemap_url() + xml_response_data: str = response.content.decode("utf-8") + return ElementTree.fromstring(text=xml_response_data) + + def _traverse_sitemap(self) -> Iterator[str]: + sitemap_root = self._parse_sitemap() + namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"} + return ( + loc.text + for loc in sitemap_root.findall(PAGE_XML_LOCATOR, namespaces=namespace) + ) - def get_all_page_items_from_api(self) -> list[dict]: - """Hits the `pages/` endpoint to list all page items in the CMS + def process_all_page_urls(self) -> None: + """Traverse the frontend and make a GET request to all relevant pages Returns: - List of page items information + None """ - response: Response = self._internal_api_client.hit_pages_list_endpoint() - return response.json()["items"] + logger.info("Traversing sitemap for URLs") + + urls: Iterator[str] = self._traverse_sitemap() + + for url in urls: + logger.info("Processing `%s`", url) + self.hit_frontend_page(url=url) + + logger.info("Finished processing all URLs for the frontend") + + @classmethod + def create_crawler_for_cache_refresh( + cls, *, frontend_base_url: str, cdn_auth_key: str + ) -> "FrontEndCrawler": + return cls(frontend_base_url=frontend_base_url, cdn_auth_key=cdn_auth_key) # Frontend requests @@ -110,80 +124,6 @@ def hit_frontend_page( ) logger.info("Processed `%s` for 
params: %s", url, params) - def process_page(self, *, page_item: dict) -> None: - """Hit the URL for the corresponding `page_item` - - Notes: - Only the following page types are supported: - - "HomePage" - - "TopicPage" - - "CommonPage" - - "WhatsNewParentPage" - - "WhatsNewChildEntry" - - "MetricsDocumentationParentPage" - - "MetricsDocumentationChildEntry" - - Args: - page_item: The individual page information - taken from the `pages/` list response - - Returns: - None - - """ - page_type: str = page_item["type"] - - match page_type: - case CMSPageTypes.home_page.value: - url = self._url_builder.build_url_for_home_page() - case CMSPageTypes.topic_page.value: - url = self._url_builder.build_url_for_topic_page(slug=page_item["slug"]) - case CMSPageTypes.common_page.value: - url = self._url_builder.build_url_for_common_page( - slug=page_item["slug"] - ) - case CMSPageTypes.whats_new_parent_page.value: - url = self._url_builder.build_url_for_whats_new_parent_page() - case CMSPageTypes.whats_new_child_entry.value: - url = self._url_builder.build_url_for_whats_new_child_entry( - slug=page_item["slug"] - ) - case CMSPageTypes.metrics_documentation_parent_page.value: - url = ( - self._url_builder.build_url_for_metrics_documentation_parent_page() - ) - case CMSPageTypes.metrics_documentation_child_entry.value: - url = self._url_builder.build_url_for_metrics_documentation_child_entry( - slug=page_item["slug"] - ) - case _: - # Pass over for root page objects - return - - self.hit_frontend_page(url=url) - - def process_all_pages(self) -> None: - """Traverse the frontend and make a GET request to all relevant pages - - Returns: - None - - """ - logger.info("Getting all pages from Headless CMS API") - all_page_items: list[dict] = self.get_all_page_items_from_api() - - for page_item in all_page_items: - self.process_page(page_item=page_item["meta"]) - - self._hit_ancillary_pages() - logger.info("Finished processing all regular pages for the frontend") - - def 
_hit_ancillary_pages(self): - self.hit_frontend_page( - url=self._url_builder.build_url_for_feedback_confirmation_page() - ) - self.hit_frontend_page(url=self._url_builder.build_url_for_sitemap()) - def process_geography_page_combination( self, geography_data: GeographyData, page: TopicPage ) -> None: @@ -199,7 +139,7 @@ def process_geography_page_combination( None """ - url: str = self._url_builder.build_url_for_topic_page(slug=page.slug) + url: str = page.full_url params: dict[str, str] = ( self._url_builder.build_query_params_for_area_selector_page( geography_type_name=geography_data.geography_type_name, diff --git a/caching/frontend/handlers.py b/caching/frontend/handlers.py index efbd0c67d..f7fb28b2c 100644 --- a/caching/frontend/handlers.py +++ b/caching/frontend/handlers.py @@ -41,5 +41,5 @@ def crawl_front_end() -> None: frontend_crawler = FrontEndCrawler.create_crawler_for_cache_refresh( frontend_base_url=frontend_base_url, cdn_auth_key=cdn_auth_key ) - frontend_crawler.process_all_pages() + frontend_crawler.process_all_page_urls() frontend_crawler.process_all_valid_area_selector_pages() diff --git a/caching/frontend/urls.py b/caching/frontend/urls.py index 4a14481a1..196675b52 100644 --- a/caching/frontend/urls.py +++ b/caching/frontend/urls.py @@ -7,90 +7,6 @@ class FrontEndURLBuilder: def __init__(self, *, base_url: str): self._base_url = base_url - def build_url_for_topic_page(self, *, slug: str) -> str: - """Builds the full URL for the given topic page `slug` - - Args: - slug: The slug associated with the Topic page - - Returns: - The full URL which can be passed to requests - - """ - return urljoin(self._base_url, f"/topics/{slug}") - - def build_url_for_common_page(self, *, slug: str) -> str: - """Builds the full URL for the given common page `slug` - - Args: - slug: The slug associated with the Common page - - Returns: - The full URL which can be passed to requests - - """ - return urljoin(self._base_url, slug) - - def build_url_for_home_page(self) 
-> str: - """Builds the full URL for the single home page - - Returns: - The full URL which can be passed to requests - - """ - return self._base_url - - def build_url_for_whats_new_parent_page(self) -> str: - """Builds the full URL for the single what's new parent page - - Returns: - The full URL which can be passed to requests - - """ - return urljoin(self._base_url, "whats-new") - - def build_url_for_whats_new_child_entry(self, *, slug: str) -> str: - """Builds the full URL for the single what's new child entry - - Args: - slug: The slug associated with the `WhatsNewChildEntry` - - Returns: - The full URL which can be passed to requests - - """ - return urljoin(self._base_url, f"whats-new/{slug}") - - def build_url_for_metrics_documentation_parent_page(self) -> str: - """Builds the full URL for the single metrics documentation parent page - - Returns: - The full URL which can be passed to requests - - """ - return urljoin(self._base_url, "metrics-documentation") - - def build_url_for_metrics_documentation_child_entry(self, *, slug: str) -> str: - """Builds the full URL for the single metrics documentation child entry - - Args: - slug: The slug associated with the `MetricsDocumentationChildEntry` - - Returns: - The full URL which can be passed to requests - - """ - return urljoin(self._base_url, f"metrics-documentation/{slug}") - - def build_url_for_feedback_confirmation_page(self) -> str: - """Builds the full URL for the feedback confirmation page - - Returns: - The full URL which can be passed to requests - - """ - return urljoin(self._base_url, "/feedback/confirmation") - def build_url_for_sitemap(self) -> str: """Builds the full URL for the sitemap page diff --git a/requirements-prod.txt b/requirements-prod.txt index 12362cba1..f071e24f6 100644 --- a/requirements-prod.txt +++ b/requirements-prod.txt @@ -9,6 +9,7 @@ click==8.1.7 colorama==0.4.6 coreapi==2.3.3 coreschema==0.0.4 +defusedxml==0.7.1 distlib==0.3.9 django-cors-headers==4.5.0 
django-db-connection-pool==1.2.5 diff --git a/tests/unit/caching/frontend/test_crawler.py b/tests/unit/caching/frontend/test_crawler.py index df07a4765..ad1a70012 100644 --- a/tests/unit/caching/frontend/test_crawler.py +++ b/tests/unit/caching/frontend/test_crawler.py @@ -1,4 +1,6 @@ +from typing import Iterator from unittest import mock +from defusedxml import ElementTree import pytest from _pytest.logging import LogCaptureFixture @@ -20,34 +22,6 @@ def frontend_crawler_with_mocked_internal_api_client() -> FrontEndCrawler: class TestFrontEndCrawler: - # Private API/headless CMS API - - def test_get_all_page_items_from_api( - self, frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler - ): - """ - Given a mocked `InternalAPIClient` - When `get_all_page_items_from_api()` is called from an instance of `FrontEndCrawler` - Then the correct page items are returned from the response - """ - # Given - expected_mocked_page_items = mock.Mock() - page_response_data = {"items": expected_mocked_page_items} - mocked_internal_api_client = ( - frontend_crawler_with_mocked_internal_api_client._internal_api_client - ) - mocked_internal_api_client.hit_pages_list_endpoint.return_value.json.return_value = ( - page_response_data - ) - - # When - returned_page_items = ( - frontend_crawler_with_mocked_internal_api_client.get_all_page_items_from_api() - ) - - # Then - assert returned_page_items == expected_mocked_page_items - # Frontend requests @mock.patch(f"{MODULE_PATH}.requests") @@ -112,377 +86,6 @@ def test_hit_frontend_page_with_query_params( params=query_params, ) - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def test_process_page_for_home_page( - self, - spy_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given a page item dict with a key "type" of value "home.HomePage" - When `process_page()` is called from an instance of `FrontEndCrawler` - Then `_build_url_for_home_page()` is 
called - And the returned url is passed to the call to `hit_frontend_page()` - - Patches: - `spy_hit_frontend_page`: For the main assertion, - to check the request is being made to the correct URL - """ - # Given - page_item = {"type": "home.HomePage"} - frontend_url_builder = ( - frontend_crawler_with_mocked_internal_api_client._url_builder - ) - - # When - frontend_crawler_with_mocked_internal_api_client.process_page( - page_item=page_item - ) - - # Then - expected_home_page_url: str = frontend_url_builder.build_url_for_home_page() - spy_hit_frontend_page.assert_called_once_with(url=expected_home_page_url) - - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def test_process_page_for_topic_page( - self, - spy_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given a page item dict with a key "type" of value "topic.TopicPage" - When `process_page()` is called from an instance of `FrontEndCrawler` - Then `_build_url_for_topic_page()` is called - And the returned url is passed to the call to `hit_frontend_page()` - - Patches: - `spy_hit_frontend_page`: For the main assertion, - to check the request is being made to the correct URL - """ - # Given - page_item = {"type": "topic.TopicPage", "slug": "fake-topic-page-slug"} - frontend_url_builder = ( - frontend_crawler_with_mocked_internal_api_client._url_builder - ) - - # When - frontend_crawler_with_mocked_internal_api_client.process_page( - page_item=page_item - ) - - # Then - url_for_topic_page: str = frontend_url_builder.build_url_for_topic_page( - slug=page_item["slug"] - ) - spy_hit_frontend_page.assert_called_once_with(url=url_for_topic_page) - - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def test_process_page_for_common_page( - self, - spy_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given a page item dict with a key "type" of value 
"common.CommonPage" - When `process_page()` is called from an instance of `FrontEndCrawler` - Then `_build_url_for_common_page()` is called - And the returned url is passed to the call to `hit_frontend_page()` - - Patches: - `spy_hit_frontend_page`: For the main assertion, - to check the request is being made to the correct URL - """ - # Given - page_item = {"type": "common.CommonPage", "slug": "fake-common-page-slug"} - frontend_url_builder = ( - frontend_crawler_with_mocked_internal_api_client._url_builder - ) - - # When - frontend_crawler_with_mocked_internal_api_client.process_page( - page_item=page_item - ) - - # Then - url_for_common_page: str = frontend_url_builder.build_url_for_common_page( - slug=page_item["slug"] - ) - spy_hit_frontend_page.assert_called_once_with(url=url_for_common_page) - - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def test_process_page_for_whats_new_parent_page_page( - self, - spy_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given a page item dict with a key "type" - of value "whats_new.WhatsNewParentPage" - When `process_page()` is called from an instance of `FrontEndCrawler` - Then `_build_url_for_whats_new_parent_page()` is called - And the returned url is passed to the call to `hit_frontend_page()` - - Patches: - `spy_hit_frontend_page`: For the main assertion, - to check the request is being made to the correct URL - """ - # Given - page_item = {"type": "whats_new.WhatsNewParentPage"} - frontend_url_builder = ( - frontend_crawler_with_mocked_internal_api_client._url_builder - ) - - # When - frontend_crawler_with_mocked_internal_api_client.process_page( - page_item=page_item - ) - - # Then - url_for_whats_new_parent_page: str = ( - frontend_url_builder.build_url_for_whats_new_parent_page() - ) - spy_hit_frontend_page.assert_called_once_with(url=url_for_whats_new_parent_page) - - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def 
test_process_page_for_whats_new_child_entry_page( - self, - spy_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given a page item dict with a key "type" - of value "whats_new.WhatsNewChildEntry" - When `process_page()` is called from an instance of `FrontEndCrawler` - Then `_build_url_for_whats_new_child_entry()` is called - And the returned url is passed to the call to `hit_frontend_page()` - - Patches: - `spy_hit_frontend_page`: For the main assertion, - to check the request is being made to the correct URL - """ - # Given - slug = "issue-with-vaccination-data" - page_item = {"type": "whats_new.WhatsNewChildEntry", "slug": slug} - frontend_url_builder = ( - frontend_crawler_with_mocked_internal_api_client._url_builder - ) - - # When - frontend_crawler_with_mocked_internal_api_client.process_page( - page_item=page_item - ) - - # Then - url_for_whats_new_child_entry: str = ( - frontend_url_builder.build_url_for_whats_new_child_entry(slug=slug) - ) - spy_hit_frontend_page.assert_called_once_with(url=url_for_whats_new_child_entry) - - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def test_process_page_for_metrics_documentation_parent_page_page( - self, - spy_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given a page item dict with a key "type" - of value "metrics_documentation.MetricsDocumentationParentPage" - When `process_page()` is called from an instance of `FrontEndCrawler` - Then `build_url_for_metrics_documentation_parent_page()` is called - And the returned url is passed to the call to `hit_frontend_page()` - - Patches: - `spy_hit_frontend_page`: For the main assertion, - to check the request is being made to the correct URL - """ - # Given - page_item = {"type": "metrics_documentation.MetricsDocumentationParentPage"} - frontend_url_builder = ( - frontend_crawler_with_mocked_internal_api_client._url_builder - 
) - - # When - frontend_crawler_with_mocked_internal_api_client.process_page( - page_item=page_item - ) - - # Then - url_for_metrics_documentation_page: str = ( - frontend_url_builder.build_url_for_metrics_documentation_parent_page() - ) - spy_hit_frontend_page.assert_called_once_with( - url=url_for_metrics_documentation_page - ) - - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def test_process_page_for_metrics_documentation_child_entry_page( - self, - spy_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given a page item dict with a key "type" - of value "metrics_documentation.MetricsDocumentationChildEntry" - When `process_page()` is called from an instance of `FrontEndCrawler` - Then `build_url_for_metrics_documentation_child_entry()` is called - And the returned url is passed to the call to `hit_frontend_page()` - - Patches: - `spy_hit_frontend_page`: For the main assertion, - to check the request is being made to the correct URL - """ - # Given - slug = "issue-with-vaccination-data" - page_item = { - "type": "metrics_documentation.MetricsDocumentationChildEntry", - "slug": slug, - } - frontend_url_builder = ( - frontend_crawler_with_mocked_internal_api_client._url_builder - ) - - # When - frontend_crawler_with_mocked_internal_api_client.process_page( - page_item=page_item - ) - - # Then - url_for_metrics_documentation_child_entry: str = ( - frontend_url_builder.build_url_for_metrics_documentation_child_entry( - slug=slug - ) - ) - spy_hit_frontend_page.assert_called_once_with( - url=url_for_metrics_documentation_child_entry - ) - - @pytest.mark.parametrize("page_type", ["", None, "wagtailcore.Page"]) - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def test_process_page_for_any_other_page_type( - self, - spy_hit_frontend_page: mock.MagicMock, - page_type: str, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given a page item dict with 
a key "type" of an invalid value - When `process_page()` is called from an instance of `FrontEndCrawler` - Then the `hit_frontend_page()` call is not made - - Patches: - `spy_hit_frontend_page`: For the main assertion, - to check no request is made - """ - # Given - page_item = {"type": page_type} - - # When - frontend_crawler_with_mocked_internal_api_client.process_page( - page_item=page_item - ) - - # Then - spy_hit_frontend_page.assert_not_called() - - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - @mock.patch.object(FrontEndCrawler, "process_page") - @mock.patch.object(FrontEndCrawler, "get_all_page_items_from_api") - def test_process_all_pages( - self, - mocked_get_all_page_items_from_api: mock.MagicMock, - spy_process_page: mock.MagicMock, - mocked_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given no input - When `process_all_pages()` is called from an instance of `FrontEndCrawler` - Then `process_page()` is called for each returned page item - - Patches: - `mocked_get_all_page_items_from_api`: To isolate the returned page items - `spy_process_page`: For the main assertions - `mocked_hit_frontend_page:` To remove the side effects of - having to make a network call - - """ - # Given - mocked_page_items = [mock.Mock()] * 3 - mocked_get_all_page_items_from_api.return_value = [ - {"meta": p} for p in mocked_page_items - ] - - # When - frontend_crawler_with_mocked_internal_api_client.process_all_pages() - - # Then - expected_calls = [ - mock.call(page_item=mocked_page_item) - for mocked_page_item in mocked_page_items - ] - spy_process_page.assert_has_calls(calls=expected_calls, any_order=True) - - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def test_process_all_pages_hits_frontend_for_feedback_confirmation_page( - self, - spy_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given no input - When 
`process_all_pages()` is called from an instance of `FrontEndCrawler` - Then `hit_frontend_page()` is called - with the URL for the feedback confirmation page - - Patches: - `spy_hit_frontend_page`: For the main assertion - - """ - # Given - frontend_url_builder = ( - frontend_crawler_with_mocked_internal_api_client._url_builder - ) - - # When - frontend_crawler_with_mocked_internal_api_client.process_all_pages() - - # Then - expected_url = frontend_url_builder.build_url_for_feedback_confirmation_page() - spy_hit_frontend_page.assert_has_calls( - calls=[mock.call(url=expected_url)], any_order=True - ) - - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") - def test_process_all_pages_hits_frontend_for_sitemap( - self, - spy_hit_frontend_page: mock.MagicMock, - frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, - ): - """ - Given no input - When `process_all_pages()` is called from - an instance of `FrontEndCrawler` - Then `hit_frontend_page()` is called - with the URL for the sitemap - - Patches: - `spy_hit_frontend_page`: For the main assertion - - """ - # Given - frontend_url_builder = ( - frontend_crawler_with_mocked_internal_api_client._url_builder - ) - - # When - frontend_crawler_with_mocked_internal_api_client.process_all_pages() - - # Then - expected_url = frontend_url_builder.build_url_for_sitemap() - spy_hit_frontend_page.assert_has_calls( - calls=[mock.call(url=expected_url)], any_order=True - ) - @mock.patch.object(FrontEndCrawler, "hit_frontend_page") def test_process_geography_page_combination( self, @@ -501,8 +104,9 @@ def test_process_geography_page_combination( """ # Given - slug = "covid-19" - mocked_page = mock.Mock(slug=slug) + full_url = f"{frontend_crawler_with_mocked_internal_api_client._frontend_base_url}/topics/covid-19" + mocked_page = mock.Mock(full_url=full_url) + geography_data = GeographyData( name="London", geography_type_name="Lower Tier Local Authority" ) @@ -514,13 +118,12 @@ def 
test_process_geography_page_combination( ) # Then - expected_url = f"{frontend_crawler_with_mocked_internal_api_client._frontend_base_url}/topics/{slug}" expected_params = { "areaType": "Lower+Tier+Local+Authority", "areaName": "London", } spy_hit_frontend_page.assert_called_once_with( - url=expected_url, + url=full_url, params=expected_params, ) @@ -596,8 +199,8 @@ def test_process_geography_page_combination_logs_for_failed_request( """ # Given - slug = "covid-19" - mocked_page = mock.Mock(slug=slug) + full_url = f"{frontend_crawler_with_mocked_internal_api_client._frontend_base_url}/topics/covid-19" + mocked_page = mock.Mock(full_url=full_url) geography_data = GeographyData(name="London", geography_type_name="Nation") mocked_hit_frontend_page.side_effect = ChunkedEncodingError @@ -608,14 +211,13 @@ def test_process_geography_page_combination_logs_for_failed_request( ) # Then - expected_url = f"{frontend_crawler_with_mocked_internal_api_client._frontend_base_url}/topics/{slug}" expected_params = { "areaType": "Nation", "areaName": "London", } expected_log = ( - f"`{expected_url}` with params of `{expected_params}` could not be hit" + f"`{full_url}` with params of `{expected_params}` could not be hit" ) assert expected_log in caplog.text @@ -658,3 +260,145 @@ def test_process_all_valid_area_selector_pages( spy_process_geography_page_combinations.assert_has_calls( calls=expected_calls, any_order=True ) + + # Sitemap + + def test_sitemap_url( + self, + frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, + ): + """ + Given a base URL + When the `sitemap_url` property is called + from an instance of the `FrontEndCrawler` + Then the correct URL is returned + """ + # Given + base_url = frontend_crawler_with_mocked_internal_api_client._frontend_base_url + + # When + sitemap_url: str = frontend_crawler_with_mocked_internal_api_client.sitemap_url + + # Then + assert sitemap_url == f"{base_url}/sitemap.xml" + + @mock.patch.object(FrontEndCrawler, 
"_hit_sitemap_url") + def test_parse_sitemap( + self, + mocked_hit_sitemap_url: mock.MagicMock, + frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, + ): + """ + Given fake sitemap XML content + When `_parse_sitemap()` is called + from an instance of the `FrontEndCrawler` + Then the sitemap is parsed correctly and returned + """ + # Given + mocked_response = mock.Mock() + mocked_response.content = b"<urlset><url><loc>https://example.com/</loc></url></urlset>" + mocked_hit_sitemap_url.return_value = mocked_response + + # When + parsed_sitemap_root = ( + frontend_crawler_with_mocked_internal_api_client._parse_sitemap() + ) + + # Then + assert parsed_sitemap_root.tag == "urlset" + assert parsed_sitemap_root.find("url/loc").text == "https://example.com/" + + @mock.patch.object(FrontEndCrawler, "_parse_sitemap") + def test_traverse_sitemap( + self, + mocked_parse_sitemap: mock.MagicMock, + frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, + ): + """ + Given a fake sitemap + When `_traverse_sitemap()` is called + from an instance of the `FrontEndCrawler` + Then the correct URLs are extracted from the sitemap + """ + # Given + fake_sitemap_content = """ + <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> + <url> + <loc>https://test.ukhsa-dashboard.data.gov.uk/</loc> + <lastmod>2024-10-17T15:16:05.287Z</lastmod> + <changefreq>monthly</changefreq> + <priority>0.5</priority> + </url> + <url> + <loc>https://test.ukhsa-dashboard.data.gov.uk/about/</loc> + <lastmod>2024-10-17T15:16:05.288Z</lastmod> + <changefreq>monthly</changefreq> + <priority>0.5</priority> + </url> + </urlset> + """ + parsed_sitemap = ElementTree.fromstring(fake_sitemap_content) + mocked_parse_sitemap.return_value = parsed_sitemap + + # When + extracted_urls: Iterator[str] = ( + frontend_crawler_with_mocked_internal_api_client._traverse_sitemap() + ) + + # Then + expected_urls: set[str] = { + "https://test.ukhsa-dashboard.data.gov.uk/", + "https://test.ukhsa-dashboard.data.gov.uk/about/", + } + assert set(extracted_urls) == expected_urls + + @mock.patch(f"{MODULE_PATH}.requests") + def test_hit_sitemap_url_returns_sitemap_xml( + self, + spy_requests: mock.MagicMock, + frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, + ): + """ + 
Given a base URL + When the `_hit_sitemap_url` method is called + from an instance of the `FrontEndCrawler` + Then a GET request is made to the correct URL + """ + # Given + base_url = frontend_crawler_with_mocked_internal_api_client._frontend_base_url + + # When + response = frontend_crawler_with_mocked_internal_api_client._hit_sitemap_url() + + # Then + assert response == spy_requests.get.return_value + expected_url = f"{base_url}/sitemap.xml" + spy_requests.get.assert_called_once_with(url=expected_url, timeout=60) + + @mock.patch.object(FrontEndCrawler, "hit_frontend_page") + @mock.patch.object(FrontEndCrawler, "_traverse_sitemap") + def test_process_all_page_urls( + self, + mocked_traverse_sitemap: mock.MagicMock, + spy_hit_frontend_page: mock.MagicMock, + frontend_crawler_with_mocked_internal_api_client: FrontEndCrawler, + ): + """ + Given a generator of URLs to be traversed + When `process_all_page_urls()` is called + from an instance of the `FrontEndCrawler` + Then the call is delegated to `hit_frontend_page()` + for each URL + """ + # Given + traversed_urls: Iterator[str] = ( + x for x in ("https://abc.com", "https://def.com", "https://ghi.com") + ) + mocked_traverse_sitemap.return_value = traversed_urls + + # When + frontend_crawler_with_mocked_internal_api_client.process_all_page_urls() + + # Then + expected_calls = [mock.call(url=url) for url in traversed_urls]  # NOTE(review): `traversed_urls` is a generator already exhausted by `process_all_page_urls()`, so this list is empty and the assertion below passes vacuously - build the expected calls from a list/tuple of the URLs instead + spy_hit_frontend_page.assert_has_calls(calls=expected_calls) diff --git a/tests/unit/caching/frontend/test_handlers.py b/tests/unit/caching/frontend/test_handlers.py index 25fdef350..8cbfa4c41 100644 --- a/tests/unit/caching/frontend/test_handlers.py +++ b/tests/unit/caching/frontend/test_handlers.py @@ -43,10 +43,10 @@ def test_raises_error_when_environment_variable_does_not_exist(self, monkeypatch class TestCrawlFrontEnd: @mock.patch.object(FrontEndCrawler, "process_all_valid_area_selector_pages") - @mock.patch.object(FrontEndCrawler, "process_all_pages") + @mock.patch.object(FrontEndCrawler, 
"process_all_page_urls") def test_delegates_call_to_frontend_crawler( self, - spy_process_all_pages: mock.MagicMock, + spy_process_all_page_urls: mock.MagicMock, spy_process_all_valid_area_selector_pages: mock.MagicMock, monkeypatch, ): @@ -56,7 +56,7 @@ def test_delegates_call_to_frontend_crawler( Then `process_all_pages()` is called from an instance of `FrontEndCrawler` Patches: - `spy_process_all_pages`: For the main assertion + `spy_process_all_page_urls`: For the main assertion `spy_process_all_valid_area_selector_pages`: For the main assertion """ @@ -68,5 +68,5 @@ def test_delegates_call_to_frontend_crawler( crawl_front_end() # Then - spy_process_all_pages.assert_called_once() + spy_process_all_page_urls.assert_called_once() spy_process_all_valid_area_selector_pages.assert_called_once() diff --git a/tests/unit/caching/frontend/test_urls.py b/tests/unit/caching/frontend/test_urls.py index 1198d11a4..8dfeee2c7 100644 --- a/tests/unit/caching/frontend/test_urls.py +++ b/tests/unit/caching/frontend/test_urls.py @@ -8,169 +8,6 @@ class TestFrontEndURLBuilder: - def test_build_url_for_topic_page(self): - """ - Given a slug for a topic page - When `build_url_for_topic_page()` is called from an instance of `FrontEndURLBuilder` - Then the correct URL will be returned - """ - # Given - base_url = FAKE_BASE_URL - topic_page_slug = "influenza" - frontend_url_builder = FrontEndURLBuilder(base_url=base_url) - - # When - topic_page_url: str = frontend_url_builder.build_url_for_topic_page( - slug=topic_page_slug - ) - - # Then - assert topic_page_url == f"{base_url}/topics/{topic_page_slug}" - - def test_build_url_for_common_page(self): - """ - Given a slug for a common page - When `build_url_for_common_page()` is called from an instance of `FrontEndURLBuilder` - Then the correct URL will be returned - """ - # Given - base_url = FAKE_BASE_URL - common_page_slug = "about" - frontend_url_builder = FrontEndURLBuilder(base_url=base_url) - - # When - common_page_url: str = 
frontend_url_builder.build_url_for_common_page( - slug=common_page_slug - ) - - # Then - assert common_page_url == f"{base_url}/{common_page_slug}" - - def test_build_url_for_home_page(self): - """ - Given a base URL - When `build_url_for_home_page()` is called from an instance of `FrontEndURLBuilder` - Then the correct URL will be returned - """ - # Given - base_url = FAKE_BASE_URL - frontend_url_builder = FrontEndURLBuilder(base_url=base_url) - - # When - home_page_url: str = frontend_url_builder.build_url_for_home_page() - - # Then - assert home_page_url == base_url - - def test_build_url_for_whats_new_parent_page(self): - """ - Given a base URL - When `build_url_for_whats_new_parent_page()` is called - from an instance of `FrontEndURLBuilder` - Then the correct URL will be returned - """ - # Given - base_url = FAKE_BASE_URL - frontend_url_builder = FrontEndURLBuilder(base_url=base_url) - - # When - whats_new_parent_page_url: str = ( - frontend_url_builder.build_url_for_whats_new_parent_page() - ) - - # Then - assert whats_new_parent_page_url == f"{base_url}/whats-new" - - def test_build_url_for_whats_new_child_entry(self): - """ - Given a slug for a what's new child entry - When `build_url_for_whats_new_child_entry()` is called - from an instance of `FrontEndURLBuilder` - Then the correct URL will be returned - """ - # Given - base_url = FAKE_BASE_URL - frontend_url_builder = FrontEndURLBuilder(base_url=base_url) - whats_new_child_entry_slug = "issue-with-vaccination-data" - - # When - whats_new_child_entry_url: str = ( - frontend_url_builder.build_url_for_whats_new_child_entry( - slug=whats_new_child_entry_slug - ) - ) - - # Then - assert ( - whats_new_child_entry_url - == f"{base_url}/whats-new/{whats_new_child_entry_slug}" - ) - - def test_build_url_for_feedback_confirmation_page(self): - """ - Given a base URL - When `build_url_for_feedback_confirmation_page()` is called - from an instance of `FrontEndURLBuilder` - Then the correct URL will be returned - 
""" - # Given - base_url = FAKE_BASE_URL - frontend_url_builder = FrontEndURLBuilder(base_url=base_url) - - # When - feedback_confirmation_page_url: str = ( - frontend_url_builder.build_url_for_feedback_confirmation_page() - ) - - # Then - assert feedback_confirmation_page_url == f"{base_url}/feedback/confirmation" - - def test_build_url_for_metrics_documentation_parent_page(self): - """ - Given a base URL - When `build_url_for_metrics_documentation_parent_page()` is called - from an instance of `FrontEndURLBuilder` - Then the correct URL will be returned - """ - # Given - base_url = FAKE_BASE_URL - frontend_url_builder = FrontEndURLBuilder(base_url=base_url) - - # When - metrics_documentation_parent_page_url: str = ( - frontend_url_builder.build_url_for_metrics_documentation_parent_page() - ) - - # Then - assert ( - metrics_documentation_parent_page_url == f"{base_url}/metrics-documentation" - ) - - def test_build_url_for_metrics_documentation_child_entry(self): - """ - Given a slug for a metrics documentation child entry - When `build_url_for_metrics_documentation_child_entry()` is called - from an instance of `FrontEndURLBuilder` - Then the correct URL will be returned - """ - # Given - base_url = FAKE_BASE_URL - frontend_url_builder = FrontEndURLBuilder(base_url=base_url) - metrics_documentation_child_entry_slug = "covid-19_cases_raterollingmean" - - # When - metrics_documentation_child_entry_url: str = ( - frontend_url_builder.build_url_for_metrics_documentation_child_entry( - slug=metrics_documentation_child_entry_slug - ) - ) - - # Then - assert ( - metrics_documentation_child_entry_url - == f"{base_url}/metrics-documentation/{metrics_documentation_child_entry_slug}" - ) - @pytest.mark.parametrize( "input_geography_type, input_geography_name, expected_geography_type, expected_geography_name", (