Task/reference sitemap when crawling frontend /cdd 2260 #1868

Merged · 11 commits · Oct 18, 2024
146 changes: 43 additions & 103 deletions caching/frontend/crawler.py
@@ -1,7 +1,8 @@
 import logging
-from enum import Enum
+from collections.abc import Iterator

 import requests
+from defusedxml import ElementTree
 from rest_framework.response import Response

 from caching.common.geographies_crawler import (
@@ -15,29 +16,16 @@
 from cms.topic.models import TopicPage

 DEFAULT_REQUEST_TIMEOUT = 60
+PAGE_XML_LOCATOR = ".//ns:loc"

 logger = logging.getLogger(__name__)


-class CMSPageTypes(Enum):
-    home_page = "home.HomePage"
-    topic_page = "topic.TopicPage"
-    common_page = "common.CommonPage"
-    whats_new_parent_page = "whats_new.WhatsNewParentPage"
-    whats_new_child_entry = "whats_new.WhatsNewChildEntry"
-    metrics_documentation_parent_page = (
-        "metrics_documentation.MetricsDocumentationParentPage"
-    )
-    metrics_documentation_child_entry = (
-        "metrics_documentation.MetricsDocumentationChildEntry"
-    )
-
-
 class FrontEndCrawler:
     """This is used to traverse the front end and send GET requests to all relevant pages

     Notes:
-        Under the hood, this uses the `InternalAPIClient` to get a list of all pages from the CMS.
+        Under the hood, this gathers all the URLs in the front end from the associated sitemap.xml
         From this point, a simple GET request is made to each page.
         The CDN auth key for the rule on the front end should also be provided.
         If not 403 Forbidden errors will be returned and the cache will not be hydrated.
@@ -64,23 +52,49 @@ def __init__(
             or GeographiesAPICrawler(internal_api_client=self._internal_api_client)
         )

-    @classmethod
-    def create_crawler_for_cache_refresh(
-        cls, *, frontend_base_url: str, cdn_auth_key: str
-    ) -> "FrontEndCrawler":
-        return cls(frontend_base_url=frontend_base_url, cdn_auth_key=cdn_auth_key)
-
-    # Private API/headless CMS API
+    @property
+    def sitemap_url(self) -> str:
+        return self._url_builder.build_url_for_sitemap()
+
+    def _hit_sitemap_url(self) -> Response:
+        url: str = self.sitemap_url
+        return requests.get(url=url, timeout=DEFAULT_REQUEST_TIMEOUT)
+
+    def _parse_sitemap(self):
+        response: Response = self._hit_sitemap_url()
+        xml_response_data: str = response.content.decode("utf-8")
+        return ElementTree.fromstring(text=xml_response_data)
+
+    def _traverse_sitemap(self) -> Iterator[str]:
+        sitemap_root = self._parse_sitemap()
+        namespace = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}
+        return (
+            loc.text
+            for loc in sitemap_root.findall(PAGE_XML_LOCATOR, namespaces=namespace)
+        )

-    def get_all_page_items_from_api(self) -> list[dict]:
-        """Hits the `pages/` endpoint to list all page items in the CMS
+    def process_all_page_urls(self) -> None:
+        """Traverse the frontend and make a GET request to all relevant pages

         Returns:
-            List of page items information
+            None

         """
-        response: Response = self._internal_api_client.hit_pages_list_endpoint()
-        return response.json()["items"]
+        logger.info("Traversing sitemap for URLs")
+
+        urls: Iterator[str] = self._traverse_sitemap()
+
+        for url in urls:
+            logger.info("Processing `%s`", url)
+            self.hit_frontend_page(url=url)
+
+        logger.info("Finished processing all URLs for the frontend")
+
+    @classmethod
+    def create_crawler_for_cache_refresh(
+        cls, *, frontend_base_url: str, cdn_auth_key: str
+    ) -> "FrontEndCrawler":
+        return cls(frontend_base_url=frontend_base_url, cdn_auth_key=cdn_auth_key)

     # Frontend requests

@@ -110,80 +124,6 @@ def hit_frontend_page(
         )
         logger.info("Processed `%s` for params: %s", url, params)

-    def process_page(self, *, page_item: dict) -> None:
-        """Hit the URL for the corresponding `page_item`
-
-        Notes:
-            Only the following page types are supported:
-            - "HomePage"
-            - "TopicPage"
-            - "CommonPage"
-            - "WhatsNewParentPage"
-            - "WhatsNewChildEntry"
-            - "MetricsDocumentationParentPage"
-            - "MetricsDocumentationChildEntry"
-
-        Args:
-            page_item: The individual page information
-                taken from the `pages/` list response
-
-        Returns:
-            None
-
-        """
-        page_type: str = page_item["type"]
-
-        match page_type:
-            case CMSPageTypes.home_page.value:
-                url = self._url_builder.build_url_for_home_page()
-            case CMSPageTypes.topic_page.value:
-                url = self._url_builder.build_url_for_topic_page(slug=page_item["slug"])
-            case CMSPageTypes.common_page.value:
-                url = self._url_builder.build_url_for_common_page(
-                    slug=page_item["slug"]
-                )
-            case CMSPageTypes.whats_new_parent_page.value:
-                url = self._url_builder.build_url_for_whats_new_parent_page()
-            case CMSPageTypes.whats_new_child_entry.value:
-                url = self._url_builder.build_url_for_whats_new_child_entry(
-                    slug=page_item["slug"]
-                )
-            case CMSPageTypes.metrics_documentation_parent_page.value:
-                url = (
-                    self._url_builder.build_url_for_metrics_documentation_parent_page()
-                )
-            case CMSPageTypes.metrics_documentation_child_entry.value:
-                url = self._url_builder.build_url_for_metrics_documentation_child_entry(
-                    slug=page_item["slug"]
-                )
-            case _:
-                # Pass over for root page objects
-                return
-
-        self.hit_frontend_page(url=url)
-
-    def process_all_pages(self) -> None:
-        """Traverse the frontend and make a GET request to all relevant pages
-
-        Returns:
-            None
-
-        """
-        logger.info("Getting all pages from Headless CMS API")
-        all_page_items: list[dict] = self.get_all_page_items_from_api()
-
-        for page_item in all_page_items:
-            self.process_page(page_item=page_item["meta"])
-
-        self._hit_ancillary_pages()
-        logger.info("Finished processing all regular pages for the frontend")
-
-    def _hit_ancillary_pages(self):
-        self.hit_frontend_page(
-            url=self._url_builder.build_url_for_feedback_confirmation_page()
-        )
-        self.hit_frontend_page(url=self._url_builder.build_url_for_sitemap())

     def process_geography_page_combination(
         self, geography_data: GeographyData, page: TopicPage
     ) -> None:
@@ -199,7 +139,7 @@ def process_geography_page_combination(
             None

         """
-        url: str = self._url_builder.build_url_for_topic_page(slug=page.slug)
+        url: str = page.full_url
         params: dict[str, str] = (
             self._url_builder.build_query_params_for_area_selector_page(
                 geography_type_name=geography_data.geography_type_name,
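Read as a whole, the new crawl flow is: fetch sitemap.xml, parse it with defusedxml, then issue a GET to every `<loc>` URL it lists. Below is a minimal standalone sketch of that flow under stated assumptions: the sitemap path, the function name, and the example base URL are illustrative, not taken from this PR.

import requests
from defusedxml import ElementTree

DEFAULT_REQUEST_TIMEOUT = 60
SITEMAP_NAMESPACE = {"ns": "http://www.sitemaps.org/schemas/sitemap/0.9"}


def crawl_from_sitemap(frontend_base_url: str) -> None:
    # Fetch the sitemap; defusedxml parses it safely (hostile XML is rejected).
    response = requests.get(
        url=f"{frontend_base_url}/sitemap.xml",  # assumed sitemap path
        timeout=DEFAULT_REQUEST_TIMEOUT,
    )
    sitemap_root = ElementTree.fromstring(response.content.decode("utf-8"))

    # Each page URL is the text of a <loc> element in the sitemap namespace.
    for loc in sitemap_root.findall(".//ns:loc", namespaces=SITEMAP_NAMESPACE):
        requests.get(url=loc.text, timeout=DEFAULT_REQUEST_TIMEOUT)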
2 changes: 1 addition & 1 deletion caching/frontend/handlers.py
@@ -41,5 +41,5 @@ def crawl_front_end() -> None:
     frontend_crawler = FrontEndCrawler.create_crawler_for_cache_refresh(
         frontend_base_url=frontend_base_url, cdn_auth_key=cdn_auth_key
     )
-    frontend_crawler.process_all_pages()
+    frontend_crawler.process_all_page_urls()
     frontend_crawler.process_all_valid_area_selector_pages()
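Call-site impact is limited to the rename: code that previously invoked process_all_pages() now calls process_all_page_urls(). A hedged usage sketch follows; the environment variable names are assumptions, not taken from this diff.

import os

from caching.frontend.crawler import FrontEndCrawler

# Hypothetical cache-refresh entry point; env var names are illustrative.
crawler = FrontEndCrawler.create_crawler_for_cache_refresh(
    frontend_base_url=os.environ["FRONTEND_URL"],
    cdn_auth_key=os.environ["CDN_AUTH_KEY"],
)
crawler.process_all_page_urls()  # sitemap-driven crawl replaces the CMS page walk
crawler.process_all_valid_area_selector_pages()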
84 changes: 0 additions & 84 deletions caching/frontend/urls.py
@@ -7,90 +7,6 @@ class FrontEndURLBuilder:
     def __init__(self, *, base_url: str):
         self._base_url = base_url

-    def build_url_for_topic_page(self, *, slug: str) -> str:
-        """Builds the full URL for the given topic page `slug`
-
-        Args:
-            slug: The slug associated with the Topic page
-
-        Returns:
-            The full URL which can be passed to requests
-
-        """
-        return urljoin(self._base_url, f"/topics/{slug}")
-
-    def build_url_for_common_page(self, *, slug: str) -> str:
-        """Builds the full URL for the given common page `slug`
-
-        Args:
-            slug: The slug associated with the Common page
-
-        Returns:
-            The full URL which can be passed to requests
-
-        """
-        return urljoin(self._base_url, slug)
-
-    def build_url_for_home_page(self) -> str:
-        """Builds the full URL for the single home page
-
-        Returns:
-            The full URL which can be passed to requests
-
-        """
-        return self._base_url
-
-    def build_url_for_whats_new_parent_page(self) -> str:
-        """Builds the full URL for the single what's new parent page
-
-        Returns:
-            The full URL which can be passed to requests
-
-        """
-        return urljoin(self._base_url, "whats-new")
-
-    def build_url_for_whats_new_child_entry(self, *, slug: str) -> str:
-        """Builds the full URL for the single what's new child entry
-
-        Args:
-            slug: The slug associated with the `WhatsNewChildEntry`
-
-        Returns:
-            The full URL which can be passed to requests
-
-        """
-        return urljoin(self._base_url, f"whats-new/{slug}")
-
-    def build_url_for_metrics_documentation_parent_page(self) -> str:
-        """Builds the full URL for the single metrics documentation parent page
-
-        Returns:
-            The full URL which can be passed to requests
-
-        """
-        return urljoin(self._base_url, "metrics-documentation")
-
-    def build_url_for_metrics_documentation_child_entry(self, *, slug: str) -> str:
-        """Builds the full URL for the single metrics documentation child entry
-
-        Args:
-            slug: The slug associated with the `MetricsDocumentationChildEntry`
-
-        Returns:
-            The full URL which can be passed to requests
-
-        """
-        return urljoin(self._base_url, f"metrics-documentation/{slug}")
-
-    def build_url_for_feedback_confirmation_page(self) -> str:
-        """Builds the full URL for the feedback confirmation page
-
-        Returns:
-            The full URL which can be passed to requests
-
-        """
-        return urljoin(self._base_url, "/feedback/confirmation")

     def build_url_for_sitemap(self) -> str:
         """Builds the full URL for the sitemap page

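With the per-page-type builders deleted, FrontEndURLBuilder effectively only needs to produce the sitemap URL. The retained method's body is cut off by the diff; the sketch below is consistent with its docstring, and the exact path is an assumption.

from urllib.parse import urljoin


def build_url_for_sitemap(base_url: str) -> str:
    # Assumed implementation; the real method body is not shown in this diff.
    return urljoin(base_url, "/sitemap.xml")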
1 change: 1 addition & 0 deletions requirements-prod.txt
@@ -9,6 +9,7 @@ click==8.1.7
 colorama==0.4.6
 coreapi==2.3.3
 coreschema==0.0.4
+defusedxml==0.7.1
 distlib==0.3.9
 django-cors-headers==4.5.0
 django-db-connection-pool==1.2.5
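The one new runtime dependency is here because the crawler now parses XML fetched over the network. Unlike the stdlib parser, defusedxml rejects entity-expansion and external-entity payloads outright. A small illustration with a toy "billion laughs" variant:

from defusedxml import ElementTree, EntitiesForbidden

MALICIOUS_XML = """<?xml version="1.0"?>
<!DOCTYPE bomb [
  <!ENTITY a "ha">
  <!ENTITY b "&a;&a;&a;&a;&a;&a;&a;&a;">
]>
<bomb>&b;</bomb>"""

try:
    ElementTree.fromstring(MALICIOUS_XML)
except EntitiesForbidden:
    # defusedxml refuses to expand the entities instead of parsing them
    print("rejected malicious XML")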