From 724327c0349cf8b48b0b5f5047c1f46596a99231 Mon Sep 17 00:00:00 2001 From: mike <219478+ilude@users.noreply.github.com> Date: Thu, 9 May 2024 10:42:12 -0400 Subject: [PATCH] wip --- .vscode/settings.json | 2 + app/models/layout.py | 18 ++- app/services/favicon_finder.py | 185 ------------------------------ app/services/favicon_retriever.py | 80 +++++++++++++ app/services/favicon_store.py | 131 +++++++++++++++++++++ app/services/favicon_utils.py | 11 ++ 6 files changed, 232 insertions(+), 195 deletions(-) delete mode 100644 app/services/favicon_finder.py create mode 100644 app/services/favicon_retriever.py create mode 100644 app/services/favicon_store.py create mode 100644 app/services/favicon_utils.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 8a50cf8..4c83bc4 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -33,6 +33,8 @@ "zcompdump" ], "python.analysis.extraPaths": [ + "./app", + "./app/services", "./app" ] } \ No newline at end of file diff --git a/app/models/layout.py b/app/models/layout.py index c56fe8f..d0b126d 100644 --- a/app/models/layout.py +++ b/app/models/layout.py @@ -1,10 +1,8 @@ -from asyncio import tasks -import asyncio import json import logging import os -from services.favicon_finder import FaviconFinder import yaml +from services.favicon_store import FaviconStore from models.bookmark import Bookmark from models.row import Row from models.column import Column @@ -19,8 +17,8 @@ class Layout: id: str = 'layout' - headers: list[Bookmark] = [] tabs: list[Tab] = [] + headers: list[Bookmark] = [] bookmark_bar: list[dict] = [] def __init__(self, config_file: str = "configs/layout.yml", bookmarks_bar_file: str = "configs/bookmarks_bar.json"): @@ -34,7 +32,7 @@ def __init__(self, config_file: str = "configs/layout.yml", bookmarks_bar_file: except Exception as ex: logger.error(f"Error: {ex} creating empty bookmark bar file at {self.bookmark_bar_path}") - self.favicon_finder = FaviconFinder() + self.favicon_store = 
FaviconStore() self.reload() def load_bookmarks(self): @@ -57,10 +55,10 @@ def is_modified(self): def mtime(self): return os.path.getmtime(self.config_path) - def bookmark_iterator(self, bookmarks, urls=[]): + def bookmarks_list(self, bookmarks, urls=[]): for bookmark in bookmarks: if 'contents' in bookmark: - self.bookmark_iterator(bookmark['contents'], urls) + self.bookmarks_list(bookmark['contents'], urls) elif 'href' in bookmark: urls.append(bookmark['href']) return urls @@ -78,9 +76,9 @@ def reload(self): self.feed_hash = {} self.bookmark_bar = self.load_bookmarks() - - bookmarks = self.bookmark_iterator(self.bookmark_bar) - self.favicon_finder.fetch_from_iterator(bookmarks) + bookmarks = self.bookmarks_list(self.bookmark_bar) + logger.debug("====== Layout calling fetch favicons!") + self.favicon_store.fetch_favicons_from(bookmarks) logger.debug("Completed Layout reload!") diff --git a/app/services/favicon_finder.py b/app/services/favicon_finder.py deleted file mode 100644 index 159a9e9..0000000 --- a/app/services/favicon_finder.py +++ /dev/null @@ -1,185 +0,0 @@ -import json -import logging -import os -import re -import requests -from urllib.parse import urljoin, urlparse -from bs4 import BeautifulSoup -from models.utils import pwd -from models.scheduler import Scheduler -from PIL import Image -from io import BytesIO - -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - - -class FaviconFinder: - def __init__(self, cache_dir='static/assets/icons'): - self.full_cache_path = pwd.joinpath(cache_dir) - self.full_cache_path.mkdir(parents=True, exist_ok=True) - self.relative_cache_path = f"/{cache_dir}" - self.processed_domains = list() - self.processed_domains_file = pwd.joinpath('configs/processed_domains.json') - self.load_processed_domains() - - def add_processed_domain(self, url, reason='completed'): - normalized_domain = self.normalize_domain(url) - self.processed_domains.append((normalized_domain, reason)) - self.save_processed_domains() - 
- def save_processed_domains(self): - try: - with open(self.processed_domains_file, 'w') as f: - json.dump(list(self.processed_domains), f, ensure_ascii=True, indent=2) - except Exception as ex: - logger.error(f"Error saving processed domains to disk: {ex}") - - def load_processed_domains(self): - try: - if os.path.exists(self.processed_domains_file): - with open(self.processed_domains_file, 'r') as f: - self.processed_domains = json.load(f) - except Exception as ex: - logger.error(f"Error loading processed domains from disk: {ex}") - - @property - def scheduler(self): - return Scheduler.getScheduler() - - def normalize_domain(self, url): - parsed_url = urlparse(url) - domain_parts = parsed_url.netloc.split('.') - if domain_parts[0].startswith("www"): - domain_parts.pop(0) - return '.'.join(domain_parts) - - def make_request(self, url): - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'} - response = requests.get(url, headers=headers, allow_redirects=True) - return response - - def favicon_exists(self, url): - if not url: - return False - try: - normalized_domain = self.normalize_domain(url) - favicon_filename = self.get_favicon_filename(normalized_domain) - favicon_path = os.path.join(self.full_cache_path, favicon_filename) - if os.path.exists(favicon_path): - return f"{self.relative_cache_path}/{favicon_filename}" - return None - except Exception as ex: - logger.error(f"Error checking if favicon exists for {url}: {ex}") - return None - - def get_favicon_filename(self, domain): - return domain + '.favicon.ico' - - def is_ip_address(self, url): - ip_pattern = re.compile( - r"^(?:(?:https?://)?(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}" - r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?(?:\/)?$" - ) - return bool(ip_pattern.match(url)) - - def is_domain_processed(self, url): - normalized_domain = self.normalize_domain(url) - domains = [domain for domain, _ in 
self.processed_domains] - return ( - normalized_domain in domains - or self.is_ip_address(url) - or self.favicon_exists(url) - or 'trivantis' in normalized_domain - or not url - ) - - def get_base(self, url): - parsed_url = urlparse(url) - base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - return base_url - - def fetch_from_iterator(self, urls): - for url in urls: - if not self.is_domain_processed(url): - self.add_processed_domain(url) - self.scheduler.add_job( - self._get_favicon, - args=[url], - misfire_grace_time=None, - executor='processpool' - ) - - def _get_favicon(self, url): - normalized_domain = self.normalize_domain(url) - favicon_filename = self.get_favicon_filename(normalized_domain) - favicon_path = os.path.join(self.full_cache_path, favicon_filename) - - if os.path.exists(favicon_path): - return - - icon_url = self.find_favicon_url(url) - - if not icon_url: - # If favicon URL is not found for the original URL, try the base URL - base_url = self.get_base(url) - icon_url = self.find_favicon_url(base_url) - - if icon_url: - self.download_favicon(icon_url) - else: - logger.warn(f'Favicon not found for {normalized_domain}') - self.add_processed_domain(normalized_domain, reason='not found') - - def find_favicon_url(self, url): - try: - response = self.make_request(url) - if response.status_code == 200: - soup = BeautifulSoup(response.text, 'html.parser') - icon_link = soup.find('link', rel=['icon', 'shortcut icon']) - if icon_link: - icon_url = icon_link['href'] - if not icon_url.startswith('http'): - icon_url = urljoin(url, icon_url) - return icon_url - else: - icon_url = f'http://www.google.com/s2/favicons?domain={self.normalize_domain(url)}' - response = self.make_request(icon_url) - if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image'): - return icon_url - return None - else: - return None - except Exception as ex: - logger.error(f'Error finding favicon with url: {url}: {ex}') - 
self.add_processed_domain(self.normalize_domain(url), reason=f'Error downloading favicon: {ex}') - return None - - def download_favicon(self, icon_url): - if icon_url.startswith('http://www.google.com/s2/favicons?domain='): - # strip 'http://www.google.com/s2/favicons?domain=' from the URL - normalized_domain = icon_url[len('http://www.google.com/s2/favicons?domain='):] - else: - normalized_domain = self.normalize_domain(icon_url) - favicon_filename = self.get_favicon_filename(normalized_domain) - favicon_path = os.path.join(self.full_cache_path, favicon_filename) - - try: - response = self.make_request(icon_url) - if response.status_code == 200: - content_type = response.headers.get('content-type', '').lower() - if content_type.startswith('image/'): - - with open(favicon_path, 'wb') as file: - file.write(response.content) - logger.debug(f'Favicon for {normalized_domain} downloaded and saved as {favicon_path}') - else: - logger.warn(f'The downloaded file from {icon_url} is not a valid image') - self.add_processed_domain(normalized_domain, reason='not an image') - else: - logger.warn(f'Failed to download the favicon for {self.get_base(icon_url)}') - self.add_processed_domain(normalized_domain, reason='not an image') - except Exception as ex: - logger.error(f'Error downloading favicon for {normalized_domain} with url: {icon_url}: {ex}') - self.add_processed_domain(normalized_domain, reason=f'Error downloading favicon: {ex}') diff --git a/app/services/favicon_retriever.py b/app/services/favicon_retriever.py new file mode 100644 index 0000000..2e7dca6 --- /dev/null +++ b/app/services/favicon_retriever.py @@ -0,0 +1,80 @@ +import logging +import os +import re +from services.favicon_utils import get_favicon_filename, normalize_domain +import requests +from bs4 import BeautifulSoup +from models.utils import pwd +from urllib.parse import urljoin + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +class FaviconRetriever: + def __init__(self, 
favicon_store, cache_dir: str): + self.cache_dir = pwd.joinpath(cache_dir) + self.cache_dir.mkdir(parents=True, exist_ok=True) + self.request_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36' + } + self.favicon_store = favicon_store + + def make_request(self, url): + return requests.get(url, headers=self.request_headers, allow_redirects=True) + + def favicon_path(self, url): + favicon_filename = get_favicon_filename(url) + return os.path.join(self.cache_dir, favicon_filename) + + def find_favicon_url(self, url): + normalized_domain = normalize_domain(url) + for try_url in [url, normalized_domain]: + try: + response = self.make_request(try_url) + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + icon_link = soup.find('link', rel=['icon', 'shortcut icon']) + if icon_link: + icon_url = icon_link['href'] + if not icon_url.startswith('http'): + icon_url = urljoin(url, icon_url) + return icon_url + except Exception as ex: + logger.error(f"Error: find_favicon_url({try_url}): {ex}") + + # if we made it here we have not found a favicon url + # lets check google + + icon_url = f'http://www.google.com/s2/favicons?domain={normalized_domain}' + response = self.make_request(icon_url) + if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image'): + with open(self.favicon_path(normalized_domain), 'wb') as file: + file.write(response.content) + self.favicon_store.save_processed_domain(normalized_domain, reason='found in google') + + return None + + def download_favicon(self, url): + logger.debug(f"download_favicon({url}) called") + icon_url = self.find_favicon_url(url) + if not icon_url: + logger.debug(f"Could not download_favicon({url}) no icon url found!") + return + + normalized_domain = normalize_domain(icon_url) + favicon_path = self.favicon_path(normalized_domain) + + try: + response = self.make_request(icon_url) 
+ if response.status_code == 200 and response.headers.get('content-type', '').lower().startswith('image/'): + with open(favicon_path, 'wb') as file: + file.write(response.content) + self.favicon_store.save_processed_domain(normalized_domain, reason='success') + else: + self.favicon_store.save_processed_domain( + normalized_domain, + reason=f'response_code: {response.status_code} content-type: {response.headers.get("content-type", "")}' + ) + except Exception as ex: + self.favicon_store.save_processed_domain(normalized_domain, reason=f'{ex}') diff --git a/app/services/favicon_store.py b/app/services/favicon_store.py new file mode 100644 index 0000000..a7963e2 --- /dev/null +++ b/app/services/favicon_store.py @@ -0,0 +1,131 @@ +import os +import re +import sqlite3 +import logging +from services.favicon_utils import get_favicon_filename, normalize_domain +from services.favicon_retriever import FaviconRetriever +from models.scheduler import Scheduler +from models.utils import pwd + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +class FaviconStore: + def __init__(self, cache_dir='static/assets/icons', db_path='configs/favicons.db'): + self.relative_cache_dir = cache_dir + + self.retriever = FaviconRetriever(self, cache_dir) + self.ip_pattern = re.compile( + r"^(?:(?:https?://)?(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}" + r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?(?:\/)?$" + ) + + self.db_path = pwd.joinpath(db_path) + self.initializing_database() + + def icon_path(self, url): + if not url: + return None + + favicon_filename = get_favicon_filename(url) + favicon_relative_path = f"{self.relative_cache_dir}/{favicon_filename}" + + if os.path.exists(favicon_relative_path): + return f"/{favicon_relative_path}" + else: + return None + + def fetch_favicons_from(self, urls): + self.scheduler.add_job( + self._process_urls_for_favicons, + args=[urls], + id='fetch_favicons', + name='fetch_favicons', + misfire_grace_time=None, + 
replace_existing=False, + max_instances=1, + coalesce=True, + executor='processpool' + ) + + def _process_urls_for_favicons(self, urls): + processable_urls = set(filter(lambda url: self.should_processed(url), urls)) + logger.debug(f"============================ {len(processable_urls)} processable urls") + for url in processable_urls: + name = f'_get_favicon_({url})' + self.scheduler.add_job( + self.retriever.download_favicon, + args=[url], + id=name, + name=name, + misfire_grace_time=None, + executor='processpool' + ) + + def should_processed(self, url): + result = not ( + not url + or bool(self.ip_pattern.match(url)) + or self.icon_path(url) + or self.is_domain_processed(url) + ) + # logger.debug(f"==============================================================") + # logger.debug(f"should_processed: {url}") + # logger.debug(f"ip: {bool(self.ip_pattern.match(url))}") + # logger.debug(f"path: {self.icon_path(url)}") + # logger.debug(f"processed: {self.is_domain_processed(url)}") + # logger.debug(f"result: {result}") + return result + + @property + def scheduler(self): + return Scheduler.getScheduler() + + def initializing_database(self): + try: + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + c.execute('''CREATE TABLE IF NOT EXISTS processed_domains + (domain TEXT PRIMARY KEY, reason TEXT)''') + conn.commit() + conn.close() + except Exception as ex: + logger.error(f"Error initializing database {self.db_path}: {ex}") + + def processed_domain_count(self): + try: + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + c.execute("SELECT COUNT(*) FROM processed_domains") + result = c.fetchone() + conn.close() + return result[0] + except Exception as ex: + logger.error(f"Error in get_processed_domain_count_from_db(): {ex}") + return 0 + + def save_processed_domain(self, url, reason='completed'): + nomalized_domain = normalize_domain(url) + try: + conn = sqlite3.connect(self.db_path, check_same_thread=False) + c = conn.cursor() + c.execute("INSERT OR REPLACE 
INTO processed_domains (domain, reason) VALUES (?, ?)", (nomalized_domain, reason)) + conn.commit() + conn.close() + logger.info(f"Saved processed domain {nomalized_domain} with reason {reason}") + except Exception as ex: + logger.error(f"Error in save_processed_domain({nomalized_domain}): {ex}") + + def is_domain_processed(self, url): + nomalized_domain = normalize_domain(url) + try: + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + c.execute("SELECT 1 FROM processed_domains WHERE domain = ?", [nomalized_domain]) + result = c.fetchone() + conn.close() + return bool(result) + except Exception as ex: + logger.error(f"Error checking is_domain_processed({nomalized_domain}): {ex}") + return False diff --git a/app/services/favicon_utils.py b/app/services/favicon_utils.py new file mode 100644 index 0000000..fac805f --- /dev/null +++ b/app/services/favicon_utils.py @@ -0,0 +1,11 @@ +from urllib.parse import urlparse + + +def normalize_domain(url): + if url.startswith('http://') or url.startswith('https://'): + return urlparse(url).netloc + return url + + +def get_favicon_filename(url): + return f"{normalize_domain(url)}.favicon.ico"