From 724327c0349cf8b48b0b5f5047c1f46596a99231 Mon Sep 17 00:00:00 2001 From: mike <219478+ilude@users.noreply.github.com> Date: Thu, 9 May 2024 10:42:12 -0400 Subject: [PATCH] wip --- .vscode/settings.json | 2 + app/models/layout.py | 18 ++- app/services/favicon_finder.py | 185 ------------------------------ app/services/favicon_retriever.py | 80 +++++++++++++ app/services/favicon_store.py | 131 +++++++++++++++++++++ app/services/favicon_utils.py | 11 ++ 6 files changed, 232 insertions(+), 195 deletions(-) delete mode 100644 app/services/favicon_finder.py create mode 100644 app/services/favicon_retriever.py create mode 100644 app/services/favicon_store.py create mode 100644 app/services/favicon_utils.py diff --git a/.vscode/settings.json b/.vscode/settings.json index 8a50cf8..4c83bc4 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -33,6 +33,8 @@ "zcompdump" ], "python.analysis.extraPaths": [ + "./app", + "./app/services", "./app" ] } \ No newline at end of file diff --git a/app/models/layout.py b/app/models/layout.py index c56fe8f..d0b126d 100644 --- a/app/models/layout.py +++ b/app/models/layout.py @@ -1,10 +1,8 @@ -from asyncio import tasks -import asyncio import json import logging import os -from services.favicon_finder import FaviconFinder import yaml +from services.favicon_store import FaviconStore from models.bookmark import Bookmark from models.row import Row from models.column import Column @@ -19,8 +17,8 @@ class Layout: id: str = 'layout' - headers: list[Bookmark] = [] tabs: list[Tab] = [] + headers: list[Bookmark] = [] bookmark_bar: list[dict] = [] def __init__(self, config_file: str = "configs/layout.yml", bookmarks_bar_file: str = "configs/bookmarks_bar.json"): @@ -34,7 +32,7 @@ def __init__(self, config_file: str = "configs/layout.yml", bookmarks_bar_file: except Exception as ex: logger.error(f"Error: {ex} creating empty bookmark bar file at {self.bookmark_bar_path}") - self.favicon_finder = FaviconFinder() + self.favicon_store = 
FaviconStore() self.reload() def load_bookmarks(self): @@ -57,10 +55,10 @@ def is_modified(self): def mtime(self): return os.path.getmtime(self.config_path) - def bookmark_iterator(self, bookmarks, urls=[]): + def bookmarks_list(self, bookmarks, urls=[]): for bookmark in bookmarks: if 'contents' in bookmark: - self.bookmark_iterator(bookmark['contents'], urls) + self.bookmarks_list(bookmark['contents'], urls) elif 'href' in bookmark: urls.append(bookmark['href']) return urls @@ -78,9 +76,9 @@ def reload(self): self.feed_hash = {} self.bookmark_bar = self.load_bookmarks() - - bookmarks = self.bookmark_iterator(self.bookmark_bar) - self.favicon_finder.fetch_from_iterator(bookmarks) + bookmarks = self.bookmarks_list(self.bookmark_bar) + logger.debug("====== Layout calling fetch favicons!") + self.favicon_store.fetch_favicons_from(bookmarks) logger.debug("Completed Layout reload!") diff --git a/app/services/favicon_finder.py b/app/services/favicon_finder.py deleted file mode 100644 index 159a9e9..0000000 --- a/app/services/favicon_finder.py +++ /dev/null @@ -1,185 +0,0 @@ -import json -import logging -import os -import re -import requests -from urllib.parse import urljoin, urlparse -from bs4 import BeautifulSoup -from models.utils import pwd -from models.scheduler import Scheduler -from PIL import Image -from io import BytesIO - -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - - -class FaviconFinder: - def __init__(self, cache_dir='static/assets/icons'): - self.full_cache_path = pwd.joinpath(cache_dir) - self.full_cache_path.mkdir(parents=True, exist_ok=True) - self.relative_cache_path = f"/{cache_dir}" - self.processed_domains = list() - self.processed_domains_file = pwd.joinpath('configs/processed_domains.json') - self.load_processed_domains() - - def add_processed_domain(self, url, reason='completed'): - normalized_domain = self.normalize_domain(url) - self.processed_domains.append((normalized_domain, reason)) - self.save_processed_domains() - 
- def save_processed_domains(self): - try: - with open(self.processed_domains_file, 'w') as f: - json.dump(list(self.processed_domains), f, ensure_ascii=True, indent=2) - except Exception as ex: - logger.error(f"Error saving processed domains to disk: {ex}") - - def load_processed_domains(self): - try: - if os.path.exists(self.processed_domains_file): - with open(self.processed_domains_file, 'r') as f: - self.processed_domains = json.load(f) - except Exception as ex: - logger.error(f"Error loading processed domains from disk: {ex}") - - @property - def scheduler(self): - return Scheduler.getScheduler() - - def normalize_domain(self, url): - parsed_url = urlparse(url) - domain_parts = parsed_url.netloc.split('.') - if domain_parts[0].startswith("www"): - domain_parts.pop(0) - return '.'.join(domain_parts) - - def make_request(self, url): - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'} - response = requests.get(url, headers=headers, allow_redirects=True) - return response - - def favicon_exists(self, url): - if not url: - return False - try: - normalized_domain = self.normalize_domain(url) - favicon_filename = self.get_favicon_filename(normalized_domain) - favicon_path = os.path.join(self.full_cache_path, favicon_filename) - if os.path.exists(favicon_path): - return f"{self.relative_cache_path}/{favicon_filename}" - return None - except Exception as ex: - logger.error(f"Error checking if favicon exists for {url}: {ex}") - return None - - def get_favicon_filename(self, domain): - return domain + '.favicon.ico' - - def is_ip_address(self, url): - ip_pattern = re.compile( - r"^(?:(?:https?://)?(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}" - r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?(?:\/)?$" - ) - return bool(ip_pattern.match(url)) - - def is_domain_processed(self, url): - normalized_domain = self.normalize_domain(url) - domains = [domain for domain, _ in 
self.processed_domains] - return ( - normalized_domain in domains - or self.is_ip_address(url) - or self.favicon_exists(url) - or 'trivantis' in normalized_domain - or not url - ) - - def get_base(self, url): - parsed_url = urlparse(url) - base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" - return base_url - - def fetch_from_iterator(self, urls): - for url in urls: - if not self.is_domain_processed(url): - self.add_processed_domain(url) - self.scheduler.add_job( - self._get_favicon, - args=[url], - misfire_grace_time=None, - executor='processpool' - ) - - def _get_favicon(self, url): - normalized_domain = self.normalize_domain(url) - favicon_filename = self.get_favicon_filename(normalized_domain) - favicon_path = os.path.join(self.full_cache_path, favicon_filename) - - if os.path.exists(favicon_path): - return - - icon_url = self.find_favicon_url(url) - - if not icon_url: - # If favicon URL is not found for the original URL, try the base URL - base_url = self.get_base(url) - icon_url = self.find_favicon_url(base_url) - - if icon_url: - self.download_favicon(icon_url) - else: - logger.warn(f'Favicon not found for {normalized_domain}') - self.add_processed_domain(normalized_domain, reason='not found') - - def find_favicon_url(self, url): - try: - response = self.make_request(url) - if response.status_code == 200: - soup = BeautifulSoup(response.text, 'html.parser') - icon_link = soup.find('link', rel=['icon', 'shortcut icon']) - if icon_link: - icon_url = icon_link['href'] - if not icon_url.startswith('http'): - icon_url = urljoin(url, icon_url) - return icon_url - else: - icon_url = f'http://www.google.com/s2/favicons?domain={self.normalize_domain(url)}' - response = self.make_request(icon_url) - if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image'): - return icon_url - return None - else: - return None - except Exception as ex: - logger.error(f'Error finding favicon with url: {url}: {ex}') - 
self.add_processed_domain(self.normalize_domain(url), reason=f'Error downloading favicon: {ex}') - return None - - def download_favicon(self, icon_url): - if icon_url.startswith('http://www.google.com/s2/favicons?domain='): - # strip 'http://www.google.com/s2/favicons?domain=' from the URL - normalized_domain = icon_url[len('http://www.google.com/s2/favicons?domain='):] - else: - normalized_domain = self.normalize_domain(icon_url) - favicon_filename = self.get_favicon_filename(normalized_domain) - favicon_path = os.path.join(self.full_cache_path, favicon_filename) - - try: - response = self.make_request(icon_url) - if response.status_code == 200: - content_type = response.headers.get('content-type', '').lower() - if content_type.startswith('image/'): - - with open(favicon_path, 'wb') as file: - file.write(response.content) - logger.debug(f'Favicon for {normalized_domain} downloaded and saved as {favicon_path}') - else: - logger.warn(f'The downloaded file from {icon_url} is not a valid image') - self.add_processed_domain(normalized_domain, reason='not an image') - else: - logger.warn(f'Failed to download the favicon for {self.get_base(icon_url)}') - self.add_processed_domain(normalized_domain, reason='not an image') - except Exception as ex: - logger.error(f'Error downloading favicon for {normalized_domain} with url: {icon_url}: {ex}') - self.add_processed_domain(normalized_domain, reason=f'Error downloading favicon: {ex}') diff --git a/app/services/favicon_retriever.py b/app/services/favicon_retriever.py new file mode 100644 index 0000000..2e7dca6 --- /dev/null +++ b/app/services/favicon_retriever.py @@ -0,0 +1,80 @@ +import logging +import os +import re +from services.favicon_utils import get_favicon_filename, normalize_domain +import requests +from bs4 import BeautifulSoup +from models.utils import pwd +from urllib.parse import urljoin + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +class FaviconRetriever: + def __init__(self, 
favicon_store, cache_dir: str): + self.cache_dir = pwd.joinpath(cache_dir) + self.cache_dir.mkdir(parents=True, exist_ok=True) + self.request_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36' + } + self.favicon_store = favicon_store + + def make_request(self, url): + return requests.get(url, headers=self.request_headers, allow_redirects=True) + + def favicon_path(self, url): + favicon_filename = get_favicon_filename(url) + return os.path.join(self.cache_dir, favicon_filename) + + def find_favicon_url(self, url): + normalized_domain = normalize_domain(url) + for try_url in [url, normalized_domain]: + try: + response = self.make_request(try_url) + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + icon_link = soup.find('link', rel=['icon', 'shortcut icon']) + if icon_link: + icon_url = icon_link['href'] + if not icon_url.startswith('http'): + icon_url = urljoin(url, icon_url) + return icon_url + except Exception as ex: + logger.error(f"Error: find_favicon_url({try_url}): {ex}") + + # if we made it here we have not found a favicon url + # lets check google + + icon_url = f'http://www.google.com/s2/favicons?domain={normalized_domain}' + response = self.make_request(icon_url) + if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image'): + with open(self.favicon_path(normalized_domain), 'wb') as file: + file.write(response.content) + self.favicon_store.save_processed_domain(normalized_domain, reason='found in google') + + return None + + def download_favicon(self, url): + logger.debug(f"download_favicon({url}) called") + icon_url = self.find_favicon_url(url) + if not icon_url: + logger.debug(f"Could not download_favicon({url}) no icon url found!") + return + + normalized_domain = normalize_domain(icon_url) + favicon_path = self.favicon_path(normalized_domain) + + try: + response = self.make_request(icon_url) 
+ if response.status_code == 200 and response.headers.get('content-type', '').lower().startswith('image/'): + with open(favicon_path, 'wb') as file: + file.write(response.content) + self.favicon_store.save_processed_domain(normalized_domain, reason='success') + else: + self.favicon_store.save_processed_domain( + normalized_domain, + reason=f'response_code: {response.status_code} content-type: {response.headers.get("content-type", "")}' + ) + except Exception as ex: + self.favicon_store.save_processed_domain(normalized_domain, reason=f'{ex}') diff --git a/app/services/favicon_store.py b/app/services/favicon_store.py new file mode 100644 index 0000000..a7963e2 --- /dev/null +++ b/app/services/favicon_store.py @@ -0,0 +1,131 @@ +import os +import re +import sqlite3 +import logging +from services.favicon_utils import get_favicon_filename, normalize_domain +from services.favicon_retriever import FaviconRetriever +from models.scheduler import Scheduler +from models.utils import pwd + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +class FaviconStore: + def __init__(self, cache_dir='static/assets/icons', db_path='configs/favicons.db'): + self.relative_cache_dir = cache_dir + + self.retriever = FaviconRetriever(self, cache_dir) + self.ip_pattern = re.compile( + r"^(?:(?:https?://)?(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}" + r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?(?:\/)?$" + ) + + self.db_path = pwd.joinpath(db_path) + self.initializing_database() + + def icon_path(self, url): + if not url: + return None + + favicon_filename = get_favicon_filename(url) + favicon_relative_path = f"{self.relative_cache_dir}/{favicon_filename}" + + if os.path.exists(favicon_relative_path): + return f"/{favicon_relative_path}" + else: + return None + + def fetch_favicons_from(self, urls): + self.scheduler.add_job( + self._process_urls_for_favicons, + args=[urls], + id='fetch_favicons', + name='fetch_favicons', + misfire_grace_time=None, + 
replace_existing=False, + max_instances=1, + coalesce=True, + executor='processpool' + ) + + def _process_urls_for_favicons(self, urls): + processable_urls = set(filter(lambda url: self.should_processed(url), urls)) + logger.debug(f"============================ {len(processable_urls)} processable urls") + for url in processable_urls: + name = f'_get_favicon_({url})' + self.scheduler.add_job( + self.retriever.download_favicon, + args=[url], + id=name, + name=name, + misfire_grace_time=None, + executor='processpool' + ) + + def should_processed(self, url): + result = not ( + not url + or bool(self.ip_pattern.match(url)) + or self.icon_path(url) + or self.is_domain_processed(url) + ) + # logger.debug(f"==============================================================") + # logger.debug(f"should_processed: {url}") + # logger.debug(f"ip: {bool(self.ip_pattern.match(url))}") + # logger.debug(f"path: {self.icon_path(url)}") + # logger.debug(f"processed: {self.is_domain_processed(url)}") + # logger.debug(f"result: {result}") + return result + + @property + def scheduler(self): + return Scheduler.getScheduler() + + def initializing_database(self): + try: + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + c.execute('''CREATE TABLE IF NOT EXISTS processed_domains + (domain TEXT PRIMARY KEY, reason TEXT)''') + conn.commit() + conn.close() + except Exception as ex: + logger.error(f"Error initializing database {self.db_path}: {ex}") + + def processed_domain_count(self): + try: + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + c.execute("SELECT COUNT(*) FROM processed_domains") + result = c.fetchone() + conn.close() + return result[0] + except Exception as ex: + logger.error(f"Error in get_processed_domain_count_from_db(): {ex}") + return 0 + + def save_processed_domain(self, url, reason='completed'): + nomalized_domain = normalize_domain(url) + try: + conn = sqlite3.connect(self.db_path, check_same_thread=False) + c = conn.cursor() + c.execute("INSERT OR REPLACE 
INTO processed_domains (domain, reason) VALUES (?, ?)", (nomalized_domain, reason)) + conn.commit() + conn.close() + logger.info(f"Saved processed domain {nomalized_domain} with reason {reason}") + except Exception as ex: + logger.error(f"Error in save_processed_domain({nomalized_domain}): {ex}") + + def is_domain_processed(self, url): + nomalized_domain = normalize_domain(url) + try: + conn = sqlite3.connect(self.db_path) + c = conn.cursor() + c.execute("SELECT 1 FROM processed_domains WHERE domain = ?", [nomalized_domain]) + result = c.fetchone() + conn.close() + return bool(result) + except Exception as ex: + logger.error(f"Error checking is_domain_processed({nomalized_domain}): {ex}") + return False diff --git a/app/services/favicon_utils.py b/app/services/favicon_utils.py new file mode 100644 index 0000000..fac805f --- /dev/null +++ b/app/services/favicon_utils.py @@ -0,0 +1,11 @@ +from urllib.parse import urlparse + + +def normalize_domain(url): + if url.startswith('http://') or url.startswith('https://'): + return urlparse(url).netloc + return url + + +def get_favicon_filename(url): + return f"{normalize_domain(url)}.favicon.ico"