diff --git a/.gitignore b/.gitignore index 9c3ea51..c1da06d 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,4 @@ app/static/assets notebooks/data/ -cache/ \ No newline at end of file +app/.working/ diff --git a/.vscode/settings.json b/.vscode/settings.json index 4c83bc4..b8d422d 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -35,6 +35,7 @@ "python.analysis.extraPaths": [ "./app", "./app/services", + "./app", "./app" ] } \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 11fb9be..6b2e7ca 100755 --- a/Dockerfile +++ b/Dockerfile @@ -4,37 +4,16 @@ ARG PYTHON_VERSION=3.12-slim-bookworm FROM python:${PYTHON_VERSION} as base LABEL maintainer="Mike Glenn " -ARG PUID=${PUID:-1000} -ARG PGID=${PGID:-1000} - -ARG USER=anvil ARG TZ=America/New_York -ENV USER=${USER} ENV TZ=${TZ} -ARG PROJECT_NAME -ENV PROJECT_NAME=${PROJECT_NAME} - -ARG PROJECT_PATH=/app -ENV PROJECT_PATH=${PROJECT_PATH} - -ARG ONBOARD_PORT=9830 -ENV ONBOARD_PORT=${ONBOARD_PORT} - ENV PYTHON_DEPS_PATH=/dependencies ENV PYTHONPATH="${PYTHONPATH}:${PYTHON_DEPS_PATH}" ENV PYTHONUNBUFFERED=TRUE -ENV LANGUAGE=en_US.UTF-8 -ENV LANG=en_US.UTF-8 -ENV LC_ALL=C.UTF-8 ENV DEBIAN_FRONTEND=noninteractive ENV DEBCONF_NONINTERACTIVE_SEEN=true -ENV HOME=/home/${USER} -ARG TERM_SHELL=zsh -ENV TERM_SHELL=${TERM_SHELL} - RUN apt-get update && apt-get install -y --no-install-recommends \ bash \ ca-certificates \ @@ -52,6 +31,68 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get autoclean -y && \ rm -rf /var/lib/apt/lists/* +RUN sed -i '/en_US.UTF-8/s/^# //g' /etc/locale.gen && locale-gen +ENV LANG en_US.UTF-8 +ENV LANGUAGE en_US:en +ENV LC_ALL en_US.UTF-8 + + +############################## +# Begin build +############################## +FROM base as build + +RUN apt-get update && apt-get install -y --no-install-recommends \ + binutils \ + build-essential \ + pkg-config gfortran \ + cmake \ + coreutils \ + extra-cmake-modules \ + findutils \ + git \ + openssl \ + openssh-client \ + sqlite3 \ + libsqlite3-dev && \ + apt-get autoremove -fy && \ + apt-get clean && \ + apt-get autoclean -y && \ + rm -rf /var/lib/apt/lists/* + +COPY requirements.txt requirements.txt +RUN pip3 install --upgrade pip && \ + pip3 install --upgrade setuptools && \ + pip3 install --upgrade wheel && \ + mkdir -p ${PYTHON_DEPS_PATH} && \ + pip3 install --no-cache-dir --target=${PYTHON_DEPS_PATH} -r requirements.txt && \ + rm -rf requirements.txt + + +############################## +# Begin user_base +############################## +FROM base as user_base + +ARG PUID=${PUID:-1000} +ARG PGID=${PGID:-1000} + +ARG USER=anvil +ENV USER=${USER} + +ARG PROJECT_NAME +ENV PROJECT_NAME=${PROJECT_NAME} + +ARG PROJECT_PATH=/app +ENV PROJECT_PATH=${PROJECT_PATH} + +ARG ONBOARD_PORT=9830 +ENV ONBOARD_PORT=${ONBOARD_PORT} + +ENV HOME=/home/${USER} +ARG TERM_SHELL=zsh +ENV TERM_SHELL=${TERM_SHELL} + RUN sed -i 's/UID_MAX .*/UID_MAX 100000/' /etc/login.defs && \ groupadd --gid ${PGID} ${USER} && \ useradd --uid ${PUID} --gid ${PGID} -s /bin/${TERM_SHELL} -m ${USER} && \ @@ -78,13 +119,13 @@ if [ "$(id -u)" = "0" ]; then groupmod -o -g ${PGID:-1000} ${USER} usermod -o -u ${PUID:-1000} ${USER} - # get gid of docker socket file - SOCK_DOCKER_GID=`stat -c %g /var/run/docker.sock` - groupmod -o -g "$SOCK_DOCKER_GID" ${USER} + chown ${USER}:${USER} /var/run/docker.sock - # Add call to gosu to drop from root user to jenkins user + # Add call to gosu to drop from root user # when running original entrypoint set -- gosu ${USER} "$@" 
+else + sudo chown -R ${USER}:${USER} /var/run/docker.sock fi echo "Running: $@" @@ -94,66 +135,29 @@ EOF WORKDIR $PROJECT_PATH ENTRYPOINT [ "/usr/local/bin/docker-entrypoint.sh" ] - -############################## -# Begin build -############################## -FROM base as build - -RUN apt-get update && apt-get install -y --no-install-recommends \ - binutils \ - build-essential \ - pkg-config gfortran \ - cmake \ - coreutils \ - extra-cmake-modules \ - findutils \ - git \ - openssl \ - openssh-client \ - sqlite3 \ - libsqlite3-dev && \ - apt-get autoremove -fy && \ - apt-get clean && \ - apt-get autoclean -y && \ - rm -rf /var/lib/apt/lists/* - -COPY requirements.txt requirements.txt -RUN pip3 install --upgrade pip && \ - pip3 install --upgrade setuptools && \ - pip3 install --upgrade wheel && \ - mkdir -p ${PYTHON_DEPS_PATH} && \ - chown -R ${USER}:${USER} ${PROJECT_PATH} ${PYTHON_DEPS_PATH} && \ - pip3 install --no-cache-dir --target=${PYTHON_DEPS_PATH} -r requirements.txt && \ - rm -rf requirements.txt - - ############################## # Begin production ############################## -FROM base as production +FROM user_base as production -COPY --from=build --chown=${USER}:${USER} ${PYTHON_DEPS_PATH} ${PYTHON_DEPS_PATH} +COPY --from=build --chown=${USER}:${USER} ${PYTHON_DEPS_PATH} ${PYTHON_DEPS_PATH} COPY --chown=${USER}:${USER} app ${PROJECT_PATH} ENV FLASK_ENV=production -RUN mkdir /cache && \ - chown -R ${USER}:${USER} /cache && \ - mkdir -p /app/static/icons && \ - mkdir -p /app/static/assets &&\ +RUN mkdir -p /app/static/icons && \ + mkdir -p /app/static/assets && \ chown -R ${USER}:${USER} /app/static HEALTHCHECK --interval=10s --timeout=3s --start-period=40s \ CMD wget --no-verbose --tries=1 --spider --no-check-certificate http://localhost:$ONBOARD_PORT/api/healthcheck || exit 1 - CMD [ "python3", "app.py" ] ############################## -# Begin jupyter-devcontainer +# Begin jupyter-builder ############################## -FROM build as jupyter-devcontainer +FROM build as jupyter-builder RUN apt-get update && apt-get install -y --no-install-recommends \ graphviz \ @@ -173,11 +177,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ apt-get autoclean -y && \ rm -rf /var/lib/apt/lists/* -############################## -# Begin jupyter-builder -############################## -FROM jupyter-devcontainer as jupyter-builder - RUN pip3 install --no-cache-dir --target=${PYTHON_DEPS_PATH} docutils h5py ipykernel ipython jupyter jupyterhub notebook numpy nltk pyyaml pylint scikit-learn scipy==1.11.0 watermark RUN pip3 install --no-cache-dir --target=${PYTHON_DEPS_PATH} --no-deps --prefer-binary matplotlib seaborn plotly graphviz imutils keras RUN pip3 install --no-cache-dir --target=${PYTHON_DEPS_PATH} --prefer-binary pandas-datareader bottleneck scipy duckdb sqlalchemy pyautogui requests_cache statsmodels @@ -186,9 +185,7 @@ RUN pip3 install --no-cache-dir --target=${PYTHON_DEPS_PATH} --prefer-binary pan ############################## # Begin devcontainer ############################## -FROM jupyter-devcontainer as devcontainer - -COPY --from=jupyter-builder --chown=${USER}:${USER} ${PYTHON_DEPS_PATH} ${PYTHON_DEPS_PATH} +FROM user_base as devcontainer RUN apt-get update && apt-get install -y --no-install-recommends \ ansible \ @@ -196,6 +193,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ dnsutils \ exa \ iproute2 \ + iputils-ping \ jq \ openssh-client \ ripgrep \ @@ -219,8 +217,10 @@ RUN apt-get update && apt-get install -y 
--no-install-recommends \ USER ${USER} COPY .devcontainer .devcontainer -RUN LC_ALL=C.UTF-8 ansible-playbook --inventory 127.0.0.1 --connection=local .devcontainer/ansible/requirements.yml && \ - LC_ALL=C.UTF-8 ansible-playbook --inventory 127.0.0.1 --connection=local .devcontainer/ansible/install-docker.yml - +RUN ansible-playbook --inventory 127.0.0.1 --connection=local .devcontainer/ansible/requirements.yml && \ + ansible-playbook --inventory 127.0.0.1 --connection=local .devcontainer/ansible/install-docker.yml + +COPY --from=jupyter-builder --chown=${USER}:${USER} ${PYTHON_DEPS_PATH} ${PYTHON_DEPS_PATH} + # https://code.visualstudio.com/remote/advancedcontainers/start-processes#_adding-startup-commands-to-the-docker-image-instead CMD [ "sleep", "infinity" ] diff --git a/Makefile b/Makefile index 7b3cd08..636e804 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ .SUFFIXES: -SITE_PACKAGES := $(shell pip show pip | grep '^Location' | cut -d' ' -f2-) +SITE_PACKAGES := $(shell pip show pip | grep '^Location' | cut -d' ' -f2-) # read and include .devcontainer/.env exporting all variables ifneq (,$(wildcard .devcontainer/.env)) @@ -7,8 +7,16 @@ ifneq (,$(wildcard .devcontainer/.env)) export endif +ifneq (,$(wildcard .env)) + include .env + export +endif + + .PHONY: run reqs ansible run: $(SITE_PACKAGES) + @clear + -@killall python3 3>&1 1>/dev/null 2>&3 || true python3 app/app.py reqs: $(SITE_PACKAGES) diff --git a/app/app.py b/app/app.py index eb45e81..3668dce 100644 --- a/app/app.py +++ b/app/app.py @@ -56,11 +56,10 @@ @app.context_processor def inject_current_date(): - favicon_finder = models.layout.FaviconFinder() return { 'today_date': datetime.now(), 'site_title': os.environ.get('ONBOARD_SITE_TITLE', 'OnBoard'), - 'favicon_exists': favicon_finder.favicon_exists, + 'favicon_path': layout.favicon_path, } diff --git a/app/defaults/layout.yml b/app/defaults/layout.yml index a328774..d6ec73b 100644 --- a/app/defaults/layout.yml +++ b/app/defaults/layout.yml @@ -85,9 +85,9 @@ tabs: - name: Frigate link: https://frigate.traefikturkey.icu/ - name: Proxmox - ilude.com - link: https://pve.ilude.com:8006/ + link: 'https://pve.ilude.com:8006/#v1:0:=qemu%2F101:4:::::::' - name: Proxmox - traefikturkey.icu - link: https://pve-patriot.ilude.com:8006/ + link: 'https://pve-patriot.ilude.com:8006/#v1:0:=qemu%2F101:4:::::::' - name: Vaultwarden link: https://vaultwarden.ilude.com/ @@ -118,13 +118,17 @@ tabs: type: "feed" link: "https://www.battleswarmblog.com/" feed_url: "https://www.battleswarmblog.com/?feed=rss2" - - name: "Cafe Hayek" + - name: "Slashdot" type: "feed" - link: "https://cafehayek.com/" - feed_url: "https://cafehayek.com/feed" - filters: - strip: - - summary: '^Tweet\s*\.{0,3}|\…\s+' + summary_enabled: False + link: "https://slashdot.org/" + feed_url: "https://rss.slashdot.org/Slashdot/slashdotMain" + - name: "Phoronix" + type: "feed" + summary_enabled: False + link: "https://www.phoronix.com" + feed_url: "https://www.phoronix.com/rss.php" + - column: widgets: @@ -135,16 +139,14 @@ tabs: - name: "Krebs on Security" type: "feed" summary_enabled: False - display_limit: 12 link: "https://krebsonsecurity.com/" feed_url: "https://krebsonsecurity.com/feed/" - - name: "Slashdot" + - name: "Linux Hint" type: "feed" summary_enabled: False - display_limit: 12 - link: "https://slashdot.org/" - feed_url: "https://rss.slashdot.org/Slashdot/slashdotMain" - + link: "https://linuxhint.com/" + feed_url: "https://linuxhint.com/feed/" + - column: widgets: - name: "Instapundit" @@ -161,6 +163,13 @@ tabs: 
type: "feed" link: "https://twitchy.com/" feed_url: "https://twitchy.com/feed" + - name: "Cafe Hayek" + type: "feed" + link: "https://cafehayek.com/" + feed_url: "https://cafehayek.com/feed" + filters: + strip: + - summary: '^Tweet\s*\.{0,3}|\…\s+' - tab: "Tech" columns: diff --git a/app/defaults/processed_domains.json b/app/defaults/processed_domains.json deleted file mode 100644 index 0ce005c..0000000 --- a/app/defaults/processed_domains.json +++ /dev/null @@ -1 +0,0 @@ -[]] \ No newline at end of file diff --git a/app/models/feed.py b/app/models/feed.py index 1ff9e91..063a593 100644 --- a/app/models/feed.py +++ b/app/models/feed.py @@ -23,200 +23,187 @@ class Feed(Widget): - feed_url: str - summary_enabled: bool = True - hx_get: str = None - - def __init__(self, widget) -> None: - super().__init__(widget) - - self.feed_url = widget['feed_url'] - self.display_limit = widget.get('display_limit', 10) - self.summary_enabled = widget.get('summary_enabled', True) - self.hx_get = f"/feed/{self.id}" - - self._filters = [] - if 'filters' in widget: - for filter_type in self.widget['filters']: - for filter in self.widget['filters'][filter_type]: - for attribute in filter: - filter_text = filter[attribute] - self._filters.append({ - "type": filter_type, - "text": filter_text, - "attribute": attribute - }) - - if not self.cache_path.parent.exists(): - self.cache_path.parent.mkdir(parents=True, exist_ok=True) - - self.items = self.load_cache(self.cache_path) - if self.items: - self._last_updated = datetime.fromtimestamp(os.path.getmtime(self.cache_path)) - else: - self._last_updated = None - - if self.scheduler.running: - self.job = self.scheduler.add_job(self.update, 'cron', name=f'{self.id} - {self.name} - cron', hour='*', jitter=30, max_instances=1) - logger.debug(f"Feed: {self.name} cron job for scheduled with job id: {self.job.id}") - - if self.needs_update: - self.refresh() - - def refresh(self): - if self.job: - logging.debug(f"Feed: {self.name} scheduled for immediate update now!") - self.job.modify(next_run_time=datetime.now()) - else: - logging.warn(f"Feed: {self.name} does not have a scheduled job!") - - - @property - def needs_update(self): - force_update = bool(os.getenv("ONBOARD_FEED_FORCE_UPDATE", "False")) - # if there is no last_updated time, or if it's more 10 minutes ago, then force an update - return force_update or self.last_updated is None or self.last_updated < datetime.now() - timedelta(minutes=10) - - - @property - def filters(self): - return self._filters - - - @cached_property - def cache_path(self): - return pwd.joinpath(os.getenv("ONBOARD_FEED_CACHE_DIR", "../cache"), f"{self.id}.json").resolve() - - - @property - def feed_url(self) -> str: - return self._url - - - @feed_url.setter - def feed_url(self, url: str): - self._url = url - self.id = calculate_sha1_hash(url) - - - def update(self): - articles = self.download(self.feed_url) - self.items = self.save_articles(articles) - - - def load_cache(self, cache_path: Path) -> list[FeedArticle]: - articles = [] - if cache_path.exists(): - with open(cache_path, 'r') as f: - json_articles = json.load(f)['articles'] - - for article in json_articles: - articles.append( - FeedArticle( - original_title = article.get('original_title', article['title']), - title = article['title'], - link = article['link'], - description = article['description'], - pub_date = dateutil.parser.parse(article['pub_date']), - processed = article.get('processed', None), - parent = self - ) - ) - logging.debug(f"Loaded {len(articles)} cached articles for 
{self.name} : file {self.cache_path}") - else: - logging.debug(f"Failed to load cached articles for {self.name} : file {self.cache_path} does not exist") - - articles.sort(key=lambda a: a.pub_date, reverse=True) - return articles - - - def download(self, feed_url: str) -> list[FeedArticle]: - articles = [] - feed = feedparser.parse(feed_url) - for entry in feed.entries: - pub_date = dateutil.parser.parse(entry.get('published', entry.get('updated', formatdate()))) - articles.append( - FeedArticle( - original_title = entry.title, - title = entry.title, - link = entry.link, - description = entry.description, - pub_date = pub_date, - processed = None, - parent = self - ) - ) - - return articles - - def process(self): - self.items = self.processors(self.items) - - def processors(self, articles: list[FeedArticle]) -> list[FeedArticle]: - if 'process' in self.widget: - for processor in self.widget['process']: - processor_name = processor['processor'] - processor_path = pwd.joinpath("processors", processor_name + ".py") - if processor_path.exists(): - module = importlib.import_module(f"processors.{processor_name}") - processor_class = getattr(module, ''.join(word.title() for word in processor_name.split('_'))) - processor_instance = processor_class() - else: - processor_instance = NoOpFeedProcessor() - - articles = processor_instance.process(articles) - - return articles - - - def remove_duplicate_articles(self, articles): - # Filters a list of objects and returns a new list with objects where 'removed' is False. - articles = list(filter(lambda obj: not obj.removed, articles)) - - # Create a dictionary to group articles by their ID - article_dict = defaultdict(list) - for article in articles: - article_dict[article.id].append(article) - - # Filter the articles, keeping the one with 'processed' set if it exists - return [ - next((a for a in articles_list if a.processed is not None), articles_list[0]) - for articles_list in article_dict.values() - ] - - - def save_articles(self, articles: list[FeedArticle]): - # load all existing articles from the json file, and add the new ones - # then apply the filters - all_articles = self.load_cache(self.cache_path) + articles - - # using article.id remove duplicates from articles - all_articles = self.remove_duplicate_articles(all_articles) - - all_articles = self.processors(all_articles) - - # sort articles in place by pub_date newest to oldest - all_articles.sort(key=lambda a: a.pub_date, reverse=True) - - - data = { - 'name': self.name, - 'link': self.link, - 'articles': [ - { - 'original_title': article.original_title, - 'title': article.title, - 'link': article.link, - 'description': article.description, - 'pub_date': utils.format_datetime(article.pub_date), - 'id': article.id, - 'processed': article.processed - } for article in all_articles - ] - } - with open(self.cache_path, 'w') as f: - json.dump(data, f, indent=2) - - logger.info(f"Saved {len(all_articles)} articles for {self.name} to cache file {self.cache_path}") - return all_articles - \ No newline at end of file + feed_url: str + summary_enabled: bool = True + hx_get: str = None + + def __init__(self, widget) -> None: + super().__init__(widget) + + self.feed_url = widget['feed_url'] + self.display_limit = widget.get('display_limit', 10) + self.summary_enabled = widget.get('summary_enabled', True) + self.hx_get = f"/feed/{self.id}" + + self._filters = [] + if 'filters' in widget: + for filter_type in self.widget['filters']: + for filter in self.widget['filters'][filter_type]: + for attribute in 
filter: + filter_text = filter[attribute] + self._filters.append({ + "type": filter_type, + "text": filter_text, + "attribute": attribute + }) + + cache_dir = pwd.joinpath(os.getenv("WORKING_STORAGE", ".working"), "cache").resolve() + if not cache_dir.exists(): + cache_dir.mkdir(parents=True, exist_ok=True) + + self.cache_path = cache_dir.joinpath(f"{self.id}.json") + + self.items = self.load_cache(self.cache_path) + if self.items: + self._last_updated = datetime.fromtimestamp(os.path.getmtime(self.cache_path)) + else: + self._last_updated = None + + if self.scheduler.running: + self.job = self.scheduler.add_job(self.update, 'cron', name=f'{self.id} - {self.name} - cron', hour='*', jitter=30, max_instances=1) + logger.debug(f"Feed: {self.name} cron job for scheduled with job id: {self.job.id}") + + if self.needs_update: + self.refresh() + + def refresh(self): + if self.job: + logging.debug(f"Feed: {self.name} scheduled for immediate update now!") + self.job.modify(next_run_time=datetime.now()) + else: + logging.warn(f"Feed: {self.name} does not have a scheduled job!") + + @property + def needs_update(self): + force_update = bool(os.getenv("ONBOARD_FEED_FORCE_UPDATE", "False")) + # if there is no last_updated time, or if it's more 10 minutes ago, then force an update + return force_update or self.last_updated is None or self.last_updated < datetime.now() - timedelta(minutes=10) + + @property + def filters(self): + return self._filters + + @property + def feed_url(self) -> str: + return self._url + + @feed_url.setter + def feed_url(self, url: str): + self._url = url + self.id = calculate_sha1_hash(url) + + def update(self): + articles = self.download(self.feed_url) + self.items = self.save_articles(articles) + + def load_cache(self, cache_path: Path) -> list[FeedArticle]: + articles = [] + if cache_path.exists(): + with open(cache_path, 'r') as f: + json_articles = json.load(f)['articles'] + + for article in json_articles: + articles.append( + FeedArticle( + original_title=article.get('original_title', article['title']), + title=article['title'], + link=article['link'], + description=article['description'], + pub_date=dateutil.parser.parse(article['pub_date']), + processed=article.get('processed', None), + parent=self + ) + ) + logging.debug(f"Loaded {len(articles)} cached articles for {self.name} : file {self.cache_path}") + else: + logging.debug(f"Failed to load cached articles for {self.name} : file {self.cache_path} does not exist") + + articles.sort(key=lambda a: a.pub_date, reverse=True) + return articles + + def download(self, feed_url: str) -> list[FeedArticle]: + articles = [] + feed = feedparser.parse(feed_url) + for entry in feed.entries: + pub_date = dateutil.parser.parse(entry.get('published', entry.get('updated', formatdate()))) + articles.append( + FeedArticle( + original_title=entry.title, + title=entry.title, + link=entry.link, + description=entry.description, + pub_date=pub_date, + processed=None, + parent=self + ) + ) + + return articles + + def process(self): + self.items = self.processors(self.items) + + def processors(self, articles: list[FeedArticle]) -> list[FeedArticle]: + if 'process' in self.widget: + for processor in self.widget['process']: + processor_name = processor['processor'] + processor_path = pwd.joinpath("processors", processor_name + ".py") + if processor_path.exists(): + module = importlib.import_module(f"processors.{processor_name}") + processor_class = getattr(module, ''.join(word.title() for word in processor_name.split('_'))) + processor_instance = 
processor_class() + else: + processor_instance = NoOpFeedProcessor() + + articles = processor_instance.process(articles) + + return articles + + def remove_duplicate_articles(self, articles): + # Filters a list of objects and returns a new list with objects where 'removed' is False. + articles = list(filter(lambda obj: not obj.removed, articles)) + + # Create a dictionary to group articles by their ID + article_dict = defaultdict(list) + for article in articles: + article_dict[article.id].append(article) + + # Filter the articles, keeping the one with 'processed' set if it exists + return [ + next((a for a in articles_list if a.processed is not None), articles_list[0]) + for articles_list in article_dict.values() + ] + + def save_articles(self, articles: list[FeedArticle]): + # load all existing articles from the json file, and add the new ones + # then apply the filters + all_articles = self.load_cache(self.cache_path) + articles + + # using article.id remove duplicates from articles + all_articles = self.remove_duplicate_articles(all_articles) + + all_articles = self.processors(all_articles) + + # sort articles in place by pub_date newest to oldest + all_articles.sort(key=lambda a: a.pub_date, reverse=True) + + data = { + 'name': self.name, + 'link': self.link, + 'articles': [ + { + 'original_title': article.original_title, + 'title': article.title, + 'link': article.link, + 'description': article.description, + 'pub_date': utils.format_datetime(article.pub_date), + 'id': article.id, + 'processed': article.processed + } for article in all_articles + ] + } + with open(self.cache_path, 'w') as f: + json.dump(data, f, indent=2) + + logger.info(f"Saved {len(all_articles)} articles for {self.name} to cache file {self.cache_path}") + return all_articles diff --git a/app/models/layout.py b/app/models/layout.py index d0b126d..6166832 100644 --- a/app/models/layout.py +++ b/app/models/layout.py @@ -46,6 +46,9 @@ def load_bookmarks(self): def stop_scheduler(self): Scheduler.shutdown() + def favicon_path(self, url): + return self.favicon_store.icon_path(url) + def is_modified(self): modified = self.mtime > self.last_reload logger.info(f"Layout modified?: {modified}") diff --git a/app/models/scheduler.py b/app/models/scheduler.py index 0531299..9eb2641 100644 --- a/app/models/scheduler.py +++ b/app/models/scheduler.py @@ -30,8 +30,8 @@ def getScheduler() -> BackgroundScheduler: 'max_workers': '5' }, 'apscheduler.executors.processpool': { - 'type': 'processpool', - 'max_workers': '5' + 'class': 'apscheduler.executors.pool:ThreadPoolExecutor', + 'max_workers': '20' } }) diff --git a/app/services/favicon_retriever.py b/app/services/favicon_retriever.py deleted file mode 100644 index 2e7dca6..0000000 --- a/app/services/favicon_retriever.py +++ /dev/null @@ -1,80 +0,0 @@ -import logging -import os -import re -from services.favicon_utils import get_favicon_filename, normalize_domain -import requests -from bs4 import BeautifulSoup -from models.utils import pwd -from urllib.parse import urljoin - -logger = logging.getLogger(__name__) -logger.setLevel(logging.DEBUG) - - -class FaviconRetriever: - def __init__(self, favicon_store, cache_dir: str): - self.cache_dir = pwd.joinpath(cache_dir) - self.cache_dir.mkdir(parents=True, exist_ok=True) - self.request_headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36' - } - self.favicon_store = favicon_store - - def make_request(self, url): - return requests.get(url, 
headers=self.request_headers, allow_redirects=True) - - def favicon_path(self, url): - favicon_filename = get_favicon_filename(url) - return os.path.join(self.cache_dir, favicon_filename) - - def find_favicon_url(self, url): - normalized_domain = normalize_domain(url) - for try_url in [url, normalized_domain]: - try: - response = self.make_request(try_url) - if response.status_code == 200: - soup = BeautifulSoup(response.text, 'html.parser') - icon_link = soup.find('link', rel=['icon', 'shortcut icon']) - if icon_link: - icon_url = icon_link['href'] - if not icon_url.startswith('http'): - icon_url = urljoin(url, icon_url) - return icon_url - except Exception as ex: - logger.error(f"Error: find_favicon_url({try_url}): {ex}") - - # if we made it here we have not found a favicon url - # lets check google - - icon_url = f'http://www.google.com/s2/favicons?domain={normalized_domain}' - response = self.make_request(icon_url) - if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image'): - with open(self.favicon_path(normalized_domain), 'wb') as file: - file.write(response.content) - self.favicon_store.save_processed_domain(normalized_domain, reason='found in google') - - return None - - def download_favicon(self, url): - logger.debug(f"download_favicon({url}) called") - icon_url = self.find_favicon_url(url) - if not icon_url: - logger.debug(f"Could not download_favicon({url}) no icon url found!") - return - - normalized_domain = normalize_domain(icon_url) - favicon_path = self.favicon_path(normalized_domain) - - try: - response = self.make_request(icon_url) - if response.status_code == 200 and response.headers.get('content-type', '').lower().startswith('image/'): - with open(favicon_path, 'wb') as file: - file.write(response.content) - self.favicon_store.save_processed_domain(normalized_domain, reason='success') - else: - self.favicon_store.save_processed_domain( - normalized_domain, - reason=f'response_code: {response.status_code} content-type: {response.headers.get("content-type", "")}' - ) - except Exception as ex: - self.favicon_store.save_processed_domain(normalized_domain, reason=f'{ex}') diff --git a/app/services/favicon_store.py b/app/services/favicon_store.py index a7963e2..c3eb35e 100644 --- a/app/services/favicon_store.py +++ b/app/services/favicon_store.py @@ -1,9 +1,8 @@ import os import re -import sqlite3 import logging -from services.favicon_utils import get_favicon_filename, normalize_domain -from services.favicon_retriever import FaviconRetriever +from services.redis_store import RedisStore +from services.favicon_utils import base, download_favicon, get_favicon_filename, normalize_domain from models.scheduler import Scheduler from models.utils import pwd @@ -12,51 +11,55 @@ class FaviconStore: - def __init__(self, cache_dir='static/assets/icons', db_path='configs/favicons.db'): - self.relative_cache_dir = cache_dir + def __init__(self, icon_dir='static/assets/icons'): + self.relative_icon_dir = icon_dir + self.icon_dir = pwd.joinpath(icon_dir).resolve() + self.icon_dir.mkdir(parents=True, exist_ok=True) - self.retriever = FaviconRetriever(self, cache_dir) + self.redis_store = RedisStore() self.ip_pattern = re.compile( - r"^(?:(?:https?://)?(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}" - r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?(?:\/)?$" + r"^(?:(?:https?://)?(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}" + r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?(?:\/)?$" ) - self.db_path = pwd.joinpath(db_path) - 
self.initializing_database() + @property + def scheduler(self): + return Scheduler.getScheduler() - def icon_path(self, url): + def icon_path(self, url) -> str: if not url: return None favicon_filename = get_favicon_filename(url) - favicon_relative_path = f"{self.relative_cache_dir}/{favicon_filename}" + favicon_relative_path = f"{self.relative_icon_dir}/{favicon_filename}" - if os.path.exists(favicon_relative_path): + if pwd.joinpath(favicon_relative_path).exists(): return f"/{favicon_relative_path}" else: return None def fetch_favicons_from(self, urls): + base_urls = sorted(set(map(lambda url: base(url), urls))) + processable_urls = set(filter(lambda url: self.should_processed(url), base_urls)) + # logger.debug(f"============================ {len(processable_urls)} processable urls") + self.scheduler.add_job( self._process_urls_for_favicons, - args=[urls], + args=[processable_urls], id='fetch_favicons', name='fetch_favicons', misfire_grace_time=None, replace_existing=False, max_instances=1, - coalesce=True, - executor='processpool' + coalesce=True ) def _process_urls_for_favicons(self, urls): - processable_urls = set(filter(lambda url: self.should_processed(url), urls)) - logger.debug(f"============================ {len(processable_urls)} processable urls") - for url in processable_urls: + for url in urls: name = f'_get_favicon_({url})' self.scheduler.add_job( - self.retriever.download_favicon, - args=[url], + download_favicon, + args=[url, self.icon_dir], id=name, name=name, misfire_grace_time=None, @@ -64,68 +67,9 @@ def _process_urls_for_favicons(self, urls): ) def should_processed(self, url): - result = not ( - not url - or bool(self.ip_pattern.match(url)) - or self.icon_path(url) - or self.is_domain_processed(url) + return not ( + not url + or bool(self.ip_pattern.match(url)) + or self.icon_path(url) + or self.redis_store.is_domain_processed(normalize_domain(url)) ) - # logger.debug(f"==============================================================") - # logger.debug(f"should_processed: {url}") - # logger.debug(f"ip: {bool(self.ip_pattern.match(url))}") - # logger.debug(f"path: {self.icon_path(url)}") - # logger.debug(f"processed: {self.is_domain_processed(url)}") - # logger.debug(f"result: {result}") - return result - - @property - def scheduler(self): - return Scheduler.getScheduler() - - def initializing_database(self): - try: - conn = sqlite3.connect(self.db_path) - c = conn.cursor() - c.execute('''CREATE TABLE IF NOT EXISTS processed_domains - (domain TEXT PRIMARY KEY, reason TEXT)''') - conn.commit() - conn.close() - except Exception as ex: - logger.error(f"Error initializing database {self.db_path}: {e}") - - def processed_domain_count(self): - try: - conn = sqlite3.connect(self.db_path) - c = conn.cursor() - c.execute("SELECT COUNT(*) FROM processed_domains") - result = c.fetchone() - conn.close() - return result[0] - except Exception as ex: - logger.error(f"Error in get_processed_domain_count_from_db(): {ex}") - return 0 - - def save_processed_domain(self, url, reason='completed'): - nomalized_domain = normalize_domain(url) - try: - conn = sqlite3.connect(self.db_path, check_same_thread=False) - c = conn.cursor() - c.execute("INSERT OR REPLACE INTO processed_domains (domain, reason) VALUES (?, ?)", (nomalized_domain, reason)) - conn.commit() - conn.close() - logger.info(f"Saved processed domain {nomalized_domain} with reason {reason}") - except Exception as ex: - logger.error(f"Error in save_processed_domain({nomalized_domain}): {ex}") - - def is_domain_processed(self, url): 
- nomalized_domain = normalize_domain(url) - try: - conn = sqlite3.connect(self.db_path) - c = conn.cursor() - c.execute("SELECT 1 FROM processed_domains WHERE domain = ?", [nomalized_domain]) - result = c.fetchone() - conn.close() - return bool(result) - except Exception as ex: - logger.error(f"Error checking is_domain_processed({nomalized_domain}): {ex}") - return False diff --git a/app/services/favicon_utils.py b/app/services/favicon_utils.py index fac805f..bf7a8d5 100644 --- a/app/services/favicon_utils.py +++ b/app/services/favicon_utils.py @@ -1,4 +1,12 @@ -from urllib.parse import urlparse +import logging +import os +import requests +from bs4 import BeautifulSoup +from services.redis_store import RedisStore +from urllib.parse import urljoin, urlparse + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) def normalize_domain(url): @@ -7,5 +15,73 @@ def normalize_domain(url): return url +def base(url): + try: + parsed_url = urlparse(url) + base_url = f"{parsed_url.scheme}://{parsed_url.netloc}" + return base_url + except Exception as ex: + logger.error(f"Error in base({url}): {ex}") + return url + + def get_favicon_filename(url): return f"{normalize_domain(url)}.favicon.ico" + + +def make_request(url): + request_headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36' + } + return requests.get(url, headers=request_headers, allow_redirects=True, timeout=5) + + +def favicon_path(icon_path, url): + favicon_filename = get_favicon_filename(url) + return os.path.join(icon_path, favicon_filename) + + +def find_favicon_from(url): + try: + response = make_request(url) + if response.status_code == 200: + soup = BeautifulSoup(response.text, 'html.parser') + icon_link = soup.find('link', rel=['icon', 'shortcut icon']) + if icon_link: + icon_url = icon_link['href'] + if not icon_url.startswith('http'): + icon_url = urljoin(url, icon_url) + return icon_url + except Exception as ex: + logger.debug(f"Error: find_favicon_url({url}): {ex}") + return None + + +def download_favicon(url, icon_dir): + icon_url = find_favicon_from(url) + if icon_url: + _download(url, icon_dir, icon_url) + else: + _download(url, icon_dir, f'http://www.google.com/s2/favicons?domain={normalize_domain(url)}') + + +def _download(url, icon_dir, icon_url): + redis_store = RedisStore() + try: + response = make_request(icon_url) + + if response.status_code == 200 and response.headers.get('content-type', '').lower().startswith('image/'): + filename = favicon_path(icon_dir, url) + with open(favicon_path(icon_dir, url), 'wb') as file: + file.write(response.content) + logger.debug(f"saving {url} as {filename}") + else: + redis_store.save_processed_domain( + normalize_domain(url), + reason=f'response_code: {response.status_code} content-type: {response.headers.get("content-type", "")}' + ) + logger.debug(f"issues {url} complete") + except Exception as ex: + redis_store.save_processed_domain(normalize_domain(url), reason=f'{ex}') + + logger.debug(f"_download({icon_url}) completed") diff --git a/app/services/redis_store.py b/app/services/redis_store.py new file mode 100644 index 0000000..f12bca1 --- /dev/null +++ b/app/services/redis_store.py @@ -0,0 +1,33 @@ +import logging +import os + +logger = logging.getLogger(__name__) +logger.setLevel(logging.DEBUG) + + +class RedisStore: + def __init__(self): + import redis + self.redis = redis.Redis(host=os.getenv('REDIS_HOST', 'redis')) + self.namespace = os.getenv('REDIS_NAMESPACE', 
f'onboard/{os.environ.get("FLASK_ENV", "development")}/favicons') + + def processed_domain_count(self): + try: + return self.redis.scard(f"{self.namespace}:processed_domains") + except Exception as ex: + logger.error(f"Error in get_processed_domain_count_from_db(): {ex}") + return 0 + + def save_processed_domain(self, normalized_domain, reason='completed'): + try: + self.redis.sadd(f"{self.namespace}:processed_domains", normalized_domain) + logger.info(f"Domain {normalized_domain} processed with reason {reason}") + except Exception as ex: + logger.error(f"Error in save_processed_domain({normalized_domain}): {ex}") + + def is_domain_processed(self, normalized_domain): + try: + return self.redis.sismember(f"{self.namespace}:processed_domains", normalized_domain) + except Exception as ex: + logger.error(f"Error checking is_domain_processed({normalized_domain}): {ex}") + return False diff --git a/app/templates/bookmark_bar.html b/app/templates/bookmark_bar.html index 3f858a5..9f7eb14 100644 --- a/app/templates/bookmark_bar.html +++ b/app/templates/bookmark_bar.html @@ -11,7 +11,7 @@ {% endwith %} {% else %} - {% set favicon_url = favicon_exists(bookmark.href) %} + {% set favicon_url = favicon_path(bookmark.href) %} {% if favicon_url %} {{ bookmark.name }} {% else %} diff --git a/requirements.txt b/requirements.txt index 947afdb..e1f7306 100755 --- a/requirements.txt +++ b/requirements.txt @@ -21,6 +21,7 @@ pillow python-dateutil python-dotenv pyyaml +redis requests unidecode url-normalize
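
The sketch below is not part of the patch; it is a minimal, illustrative wiring of the pieces this diff introduces, shown here because the change spans several files. The diff drops the SQLite-backed `processed_domains` table and the `FaviconRetriever` class in favor of a Redis set managed by the new `services/redis_store.py`, and moves favicon downloading into `services/favicon_utils.py`. The snippet runs the steps synchronously for clarity, whereas the real code path (`FaviconStore.fetch_favicons_from()`) schedules the same work through APScheduler. It assumes a reachable Redis instance at `REDIS_HOST` (default `redis`), that `./app` is on the import path, and placeholder URLs and icon directory.

```python
# Illustrative wiring of the Redis-backed favicon pipeline from this diff.
# Assumes a reachable Redis at REDIS_HOST (default "redis") and that ./app is
# on sys.path; the URLs and icon_dir below are placeholder values.
from services.favicon_utils import base, download_favicon, normalize_domain
from services.redis_store import RedisStore

redis_store = RedisStore()
icon_dir = "static/assets/icons"   # matches FaviconStore's default icon_dir

urls = [
    "https://krebsonsecurity.com/feed/",
    "https://www.phoronix.com/rss.php",
]

# Mirror FaviconStore.fetch_favicons_from(): reduce to unique base URLs first.
for url in sorted({base(u) for u in urls}):
    domain = normalize_domain(url)
    if redis_store.is_domain_processed(domain):
        continue  # already attempted; FaviconStore.should_processed() skips these
    # Writes <domain>.favicon.ico into icon_dir on success; failures are
    # recorded in the namespaced Redis "processed_domains" set.
    download_favicon(url, icon_dir)

print(f"{redis_store.processed_domain_count()} domains recorded as processed")
```

The move from a per-file SQLite database to a namespaced Redis set, together with swapping the process-pool executor for a thread pool in `app/models/scheduler.py`, appears aimed at sharing "already processed" state without cross-process database access; that reading is inferred from the diff, not stated in it.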