From 24c326a751a2dbc42e2b04e7e92aa74d326400b7 Mon Sep 17 00:00:00 2001 From: mike <219478+ilude@users.noreply.github.com> Date: Thu, 28 Mar 2024 14:22:06 -0400 Subject: [PATCH] implement async requests to feed to increase performance on cold cache --- app/app.py | 110 +++++++++++++++++++++++++-------------- app/configs/layout.yml | 88 ++++++++++++++++++------------- app/rss.py | 51 ++++++++++++++++-- app/static/css/index.css | 33 ++++++++++++ app/static/js/script.js | 56 ++++++++++++++++++++ app/templates/index.html | 37 +++++++++++-- app/utils.py | 60 +++++++++------------ requirements.txt | 5 +- 8 files changed, 321 insertions(+), 119 deletions(-) create mode 100644 app/static/js/script.js diff --git a/app/app.py b/app/app.py index 4c790c2..69e0341 100644 --- a/app/app.py +++ b/app/app.py @@ -1,66 +1,100 @@ import os +import yaml +import asyncio from datetime import datetime -import feedparser -from datetime import datetime -from flask import Flask, render_template +from flask import Flask, request, render_template from flask_caching import Cache -from post_processor import post_processor -from utils import clean_html, copy_default_to_configs, load_file +from utils import copy_default_to_configs, load_file +from rss import rss copy_default_to_configs() app = Flask(__name__) -# 600 seconds = 10 minutes -cache = Cache(app, config={ - 'CACHE_TYPE': 'simple', - 'CACHE_DEFAULT_TIMEOUT': 600 -}) +if os.environ.get("FLASK_DEBUG", "False") == "True": + cache_config={ + 'CACHE_TYPE': 'null' + } +else: + # 600 seconds = 10 minutes + cache_config={ + 'CACHE_TYPE': 'simple', + 'CACHE_DEFAULT_TIMEOUT': 600 + } + +cache = Cache(app, config=cache_config) @app.context_processor def inject_current_date(): return {'today_date': datetime.now()} +@app.route('/save_tab_name', methods=['POST']) +def save_tab_name(): + data = request.get_json() + tab_name = data.get('tab_name') + tab_index = data.get('tab_index') + column_count = data.get('column_count') + + if tab_name and column_count >= 1 and column_count <= 6: + with open('configs/layout.yml', 'r') as file: + layout = yaml.safe_load(file) + + tabs = layout['tabs'] + + if tab_index is not None: + # Edit an existing tab + tabs[tab_index]['name'] = tab_name + tabs[tab_index]['columns'] = column_count + else: + # Add a new tab + tabs.append({'name': tab_name, 'columns': column_count, 'widgets': []}) + + with open('configs/layout.yml', 'w') as file: + yaml.safe_dump(layout, file) + + return {'message': f'Tab name "{tab_name}" with {column_count} columns saved successfully'} + else: + return {'error': 'Invalid tab name or column count'}, 400 + # Define route to render the template @app.route('/') +@app.route('/') @cache.cached(timeout=600) -def index(): +async def index(tab_name=None): # Load feeds and bookmarks layout = load_file('layout.yml', cache) headers = layout['headers'] - widgets = layout['widgets'] - # Divide feeds into three columns - columns = [[], [], []] - + tabs = layout['tabs'] + if tab_name is None: + tab = tabs[0] + else: + tab = next((tab for tab in tabs if tab["name"].lower() == tab_name.lower()), tabs[0]) + current_tab = tab['name'] + + column_count = tab['columns'] + columns = [[] for _ in range(column_count)] + + tasks = [] + # Add feeds to the appropriate column - for widget in widgets: - column_index = (widget['column'] - 1) % 3 - if widget['type'] == 'feed': - parsed_feed = feedparser.parse(widget['url']) - parsed_item = { - 'title': widget['name'], - 'link': widget['link'], - 'type': widget['type'], - 'summary_enabled': bool(widget.get('summary', True)), - 'articles': [{ - 'title': " ".join(entry.get('title', 'No Title').split()).strip() , - 'link': entry.link, - 'summary': clean_html(entry.get('summary', ''))} for entry in parsed_feed.entries[:10]] if 'entries' in parsed_feed else [] - } - parsed_item = post_processor.process(parsed_item['title'], parsed_item) - columns[column_index].append(parsed_item) - elif widget['type'] == 'bookmarks': - columns[column_index].append({ - 'title': widget['name'], - 'type': widget['type'], - 'articles': [{'title': entry['title'], 'link': entry['url']} for entry in widget['bookmarks']] - }) + if tab['widgets']: + for widget in tab['widgets']: + column_index = (widget['column'] - 1) % column_count + if widget['type'] == 'feed': + tasks.append(asyncio.create_task(rss.load_feed(widget, columns[column_index]))) + elif widget['type'] == 'bookmarks': + widget['articles'] = [{'title': entry['title'], 'link': entry['url']} for entry in widget['bookmarks']] + columns[column_index].append(widget) + await asyncio.wait(tasks) + for column in columns: + column.sort(key = lambda x: x['position']) + # Pass column data to the template - return render_template('index.html', columns=columns, headers=headers) + return render_template('index.html', tabs=tabs, columns=columns, headers=headers, current_tab=current_tab) if __name__ == '__main__': port = int(os.environ.get("ONBOARD_PORT", 9830)) diff --git a/app/configs/layout.yml b/app/configs/layout.yml index 74725d8..ed3e878 100644 --- a/app/configs/layout.yml +++ b/app/configs/layout.yml @@ -10,27 +10,34 @@ headers: - name: "Youtube" link: "https://www.youtube.com/" -widgets: - - name: "Lawrence Person's BattleSwarm Blog" - type: "feed" - link: "https://www.battleswarmblog.com/" - url: "https://www.battleswarmblog.com/?feed=rss2" - column: 1 - - name: "Cafe Hayek" - type: "feed" - link: "https://cafehayek.com/" - url: "https://cafehayek.com/feed" - column: 1 - - name: "Slashdot" - type: "feed" - link: "https://slashdot.org/" - url: "https://rss.slashdot.org/Slashdot/slashdotMain" - column: 1 - summary: false - - name: "Bookmarks" - type: "bookmarks" - column: 2 - bookmarks: +tabs: + - name: "Home" + columns: 3 + widgets: + - name: "Lawrence Person's BattleSwarm Blog" + type: "feed" + link: "https://www.battleswarmblog.com/" + url: "https://www.battleswarmblog.com/?feed=rss2" + column: 1 + position: 1 + - name: "Cafe Hayek" + type: "feed" + link: "https://cafehayek.com/" + url: "https://cafehayek.com/feed" + column: 1 + position: 2 + - name: "Slashdot" + type: "feed" + summary_enabled: False + link: "https://slashdot.org/" + url: "https://rss.slashdot.org/Slashdot/slashdotMain" + column: 1 + position: 3 + - name: "Bookmarks" + type: "bookmarks" + column: 2 + position: 1 + bookmarks: - title: Ali Express url: https://www.aliexpress.com/ - title: Amazon @@ -65,18 +72,27 @@ widgets: url: https://tailscale.com/ - title: Thingiverse url: https://www.thingiverse.com/ - - name: "Real Clear Politics" - type: "feed" - link: "https://www.realclearpolitics.com/" - url: "https://www.realclearpolitics.com/index.xml" - column: 2 - - name: "Instapundit" - type: "feed" - link: "https://instapundit.com/" - url: "https://instapundit.com/feed/" - column: 3 - - name: "Twitchy" - type: "feed" - link: "https://twitchy.com/" - url: "https://twitchy.com/feed" - column: 3 \ No newline at end of file + - name: "Real Clear Politics" + type: "feed" + link: "https://www.realclearpolitics.com/" + url: "https://www.realclearpolitics.com/index.xml" + column: 2 + position: 2 + - name: "Instapundit" + type: "feed" + link: "https://instapundit.com/" + url: "https://instapundit.com/feed/" + column: 3 + position: 1 + - name: "Twitchy" + type: "feed" + link: "https://twitchy.com/" + url: "https://twitchy.com/feed" + column: 3 + position: 2 + - name: "More" + columns: 3 + widgets: + - name: "Monitoring" + columns: 3 + widgets: \ No newline at end of file diff --git a/app/rss.py b/app/rss.py index e932ba1..4ca5d8c 100644 --- a/app/rss.py +++ b/app/rss.py @@ -1,11 +1,53 @@ +import time +import aiohttp import feedparser import html import requests -from bs4 import BeautifulSoup +import re + +from post_processor import post_processor +from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning +import warnings + +warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning) class Rss: - def clean_html(self, text): - return BeautifulSoup( html.unescape(text), 'lxml').get_text() + + def clean_html(self, text: str) -> str: + """ + Removes HTML tags, decode HTML entities, and strip leading and trailing + whitespace from the given text. + + Args: + text (str): The text to clean. + + Returns: + str: The cleaned text. + """ + text = text.replace('\n', ' ').replace('\r', ' ').strip() + + if not text: + return text + + text = BeautifulSoup(html.unescape(text), 'lxml').text + text = re.sub(r'\[.*?\].*$', '', text) + # text = re.sub(r'http[s]?://\S+', '', text, flags=re.IGNORECASE) + # text = ' '.join([x.capitalize() for x in text.split(' ')]) + return text + + async def load_feed(self, widget, column): + start_time = time.time() + async with aiohttp.ClientSession() as session: + async with session.get(widget['url']) as response: + parsed_feed = feedparser.parse(await response.text()) + widget['summary_enabled'] = widget.get('summary_enabled', True) + widget['articles'] = [{ + 'title': " ".join(entry.get('title', 'No Title').split()).strip() , + 'link': entry.link, + 'summary': self.clean_html(entry.get('summary', ''))} for entry in parsed_feed.entries[:10]] if 'entries' in parsed_feed else [] + widget = post_processor.process(widget['name'], widget) + column.append(widget) + return (time.time() - start_time) def find_feed_links(self, url): response = requests.get(url) @@ -23,6 +65,9 @@ def find_feed_links(self, url): print(f"Failed to retrieve content from {url}") return None +rss = Rss() + + if __name__ == "__main__": webpage_url = "https://blog.langchain.dev/automating-web-research/"# input("Enter the URL of the webpage: ") diff --git a/app/static/css/index.css b/app/static/css/index.css index 9e510c2..24c3a32 100644 --- a/app/static/css/index.css +++ b/app/static/css/index.css @@ -217,3 +217,36 @@ ul li:last-child { color: #ddd; } +.new-tab-btn, .edit-tab-btn { + float: right; + margin-left: 10px; +} + +.tab-buttons { + float: right; +} + +.new-tab-btn, .edit-tab-btn { + margin-left: 10px; +} + +.modal { + display: none; + position: fixed; + z-index: 1; + left: 0; + top: 0; + width: 100%; + height: 100%; + overflow: auto; + background-color: rgba(0, 0, 0, 0.4); +} + +.modal-content { + background-color: #fefefe; + margin: 15% auto; + padding: 20px; + border: 1px solid #888; + width: 30%; +} + diff --git a/app/static/js/script.js b/app/static/js/script.js new file mode 100644 index 0000000..3bdcbb1 --- /dev/null +++ b/app/static/js/script.js @@ -0,0 +1,56 @@ +$(document).ready(function() { + // Get the modal elements + const $modal = $('.modal'); + const $tabNameInput = $('#tabNameInput'); + const $columnSelect = $('#columnSelect'); + const $saveBtn = $('#saveBtn'); + + // Open the modal + function openModal(tabName='', editIndex = -1) { + $tabNameInput.val(tabName); + $saveBtn.off('click').on('click', function() { saveTabName(editIndex) }); + $modal.show(); + } + + // Save the tab name + function saveTabName(editIndex) { + const newTabName = $tabNameInput.val().trim(); + if (newTabName) { + const tabIndex = editIndex !== -1 ? editIndex : null; + + // Make an AJAX request to the Flask route + $.ajax({ + url: '/save_tab_name', + type: 'POST', + data: JSON.stringify({ 'tab_name': newTabName, 'tab_index': tabIndex }), + contentType: 'application/json; charset=utf-8', + dataType: 'json', + success: function(response) { + console.log('Tab name saved:', response.message); + // Reload the page or update the UI as needed + }, + error: function(xhr, status, error) { + console.error('Error saving tab name:', error); + } + }); + } + $modal.hide(); + $tabNameInput.val(''); + } + + // Close the modal when clicking outside of it + $(window).click(function(event) { + if (event.target === $modal[0]) { + $modal.hide(); + } + }); + + // Add event listeners to the buttons + $('.new-tab-btn').click(function() { openModal(); }); + $('.edit-tab-btn').click(function() { + active_tab = $('.tab-bar a').filter(function() { return $(this).data('current') === 'True'; }).first(); + + openModal(active_tab.text(), active_tab.data('index')); + }); + +}); \ No newline at end of file diff --git a/app/templates/index.html b/app/templates/index.html index 81d67e0..02e7d89 100644 --- a/app/templates/index.html +++ b/app/templates/index.html @@ -6,6 +6,8 @@ OnBoard + + @@ -20,9 +22,34 @@
- Tab 1 - Tab 2 - Tab 3 + {% for tab in tabs %} + {{tab.name}} + {% endfor %} +
+ + +
+
{% for column in columns %} @@ -30,7 +57,7 @@ {% for feed in column %}
    {% for article in feed.articles %} @@ -42,7 +69,7 @@ No Title {% endif %} - {% if feed.summary_enabled and article.summary and article.summary >= article.title %} + {% if feed.summary_enabled and article.summary %}
    {{ article.summary }}
    {% endif %} diff --git a/app/utils.py b/app/utils.py index 17b0043..93f8ab2 100644 --- a/app/utils.py +++ b/app/utils.py @@ -1,12 +1,7 @@ -import html import os -import re import shutil - import yaml -from bs4 import BeautifulSoup - def copy_default_to_configs(): pwd = os.path.dirname(os.path.abspath(__file__)) default_dir = os.path.join(pwd, 'defaults') @@ -27,33 +22,28 @@ def copy_default_to_configs(): print(f"Default files synced from {default_dir} to {config_dir}.") else: print(f"No files copied from {default_dir} to {config_dir}.") - -def clean_html(text): - text = text.replace('\n', ' ').replace('\r', ' ') - text = BeautifulSoup(html.unescape(text), 'lxml').text - text = re.sub(r'\[.*?\].*$', '', text) - # text = re.sub(r'http[s]?://\S+', '', text, flags=re.IGNORECASE) - # text = ' '.join([x.capitalize() for x in text.split(' ')]) - - return text.strip() - -global last_modified_times -last_modified_times = {} - -def load_file(file_name, cache): - # Adjust file path for the configs subdirectory - current_working_directory = os.path.dirname(os.path.realpath(__file__)) - file_path = os.path.join(current_working_directory, 'configs', file_name) - - # Check the last modification time of the file - current_modified_time = os.path.getmtime(file_path) - current_data = cache.get(file_path) - - # If the file has been modified since the last check, reload it - if current_modified_time != last_modified_times.get(file_path) or not current_data: - last_modified_times[file_path] = current_modified_time - with open(file_path, 'r') as file: - current_data = yaml.safe_load(file) - cache.set(file_path, current_data) - - return current_data \ No newline at end of file + +class FileData: + def __init__(self, last_modified=0, contents=None): + self.last_modified = last_modified + self.contents = contents + + def __getitem__(self, key, default=None): + return self.contents.get(key, default) + +file_cache = {} + +def load_file(file_name, cache=None): + current_working_directory = os.path.dirname(os.path.realpath(__file__)) + file_path = os.path.join(current_working_directory, 'configs', file_name) + + # Check the last modification time of the file + current_modified_time = os.path.getmtime(file_path) + + # Only load the file if it has been modified since the last check or if there is no value for that file in the dict + if current_modified_time > file_cache.get(file_path, FileData()).last_modified or file_path not in file_cache: + with open(file_path, 'r') as file: + contents = yaml.safe_load(file) + file_cache[file_path] = FileData(current_modified_time, contents) + + return file_cache[file_path].contents diff --git a/requirements.txt b/requirements.txt index b1321c2..49ece52 100755 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,12 @@ argcomplete bs4 docker -flask +flask[async] flask-caching feedparser lxml python-dotenv pyyaml requests -hypercorn==0.15.0 \ No newline at end of file +hypercorn==0.15.0 +aiohttp[speedups] \ No newline at end of file