Skip to content

Commit

Permalink
implement async requests to feed to increase performance on cold cache
Browse files Browse the repository at this point in the history
  • Loading branch information
ilude committed Mar 28, 2024
1 parent 802ad8e commit 24c326a
Show file tree
Hide file tree
Showing 8 changed files with 321 additions and 119 deletions.
110 changes: 72 additions & 38 deletions app/app.py
Original file line number Diff line number Diff line change
@@ -1,66 +1,100 @@
import os
import yaml
import asyncio
from datetime import datetime

import feedparser
from datetime import datetime
from flask import Flask, render_template
from flask import Flask, request, render_template
from flask_caching import Cache
from post_processor import post_processor

from utils import clean_html, copy_default_to_configs, load_file
from utils import copy_default_to_configs, load_file
from rss import rss

copy_default_to_configs()

app = Flask(__name__)

# Disable caching while debugging so changes are visible on every request.
# Flask treats any FLASK_DEBUG value other than "", "0", "false" or "no"
# (case-insensitive) as enabled, so mirror that here rather than matching only
# the literal string "True", which misses the common FLASK_DEBUG=1.
_debug_flag = os.environ.get("FLASK_DEBUG", "").strip().lower()
if _debug_flag not in ("", "0", "false", "no"):
    cache_config = {
        'CACHE_TYPE': 'null'
    }
else:
    # 600 seconds = 10 minutes
    cache_config = {
        'CACHE_TYPE': 'simple',
        'CACHE_DEFAULT_TIMEOUT': 600
    }

cache = Cache(app, config=cache_config)

@app.context_processor
def inject_current_date():
    """Make the current date/time available to templates as ``today_date``."""
    current_moment = datetime.now()
    return dict(today_date=current_moment)

@app.route('/save_tab_name', methods=['POST'])
def save_tab_name():
    """Create a tab, or rename/resize an existing one, in ``configs/layout.yml``.

    Expects a JSON object with ``tab_name`` (non-empty string),
    ``column_count`` (integer from 1 to 6) and an optional ``tab_index``.
    When ``tab_index`` is given, that tab's name and column count are
    overwritten; otherwise a new tab with no widgets is appended.

    Returns:
        A dict with a success ``message``, or a dict with an ``error`` and
        HTTP status 400 when the payload is invalid.
    """
    # silent=True: a missing or malformed JSON body yields None instead of
    # raising, so it falls through to the 400 response below.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    tab_name = data.get('tab_name')
    tab_index = data.get('tab_index')
    column_count = data.get('column_count')

    valid_name = isinstance(tab_name, str) and bool(tab_name.strip())
    # bool is a subclass of int, so exclude it explicitly. A missing or
    # non-numeric column_count must never reach the range comparison.
    valid_columns = (
        isinstance(column_count, int)
        and not isinstance(column_count, bool)
        and 1 <= column_count <= 6
    )
    if not (valid_name and valid_columns):
        return {'error': 'Invalid tab name or column count'}, 400

    with open('configs/layout.yml', 'r') as file:
        layout = yaml.safe_load(file)

    tabs = layout['tabs']

    if tab_index is not None:
        valid_index = (
            isinstance(tab_index, int)
            and not isinstance(tab_index, bool)
            and 0 <= tab_index < len(tabs)
        )
        if not valid_index:
            # Reject before touching the file so a bad index cannot corrupt
            # the layout or surface as a 500.
            return {'error': 'Invalid tab index'}, 400
        # Edit an existing tab
        tabs[tab_index]['name'] = tab_name
        tabs[tab_index]['columns'] = column_count
    else:
        # Add a new tab
        tabs.append({'name': tab_name, 'columns': column_count, 'widgets': []})

    with open('configs/layout.yml', 'w') as file:
        yaml.safe_dump(layout, file)

    return {'message': f'Tab name "{tab_name}" with {column_count} columns saved successfully'}

# Define route to render the template
@app.route('/')
@app.route('/<tab_name>')
@cache.cached(timeout=600)
async def index(tab_name=None):
    """Render the dashboard page for one tab.

    Args:
        tab_name: Name of the tab to display, matched case-insensitively.
            The first configured tab is used when it is omitted or does not
            match any tab.

    Returns:
        The rendered ``index.html`` template.
    """
    # Load feeds and bookmarks
    layout = load_file('layout.yml', cache)
    headers = layout['headers']

    tabs = layout['tabs']
    if tab_name is None:
        tab = tabs[0]
    else:
        tab = next((tab for tab in tabs if tab["name"].lower() == tab_name.lower()), tabs[0])
    current_tab = tab['name']

    column_count = tab['columns']
    columns = [[] for _ in range(column_count)]

    tasks = []

    # Add feeds to the appropriate column.
    # A tab declared with an empty `widgets:` key loads as None, hence `or []`.
    for widget in tab.get('widgets') or []:
        column_index = (widget['column'] - 1) % column_count
        if widget['type'] == 'feed':
            # Feeds are fetched concurrently; each task appends its widget
            # to the target column when it finishes.
            tasks.append(asyncio.create_task(rss.load_feed(widget, columns[column_index])))
        elif widget['type'] == 'bookmarks':
            widget['articles'] = [{'title': entry['title'], 'link': entry['url']} for entry in widget['bookmarks']]
            columns[column_index].append(widget)

    # asyncio.wait() raises ValueError when given an empty collection, which
    # happens for tabs that contain no feed widgets.
    if tasks:
        await asyncio.wait(tasks)

    # Feed tasks finish in arbitrary order, so restore the configured order.
    for column in columns:
        column.sort(key=lambda x: x['position'])

    # Pass column data to the template
    return render_template('index.html', tabs=tabs, columns=columns, headers=headers, current_tab=current_tab)

if __name__ == '__main__':
port = int(os.environ.get("ONBOARD_PORT", 9830))
Expand Down
88 changes: 52 additions & 36 deletions app/configs/layout.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,27 +10,34 @@ headers:
- name: "Youtube"
link: "https://www.youtube.com/"

widgets:
- name: "Lawrence Person's BattleSwarm Blog"
type: "feed"
link: "https://www.battleswarmblog.com/"
url: "https://www.battleswarmblog.com/?feed=rss2"
column: 1
- name: "Cafe Hayek"
type: "feed"
link: "https://cafehayek.com/"
url: "https://cafehayek.com/feed"
column: 1
- name: "Slashdot"
type: "feed"
link: "https://slashdot.org/"
url: "https://rss.slashdot.org/Slashdot/slashdotMain"
column: 1
summary: false
- name: "Bookmarks"
type: "bookmarks"
column: 2
bookmarks:
tabs:
- name: "Home"
columns: 3
widgets:
- name: "Lawrence Person's BattleSwarm Blog"
type: "feed"
link: "https://www.battleswarmblog.com/"
url: "https://www.battleswarmblog.com/?feed=rss2"
column: 1
position: 1
- name: "Cafe Hayek"
type: "feed"
link: "https://cafehayek.com/"
url: "https://cafehayek.com/feed"
column: 1
position: 2
- name: "Slashdot"
type: "feed"
summary_enabled: False
link: "https://slashdot.org/"
url: "https://rss.slashdot.org/Slashdot/slashdotMain"
column: 1
position: 3
- name: "Bookmarks"
type: "bookmarks"
column: 2
position: 1
bookmarks:
- title: Ali Express
url: https://www.aliexpress.com/
- title: Amazon
Expand Down Expand Up @@ -65,18 +72,27 @@ widgets:
url: https://tailscale.com/
- title: Thingiverse
url: https://www.thingiverse.com/
- name: "Real Clear Politics"
type: "feed"
link: "https://www.realclearpolitics.com/"
url: "https://www.realclearpolitics.com/index.xml"
column: 2
- name: "Instapundit"
type: "feed"
link: "https://instapundit.com/"
url: "https://instapundit.com/feed/"
column: 3
- name: "Twitchy"
type: "feed"
link: "https://twitchy.com/"
url: "https://twitchy.com/feed"
column: 3
- name: "Real Clear Politics"
type: "feed"
link: "https://www.realclearpolitics.com/"
url: "https://www.realclearpolitics.com/index.xml"
column: 2
position: 2
- name: "Instapundit"
type: "feed"
link: "https://instapundit.com/"
url: "https://instapundit.com/feed/"
column: 3
position: 1
- name: "Twitchy"
type: "feed"
link: "https://twitchy.com/"
url: "https://twitchy.com/feed"
column: 3
position: 2
- name: "More"
columns: 3
widgets:
- name: "Monitoring"
columns: 3
widgets:
51 changes: 48 additions & 3 deletions app/rss.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,53 @@
import time
import aiohttp
import feedparser
import html
import requests
from bs4 import BeautifulSoup
import re

from post_processor import post_processor
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import warnings

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

class Rss:
def clean_html(self, text: str) -> str:
    """
    Remove HTML tags, decode HTML entities, and strip leading and trailing
    whitespace from the given text.

    Line breaks are first replaced with spaces. After the markup is removed,
    everything from the first square-bracketed segment (for example "[...]")
    to the end of the text is dropped.

    Args:
        text (str): The text to clean.
    Returns:
        str: The cleaned text.
    """
    text = text.replace('\n', ' ').replace('\r', ' ').strip()

    if not text:
        return text

    text = BeautifulSoup(html.unescape(text), 'lxml').text
    text = re.sub(r'\[.*?\].*$', '', text)
    # Removing tags or the bracketed tail can leave whitespace at either end.
    return text.strip()

async def load_feed(self, widget, column):
    """Fetch and parse one feed widget, then append it to ``column``.

    If the feed cannot be downloaded or decoded, the widget is still
    appended, with an empty article list, so it does not vanish from the
    page.

    Args:
        widget: Feed widget dict providing ``url`` and ``name``. It is
            updated in place with ``summary_enabled`` and ``articles``.
        column: List that receives the processed widget.

    Returns:
        float: Seconds spent loading the feed.
    """
    import asyncio  # local import: only needed for the timeout exception type

    # monotonic() is unaffected by system clock adjustments, unlike time().
    start_time = time.monotonic()
    entries = []
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(widget['url']) as response:
                parsed_feed = feedparser.parse(await response.text())
        entries = parsed_feed.entries[:10] if 'entries' in parsed_feed else []
    except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError) as error:
        print(f"Failed to load feed {widget['url']}: {error}")

    widget['summary_enabled'] = widget.get('summary_enabled', True)
    widget['articles'] = [{
        'title': " ".join(entry.get('title', 'No Title').split()).strip(),
        # .get() so one entry without a link does not discard the whole feed
        'link': entry.get('link', ''),
        'summary': self.clean_html(entry.get('summary', ''))} for entry in entries]
    widget = post_processor.process(widget['name'], widget)
    column.append(widget)
    return (time.monotonic() - start_time)

def find_feed_links(self, url):
response = requests.get(url)
Expand All @@ -23,6 +65,9 @@ def find_feed_links(self, url):
print(f"Failed to retrieve content from {url}")
return None

rss = Rss()



if __name__ == "__main__":
webpage_url = "https://blog.langchain.dev/automating-web-research/"# input("Enter the URL of the webpage: ")
Expand Down
33 changes: 33 additions & 0 deletions app/static/css/index.css
Original file line number Diff line number Diff line change
Expand Up @@ -217,3 +217,36 @@ ul li:last-child {
color: #ddd;
}

/* New-tab and edit-tab buttons: floated right with a gap on their left. */
.new-tab-btn, .edit-tab-btn {
  float: right;
  margin-left: 10px;
}

/* Container for the tab buttons, floated right. */
.tab-buttons {
  float: right;
}

/* Modal overlay: hidden by default (display: none). It is fixed at the
   top-left corner and sized to 100% width and height, with a
   semi-transparent black background; overflowing content scrolls. */
.modal {
  display: none;
  position: fixed;
  z-index: 1;
  left: 0;
  top: 0;
  width: 100%;
  height: 100%;
  overflow: auto;
  background-color: rgba(0, 0, 0, 0.4);
}

/* Modal dialog box: 30% wide, centered horizontally by the auto side
   margins, with a 15% top and bottom margin. */
.modal-content {
  background-color: #fefefe;
  margin: 15% auto;
  padding: 20px;
  border: 1px solid #888;
  width: 30%;
}

Loading

0 comments on commit 24c326a

Please sign in to comment.