Skip to content

Commit

Permalink
allow ai to be optional
Browse files Browse the repository at this point in the history
  • Loading branch information
ilude committed Mar 30, 2024
1 parent 4737cb0 commit e806dcf
Show file tree
Hide file tree
Showing 5 changed files with 80 additions and 35 deletions.
5 changes: 5 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
{
"cSpell.words": [
"Instapundit",
"langchain",
"llms",
"Monero",
"ollama",
"Ollama",
"OLLAMA",
"Slickdeals",
"Tailscale",
"Thingiverse"
Expand Down
51 changes: 29 additions & 22 deletions app/post_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,10 @@ def to_snake_case(self, input_string):
return snake_case_string

def process(self, widget):

if 'processed' in widget and widget['processed'] and not bool(os.environ.get('FLASK_DEBUG')):
print (f"Widget {widget['name']} already processed.")
return widget

self.normalize(widget)

# Check if the class has already been loaded
Expand All @@ -47,29 +50,33 @@ def process(self, widget):
self.loaded_classes[class_name] = instance

# Call process() method of the instance with the provided data
result = instance.process(widget)
return result
widget = instance.process(widget)
widget['processed'] = True
return widget

def normalize(self, widget):
    """Rebuild each article's title/summary from the preserved originals and clean them.

    Mutates ``widget['articles']`` in place. Working from ``original_title`` /
    ``original_summary`` keeps re-processing of a cached widget idempotent.
    Articles with a falsy ``original_summary`` get no ``summary`` key at all
    (pre-existing contract — downstream must tolerate the missing key).
    """
    for article in widget['articles']:
        # Title: collapse all runs of whitespace to single spaces.
        article['title'] = article['original_title'].strip()
        article['title'] = re.sub(r'\s+', ' ', article['title'])

        if not article['original_summary']:
            continue
        article['summary'] = article['original_summary']

        article['summary'] = article['summary'].replace('\n', ' ').replace('\r', ' ').strip()
        article['summary'] = BeautifulSoup(html.unescape(article['summary']), 'lxml').text
        # Strip a trailing feed-truncation marker "[...]" / "[…]" and anything after it.
        # fix: the old pattern r'\[[\.+|…\]].*$' used a malformed character class
        # ('+' and '|' are literal inside [...]), so it also truncated summaries at
        # unrelated bracketed text such as "[+1]".
        article['summary'] = re.sub(r'\s*\[(?:\.{3}|…)\].*$', '', article['summary'])

        if not article['summary']:
            # fix: an empty cleaned summary used to fall into the
            # "summary in title" branch below ('' is a substring of anything)
            # and wrongly null out the title.
            article['summary'] = None
            continue

        if article['summary'] == article['title']:
            # Summary adds nothing over the title.
            article['summary'] = None
        elif article['title'] in article['summary'] and len(article['title']) / len(article['summary']) > 0.64:
            # Title is most of the summary: promote the fuller summary to title.
            article['title'] = article['summary']
            article['summary'] = None
        elif article['summary'] in article['title']:
            # Summary is contained in the title: keep only the title text.
            article['summary'] = article['title']
            article['title'] = None

# Instantiate loader when the module is imported
post_processor = PostProcessor()
38 changes: 35 additions & 3 deletions app/processors/instapundit.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,44 @@
import os
import re

from langchain_community.llms import Ollama
from langchain.prompts import ChatPromptTemplate

class Instapundit:
    """Feed post-processor for Instapundit: drops affiliate/open-thread posts and
    tidies titles, optionally rewriting them via an Ollama-hosted LLM."""

    def __init__(self):
        # AI is optional: the chain is only built when OLLAMA_URL is set.
        # fix: always define self.chain so process() does not raise
        # AttributeError when the environment variable is absent.
        self.chain = None
        ollama_url = os.getenv('OLLAMA_URL')
        if ollama_url:
            # NOTE(review): prompt text reproduced verbatim (including typos such
            # as "you are and news") — changing it would change LLM behavior.
            _prompt = ChatPromptTemplate.from_messages([
                ("human", """
        Title: {title}
        Summary: {summary}
      """),
                ("system", """
        you are and news article title editor that reviews and provides a concise and accurate title when given
        an existing Title and article Summary.
        Remove all links from the title.
        Title should be as short as possible, aim to be less that 70 characters long.
        Title should have an absolute minimum of punctuation.
        Do your best to keep the existing title if possible.
        DO NOT provide any additional text or thoughts before or after the title.
        DO NOT put notes in parentheses.
        Provide the title only!
      """),
            ])

            _model = Ollama(base_url=ollama_url, model="dolphin-mistral", keep_alive=5, temperature=0.0)

            self.chain = _prompt | _model

    def process(self, widget):
        """Filter and clean the widget's articles in place; return the widget."""
        # Iterate over a shallow copy so removing from the live list is safe.
        for article in widget['articles'][:]:
            # Drop affiliate-link posts and recurring open-thread posts.
            if '#CommissionEarned' in article['title'] or re.search('Open Thread', article['title'], re.IGNORECASE):
                widget['articles'].remove(article)
                # fix: was a bare `next` (a no-op expression referencing the
                # builtin), which let just-removed articles fall through and
                # still be processed below.
                continue
            # Remove inline links from the visible title.
            article['title'] = re.sub(r'http[s]?://\S+', '', article['title'], flags=re.IGNORECASE)
            if self.chain:
                # Ask the LLM for a tighter title, built from the original fields.
                title = self.chain.invoke({"title": article['original_title'], "summary": article['original_summary']})
                # strip('""') was a duplicate-character set; '"' is equivalent.
                article['title'] = title.strip().strip('"')
            else:
                article['title'] = article['title'].strip().strip('"')

        return widget
15 changes: 7 additions & 8 deletions app/rss.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,7 @@
import aiohttp
from cachelib import FileSystemCache
import feedparser
import html
import requests
import re

from post_processor import post_processor
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
Expand All @@ -26,7 +24,7 @@ async def load_feed(self, widget):
cached_widget = self.feed_cache.get(widget['name'])

# check if feed is in self.feeds and that the last updated time is less than 15 minutes ago
if cached_widget and (start_time - cached_widget['last_updated']) < 60 * 15:
if cached_widget and 'last_updated' in cached_widget and (start_time - cached_widget['last_updated']) < 60 * 15:
widget['articles'] = cached_widget['articles']
# print(f"Loaded {widget['name']} from cache")
else:
Expand All @@ -42,15 +40,16 @@ async def load_feed(self, widget):
parsed_feed = feedparser.parse(await response.text())

widget['articles'] = [{
'title': entry.get('title', 'No Title').strip() ,
'original_title': entry.get('title', 'No Title').strip(),
'link': entry.link,
'summary': entry.get('summary', None)
} for entry in parsed_feed.entries[:article_limit]] if 'entries' in parsed_feed else []
'original_summary': entry.get('summary', None)
} for entry in parsed_feed.entries[:article_limit]] if 'entries' in parsed_feed else []

widget['last_updated'] = start_time
self.feed_cache.set(widget['name'], widget)

widget = post_processor.process(widget)

post_processor.process(widget)
self.feed_cache.set(widget['name'], widget)

return (time.time() - start_time)

Expand Down
6 changes: 4 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
argcomplete
bs4
docker
feedparser
flask[async]
flask-caching
Flask-Minify
feedparser
langchain
langchain-community
lxml
python-dotenv
pyyaml
requests
hypercorn==0.15.0
aiohttp[speedups]
aiohttp[speedups]

0 comments on commit e806dcf

Please sign in to comment.