From 09c820b01135e276f68fceeef339312792fe7712 Mon Sep 17 00:00:00 2001
From: Elehiggle <Elehiggle@users.noreply.github.com>
Date: Sat, 25 May 2024 00:56:06 +0200
Subject: [PATCH] Improve website text extractor

---
 chatbot.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/chatbot.py b/chatbot.py
index 8511405..505e23b 100644
--- a/chatbot.py
+++ b/chatbot.py
@@ -1045,6 +1045,12 @@ def request_link_text_content(link, prev_response):
         soup = BeautifulSoup(raw_content, "html.parser")
         website_content = soup.get_text(" | ", strip=True)
 
+    # TODO: replace this character-count check with a real token count once a tokenizer exists for the latest Anthropic models
+    if len(website_content) > 1_000_000:
+        logger.debug("Website text content too large, trying to extract article content only")
+        article_texts = [article.get_text(" | ", strip=True) for article in soup.find_all('article')]
+        website_content = " | ".join(article_texts)
+
     if not website_content:
         raise Exception("No text content found on website")