From 09c820b01135e276f68fceeef339312792fe7712 Mon Sep 17 00:00:00 2001 From: Elehiggle Date: Sat, 25 May 2024 00:56:06 +0200 Subject: [PATCH] Improve website text extractor --- chatbot.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/chatbot.py b/chatbot.py index 8511405..505e23b 100644 --- a/chatbot.py +++ b/chatbot.py @@ -1045,6 +1045,12 @@ def request_link_text_content(link, prev_response): soup = BeautifulSoup(raw_content, "html.parser") website_content = soup.get_text(" | ", strip=True) + # TODO: replace this character-count check with a tokenizer once one exists for the latest Anthropic models + if len(website_content) > 1_000_000: + logger.debug("Website text content too large, trying to extract article content only") + article_texts = [article.get_text(" | ", strip=True) for article in soup.find_all('article')] + website_content = " | ".join(article_texts) + if not website_content: raise Exception("No text content found on website")