Skip to content

Commit

Permalink
FIX: handling of very long chat messages
Browse files Browse the repository at this point in the history
- cheshire_cat.py: the input message is split at MAX_TEXT_INPUT tokens;
  the excess is saved in declarative memory
- rabbit_hole.py: new method string_to_docs to convert strings
  into documents

relates to issue #334
  • Loading branch information
kodaline committed Nov 10, 2023
1 parent c928d7a commit 267a6a1
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 2 deletions.
14 changes: 14 additions & 0 deletions core/cat/looking_glass/cheshire_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@


# Allowed websocket message types (presumably validated where send_ws_message is called — verify at call sites)
MSG_TYPES = Literal["notification", "chat", "error", "chat_token"]
# Threshold above which user input is cut and the excess stored in declarative memory.
# NOTE(review): despite "tokens" in the nearby comments/commit message, the check uses
# len(user_message_json["text"]) — this is a character count, not a token count.
MAX_TEXT_INPUT = 500

# main class
class CheshireCat():
Expand Down Expand Up @@ -432,6 +433,19 @@ def __call__(self, user_message_json):
# hook to modify/enrich user input
user_message_json = self.mad_hatter.execute_hook("before_cat_reads_message", user_message_json)

# split text after MAX_TEXT_INPUT tokens, on a whitespace, if any, and send it to declarative memory
if len(user_message_json["text"]) > MAX_TEXT_INPUT:
index = MAX_TEXT_INPUT
char = user_message_json["text"][index]
while not char.isspace() and index > 0:
index -= 1
char = user_message_json["text"][index]
if index <= 0:
index = MAX_TEXT_INPUT
user_message_json["text"], to_declarative_memory = user_message_json["text"][:index], user_message_json["text"][index:]
docs = self.rabbit_hole.string_to_docs(to_declarative_memory, content_type="text/plain", send_message=False)
self.rabbit_hole.store_documents(docs=docs, source="")

# store last message in working memory
user_working_memory["user_message_json"] = user_message_json

Expand Down
47 changes: 45 additions & 2 deletions core/cat/rabbit_hole.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,48 @@ def file_to_docs(
file_bytes = f.read()
else:
raise ValueError(f"{type(file)} is not a valid type.")
return self.string_to_docs(
file_bytes,
source,
content_type,
chunk_size,
chunk_overlap
)

def string_to_docs(
self,
file_bytes: str,
source: str = None,
content_type: str = "text/plain",
chunk_size: int = 400,
chunk_overlap: int = 100,
send_message: bool = True
) -> List[Document]:
"""Convert string to Langchain `Document`.
Takes a string, converts it to langchain `Document`.
Hence, loads it in memory and splits it in overlapped chunks of text.
Parameters
----------
file_bytes : str
The string to be converted.
source: str
Source filename.
content_type:
Mimetype of content.
chunk_size : int
Number of characters in each document chunk.
chunk_overlap : int
Number of overlapping characters between consecutive chunks.
send_message: bool
If true will send parsing message information to frontend.
Returns
-------
docs : List[Document]
List of Langchain `Document` of chunked text.
"""
# Load the bytes in the Blob schema
blob = Blob(data=file_bytes,
mimetype=content_type,
Expand All @@ -224,10 +265,12 @@ def file_to_docs(
parser = MimeTypeBasedParser(handlers=self.file_handlers)

# Parse the text
self.cat.send_ws_message("I'm parsing the content. Big content could require some minutes...")
if send_message:
self.cat.send_ws_message("I'm parsing the content. Big content could require some minutes...")
text = parser.parse(blob)

self.cat.send_ws_message(f"Parsing completed. Now let's go with reading process...")
if send_message:
self.cat.send_ws_message(f"Parsing completed. Now let's go with reading process...")
docs = self.split_text(text, chunk_size, chunk_overlap)
return docs

Expand Down

0 comments on commit 267a6a1

Please sign in to comment.