Skip to content

Commit

Permalink
FIX: handling of very long chat messages
Browse files Browse the repository at this point in the history
- cheshire_cat.py: the input message is split at MAX_TEXT_INPUT tokens;
  the excess is saved in declarative memory
- rabbit_hole.py: new method string_to_docs to convert strings
  into documents

relates to issue #334
  • Loading branch information
kodaline committed Nov 10, 2023
1 parent c928d7a commit 267a6a1
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 2 deletions.
14 changes: 14 additions & 0 deletions core/cat/looking_glass/cheshire_cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@


# Allowed websocket message types (presumably validated where send_ws_message is called — verify at call sites)
MSG_TYPES = Literal["notification", "chat", "error", "chat_token"]
# Threshold above which user input is cut and the excess stored in declarative memory.
# NOTE(review): despite "tokens" in the nearby comments/commit message, the check uses
# len(user_message_json["text"]) — this is a character count, not a token count.
MAX_TEXT_INPUT = 500

# main class
class CheshireCat():
Expand Down Expand Up @@ -432,6 +433,19 @@ def __call__(self, user_message_json):
# hook to modify/enrich user input
user_message_json = self.mad_hatter.execute_hook("before_cat_reads_message", user_message_json)

# split text after MAX_TEXT_INPUT tokens, on a whitespace, if any, and send it to declarative memory
if len(user_message_json["text"]) > MAX_TEXT_INPUT:
index = MAX_TEXT_INPUT
char = user_message_json["text"][index]
while not char.isspace() and index > 0:
index -= 1
char = user_message_json["text"][index]
if index <= 0:
index = MAX_TEXT_INPUT
user_message_json["text"], to_declarative_memory = user_message_json["text"][:index], user_message_json["text"][index:]
docs = self.rabbit_hole.string_to_docs(to_declarative_memory, content_type="text/plain", send_message=False)
self.rabbit_hole.store_documents(docs=docs, source="")

# store last message in working memory
user_working_memory["user_message_json"] = user_message_json

Expand Down
47 changes: 45 additions & 2 deletions core/cat/rabbit_hole.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,48 @@ def file_to_docs(
file_bytes = f.read()
else:
raise ValueError(f"{type(file)} is not a valid type.")
return self.string_to_docs(
file_bytes,
source,
content_type,
chunk_size,
chunk_overlap
)

def string_to_docs(
self,
file_bytes: str,
source: str = None,
content_type: str = "text/plain",
chunk_size: int = 400,
chunk_overlap: int = 100,
send_message: bool = True
) -> List[Document]:
"""Convert string to Langchain `Document`.
Takes a string, converts it to langchain `Document`.
Hence, loads it in memory and splits it in overlapped chunks of text.
Parameters
----------
file_bytes : str
The string to be converted.
source: str
Source filename.
content_type:
Mimetype of content.
chunk_size : int
Number of characters in each document chunk.
chunk_overlap : int
Number of overlapping characters between consecutive chunks.
send_message: bool
If true will send parsing message information to frontend.
Returns
-------
docs : List[Document]
List of Langchain `Document` of chunked text.
"""
# Load the bytes in the Blob schema
blob = Blob(data=file_bytes,
mimetype=content_type,
Expand All @@ -224,10 +265,12 @@ def file_to_docs(
parser = MimeTypeBasedParser(handlers=self.file_handlers)

# Parse the text
self.cat.send_ws_message("I'm parsing the content. Big content could require some minutes...")
if send_message:
self.cat.send_ws_message("I'm parsing the content. Big content could require some minutes...")
text = parser.parse(blob)

self.cat.send_ws_message(f"Parsing completed. Now let's go with reading process...")
if send_message:
self.cat.send_ws_message(f"Parsing completed. Now let's go with reading process...")
docs = self.split_text(text, chunk_size, chunk_overlap)
return docs

Expand Down

0 comments on commit 267a6a1

Please sign in to comment.