Skip to content

Commit

Permalink
Merge branch 'progress-bar' of https://github.com/varrialeciro/cheshi…
Browse files Browse the repository at this point in the history
…re-cat into varrialeciro-progress-bar
  • Loading branch information
pieroit committed Aug 11, 2023
2 parents 3ab165b + 94e45b4 commit 0417ed9
Showing 1 changed file with 47 additions and 9 deletions.
56 changes: 47 additions & 9 deletions core/cat/rabbit_hole.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
import time
import math
import json
import mimetypes
from typing import List, Union
Expand Down Expand Up @@ -125,6 +126,8 @@ def ingest_file(
before_rabbithole_stores_documents
"""

start = time.time()

# split file into a list of docs
docs = self.file_to_docs(
file=file, chunk_size=chunk_size, chunk_overlap=chunk_overlap
Expand All @@ -135,6 +138,8 @@ def ingest_file(
filename = file
else:
filename = file.filename

start = time.time()
self.store_documents(docs=docs, source=filename)

def file_to_docs(
Expand Down Expand Up @@ -209,21 +214,46 @@ def file_to_docs(
else:
raise ValueError(f"{type(file)} is not a valid type.")

start = time.time()
# Load the bytes in the Blob schema
blob = Blob(data=file_bytes,
mimetype=content_type,
source=source).from_data(data=file_bytes,
mime_type=content_type)

# Parser based on the mime type
parser = MimeTypeBasedParser(handlers=self.file_handlers)

# Parse the text
self.send_rabbit_thought("I'm parsing the content. Big content could require some minutes...")
start = time.time()
text = parser.parse(blob)

self.send_rabbit_thought(f"Parsing completed. Now let's go with reading process...")
start = time.time()
docs = self.split_text(text, chunk_size, chunk_overlap)
return docs

def send_rabbit_thought(self, thought):
"""Append a message to the notification list.
This method receive a string and create the message to append to the list of notifications.
Parameters
----------
thought : str
Text of the message to append to the notification list.
"""

self.cat.web_socket_notifications.append(
{
"error": False,
"type": "notification",
"content": thought,
"why": {},
}
)


def store_documents(self, docs: List[Document], source: str) -> None:
"""Add documents to the Cat's declarative memory.
Expand Down Expand Up @@ -254,8 +284,22 @@ def store_documents(self, docs: List[Document], source: str) -> None:
"before_rabbithole_stores_documents", docs
)

# parameters for storing progress percentage
perc_100 = len(docs)
perc_resolution = 10
perc_step = math.floor((perc_100 * perc_resolution) / 100)

perc_read = 0

# classic embed
for d, doc in enumerate(docs):
# every percStep send a notification in order to monitor the progress
# N.B. file with a len < resolution does not show any percentage
# the storing should be immediate, and it's correct show only the final msg
if ((perc_step > 0) and ((d+1) % perc_step) == 0):
perc_read += perc_resolution
self.send_rabbit_thought(f"Read {perc_read}% of {source}")

doc.metadata["source"] = source
doc.metadata["when"] = time.time()
doc = self.cat.mad_hatter.execute_hook(
Expand All @@ -279,14 +323,8 @@ def store_documents(self, docs: List[Document], source: str) -> None:
# notify client
finished_reading_message = f"Finished reading {source}, " \
f"I made {len(docs)} thoughts on it."
self.cat.web_socket_notifications.append(
{
"error": False,
"type": "notification",
"content": finished_reading_message,
"why": {},
}
)

self.send_rabbit_thought(finished_reading_message)

print(f"\n\nDone uploading {source}")

Expand Down

0 comments on commit 0417ed9

Please sign in to comment.