Add should_chunk param

nux-ai · Mar 19, 2024 · 9e52ca3 · 9e52ca3
1 parent 9db1e69
commit 9e52ca3
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 18 deletions.
diff --git a/src/parsers/files/service.py b/src/parsers/files/service.py
@@ -8,7 +8,7 @@
 from .text.service import TextService
 
 from _exceptions import InternalServerError, NotFoundError, BadRequestError
-from _utils import create_json_response
+from _utils import create_success_response
 
 files = {
     "text": ["pdf", "docx", "txt", "md", "html", "xml"],
@@ -43,7 +43,9 @@ async def download_into_memory(self):
                     )
 
         except Exception as e:
-            raise InternalServerError(error={"message": f"Error downloading file: {e}"})
+            raise BadRequestError(
+                error={"message": f"Error downloading file: {response.status_code}"}
+            )
 
     def detect_filetype(self, contents):
         try:
@@ -57,29 +59,34 @@ def detect_filetype(self, contents):
             # }
             data = {
                 "label": res.output.ct_label,
-                "description": res.output.description,
                 "mime_type": res.output.mime_type,
                 "group": res.output.group,
             }
             return data
         except Exception as e:
-            raise InternalServerError(
+            raise BadRequestError(
                 error={"message": "Error occurred while detecting filetype"}
             )
 
-    async def parse_file(self):
+    async def parse_file(self, should_chunk=True):
         # Download file into memory
         contents, filename = await self.download_into_memory()
         stream = BytesIO(contents)
 
         # Detect file type
         metadata = self.detect_filetype(stream.getvalue())
-        metadata["filename"] = filename
-        metadata["start_time"] = time.time() * 1000
+        metadata.update({"filename": filename})
 
         text_service = TextService(stream, metadata)
 
-        if metadata["label"] == "pdf":
-            return await text_service.run_pdf()
+        start_time = time.time() * 1000
+        # Process file based on chunking preference and file type
+        if metadata["label"] in files["text"]:
+            text_output = await text_service.run(should_chunk)
         else:
             raise BadRequestError(error={"message": "File type not supported"})
+
+        # Calculate elapsed time
+        metadata["elapsed_taken"] = (time.time() * 1000) - start_time
+
+        return create_success_response({"text": text_output, "metadata": metadata})
diff --git a/src/parsers/files/text/service.py b/src/parsers/files/text/service.py
@@ -4,7 +4,6 @@
 from unstructured.cleaners.core import clean
 from unstructured.chunking.basic import chunk_elements
 
-from _utils import create_json_response
 from _exceptions import InternalServerError
 
 
@@ -25,7 +24,7 @@ def _chunk(self, elements, chunk_size=500, overlap_percent=15):
             overlap=overlap_subset,
         )
 
-    async def run_pdf(self):
+    async def run(self, should_chunk=True):
         try:
             elements = partition_pdf(
                 file=self.file_stream,
@@ -35,12 +34,22 @@ async def run_pdf(self):
                 # hi_res_model_name="detectron2_onnx",
             )
             chunks = self._chunk(elements)
-            for c in chunks:
-                response_obj = c.to_dict()
-                response_obj["text"] = self._clean(response_obj["text"])
-                self.chunks.append(response_obj)
 
-            return create_json_response(True, 200, None, self.chunks)
+            # Process chunks based on should_chunk flag
+            processed_chunks = self.process_chunks(chunks, should_chunk)
+
+            return processed_chunks
         except Exception as e:
             error = {"message": str(e)}
             raise InternalServerError(error)
+
+    def process_chunks(self, chunks, should_chunk):
+        if should_chunk:
+            return [self.process_chunk(c) for c in chunks]
+        else:
+            return "".join(self._clean(c.to_dict()["text"]) for c in chunks)
+
+    def process_chunk(self, chunk):
+        response_obj = chunk.to_dict()
+        response_obj["text"] = self._clean(response_obj["text"])
+        return response_obj
diff --git a/src/parsers/main.py b/src/parsers/main.py
@@ -13,6 +13,8 @@
 from package.model import PackageData
 from package.service import PackageManager
 
+from _exceptions import APIError
+
 
 app = FastAPI()
 
@@ -31,9 +33,12 @@ class ApiResponse(BaseModel):
 
 
 @app.post("/file")
-async def process_file(file: FileData):
+async def process_file(
+    file: FileData,
+    should_chunk: Optional[bool] = True,
+):
     file_handler = FileHandler(file.file_url)
-    return await file_handler.parse_file()
+    return await file_handler.parse_file(should_chunk)
 
 
 @app.post("/website")