diff --git a/src/parsers/files/service.py b/src/parsers/files/service.py index eff6302..a82cd67 100644 --- a/src/parsers/files/service.py +++ b/src/parsers/files/service.py @@ -8,7 +8,7 @@ from .text.service import TextService from _exceptions import InternalServerError, NotFoundError, BadRequestError -from _utils import create_json_response +from _utils import create_success_response files = { "text": ["pdf", "docx", "txt", "md", "html", "xml"], @@ -43,7 +43,9 @@ async def download_into_memory(self): ) except Exception as e: - raise InternalServerError(error={"message": f"Error downloading file: {e}"}) + raise BadRequestError( + error={"message": f"Error downloading file: {response.status_code}"} + ) def detect_filetype(self, contents): try: @@ -57,29 +59,34 @@ def detect_filetype(self, contents): # } data = { "label": res.output.ct_label, - "description": res.output.description, "mime_type": res.output.mime_type, "group": res.output.group, } return data except Exception as e: - raise InternalServerError( + raise BadRequestError( error={"message": "Error occurred while detecting filetype"} ) - async def parse_file(self): + async def parse_file(self, should_chunk=True): # Download file into memory contents, filename = await self.download_into_memory() stream = BytesIO(contents) # Detect file type metadata = self.detect_filetype(stream.getvalue()) - metadata["filename"] = filename - metadata["start_time"] = time.time() * 1000 + metadata.update({"filename": filename}) text_service = TextService(stream, metadata) - if metadata["label"] == "pdf": - return await text_service.run_pdf() + start_time = time.time() * 1000 + # Process file based on chunking preference and file type + if metadata["label"] in files["text"]: + text_output = await text_service.run(should_chunk) else: raise BadRequestError(error={"message": "File type not supported"}) + + # Calculate elapsed time + metadata["elapsed_taken"] = (time.time() * 1000) - start_time + + return create_success_response({"text": text_output, "metadata": metadata}) diff --git a/src/parsers/files/text/service.py b/src/parsers/files/text/service.py index 468f307..4c73938 100644 --- a/src/parsers/files/text/service.py +++ b/src/parsers/files/text/service.py @@ -4,7 +4,6 @@ from unstructured.cleaners.core import clean from unstructured.chunking.basic import chunk_elements -from _utils import create_json_response from _exceptions import InternalServerError @@ -25,7 +24,7 @@ def _chunk(self, elements, chunk_size=500, overlap_percent=15): overlap=overlap_subset, ) - async def run_pdf(self): + async def run(self, should_chunk=True): try: elements = partition_pdf( file=self.file_stream, @@ -35,12 +34,22 @@ async def run_pdf(self): # hi_res_model_name="detectron2_onnx", ) chunks = self._chunk(elements) - for c in chunks: - response_obj = c.to_dict() - response_obj["text"] = self._clean(response_obj["text"]) - self.chunks.append(response_obj) - return create_json_response(True, 200, None, self.chunks) + # Process chunks based on should_chunk flag + processed_chunks = self.process_chunks(chunks, should_chunk) + + return processed_chunks except Exception as e: error = {"message": str(e)} raise InternalServerError(error) + + def process_chunks(self, chunks, should_chunk): + if should_chunk: + return [self.process_chunk(c) for c in chunks] + else: + return "".join(self._clean(c.to_dict()["text"]) for c in chunks) + + def process_chunk(self, chunk): + response_obj = chunk.to_dict() + response_obj["text"] = self._clean(response_obj["text"]) + return response_obj diff --git a/src/parsers/main.py b/src/parsers/main.py index 2b422d0..a8f66b5 100644 --- a/src/parsers/main.py +++ b/src/parsers/main.py @@ -13,6 +13,8 @@ from package.model import PackageData from package.service import PackageManager +from _exceptions import APIError + app = FastAPI() @@ -31,9 +33,12 @@ class ApiResponse(BaseModel): @app.post("/file") -async def process_file(file: FileData): +async def process_file( + file: FileData, + should_chunk: Optional[bool] = True, +): file_handler = FileHandler(file.file_url) - return await file_handler.parse_file() + return await file_handler.parse_file(should_chunk) @app.post("/website")