Skip to content

Commit

Permalink
should chunk param
Browse files: browse the repository at this point in the history
  • Loading branch information
esteininger committed Mar 19, 2024
1 parent 9db1e69 commit 9e52ca3
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 18 deletions.
25 changes: 16 additions & 9 deletions src/parsers/files/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from .text.service import TextService

from _exceptions import InternalServerError, NotFoundError, BadRequestError
from _utils import create_json_response
from _utils import create_success_response

files = {
"text": ["pdf", "docx", "txt", "md", "html", "xml"],
Expand Down Expand Up @@ -43,7 +43,9 @@ async def download_into_memory(self):
)

except Exception as e:
raise InternalServerError(error={"message": f"Error downloading file: {e}"})
raise BadRequestError(
error={"message": f"Error downloading file: {response.status_code}"}
)

def detect_filetype(self, contents):
try:
Expand All @@ -57,29 +59,34 @@ def detect_filetype(self, contents):
# }
data = {
"label": res.output.ct_label,
"description": res.output.description,
"mime_type": res.output.mime_type,
"group": res.output.group,
}
return data
except Exception as e:
raise InternalServerError(
raise BadRequestError(
error={"message": "Error occurred while detecting filetype"}
)

async def parse_file(self):
async def parse_file(self, should_chunk=True):
# Download file into memory
contents, filename = await self.download_into_memory()
stream = BytesIO(contents)

# Detect file type
metadata = self.detect_filetype(stream.getvalue())
metadata["filename"] = filename
metadata["start_time"] = time.time() * 1000
metadata.update({"filename": filename})

text_service = TextService(stream, metadata)

if metadata["label"] == "pdf":
return await text_service.run_pdf()
start_time = time.time() * 1000
# Process file based on chunking preference and file type
if metadata["label"] in files["text"]:
text_output = await text_service.run(should_chunk)
else:
raise BadRequestError(error={"message": "File type not supported"})

# Calculate elapsed time
metadata["elapsed_taken"] = (time.time() * 1000) - start_time

return create_success_response({"text": text_output, "metadata": metadata})
23 changes: 16 additions & 7 deletions src/parsers/files/text/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from unstructured.cleaners.core import clean
from unstructured.chunking.basic import chunk_elements

from _utils import create_json_response
from _exceptions import InternalServerError


Expand All @@ -25,7 +24,7 @@ def _chunk(self, elements, chunk_size=500, overlap_percent=15):
overlap=overlap_subset,
)

async def run_pdf(self):
async def run(self, should_chunk=True):
try:
elements = partition_pdf(
file=self.file_stream,
Expand All @@ -35,12 +34,22 @@ async def run_pdf(self):
# hi_res_model_name="detectron2_onnx",
)
chunks = self._chunk(elements)
for c in chunks:
response_obj = c.to_dict()
response_obj["text"] = self._clean(response_obj["text"])
self.chunks.append(response_obj)

return create_json_response(True, 200, None, self.chunks)
# Process chunks based on should_chunk flag
processed_chunks = self.process_chunks(chunks, should_chunk)

return processed_chunks
except Exception as e:
error = {"message": str(e)}
raise InternalServerError(error)

def process_chunks(self, chunks, should_chunk):
    """Shape the partitioned chunks according to the chunking preference.

    When ``should_chunk`` is truthy, return a list holding the result of
    ``self.process_chunk`` for every chunk. Otherwise return one string:
    the cleaned ``"text"`` of each chunk, concatenated with no separator.
    """
    if not should_chunk:
        # Single-document mode: clean each chunk's text, then glue them together.
        cleaned_texts = []
        for piece in chunks:
            cleaned_texts.append(self._clean(piece.to_dict()["text"]))
        return "".join(cleaned_texts)

    # Chunked mode: one processed dict per chunk, in input order.
    return list(map(self.process_chunk, chunks))

def process_chunk(self, chunk):
    """Return ``chunk.to_dict()`` with its ``"text"`` entry passed through ``self._clean``."""
    as_dict = chunk.to_dict()
    raw_text = as_dict["text"]
    as_dict["text"] = self._clean(raw_text)
    return as_dict
9 changes: 7 additions & 2 deletions src/parsers/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
from package.model import PackageData
from package.service import PackageManager

from _exceptions import APIError


app = FastAPI()

Expand All @@ -31,9 +33,12 @@ class ApiResponse(BaseModel):


@app.post("/file")
async def process_file(file: FileData):
async def process_file(
file: FileData,
should_chunk: Optional[bool] = True,
):
file_handler = FileHandler(file.file_url)
return await file_handler.parse_file()
return await file_handler.parse_file(should_chunk)


@app.post("/website")
Expand Down

0 comments on commit 9e52ca3

Please sign in to comment.