update with included config

mindee · Apr 10, 2024 · f0412ec
1 parent 73e3faa
commit f0412ec
Show file tree

Hide file tree

Showing 13 changed files with 416 additions and 130 deletions.
diff --git a/api/README.md b/api/README.md
@@ -45,12 +45,22 @@ should yield
 ```json
 [
   {
-      "name": "invitation.png",
-      "boxes": [
-        [0.50390625, 0.712890625, 0.5185546875, 0.720703125],
-        [0.4716796875, 0.712890625, 0.48828125, 0.720703125]
+    "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+    "geometries": [
+      [
+        0.724609375,
+        0.1787109375,
+        0.7900390625,
+        0.2080078125
+      ],
+      [
+        0.6748046875,
+        0.1796875,
+        0.7314453125,
+        0.20703125
       ]
-  },
+    ]
+  }
 ]
 ```
 
@@ -73,9 +83,10 @@ should yield
 ```json
 [
   {
-      "name": "invitation.png",
-      "value": "invite"
-  },
+    "name": "117133599-c073fa00-ada4-11eb-831b-412de4d28341.jpeg",
+    "value": "invite",
+    "confidence": 1.0
+  }
 ]
 ```
 
@@ -98,17 +109,61 @@ should yield
 ```json
 [
   {
-      "name": "hello_world.jpg",
-      "items": [
+    "name": "117319856-fc35bf00-ae8b-11eb-9b51-ca5aba673466.jpg",
+    "orientation": {
+      "value": 0,
+      "confidence": null
+    },
+    "language": {
+      "value": null,
+      "confidence": null
+    },
+    "items": [
       {
-          "value": "Hello",
-          "box": [0.005859375, 0.003312938981562763, 0.0205078125, 0.0332854340430202]
-      },
-      {
-          "value": "world!",
-          "box": [0.005859375, 0.003312938981562763, 0.0205078125, 0.0332854340430202]
-      },
-      ],
+        "blocks": [
+          {
+            "geometry": [
+              0.7471996155154171,
+              0.1787109375,
+              0.9101580212741838,
+              0.2080078125
+            ],
+            "lines": [
+              {
+                "geometry": [
+                  0.7471996155154171,
+                  0.1787109375,
+                  0.9101580212741838,
+                  0.2080078125
+                ],
+                "words": [
+                  {
+                    "value": "Hello",
+                    "geometry": [
+                      0.7471996155154171,
+                      0.1796875,
+                      0.8272978149561669,
+                      0.20703125
+                    ],
+                    "confidence": 1.0
+                  },
+                  {
+                    "value": "world!",
+                    "geometry": [
+                      0.8176307908857315,
+                      0.1787109375,
+                      0.9101580212741838,
+                      0.2080078125
+                    ],
+                    "confidence": 1.0
+                  }
+                ]
+              }
+            ]
+          }
+        ]
+      }
+    ]
   }
 ]
 ```
diff --git a/api/app/routes/detection.py b/api/app/routes/detection.py
@@ -5,33 +5,31 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, HTTPException, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import DetectionOut
-from app.vision import det_predictor
+from app.schemas import DetectionIn, DetectionOut
+from app.utils import get_documents, resolve_geometry
+from app.vision import init_predictor
 from doctr.file_utils import CLASS_NAME
-from doctr.io import DocumentFile
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[DetectionOut], status_code=status.HTTP_200_OK, summary="Perform text detection")
-async def text_detection(files: List[UploadFile] = [File(...)]):
+async def text_detection(request: DetectionIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR text detection model to analyze the input image"""
-    boxes: List[DetectionOut] = []
-    for file in files:
-        mime_type = file.content_type
-        if mime_type in ["image/jpeg", "image/png"]:
-            content = DocumentFile.from_images([await file.read()])
-        elif mime_type == "application/pdf":
-            content = DocumentFile.from_pdf(await file.read())
-        else:
-            raise HTTPException(status_code=400, detail=f"Unsupported file format for detection endpoint: {mime_type}")
-
-        boxes.append(
-            DetectionOut(
-                name=file.filename or "", boxes=[box.tolist() for box in det_predictor(content)[0][CLASS_NAME][:, :-1]]
-            )
+    try:
+        predictor = init_predictor(request)
+        content, filenames = await get_documents(files)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    return [
+        DetectionOut(
+            name=filename,
+            geometries=[
+                geom[:-1].tolist() if len(geom) == 5 else resolve_geometry(geom.tolist()) for geom in doc[CLASS_NAME]
+            ],
         )
-
-    return boxes
+        for doc, filename in zip(predictor(content), filenames)
+    ]
diff --git a/api/app/routes/kie.py b/api/app/routes/kie.py
@@ -5,45 +5,47 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, HTTPException, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import KIEElement, KIEOut
-from app.vision import kie_predictor
-from doctr.io import DocumentFile
+from app.schemas import KIEElement, KIEIn, KIEOut
+from app.utils import get_documents, resolve_geometry
+from app.vision import init_predictor
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[KIEOut], status_code=status.HTTP_200_OK, summary="Perform KIE")
-async def perform_kie(files: List[UploadFile] = [File(...)]):
+async def perform_kie(request: KIEIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR KIE model to analyze the input image"""
-    results: List[KIEOut] = []
-    for file in files:
-        mime_type = file.content_type
-        if mime_type in ["image/jpeg", "image/png"]:
-            content = DocumentFile.from_images([await file.read()])
-        elif mime_type == "application/pdf":
-            content = DocumentFile.from_pdf(await file.read())
-        else:
-            raise HTTPException(status_code=400, detail=f"Unsupported file format for KIE endpoint: {mime_type}")
-
-        out = kie_predictor(content)
-
-        for page in out.pages:
-            results.append(
-                KIEOut(
-                    name=file.filename or "",
-                    predictions=[
-                        KIEElement(
-                            class_name=class_name,
-                            items=[
-                                dict(value=prediction.value, box=(*prediction.geometry[0], *prediction.geometry[1]))
-                                for prediction in page.predictions[class_name]
-                            ],
+    try:
+        predictor = init_predictor(request)
+        content, filenames = await get_documents(files)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    out = predictor(content)
+
+    results = [
+        KIEOut(
+            name=filenames[i],
+            orientation=page.orientation,
+            language=page.language,
+            predictions=[
+                KIEElement(
+                    class_name=class_name,
+                    items=[
+                        dict(
+                            value=prediction.value,
+                            geometry=resolve_geometry(prediction.geometry),
+                            confidence=round(prediction.confidence, 2),
                         )
-                        for class_name in page.predictions.keys()
+                        for prediction in page.predictions[class_name]
                     ],
                 )
-            )
+                for class_name in page.predictions.keys()
+            ],
+        )
+        for i, page in enumerate(out.pages)
+    ]
 
     return results
diff --git a/api/app/routes/ocr.py b/api/app/routes/ocr.py
@@ -5,40 +5,58 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, HTTPException, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import OCROut
-from app.vision import predictor
-from doctr.io import DocumentFile
+from app.schemas import OCRBlock, OCRIn, OCRLine, OCROut, OCRPage, OCRWord
+from app.utils import get_documents, resolve_geometry
+from app.vision import init_predictor
 
 router = APIRouter()
 
 
 @router.post("/", response_model=List[OCROut], status_code=status.HTTP_200_OK, summary="Perform OCR")
-async def perform_ocr(files: List[UploadFile] = [File(...)]):
+async def perform_ocr(request: OCRIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR OCR model to analyze the input image"""
-    results: List[OCROut] = []
-    for file in files:
-        mime_type = file.content_type
-        if mime_type in ["image/jpeg", "image/png"]:
-            content = DocumentFile.from_images([await file.read()])
-        elif mime_type == "application/pdf":
-            content = DocumentFile.from_pdf(await file.read())
-        else:
-            raise HTTPException(status_code=400, detail=f"Unsupported file format for OCR endpoint: {mime_type}")
-
-        out = predictor(content)
-        for page in out.pages:
-            results.append(
-                OCROut(
-                    name=file.filename or "",
-                    items=[
-                        dict(value=word.value, box=(*word.geometry[0], *word.geometry[1]))
+    try:
+        # collect the uploaded documents and their filenames, then initialise the predictor for this request
+        content, filenames = await get_documents(files)
+        predictor = init_predictor(request)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+
+    out = predictor(content)
+
+    results = [
+        OCROut(
+            name=filenames[i],
+            orientation=page.orientation,
+            language=page.language,
+            items=[
+                OCRPage(
+                    blocks=[
+                        OCRBlock(
+                            geometry=resolve_geometry(block.geometry),
+                            lines=[
+                                OCRLine(
+                                    geometry=resolve_geometry(line.geometry),
+                                    words=[
+                                        OCRWord(
+                                            value=word.value,
+                                            geometry=resolve_geometry(word.geometry),
+                                            confidence=round(word.confidence, 2),
+                                        )
+                                        for word in line.words
+                                    ],
+                                )
+                                for line in block.lines
+                            ],
+                        )
                         for block in page.blocks
-                        for line in block.lines
-                        for word in line.words
-                    ],
+                    ]
                 )
-            )
+            ],
+        )
+        for i, page in enumerate(out.pages)
+    ]
 
     return results
diff --git a/api/app/routes/recognition.py b/api/app/routes/recognition.py
@@ -5,30 +5,26 @@
 
 from typing import List
 
-from fastapi import APIRouter, File, HTTPException, UploadFile, status
+from fastapi import APIRouter, Depends, File, HTTPException, UploadFile, status
 
-from app.schemas import RecognitionOut
-from app.vision import reco_predictor
-from doctr.io import DocumentFile
+from app.schemas import RecognitionIn, RecognitionOut
+from app.utils import get_documents
+from app.vision import init_predictor
 
 router = APIRouter()
 
 
 @router.post(
     "/", response_model=List[RecognitionOut], status_code=status.HTTP_200_OK, summary="Perform text recognition"
 )
-async def text_recognition(files: List[UploadFile] = [File(...)]):
+async def text_recognition(request: RecognitionIn = Depends(), files: List[UploadFile] = [File(...)]):
     """Runs docTR text recognition model to analyze the input image"""
-    words: List[RecognitionOut] = []
-    for file in files:
-        mime_type = file.content_type
-        if mime_type in ["image/jpeg", "image/png"]:
-            content = DocumentFile.from_images([await file.read()])
-        else:
-            raise HTTPException(
-                status_code=400, detail=f"Unsupported file format for recognition endpoint: {mime_type}"
-            )
-
-        words.append(RecognitionOut(name=file.filename or "", value=reco_predictor(content)[0][0]))
-
-    return words
+    try:
+        predictor = init_predictor(request)
+        content, filenames = await get_documents(files)
+    except ValueError as e:
+        raise HTTPException(status_code=400, detail=str(e))
+    return [
+        RecognitionOut(name=filename, value=res[0], confidence=round(res[1], 2))
+        for res, filename in zip(predictor(content), filenames)
+    ]