Validate content type (#2404)

nuclia · Aug 21, 2024 · 69404d7 · 69404d7
1 parent 0b327ef
commit 69404d7
Show file tree

Hide file tree

Showing 12 changed files with 388 additions and 68 deletions.
diff --git a/nucliadb/src/nucliadb/ingest/orm/resource.py b/nucliadb/src/nucliadb/ingest/orm/resource.py
@@ -36,8 +36,9 @@
 from nucliadb.ingest.fields.text import Text
 from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
 from nucliadb.ingest.orm.metrics import processor_observer
+from nucliadb_models import content_types
 from nucliadb_models.common import CloudLink
-from nucliadb_models.writer import GENERIC_MIME_TYPE
+from nucliadb_models.content_types import GENERIC_MIME_TYPE
 from nucliadb_protos import utils_pb2, writer_pb2
 from nucliadb_protos.resources_pb2 import AllFieldIDs as PBAllFieldIDs
 from nucliadb_protos.resources_pb2 import (
@@ -1231,8 +1232,17 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
     if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
         # Icon already set or detected
         return False
+
     if not mimetype:
         return False
+
+    if not content_types.valid(mimetype):
+        logger.warning(
+            "Invalid mimetype. Skipping icon update.",
+            extra={"mimetype": mimetype, "rid": basic.uuid, "slug": basic.slug},
+        )
+        return False
+
     basic.icon = mimetype
     return True
 

diff --git a/nucliadb/src/nucliadb/writer/api/v1/upload.py b/nucliadb/src/nucliadb/writer/api/v1/upload.py
@@ -18,7 +18,6 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 #
 import base64
-import mimetypes
 import pickle
 import uuid
 from datetime import datetime
@@ -61,6 +60,7 @@
 from nucliadb.writer.tus.storage import FileStorageManager
 from nucliadb.writer.tus.utils import parse_tus_metadata
 from nucliadb.writer.utilities import get_processing
+from nucliadb_models import content_types
 from nucliadb_models.resource import NucliaDBRoles
 from nucliadb_models.utils import FieldIdString
 from nucliadb_models.writer import CreateResourcePayload, ResourceFileUploaded
@@ -251,8 +251,15 @@ async def _tus_post(
     request_content_type = None
     if item is None:
         request_content_type = request.headers.get("content-type")
-    if not request_content_type:
-        request_content_type = guess_content_type(metadata["filename"])
+    if request_content_type is None:
+        request_content_type = content_types.guess(metadata["filename"]) or "application/octet-stream"
+
+    if request_content_type is not None and not content_types.valid(request_content_type):
+        raise HTTPException(
+            status_code=415,
+            detail=f"Unsupported content type: {request_content_type}",
+        )
+
     metadata.setdefault("content_type", request_content_type)
 
     metadata["implies_resource_creation"] = implies_resource_creation
@@ -530,10 +537,18 @@ async def _tus_patch(
             if isinstance(item_payload, str):
                 item_payload = item_payload.encode()
             creation_payload = pickle.loads(base64.b64decode(item_payload))
+
+        content_type = dm.get("metadata", {}).get("content_type")
+        if content_type is not None and not content_types.valid(content_type):
+            return HTTPClientError(
+                status_code=415,
+                detail=f"Unsupported content type: {content_type}",
+            )
+
         try:
             seqid = await store_file_on_nuclia_db(
                 size=dm.get("size"),
-                content_type=dm.get("metadata", {}).get("content_type"),
+                content_type=content_type,
                 override_resource_title=dm.get("metadata", {}).get("implies_resource_creation", False),
                 filename=dm.get("metadata", {}).get("filename"),
                 password=dm.get("metadata", {}).get("password"),
@@ -702,8 +717,14 @@ async def _upload(
     # - content-type set by the user in the upload request header takes precedence.
     # - if not set, we will try to guess it from the filename and default to a generic binary content type otherwise
     content_type = request.headers.get("content-type")
-    if not content_type:
-        content_type = guess_content_type(filename)
+    if content_type is None:
+        content_type = content_types.guess(filename) or "application/octet-stream"
+
+    if not content_types.valid(content_type):
+        raise HTTPException(
+            status_code=415,
+            detail=f"Unsupported content type: {content_type}",
+        )
 
     metadata = {"content_type": content_type, "filename": filename}
 
@@ -814,7 +835,6 @@ async def store_file_on_nuclia_db(
     item: Optional[CreateResourcePayload] = None,
 ) -> Optional[int]:
     # File is on NucliaDB Storage at path
-
     partitioning = get_partitioning()
     processing = get_processing()
     storage = await get_storage(service_name=SERVICE_NAME)
@@ -920,9 +940,3 @@ def maybe_b64decode(some_string: str) -> str:
     except ValueError:
         # not b64encoded
         return some_string
-
-
-def guess_content_type(filename: str) -> str:
-    default = "application/octet-stream"
-    guessed, _ = mimetypes.guess_type(filename)
-    return guessed or default
diff --git a/nucliadb/src/nucliadb/writer/resource/basic.py b/nucliadb/src/nucliadb/writer/resource/basic.py
@@ -24,6 +24,7 @@
 from nucliadb.ingest.orm.utils import set_title
 from nucliadb.ingest.processing import PushPayload
 from nucliadb_models.common import FIELD_TYPES_MAP_REVERSE
+from nucliadb_models.content_types import GENERIC_MIME_TYPE
 from nucliadb_models.file import FileField
 from nucliadb_models.link import LinkField
 from nucliadb_models.metadata import (
@@ -34,7 +35,6 @@
 )
 from nucliadb_models.text import TEXT_FORMAT_TO_MIMETYPE, PushTextFormat, Text
 from nucliadb_models.writer import (
-    GENERIC_MIME_TYPE,
     ComingResourcePayload,
     CreateResourcePayload,
     UpdateResourcePayload,

diff --git a/nucliadb/src/nucliadb/writer/resource/field.py b/nucliadb/src/nucliadb/writer/resource/field.py
@@ -29,9 +29,9 @@
 from nucliadb.writer import SERVICE_NAME
 from nucliadb.writer.utilities import get_processing
 from nucliadb_models.common import FIELD_TYPES_MAP, FieldTypeName
+from nucliadb_models.content_types import GENERIC_MIME_TYPE
 from nucliadb_models.conversation import PushConversation
 from nucliadb_models.writer import (
-    GENERIC_MIME_TYPE,
     CreateResourcePayload,
     UpdateResourcePayload,
 )

diff --git a/nucliadb/tests/ingest/unit/orm/test_resource.py b/nucliadb/tests/ingest/unit/orm/test_resource.py
@@ -141,12 +141,16 @@ def test_get_text_field_mimetype(text_format, mimetype):
         (Basic(), "text/html", True),
         (Basic(icon=""), "text/html", True),
         (Basic(icon="application/octet-stream"), "text/html", True),
+        # An invalid mimetype must be rejected and leave the icon untouched
+        (Basic(), "invalid", False),
     ],
 )
 def test_maybe_update_basic_icon(basic, icon, updated):
     assert maybe_update_basic_icon(basic, icon) == updated
     if updated:
         assert basic.icon == icon
+    else:
+        assert basic.icon != icon
 
 
 class Transaction:

diff --git a/nucliadb/tests/standalone/integration/test_upload_download.py b/nucliadb/tests/standalone/integration/test_upload_download.py
@@ -65,9 +65,11 @@ async def test_file_tus_upload_and_download(
     knowledgebox_one,
 ):
     language = "ca"
-    filename = "image.jpg"
+    filename = "image.jpeg"
     md5 = "7af0916dba8b70e29d99e72941923529"
-    content_type = "image/jpg"
+    # "+aitable" is a custom content type suffix indicating that
+    # the file must be processed with the AI tables feature.
+    content_type = "image/jpeg+aitable"
 
     # Create a resource
     kb_path = f"/{KB_PREFIX}/{knowledgebox_one}"
@@ -99,7 +101,7 @@ async def test_file_tus_upload_and_download(
             "upload-defer-length": "1",
         },
     )
-    assert resp.status_code == 201
+    assert resp.status_code == 201, resp.json()
     # Get the URL to upload the file to
     url = resp.headers["location"]
 
@@ -220,3 +222,209 @@ async def test_tus_upload_handles_unknown_upload_ids(
     assert resp.status_code == 404
     error_detail = resp.json().get("detail")
     assert error_detail == "Resumable URI not found for upload_id: foobarid"
+
+
+@pytest.mark.asyncio
+async def test_content_type_validation(
+    local_storage_settings,
+    configure_redis_dm,
+    nucliadb_writer,
+    nucliadb_reader,
+    knowledgebox_one,
+):
+    language = "ca"
+    filename = "image.jpg"
+    md5 = "7af0916dba8b70e29d99e72941923529"
+
+    # Create a resource
+    kb_path = f"/{KB_PREFIX}/{knowledgebox_one}"
+    resp = await nucliadb_writer.post(
+        f"{kb_path}/{RESOURCES_PREFIX}",
+        json={
+            "slug": "resource1",
+            "title": "Resource 1",
+        },
+    )
+    assert resp.status_code == 201
+    resource = resp.json().get("uuid")
+
+    # Start TUS upload
+    url = f"{kb_path}/{RESOURCE_PREFIX}/{resource}/file/field1/{TUSUPLOAD}"
+    upload_metadata = ",".join(
+        [
+            f"filename {header_encode(filename)}",
+            f"language {header_encode(language)}",
+            f"md5 {header_encode(md5)}",
+        ]
+    )
+    resp = await nucliadb_writer.post(
+        url,
+        headers={
+            "tus-resumable": "1.0.0",
+            "upload-metadata": upload_metadata,
+            "content-type": "invalid-content-type",
+            "upload-defer-length": "1",
+        },
+    )
+    assert resp.status_code == 415
+    error_detail = resp.json().get("detail")
+    assert error_detail == "Unsupported content type: invalid-content-type"
+
+
+@pytest.mark.parametrize(
+    "content_type",
+    [
+        "application/epub+zip",
+        "application/font-woff",
+        "application/generic",
+        "application/java-archive",
+        "application/java-vm",
+        "application/json",
+        "application/mp4",
+        "application/msword",
+        "application/octet-stream",
+        "application/pdf",
+        "application/pdf+aitable",
+        "application/postscript",
+        "application/rls-services+xml",
+        "application/rtf",
+        "application/stf-link",
+        "application/toml",
+        "application/vnd.jgraph.mxfile",
+        "application/vnd.lotus-organizer",
+        "application/vnd.ms-excel.sheet.macroenabled.12",
+        "application/vnd.ms-excel",
+        "application/vnd.ms-excel+aitable",
+        "application/vnd.ms-outlook",
+        "application/vnd.ms-powerpoint",
+        "application/vnd.ms-project",
+        "application/vnd.ms-word.document.macroenabled.12",
+        "application/vnd.oasis.opendocument.presentation",
+        "application/vnd.oasis.opendocument.text",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation+aitable",
+        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet+aitable",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document+aitable",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+        "application/vnd.rar",
+        "application/x-mobipocket-ebook",
+        "application/x-ms-shortcut",
+        "application/x-msdownload",
+        "application/x-ndjson",
+        "application/x-openscad",
+        "application/x-sql",
+        "application/x-zip-compressed",
+        "application/xml",
+        "application/zip",
+        "application/zstd",
+        "audio/aac",
+        "audio/mp4",
+        "audio/mpeg",
+        "audio/vnd.dlna.adts",
+        "audio/wav",
+        "audio/x-m4a",
+        "image/avif",
+        "image/gif",
+        "image/heic",
+        "image/jpeg",
+        "image/jpeg+aitable",
+        "image/png",
+        "image/png+aitable",
+        "image/svg+xml",
+        "image/tiff",
+        "image/vnd.djvu",
+        "image/vnd.dwg",
+        "image/webp",
+        "model/stl",
+        "text/calendar",
+        "text/css",
+        "text/csv",
+        "text/csv+aitable",
+        "text/html",
+        "text/javascript",
+        "text/jsx",
+        "text/markdown",
+        "text/plain",
+        "text/rtf",
+        "text/rtf+aitable",
+        "text/x-java-source",
+        "text/x-log",
+        "text/x-python",
+        "text/xml",
+        "text/yaml",
+        "video/mp4",
+        "video/mp4+aitable",
+        "video/quicktime",
+        "video/webm",
+        "video/x-m4v",
+        "video/x-ms-wmv",
+        "video/YouTube",
+        "multipart/form-data",
+    ],
+)
+def test_valid_content_types(content_type):
+    from nucliadb_models import content_types
+
+    assert content_types.valid(content_type)
+
+
+@pytest.mark.parametrize(
+    "content_type",
+    [
+        "multipart/form-data;boundary=--------------------------472719318099714047986957",
+    ],
+)
+def test_invalid_content_types(content_type):
+    from nucliadb_models import content_types
+
+    assert not content_types.valid(content_type)
+
+
+@pytest.mark.parametrize(
+    "filename,content_type",
+    [
+        # Text files
+        ("foo.txt", "text/plain"),
+        ("foo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
+        ("foo.pdf", "application/pdf"),
+        ("foo.json", "application/json"),
+        # Spreadsheets
+        ("foo.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
+        ("foo.csv", "text/csv"),
+        # Presentations
+        ("foo.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
+        # Images
+        ("image.jpg", "image/jpeg"),
+        ("image.jpeg", "image/jpeg"),
+        ("image.png", "image/png"),
+        ("image.tiff", "image/tiff"),
+        ("image.gif", "image/gif"),
+        # Videos
+        ("video.mp4", "video/mp4"),
+        ("video.webm", "video/webm"),
+        ("video.avi", "video/x-msvideo"),
+        ("video.mpeg", "video/mpeg"),
+        # Audio
+        ("audio.mp3", "audio/mpeg"),
+        ("audio.wav", "audio/x-wav"),
+        # Web data
+        ("data.html", "text/html"),
+        ("data.xml", "application/xml"),
+        # Archive files
+        ("archive.zip", "application/zip"),
+        ("archive.rar", "application/x-rar-compressed"),
+        ("archive.tar", "application/x-tar"),
+        ("archive.tar.gz", "application/x-tar"),
+        # Filenames without a recognizable extension: nothing can be guessed
+        ("foobar", None),
+        ("someuuidwithoutextension", None),
+        ("", None),
+    ],
+)
+def test_guess_content_type(filename, content_type):
+    from nucliadb_models import content_types
+
+    assert content_types.guess(filename) == content_type