Skip to content

Commit

Permalink
Validate content type (#2404)
Browse files Browse the repository at this point in the history
  • Loading branch information
lferran authored Aug 21, 2024
1 parent 0b327ef commit 69404d7
Show file tree
Hide file tree
Showing 12 changed files with 388 additions and 68 deletions.
12 changes: 11 additions & 1 deletion nucliadb/src/nucliadb/ingest/orm/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,9 @@
from nucliadb.ingest.fields.text import Text
from nucliadb.ingest.orm.brain import FilePagePositions, ResourceBrain
from nucliadb.ingest.orm.metrics import processor_observer
from nucliadb_models import content_types
from nucliadb_models.common import CloudLink
from nucliadb_models.writer import GENERIC_MIME_TYPE
from nucliadb_models.content_types import GENERIC_MIME_TYPE
from nucliadb_protos import utils_pb2, writer_pb2
from nucliadb_protos.resources_pb2 import AllFieldIDs as PBAllFieldIDs
from nucliadb_protos.resources_pb2 import (
Expand Down Expand Up @@ -1231,8 +1232,17 @@ def maybe_update_basic_icon(basic: PBBasic, mimetype: Optional[str]) -> bool:
if basic.icon not in (None, "", "application/octet-stream", GENERIC_MIME_TYPE):
# Icon already set or detected
return False

if not mimetype:
return False

if not content_types.valid(mimetype):
logger.warning(
"Invalid mimetype. Skipping icon update.",
extra={"mimetype": mimetype, "rid": basic.uuid, "slug": basic.slug},
)
return False

basic.icon = mimetype
return True

Expand Down
40 changes: 27 additions & 13 deletions nucliadb/src/nucliadb/writer/api/v1/upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
import base64
import mimetypes
import pickle
import uuid
from datetime import datetime
Expand Down Expand Up @@ -61,6 +60,7 @@
from nucliadb.writer.tus.storage import FileStorageManager
from nucliadb.writer.tus.utils import parse_tus_metadata
from nucliadb.writer.utilities import get_processing
from nucliadb_models import content_types
from nucliadb_models.resource import NucliaDBRoles
from nucliadb_models.utils import FieldIdString
from nucliadb_models.writer import CreateResourcePayload, ResourceFileUploaded
Expand Down Expand Up @@ -251,8 +251,15 @@ async def _tus_post(
request_content_type = None
if item is None:
request_content_type = request.headers.get("content-type")
if not request_content_type:
request_content_type = guess_content_type(metadata["filename"])
if request_content_type is None:
request_content_type = content_types.guess(metadata["filename"]) or "application/octet-stream"

if request_content_type is not None and not content_types.valid(request_content_type):
raise HTTPException(
status_code=415,
detail=f"Unsupported content type: {request_content_type}",
)

metadata.setdefault("content_type", request_content_type)

metadata["implies_resource_creation"] = implies_resource_creation
Expand Down Expand Up @@ -530,10 +537,18 @@ async def _tus_patch(
if isinstance(item_payload, str):
item_payload = item_payload.encode()
creation_payload = pickle.loads(base64.b64decode(item_payload))

content_type = dm.get("metadata", {}).get("content_type")
if content_type is not None and not content_types.valid(content_type):
return HTTPClientError(
status_code=415,
detail=f"Unsupported content type: {content_type}",
)

try:
seqid = await store_file_on_nuclia_db(
size=dm.get("size"),
content_type=dm.get("metadata", {}).get("content_type"),
content_type=content_type,
override_resource_title=dm.get("metadata", {}).get("implies_resource_creation", False),
filename=dm.get("metadata", {}).get("filename"),
password=dm.get("metadata", {}).get("password"),
Expand Down Expand Up @@ -702,8 +717,14 @@ async def _upload(
# - content-type set by the user in the upload request header takes precedence.
# - if not set, we will try to guess it from the filename and default to a generic binary content type otherwise
content_type = request.headers.get("content-type")
if not content_type:
content_type = guess_content_type(filename)
if content_type is None:
content_type = content_types.guess(filename) or "application/octet-stream"

if not content_types.valid(content_type):
raise HTTPException(
status_code=415,
detail=f"Unsupported content type: {content_type}",
)

metadata = {"content_type": content_type, "filename": filename}

Expand Down Expand Up @@ -814,7 +835,6 @@ async def store_file_on_nuclia_db(
item: Optional[CreateResourcePayload] = None,
) -> Optional[int]:
# File is on NucliaDB Storage at path

partitioning = get_partitioning()
processing = get_processing()
storage = await get_storage(service_name=SERVICE_NAME)
Expand Down Expand Up @@ -920,9 +940,3 @@ def maybe_b64decode(some_string: str) -> str:
except ValueError:
# not b64encoded
return some_string


def guess_content_type(filename: str) -> str:
    """Return the MIME type guessed from *filename*.

    The guess is delegated to ``mimetypes.guess_type``. When no type can be
    determined (for example, the name has no extension or an unknown one),
    the generic binary type ``"application/octet-stream"`` is returned.
    """
    default = "application/octet-stream"
    # guess_type returns a (type, encoding) pair; only the type is needed.
    guessed, _ = mimetypes.guess_type(filename)
    return guessed or default
2 changes: 1 addition & 1 deletion nucliadb/src/nucliadb/writer/resource/basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from nucliadb.ingest.orm.utils import set_title
from nucliadb.ingest.processing import PushPayload
from nucliadb_models.common import FIELD_TYPES_MAP_REVERSE
from nucliadb_models.content_types import GENERIC_MIME_TYPE
from nucliadb_models.file import FileField
from nucliadb_models.link import LinkField
from nucliadb_models.metadata import (
Expand All @@ -34,7 +35,6 @@
)
from nucliadb_models.text import TEXT_FORMAT_TO_MIMETYPE, PushTextFormat, Text
from nucliadb_models.writer import (
GENERIC_MIME_TYPE,
ComingResourcePayload,
CreateResourcePayload,
UpdateResourcePayload,
Expand Down
2 changes: 1 addition & 1 deletion nucliadb/src/nucliadb/writer/resource/field.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
from nucliadb.writer import SERVICE_NAME
from nucliadb.writer.utilities import get_processing
from nucliadb_models.common import FIELD_TYPES_MAP, FieldTypeName
from nucliadb_models.content_types import GENERIC_MIME_TYPE
from nucliadb_models.conversation import PushConversation
from nucliadb_models.writer import (
GENERIC_MIME_TYPE,
CreateResourcePayload,
UpdateResourcePayload,
)
Expand Down
4 changes: 4 additions & 0 deletions nucliadb/tests/ingest/unit/orm/test_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,16 @@ def test_get_text_field_mimetype(text_format, mimetype):
(Basic(), "text/html", True),
(Basic(icon=""), "text/html", True),
(Basic(icon="application/octet-stream"), "text/html", True),
# Invalid icon should not be updated
(Basic(), "invalid", False),
],
)
def test_maybe_update_basic_icon(basic, icon, updated):
    """Check maybe_update_basic_icon's return value and the resulting icon.

    ``updated`` is the expected return value for the given ``basic`` and
    candidate ``icon``.
    """
    assert maybe_update_basic_icon(basic, icon) == updated
    if updated:
        assert basic.icon == icon
    else:
        # When no update is reported, the candidate must not have been stored.
        assert basic.icon != icon


class Transaction:
Expand Down
214 changes: 211 additions & 3 deletions nucliadb/tests/standalone/integration/test_upload_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,11 @@ async def test_file_tus_upload_and_download(
knowledgebox_one,
):
language = "ca"
filename = "image.jpg"
filename = "image.jpeg"
md5 = "7af0916dba8b70e29d99e72941923529"
content_type = "image/jpg"
# aitable is a custom content type suffix to indicate
# that the file must be processed with the ai tables feature...
content_type = "image/jpeg+aitable"

# Create a resource
kb_path = f"/{KB_PREFIX}/{knowledgebox_one}"
Expand Down Expand Up @@ -99,7 +101,7 @@ async def test_file_tus_upload_and_download(
"upload-defer-length": "1",
},
)
assert resp.status_code == 201
assert resp.status_code == 201, resp.json()
# Get the URL to upload the file to
url = resp.headers["location"]

Expand Down Expand Up @@ -220,3 +222,209 @@ async def test_tus_upload_handles_unknown_upload_ids(
assert resp.status_code == 404
error_detail = resp.json().get("detail")
assert error_detail == "Resumable URI not found for upload_id: foobarid"


@pytest.mark.asyncio
async def test_content_type_validation(
    local_storage_settings,
    configure_redis_dm,
    nucliadb_writer,
    nucliadb_reader,
    knowledgebox_one,
):
    """Starting a TUS upload with an invalid content-type header returns 415.

    A resource is created first, then the TUS upload is started on one of its
    file fields with ``content-type: invalid-content-type``. The response must
    be a 415 whose detail names the rejected content type.
    """
    language = "ca"
    filename = "image.jpg"
    md5 = "7af0916dba8b70e29d99e72941923529"

    # Create a resource
    kb_path = f"/{KB_PREFIX}/{knowledgebox_one}"
    resp = await nucliadb_writer.post(
        f"{kb_path}/{RESOURCES_PREFIX}",
        json={
            "slug": "resource1",
            "title": "Resource 1",
        },
    )
    assert resp.status_code == 201
    resource = resp.json().get("uuid")

    # Start TUS upload
    url = f"{kb_path}/{RESOURCE_PREFIX}/{resource}/file/field1/{TUSUPLOAD}"
    upload_metadata = ",".join(
        [
            f"filename {header_encode(filename)}",
            f"language {header_encode(language)}",
            f"md5 {header_encode(md5)}",
        ]
    )
    resp = await nucliadb_writer.post(
        url,
        headers={
            "tus-resumable": "1.0.0",
            "upload-metadata": upload_metadata,
            # Deliberately not a valid media type.
            "content-type": "invalid-content-type",
            "upload-defer-length": "1",
        },
    )
    assert resp.status_code == 415
    error_detail = resp.json().get("detail")
    assert error_detail == "Unsupported content type: invalid-content-type"


@pytest.mark.parametrize(
    "content_type",
    [
        "application/epub+zip",
        "application/font-woff",
        "application/generic",
        "application/java-archive",
        "application/java-vm",
        "application/json",
        "application/mp4",
        "application/msword",
        "application/octet-stream",
        "application/pdf",
        "application/pdf+aitable",
        "application/postscript",
        "application/rls-services+xml",
        "application/rtf",
        "application/stf-link",
        "application/toml",
        "application/vnd.jgraph.mxfile",
        "application/vnd.lotus-organizer",
        "application/vnd.ms-excel.sheet.macroenabled.12",
        "application/vnd.ms-excel",
        "application/vnd.ms-excel+aitable",
        "application/vnd.ms-outlook",
        "application/vnd.ms-powerpoint",
        "application/vnd.ms-project",
        "application/vnd.ms-word.document.macroenabled.12",
        "application/vnd.oasis.opendocument.presentation",
        "application/vnd.oasis.opendocument.text",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation+aitable",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet+aitable",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document+aitable",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
        "application/vnd.rar",
        "application/x-mobipocket-ebook",
        "application/x-ms-shortcut",
        "application/x-msdownload",
        "application/x-ndjson",
        "application/x-openscad",
        "application/x-sql",
        "application/x-zip-compressed",
        "application/xml",
        "application/zip",
        "application/zstd",
        "audio/aac",
        "audio/mp4",
        "audio/mpeg",
        "audio/vnd.dlna.adts",
        "audio/wav",
        "audio/x-m4a",
        "image/avif",
        "image/gif",
        "image/heic",
        "image/jpeg",
        "image/jpeg+aitable",
        "image/png",
        "image/png+aitable",
        "image/svg+xml",
        "image/tiff",
        "image/vnd.djvu",
        "image/vnd.dwg",
        "image/webp",
        "model/stl",
        "text/calendar",
        "text/css",
        "text/csv",
        "text/csv+aitable",
        "text/html",
        "text/javascript",
        "text/jsx",
        "text/markdown",
        "text/plain",
        "text/rtf",
        "text/rtf+aitable",
        "text/x-java-source",
        "text/x-log",
        "text/x-python",
        "text/xml",
        "text/yaml",
        "video/mp4",
        "video/mp4+aitable",
        "video/quicktime",
        "video/webm",
        "video/x-m4v",
        "video/x-ms-wmv",
        "video/YouTube",
        "multipart/form-data",
    ],
)
def test_valid_content_types(content_type):
    """Every listed content type must be accepted by ``content_types.valid``."""
    from nucliadb_models import content_types

    assert content_types.valid(content_type)


@pytest.mark.parametrize(
    "content_type",
    [
        "multipart/form-data;boundary=--------------------------472719318099714047986957",
    ],
)
def test_invalid_content_types(content_type):
    """Every listed content type must be rejected by ``content_types.valid``.

    The single case here is a multipart type carrying a ``boundary`` parameter.
    """
    from nucliadb_models import content_types

    assert not content_types.valid(content_type)


@pytest.mark.parametrize(
    "filename,content_type",
    [
        # Text files
        ("foo.txt", "text/plain"),
        ("foo.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        ("foo.pdf", "application/pdf"),
        ("foo.json", "application/json"),
        # Spreadsheets
        ("foo.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        ("foo.csv", "text/csv"),
        # Presentations
        ("foo.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        # Images
        ("image.jpg", "image/jpeg"),
        ("image.jpeg", "image/jpeg"),
        ("image.png", "image/png"),
        ("image.tiff", "image/tiff"),
        ("image.gif", "image/gif"),
        # Videos
        ("video.mp4", "video/mp4"),
        ("video.webm", "video/webm"),
        ("video.avi", "video/x-msvideo"),
        ("video.mpeg", "video/mpeg"),
        # Audio
        ("audio.mp3", "audio/mpeg"),
        ("audio.wav", "audio/x-wav"),
        # Web data
        ("data.html", "text/html"),
        ("data.xml", "application/xml"),
        # Archive files
        ("archive.zip", "application/zip"),
        ("archive.rar", "application/x-rar-compressed"),
        ("archive.tar", "application/x-tar"),
        ("archive.tar.gz", "application/x-tar"),
        # Invalid content types
        ("foobar", None),
        ("someuuidwithoutextension", None),
        ("", None),
    ],
)
def test_guess_content_type(filename, content_type):
    """``content_types.guess`` must return the expected type for each filename.

    Names without an extension, and the empty string, are expected to yield
    ``None``.
    """
    from nucliadb_models import content_types

    assert content_types.guess(filename) == content_type
Loading

0 comments on commit 69404d7

Please sign in to comment.