From 50d375ea2f00400823b761b8d9aa0de81477bd8c Mon Sep 17 00:00:00 2001 From: Ramon Navarro Bosch Date: Mon, 11 Nov 2024 12:48:47 +0100 Subject: [PATCH] Show mime type on extracted metadata (#2601) * Show mime type on extracted metadata * Mime type * fix fmt * Fix * fix test * fix mypy --------- Co-authored-by: Ferran Llamas --- nucliadb/src/nucliadb/ingest/orm/brain.py | 14 ++++++++++++-- nucliadb/tests/ingest/fixtures.py | 11 +++++++++-- .../integration/orm/test_orm_metadata.py | 2 ++ .../integration/orm/test_orm_resource.py | 18 ++++++++++++------ .../src/nucliadb_models/extracted.py | 1 + nucliadb_models/src/nucliadb_models/labels.py | 1 + 6 files changed, 37 insertions(+), 10 deletions(-) diff --git a/nucliadb/src/nucliadb/ingest/orm/brain.py b/nucliadb/src/nucliadb/ingest/orm/brain.py index 0b43c574b4..88bc64e287 100644 --- a/nucliadb/src/nucliadb/ingest/orm/brain.py +++ b/nucliadb/src/nucliadb/ingest/orm/brain.py @@ -29,7 +29,11 @@ from nucliadb_models.metadata import ResourceProcessingStatus from nucliadb_protos import utils_pb2 from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph -from nucliadb_protos.noderesources_pb2 import ParagraphMetadata, Representation, ResourceID +from nucliadb_protos.noderesources_pb2 import ( + ParagraphMetadata, + Representation, + ResourceID, +) from nucliadb_protos.noderesources_pb2 import Position as TextPosition from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource from nucliadb_protos.resources_pb2 import ( @@ -483,6 +487,8 @@ def process_field_metadata( relation_node_document: RelationNode, user_canceled_labels: set[str], ): + if metadata.mime_type != "": + labels["mt"].add(metadata.mime_type) for classification in metadata.classifications: label = f"{classification.labelset}/{classification.label}" if label not in user_canceled_labels: @@ -563,7 +569,11 @@ def apply_field_labels( if classification.cancelled_by_user ) relation_node_resource = RelationNode(value=uuid, ntype=RelationNode.NodeType.RESOURCE) - labels: dict[str, set[str]] = {"l": set(), "e": set()} + labels: dict[str, set[str]] = { + "l": set(), # classification labels + "e": set(), # entities + "mt": set(), # mime type + } if metadata is not None: for meta in metadata.split_metadata.values(): self.process_field_metadata( diff --git a/nucliadb/tests/ingest/fixtures.py b/nucliadb/tests/ingest/fixtures.py index 494c4dc870..e5df20d4cb 100644 --- a/nucliadb/tests/ingest/fixtures.py +++ b/nucliadb/tests/ingest/fixtures.py @@ -393,7 +393,10 @@ def make_field_metadata(field_id): rpb.FieldEntity( text="document", label="ENTITY", - positions=[rpb.Position(start=0, end=5), rpb.Position(start=13, end=18)], + positions=[ + rpb.Position(start=0, end=5), + rpb.Position(start=13, end=18), + ], ), ] ) @@ -402,10 +405,14 @@ def make_field_metadata(field_id): rpb.FieldEntity( text="document", label="NOUN", - positions=[rpb.Position(start=0, end=5), rpb.Position(start=13, end=18)], + positions=[ + rpb.Position(start=0, end=5), + rpb.Position(start=13, end=18), + ], ), ] ) + ex1.metadata.metadata.mime_type = "text/html" return ex1 diff --git a/nucliadb/tests/ingest/integration/orm/test_orm_metadata.py b/nucliadb/tests/ingest/integration/orm/test_orm_metadata.py index ee542d6a8c..597cdfd6a0 100644 --- a/nucliadb/tests/ingest/integration/orm/test_orm_metadata.py +++ b/nucliadb/tests/ingest/integration/orm/test_orm_metadata.py @@ -57,6 +57,7 @@ async def test_create_resource_orm_metadata( p1.sentences.append(Sentence(start=11, end=20, key="test")) cl1 = Classification(labelset="labelset1", label="label1") p1.classifications.append(cl1) + ex1.metadata.metadata.mime_type = "text/plain" ex1.metadata.metadata.paragraphs.append(p1) ex1.metadata.metadata.classifications.append(cl1) ex1.metadata.metadata.last_index.FromDatetime(datetime.now()) @@ -79,6 +80,7 @@ async def test_create_resource_orm_metadata( ex2: Optional[FieldComputedMetadata] = await field_obj.get_field_metadata() assert ex2 is not None assert ex2.metadata.links[0] == ex1.metadata.metadata.links[0] + assert ex2.metadata.mime_type == ex1.metadata.metadata.mime_type @pytest.mark.asyncio diff --git a/nucliadb/tests/ingest/integration/orm/test_orm_resource.py b/nucliadb/tests/ingest/integration/orm/test_orm_resource.py index decb643fff..b2592d9af8 100644 --- a/nucliadb/tests/ingest/integration/orm/test_orm_resource.py +++ b/nucliadb/tests/ingest/integration/orm/test_orm_resource.py @@ -433,6 +433,10 @@ async def test_generate_index_message_contains_all_metadata( # Make sure there are no duplicates assert len(index_message.labels) == len(set(index_message.labels)) + # Check that field labels contain the right set of labels + for text_info in index_message.texts.values(): + assert "/mt/text/html" in text_info.labels + # Check texts are populated with field extracted text and field computed labels expected_fields = { "a/title", @@ -443,15 +447,17 @@ async def test_generate_index_message_contains_all_metadata( "t/text1", } fields_to_be_found = expected_fields.copy() - for field, text in index_message.texts.items(): + for field, text_info in index_message.texts.items(): assert field in fields_to_be_found fields_to_be_found.remove(field) - assert text.text == "MyText" - assert {"/l/labelset1/label1", "/e/ENTITY/document", "/e/NOUN/document"}.issubset( - set(text.labels) - ) + assert text_info.text == "MyText" + assert { + "/l/labelset1/label1", + "/e/ENTITY/document", + "/e/NOUN/document", + }.issubset(set(text_info.labels)) if field in ("u/link", "t/text1"): - assert "/e/Location/My home" in text.labels + assert "/e/Location/My home" in text_info.labels assert len(fields_to_be_found) == 0 diff --git a/nucliadb_models/src/nucliadb_models/extracted.py b/nucliadb_models/src/nucliadb_models/extracted.py index cfe0961526..666fc49c37 100644 --- a/nucliadb_models/src/nucliadb_models/extracted.py +++ b/nucliadb_models/src/nucliadb_models/extracted.py @@ -133,6 +133,7 @@ class FieldMetadata(BaseModel): summary: Optional[str] = None positions: Dict[str, Positions] # TODO: Remove once processor doesn't use this anymore relations: Optional[List[Relation]] = None + mime_type: Optional[str] = None class FieldComputedMetadata(BaseModel): diff --git a/nucliadb_models/src/nucliadb_models/labels.py b/nucliadb_models/src/nucliadb_models/labels.py index 9c9e19496c..880188133b 100644 --- a/nucliadb_models/src/nucliadb_models/labels.py +++ b/nucliadb_models/src/nucliadb_models/labels.py @@ -36,6 +36,7 @@ "p": set(), # origin metadata in the form of (key/value). Max key/value size is 255 "k": set(), # kind of text paragraph to be stored "q": set(), # reserved for internal use: h (hidden) + "mt": set(), # field mime type }