Skip to content

Commit

Permalink
Show mime type on extracted metadata (#2601)
Browse files — browse the repository at this point in the history
* Show mime type on extracted metadata

* Mime type

* fix fmt

* Fix

* fix test

* fix mypy

---------

Co-authored-by: Ferran Llamas <[email protected]>
  • Loading branch information
bloodbare and lferran authored Nov 11, 2024
1 parent a47e87a commit 50d375e
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 10 deletions.
14 changes: 12 additions & 2 deletions nucliadb/src/nucliadb/ingest/orm/brain.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,11 @@
from nucliadb_models.metadata import ResourceProcessingStatus
from nucliadb_protos import utils_pb2
from nucliadb_protos.noderesources_pb2 import IndexParagraph as BrainParagraph
from nucliadb_protos.noderesources_pb2 import ParagraphMetadata, Representation, ResourceID
from nucliadb_protos.noderesources_pb2 import (
ParagraphMetadata,
Representation,
ResourceID,
)
from nucliadb_protos.noderesources_pb2 import Position as TextPosition
from nucliadb_protos.noderesources_pb2 import Resource as PBBrainResource
from nucliadb_protos.resources_pb2 import (
Expand Down Expand Up @@ -483,6 +487,8 @@ def process_field_metadata(
relation_node_document: RelationNode,
user_canceled_labels: set[str],
):
if metadata.mime_type != "":
labels["mt"].add(metadata.mime_type)
for classification in metadata.classifications:
label = f"{classification.labelset}/{classification.label}"
if label not in user_canceled_labels:
Expand Down Expand Up @@ -563,7 +569,11 @@ def apply_field_labels(
if classification.cancelled_by_user
)
relation_node_resource = RelationNode(value=uuid, ntype=RelationNode.NodeType.RESOURCE)
labels: dict[str, set[str]] = {"l": set(), "e": set()}
labels: dict[str, set[str]] = {
"l": set(), # classification labels
"e": set(), # entities
"mt": set(), # mime type
}
if metadata is not None:
for meta in metadata.split_metadata.values():
self.process_field_metadata(
Expand Down
11 changes: 9 additions & 2 deletions nucliadb/tests/ingest/fixtures.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,7 +393,10 @@ def make_field_metadata(field_id):
rpb.FieldEntity(
text="document",
label="ENTITY",
positions=[rpb.Position(start=0, end=5), rpb.Position(start=13, end=18)],
positions=[
rpb.Position(start=0, end=5),
rpb.Position(start=13, end=18),
],
),
]
)
Expand All @@ -402,10 +405,14 @@ def make_field_metadata(field_id):
rpb.FieldEntity(
text="document",
label="NOUN",
positions=[rpb.Position(start=0, end=5), rpb.Position(start=13, end=18)],
positions=[
rpb.Position(start=0, end=5),
rpb.Position(start=13, end=18),
],
),
]
)
ex1.metadata.metadata.mime_type = "text/html"
return ex1


Expand Down
2 changes: 2 additions & 0 deletions nucliadb/tests/ingest/integration/orm/test_orm_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ async def test_create_resource_orm_metadata(
p1.sentences.append(Sentence(start=11, end=20, key="test"))
cl1 = Classification(labelset="labelset1", label="label1")
p1.classifications.append(cl1)
ex1.metadata.metadata.mime_type = "text/plain"
ex1.metadata.metadata.paragraphs.append(p1)
ex1.metadata.metadata.classifications.append(cl1)
ex1.metadata.metadata.last_index.FromDatetime(datetime.now())
Expand All @@ -79,6 +80,7 @@ async def test_create_resource_orm_metadata(
ex2: Optional[FieldComputedMetadata] = await field_obj.get_field_metadata()
assert ex2 is not None
assert ex2.metadata.links[0] == ex1.metadata.metadata.links[0]
assert ex2.metadata.mime_type == ex1.metadata.metadata.mime_type


@pytest.mark.asyncio
Expand Down
18 changes: 12 additions & 6 deletions nucliadb/tests/ingest/integration/orm/test_orm_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,10 @@ async def test_generate_index_message_contains_all_metadata(
# Make sure there are no duplicates
assert len(index_message.labels) == len(set(index_message.labels))

# Check that field labels contain the right set of labels
for text_info in index_message.texts.values():
assert "/mt/text/html" in text_info.labels

# Check texts are populated with field extracted text and field computed labels
expected_fields = {
"a/title",
Expand All @@ -443,15 +447,17 @@ async def test_generate_index_message_contains_all_metadata(
"t/text1",
}
fields_to_be_found = expected_fields.copy()
for field, text in index_message.texts.items():
for field, text_info in index_message.texts.items():
assert field in fields_to_be_found
fields_to_be_found.remove(field)
assert text.text == "MyText"
assert {"/l/labelset1/label1", "/e/ENTITY/document", "/e/NOUN/document"}.issubset(
set(text.labels)
)
assert text_info.text == "MyText"
assert {
"/l/labelset1/label1",
"/e/ENTITY/document",
"/e/NOUN/document",
}.issubset(set(text_info.labels))
if field in ("u/link", "t/text1"):
assert "/e/Location/My home" in text.labels
assert "/e/Location/My home" in text_info.labels

assert len(fields_to_be_found) == 0

Expand Down
1 change: 1 addition & 0 deletions nucliadb_models/src/nucliadb_models/extracted.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ class FieldMetadata(BaseModel):
summary: Optional[str] = None
positions: Dict[str, Positions] # TODO: Remove once processor doesn't use this anymore
relations: Optional[List[Relation]] = None
mime_type: Optional[str] = None


class FieldComputedMetadata(BaseModel):
Expand Down
1 change: 1 addition & 0 deletions nucliadb_models/src/nucliadb_models/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
"p": set(), # origin metadata in the form of (key/value). Max key/value size is 255
"k": set(), # kind of text paragraph to be stored
"q": set(), # reserved for internal use: h (hidden)
"mt": set(), # field mime type
}


Expand Down

0 comments on commit 50d375e

Please sign in to comment.