fix: brat parser now allows spaces and digits in labels

aphp · Jan 10, 2024 · db532d4 · db532d4
1 parent 634af8d
commit db532d4
Show file tree

Hide file tree

Showing 6 changed files with 68 additions and 53 deletions.
diff --git a/changelog.md b/changelog.md
@@ -11,6 +11,7 @@
 
 - Measurements now correctly match "0.X", "0.XX", ... numbers
 - Typo in "celsius" measurement unit
+- Spaces and digits are now supported in BRAT entity labels and attribute names
 
 ## v0.10.2
 

diff --git a/edsnlp/connectors/brat.py b/edsnlp/connectors/brat.py
@@ -45,6 +45,7 @@ def __init__(
         directory: Union[str, Path],
         n_jobs: int = 1,
         attributes: Optional[AttributesMappingArg] = None,
+        bool_attributes: Optional[List[str]] = [],
         span_groups: SpanSetterArg = ["ents", "*"],
         keep_raw_attribute_values: bool = False,
     ):
@@ -57,6 +58,7 @@ def __init__(
         self.attr_map = attributes
         self.span_setter = validate_span_setter(span_groups)
         self.keep_raw_attribute_values = keep_raw_attribute_values
+        self.bool_attributes = list(bool_attributes)
 
     def brat2docs(self, nlp: PipelineProtocol, run_pipe=False) -> List[Doc]:
         res = read_standoff(
@@ -66,7 +68,7 @@ def brat2docs(self, nlp: PipelineProtocol, run_pipe=False) -> List[Doc]:
             span_attributes=self.attr_map,
             span_setter=self.span_setter,
             keep_raw_attribute_values=self.keep_raw_attribute_values,
-            bool_attributes=[],
+            bool_attributes=self.bool_attributes,
         )
         return list(nlp.pipe(res) if run_pipe else res)
 

diff --git a/edsnlp/data/converters.py b/edsnlp/data/converters.py
@@ -179,7 +179,7 @@ def __init__(
     ):
         self.tokenizer = tokenizer or (nlp.tokenizer if nlp is not None else None)
         self.span_setter = span_setter
-        self.span_attributes = span_attributes
+        self.span_attributes = span_attributes  # type: ignore
         self.keep_raw_attribute_values = keep_raw_attribute_values
         self.bool_attributes = bool_attributes
 
@@ -190,10 +190,12 @@ def __call__(self, obj):
 
         spans = []
 
-        if self.span_attributes is not None:
-            for dst in self.span_attributes.values():
-                if not Span.has_extension(dst):
-                    Span.set_extension(dst, default=None)
+        for dst in (
+            *(() if self.span_attributes is None else self.span_attributes.values()),
+            *self.bool_attributes,
+        ):
+            if not Span.has_extension(dst):
+                Span.set_extension(dst, default=None)
 
         for ent in obj.get("entities") or ():
             for fragment in ent["fragments"]:
@@ -351,10 +353,12 @@ def __call__(self, obj):
 
         spans = []
 
-        if self.span_attributes is not None:
-            for dst in self.span_attributes.values():
-                if not Span.has_extension(dst):
-                    Span.set_extension(dst, default=None)
+        for dst in (
+            *(() if self.span_attributes is None else self.span_attributes.values()),
+            *self.bool_attributes,
+        ):
+            if not Span.has_extension(dst):
+                Span.set_extension(dst, default=None)
 
         for ent in obj.get("entities") or ():
             ent = dict(ent)

diff --git a/edsnlp/data/standoff.py b/edsnlp/data/standoff.py
@@ -29,10 +29,10 @@
 from edsnlp.utils.collections import flatten_once
 from edsnlp.utils.span_getters import SpanSetterArg
 
-REGEX_ENTITY = re.compile(r"^(T\d+)\t(\S+)([^\t]+)\t(.*)$")
+REGEX_ENTITY = re.compile(r"^(T\d+)\t(.*) (\d+ \d+(?:;\d+ \d+)*)\t(.*)$")
 REGEX_NOTE = re.compile(r"^(#\d+)\tAnnotatorNotes ([^\t]+)\t(.*)$")
 REGEX_RELATION = re.compile(r"^(R\d+)\t(\S+) Arg1:(\S+) Arg2:(\S+)")
-REGEX_ATTRIBUTE = re.compile(r"^([AM]\d+)\t(.+)$")
+REGEX_ATTRIBUTE = re.compile(r"^([AM]\d+)\t(.+?) ([TE]\d+)(?: (.+))?$")
 REGEX_EVENT = re.compile(r"^(E\d+)\t(.+)$")
 REGEX_EVENT_PART = re.compile(r"(\S+):([TE]\d+)")
 
@@ -131,19 +131,14 @@ def parse_standoff_file(path: str, merge_spaced_fragments: bool = True) -> Dict:
                         match = REGEX_ATTRIBUTE.match(line)
                         if match is None:
                             raise BratParsingError(ann_file, line)
-                        parts = match.group(2).split(" ")
-                        if len(parts) >= 3:
-                            entity, entity_id, value = parts
-                        elif len(parts) == 2:
-                            entity, entity_id = parts
-                            value = None
-                        else:
+                        _, attr_name, entity_id, value = match.groups()
+                        if attr_name is None:
                             raise BratParsingError(ann_file, line)
                         (
                             entities[entity_id]
                             if entity_id.startswith("T")
                             else events[entity_id]
-                        )["attributes"][entity] = value
+                        )["attributes"][attr_name] = value
                     elif line.startswith("R"):
                         match = REGEX_RELATION.match(line)
                         if match is None:

diff --git a/tests/data/test_standoff.py b/tests/data/test_standoff.py
@@ -72,12 +72,12 @@ def brat2(tmpdir) -> BratConnector:
 @pytest.fixture
 def brat_importer():
     brat_dir = Path(__file__).parent.parent.resolve() / "resources" / "brat_data"
-    return BratConnector(str(brat_dir))
+    return BratConnector(str(brat_dir), bool_attributes=["bool flag 0"])
 
 
 @pytest.fixture
 def brat_exporter(tmpdir):
-    return BratConnector(tmpdir, attributes=["etat", "assertion"])
+    return BratConnector(tmpdir, attributes=["etat", "assertion", "bool flag 0"])
 
 
 def test_empty_brat(brat2: BratConnector, blank_nlp: PipelineProtocol):
@@ -128,7 +128,7 @@ def test_docs2brat(nlp, brat2):
 def assert_doc_read(doc):
     assert doc._.note_id == "subfolder/doc-1"
 
-    attrs = ("etat", "assertion")
+    attrs = ("etat", "assertion", "bool flag 0")
     spans_and_attributes = {
         "__ents__": sorted(
             [
@@ -149,35 +149,36 @@ def assert_doc_read(doc):
 
     assert spans_and_attributes == {
         "__ents__": [
-            (6, 7, "douleurs", (None, None)),
-            (7, 11, "dans le bras droit", (None, None)),
-            (17, 21, "problème \nde locomotion", (None, "absent")),
-            (25, 26, "AVC", ("passé", "non-associé")),
-            (35, 36, "rhume", ("présent", "hypothétique")),
-            (45, 46, "rhume", ("présent", "hypothétique")),
-            (51, 52, "Douleurs", (None, None)),
-            (52, 56, "dans le bras droit", (None, None)),
-            (68, 69, "anomalie", (None, "absent")),
+            (6, 7, "douleurs", (None, None, False)),
+            (7, 11, "dans le bras droit", (None, None, False)),
+            (17, 21, "problème \nde locomotion", (None, "absent", True)),
+            (25, 26, "AVC", ("passé", "non-associé", False)),
+            (35, 36, "rhume", ("présent", "hypothétique", False)),
+            (45, 46, "rhume", ("présent", "hypothétique", False)),
+            (51, 52, "Douleurs", (None, None, False)),
+            (52, 56, "dans le bras droit", (None, None, False)),
+            (68, 69, "anomalie", (None, "absent", False)),
         ],
         "anatomie": [
-            (9, 11, "bras droit", (None, None)),
-            (54, 56, "bras droit", (None, None)),
+            (9, 11, "bras droit", (None, None, False)),
+            (54, 56, "bras droit", (None, None, False)),
         ],
         "localisation": [
-            (7, 11, "dans le bras droit", (None, None)),
-            (52, 56, "dans le bras droit", (None, None)),
+            (7, 11, "dans le bras droit", (None, None, False)),
+            (52, 56, "dans le bras droit", (None, None, False)),
         ],
         "pathologie": [
-            (17, 21, "problème \nde locomotion", (None, "absent")),
-            (25, 26, "AVC", ("passé", "non-associé")),
-            (35, 36, "rhume", ("présent", "hypothétique")),
-            (45, 46, "rhume", ("présent", "hypothétique")),
+            (17, 21, "problème \nde locomotion", (None, "absent", True)),
+            (25, 26, "AVC", ("passé", "non-associé", False)),
+            (35, 36, "rhume", ("présent", "hypothétique", False)),
+            (45, 46, "rhume", ("présent", "hypothétique", False)),
         ],
         "sosy": [
-            (6, 7, "douleurs", (None, None)),
-            (51, 52, "Douleurs", (None, None)),
-            (68, 69, "anomalie", (None, "absent")),
+            (6, 7, "douleurs", (None, None, False)),
+            (51, 52, "Douleurs", (None, None, False)),
+            (68, 69, "anomalie", (None, "absent", False)),
         ],
+        "test label 0": [(68, 69, "anomalie", (None, "absent", False))],
     }
 
 
@@ -189,20 +190,23 @@ def assert_doc_write(exported_ann_text):
         "T3	anatomie 47 57	bras droit\n"
         "T4	pathologie 75 83;85 98	problème de locomotion\n"
         "A2	assertion T4 absent\n"
+        "A3	bool flag 0 T4\n"
         "T5	pathologie 114 117	AVC\n"
-        "A3	etat T5 passé\n"
-        "A4	assertion T5 non-associé\n"
+        "A4	etat T5 passé\n"
+        "A5	assertion T5 non-associé\n"
         "T6	pathologie 159 164	rhume\n"
-        "A5	etat T6 présent\n"
-        "A6	assertion T6 hypothétique\n"
+        "A6	etat T6 présent\n"
+        "A7	assertion T6 hypothétique\n"
         "T7	pathologie 291 296	rhume\n"
-        "A7	etat T7 présent\n"
-        "A8	assertion T7 hypothétique\n"
+        "A8	etat T7 présent\n"
+        "A9	assertion T7 hypothétique\n"
         "T8	sosy 306 314	Douleurs\n"
         "T9	localisation 315 333	dans le bras droit\n"
         "T10	anatomie 323 333	bras droit\n"
         "T11	sosy 378 386	anomalie\n"
-        "A9	assertion T11 absent\n"
+        "A10	assertion T11 absent\n"
+        "T12	test label 0 378 386	anomalie\n"
+        "A11	assertion T12 absent\n"
     )
 
 
@@ -228,15 +232,22 @@ def test_brat(
 def test_read_to_standoff(blank_nlp, tmpdir):
     input_dir = Path(__file__).parent.parent.resolve() / "resources" / "brat_data"
     output_dir = Path(tmpdir)
-    doc = list(edsnlp.data.read_standoff(input_dir))[0]
+    doc = list(edsnlp.data.read_standoff(input_dir, bool_attributes=["bool flag 0"]))[0]
     assert_doc_read(doc)
     doc.ents[0]._.etat = "test"
 
     edsnlp.data.write_standoff(
         [doc],
         output_dir,
-        span_attributes=["etat", "assertion"],
-        span_getter=["ents", "sosy", "localisation", "anatomie", "pathologie"],
+        span_attributes=["etat", "assertion", "bool flag 0"],
+        span_getter=[
+            "ents",
+            "sosy",
+            "localisation",
+            "anatomie",
+            "pathologie",
+            "test label 0",
+        ],
     )
 
     with open(output_dir / "subfolder" / "doc-1.ann") as f:

diff --git a/tests/resources/brat_data/subfolder/doc-1.ann b/tests/resources/brat_data/subfolder/doc-1.ann
@@ -4,6 +4,7 @@ T2	localisation 39 57	dans le bras droit
 T3	anatomie 47 57	bras droit
 T4	pathologie 75 83;85 98	problème de locomotion
 A1	assertion T4 absent
+A9	bool flag 0 T4
 T5	pathologie 114 117	AVC
 A2	etat T5 passé
 A3	assertion T5 non-associé
@@ -22,3 +23,4 @@ R2	lieu Arg1:T1 Arg2:T2
 A8	assertion T11 absent
 E1	MyArg1:T3 MyArg2:T1
 E2	MyArg1:T1 MyArg2:E1
+T12	test label 0 378 386	anomalie