Skip to content

Commit

Permalink
fix: brat parser now allows spaces and digits in labels
Browse files: browse the repository at this point in the history
  • Loading branch information
percevalw committed Jan 10, 2024
1 parent 634af8d commit db532d4
Show file tree
Hide file tree
Showing 6 changed files with 68 additions and 53 deletions.
1 change: 1 addition & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

- Measurements now correctly match "0.X", "0.XX", ... numbers
- Typo in "celsius" measurement unit
- Spaces and digits are now supported in BRAT entity labels and attribute names

## v0.10.2

Expand Down
4 changes: 3 additions & 1 deletion edsnlp/connectors/brat.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def __init__(
directory: Union[str, Path],
n_jobs: int = 1,
attributes: Optional[AttributesMappingArg] = None,
bool_attributes: Optional[List[str]] = [],
span_groups: SpanSetterArg = ["ents", "*"],
keep_raw_attribute_values: bool = False,
):
Expand All @@ -57,6 +58,7 @@ def __init__(
self.attr_map = attributes
self.span_setter = validate_span_setter(span_groups)
self.keep_raw_attribute_values = keep_raw_attribute_values
self.bool_attributes = list(bool_attributes)

def brat2docs(self, nlp: PipelineProtocol, run_pipe=False) -> List[Doc]:
res = read_standoff(
Expand All @@ -66,7 +68,7 @@ def brat2docs(self, nlp: PipelineProtocol, run_pipe=False) -> List[Doc]:
span_attributes=self.attr_map,
span_setter=self.span_setter,
keep_raw_attribute_values=self.keep_raw_attribute_values,
bool_attributes=[],
bool_attributes=self.bool_attributes,
)
return list(nlp.pipe(res) if run_pipe else res)

Expand Down
22 changes: 13 additions & 9 deletions edsnlp/data/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def __init__(
):
self.tokenizer = tokenizer or (nlp.tokenizer if nlp is not None else None)
self.span_setter = span_setter
self.span_attributes = span_attributes
self.span_attributes = span_attributes # type: ignore
self.keep_raw_attribute_values = keep_raw_attribute_values
self.bool_attributes = bool_attributes

Expand All @@ -190,10 +190,12 @@ def __call__(self, obj):

spans = []

if self.span_attributes is not None:
for dst in self.span_attributes.values():
if not Span.has_extension(dst):
Span.set_extension(dst, default=None)
for dst in (
*(() if self.span_attributes is None else self.span_attributes.values()),
*self.bool_attributes,
):
if not Span.has_extension(dst):
Span.set_extension(dst, default=None)

for ent in obj.get("entities") or ():
for fragment in ent["fragments"]:
Expand Down Expand Up @@ -351,10 +353,12 @@ def __call__(self, obj):

spans = []

if self.span_attributes is not None:
for dst in self.span_attributes.values():
if not Span.has_extension(dst):
Span.set_extension(dst, default=None)
for dst in (
*(() if self.span_attributes is None else self.span_attributes.values()),
*self.bool_attributes,
):
if not Span.has_extension(dst):
Span.set_extension(dst, default=None)

for ent in obj.get("entities") or ():
ent = dict(ent)
Expand Down
15 changes: 5 additions & 10 deletions edsnlp/data/standoff.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,10 @@
from edsnlp.utils.collections import flatten_once
from edsnlp.utils.span_getters import SpanSetterArg

REGEX_ENTITY = re.compile(r"^(T\d+)\t(\S+)([^\t]+)\t(.*)$")
REGEX_ENTITY = re.compile(r"^(T\d+)\t(.*) (\d+ \d+(?:;\d+ \d+)*)\t(.*)$")
REGEX_NOTE = re.compile(r"^(#\d+)\tAnnotatorNotes ([^\t]+)\t(.*)$")
REGEX_RELATION = re.compile(r"^(R\d+)\t(\S+) Arg1:(\S+) Arg2:(\S+)")
REGEX_ATTRIBUTE = re.compile(r"^([AM]\d+)\t(.+)$")
REGEX_ATTRIBUTE = re.compile(r"^([AM]\d+)\t(.+?) ([TE]\d+)(?: (.+))?$")
REGEX_EVENT = re.compile(r"^(E\d+)\t(.+)$")
REGEX_EVENT_PART = re.compile(r"(\S+):([TE]\d+)")

Expand Down Expand Up @@ -131,19 +131,14 @@ def parse_standoff_file(path: str, merge_spaced_fragments: bool = True) -> Dict:
match = REGEX_ATTRIBUTE.match(line)
if match is None:
raise BratParsingError(ann_file, line)
parts = match.group(2).split(" ")
if len(parts) >= 3:
entity, entity_id, value = parts
elif len(parts) == 2:
entity, entity_id = parts
value = None
else:
_, attr_name, entity_id, value = match.groups()
if attr_name is None:
raise BratParsingError(ann_file, line)
(
entities[entity_id]
if entity_id.startswith("T")
else events[entity_id]
)["attributes"][entity] = value
)["attributes"][attr_name] = value
elif line.startswith("R"):
match = REGEX_RELATION.match(line)
if match is None:
Expand Down
77 changes: 44 additions & 33 deletions tests/data/test_standoff.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,12 @@ def brat2(tmpdir) -> BratConnector:
@pytest.fixture
def brat_importer():
brat_dir = Path(__file__).parent.parent.resolve() / "resources" / "brat_data"
return BratConnector(str(brat_dir))
return BratConnector(str(brat_dir), bool_attributes=["bool flag 0"])


@pytest.fixture
def brat_exporter(tmpdir):
return BratConnector(tmpdir, attributes=["etat", "assertion"])
return BratConnector(tmpdir, attributes=["etat", "assertion", "bool flag 0"])


def test_empty_brat(brat2: BratConnector, blank_nlp: PipelineProtocol):
Expand Down Expand Up @@ -128,7 +128,7 @@ def test_docs2brat(nlp, brat2):
def assert_doc_read(doc):
assert doc._.note_id == "subfolder/doc-1"

attrs = ("etat", "assertion")
attrs = ("etat", "assertion", "bool flag 0")
spans_and_attributes = {
"__ents__": sorted(
[
Expand All @@ -149,35 +149,36 @@ def assert_doc_read(doc):

assert spans_and_attributes == {
"__ents__": [
(6, 7, "douleurs", (None, None)),
(7, 11, "dans le bras droit", (None, None)),
(17, 21, "problème \nde locomotion", (None, "absent")),
(25, 26, "AVC", ("passé", "non-associé")),
(35, 36, "rhume", ("présent", "hypothétique")),
(45, 46, "rhume", ("présent", "hypothétique")),
(51, 52, "Douleurs", (None, None)),
(52, 56, "dans le bras droit", (None, None)),
(68, 69, "anomalie", (None, "absent")),
(6, 7, "douleurs", (None, None, False)),
(7, 11, "dans le bras droit", (None, None, False)),
(17, 21, "problème \nde locomotion", (None, "absent", True)),
(25, 26, "AVC", ("passé", "non-associé", False)),
(35, 36, "rhume", ("présent", "hypothétique", False)),
(45, 46, "rhume", ("présent", "hypothétique", False)),
(51, 52, "Douleurs", (None, None, False)),
(52, 56, "dans le bras droit", (None, None, False)),
(68, 69, "anomalie", (None, "absent", False)),
],
"anatomie": [
(9, 11, "bras droit", (None, None)),
(54, 56, "bras droit", (None, None)),
(9, 11, "bras droit", (None, None, False)),
(54, 56, "bras droit", (None, None, False)),
],
"localisation": [
(7, 11, "dans le bras droit", (None, None)),
(52, 56, "dans le bras droit", (None, None)),
(7, 11, "dans le bras droit", (None, None, False)),
(52, 56, "dans le bras droit", (None, None, False)),
],
"pathologie": [
(17, 21, "problème \nde locomotion", (None, "absent")),
(25, 26, "AVC", ("passé", "non-associé")),
(35, 36, "rhume", ("présent", "hypothétique")),
(45, 46, "rhume", ("présent", "hypothétique")),
(17, 21, "problème \nde locomotion", (None, "absent", True)),
(25, 26, "AVC", ("passé", "non-associé", False)),
(35, 36, "rhume", ("présent", "hypothétique", False)),
(45, 46, "rhume", ("présent", "hypothétique", False)),
],
"sosy": [
(6, 7, "douleurs", (None, None)),
(51, 52, "Douleurs", (None, None)),
(68, 69, "anomalie", (None, "absent")),
(6, 7, "douleurs", (None, None, False)),
(51, 52, "Douleurs", (None, None, False)),
(68, 69, "anomalie", (None, "absent", False)),
],
"test label 0": [(68, 69, "anomalie", (None, "absent", False))],
}


Expand All @@ -189,20 +190,23 @@ def assert_doc_write(exported_ann_text):
"T3 anatomie 47 57 bras droit\n"
"T4 pathologie 75 83;85 98 problème de locomotion\n"
"A2 assertion T4 absent\n"
"A3 bool flag 0 T4\n"
"T5 pathologie 114 117 AVC\n"
"A3 etat T5 passé\n"
"A4 assertion T5 non-associé\n"
"A4 etat T5 passé\n"
"A5 assertion T5 non-associé\n"
"T6 pathologie 159 164 rhume\n"
"A5 etat T6 présent\n"
"A6 assertion T6 hypothétique\n"
"A6 etat T6 présent\n"
"A7 assertion T6 hypothétique\n"
"T7 pathologie 291 296 rhume\n"
"A7 etat T7 présent\n"
"A8 assertion T7 hypothétique\n"
"A8 etat T7 présent\n"
"A9 assertion T7 hypothétique\n"
"T8 sosy 306 314 Douleurs\n"
"T9 localisation 315 333 dans le bras droit\n"
"T10 anatomie 323 333 bras droit\n"
"T11 sosy 378 386 anomalie\n"
"A9 assertion T11 absent\n"
"A10 assertion T11 absent\n"
"T12 test label 0 378 386 anomalie\n"
"A11 assertion T12 absent\n"
)


Expand All @@ -228,15 +232,22 @@ def test_brat(
def test_read_to_standoff(blank_nlp, tmpdir):
input_dir = Path(__file__).parent.parent.resolve() / "resources" / "brat_data"
output_dir = Path(tmpdir)
doc = list(edsnlp.data.read_standoff(input_dir))[0]
doc = list(edsnlp.data.read_standoff(input_dir, bool_attributes=["bool flag 0"]))[0]
assert_doc_read(doc)
doc.ents[0]._.etat = "test"

edsnlp.data.write_standoff(
[doc],
output_dir,
span_attributes=["etat", "assertion"],
span_getter=["ents", "sosy", "localisation", "anatomie", "pathologie"],
span_attributes=["etat", "assertion", "bool flag 0"],
span_getter=[
"ents",
"sosy",
"localisation",
"anatomie",
"pathologie",
"test label 0",
],
)

with open(output_dir / "subfolder" / "doc-1.ann") as f:
Expand Down
2 changes: 2 additions & 0 deletions tests/resources/brat_data/subfolder/doc-1.ann
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ T2 localisation 39 57 dans le bras droit
T3 anatomie 47 57 bras droit
T4 pathologie 75 83;85 98 problème de locomotion
A1 assertion T4 absent
A9 bool flag 0 T4
T5 pathologie 114 117 AVC
A2 etat T5 passé
A3 assertion T5 non-associé
Expand All @@ -22,3 +23,4 @@ R2 lieu Arg1:T1 Arg2:T2
A8 assertion T11 absent
E1 MyArg1:T3 MyArg2:T1
E2 MyArg1:T1 MyArg2:E1
T12 test label 0 378 386 anomalie

0 comments on commit db532d4

Please sign in to comment.