From caaa60327853b6c82acbd07cf4df15986be229ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Jolivet?= Date: Mon, 31 Jul 2023 19:03:58 +0200 Subject: [PATCH] fix: support spaces around newlines in brat export --- changelog.md | 5 +++++ edsnlp/connectors/brat.py | 13 ++++++------- tests/connectors/test_brat.py | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/changelog.md b/changelog.md index d1aa2a9f4..321608297 100644 --- a/changelog.md +++ b/changelog.md @@ -1,5 +1,10 @@ # Changelog +## Pending + +### Fixed +- `export_to_brat` issue with spans of entities on multiple lines. + ## v0.8.1 (2023-05-31) Fix release to allow installation from source diff --git a/edsnlp/connectors/brat.py b/edsnlp/connectors/brat.py index 794355074..dac2f4069 100644 --- a/edsnlp/connectors/brat.py +++ b/edsnlp/connectors/brat.py @@ -226,18 +226,17 @@ def export_to_brat(doc, txt_filename, overwrite_txt=False, overwrite_ann=False): ): idx = fragment["begin"] entity_text = doc["text"][fragment["begin"] : fragment["end"]] - for part in entity_text.split("\n"): - begin = idx - end = idx + len(part) - idx = end + 1 - if begin != end: - spans.append((begin, end)) + # eg: "mon entité \n est problématique" + for match in re.finditer( + r"\s*(.+?)(?:( *\n+)+ *|$)", entity_text, flags=re.DOTALL + ): + spans.append((idx + match.start(1), idx + match.end(1))) print( "{}\t{} {}\t{}".format( brat_entity_id, str(entity["label"]), ";".join(" ".join(map(str, span)) for span in spans), - entity_text.replace("\n", " "), + " ".join(doc["text"][begin:end] for begin, end in spans), ), file=f, ) diff --git a/tests/connectors/test_brat.py b/tests/connectors/test_brat.py index 738868c57..73dd09d36 100644 --- a/tests/connectors/test_brat.py +++ b/tests/connectors/test_brat.py @@ -193,7 +193,7 @@ def test_brat( A1 etat T1 test T2 localisation 39 57 dans le bras droit T3 anatomie 47 57 bras droit -T4 pathologie 75 84;85 98 problème de locomotion +T4 pathologie 75 83;85 98 problème de locomotion A2 assertion T4 absent T5 pathologie 114 117 AVC A3 etat T5 passé