Sentences split on newlines #177

Open · wants to merge 6 commits into base: master
9 changes: 9 additions & 0 deletions changelog.md
@@ -1,5 +1,14 @@
# Changelog

## Pending

### Added
- `split_on_newlines` and `split_on_bullets` parameters to the `sentences` pipeline.

### Fixed
- `eds.sentences` issue with punctuation followed by a digit.


## v0.7.4 (2022-12-12)

### Added
30 changes: 25 additions & 5 deletions docs/pipelines/core/sentences.md
@@ -2,7 +2,7 @@

The `eds.sentences` pipeline provides an alternative to spaCy's default `sentencizer`, aiming to overcome some of its limitations.

-Indeed, the `sentencizer` merely looks at period characters to detect the end of a sentence, a strategy that often fails in a clinical note setting. Our `sentences` component also classifies end-of-lines as sentence boundaries if the subsequent token begins with an uppercase character, leading to slightly better performance.
+Indeed, the `sentencizer` merely looks at period characters to detect the end of a sentence, a strategy that often fails in a clinical note setting. Our `sentences` component also classifies end-of-lines as sentence boundaries if the subsequent token begins with an uppercase character, leading to slightly better performance. This behaviour can be adjusted using the `split_on_newlines` parameter (see below).

Moreover, the `eds.sentences` pipeline can use the output of the `eds.normalizer` pipeline, and more specifically the end-of-line classification. This is activated by default.

@@ -61,10 +61,30 @@ Notice how EDS-NLP's implementation is more robust to ill-defined sentence endings

The pipeline can be configured using the following parameters:

-| Parameter      | Explanation                                                               | Default                           |
-| -------------- | ------------------------------------------------------------------------- | --------------------------------- |
-| `punct_chars`  | Punctuation patterns                                                      | `None` (use pre-defined patterns) |
-| `use_endlines` | Whether to use endlines prediction (see [documentation](./endlines.md))   | `True`                            |
+| Parameter           | Explanation                                                              | Default                           |
+| ------------------- | ------------------------------------------------------------------------ | --------------------------------- |
+| `punct_chars`       | Punctuation patterns                                                     | `None` (use pre-defined patterns) |
+| `use_endlines`      | Whether to use endlines prediction (see [documentation](./endlines.md))  | `True`                            |
+| `split_on_newlines` | Rule used to decide whether a newline (`\n`) splits a sentence           | `with_capitalized`                |

### The `split_on_newlines` parameter

=== "`with_capitalized` (Default)"

    The rule applied here is to consider a newline as a sentence split if the following token is capitalized,
    i.e. its first letter is uppercase and the remaining letters are lowercase.
    This rule should cover most cases, but might be problematic with long lists of fully uppercased strings (e.g. lists of drug brand names).

=== "`with_uppercase`"

    The rule applied here is to consider a newline as a sentence split if the following token starts with an uppercase letter,
    regardless of the case of the remaining letters.
    This rule fixes the issue with long fully uppercased texts, but might wrongly split sentences, e.g. around acronyms.


=== "`False`"

    Newlines alone never trigger a sentence split.
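Below is a minimal usage sketch (illustrative only: it also shows the `split_on_bullets` option introduced alongside this parameter in the code, which is not listed in the table above):

```python
import spacy

nlp = spacy.blank("fr")

# Treat a newline as a sentence boundary whenever the next token starts
# with an uppercase letter, and let bullet characters open new sentences.
nlp.add_pipe(
    "eds.sentences",
    config=dict(
        split_on_newlines="with_uppercase",
        split_on_bullets=True,
    ),
)

doc = nlp("Première ligne\nDEUXIÈME LIGNE EN MAJUSCULES\n- un élément de liste")
print([sent.text for sent in doc.sents])
```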

## Authors and citation

5 changes: 5 additions & 0 deletions edsnlp/pipelines/core/sentences/factory.py
@@ -10,6 +10,8 @@
    punct_chars=None,
    ignore_excluded=True,
    use_endlines=None,
+    split_on_newlines="with_capitalized",
+    split_on_bullets=False,
)


@@ -30,10 +32,13 @@ def create_component(
    punct_chars: Optional[List[str]],
    use_endlines: Optional[bool],
    ignore_excluded: bool,
+    split_on_newlines: Optional[str],
+    split_on_bullets: Optional[bool],
):
    return SentenceSegmenter(
        nlp.vocab,
        punct_chars=punct_chars,
        use_endlines=use_endlines,
        ignore_excluded=ignore_excluded,
+        split_on_newlines=split_on_newlines,
+        split_on_bullets=split_on_bullets,  # editor's fix: forward this option too, otherwise it is silently dropped
    )
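As a side note, a hypothetical sketch of how a misconfigured pipeline would fail fast (the `ValueError` is raised by the `SentenceSegmenter` constructor shown below in `sentences.pyx`):

```python
import spacy

nlp = spacy.blank("fr")

try:
    # "always" is not an accepted value for split_on_newlines
    nlp.add_pipe("eds.sentences", config=dict(split_on_newlines="always"))
except ValueError as err:
    print(err)  # lists the accepted options
```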
6 changes: 6 additions & 0 deletions edsnlp/pipelines/core/sentences/sentences.pxd
@@ -5,12 +5,18 @@ from spacy.tokens.doc cimport Doc
from spacy.typedefs cimport attr_t


+cdef enum split_options: WITH_CAPITALIZED, WITH_UPPERCASE, NONE
+
cdef class SentenceSegmenter(object):
    cdef bool ignore_excluded
+    cdef bool split_on_bullets
    cdef attr_t newline_hash
    cdef attr_t excluded_hash
    cdef attr_t endline_hash
    cdef set[attr_t] punct_chars_hash
    cdef set[attr_t] capitalized_shapes_hash
+    cdef set[attr_t] capitalized_chars_hash
+    cdef set[attr_t] bullets_chars_hash
+    cdef split_options split_on_newlines

    cdef void process(self, Doc doc) nogil
46 changes: 41 additions & 5 deletions edsnlp/pipelines/core/sentences/sentences.pyx
@@ -3,13 +3,21 @@ from typing import Iterable, List, Optional
from libcpp cimport bool

# from spacy.typedefs cimport attr_t
-from spacy.attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
+from spacy.attrs cimport (
+    IS_ALPHA,
+    IS_ASCII,
+    IS_DIGIT,
+    IS_LOWER,
+    IS_PUNCT,
+    IS_SPACE,
+    IS_UPPER,
+)
from spacy.lexeme cimport Lexeme
from spacy.tokens.doc cimport Doc
from spacy.tokens.token cimport TokenC
from spacy.vocab cimport Vocab

-from .terms import punctuation
+from .terms import punctuation, uppercase, bullets


cdef class SentenceSegmenter(object):
@@ -37,17 +45,38 @@ cdef class SentenceSegmenter(object):
        punct_chars: Optional[List[str]],
        use_endlines: bool,
        ignore_excluded: bool = True,
+        split_on_newlines: Optional[str] = "with_capitalized",
+        split_on_bullets: bool = False,
    ):

        if punct_chars is None:
            punct_chars = punctuation

        self.ignore_excluded = ignore_excluded or use_endlines
+        self.split_on_bullets = split_on_bullets
        self.newline_hash = vocab.strings["\n"]
        self.excluded_hash = vocab.strings["EXCLUDED"]
        self.endline_hash = vocab.strings["ENDLINE"]
        self.punct_chars_hash = {vocab.strings[c] for c in punct_chars}
        self.capitalized_shapes_hash = {vocab.strings[shape] for shape in ("Xx", "Xxx", "Xxxx", "Xxxxx")}
+        self.capitalized_chars_hash = {vocab.strings[letter] for letter in uppercase}
+        self.bullets_chars_hash = {vocab.strings[bullet] for bullet in bullets}
+
+        # map the user-facing option to the internal enum (False disables newline splits)
+        options = {
+            "with_capitalized": WITH_CAPITALIZED,
+            "with_uppercase": WITH_UPPERCASE,
+            False: NONE,
+        }
+        chosen = options.get(split_on_newlines, None)
+        if chosen is None:
+            raise ValueError(
+                "Incorrect value for 'split_on_newlines'. "
+                f"Provided: {split_on_newlines}\n"
+                f"Available: {list(options)}."
+            )
+        self.split_on_newlines = chosen

        if use_endlines:
            print("The use_endlines parameter is deprecated; use ignore_excluded instead")
@@ -90,16 +119,23 @@ cdef class SentenceSegmenter(object):
            is_newline = Lexeme.c_check_flag(token.lex, IS_SPACE) and token.lex.orth == self.newline_hash

            if seen_period or seen_newline:
-                if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT):
-                    continue
                if is_in_punct_chars or is_newline or Lexeme.c_check_flag(token.lex, IS_PUNCT):
                    continue
+                if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT):
+                    continue
+                seen_newline = False
+                seen_period = False
Comment on lines +124 to +127 (Collaborator, Author):
This is a correction. Before, a text like "Mesure du H.3 négatif" produced two sentences, "Mesure du H.3" and "négatif", since seen_newline and seen_period were not reset.
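A doctest-style sketch of the corrected behaviour (hypothetical example, assuming the `eds.sentences` factory from this PR with its default settings):

```python
import spacy

nlp = spacy.blank("fr")
nlp.add_pipe("eds.sentences")

# The digit after the period is skipped and the flags are reset, so
# "négatif" no longer opens a second sentence.
doc = nlp("Mesure du H.3 négatif")
assert len(list(doc.sents)) == 1
```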

                if seen_period:
                    doc.c[i].sent_start = 1
                    seen_newline = False
                    seen_period = False
                else:
-                    doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
+                    if self.split_on_newlines == WITH_UPPERCASE:
+                        doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1
+                    if self.split_on_newlines == WITH_CAPITALIZED:
+                        doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
+                    if self.split_on_bullets:
+                        doc.c[i].sent_start = 1 if self.bullets_chars_hash.const_find(token.lex.prefix) != self.bullets_chars_hash.const_end() else -1
                    seen_newline = False
                    seen_period = False
            elif is_in_punct_chars:
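For readers less familiar with Cython, a rough pure-Python equivalent of the newline-splitting decision above (an illustrative sketch only: names are hypothetical, the character sets are abbreviated, and the real implementation compares lexeme hashes):

```python
from typing import Union

UPPERCASE = set("ABCDEFGHIJKLMNOPQRSTUVWXYZÀÂÉÈÊËÎÏÔŒÙÛÜŸ")
BULLETS = set("-*•‣⁃◦○●")
CAPITALIZED_SHAPES = {"Xx", "Xxx", "Xxxx", "Xxxxx"}


def opens_sentence_after_newline(
    text: str,
    shape: str,
    split_on_newlines: Union[str, bool],
    split_on_bullets: bool,
) -> bool:
    """Decide whether the token following a newline starts a new sentence."""
    decision = False
    if split_on_newlines == "with_uppercase":
        # Only the first character matters, regardless of what follows.
        decision = text[:1] in UPPERCASE
    if split_on_newlines == "with_capitalized":
        # The whole token must look like a capitalized word (spaCy shape).
        decision = shape in CAPITALIZED_SHAPES
    if split_on_bullets:
        # Mirrors the PR: when enabled, this check overrides the two above.
        decision = text[:1] in BULLETS
    return decision


# e.g. a capitalized word after a newline under the default rule:
assert opens_sentence_after_newline("Enfin", "Xxxxx", "with_capitalized", False)
```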
63 changes: 63 additions & 0 deletions edsnlp/pipelines/core/sentences/terms.py
@@ -129,3 +129,66 @@
"。",
"。",
]

uppercase = [
"A",
"À",
"Â",
"B",
"C",
"D",
"E",
"É",
"È",
"Ê",
"Ë",
"F",
"G",
"H",
"I",
"Î",
"Ï",
"J",
"K",
"L",
"M",
"N",
"O",
"Ô",
"Œ",
"P",
"Q",
"R",
"S",
"T",
"U",
"Ù",
"Û",
"Ü",
"V",
"W",
"X",
"Y",
"Ÿ",
"Z",
]

bullets = [
"-",
"*",
"•",
"‣",
"⁃",
"⁌",
"⁍",
"∙",
"○",
"●",
"◘",
"◦",
"☙",
"❥",
"❧",
"⦾",
"⦿",
]
42 changes: 42 additions & 0 deletions tests/pipelines/core/test_sentences.py
@@ -50,3 +50,45 @@ def test_false_positives(blank_nlp):
    for fp in false_positives:
        doc = blank_nlp(fp)
        assert len(list(doc.sents)) == 1


@mark.parametrize(
    "split_options",
    [
        dict(
            split_on_newlines=False,
            n_sents=2,
        ),
        dict(
            split_on_newlines="with_capitalized",
            n_sents=3,
        ),
        dict(
            split_on_newlines="with_uppercase",
            n_sents=4,
        ),
        dict(
            split_on_newlines="with_uppercase",
            split_on_bullets=True,
            n_sents=5,
        ),
    ],
)
def test_newline_split_options(blank_nlp, split_options):

    text = "Une première phrase. "
    text += "Une deuxième\n"
    text += "Peut-être un autre\n"
    text += "ET encore une\n"
    text += "- Enfin une dernière avec une liste."

    segmenter = SentenceSegmenter(
        blank_nlp.vocab,
        punct_chars=terms.punctuation,
        use_endlines=False,
        split_on_newlines=split_options["split_on_newlines"],
        split_on_bullets=split_options.get("split_on_bullets", False),
    )

    doc = segmenter(blank_nlp(text))
    assert len(list(doc.sents)) == split_options["n_sents"]