diff --git a/changelog.md b/changelog.md
index 59c5a3405..ec217840d 100644
--- a/changelog.md
+++ b/changelog.md
@@ -1,5 +1,14 @@
 # Changelog
 
+## Pending
+
+### Added
+- `split_on_newlines` parameter to the `sentences` pipeline.
+
+### Fixed
+- `eds.sentences` issue with punctuation followed by a digit.
+
+
 ## v0.7.4 (2022-12-12)
 
 ### Added
diff --git a/docs/pipelines/core/sentences.md b/docs/pipelines/core/sentences.md
index 9bd34a166..b78b0f4df 100644
--- a/docs/pipelines/core/sentences.md
+++ b/docs/pipelines/core/sentences.md
@@ -2,7 +2,7 @@
 
 The `eds.sentences` pipeline provides an alternative to spaCy's default `sentencizer`, aiming to overcome some of its limitations.
 
-Indeed, the `sentencizer` merely looks at period characters to detect the end of a sentence, a strategy that often fails in a clinical note settings. Our `sentences` component also classifies end-of-lines as sentence boundaries if the subsequent token begins with an uppercase character, leading to slightly better performances.
+Indeed, the `sentencizer` merely looks at period characters to detect the end of a sentence, a strategy that often fails in a clinical note setting. Our `sentences` component also classifies end-of-lines as sentence boundaries if the subsequent token begins with an uppercase character, leading to slightly better performance. This behaviour can be tuned with the `split_on_newlines` parameter (see below).
 
 Moreover, the `eds.sentences` pipeline can use the output of the `eds.normalizer` pipeline, and more specifically the end-of-line classification. This is activated by default.
 
@@ -61,10 +61,45 @@
 Notice how EDS-NLP's implementation is more robust to ill-defined sentence endings.
 
 The pipeline can be configured using the following parameters :
 
-| Parameter      | Explanation                                                              | Default                           |
-| -------------- | ------------------------------------------------------------------------ | --------------------------------- |
-| `punct_chars`  | Punctuation patterns                                                     | `None` (use pre-defined patterns) |
-| `use_endlines` | Whether to use endlines prediction (see [documentation](./endlines.md))  | `True`                            |
+| Parameter           | Explanation                                                              | Default                           |
+| ------------------- | ------------------------------------------------------------------------ | --------------------------------- |
+| `punct_chars`       | Punctuation patterns                                                     | `None` (use pre-defined patterns) |
+| `use_endlines`      | Whether to use endlines prediction (see [documentation](./endlines.md))  | `True`                            |
+| `split_on_newlines` | Rule used to decide whether a newline (`\n`) marks a sentence split      | `with_capitalized`                |
+| `split_on_bullets`  | Whether a newline followed by a bullet character marks a sentence split  | `False`                           |
+
+### The `split_on_newlines` parameter
+
+=== "`with_capitalized` (default)"
+
+    A newline is considered a sentence split if the following token is capitalized,
+    i.e. its first letter is uppercase and the other letters are lowercase.
+    This rule should cover most cases, but can be problematic with long runs of fully uppercased strings (e.g. lists of drugs' commercial names).
+
+=== "`with_uppercase`"
+
+    A newline is considered a sentence split if the following token starts with an uppercase letter,
+    regardless of the other letters.
+    This rule handles long fully uppercased passages, but may wrongly split sentences, e.g. around acronyms.
+
+=== "`False`"
+
+    Newlines alone never mark a sentence split.
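+
+The snippet below is a minimal usage sketch of these options. It assumes EDS-NLP is installed, which registers the `eds.sentences` factory with spaCy; the exact splits obtained also depend on the tokenizer:
+
+```python
+import spacy
+
+nlp = spacy.blank("fr")
+nlp.add_pipe(
+    "eds.sentences",
+    config=dict(split_on_newlines="with_uppercase", split_on_bullets=True),
+)
+
+doc = nlp("Prothèse de hanche\nDIABÈTE de type 2\n- HTA")
+print([sent.text for sent in doc.sents])
+```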
 
 ## Authors and citation
diff --git a/edsnlp/pipelines/core/sentences/factory.py b/edsnlp/pipelines/core/sentences/factory.py
index 5f7731a5f..355887cab 100644
--- a/edsnlp/pipelines/core/sentences/factory.py
+++ b/edsnlp/pipelines/core/sentences/factory.py
@@ -10,6 +10,8 @@
     punct_chars=None,
     ignore_excluded=True,
     use_endlines=None,
+    split_on_newlines="with_capitalized",
+    split_on_bullets=False,
 )
 
 
@@ -30,10 +32,14 @@ def create_component(
     punct_chars: Optional[List[str]],
     use_endlines: Optional[bool],
     ignore_excluded: bool,
+    split_on_newlines: Optional[str],
+    split_on_bullets: Optional[bool],
 ):
     return SentenceSegmenter(
         nlp.vocab,
         punct_chars=punct_chars,
         use_endlines=use_endlines,
         ignore_excluded=ignore_excluded,
+        split_on_newlines=split_on_newlines,
+        split_on_bullets=split_on_bullets,
     )
diff --git a/edsnlp/pipelines/core/sentences/sentences.pxd b/edsnlp/pipelines/core/sentences/sentences.pxd
index 531c55830..f215e9614 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pxd
+++ b/edsnlp/pipelines/core/sentences/sentences.pxd
@@ -5,12 +5,18 @@ from spacy.tokens.doc cimport Doc
 from spacy.typedefs cimport attr_t
 
 
+cdef enum split_options: WITH_CAPITALIZED, WITH_UPPERCASE, NONE
+
 cdef class SentenceSegmenter(object):
     cdef bool ignore_excluded
+    cdef bool split_on_bullets
     cdef attr_t newline_hash
     cdef attr_t excluded_hash
     cdef attr_t endline_hash
     cdef set[attr_t] punct_chars_hash
     cdef set[attr_t] capitalized_shapes_hash
+    cdef set[attr_t] capitalized_chars_hash
+    cdef set[attr_t] bullets_chars_hash
+    cdef split_options split_on_newlines
 
     cdef void process(self, Doc doc) nogil
diff --git a/edsnlp/pipelines/core/sentences/sentences.pyx b/edsnlp/pipelines/core/sentences/sentences.pyx
index 7123e72eb..fc4f63fbe 100644
--- a/edsnlp/pipelines/core/sentences/sentences.pyx
+++ b/edsnlp/pipelines/core/sentences/sentences.pyx
@@ -3,13 +3,21 @@ from typing import Iterable, List, Optional
 from libcpp cimport bool
 
 # from spacy.typedefs cimport attr_t
-from spacy.attrs cimport IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT, IS_SPACE
+from spacy.attrs cimport (
+    IS_ALPHA,
+    IS_ASCII,
+    IS_DIGIT,
+    IS_LOWER,
+    IS_PUNCT,
+    IS_SPACE,
+    IS_UPPER,
+)
 from spacy.lexeme cimport Lexeme
 from spacy.tokens.doc cimport Doc
 from spacy.tokens.token cimport TokenC
 from spacy.vocab cimport Vocab
 
-from .terms import punctuation
+from .terms import punctuation, uppercase, bullets
 
 
 cdef class SentenceSegmenter(object):
@@ -37,17 +45,39 @@
         punct_chars: Optional[List[str]],
         use_endlines: bool,
         ignore_excluded: bool = True,
+        split_on_newlines: Optional[str] = "with_capitalized",
+        split_on_bullets: bool = False,
     ):
         if punct_chars is None:
             punct_chars = punctuation
 
         self.ignore_excluded = ignore_excluded or use_endlines
+        self.split_on_bullets = split_on_bullets
         self.newline_hash = vocab.strings["\n"]
         self.excluded_hash = vocab.strings["EXCLUDED"]
        self.endline_hash = vocab.strings["ENDLINE"]
         self.punct_chars_hash = {vocab.strings[c] for c in punct_chars}
         self.capitalized_shapes_hash = {vocab.strings[shape] for shape in ("Xx", "Xxx", "Xxxx", "Xxxxx")}
+        self.capitalized_chars_hash = {vocab.strings[letter] for letter in uppercase}
+        self.bullets_chars_hash = {vocab.strings[bullet] for bullet in bullets}
+
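+        # Map the user-facing `split_on_newlines` value to the C-level
+        # `split_options` enum declared in sentences.pxd
+        # (WITH_CAPITALIZED = 0, WITH_UPPERCASE = 1, NONE = 2).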
+        options = {
+            "with_capitalized": 0,
+            "with_uppercase": 1,
+            False: 2,
+        }
+        chosen = options.get(split_on_newlines, None)
+        if chosen is None:
+            raise ValueError(
+                "Incorrect value for 'split_on_newlines'. "
+                f"Provided: {split_on_newlines}\n"
+                f"Available: {list(options.keys())}."
+            )
+        self.split_on_newlines = chosen
 
         if use_endlines:
             print("The use_endlines is deprecated and has been replaced by the ignore_excluded parameter")
 
@@ -90,16 +120,25 @@
                 is_newline = Lexeme.c_check_flag(token.lex, IS_SPACE) and token.lex.orth == self.newline_hash
 
                 if seen_period or seen_newline:
-                    if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT):
-                        continue
                     if is_in_punct_chars or is_newline or Lexeme.c_check_flag(token.lex, IS_PUNCT):
                         continue
+                    if seen_period and Lexeme.c_check_flag(token.lex, IS_DIGIT):
+                        seen_newline = False
+                        seen_period = False
+                        continue
                     if seen_period:
                         doc.c[i].sent_start = 1
                         seen_newline = False
                         seen_period = False
                     else:
-                        doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
+                        if self.split_on_newlines == WITH_UPPERCASE:
+                            doc.c[i].sent_start = 1 if self.capitalized_chars_hash.const_find(token.lex.prefix) != self.capitalized_chars_hash.const_end() else -1
+                        elif self.split_on_newlines == WITH_CAPITALIZED:
+                            doc.c[i].sent_start = 1 if self.capitalized_shapes_hash.const_find(token.lex.shape) != self.capitalized_shapes_hash.const_end() else -1
+                        else:
+                            doc.c[i].sent_start = -1
+                        if self.split_on_bullets and self.bullets_chars_hash.const_find(token.lex.prefix) != self.bullets_chars_hash.const_end():
+                            doc.c[i].sent_start = 1
                         seen_newline = False
                         seen_period = False
                 elif is_in_punct_chars:
diff --git a/edsnlp/pipelines/core/sentences/terms.py b/edsnlp/pipelines/core/sentences/terms.py
index 2e0a422c3..8350a7e32 100644
--- a/edsnlp/pipelines/core/sentences/terms.py
+++ b/edsnlp/pipelines/core/sentences/terms.py
@@ -129,3 +129,68 @@
     "。",
     "。",
 ]
+
+# Uppercase letters, including accented capitals used in French
+uppercase = [
+    "A",
+    "À",
+    "Â",
+    "B",
+    "C",
+    "D",
+    "E",
+    "É",
+    "È",
+    "Ê",
+    "Ë",
+    "F",
+    "G",
+    "H",
+    "I",
+    "Î",
+    "Ï",
+    "J",
+    "K",
+    "L",
+    "M",
+    "N",
+    "O",
+    "Ô",
+    "Œ",
+    "P",
+    "Q",
+    "R",
+    "S",
+    "T",
+    "U",
+    "Ù",
+    "Û",
+    "Ü",
+    "V",
+    "W",
+    "X",
+    "Y",
+    "Ÿ",
+    "Z",
+]
+
+# Common characters used to introduce list items
+bullets = [
+    "-",
+    "*",
+    "•",
+    "‣",
+    "⁃",
+    "⁌",
+    "⁍",
+    "∙",
+    "○",
+    "●",
+    "◘",
+    "◦",
+    "☙",
+    "❥",
+    "❧",
+    "⦾",
+    "⦿",
+]
diff --git a/tests/pipelines/core/test_sentences.py b/tests/pipelines/core/test_sentences.py
index b4e078393..a401114c9 100644
--- a/tests/pipelines/core/test_sentences.py
+++ b/tests/pipelines/core/test_sentences.py
@@ -50,3 +50,58 @@ def test_false_positives(blank_nlp):
     for fp in false_positives:
         doc = blank_nlp(fp)
         assert len(list(doc.sents)) == 1
+
+
+@mark.parametrize(
+    "split_options",
+    [
+        dict(
+            split_on_newlines=False,
+            n_sents=2,
+        ),
+        dict(
+            split_on_newlines="with_capitalized",
+            n_sents=3,
+        ),
+        dict(
+            split_on_newlines="with_uppercase",
+            n_sents=4,
+        ),
+        dict(
+            split_on_newlines="with_uppercase",
+            split_on_bullets=True,
+            n_sents=5,
+        ),
+    ],
+)
+def test_newline_split_options(blank_nlp, split_options):
+    # A period-ended sentence, three newline-separated segments, and a bullet item.
+    text = "Une première phrase. "
+    text += "Une deuxième\n"
+    text += "Peut-être un autre\n"
+    text += "ET encore une\n"
+    text += "- Enfin une dernière avec une liste."
+
+    segmenter = SentenceSegmenter(
+        blank_nlp.vocab,
+        punct_chars=terms.punctuation,
+        use_endlines=False,
+        split_on_newlines=split_options["split_on_newlines"],
+        split_on_bullets=split_options.get("split_on_bullets", False),
+    )
+
+    doc = segmenter(blank_nlp(text))
+    assert len(list(doc.sents)) == split_options["n_sents"]
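+
+
+def test_period_followed_by_digit(blank_nlp):
+    # Sketch of a regression test for the changelog's "punctuation followed
+    # by a digit" fix: the digit after "mg." should neither start a sentence
+    # nor leave a pending split before the following word.
+    segmenter = SentenceSegmenter(
+        blank_nlp.vocab,
+        punct_chars=terms.punctuation,
+        use_endlines=False,
+    )
+    doc = segmenter(blank_nlp("Une dose de 500 mg. 2 fois par jour."))
+    assert len(list(doc.sents)) == 1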