From b66eea198ba77013afbd9f4e7bca9706c740f08f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= Date: Tue, 8 Aug 2023 18:55:01 +0200 Subject: [PATCH] fix: add missing pipeline methods and tests in parallelize --- edsnlp/core/pipeline.py | 28 ++- .../trainable/test_span_classifier.py | 169 ------------------ tests/processing/test_processing.py | 7 +- 3 files changed, 29 insertions(+), 175 deletions(-) delete mode 100644 tests/pipelines/trainable/test_span_classifier.py diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py index dc6258ef5..693fb82ef 100644 --- a/edsnlp/core/pipeline.py +++ b/edsnlp/core/pipeline.py @@ -128,6 +128,8 @@ def pipeline(self) -> List[Tuple[str, Pipe]]: def pipe_names(self) -> List[str]: return FrozenList([name for name, _ in self._components]) + component_names = pipe_names + def get_pipe(self, name: str) -> Pipe: """ Get a component by its name. @@ -198,6 +200,9 @@ def create_pipe( def add_pipe( self, factory: Union[str, Pipe], + first: bool = False, + before: Optional[str] = None, + after: Optional[str] = None, name: Optional[str] = None, config: Optional[Dict[str, Any]] = None, ) -> Pipe: @@ -211,6 +216,15 @@ def add_pipe( name: Optional[str] The name of the component. If not provided, the name of the component will be used if it has one (.name), otherwise the factory name will be used. + first: bool + Whether to add the component to the beginning of the pipeline. This argument + is mutually exclusive with `before` and `after`. + before: Optional[str] + The name of the component to add the new component before. This argument is + mutually exclusive with `after` and `first`. + after: Optional[str] + The name of the component to add the new component after. This argument is + mutually exclusive with `before` and `first`. config: Dict[str, Any] The arguments to pass to the component factory. @@ -245,7 +259,19 @@ def add_pipe( "The component does not have a name, so you must provide one", ) pipe.name = name - self._components.append((name, pipe)) + assert sum([before is not None, after is not None, first]) <= 1, ( + "You can only use one of before, after, or first", + ) + insertion_idx = ( + 0 + if first + else self.pipe_names.index(before) + if before is not None + else self.pipe_names.index(after) + 1 + if after is not None + else len(self._components) + ) + self._components.insert(insertion_idx, (name, pipe)) return pipe def get_pipe_meta(self, name: str) -> FactoryMeta: diff --git a/tests/pipelines/trainable/test_span_classifier.py b/tests/pipelines/trainable/test_span_classifier.py deleted file mode 100644 index d127f4f00..000000000 --- a/tests/pipelines/trainable/test_span_classifier.py +++ /dev/null @@ -1,169 +0,0 @@ -import spacy -from pytest import fixture, mark -from spacy.tokens import Span -from spacy.training import Corpus, Example - -from edsnlp.pipelines.trainable.span_qualifier.factory import SPAN_QUALIFIER_DEFAULTS -from edsnlp.utils.training import make_spacy_corpus_config, train - -if not Span.has_extension("label"): - Span.set_extension("label", default=None) - - -if not Span.has_extension("event_type"): - Span.set_extension("event_type", default=None) - - -if not Span.has_extension("test_negated"): - Span.set_extension("test_negated", default=False) - - -@fixture -def gold(blank_nlp): - doc1 = blank_nlp.make_doc("Arret du ttt si folfox inefficace") - - doc1.spans["sc"] = [ - # drug = "folfox" - Span(doc1, 4, 5, "drug"), - # event = "Arret" - Span(doc1, 0, 1, "event"), - # criteria = "si" - Span(doc1, 3, 4, "criteria"), - ] - doc1.spans["sc"][0]._.test_negated = False - doc1.spans["sc"][1]._.test_negated = True - doc1.spans["sc"][2]._.test_negated = False - doc1.spans["sc"][1]._.event_type = "stop" - - doc1.spans["sent"] = [Span(doc1, 0, 6, "sent")] - - doc2 = blank_nlp.make_doc("Début du traitement") - - span = Span(doc2, 0, 1, "event") - doc2.ents = [ - # drug = "Début" - span, - ] - span._.test_negated = False - span._.event_type = "start" - - doc2.spans["sent"] = [Span(doc2, 0, 3, "sent")] - - return [doc1, doc2] - - -@spacy.registry.readers.register("test-span-classification-corpus") -class SpanClassificationCorpus(Corpus): - def _make_example(self, nlp, reference, gold_preproc: bool): - pred = reference.copy() - pred.user_data = { - key: value - for key, value in pred.user_data.items() - if not (isinstance(key, tuple) and len(key) == 4 and key[0] == "._.") - } - return Example( - pred, - reference, - ) - - -@mark.parametrize("lang", ["eds"], indirect=True) -def test_span_qualifier_label_training(gold, tmp_path): - tmp_path.mkdir(parents=True, exist_ok=True) - - nlp = spacy.blank("eds") - nlp.add_pipe( - "span_qualifier", - config={ - **SPAN_QUALIFIER_DEFAULTS, - "qualifiers": ("label_",), - "on_ents": False, - "on_span_groups": True, - "model": { - **SPAN_QUALIFIER_DEFAULTS["model"], - }, - }, - ) - - train( - nlp, - output_path=tmp_path, - config=dict( - **make_spacy_corpus_config( - train_data=gold, - dev_data=gold, - reader="test-span-classification-corpus", - ), - **{ - "training.max_steps": 10, - "training.eval_frequency": 5, - # "training.optimizer.learn_rate": 0, - }, - ), - ) - nlp = spacy.load(tmp_path / "model-best") - - pred = gold[0].copy() - pred.spans["sc"] = [ - Span(span.doc, span.start, span.end, "ent") for span in pred.spans["sc"] - ] - pred.user_data = { - key: value - for key, value in pred.user_data.items() - if not (isinstance(key, tuple) and len(key) == 4 and key[0] == "._.") - } - pred = nlp(pred) - scores = nlp.pipeline[-1][1].score([Example(pred, gold[0])]) - assert [span.label_ for span in pred.spans["sc"]] == ["drug", "event", "criteria"] - assert scores["qual_f"] == 1.0 - - -@mark.parametrize("lang", ["eds"], indirect=True) -def test_span_qualifier_constrained_training(gold, tmp_path): - tmp_path.mkdir(parents=True, exist_ok=True) - - nlp = spacy.blank("eds") - nlp.add_pipe( - "span_qualifier", - config={ - **SPAN_QUALIFIER_DEFAULTS, - "candidate_getter": { - "@misc": "eds.candidate_span_qualifier_getter", - "qualifiers": ("_.test_negated", "_.event_type"), - "label_constraints": {"_.event_type": ("event",)}, - "on_ents": False, - "on_span_groups": ("sc",), - }, - "model": SPAN_QUALIFIER_DEFAULTS["model"], - }, - ) - - train( - nlp, - output_path=tmp_path, - config=dict( - **make_spacy_corpus_config( - train_data=gold, - dev_data=gold, - reader="test-span-classification-corpus", - ), - **{ - "training.max_steps": 5, - "training.eval_frequency": 5, - }, - ), - ) - nlp = spacy.load(tmp_path / "model-best") - - pred = gold[0].copy() - pred.user_data = { - key: value - for key, value in pred.user_data.items() - if not (isinstance(key, tuple) and len(key) == 4 and key[0] == "._.") - } - assert [span._.test_negated for span in pred.spans["sc"]] == [False, False, False] - pred = nlp(pred) - scores = nlp.pipeline[-1][1].score([Example(pred, gold[0])]) - assert [s._.test_negated for s in pred.spans["sc"]] == [False, True, False] - assert [s._.event_type for s in pred.spans["sc"]] == [None, "stop", None] - assert scores["qual_f"] == 1.0 diff --git a/tests/processing/test_processing.py b/tests/processing/test_processing.py index c9f70e0a1..bce0a5027 100644 --- a/tests/processing/test_processing.py +++ b/tests/processing/test_processing.py @@ -3,7 +3,6 @@ import databricks.koalas # noqa F401 import pandas as pd import pytest -import spacy from pyspark.sql import types as T from pyspark.sql.session import SparkSession @@ -58,14 +57,12 @@ def note(module: DataFrameModules): @pytest.fixture -def model(lang): +def model(blank_nlp): # Creates the spaCy instance - nlp = spacy.blank(lang) + nlp = blank_nlp # Normalisation of accents, case and other special characters nlp.add_pipe("eds.normalizer") - # Detecting end of lines - nlp.add_pipe("eds.sentences") # Extraction of named entities nlp.add_pipe(