From b66eea198ba77013afbd9f4e7bca9706c740f08f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Perceval=20Wajsb=C3=BCrt?= <perceval.wajsburt-ext@aphp.fr>
Date: Tue, 8 Aug 2023 18:55:01 +0200
Subject: [PATCH] fix: add missing pipeline methods and tests in parallelize

---
 edsnlp/core/pipeline.py                       |  28 ++-
 .../trainable/test_span_classifier.py         | 169 ------------------
 tests/processing/test_processing.py           |   7 +-
 3 files changed, 29 insertions(+), 175 deletions(-)
 delete mode 100644 tests/pipelines/trainable/test_span_classifier.py

diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py
index dc6258ef5..693fb82ef 100644
--- a/edsnlp/core/pipeline.py
+++ b/edsnlp/core/pipeline.py
@@ -128,6 +128,8 @@ def pipeline(self) -> List[Tuple[str, Pipe]]:
     def pipe_names(self) -> List[str]:
         return FrozenList([name for name, _ in self._components])
 
+    component_names = pipe_names
+
     def get_pipe(self, name: str) -> Pipe:
         """
         Get a component by its name.
@@ -198,6 +200,9 @@ def create_pipe(
     def add_pipe(
         self,
         factory: Union[str, Pipe],
+        first: bool = False,
+        before: Optional[str] = None,
+        after: Optional[str] = None,
         name: Optional[str] = None,
         config: Optional[Dict[str, Any]] = None,
     ) -> Pipe:
@@ -211,6 +216,15 @@ def add_pipe(
         name: Optional[str]
             The name of the component. If not provided, the name of the component
             will be used if it has one (.name), otherwise the factory name will be used.
+        first: bool
+            Whether to add the component to the beginning of the pipeline. This argument
+            is mutually exclusive with `before` and `after`.
+        before: Optional[str]
+            The name of the component to add the new component before. This argument is
+            mutually exclusive with `after` and `first`.
+        after: Optional[str]
+            The name of the component to add the new component after. This argument is
+            mutually exclusive with `before` and `first`.
         config: Dict[str, Any]
             The arguments to pass to the component factory.
 
@@ -245,7 +259,19 @@ def add_pipe(
                         "The component does not have a name, so you must provide one",
                     )
                 pipe.name = name
-        self._components.append((name, pipe))
+        assert sum([before is not None, after is not None, first]) <= 1, (
+            "You can only use one of before, after, or first"
+        )
+        insertion_idx = (
+            0
+            if first
+            else self.pipe_names.index(before)
+            if before is not None
+            else self.pipe_names.index(after) + 1
+            if after is not None
+            else len(self._components)
+        )
+        self._components.insert(insertion_idx, (name, pipe))
         return pipe
 
     def get_pipe_meta(self, name: str) -> FactoryMeta:
diff --git a/tests/pipelines/trainable/test_span_classifier.py b/tests/pipelines/trainable/test_span_classifier.py
deleted file mode 100644
index d127f4f00..000000000
--- a/tests/pipelines/trainable/test_span_classifier.py
+++ /dev/null
@@ -1,169 +0,0 @@
-import spacy
-from pytest import fixture, mark
-from spacy.tokens import Span
-from spacy.training import Corpus, Example
-
-from edsnlp.pipelines.trainable.span_qualifier.factory import SPAN_QUALIFIER_DEFAULTS
-from edsnlp.utils.training import make_spacy_corpus_config, train
-
-if not Span.has_extension("label"):
-    Span.set_extension("label", default=None)
-
-
-if not Span.has_extension("event_type"):
-    Span.set_extension("event_type", default=None)
-
-
-if not Span.has_extension("test_negated"):
-    Span.set_extension("test_negated", default=False)
-
-
-@fixture
-def gold(blank_nlp):
-    doc1 = blank_nlp.make_doc("Arret du ttt si folfox inefficace")
-
-    doc1.spans["sc"] = [
-        # drug = "folfox"
-        Span(doc1, 4, 5, "drug"),
-        # event = "Arret"
-        Span(doc1, 0, 1, "event"),
-        # criteria = "si"
-        Span(doc1, 3, 4, "criteria"),
-    ]
-    doc1.spans["sc"][0]._.test_negated = False
-    doc1.spans["sc"][1]._.test_negated = True
-    doc1.spans["sc"][2]._.test_negated = False
-    doc1.spans["sc"][1]._.event_type = "stop"
-
-    doc1.spans["sent"] = [Span(doc1, 0, 6, "sent")]
-
-    doc2 = blank_nlp.make_doc("Début du traitement")
-
-    span = Span(doc2, 0, 1, "event")
-    doc2.ents = [
-        # drug = "Début"
-        span,
-    ]
-    span._.test_negated = False
-    span._.event_type = "start"
-
-    doc2.spans["sent"] = [Span(doc2, 0, 3, "sent")]
-
-    return [doc1, doc2]
-
-
-@spacy.registry.readers.register("test-span-classification-corpus")
-class SpanClassificationCorpus(Corpus):
-    def _make_example(self, nlp, reference, gold_preproc: bool):
-        pred = reference.copy()
-        pred.user_data = {
-            key: value
-            for key, value in pred.user_data.items()
-            if not (isinstance(key, tuple) and len(key) == 4 and key[0] == "._.")
-        }
-        return Example(
-            pred,
-            reference,
-        )
-
-
-@mark.parametrize("lang", ["eds"], indirect=True)
-def test_span_qualifier_label_training(gold, tmp_path):
-    tmp_path.mkdir(parents=True, exist_ok=True)
-
-    nlp = spacy.blank("eds")
-    nlp.add_pipe(
-        "span_qualifier",
-        config={
-            **SPAN_QUALIFIER_DEFAULTS,
-            "qualifiers": ("label_",),
-            "on_ents": False,
-            "on_span_groups": True,
-            "model": {
-                **SPAN_QUALIFIER_DEFAULTS["model"],
-            },
-        },
-    )
-
-    train(
-        nlp,
-        output_path=tmp_path,
-        config=dict(
-            **make_spacy_corpus_config(
-                train_data=gold,
-                dev_data=gold,
-                reader="test-span-classification-corpus",
-            ),
-            **{
-                "training.max_steps": 10,
-                "training.eval_frequency": 5,
-                # "training.optimizer.learn_rate": 0,
-            },
-        ),
-    )
-    nlp = spacy.load(tmp_path / "model-best")
-
-    pred = gold[0].copy()
-    pred.spans["sc"] = [
-        Span(span.doc, span.start, span.end, "ent") for span in pred.spans["sc"]
-    ]
-    pred.user_data = {
-        key: value
-        for key, value in pred.user_data.items()
-        if not (isinstance(key, tuple) and len(key) == 4 and key[0] == "._.")
-    }
-    pred = nlp(pred)
-    scores = nlp.pipeline[-1][1].score([Example(pred, gold[0])])
-    assert [span.label_ for span in pred.spans["sc"]] == ["drug", "event", "criteria"]
-    assert scores["qual_f"] == 1.0
-
-
-@mark.parametrize("lang", ["eds"], indirect=True)
-def test_span_qualifier_constrained_training(gold, tmp_path):
-    tmp_path.mkdir(parents=True, exist_ok=True)
-
-    nlp = spacy.blank("eds")
-    nlp.add_pipe(
-        "span_qualifier",
-        config={
-            **SPAN_QUALIFIER_DEFAULTS,
-            "candidate_getter": {
-                "@misc": "eds.candidate_span_qualifier_getter",
-                "qualifiers": ("_.test_negated", "_.event_type"),
-                "label_constraints": {"_.event_type": ("event",)},
-                "on_ents": False,
-                "on_span_groups": ("sc",),
-            },
-            "model": SPAN_QUALIFIER_DEFAULTS["model"],
-        },
-    )
-
-    train(
-        nlp,
-        output_path=tmp_path,
-        config=dict(
-            **make_spacy_corpus_config(
-                train_data=gold,
-                dev_data=gold,
-                reader="test-span-classification-corpus",
-            ),
-            **{
-                "training.max_steps": 5,
-                "training.eval_frequency": 5,
-            },
-        ),
-    )
-    nlp = spacy.load(tmp_path / "model-best")
-
-    pred = gold[0].copy()
-    pred.user_data = {
-        key: value
-        for key, value in pred.user_data.items()
-        if not (isinstance(key, tuple) and len(key) == 4 and key[0] == "._.")
-    }
-    assert [span._.test_negated for span in pred.spans["sc"]] == [False, False, False]
-    pred = nlp(pred)
-    scores = nlp.pipeline[-1][1].score([Example(pred, gold[0])])
-    assert [s._.test_negated for s in pred.spans["sc"]] == [False, True, False]
-    assert [s._.event_type for s in pred.spans["sc"]] == [None, "stop", None]
-    assert scores["qual_f"] == 1.0
diff --git a/tests/processing/test_processing.py b/tests/processing/test_processing.py
index c9f70e0a1..bce0a5027 100644
--- a/tests/processing/test_processing.py
+++ b/tests/processing/test_processing.py
@@ -3,7 +3,6 @@
 import databricks.koalas  # noqa F401
 import pandas as pd
 import pytest
-import spacy
 from pyspark.sql import types as T
 from pyspark.sql.session import SparkSession
 
@@ -58,14 +57,12 @@ def note(module: DataFrameModules):
 
 
 @pytest.fixture
-def model(lang):
+def model(blank_nlp):
     # Creates the spaCy instance
-    nlp = spacy.blank(lang)
+    nlp = blank_nlp
 
     # Normalisation of accents, case and other special characters
     nlp.add_pipe("eds.normalizer")
-    # Detecting end of lines
-    nlp.add_pipe("eds.sentences")
 
     # Extraction of named entities
     nlp.add_pipe(