fix: add missing pipeline methods and tests in parallelize

aphp · Aug 8, 2023 · b66eea1 · b66eea1
1 parent ee9dacb
commit b66eea1
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 175 deletions.
diff --git a/edsnlp/core/pipeline.py b/edsnlp/core/pipeline.py
@@ -128,6 +128,8 @@ def pipeline(self) -> List[Tuple[str, Pipe]]:
     def pipe_names(self) -> List[str]:
         return FrozenList([name for name, _ in self._components])
 
+    component_names = pipe_names
+
     def get_pipe(self, name: str) -> Pipe:
         """
         Get a component by its name.
@@ -198,6 +200,9 @@ def create_pipe(
     def add_pipe(
         self,
         factory: Union[str, Pipe],
+        first: bool = False,
+        before: Optional[str] = None,
+        after: Optional[str] = None,
         name: Optional[str] = None,
         config: Optional[Dict[str, Any]] = None,
     ) -> Pipe:
@@ -211,6 +216,15 @@ def add_pipe(
         name: Optional[str]
             The name of the component. If not provided, the name of the component
             will be used if it has one (.name), otherwise the factory name will be used.
+        first: bool
+            Whether to add the component to the beginning of the pipeline. This argument
+            is mutually exclusive with `before` and `after`.
+        before: Optional[str]
+            The name of the component to add the new component before. This argument is
+            mutually exclusive with `after` and `first`.
+        after: Optional[str]
+            The name of the component to add the new component after. This argument is
+            mutually exclusive with `before` and `first`.
         config: Dict[str, Any]
             The arguments to pass to the component factory.
 
@@ -245,7 +259,19 @@ def add_pipe(
                         "The component does not have a name, so you must provide one",
                     )
                 pipe.name = name
-        self._components.append((name, pipe))
+        assert sum([before is not None, after is not None, first]) <= 1, (
+            "You can only use one of before, after, or first",
+        )
+        insertion_idx = (
+            0
+            if first
+            else self.pipe_names.index(before)
+            if before is not None
+            else self.pipe_names.index(after) + 1
+            if after is not None
+            else len(self._components)
+        )
+        self._components.insert(insertion_idx, (name, pipe))
         return pipe
 
     def get_pipe_meta(self, name: str) -> FactoryMeta:

diff --git a/tests/pipelines/trainable/test_span_classifier.py b/tests/pipelines/trainable/test_span_classifier.py
diff --git a/tests/processing/test_processing.py b/tests/processing/test_processing.py
@@ -3,7 +3,6 @@
 import databricks.koalas  # noqa F401
 import pandas as pd
 import pytest
-import spacy
 from pyspark.sql import types as T
 from pyspark.sql.session import SparkSession
 
@@ -58,14 +57,12 @@ def note(module: DataFrameModules):
 
 
 @pytest.fixture
-def model(lang):
+def model(blank_nlp):
     # Creates the spaCy instance
-    nlp = spacy.blank(lang)
+    nlp = blank_nlp
 
     # Normalisation of accents, case and other special characters
     nlp.add_pipe("eds.normalizer")
-    # Detecting end of lines
-    nlp.add_pipe("eds.sentences")
 
     # Extraction of named entities
     nlp.add_pipe(