fix: custom training and brat/standoff doc pages
percevalw committed Jan 10, 2024
1 parent 7e41fae commit 634af8d
Showing 2 changed files with 60 additions and 13 deletions.
8 changes: 4 additions & 4 deletions docs/tutorials/make-a-training-script.md
```diff
@@ -167,7 +167,7 @@ training loop
         with nlp.cache():
             loss = torch.zeros((), device="cpu")
             for name, component in nlp.torch_components():
-                output = component.module_forward(batch[component.name])  # (1)
+                output = component.module_forward(batch[name])  # (1)
                 if "loss" in output:
                     loss += output["loss"]
```
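The bug fixed here is subtle: `nlp.torch_components()` yields `(name, component)` pairs, and the batch dictionary is keyed by the registered pipe name, which presumably can differ from `component.name`. A minimal, self-contained sketch of the corrected accumulation (the components below are hypothetical stand-ins, not edsnlp objects):

```python
def accumulate_losses(batch, named_components):
    """Sum the losses of torch components, indexing the batch by the
    registered pipe name (the fix: batch[name], not batch[component.name])."""
    loss = 0.0
    for name, component in named_components:  # mirrors nlp.torch_components()
        output = component(batch[name])
        if "loss" in output:
            loss += output["loss"]
    return loss

# Hypothetical components returning a dict, like module_forward would
components = [
    ("ner", lambda b: {"loss": sum(b)}),
    ("embedder", lambda b: {}),  # contributes no loss
]
batch = {"ner": [1.0, 2.0], "embedder": [0.0]}
```

Here `accumulate_losses(batch, components)` returns `3.0`: only the "ner" stand-in contributes a loss term.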
````diff
@@ -194,13 +194,13 @@ scorer = create_ner_exact_scorer(nlp.get_pipe('ner').target_span_getter)
     with nlp.select_pipes(enable=["ner"]):  # (1)
         print(scorer(val_docs, nlp.pipe(deepcopy(val_docs))))  # (2)
-    nlp.save("model")  # (3)
+    nlp.to_disk("model")  # (3)
 ```

 1. In case we have multiple pipes in our model, we may want to evaluate each pipe selectively, so we use the `select_pipes` method to disable every pipe except "ner".
 2. We use the `pipe` method to run the "ner" component on the validation dataset. This method is similar to the `__call__` method of EDS-NLP components, but it is used to run a component on a list of spaCy Docs.
-3. We could also have saved the model with `torch.save(model, "model.pt")`, but `nlp.save` avoids pickling and allows inspecting the model's files by saving them into a structured directory.
+3. We could also have saved the model with `torch.save(model, "model.pt")`, but `nlp.to_disk` avoids pickling and allows inspecting the model's files by saving them into a structured directory.
````

## Full example

```diff
@@ -298,7 +298,7 @@ Let's wrap the training code in a function, and make it callable from the command line
             loss = torch.zeros((), device="cpu")
             with nlp.cache():
                 for name, component in nlp.torch_components():
-                    output = component.module_forward(batch[component.name])
+                    output = component.module_forward(batch[name])
                     if "loss" in output:
                         loss += output["loss"]
```
65 changes: 56 additions & 9 deletions edsnlp/data/standoff.py
```diff
@@ -1,21 +1,33 @@
 # ruff: noqa: F401
 import glob
 import os
 import re
 from collections import Counter, defaultdict
 from pathlib import Path
-from typing import Any, Callable, Dict, Optional, Union
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Optional,
+    Union,
+)

 import spacy.tokenizer
 from loguru import logger

 from edsnlp import registry
 from edsnlp.core import PipelineProtocol
 from edsnlp.core.lazy_collection import LazyCollection
 from edsnlp.data.base import BaseReader, BaseWriter
 from edsnlp.data.converters import (
     FILENAME,
     AttributesMappingArg,
     SequenceStr,
     get_dict2doc_converter,
     get_doc2dict_converter,
 )
 from edsnlp.utils.collections import flatten_once
 from edsnlp.utils.span_getters import SpanSetterArg

 REGEX_ENTITY = re.compile(r"^(T\d+)\t(\S+)([^\t]+)\t(.*)$")
 REGEX_NOTE = re.compile(r"^(#\d+)\tAnnotatorNotes ([^\t]+)\t(.*)$")
```
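As an illustration of the entity pattern above, matching a typical standoff `.ann` entity line splits it into identifier, label, offsets, and covered text (the sample line is invented for the example):

```python
import re

# Same pattern as REGEX_ENTITY, applied to a sample standoff entity line
REGEX_ENTITY = re.compile(r"^(T\d+)\t(\S+)([^\t]+)\t(.*)$")

match = REGEX_ENTITY.match("T1\tDisease 0 5\tfever")
ent_id, label, offsets, text = match.groups()
# ent_id == "T1", label == "Disease", offsets.strip() == "0 5", text == "fever"
```

Note how `(\S+)` captures only the label while `([^\t]+)` absorbs the space-separated character offsets that follow it.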
````diff
@@ -382,20 +394,55 @@ def read_standoff(
     docs = list(edsnlp.data.read_standoff("path/to/brat/directory"))
     ```

     !!! warning "True/False attributes"
         Boolean values are not supported by the BRAT editor: a true value is
         stored as an empty attribute (key with an empty value), and a false value
         is simply not stored at all. This means that False values will not be
         assigned to attributes by default, which can be problematic when deciding
         whether an entity is negated or not: is the entity not negated, or has
         the negation attribute simply not been annotated?

         To avoid this issue, you can use the `bool_attributes` argument to specify
         which attributes should be considered as boolean when reading a BRAT
         dataset. These attributes will be assigned a value of `True` if they are
         present, and `False` otherwise.

         ```{ .python .no-check }
         doc_iterator = edsnlp.data.read_standoff(
             "path/to/brat/directory",
             # Mapping from 'BRAT attribute name' to 'Doc attribute name'
             span_attributes={"Negation": "negated"},
             bool_attributes=["negated"],  # Missing values will be set to False
         )
         ```
````
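The behaviour described in the warning can be sketched in a few lines of plain Python (a hypothetical helper, not the actual edsnlp implementation): attributes present in the raw BRAT annotations are true, and attributes declared as boolean default to `False` when absent.

```python
def resolve_bool_attributes(raw_attrs, bool_attributes):
    # BRAT stores a boolean attribute only when it is true (key with an
    # empty value); a missing key means False only for attributes that
    # were explicitly declared as boolean.
    attrs = {k: True if v == "" else v for k, v in raw_attrs.items()}
    for key in bool_attributes:
        attrs.setdefault(key, False)
    return attrs
```

For example, `resolve_bool_attributes({"Negation": ""}, ["Negation"])` yields `{"Negation": True}`, while an entity with no annotated attributes yields `{"Negation": False}` instead of leaving the value undefined.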
```diff
     Parameters
     ----------
-    path: Union[str, Path]
+    path : Union[str, Path]
         Path to the directory containing the BRAT files (will recursively look for
         files in subdirectories).
-    nlp: Optional[PipelineProtocol]
-        The pipeline instance (defaults to `edsnlp.blank("eds")`) used to tokenize
-        the documents.
-    span_setter: SpanSetterArg
+    nlp : Optional[PipelineProtocol]
+        The pipeline object (optional and likely not needed, prefer to use the
+        `tokenizer` argument directly instead).
+    tokenizer : Optional[spacy.tokenizer.Tokenizer]
+        The tokenizer instance used to tokenize the documents. Likely not needed
+        since by default it uses the current context tokenizer:
+
+        - the tokenizer of the next pipeline run by `.map_pipeline` in a
+          [LazyCollection][edsnlp.core.lazy_collection.LazyCollection],
+        - or the `eds` tokenizer by default.
+    span_setter : SpanSetterArg
         The span setter to use when setting the spans in the documents. Defaults to
         setting the spans in the `ents` attribute, and creates a new span group for
-        each BRAT entity label.
-    span_attributes: Optional[Union[Sequence[str], Mapping[str, str]]]
-        Mapping from BRAT
+        each JSON entity label.
+    span_attributes : Optional[AttributesMappingArg]
+        Mapping from BRAT attributes to Span extensions (can be a list too).
+        By default, all attributes are imported as Span extensions with the same
+        name.
+    keep_raw_attribute_values : bool
+        Whether to keep the raw attribute values (as strings) or to convert them to
+        Python objects (e.g. booleans).
+    bool_attributes : SequenceStr
+        List of attributes for which missing values should be set to False.

     Returns
     -------
```
