fix: better handle empty documents
percevalw committed Jun 25, 2024
1 parent 0728c7a · commit b73336a
Showing 5 changed files with 32 additions and 21 deletions.
1 change: 1 addition & 0 deletions changelog.md
@@ -10,6 +10,7 @@
 
 - `edsnlp.load("your/huggingface-model", install_dependencies=True)` now correctly resolves the python pip
   (especially on Colab) to auto-install the model dependencies
+- We now better handle empty documents in the `eds.transformer`, `eds.text_cnn` and `eds.ner_crf` components
 
 ## v0.12.3
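A minimal sketch of the behavior this entry describes, assuming a trained pipeline that includes the affected trainable components (the model path below is a placeholder):

```python
import edsnlp

# Placeholder path: any pipeline using eds.transformer, eds.text_cnn
# or eds.ner_crf is affected by this fix.
nlp = edsnlp.load("path/to/your/trained-model")

# Before this commit, an empty input could crash inside the trainable
# layers; it should now return a valid Doc with no annotations.
doc = nlp("")
assert len(doc.ents) == 0
```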
43 changes: 22 additions & 21 deletions edsnlp/pipes/trainable/layers/crf.py
@@ -151,27 +151,28 @@ def decode(self, emissions, mask):
         )
         path = torch.zeros(*emissions.shape[:-1], dtype=torch.long)
 
-        emissions[..., 1:][~mask] = IMPOSSIBLE
-        emissions = emissions.unbind(1)  # 1 is axis for words
-
-        # emissions: n_tokens * n_samples * n_tags
-        out = [emissions[0] + start_transitions]
-        backtrack = []
-
-        for k in range(1, len(emissions)):
-            res, indices = max_reduce(out[-1], transitions)
-            backtrack.append(indices)
-            out.append(res + emissions[k])
-
-        res, indices = max_reduce(out[-1], end_transitions.unsqueeze(-1))
-        path[:, -1] = indices.squeeze(-1)
-
-        # If mask has shape n_samples * n_tokens,
-        # we only need range(n_samples)
-        if len(backtrack) > 1:
-            # Backward max path following
-            for k, b in enumerate(backtrack[::-1]):
-                path[:, -k - 2] = index_dim(b, path[:, -k - 1], dim=-1)
+        if 0 not in emissions.shape:
+            emissions[..., 1:][~mask] = IMPOSSIBLE
+            emissions = emissions.unbind(1)  # 1 is axis for words
+
+            # emissions: n_tokens * n_samples * n_tags
+            out = [emissions[0] + start_transitions]
+            backtrack = []
+
+            for k in range(1, len(emissions)):
+                res, indices = max_reduce(out[-1], transitions)
+                backtrack.append(indices)
+                out.append(res + emissions[k])
+
+            res, indices = max_reduce(out[-1], end_transitions.unsqueeze(-1))
+            path[:, -1] = indices.squeeze(-1)
+
+            # If mask has shape n_samples * n_tokens,
+            # we only need range(n_samples)
+            if len(backtrack) > 1:
+                # Backward max path following
+                for k, b in enumerate(backtrack[::-1]):
+                    path[:, -k - 2] = index_dim(b, path[:, -k - 1], dim=-1)
 
         return path.to(transitions.device)
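For context, a minimal sketch (not part of the commit) of why the `0 not in emissions.shape` guard is needed: with zero words, unbinding along the word axis yields an empty tuple, so the Viterbi recursion would fail on its very first step:

```python
import torch

# Emissions for an empty document: n_samples=2, n_words=0, n_tags=5
emissions = torch.zeros(2, 0, 5)

# Unbinding along the word axis yields an empty tuple, so the unguarded
# code would raise an IndexError on `emissions[0]`.
assert len(emissions.unbind(1)) == 0

# The pre-allocated path already has the right (n_samples, n_words)
# shape, so the guarded version can return it unchanged.
path = torch.zeros(*emissions.shape[:-1], dtype=torch.long)
assert path.shape == (2, 0)
```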
2 changes: 2 additions & 0 deletions edsnlp/pipes/trainable/layers/text_cnn.py
@@ -78,6 +78,8 @@ def forward(
         self, embeddings: torch.FloatTensor, mask: torch.BoolTensor
     ) -> torch.FloatTensor:
         # shape: samples words dim
+        if 0 in embeddings.shape:
+            return embeddings.view((*embeddings.shape[:-1], self.linear.out_features))  # type: ignore
         max_k = max(conv.kernel_size[0] for conv in self.convolutions)
         left_pad = (max_k) // 2
         right_pad = (max_k - 1) // 2
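A standalone sketch (hypothetical shapes and sizes) of the early-return trick above: a zero-element tensor can be `view`ed to any other zero-element shape, so the empty input can be reshaped to the layer's output width without running the convolutions at all:

```python
import torch

embeddings = torch.zeros(0, 10, 96)  # empty batch: 0 samples, 10 words, dim 96
out_features = 128                   # stand-in for self.linear.out_features

# view() only requires the element counts to match (0 == 0 here), so the
# empty input can take the output feature size directly.
out = embeddings.view((*embeddings.shape[:-1], out_features))
assert out.shape == (0, 10, 128)
```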
1 change: 1 addition & 0 deletions edsnlp/utils/torch.py
@@ -71,6 +71,7 @@ def make_windows(lengths, size, stride):
             for idx in range(0, 1 + max(0, math.ceil((length - size) / stride)))
         ],
         pad=-1,
+        dtype=torch.long,
     )
     windows_mask = windows != -1
     windows[~windows_mask] = 0
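The explicit `dtype=torch.long` matters for empty documents: with no windows to pad, the resulting tensor would otherwise default to float and break the integer masking and indexing that follows. A small sketch of the underlying PyTorch behavior (assuming the default dtype is unchanged):

```python
import torch

# An empty tensor defaults to float32, which cannot be used as an index.
assert torch.tensor([]).dtype == torch.float32

# Forcing long keeps the empty case consistent with the non-empty one,
# where window indices are naturally integers.
assert torch.tensor([], dtype=torch.long).dtype == torch.int64
```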
6 changes: 6 additions & 0 deletions tests/training/test_train.py
@@ -93,6 +93,9 @@ def test_ner_qualif_train(run_in_test_dir, tmp_path):
     scorer = GenericScorer(**kwargs["scorer"])
     last_scores = scorer(nlp, Reader(**kwargs["val_data"])(nlp))
 
+    # Check empty doc
+    nlp("")
+
     assert last_scores["ner"]["micro"]["f"] > 0.4
     assert last_scores["qual"]["micro"]["f"] > 0.4
 
@@ -106,6 +109,9 @@ def test_qualif_train(run_in_test_dir, tmp_path):
     scorer = GenericScorer(**kwargs["scorer"])
     last_scores = scorer(nlp, Reader(**kwargs["val_data"])(nlp))
 
+    # Check empty doc
+    nlp("")
+
     assert last_scores["qual"]["micro"]["f"] >= 0.4
 
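The `nlp("")` calls above are pure smoke checks: inference on an empty document must not raise. A slightly stricter variant one could use (a sketch only; `nlp` stands for the pipeline built in each test):

```python
doc = nlp("")
# An empty input should yield an empty Doc with no predicted entities.
assert len(doc) == 0
assert len(doc.ents) == 0
```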
