fix: better handle empty documents
percevalw committed Jun 25, 2024
1 parent 0728c7a · commit b73336a
Showing 5 changed files with 32 additions and 21 deletions.
1 change: 1 addition & 0 deletions changelog.md
@@ -10,6 +10,7 @@
 
 - `edsnlp.load("your/huggingface-model", install_dependencies=True)` now correctly resolves the python pip
   (especially on Colab) to auto-install the model dependencies
+- We now better handle empty documents in the `eds.transformer`, `eds.text_cnn` and `eds.ner_crf` components
 
 ## v0.12.3
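A minimal sketch of the behavior this entry describes, assuming a trained pipeline that includes the affected trainable components (the model path below is a placeholder):

```python
import edsnlp

# Placeholder path: any pipeline using eds.transformer, eds.text_cnn
# or eds.ner_crf is affected by this fix.
nlp = edsnlp.load("path/to/your/trained-model")

# Before this commit, an empty input could crash inside the trainable
# layers; it should now return a valid Doc with no annotations.
doc = nlp("")
assert len(doc.ents) == 0
```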
43 changes: 22 additions & 21 deletions edsnlp/pipes/trainable/layers/crf.py
@@ -151,27 +151,28 @@ def decode(self, emissions, mask):
         )
         path = torch.zeros(*emissions.shape[:-1], dtype=torch.long)
 
-        emissions[..., 1:][~mask] = IMPOSSIBLE
-        emissions = emissions.unbind(1)  # 1 is axis for words
-
-        # emissions: n_tokens * n_samples * n_tags
-        out = [emissions[0] + start_transitions]
-        backtrack = []
-
-        for k in range(1, len(emissions)):
-            res, indices = max_reduce(out[-1], transitions)
-            backtrack.append(indices)
-            out.append(res + emissions[k])
-
-        res, indices = max_reduce(out[-1], end_transitions.unsqueeze(-1))
-        path[:, -1] = indices.squeeze(-1)
-
-        # If mask has shape n_samples * n_tokens,
-        # we only need range(n_samples)
-        if len(backtrack) > 1:
-            # Backward max path following
-            for k, b in enumerate(backtrack[::-1]):
-                path[:, -k - 2] = index_dim(b, path[:, -k - 1], dim=-1)
+        if 0 not in emissions.shape:
+            emissions[..., 1:][~mask] = IMPOSSIBLE
+            emissions = emissions.unbind(1)  # 1 is axis for words
+
+            # emissions: n_tokens * n_samples * n_tags
+            out = [emissions[0] + start_transitions]
+            backtrack = []
+
+            for k in range(1, len(emissions)):
+                res, indices = max_reduce(out[-1], transitions)
+                backtrack.append(indices)
+                out.append(res + emissions[k])
+
+            res, indices = max_reduce(out[-1], end_transitions.unsqueeze(-1))
+            path[:, -1] = indices.squeeze(-1)
+
+            # If mask has shape n_samples * n_tokens,
+            # we only need range(n_samples)
+            if len(backtrack) > 1:
+                # Backward max path following
+                for k, b in enumerate(backtrack[::-1]):
+                    path[:, -k - 2] = index_dim(b, path[:, -k - 1], dim=-1)
 
         return path.to(transitions.device)
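For context, a minimal sketch (not part of the commit) of why the `0 not in emissions.shape` guard is needed: with zero words, unbinding along the word axis yields an empty tuple, so the Viterbi recursion would fail on its very first step:

```python
import torch

# Emissions for an empty document: n_samples=2, n_words=0, n_tags=5
emissions = torch.zeros(2, 0, 5)

# Unbinding along the word axis yields an empty tuple, so the unguarded
# code would raise an IndexError on `emissions[0]`.
assert len(emissions.unbind(1)) == 0

# The pre-allocated path already has the right (n_samples, n_words)
# shape, so the guarded version can return it unchanged.
path = torch.zeros(*emissions.shape[:-1], dtype=torch.long)
assert path.shape == (2, 0)
```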
2 changes: 2 additions & 0 deletions edsnlp/pipes/trainable/layers/text_cnn.py
@@ -78,6 +78,8 @@ def forward(
         self, embeddings: torch.FloatTensor, mask: torch.BoolTensor
     ) -> torch.FloatTensor:
         # shape: samples words dim
+        if 0 in embeddings.shape:
+            return embeddings.view((*embeddings.shape[:-1], self.linear.out_features))  # type: ignore
         max_k = max(conv.kernel_size[0] for conv in self.convolutions)
         left_pad = (max_k) // 2
         right_pad = (max_k - 1) // 2
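A standalone sketch (hypothetical shapes and sizes) of the early-return trick above: a zero-element tensor can be `view`ed to any other zero-element shape, so the empty input can be reshaped to the layer's output width without running the convolutions at all:

```python
import torch

embeddings = torch.zeros(0, 10, 96)  # empty batch: 0 samples, 10 words, dim 96
out_features = 128                   # stand-in for self.linear.out_features

# view() only requires the element counts to match (0 == 0 here), so the
# empty input can take the output feature size directly.
out = embeddings.view((*embeddings.shape[:-1], out_features))
assert out.shape == (0, 10, 128)
```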
1 change: 1 addition & 0 deletions edsnlp/utils/torch.py
@@ -71,6 +71,7 @@ def make_windows(lengths, size, stride):
             for idx in range(0, 1 + max(0, math.ceil((length - size) / stride)))
         ],
         pad=-1,
+        dtype=torch.long,
     )
     windows_mask = windows != -1
     windows[~windows_mask] = 0
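The explicit `dtype=torch.long` matters for empty documents: with no windows to pad, the resulting tensor would otherwise default to float and break the integer masking and indexing that follows. A small sketch of the underlying PyTorch behavior (assuming the default dtype is unchanged):

```python
import torch

# An empty tensor defaults to float32, which cannot be used as an index.
assert torch.tensor([]).dtype == torch.float32

# Forcing long keeps the empty case consistent with the non-empty one,
# where window indices are naturally integers.
assert torch.tensor([], dtype=torch.long).dtype == torch.int64
```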
6 changes: 6 additions & 0 deletions tests/training/test_train.py
@@ -93,6 +93,9 @@ def test_ner_qualif_train(run_in_test_dir, tmp_path):
     scorer = GenericScorer(**kwargs["scorer"])
     last_scores = scorer(nlp, Reader(**kwargs["val_data"])(nlp))
 
+    # Check empty doc
+    nlp("")
+
     assert last_scores["ner"]["micro"]["f"] > 0.4
     assert last_scores["qual"]["micro"]["f"] > 0.4
 
@@ -106,6 +109,9 @@ def test_qualif_train(run_in_test_dir, tmp_path):
     scorer = GenericScorer(**kwargs["scorer"])
     last_scores = scorer(nlp, Reader(**kwargs["val_data"])(nlp))
 
+    # Check empty doc
+    nlp("")
+
     assert last_scores["qual"]["micro"]["f"] >= 0.4
 
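The `nlp("")` calls above are pure smoke checks: inference on an empty document must not raise. A slightly stricter variant one could use (a sketch only; `nlp` stands for the pipeline built in each test):

```python
doc = nlp("")
# An empty input should yield an empty Doc with no predicted entities.
assert len(doc) == 0
assert len(doc.ents) == 0
```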
