Skip to content

Commit

Permalink
CU-86938vf30 add trainer callbacks for Transformer NER
Browse files Browse the repository at this point in the history
  • Loading branch information
baixiac committed Dec 4, 2023
1 parent 76b75cc commit fc992a7
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 2 deletions.
16 changes: 14 additions & 2 deletions medcat/ner/transformers_ner.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import json
import logging
import datasets
from spacy.tokens import Doc
from datetime import datetime
from typing import Iterable, Iterator, Optional, Dict, List, cast, Union
Expand All @@ -18,7 +19,7 @@

from transformers import Trainer, AutoModelForTokenClassification, AutoTokenizer
from transformers import pipeline, TrainingArguments
import datasets
from transformers.trainer_callback import TrainerCallback

# It should be safe to do this always, as all other multiprocessing
#will be finished before data comes to meta_cat
Expand Down Expand Up @@ -137,7 +138,12 @@ def merge_data_loaded(base, other):

return out_path

def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=False, dataset=None, meta_requirements=None):
def train(self,
json_path: Union[str, list, None]=None,
ignore_extra_labels=False,
dataset=None,
meta_requirements=None,
trainer_callbacks: Optional[List[TrainerCallback]]=None):
"""Train or continue training a model give a json_path containing a MedCATtrainer export. It will
continue training if an existing model is loaded or start new training if the model is blank/new.
Expand All @@ -149,6 +155,9 @@ def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=Fals
ignore_extra_labels:
Makes only sense when an existing deid model was loaded and from the new data we want to ignore
labels that did not exist in the old model.
trainer_callbacks (List[TrainerCallback]):
A list of trainer callback classes (not instances) for collecting metrics during the training at the client side.
Each entry is called once with the transformers Trainer object, and the callback it returns is registered on that trainer.
"""

if dataset is None and json_path is not None:
Expand Down Expand Up @@ -193,6 +202,9 @@ def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=Fals
compute_metrics=lambda p: metrics(p, tokenizer=self.tokenizer, dataset=encoded_dataset['test'], verbose=self.config.general['verbose_metrics']),
data_collator=data_collator, # type: ignore
tokenizer=None)
if trainer_callbacks:
for callback in trainer_callbacks:
trainer.add_callback(callback(trainer))

trainer.train() # type: ignore

Expand Down
46 changes: 46 additions & 0 deletions tests/ner/test_transformers_ner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import os
import unittest
from spacy.lang.en import English
from spacy.tokens import Doc, Span
from transformers import TrainerCallback
from medcat.ner.transformers_ner import TransformersNER
from medcat.config import Config
from medcat.cdb_maker import CDBMaker


class TransformerNERTest(unittest.TestCase):
    """Tests for ``TransformersNER`` built on top of the example CDB."""

    @classmethod
    def setUpClass(cls) -> None:
        """Build the CDB from the example CSV and create one shared NER component.

        The component is stored on the class as ``undertest`` and reused by
        every test method.
        """
        config = Config()
        config.general["spacy_model"] = "en_core_web_md"
        cdb_maker = CDBMaker(config)
        cdb_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "examples", "cdb.csv")
        cdb = cdb_maker.prepare_csvs([cdb_csv], full_build=True)
        # Register the custom spaCy extensions; force=True is passed so that
        # registering them again (e.g. from another test module) is tolerated.
        Doc.set_extension("ents", default=[], force=True)
        Span.set_extension("confidence", default=-1, force=True)
        Span.set_extension("id", default=0, force=True)
        Span.set_extension("detected_name", default=None, force=True)
        Span.set_extension("link_candidates", default=None, force=True)
        Span.set_extension("cui", default=-1, force=True)
        Span.set_extension("context_similarity", default=-1, force=True)
        cls.undertest = TransformersNER(cdb)
        cls.undertest.create_eval_pipeline()

    def test_pipe(self):
        """Running the pipe over a document yields at least one entity."""
        doc = English().make_doc("Intracerebral hemorrhage is not Movar Virus")
        doc = next(self.undertest.pipe([doc]))
        # unittest assertion instead of a bare assert, which is stripped under -O.
        self.assertGreater(len(doc.ents), 0, "No entities were recognised")

    def test_train_with_callbacks(self):
        """Each callback class passed to ``train`` is built and invoked.

        Two callback classes and a single epoch are expected to produce
        exactly two ``on_epoch_end`` notifications.
        """
        # `import unittest` alone does not load the `unittest.mock` submodule,
        # so import it explicitly rather than relying on another package
        # having imported it as a side effect.
        from unittest import mock

        tracker = mock.Mock()

        class _DummyCallback(TrainerCallback):
            """Callback that records every epoch end on the shared tracker."""

            def __init__(self, trainer) -> None:
                self._trainer = trainer

            def on_epoch_end(self, *args, **kwargs) -> None:
                tracker.call()

        train_data = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "resources", "deid_train_data.json")
        self.undertest.training_arguments.num_train_epochs = 1
        self.undertest.train(train_data, trainer_callbacks=[_DummyCallback, _DummyCallback])
        self.assertEqual(tracker.call.call_count, 2)

0 comments on commit fc992a7

Please sign in to comment.