From ed840d07a51b36734807317ae2947f28c8d2332b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 20 Oct 2023 11:06:29 +0000 Subject: [PATCH 01/13] Bump urllib3 from 1.26.17 to 1.26.18 in /webapp/webapp Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.17 to 1.26.18. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.17...1.26.18) --- updated-dependencies: - dependency-name: urllib3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index d6a590572..cef7a036d 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -3,4 +3,4 @@ django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 medcat==1.2.7 -urllib3==1.26.17 +urllib3==1.26.18 From d377f0b6a38b0b2e46dd7224b0a87cf015b767d9 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 30 Oct 2023 14:06:34 +0200 Subject: [PATCH 02/13] CU-8692uznvd: Allow empty-dict config.linking.filters.cuis and convert to set in memory (#352) * CU-8692uznvd: Allow empty-dict config.linking.filters.cuis and convert to set in memory * CU-8692uznvd: Move the empty-set detection and conversion from validator to init * CU-8692uznvd: Remove unused import --- medcat/config.py | 13 +++++++++++++ tests/test_config.py | 21 ++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/medcat/config.py b/medcat/config.py index b2e324deb..87c6d34f5 100644 --- a/medcat/config.py +++ b/medcat/config.py @@ -433,6 +433,19 @@ class LinkingFilters(MixingConfig, BaseModel): cuis: Set[str] = set() cuis_exclude: Set[str] = set() + def __init__(self, **data): + if 'cuis' in data: + cuis = 
data['cuis'] + if isinstance(cuis, dict) and len(cuis) == 0: + logger.warning("Loading an old model where " + "config.linking.filters.cuis has been " + "dict to an empty dict instead of an empty " + "set. Converting the dict to a set in memory " + "as that is what is expected. Please consider " + "saving the model again.") + data['cuis'] = set(cuis.keys()) + super().__init__(**data) + def check_filters(self, cui: str) -> bool: """Checks is a CUI in the filters diff --git a/tests/test_config.py b/tests/test_config.py index 2f9cd5a84..aacd0a760 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,7 +1,7 @@ import unittest import pickle import tempfile -from medcat.config import Config, MixingConfig, VersionInfo, General +from medcat.config import Config, MixingConfig, VersionInfo, General, LinkingFilters from pydantic import ValidationError import os @@ -180,5 +180,24 @@ def test_from_dict(self): self.assertEqual("value", config.key) +class ConfigLinkingFiltersTests(unittest.TestCase): + + def test_allows_empty_dict_for_cuis(self): + lf = LinkingFilters(cuis={}) + self.assertIsNotNone(lf) + + def test_empty_dict_converted_to_empty_set(self): + lf = LinkingFilters(cuis={}) + self.assertEqual(lf.cuis, set()) + + def test_not_allow_nonempty_dict_for_cuis(self): + with self.assertRaises(ValidationError): + LinkingFilters(cuis={"KEY": "VALUE"}) + + def test_not_allow_empty_dict_for_cuis_exclude(self): + with self.assertRaises(ValidationError): + LinkingFilters(cuis_exclude={}) + + if __name__ == '__main__': unittest.main() From ad6704891db9fa91169b4ef6934fa18e5242184f Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 30 Oct 2023 16:27:37 +0200 Subject: [PATCH 03/13] CU-8692t3fdf separate config on save (#350) * CU-8692t3fdf Move saving config outside of the cdb.dat; Add test to make sure the config does not get saved with the CDB; patch a few existing tests * CU-8692t3fdf Use class methods on class instead of instance in a few tests * CU-8692t3fdf Fix 
typing issue * CU-8692t3fdf Add additional tests for 2 configs and zero configs when loading model pack * CU-8692t3fdf: Make sure CDB is linked to the correct config; Treat incorrect configs as dirty CDBs and force a recalc of the hash --- medcat/cat.py | 8 +++ medcat/cdb.py | 73 +++++++++++++++++++++++- medcat/config.py | 8 ++- medcat/utils/saving/serializer.py | 15 ++++- tests/test_cat.py | 52 ++++++++++++++++- tests/test_cdb.py | 8 +++ tests/test_config.py | 29 ++++++++++ tests/utils/saving/test_serialization.py | 2 +- tests/utils/test_hashing.py | 47 ++++++++++++++- 9 files changed, 229 insertions(+), 13 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 2323cd737..0fb6b1167 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -271,6 +271,10 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M cdb_path = os.path.join(save_dir_path, "cdb.dat") self.cdb.save(cdb_path, json_path) + # Save the config + config_path = os.path.join(save_dir_path, "config.json") + self.cdb.config.save(config_path) + # Save the Vocab vocab_path = os.path.join(save_dir_path, "vocab.dat") if self.vocab is not None: @@ -362,6 +366,10 @@ def load_model_pack(cls, logger.info('Loading model pack with %s', 'JSON format' if json_path else 'dill format') cdb = CDB.load(cdb_path, json_path) + # load config + config_path = os.path.join(model_pack_path, "config.json") + cdb.load_config(config_path) + # TODO load addl_ner # Modify the config to contain full path to spacy model diff --git a/medcat/cdb.py b/medcat/cdb.py index 44d4fd9dd..5a648f4af 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -5,8 +5,9 @@ import logging import aiofiles import numpy as np -from typing import Dict, Set, Optional, List, Union +from typing import Dict, Set, Optional, List, Union, cast from functools import partial +import os from medcat import __version__ from medcat.utils.hasher import Hasher @@ -61,8 +62,10 @@ class CDB(object): def __init__(self, config: Union[Config, None] = 
None) -> None: if config is None: self.config = Config() + self._config_from_file = False else: self.config = config + self._config_from_file = True self.name2cuis: Dict = {} self.name2cuis2status: Dict = {} @@ -95,6 +98,12 @@ def __init__(self, config: Union[Config, None] = None) -> None: self._optim_params = None self.is_dirty = False self._hash: Optional[str] = None + # the config hash is kept track of here so that + # the CDB hash can be re-calculated when the config changes + # it can also be used to make sure the config loaded with + # a CDB matches the config it was saved with + # since the config is now saved separately + self._config_hash: Optional[str] = None self._memory_optimised_parts: Set[str] = set() def get_name(self, cui: str) -> str: @@ -458,6 +467,35 @@ async def save_async(self, path: str) -> None: } await f.write(dill.dumps(to_save)) + def load_config(self, config_path: str) -> None: + if not os.path.exists(config_path): + if not self._config_from_file: + # if there's no config defined anywhere + raise ValueError("Could not find a config in the CDB nor ", + "in the config.json for this model " + f"({os.path.dirname(config_path)})", + ) + # if there is a config, but it's defined in the cdb.dat file + logger.warning("Could not find config.json in model pack folder " + f"({os.path.dirname(config_path)}). " + "This is probably an older model. Please save the model " + "again in the new format to avoid potential issues.") + else: + if self._config_from_file: + # if there's a config.json and one defined in the cbd.dat file + raise ValueError("Found a config in the CDB and in the config.json " + f"for model ({os.path.dirname(config_path)}) - " + "this is ambiguous. 
Please either remove the " + "config.json or load the CDB without the config.json " + "in the folder and re-save in the newer format " + "(the default save in this version)") + # if the only config is in the separate config.json file + # this should be the behaviour for all newer models + self.config = cast(Config, Config.load(config_path)) + logger.debug("Loaded config from CDB from %s", config_path) + # mark config read from file + self._config_from_file = True + @classmethod def load(cls, path: str, json_path: Optional[str] = None, config_dict: Optional[Dict] = None) -> "CDB": """Load and return a CDB. This allows partial loads in probably not the right way at all. @@ -777,8 +815,34 @@ def _check_medcat_version(cls, config_data: Dict) -> None: or download the compatible model.""" ) + def _should_recalc_hash(self, force_recalc: bool) -> bool: + if force_recalc: + return True + if self.config.hash is None: + # TODO - perhaps this is not the best? + # as this is a side effect + # get and save result in config + self.config.get_hash() + if not self._hash or self.is_dirty: + # if no hash saved or is dirty + # need to calculate + logger.debug("Recalculating hash due to %s", + "no hash saved" if not self._hash else "CDB is dirty") + return True + # recalc config hash in case it changed + self.config.get_hash() + if self._config_hash is None or self._config_hash != self.config.hash: + # if no config hash saved + # or if the config hash is different from one saved in here + logger.debug("Recalculating hash due to %s", + "no config hash saved" if not self._config_hash + else "config hash has changed") + return True + return False + def get_hash(self, force_recalc: bool = False): - if not force_recalc and self._hash and not self.is_dirty: + should_recalc = self._should_recalc_hash(force_recalc) + if not should_recalc: logger.info("Reusing old hash of CDB since the CDB has not changed: %s", self._hash) return self._hash self.is_dirty = False @@ -791,7 +855,7 @@ def 
calculate_hash(self): for k,v in self.__dict__.items(): if k in ['cui2countext_vectors', 'name2cuis']: hasher.update(v, length=False) - elif k in ['_hash', 'is_dirty']: + elif k in ['_hash', 'is_dirty', '_config_hash']: # ignore _hash since if it previously didn't exist, the # new hash would be different when the value does exist # and ignore is_dirty so that we get the same hash as previously @@ -799,6 +863,9 @@ def calculate_hash(self): elif k != 'config': hasher.update(v, length=True) + # set cached config hash + self._config_hash = self.config.hash + self._hash = hasher.hexdigest() logger.info("Found new CDB hash: %s", self._hash) return self._hash diff --git a/medcat/config.py b/medcat/config.py index 87c6d34f5..e60c2eafc 100644 --- a/medcat/config.py +++ b/medcat/config.py @@ -548,6 +548,7 @@ class Config(MixingConfig, BaseModel): linking: Linking = Linking() word_skipper: re.Pattern = re.compile('') # empty pattern gets replaced upon init punct_checker: re.Pattern = re.compile('') # empty pattern gets replaced upon init + hash: Optional[str] = None class Config: # this if for word_skipper and punct_checker which would otherwise @@ -572,6 +573,9 @@ def rebuild_re(self) -> None: def get_hash(self): hasher = Hasher() for k, v in self.dict().items(): + if k in ['hash', ]: + # ignore hash + continue if k not in ['version', 'general', 'linking']: hasher.update(v, length=True) elif k == 'general': @@ -587,5 +591,5 @@ def get_hash(self): hasher.update(v2, length=False) else: hasher.update(v2, length=True) - - return hasher.hexdigest() + self.hash = hasher.hexdigest() + return self.hash diff --git a/medcat/utils/saving/serializer.py b/medcat/utils/saving/serializer.py index d82df751c..25529c778 100644 --- a/medcat/utils/saving/serializer.py +++ b/medcat/utils/saving/serializer.py @@ -135,13 +135,12 @@ def serialize(self, cdb, overwrite: bool = False) -> None: raise ValueError(f'Unable to overwrite shelf path "{self.json_path}"' ' - specify overrwrite=True if you wish 
to overwrite') to_save = {} - to_save['config'] = cdb.config.asdict() # This uses different names so as to not be ambiguous # when looking at files whether the json parts should # exist separately or not to_save['cdb_main' if self.jsons is not None else 'cdb'] = dict( ((key, val) for key, val in cdb.__dict__.items() if - key != 'config' and + key not in ('config', '_config_from_file') and (self.jsons is None or key not in SPECIALITY_NAMES))) logger.info('Dumping CDB to %s', self.main_path) with open(self.main_path, 'wb') as f: @@ -165,7 +164,17 @@ def deserialize(self, cdb_cls): logger.info('Reading CDB data from %s', self.main_path) with open(self.main_path, 'rb') as f: data = dill.load(f) - config = cast(Config, Config.from_dict(data['config'])) + if 'config' in data: + logger.warning("Found config in CDB for model (%s). " + "This is an old format. Please re-save the " + "model in the new format to avoid potential issues", + os.path.dirname(self.main_path)) + config = cast(Config, Config.from_dict(data['config'])) + else: + # by passing None as config to constructor + # the CDB should identify that there has been + # no config loaded + config = None cdb = cdb_cls(config=config) if self.jsons is None: cdb_main = data['cdb'] diff --git a/tests/test_cat.py b/tests/test_cat.py index 0baa0d35d..62db4d44d 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -367,7 +367,7 @@ def test_load_model_pack(self): meta_cat = _get_meta_cat(self.meta_cat_dir) cat = CAT(cdb=self.cdb, config=self.cdb.config, vocab=self.vocab, meta_cats=[meta_cat]) full_model_pack_name = cat.create_model_pack(save_dir_path.name, model_pack_name="mp_name") - cat = self.undertest.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) + cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertTrue(isinstance(cat, CAT)) self.assertIsNotNone(cat.config.version.medcat_version) self.assertEqual(repr(cat._meta_cats), 
repr([meta_cat])) @@ -377,7 +377,7 @@ def test_load_model_pack_without_meta_cat(self): meta_cat = _get_meta_cat(self.meta_cat_dir) cat = CAT(cdb=self.cdb, config=self.cdb.config, vocab=self.vocab, meta_cats=[meta_cat]) full_model_pack_name = cat.create_model_pack(save_dir_path.name, model_pack_name="mp_name") - cat = self.undertest.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip"), load_meta_models=False) + cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip"), load_meta_models=False) self.assertTrue(isinstance(cat, CAT)) self.assertIsNotNone(cat.config.version.medcat_version) self.assertEqual(cat._meta_cats, []) @@ -385,10 +385,56 @@ def test_load_model_pack_without_meta_cat(self): def test_hashing(self): save_dir_path = tempfile.TemporaryDirectory() full_model_pack_name = self.undertest.create_model_pack(save_dir_path.name, model_pack_name="mp_name") - cat = self.undertest.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) + cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) +class ModelWithTwoConfigsLoadTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.model_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples") + cdb = CDB.load(os.path.join(cls.model_path, "cdb.dat")) + # save config next to the CDB + cls.config_path = os.path.join(cls.model_path, 'config.json') + cdb.config.save(cls.config_path) + + + @classmethod + def tearDownClass(cls) -> None: + # REMOVE config next to the CDB + os.remove(cls.config_path) + + def test_loading_model_pack_with_cdb_config_and_config_json_raises_exception(self): + with self.assertRaises(ValueError): + CAT.load_model_pack(self.model_path) + + +class ModelWithZeroConfigsLoadTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cdb_path = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat") + cdb = CDB.load(cdb_path) + vocab_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat") + # copy the CDB and vocab to a temp dir + cls.temp_dir = tempfile.TemporaryDirectory() + cls.cdb_path = os.path.join(cls.temp_dir.name, 'cdb.dat') + cdb.save(cls.cdb_path) # save without internal config + cls.vocab_path = os.path.join(cls.temp_dir.name, 'vocab.dat') + shutil.copyfile(vocab_path, cls.vocab_path) + + + @classmethod + def tearDownClass(cls) -> None: + # REMOVE temp dir + cls.temp_dir.cleanup() + + def test_loading_model_pack_without_any_config_raises_exception(self): + with self.assertRaises(ValueError): + CAT.load_model_pack(self.temp_dir.name) + + def _get_meta_cat(meta_cat_dir): config = ConfigMetaCAT() config.general["category_name"] = "Status" diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 96425bc8c..f7be24d64 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -6,6 +6,7 @@ import numpy as np from medcat.config import Config from medcat.cdb_maker import CDBMaker +from medcat.cdb import CDB class CDBTests(unittest.TestCase): @@ -53,6 +54,13 @@ def test_save_and_load(self): self.undertest.save(f.name) self.undertest.load(f.name) + def test_load_has_no_config(self): + with tempfile.NamedTemporaryFile() as f: + self.undertest.save(f.name) + cdb = CDB.load(f.name) + self.assertFalse(cdb._config_from_file) + + def test_save_async_and_load(self): with tempfile.NamedTemporaryFile() as f: asyncio.run(self.undertest.save_async(f.name)) diff --git a/tests/test_config.py b/tests/test_config.py index aacd0a760..ce6ed76eb 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -179,6 +179,35 @@ def test_from_dict(self): config = Config.from_dict({"key": "value"}) self.assertEqual("value", config.key) + def test_config_no_hash_before_get(self): + config = Config() + self.assertIsNone(config.hash) + + def 
test_config_has_hash_after_get(self): + config = Config() + config.get_hash() + self.assertIsNotNone(config.hash) + + def test_config_hash_recalc_same_def(self): + config = Config() + h1 = config.get_hash() + h2 = config.get_hash() + self.assertEqual(h1, h2) + + def test_config_hash_changes_after_change(self): + config = Config() + h1 = config.get_hash() + config.linking.filters.cuis = {"a", "b"} + h2 = config.get_hash() + self.assertNotEqual(h1, h2) + + def test_config_hash_recalc_same_changed(self): + config = Config() + config.linking.filters.cuis = {"a", "b"} + h1 = config.get_hash() + h2 = config.get_hash() + self.assertEqual(h1, h2) + class ConfigLinkingFiltersTests(unittest.TestCase): diff --git a/tests/utils/saving/test_serialization.py b/tests/utils/saving/test_serialization.py index f0cc75de1..c2c44da16 100644 --- a/tests/utils/saving/test_serialization.py +++ b/tests/utils/saving/test_serialization.py @@ -87,7 +87,7 @@ def test_dill_to_json(self): model_pack_folder = os.path.join( self.json_model_pack.name, model_pack_path) json_path = os.path.join(model_pack_folder, "*.json") - jsons = glob.glob(json_path) + jsons = [fn for fn in glob.glob(json_path) if not fn.endswith("config.json")] # there is also a model_card.json # but nothing for cui2many or name2many # so can remove the length of ONE2MANY diff --git a/tests/utils/test_hashing.py b/tests/utils/test_hashing.py index 99c10b153..b6681461f 100644 --- a/tests/utils/test_hashing.py +++ b/tests/utils/test_hashing.py @@ -1,4 +1,5 @@ import os +from typing import Optional import tempfile import unittest import unittest.mock @@ -6,6 +7,7 @@ from medcat.cat import CAT from medcat.cdb import CDB from medcat.vocab import Vocab +from medcat.config import Config class CDBHashingTests(unittest.TestCase): @@ -30,6 +32,43 @@ def test_CDB_hash_saves_on_disk(self): self.assertEqual(h, cdb._hash) +class CDBHashingWithConfigTests(unittest.TestCase): + temp_dir = tempfile.TemporaryDirectory() + + @classmethod + def 
setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + # ensure config has hash + h = cls.cdb.get_hash() + cls.config = cls.config_copy(cls.cdb.config) + cls._config_hash = cls.cdb.config.hash + + @classmethod + def config_copy(cls, config: Optional[Config] = None) -> Config: + if config is None: + config = cls.config + return Config(**config.asdict()) + + def setUp(self) -> None: + # reset config + self.cdb.config = self.config_copy() + # reset config hash + self.cdb._config_hash = self._config_hash + self.cdb.config.hash = self._config_hash + + def test_CDB_same_hash_no_need_recalc(self): + self.assertFalse(self.cdb._should_recalc_hash(force_recalc=False)) + + def test_CDB_hash_recalc_if_no_config_hash(self): + self.cdb._config_hash = None + self.assertTrue(self.cdb._should_recalc_hash(force_recalc=False)) + + def test_CDB_hash_recalc_after_config_change(self): + self.cdb.config.linking.filters.cuis = {"a", "b", "c"} + self.assertTrue(self.cdb._should_recalc_hash(force_recalc=False)) + + class BaseCATHashingTests(unittest.TestCase): @classmethod @@ -75,8 +114,14 @@ def test_no_changes_recalc_same(self): class CATHashingTestsWithoutChange(CATHashingTestsWithFakeHash): - def test_no_changes_no_calc(self): + def setUp(self) -> None: + self._calculate_hash = self.undertest.cdb.calculate_hash + # make sure the hash exists + self.undertest.cdb._config_hash = self.undertest.cdb.config.get_hash() + self.undertest.cdb.get_hash() self.undertest.cdb.calculate_hash = unittest.mock.Mock() + + def test_no_changes_no_calc(self): hash = self.undertest.get_hash() self.assertIsInstance(hash, str) self.undertest.cdb.calculate_hash.assert_not_called() From e52bda3547dfa61c671727746058f67a21da3576 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Tue, 31 Oct 2023 11:44:00 +0200 Subject: [PATCH 04/13] CU-2cdpd4t: Unify default addl_info in different methdos. 
(#363) --- medcat/cat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 0fb6b1167..a86584c3a 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -1340,7 +1340,7 @@ def multiprocessing(self, nproc: int = 2, batch_size_chars: int = 5000 * 1000, only_cui: bool = False, - addl_info: List[str] = [], + addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], separate_nn_components: bool = True, out_split_size_chars: Optional[int] = None, save_dir_path: str = os.path.abspath(os.getcwd()), @@ -1536,7 +1536,7 @@ def multiprocessing_pipe(self, nproc: Optional[int] = None, batch_size: Optional[int] = None, only_cui: bool = False, - addl_info: List[str] = [], + addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], return_dict: bool = True, batch_factor: int = 2) -> Union[List[Tuple], Dict]: """Run multiprocessing NOT FOR TRAINING From b6ab62ca2e5c0f654dcd34271ce0d72a70f603cb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Nov 2023 22:13:07 +0000 Subject: [PATCH 05/13] Bump django from 3.2.20 to 3.2.23 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.20 to 3.2.23. - [Commits](https://github.com/django/django/compare/3.2.20...3.2.23) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index d6a590572..4ace8c6f3 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -1,4 +1,4 @@ -Django==3.2.20 +Django==3.2.23 django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 From 94827bb50cf11526daa8f787b8669395b621043b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Fri, 3 Nov 2023 15:20:50 +0000 Subject: [PATCH 06/13] Changing cdb.add_concept to a protected method --- medcat/cat.py | 2 +- medcat/cdb.py | 7 ++++--- medcat/cdb_maker.py | 2 +- tests/archive_tests/test_cdb_maker_archive.py | 2 +- tests/utils/test_hashing.py | 6 +++--- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 2323cd737..5bdf43211 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -834,7 +834,7 @@ def add_and_train_concept(self, names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config) # Only if not negative, otherwise do not add the new name if in fact it should not be detected if do_add_concept and not negative: - self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, + self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, full_build=full_build) if spacy_entity is not None and spacy_doc is not None: diff --git a/medcat/cdb.py b/medcat/cdb.py index 44d4fd9dd..91bb7bd9d 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -213,9 +213,9 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b # Name status must be one of the three name_status = 'A' - self.add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build) + 
self._add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build) - def add_concept(self, + def _add_concept(self, cui: str, names: Dict, ontologies: set, @@ -232,7 +232,8 @@ def add_concept(self, the same CUI will be merged internally. names (Dict[str, Dict]): Names for this concept, or the value that if found in free text can be linked to this concept. - Names is an dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name' ontologies (Set[str]): ontologies in which the concept exists (e.g. SNOMEDCT, HPO) name_status (str): diff --git a/medcat/cdb_maker.py b/medcat/cdb_maker.py index e9c72d12e..ca98f821e 100644 --- a/medcat/cdb_maker.py +++ b/medcat/cdb_maker.py @@ -173,7 +173,7 @@ def prepare_csvs(self, if len(raw_name) >= self.config.cdb_maker['remove_parenthesis']: prepare_name(raw_name, self.pipe.spacy_nlp, names, self.config) - self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, + self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, full_build=full_build) # DEBUG logger.debug("\n\n**** Added\n CUI: %s\n Names: %s\n Ontologies: %s\n Name status: %s\n Type IDs: %s\n Description: %s\n Is full build: %s", diff --git a/tests/archive_tests/test_cdb_maker_archive.py b/tests/archive_tests/test_cdb_maker_archive.py index 329408999..9e2fc2d72 100644 --- a/tests/archive_tests/test_cdb_maker_archive.py +++ b/tests/archive_tests/test_cdb_maker_archive.py @@ -108,7 +108,7 @@ def test_concept_similarity(self): for i in range(500): cui = "C" + str(i) type_ids = {'T-' + str(i%10)} - cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), 
self.maker.pipe.get_spacy_nlp(), {}, self.config), ontologies=set(), + cdb._add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.pipe.get_spacy_nlp(), {}, self.config), ontologies=set(), name_status='P', type_ids=type_ids, description='', full_build=True) vectors = {} diff --git a/tests/utils/test_hashing.py b/tests/utils/test_hashing.py index 99c10b153..60796eb99 100644 --- a/tests/utils/test_hashing.py +++ b/tests/utils/test_hashing.py @@ -90,7 +90,7 @@ class CATHashingTestsWithChange(CATHashingTestsWithFakeHash): def test_when_changes_do_calc(self): with unittest.mock.patch.object(CDB, 'calculate_hash', return_value='abcd1234') as patch_method: - self.undertest.cdb.add_concept(**self.concept_kwargs) + self.undertest.cdb._add_concept(**self.concept_kwargs) hash = self.undertest.get_hash() self.assertIsInstance(hash, str) patch_method.assert_called() @@ -106,10 +106,10 @@ def test_default_cdb_not_dirty(self): self.assertFalse(self.undertest.cdb.is_dirty) def test_after_add_concept_is_dirty(self): - self.undertest.cdb.add_concept(**self.concept_kwargs) + self.undertest.cdb._add_concept(**self.concept_kwargs) self.assertTrue(self.undertest.cdb.is_dirty) def test_after_recalc_not_dirty(self): - self.undertest.cdb.add_concept(**self.concept_kwargs) + self.undertest.cdb._add_concept(**self.concept_kwargs) self.undertest.get_hash() self.assertFalse(self.undertest.cdb.is_dirty) From 26b5120384905eaba66db6eb2e3fd626d7c66c7b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Mon, 6 Nov 2023 17:48:15 +0000 Subject: [PATCH 07/13] Re-added deprecated method with deprecated flag and addtional comments --- medcat/cdb.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/medcat/cdb.py b/medcat/cdb.py index 91bb7bd9d..5c1c27a96 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -12,6 +12,7 @@ from medcat.utils.hasher import Hasher from medcat.utils.matutils import unitvec from medcat.utils.ml_utils import get_lr_linking 
+from medcat.utils.decorators import deprecated from medcat.config import Config, weighted_average, workers from medcat.utils.saving.serializer import CDBSerializer @@ -215,6 +216,44 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b self._add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build) + @deprecated("Use `cdb._add_concept` as this will be removed in a future release.") + def add_concept(self, + cui: str, + names: Dict, + ontologies: set, + name_status: str, + type_ids: Set[str], + description: str, + full_build: bool = False) -> None: + """ + Deprecated: Use `cdb._add_concept` as this will be removed in a future release. + + Add a concept to internal Concept Database (CDB). Depending on what you are providing + this will add a large number of properties for each concept. + + Args: + cui (str): + Concept ID or unique identifier in this database, all concepts that have + the same CUI will be merged internally. + names (Dict[str, Dict]): + Names for this concept, or the value that if found in free text can be linked to this concept. + Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name' + ontologies (Set[str]): + ontologies in which the concept exists (e.g. SNOMEDCT, HPO) + name_status (str): + One of `P`, `N`, `A` + type_ids (Set[str]): + Semantic type identifier (have a look at TUIs in UMLS or SNOMED-CT) + description (str): + Description of this concept. + full_build (bool): + If True the dictionary self.addl_info will also be populated, contains a lot of extra information + about concepts, but can be very memory consuming. This is not necessary + for normal functioning of MedCAT (Default Value `False`). 
+ """ + self._add_concept(cui, names, ontologies, name_status, type_ids, description, full_build) + def _add_concept(self, cui: str, names: Dict, From b6b023b9cb5c9c40a5d9824d8a1f368b32aa6b21 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 27 Nov 2023 03:21:10 -0600 Subject: [PATCH 08/13] CU-86931prq4: Update GHA versions (checkout and setup-python) to v4 (#368) --- .github/workflows/main.yml | 8 ++++---- .github/workflows/production.yml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c769dfc2e..a5468fb9b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -16,9 +16,9 @@ jobs: max-parallel: 4 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -48,13 +48,13 @@ jobs: steps: - name: Checkout master - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: 'master' fetch-depth: 0 - name: Set up Python 3.9 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 5088c1000..9ad9a5d90 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -14,13 +14,13 @@ jobs: steps: - name: Checkout production - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: ${{ github.event.release.target_commitish }} fetch-depth: 0 - name: Set up Python 3.9 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 From 6a5103cbd9b9187a53040e82fa42f2f628faa7bc Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 27 Nov 2023 05:14:44 -0600 Subject: [PATCH 09/13] Cu 1yn0v9e duplicate multiprocessing methods (#364) * CU-1yn0v9e: Rename and deprecate one of the multiprocessing 
methods; Add docstring. Trying to be more explicit regarding usage and differences between different methods * CU-1yn0v9e: Rename and deprecate the multiprocessing_pipe method; Add docstring. Trying to be more explicit regarding usage and differences between different methods * CU-1yn0v9e: Fix typo in docstring; more consistent naming --- medcat/cat.py | 47 +++++++++++++++++++++++++++++++++++++++++++++-- tests/test_cat.py | 8 ++++---- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index a86584c3a..9cca44b0f 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -1335,6 +1335,7 @@ def _save_docs_to_file(self, docs: Iterable, annotated_ids: List[str], save_dir_ pickle.dump((annotated_ids, part_counter), open(annotated_ids_path, 'wb')) return part_counter + @deprecated(message="Use `multiprocessing_batch_char_size` instead") def multiprocessing(self, data: Union[List[Tuple], Iterable[Tuple]], nproc: int = 2, @@ -1345,9 +1346,31 @@ def multiprocessing(self, out_split_size_chars: Optional[int] = None, save_dir_path: str = os.path.abspath(os.getcwd()), min_free_memory=0.1) -> Dict: + return self.multiprocessing_batch_char_size(data=data, nproc=nproc, + batch_size_chars=batch_size_chars, + only_cui=only_cui, addl_info=addl_info, + separate_nn_components=separate_nn_components, + out_split_size_chars=out_split_size_chars, + save_dir_path=save_dir_path, + min_free_memory=min_free_memory) + + def multiprocessing_batch_char_size(self, + data: Union[List[Tuple], Iterable[Tuple]], + nproc: int = 2, + batch_size_chars: int = 5000 * 1000, + only_cui: bool = False, + addl_info: List[str] = [], + separate_nn_components: bool = True, + out_split_size_chars: Optional[int] = None, + save_dir_path: str = os.path.abspath(os.getcwd()), + min_free_memory=0.1) -> Dict: r"""Run multiprocessing for inference, if out_save_path and out_split_size_chars is used this will also continue annotating documents if something is saved in that directory. 
+ This method batches the data based on the number of characters as specified by user. + + PS: This method is unlikely to work on a Windows machine. + Args: data: Iterator or array with format: [(id, text), (id, text), ...] @@ -1531,7 +1554,22 @@ def _multiprocessing_batch(self, return docs - def multiprocessing_pipe(self, + @deprecated(message="Use `multiprocessing_batch_docs_size` instead") + def multiprocessing_pipe(self, in_data: Union[List[Tuple], Iterable[Tuple]], + nproc: Optional[int] = None, + batch_size: Optional[int] = None, + only_cui: bool = False, + addl_info: List[str] = [], + return_dict: bool = True, + batch_factor: int = 2) -> Union[List[Tuple], Dict]: + return self.multiprocessing_batch_docs_size(in_data=in_data, nproc=nproc, + batch_size=batch_size, + only_cui=only_cui, + addl_info=addl_info, + return_dict=return_dict, + batch_factor=batch_factor) + + def multiprocessing_batch_docs_size(self, in_data: Union[List[Tuple], Iterable[Tuple]], nproc: Optional[int] = None, batch_size: Optional[int] = None, @@ -1539,7 +1577,12 @@ def multiprocessing_pipe(self, addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], return_dict: bool = True, batch_factor: int = 2) -> Union[List[Tuple], Dict]: - """Run multiprocessing NOT FOR TRAINING + """Run multiprocessing NOT FOR TRAINING. + + This method batches the data based on the number of documents as specified by the user. + + PS: + This method supports Windows. Args: in_data (Union[List[Tuple], Iterable[Tuple]]): List with format: [(id, text), (id, text), ...] 
diff --git a/tests/test_cat.py b/tests/test_cat.py index 62db4d44d..cd33efbc7 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -60,7 +60,7 @@ def test_multiprocessing(self): (2, ""), (3, None) ] - out = self.undertest.multiprocessing(in_data, nproc=1) + out = self.undertest.multiprocessing_batch_char_size(in_data, nproc=1) self.assertEqual(3, len(out)) self.assertEqual(1, len(out[1]['entities'])) @@ -73,7 +73,7 @@ def test_multiprocessing_pipe(self): (2, "The dog is sitting outside the house."), (3, "The dog is sitting outside the house."), ] - out = self.undertest.multiprocessing_pipe(in_data, nproc=2, return_dict=False) + out = self.undertest.multiprocessing_batch_docs_size(in_data, nproc=2, return_dict=False) self.assertTrue(type(out) == list) self.assertEqual(3, len(out)) self.assertEqual(1, out[0][0]) @@ -89,7 +89,7 @@ def test_multiprocessing_pipe_with_malformed_texts(self): (2, ""), (3, None), ] - out = self.undertest.multiprocessing_pipe(in_data, nproc=1, batch_size=1, return_dict=False) + out = self.undertest.multiprocessing_batch_docs_size(in_data, nproc=1, batch_size=1, return_dict=False) self.assertTrue(type(out) == list) self.assertEqual(3, len(out)) self.assertEqual(1, out[0][0]) @@ -105,7 +105,7 @@ def test_multiprocessing_pipe_return_dict(self): (2, "The dog is sitting outside the house."), (3, "The dog is sitting outside the house.") ] - out = self.undertest.multiprocessing_pipe(in_data, nproc=2, return_dict=True) + out = self.undertest.multiprocessing_batch_docs_size(in_data, nproc=2, return_dict=True) self.assertTrue(type(out) == dict) self.assertEqual(3, len(out)) self.assertEqual({'entities': {}, 'tokens': []}, out[1]) From 72f7ddad20d6e8ae2aa7d8f761c0eb0fe5604290 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Wed, 29 Nov 2023 03:44:16 -0600 Subject: [PATCH 10/13] 869377m3u: Add comment regarding demo link load times to README (#376) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 
395aecf69..c47cf9a65 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ To download any of these models, please [follow this link](https://uts.nlm.nih.g ## Demo A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT. +PS: This link can take a long time to load the first time around. The machine spins up as needed and spins down when inactive. ## Tutorials A guide on how to use MedCAT is available at [MedCAT Tutorials](https://github.com/CogStack/MedCATtutorials). Read more about MedCAT on [Towards Data Science](https://towardsdatascience.com/medcat-introduction-analyzing-electronic-health-records-e1c420afa13a). From d8473d9fabdc49eee3976f2d362433e28f36414b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 <60137864+adam-sutton-1992@users.noreply.github.com> Date: Thu, 30 Nov 2023 11:46:28 +0000 Subject: [PATCH 11/13] Added README.md documentation for CPU only installations (#365) * changed README.md to reflect installation options. * added setup script to demonstrate how wrapper could look for CPU installations * removed setup.sh as unnecessary for cpu only builds * Initial commit for merge_cdb method * Added indentation to make merge_cdb a class method * fixed syntax issues * more lint fixes * more lint fixes * bug fixes of merge_cdb * removed print statements * Added commentary on disk space usage of pytorch-gpu * removed merge_cdb from branch --------- Co-authored-by: adam-sutton-1992 --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index c47cf9a65..bf34f00c6 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,17 @@ To download any of these models, please [follow this link](https://uts.nlm.nih.g - **Paper**: [What’s in a Summary? 
Laying the Groundwork for Advances in Hospital-Course Summarization](https://www.aclweb.org/anthology/2021.naacl-main.382.pdf) - ([more...](https://github.com/CogStack/MedCAT/blob/master/media/news.md)) +## Installation +To install the latest version of MedCAT run the following command: +``` +pip install medcat +``` +Normal installations of MedCAT will install torch-gpu and all relevant dependencies (such as CUDA). This can require as much as 10 GB more disk space, which isn't required for CPU only usage. + +To install the latest version of MedCAT without torch GPU support run the following command: +``` +pip install medcat --extra-index-url https://download.pytorch.org/whl/cpu/ +``` ## Demo A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT. PS: This link can take a long time to load the first time around. The machine spins up as needed and spins down when inactive. From 76b75cc4e3558e9a48d1fe8aa43ba23621652a75 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Thu, 30 Nov 2023 05:47:37 -0600 Subject: [PATCH 12/13] Cu 8692zguyq no preferred name (#367) * CU-8692zguyq: Slight simplification of minimum-name-length logic * CU-8692zguyq: Add some tests for prepare_name preprocessor * CU-8692zguyq: Add warning if no preferred name was added along a new CUI * CU-8692zguyq: Add additional warning messages when adding/training a new CUI with no preferred name * CU-8692zguyq: Make no preferred name warnings only run if name status is preferred * CU-8692zguyq: Add tests for no-preferred name warnings * CU-8692zguyq: Add Vocab.make_unigram_table to CAT tests * CU-8692zguyq: Move to built in asserting for logging instead of patching the method * CU-8692zguyq: Add workaround for assertNoLogs on python 3.8 and 3.9 --- medcat/cat.py | 4 ++ medcat/cdb.py | 15 ++++ medcat/preprocessing/cleaners.py | 3 +- tests/preprocessing/__init__.py | 0 tests/preprocessing/test_cleaners.py | 104 
+++++++++++++++++++++++++++ tests/test_cat.py | 55 +++++++++++++- 6 files changed, 178 insertions(+), 3 deletions(-) create mode 100644 tests/preprocessing/__init__.py create mode 100644 tests/preprocessing/test_cleaners.py diff --git a/medcat/cat.py b/medcat/cat.py index ef6099566..f49a25022 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -840,6 +840,10 @@ def add_and_train_concept(self, Refer to medcat.cat.cdb.CDB.add_concept """ names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config) + if not names and cui not in self.cdb.cui2preferred_name and name_status == 'P': + logger.warning("No names were able to be prepared in CAT.add_and_train_concept " + "method. As such no preferred name will be able to be specifeid. " + "The CUI: '%s' and raw name: '%s'", cui, name) # Only if not negative, otherwise do not add the new name if in fact it should not be detected if do_add_concept and not negative: self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, diff --git a/medcat/cdb.py b/medcat/cdb.py index 1110a3b84..2ca8382a7 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -358,6 +358,21 @@ def _add_concept(self, if name_status == 'P' and cui not in self.cui2preferred_name: # Do not overwrite old preferred names self.cui2preferred_name[cui] = name_info['raw_name'] + elif names: + # if no name_info and names is NOT an empty dict + # this shouldn't really happen in the current setup + raise ValueError("Unknown state where there is no name_info, " + "yet the `names` dict is not empty (%s)", names) + elif name_status == 'P' and cui not in self.cui2preferred_name: + # this means names is an empty `names` dict + logger.warning("Did not manage to add a preferred name in `add_cui`. " + "Was trying to do so for cui: '%s'" + "This means that the `names` dict passed was empty. " + "This is _usually_ caused by either no name or too short " + "a name passed to the `prepare_name` method. 
" + "The minimum length is defined in: " + "'config.cdb_maker.min_letters_required' and " + "is currently set at %s", cui, self.config.cdb_maker['min_letters_required']) # Add other fields if full_build if full_build: diff --git a/medcat/preprocessing/cleaners.py b/medcat/preprocessing/cleaners.py index 18314d562..43e8098e2 100644 --- a/medcat/preprocessing/cleaners.py +++ b/medcat/preprocessing/cleaners.py @@ -48,7 +48,8 @@ def prepare_name(raw_name: str, nlp: Language, names: Dict, config: Config) -> D snames = set() name = config.general['separator'].join(tokens) - if not config.cdb_maker.get('min_letters_required', 0) or len(re.sub("[^A-Za-z]*", '', name)) >= config.cdb_maker.get('min_letters_required', 0): + min_letters = config.cdb_maker.get('min_letters_required', 0) + if not min_letters or len(re.sub("[^A-Za-z]*", '', name)) >= min_letters: if name not in names: sname = "" for token in tokens: diff --git a/tests/preprocessing/__init__.py b/tests/preprocessing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/preprocessing/test_cleaners.py b/tests/preprocessing/test_cleaners.py new file mode 100644 index 000000000..b879d9ee6 --- /dev/null +++ b/tests/preprocessing/test_cleaners.py @@ -0,0 +1,104 @@ +from medcat.preprocessing.cleaners import prepare_name +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + +import logging, os + +import unittest + + +class BaseCDBMakerTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general['log_level'] = logging.DEBUG + config.general["spacy_model"] = "en_core_web_md" + cls.maker = CDBMaker(config) + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb_2.csv') + ] + cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True) + + +class BasePrepareNameTest(BaseCDBMakerTests): + raw_name = 
'raw' + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.do_prepare_name() + + # method called after setup, when raw_name has been specified + @classmethod + def do_prepare_name(cls) -> None: + cls.name = cls.cdb.config.general.separator.join(cls.raw_name.split()) + cls.names = prepare_name(cls.raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config) + + def _dict_has_key_val_type(self, d: dict, key, val_type): + self.assertIn(key, d) + self.assertIsInstance(d[key], val_type) + + def _names_has_key_val_type(self, key, val_type): + self._dict_has_key_val_type(self.names, key, val_type) + + def test_result_has_name(self): + self._names_has_key_val_type(self.name, dict) + + def test_name_info_has_tokens(self): + self._dict_has_key_val_type(self.names[self.name], 'tokens', list) + + def test_name_info_has_words_as_tokens(self): + name_info = self.names[self.name] + tokens = name_info['tokens'] + for word in self.raw_name.split(): + with self.subTest(word): + self.assertIn(word, tokens) + + +class NamePreparationTests_OneLetter(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "a" + # the minimum name length is defined by the following config option + # if I don't set this to 1 here, I would see the tests fail + # that would be because the result from `prepare_names` would be empty + cls.cdb.config.cdb_maker.min_letters_required = 1 + super().do_prepare_name() + + +class NamePreparationTests_TwoLetters(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "an" + super().do_prepare_name() + + +class NamePreparationTests_MultiToken(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "this raw name" + super().do_prepare_name() + + +class NamePreparationTests_Empty(BaseCDBMakerTests): + """In case of an empty name, I would expect the names dict + returned by `prepare_name` to be 
empty. + """ + empty_raw_name = '' + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.names = prepare_name(cls.empty_raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config) + + def test_names_dict_is_empty(self): + self.assertEqual(len(self.names), 0) + self.assertEqual(self.names, {}) diff --git a/tests/test_cat.py b/tests/test_cat.py index cd33efbc7..acd337e71 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -4,10 +4,12 @@ import unittest import tempfile import shutil +import logging +import contextlib from transformers import AutoTokenizer from medcat.vocab import Vocab -from medcat.cdb import CDB -from medcat.cat import CAT +from medcat.cdb import CDB, logger as cdb_logger +from medcat.cat import CAT, logger as cat_logger from medcat.utils.checkpoint import Checkpoint from medcat.meta_cat import MetaCAT from medcat.config_meta_cat import ConfigMetaCAT @@ -20,6 +22,7 @@ class CATTests(unittest.TestCase): def setUpClass(cls) -> None: cls.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat")) cls.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat")) + cls.vocab.make_unigram_table() cls.cdb.config.general.spacy_model = "en_core_web_md" cls.cdb.config.ner.min_name_len = 2 cls.cdb.config.ner.upper_case_limit_len = 3 @@ -388,6 +391,54 @@ def test_hashing(self): cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) + def _assertNoLogs(self, logger: logging.Logger, level: int): + if hasattr(self, 'assertNoLogs'): + return self.assertNoLogs(logger=logger, level=level) + else: + return self.__assertNoLogs(logger=logger, level=level) + + @contextlib.contextmanager + def __assertNoLogs(self, logger: logging.Logger, level: int): + try: + with self.assertLogs(logger, level) as captured_logs: + yield + except AssertionError: + return + if 
captured_logs: + raise AssertionError("Logs were found: {}".format(captured_logs)) + + def assertLogsDuringAddAndTrainConcept(self, logger: logging.Logger, log_level, + name: str, name_status: str, nr_of_calls: int): + cui = 'CUI-%d'%(hash(name) + id(name)) + with (self.assertLogs(logger=logger, level=log_level) + if nr_of_calls == 1 + else self._assertNoLogs(logger=logger, level=log_level)): + self.undertest.add_and_train_concept(cui, name, name_status=name_status) + + def test_add_and_train_concept_cat_nowarn_long_name(self): + long_name = 'a very long name' + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cdb_nowarn_long_name(self): + long_name = 'a very long name' + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cat_nowarn_short_name_not_pref(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cdb_nowarn_short_name_not_pref(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cat_warns_short_name(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1) + + def test_add_and_train_concept_cdb_warns_short_name(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1) + class ModelWithTwoConfigsLoadTests(unittest.TestCase): From 7fddac0626382720c1194b465b1e6f975a00152f Mon Sep 17 00:00:00 2001 From: Xi Bai <82581439+baixiac@users.noreply.github.com> Date: Tue, 5 Dec 2023 17:36:28 +0000 Subject: [PATCH 13/13] Add trainer callbacks for 
Transformer NER (#377) CU-86938vf30 add trainer callbacks for Transformer NER --- medcat/ner/transformers_ner.py | 16 ++++++++-- tests/ner/test_transformers_ner.py | 50 ++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 tests/ner/test_transformers_ner.py diff --git a/medcat/ner/transformers_ner.py b/medcat/ner/transformers_ner.py index 9623b1b93..227ccc083 100644 --- a/medcat/ner/transformers_ner.py +++ b/medcat/ner/transformers_ner.py @@ -1,6 +1,7 @@ import os import json import logging +import datasets from spacy.tokens import Doc from datetime import datetime from typing import Iterable, Iterator, Optional, Dict, List, cast, Union @@ -18,7 +19,7 @@ from transformers import Trainer, AutoModelForTokenClassification, AutoTokenizer from transformers import pipeline, TrainingArguments -import datasets +from transformers.trainer_callback import TrainerCallback # It should be safe to do this always, as all other multiprocessing #will be finished before data comes to meta_cat @@ -137,7 +138,12 @@ def merge_data_loaded(base, other): return out_path - def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=False, dataset=None, meta_requirements=None): + def train(self, + json_path: Union[str, list, None]=None, + ignore_extra_labels=False, + dataset=None, + meta_requirements=None, + trainer_callbacks: Optional[List[TrainerCallback]]=None): """Train or continue training a model give a json_path containing a MedCATtrainer export. It will continue training if an existing model is loaded or start new training if the model is blank/new. @@ -149,6 +155,9 @@ def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=Fals ignore_extra_labels: Makes only sense when an existing deid model was loaded and from the new data we want to ignore labels that did not exist in the old model. 
+ trainer_callbacks (List[TrainerCallback]): + A list of trainer callbacks for collecting metrics during the training at the client side. The + transformers Trainer object will be passed in when each callback is called. """ if dataset is None and json_path is not None: @@ -193,6 +202,9 @@ def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=Fals compute_metrics=lambda p: metrics(p, tokenizer=self.tokenizer, dataset=encoded_dataset['test'], verbose=self.config.general['verbose_metrics']), data_collator=data_collator, # type: ignore tokenizer=None) + if trainer_callbacks: + for callback in trainer_callbacks: + trainer.add_callback(callback(trainer)) trainer.train() # type: ignore diff --git a/tests/ner/test_transformers_ner.py b/tests/ner/test_transformers_ner.py new file mode 100644 index 000000000..de9eae32c --- /dev/null +++ b/tests/ner/test_transformers_ner.py @@ -0,0 +1,50 @@ +import os +import unittest +from spacy.lang.en import English +from spacy.tokens import Doc, Span +from transformers import TrainerCallback +from medcat.ner.transformers_ner import TransformersNER +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + + +class TransformerNERTest(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general["spacy_model"] = "en_core_web_md" + cdb_maker = CDBMaker(config) + cdb_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "examples", "cdb.csv") + cdb = cdb_maker.prepare_csvs([cdb_csv], full_build=True) + Doc.set_extension("ents", default=[], force=True) + Span.set_extension("confidence", default=-1, force=True) + Span.set_extension("id", default=0, force=True) + Span.set_extension("detected_name", default=None, force=True) + Span.set_extension("link_candidates", default=None, force=True) + Span.set_extension("cui", default=-1, force=True) + Span.set_extension("context_similarity", default=-1, force=True) + cls.undertest = TransformersNER(cdb) 
+ cls.undertest.create_eval_pipeline() + + def test_pipe(self): + doc = English().make_doc("\nPatient Name: John Smith\nAddress: 15 Maple Avenue\nCity: New York\nCC: Chronic back pain\n\nHX: Mr. Smith") + doc = next(self.undertest.pipe([doc])) + assert len(doc.ents) > 0, "No entities were recognised" + + def test_train(self): + tracker = unittest.mock.Mock() + class _DummyCallback(TrainerCallback): + def __init__(self, trainer) -> None: + self._trainer = trainer + def on_epoch_end(self, *args, **kwargs) -> None: + tracker.call() + + train_data = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "resources", "deid_train_data.json") + self.undertest.training_arguments.num_train_epochs = 1 + df, examples, dataset = self.undertest.train(train_data, trainer_callbacks=[_DummyCallback, _DummyCallback]) + assert "fp" in examples + assert "fn" in examples + assert dataset["train"].num_rows == 48 + assert dataset["test"].num_rows == 12 + self.assertEqual(tracker.call.call_count, 2)