From ed840d07a51b36734807317ae2947f28c8d2332b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 20 Oct 2023 11:06:29 +0000 Subject: [PATCH 01/13] Bump urllib3 from 1.26.17 to 1.26.18 in /webapp/webapp Bumps [urllib3](https://github.com/urllib3/urllib3) from 1.26.17 to 1.26.18. - [Release notes](https://github.com/urllib3/urllib3/releases) - [Changelog](https://github.com/urllib3/urllib3/blob/main/CHANGES.rst) - [Commits](https://github.com/urllib3/urllib3/compare/1.26.17...1.26.18) --- updated-dependencies: - dependency-name: urllib3 dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index d6a590572..cef7a036d 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -3,4 +3,4 @@ django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 medcat==1.2.7 -urllib3==1.26.17 +urllib3==1.26.18 From d377f0b6a38b0b2e46dd7224b0a87cf015b767d9 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 30 Oct 2023 14:06:34 +0200 Subject: [PATCH 02/13] CU-8692uznvd: Allow empty-dict config.linking.filters.cuis and convert to set in memory (#352) * CU-8692uznvd: Allow empty-dict config.linking.filters.cuis and convert to set in memory * CU-8692uznvd: Move the empty-set detection and conversion from validator to init * CU-8692uznvd: Remove unused import --- medcat/config.py | 13 +++++++++++++ tests/test_config.py | 21 ++++++++++++++++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/medcat/config.py b/medcat/config.py index b2e324deb..87c6d34f5 100644 --- a/medcat/config.py +++ b/medcat/config.py @@ -433,6 +433,19 @@ class LinkingFilters(MixingConfig, BaseModel): cuis: Set[str] = set() cuis_exclude: Set[str] = set() + def __init__(self, **data): + if 'cuis' in data: + cuis = 
data['cuis'] + if isinstance(cuis, dict) and len(cuis) == 0: + logger.warning("Loading an old model where " + "config.linking.filters.cuis has been " + "dict to an empty dict instead of an empty " + "set. Converting the dict to a set in memory " + "as that is what is expected. Please consider " + "saving the model again.") + data['cuis'] = set(cuis.keys()) + super().__init__(**data) + def check_filters(self, cui: str) -> bool: """Checks is a CUI in the filters diff --git a/tests/test_config.py b/tests/test_config.py index 2f9cd5a84..aacd0a760 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,7 +1,7 @@ import unittest import pickle import tempfile -from medcat.config import Config, MixingConfig, VersionInfo, General +from medcat.config import Config, MixingConfig, VersionInfo, General, LinkingFilters from pydantic import ValidationError import os @@ -180,5 +180,24 @@ def test_from_dict(self): self.assertEqual("value", config.key) +class ConfigLinkingFiltersTests(unittest.TestCase): + + def test_allows_empty_dict_for_cuis(self): + lf = LinkingFilters(cuis={}) + self.assertIsNotNone(lf) + + def test_empty_dict_converted_to_empty_set(self): + lf = LinkingFilters(cuis={}) + self.assertEqual(lf.cuis, set()) + + def test_not_allow_nonempty_dict_for_cuis(self): + with self.assertRaises(ValidationError): + LinkingFilters(cuis={"KEY": "VALUE"}) + + def test_not_allow_empty_dict_for_cuis_exclude(self): + with self.assertRaises(ValidationError): + LinkingFilters(cuis_exclude={}) + + if __name__ == '__main__': unittest.main() From ad6704891db9fa91169b4ef6934fa18e5242184f Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 30 Oct 2023 16:27:37 +0200 Subject: [PATCH 03/13] CU-8692t3fdf separate config on save (#350) * CU-8692t3fdf Move saving config outside of the cdb.dat; Add test to make sure the config does not get saved with the CDB; patch a few existing tests * CU-8692t3fdf Use class methods on class instead of instance in a few tests * CU-8692t3fdf Fix 
typing issue * CU-8692t3fdf Add additional tests for 2 configs and zero configs when loading model pack * CU-8692t3fdf: Make sure CDB is linked to the correct config; Treat incorrect configs as dirty CDBs and force a recalc of the hash --- medcat/cat.py | 8 +++ medcat/cdb.py | 73 +++++++++++++++++++++++- medcat/config.py | 8 ++- medcat/utils/saving/serializer.py | 15 ++++- tests/test_cat.py | 52 ++++++++++++++++- tests/test_cdb.py | 8 +++ tests/test_config.py | 29 ++++++++++ tests/utils/saving/test_serialization.py | 2 +- tests/utils/test_hashing.py | 47 ++++++++++++++- 9 files changed, 229 insertions(+), 13 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 2323cd737..0fb6b1167 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -271,6 +271,10 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M cdb_path = os.path.join(save_dir_path, "cdb.dat") self.cdb.save(cdb_path, json_path) + # Save the config + config_path = os.path.join(save_dir_path, "config.json") + self.cdb.config.save(config_path) + # Save the Vocab vocab_path = os.path.join(save_dir_path, "vocab.dat") if self.vocab is not None: @@ -362,6 +366,10 @@ def load_model_pack(cls, logger.info('Loading model pack with %s', 'JSON format' if json_path else 'dill format') cdb = CDB.load(cdb_path, json_path) + # load config + config_path = os.path.join(model_pack_path, "config.json") + cdb.load_config(config_path) + # TODO load addl_ner # Modify the config to contain full path to spacy model diff --git a/medcat/cdb.py b/medcat/cdb.py index 44d4fd9dd..5a648f4af 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -5,8 +5,9 @@ import logging import aiofiles import numpy as np -from typing import Dict, Set, Optional, List, Union +from typing import Dict, Set, Optional, List, Union, cast from functools import partial +import os from medcat import __version__ from medcat.utils.hasher import Hasher @@ -61,8 +62,10 @@ class CDB(object): def __init__(self, config: Union[Config, None] = 
None) -> None: if config is None: self.config = Config() + self._config_from_file = False else: self.config = config + self._config_from_file = True self.name2cuis: Dict = {} self.name2cuis2status: Dict = {} @@ -95,6 +98,12 @@ def __init__(self, config: Union[Config, None] = None) -> None: self._optim_params = None self.is_dirty = False self._hash: Optional[str] = None + # the config hash is kept track of here so that + # the CDB hash can be re-calculated when the config changes + # it can also be used to make sure the config loaded with + # a CDB matches the config it was saved with + # since the config is now saved separately + self._config_hash: Optional[str] = None self._memory_optimised_parts: Set[str] = set() def get_name(self, cui: str) -> str: @@ -458,6 +467,35 @@ async def save_async(self, path: str) -> None: } await f.write(dill.dumps(to_save)) + def load_config(self, config_path: str) -> None: + if not os.path.exists(config_path): + if not self._config_from_file: + # if there's no config defined anywhere + raise ValueError("Could not find a config in the CDB nor ", + "in the config.json for this model " + f"({os.path.dirname(config_path)})", + ) + # if there is a config, but it's defined in the cdb.dat file + logger.warning("Could not find config.json in model pack folder " + f"({os.path.dirname(config_path)}). " + "This is probably an older model. Please save the model " + "again in the new format to avoid potential issues.") + else: + if self._config_from_file: + # if there's a config.json and one defined in the cbd.dat file + raise ValueError("Found a config in the CDB and in the config.json " + f"for model ({os.path.dirname(config_path)}) - " + "this is ambiguous. 
Please either remove the " + "config.json or load the CDB without the config.json " + "in the folder and re-save in the newer format " + "(the default save in this version)") + # if the only config is in the separate config.json file + # this should be the behaviour for all newer models + self.config = cast(Config, Config.load(config_path)) + logger.debug("Loaded config from CDB from %s", config_path) + # mark config read from file + self._config_from_file = True + @classmethod def load(cls, path: str, json_path: Optional[str] = None, config_dict: Optional[Dict] = None) -> "CDB": """Load and return a CDB. This allows partial loads in probably not the right way at all. @@ -777,8 +815,34 @@ def _check_medcat_version(cls, config_data: Dict) -> None: or download the compatible model.""" ) + def _should_recalc_hash(self, force_recalc: bool) -> bool: + if force_recalc: + return True + if self.config.hash is None: + # TODO - perhaps this is not the best? + # as this is a side effect + # get and save result in config + self.config.get_hash() + if not self._hash or self.is_dirty: + # if no hash saved or is dirty + # need to calculate + logger.debug("Recalculating hash due to %s", + "no hash saved" if not self._hash else "CDB is dirty") + return True + # recalc config hash in case it changed + self.config.get_hash() + if self._config_hash is None or self._config_hash != self.config.hash: + # if no config hash saved + # or if the config hash is different from one saved in here + logger.debug("Recalculating hash due to %s", + "no config hash saved" if not self._config_hash + else "config hash has changed") + return True + return False + def get_hash(self, force_recalc: bool = False): - if not force_recalc and self._hash and not self.is_dirty: + should_recalc = self._should_recalc_hash(force_recalc) + if not should_recalc: logger.info("Reusing old hash of CDB since the CDB has not changed: %s", self._hash) return self._hash self.is_dirty = False @@ -791,7 +855,7 @@ def 
calculate_hash(self): for k,v in self.__dict__.items(): if k in ['cui2countext_vectors', 'name2cuis']: hasher.update(v, length=False) - elif k in ['_hash', 'is_dirty']: + elif k in ['_hash', 'is_dirty', '_config_hash']: # ignore _hash since if it previously didn't exist, the # new hash would be different when the value does exist # and ignore is_dirty so that we get the same hash as previously @@ -799,6 +863,9 @@ def calculate_hash(self): elif k != 'config': hasher.update(v, length=True) + # set cached config hash + self._config_hash = self.config.hash + self._hash = hasher.hexdigest() logger.info("Found new CDB hash: %s", self._hash) return self._hash diff --git a/medcat/config.py b/medcat/config.py index 87c6d34f5..e60c2eafc 100644 --- a/medcat/config.py +++ b/medcat/config.py @@ -548,6 +548,7 @@ class Config(MixingConfig, BaseModel): linking: Linking = Linking() word_skipper: re.Pattern = re.compile('') # empty pattern gets replaced upon init punct_checker: re.Pattern = re.compile('') # empty pattern gets replaced upon init + hash: Optional[str] = None class Config: # this if for word_skipper and punct_checker which would otherwise @@ -572,6 +573,9 @@ def rebuild_re(self) -> None: def get_hash(self): hasher = Hasher() for k, v in self.dict().items(): + if k in ['hash', ]: + # ignore hash + continue if k not in ['version', 'general', 'linking']: hasher.update(v, length=True) elif k == 'general': @@ -587,5 +591,5 @@ def get_hash(self): hasher.update(v2, length=False) else: hasher.update(v2, length=True) - - return hasher.hexdigest() + self.hash = hasher.hexdigest() + return self.hash diff --git a/medcat/utils/saving/serializer.py b/medcat/utils/saving/serializer.py index d82df751c..25529c778 100644 --- a/medcat/utils/saving/serializer.py +++ b/medcat/utils/saving/serializer.py @@ -135,13 +135,12 @@ def serialize(self, cdb, overwrite: bool = False) -> None: raise ValueError(f'Unable to overwrite shelf path "{self.json_path}"' ' - specify overrwrite=True if you wish 
to overwrite') to_save = {} - to_save['config'] = cdb.config.asdict() # This uses different names so as to not be ambiguous # when looking at files whether the json parts should # exist separately or not to_save['cdb_main' if self.jsons is not None else 'cdb'] = dict( ((key, val) for key, val in cdb.__dict__.items() if - key != 'config' and + key not in ('config', '_config_from_file') and (self.jsons is None or key not in SPECIALITY_NAMES))) logger.info('Dumping CDB to %s', self.main_path) with open(self.main_path, 'wb') as f: @@ -165,7 +164,17 @@ def deserialize(self, cdb_cls): logger.info('Reading CDB data from %s', self.main_path) with open(self.main_path, 'rb') as f: data = dill.load(f) - config = cast(Config, Config.from_dict(data['config'])) + if 'config' in data: + logger.warning("Found config in CDB for model (%s). " + "This is an old format. Please re-save the " + "model in the new format to avoid potential issues", + os.path.dirname(self.main_path)) + config = cast(Config, Config.from_dict(data['config'])) + else: + # by passing None as config to constructor + # the CDB should identify that there has been + # no config loaded + config = None cdb = cdb_cls(config=config) if self.jsons is None: cdb_main = data['cdb'] diff --git a/tests/test_cat.py b/tests/test_cat.py index 0baa0d35d..62db4d44d 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -367,7 +367,7 @@ def test_load_model_pack(self): meta_cat = _get_meta_cat(self.meta_cat_dir) cat = CAT(cdb=self.cdb, config=self.cdb.config, vocab=self.vocab, meta_cats=[meta_cat]) full_model_pack_name = cat.create_model_pack(save_dir_path.name, model_pack_name="mp_name") - cat = self.undertest.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) + cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertTrue(isinstance(cat, CAT)) self.assertIsNotNone(cat.config.version.medcat_version) self.assertEqual(repr(cat._meta_cats), 
repr([meta_cat])) @@ -377,7 +377,7 @@ def test_load_model_pack_without_meta_cat(self): meta_cat = _get_meta_cat(self.meta_cat_dir) cat = CAT(cdb=self.cdb, config=self.cdb.config, vocab=self.vocab, meta_cats=[meta_cat]) full_model_pack_name = cat.create_model_pack(save_dir_path.name, model_pack_name="mp_name") - cat = self.undertest.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip"), load_meta_models=False) + cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip"), load_meta_models=False) self.assertTrue(isinstance(cat, CAT)) self.assertIsNotNone(cat.config.version.medcat_version) self.assertEqual(cat._meta_cats, []) @@ -385,10 +385,56 @@ def test_load_model_pack_without_meta_cat(self): def test_hashing(self): save_dir_path = tempfile.TemporaryDirectory() full_model_pack_name = self.undertest.create_model_pack(save_dir_path.name, model_pack_name="mp_name") - cat = self.undertest.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) + cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) +class ModelWithTwoConfigsLoadTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cls.model_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples") + cdb = CDB.load(os.path.join(cls.model_path, "cdb.dat")) + # save config next to the CDB + cls.config_path = os.path.join(cls.model_path, 'config.json') + cdb.config.save(cls.config_path) + + + @classmethod + def tearDownClass(cls) -> None: + # REMOVE config next to the CDB + os.remove(cls.config_path) + + def test_loading_model_pack_with_cdb_config_and_config_json_raises_exception(self): + with self.assertRaises(ValueError): + CAT.load_model_pack(self.model_path) + + +class ModelWithZeroConfigsLoadTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + cdb_path = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat") + cdb = CDB.load(cdb_path) + vocab_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat") + # copy the CDB and vocab to a temp dir + cls.temp_dir = tempfile.TemporaryDirectory() + cls.cdb_path = os.path.join(cls.temp_dir.name, 'cdb.dat') + cdb.save(cls.cdb_path) # save without internal config + cls.vocab_path = os.path.join(cls.temp_dir.name, 'vocab.dat') + shutil.copyfile(vocab_path, cls.vocab_path) + + + @classmethod + def tearDownClass(cls) -> None: + # REMOVE temp dir + cls.temp_dir.cleanup() + + def test_loading_model_pack_without_any_config_raises_exception(self): + with self.assertRaises(ValueError): + CAT.load_model_pack(self.temp_dir.name) + + def _get_meta_cat(meta_cat_dir): config = ConfigMetaCAT() config.general["category_name"] = "Status" diff --git a/tests/test_cdb.py b/tests/test_cdb.py index 96425bc8c..f7be24d64 100644 --- a/tests/test_cdb.py +++ b/tests/test_cdb.py @@ -6,6 +6,7 @@ import numpy as np from medcat.config import Config from medcat.cdb_maker import CDBMaker +from medcat.cdb import CDB class CDBTests(unittest.TestCase): @@ -53,6 +54,13 @@ def test_save_and_load(self): self.undertest.save(f.name) self.undertest.load(f.name) + def test_load_has_no_config(self): + with tempfile.NamedTemporaryFile() as f: + self.undertest.save(f.name) + cdb = CDB.load(f.name) + self.assertFalse(cdb._config_from_file) + + def test_save_async_and_load(self): with tempfile.NamedTemporaryFile() as f: asyncio.run(self.undertest.save_async(f.name)) diff --git a/tests/test_config.py b/tests/test_config.py index aacd0a760..ce6ed76eb 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -179,6 +179,35 @@ def test_from_dict(self): config = Config.from_dict({"key": "value"}) self.assertEqual("value", config.key) + def test_config_no_hash_before_get(self): + config = Config() + self.assertIsNone(config.hash) + + def 
test_config_has_hash_after_get(self): + config = Config() + config.get_hash() + self.assertIsNotNone(config.hash) + + def test_config_hash_recalc_same_def(self): + config = Config() + h1 = config.get_hash() + h2 = config.get_hash() + self.assertEqual(h1, h2) + + def test_config_hash_changes_after_change(self): + config = Config() + h1 = config.get_hash() + config.linking.filters.cuis = {"a", "b"} + h2 = config.get_hash() + self.assertNotEqual(h1, h2) + + def test_config_hash_recalc_same_changed(self): + config = Config() + config.linking.filters.cuis = {"a", "b"} + h1 = config.get_hash() + h2 = config.get_hash() + self.assertEqual(h1, h2) + class ConfigLinkingFiltersTests(unittest.TestCase): diff --git a/tests/utils/saving/test_serialization.py b/tests/utils/saving/test_serialization.py index f0cc75de1..c2c44da16 100644 --- a/tests/utils/saving/test_serialization.py +++ b/tests/utils/saving/test_serialization.py @@ -87,7 +87,7 @@ def test_dill_to_json(self): model_pack_folder = os.path.join( self.json_model_pack.name, model_pack_path) json_path = os.path.join(model_pack_folder, "*.json") - jsons = glob.glob(json_path) + jsons = [fn for fn in glob.glob(json_path) if not fn.endswith("config.json")] # there is also a model_card.json # but nothing for cui2many or name2many # so can remove the length of ONE2MANY diff --git a/tests/utils/test_hashing.py b/tests/utils/test_hashing.py index 99c10b153..b6681461f 100644 --- a/tests/utils/test_hashing.py +++ b/tests/utils/test_hashing.py @@ -1,4 +1,5 @@ import os +from typing import Optional import tempfile import unittest import unittest.mock @@ -6,6 +7,7 @@ from medcat.cat import CAT from medcat.cdb import CDB from medcat.vocab import Vocab +from medcat.config import Config class CDBHashingTests(unittest.TestCase): @@ -30,6 +32,43 @@ def test_CDB_hash_saves_on_disk(self): self.assertEqual(h, cdb._hash) +class CDBHashingWithConfigTests(unittest.TestCase): + temp_dir = tempfile.TemporaryDirectory() + + @classmethod + def 
setUpClass(cls) -> None: + cls.cdb = CDB.load(os.path.join(os.path.dirname( + os.path.realpath(__file__)), "..", "..", "examples", "cdb.dat")) + # ensure config has hash + h = cls.cdb.get_hash() + cls.config = cls.config_copy(cls.cdb.config) + cls._config_hash = cls.cdb.config.hash + + @classmethod + def config_copy(cls, config: Optional[Config] = None) -> Config: + if config is None: + config = cls.config + return Config(**config.asdict()) + + def setUp(self) -> None: + # reset config + self.cdb.config = self.config_copy() + # reset config hash + self.cdb._config_hash = self._config_hash + self.cdb.config.hash = self._config_hash + + def test_CDB_same_hash_no_need_recalc(self): + self.assertFalse(self.cdb._should_recalc_hash(force_recalc=False)) + + def test_CDB_hash_recalc_if_no_config_hash(self): + self.cdb._config_hash = None + self.assertTrue(self.cdb._should_recalc_hash(force_recalc=False)) + + def test_CDB_hash_recalc_after_config_change(self): + self.cdb.config.linking.filters.cuis = {"a", "b", "c"} + self.assertTrue(self.cdb._should_recalc_hash(force_recalc=False)) + + class BaseCATHashingTests(unittest.TestCase): @classmethod @@ -75,8 +114,14 @@ def test_no_changes_recalc_same(self): class CATHashingTestsWithoutChange(CATHashingTestsWithFakeHash): - def test_no_changes_no_calc(self): + def setUp(self) -> None: + self._calculate_hash = self.undertest.cdb.calculate_hash + # make sure the hash exists + self.undertest.cdb._config_hash = self.undertest.cdb.config.get_hash() + self.undertest.cdb.get_hash() self.undertest.cdb.calculate_hash = unittest.mock.Mock() + + def test_no_changes_no_calc(self): hash = self.undertest.get_hash() self.assertIsInstance(hash, str) self.undertest.cdb.calculate_hash.assert_not_called() From e52bda3547dfa61c671727746058f67a21da3576 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Tue, 31 Oct 2023 11:44:00 +0200 Subject: [PATCH 04/13] CU-2cdpd4t: Unify default addl_info in different methdos. 
(#363) --- medcat/cat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 0fb6b1167..a86584c3a 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -1340,7 +1340,7 @@ def multiprocessing(self, nproc: int = 2, batch_size_chars: int = 5000 * 1000, only_cui: bool = False, - addl_info: List[str] = [], + addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], separate_nn_components: bool = True, out_split_size_chars: Optional[int] = None, save_dir_path: str = os.path.abspath(os.getcwd()), @@ -1536,7 +1536,7 @@ def multiprocessing_pipe(self, nproc: Optional[int] = None, batch_size: Optional[int] = None, only_cui: bool = False, - addl_info: List[str] = [], + addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], return_dict: bool = True, batch_factor: int = 2) -> Union[List[Tuple], Dict]: """Run multiprocessing NOT FOR TRAINING From b6ab62ca2e5c0f654dcd34271ce0d72a70f603cb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 2 Nov 2023 22:13:07 +0000 Subject: [PATCH 05/13] Bump django from 3.2.20 to 3.2.23 in /webapp/webapp Bumps [django](https://github.com/django/django) from 3.2.20 to 3.2.23. - [Commits](https://github.com/django/django/compare/3.2.20...3.2.23) --- updated-dependencies: - dependency-name: django dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] --- webapp/webapp/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp/webapp/requirements.txt b/webapp/webapp/requirements.txt index d6a590572..4ace8c6f3 100644 --- a/webapp/webapp/requirements.txt +++ b/webapp/webapp/requirements.txt @@ -1,4 +1,4 @@ -Django==3.2.20 +Django==3.2.23 django-dbbackup==4.0.0b0 django-storages[boto3]==1.12.3 django-cron==0.5.1 From 94827bb50cf11526daa8f787b8669395b621043b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Fri, 3 Nov 2023 15:20:50 +0000 Subject: [PATCH 06/13] Changing cdb.add_concept to a protected method --- medcat/cat.py | 2 +- medcat/cdb.py | 7 ++++--- medcat/cdb_maker.py | 2 +- tests/archive_tests/test_cdb_maker_archive.py | 2 +- tests/utils/test_hashing.py | 6 +++--- 5 files changed, 10 insertions(+), 9 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index 2323cd737..5bdf43211 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -834,7 +834,7 @@ def add_and_train_concept(self, names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config) # Only if not negative, otherwise do not add the new name if in fact it should not be detected if do_add_concept and not negative: - self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, + self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, full_build=full_build) if spacy_entity is not None and spacy_doc is not None: diff --git a/medcat/cdb.py b/medcat/cdb.py index 44d4fd9dd..91bb7bd9d 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -213,9 +213,9 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b # Name status must be one of the three name_status = 'A' - self.add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build) + 
self._add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build) - def add_concept(self, + def _add_concept(self, cui: str, names: Dict, ontologies: set, @@ -232,7 +232,8 @@ def add_concept(self, the same CUI will be merged internally. names (Dict[str, Dict]): Names for this concept, or the value that if found in free text can be linked to this concept. - Names is an dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name' ontologies (Set[str]): ontologies in which the concept exists (e.g. SNOMEDCT, HPO) name_status (str): diff --git a/medcat/cdb_maker.py b/medcat/cdb_maker.py index e9c72d12e..ca98f821e 100644 --- a/medcat/cdb_maker.py +++ b/medcat/cdb_maker.py @@ -173,7 +173,7 @@ def prepare_csvs(self, if len(raw_name) >= self.config.cdb_maker['remove_parenthesis']: prepare_name(raw_name, self.pipe.spacy_nlp, names, self.config) - self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, + self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, full_build=full_build) # DEBUG logger.debug("\n\n**** Added\n CUI: %s\n Names: %s\n Ontologies: %s\n Name status: %s\n Type IDs: %s\n Description: %s\n Is full build: %s", diff --git a/tests/archive_tests/test_cdb_maker_archive.py b/tests/archive_tests/test_cdb_maker_archive.py index 329408999..9e2fc2d72 100644 --- a/tests/archive_tests/test_cdb_maker_archive.py +++ b/tests/archive_tests/test_cdb_maker_archive.py @@ -108,7 +108,7 @@ def test_concept_similarity(self): for i in range(500): cui = "C" + str(i) type_ids = {'T-' + str(i%10)} - cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), 
self.maker.pipe.get_spacy_nlp(), {}, self.config), ontologies=set(), + cdb._add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.pipe.get_spacy_nlp(), {}, self.config), ontologies=set(), name_status='P', type_ids=type_ids, description='', full_build=True) vectors = {} diff --git a/tests/utils/test_hashing.py b/tests/utils/test_hashing.py index 99c10b153..60796eb99 100644 --- a/tests/utils/test_hashing.py +++ b/tests/utils/test_hashing.py @@ -90,7 +90,7 @@ class CATHashingTestsWithChange(CATHashingTestsWithFakeHash): def test_when_changes_do_calc(self): with unittest.mock.patch.object(CDB, 'calculate_hash', return_value='abcd1234') as patch_method: - self.undertest.cdb.add_concept(**self.concept_kwargs) + self.undertest.cdb._add_concept(**self.concept_kwargs) hash = self.undertest.get_hash() self.assertIsInstance(hash, str) patch_method.assert_called() @@ -106,10 +106,10 @@ def test_default_cdb_not_dirty(self): self.assertFalse(self.undertest.cdb.is_dirty) def test_after_add_concept_is_dirty(self): - self.undertest.cdb.add_concept(**self.concept_kwargs) + self.undertest.cdb._add_concept(**self.concept_kwargs) self.assertTrue(self.undertest.cdb.is_dirty) def test_after_recalc_not_dirty(self): - self.undertest.cdb.add_concept(**self.concept_kwargs) + self.undertest.cdb._add_concept(**self.concept_kwargs) self.undertest.get_hash() self.assertFalse(self.undertest.cdb.is_dirty) From 26b5120384905eaba66db6eb2e3fd626d7c66c7b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 Date: Mon, 6 Nov 2023 17:48:15 +0000 Subject: [PATCH 07/13] Re-added deprecated method with deprecated flag and addtional comments --- medcat/cdb.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/medcat/cdb.py b/medcat/cdb.py index 91bb7bd9d..5c1c27a96 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -12,6 +12,7 @@ from medcat.utils.hasher import Hasher from medcat.utils.matutils import unitvec from medcat.utils.ml_utils import get_lr_linking 
+from medcat.utils.decorators import deprecated from medcat.config import Config, weighted_average, workers from medcat.utils.saving.serializer import CDBSerializer @@ -215,6 +216,44 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b self._add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build) + @deprecated("Use `cdb._add_concept` as this will be removed in a future release.") + def add_concept(self, + cui: str, + names: Dict, + ontologies: set, + name_status: str, + type_ids: Set[str], + description: str, + full_build: bool = False) -> None: + """ + Deprecated: Use `cdb._add_concept` as this will be removed in a future release. + + Add a concept to internal Concept Database (CDB). Depending on what you are providing + this will add a large number of properties for each concept. + + Args: + cui (str): + Concept ID or unique identifier in this database, all concepts that have + the same CUI will be merged internally. + names (Dict[str, Dict]): + Names for this concept, or the value that if found in free text can be linked to this concept. + Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}` + Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name' + ontologies (Set[str]): + ontologies in which the concept exists (e.g. SNOMEDCT, HPO) + name_status (str): + One of `P`, `N`, `A` + type_ids (Set[str]): + Semantic type identifier (have a look at TUIs in UMLS or SNOMED-CT) + description (str): + Description of this concept. + full_build (bool): + If True the dictionary self.addl_info will also be populated, contains a lot of extra information + about concepts, but can be very memory consuming. This is not necessary + for normal functioning of MedCAT (Default Value `False`). 
+ """ + self._add_concept(cui, names, ontologies, name_status, type_ids, description, full_build) + def _add_concept(self, cui: str, names: Dict, From b6b023b9cb5c9c40a5d9824d8a1f368b32aa6b21 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 27 Nov 2023 03:21:10 -0600 Subject: [PATCH 08/13] CU-86931prq4: Update GHA versions (checkout and setup-python) to v4 (#368) --- .github/workflows/main.yml | 8 ++++---- .github/workflows/production.yml | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index c769dfc2e..a5468fb9b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -16,9 +16,9 @@ jobs: max-parallel: 4 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -48,13 +48,13 @@ jobs: steps: - name: Checkout master - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: 'master' fetch-depth: 0 - name: Set up Python 3.9 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml index 5088c1000..9ad9a5d90 100644 --- a/.github/workflows/production.yml +++ b/.github/workflows/production.yml @@ -14,13 +14,13 @@ jobs: steps: - name: Checkout production - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: ref: ${{ github.event.release.target_commitish }} fetch-depth: 0 - name: Set up Python 3.9 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.9 From 6a5103cbd9b9187a53040e82fa42f2f628faa7bc Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Mon, 27 Nov 2023 05:14:44 -0600 Subject: [PATCH 09/13] Cu 1yn0v9e duplicate multiprocessing methods (#364) * CU-1yn0v9e: Rename and deprecate one of the multiprocessing 
methods; Add docstring. Trying to be more explicit regarding usage and differences between different methods * CU-1yn0v9e: Rename and deprecate the multiprocessing_pipe method; Add docstring. Trying to be more explicit regarding usage and differences between different methods * CU-1yn0v9e: Fix typo in docstring; more consistent naming --- medcat/cat.py | 47 +++++++++++++++++++++++++++++++++++++++++++++-- tests/test_cat.py | 8 ++++---- 2 files changed, 49 insertions(+), 6 deletions(-) diff --git a/medcat/cat.py b/medcat/cat.py index a86584c3a..9cca44b0f 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -1335,6 +1335,7 @@ def _save_docs_to_file(self, docs: Iterable, annotated_ids: List[str], save_dir_ pickle.dump((annotated_ids, part_counter), open(annotated_ids_path, 'wb')) return part_counter + @deprecated(message="Use `multiprocessing_batch_char_size` instead") def multiprocessing(self, data: Union[List[Tuple], Iterable[Tuple]], nproc: int = 2, @@ -1345,9 +1346,31 @@ def multiprocessing(self, out_split_size_chars: Optional[int] = None, save_dir_path: str = os.path.abspath(os.getcwd()), min_free_memory=0.1) -> Dict: + return self.multiprocessing_batch_char_size(data=data, nproc=nproc, + batch_size_chars=batch_size_chars, + only_cui=only_cui, addl_info=addl_info, + separate_nn_components=separate_nn_components, + out_split_size_chars=out_split_size_chars, + save_dir_path=save_dir_path, + min_free_memory=min_free_memory) + + def multiprocessing_batch_char_size(self, + data: Union[List[Tuple], Iterable[Tuple]], + nproc: int = 2, + batch_size_chars: int = 5000 * 1000, + only_cui: bool = False, + addl_info: List[str] = [], + separate_nn_components: bool = True, + out_split_size_chars: Optional[int] = None, + save_dir_path: str = os.path.abspath(os.getcwd()), + min_free_memory=0.1) -> Dict: r"""Run multiprocessing for inference, if out_save_path and out_split_size_chars is used this will also continue annotating documents if something is saved in that directory. 
+ This method batches the data based on the number of characters as specified by user. + + PS: This method is unlikely to work on a Windows machine. + Args: data: Iterator or array with format: [(id, text), (id, text), ...] @@ -1531,7 +1554,22 @@ def _multiprocessing_batch(self, return docs - def multiprocessing_pipe(self, + @deprecated(message="Use `multiprocessing_batch_docs_size` instead") + def multiprocessing_pipe(self, in_data: Union[List[Tuple], Iterable[Tuple]], + nproc: Optional[int] = None, + batch_size: Optional[int] = None, + only_cui: bool = False, + addl_info: List[str] = [], + return_dict: bool = True, + batch_factor: int = 2) -> Union[List[Tuple], Dict]: + return self.multiprocessing_batch_docs_size(in_data=in_data, nproc=nproc, + batch_size=batch_size, + only_cui=only_cui, + addl_info=addl_info, + return_dict=return_dict, + batch_factor=batch_factor) + + def multiprocessing_batch_docs_size(self, in_data: Union[List[Tuple], Iterable[Tuple]], nproc: Optional[int] = None, batch_size: Optional[int] = None, @@ -1539,7 +1577,12 @@ def multiprocessing_pipe(self, addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'], return_dict: bool = True, batch_factor: int = 2) -> Union[List[Tuple], Dict]: - """Run multiprocessing NOT FOR TRAINING + """Run multiprocessing NOT FOR TRAINING. + + This method batches the data based on the number of documents as specified by the user. + + PS: + This method supports Windows. Args: in_data (Union[List[Tuple], Iterable[Tuple]]): List with format: [(id, text), (id, text), ...] 
diff --git a/tests/test_cat.py b/tests/test_cat.py index 62db4d44d..cd33efbc7 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -60,7 +60,7 @@ def test_multiprocessing(self): (2, ""), (3, None) ] - out = self.undertest.multiprocessing(in_data, nproc=1) + out = self.undertest.multiprocessing_batch_char_size(in_data, nproc=1) self.assertEqual(3, len(out)) self.assertEqual(1, len(out[1]['entities'])) @@ -73,7 +73,7 @@ def test_multiprocessing_pipe(self): (2, "The dog is sitting outside the house."), (3, "The dog is sitting outside the house."), ] - out = self.undertest.multiprocessing_pipe(in_data, nproc=2, return_dict=False) + out = self.undertest.multiprocessing_batch_docs_size(in_data, nproc=2, return_dict=False) self.assertTrue(type(out) == list) self.assertEqual(3, len(out)) self.assertEqual(1, out[0][0]) @@ -89,7 +89,7 @@ def test_multiprocessing_pipe_with_malformed_texts(self): (2, ""), (3, None), ] - out = self.undertest.multiprocessing_pipe(in_data, nproc=1, batch_size=1, return_dict=False) + out = self.undertest.multiprocessing_batch_docs_size(in_data, nproc=1, batch_size=1, return_dict=False) self.assertTrue(type(out) == list) self.assertEqual(3, len(out)) self.assertEqual(1, out[0][0]) @@ -105,7 +105,7 @@ def test_multiprocessing_pipe_return_dict(self): (2, "The dog is sitting outside the house."), (3, "The dog is sitting outside the house.") ] - out = self.undertest.multiprocessing_pipe(in_data, nproc=2, return_dict=True) + out = self.undertest.multiprocessing_batch_docs_size(in_data, nproc=2, return_dict=True) self.assertTrue(type(out) == dict) self.assertEqual(3, len(out)) self.assertEqual({'entities': {}, 'tokens': []}, out[1]) From 72f7ddad20d6e8ae2aa7d8f761c0eb0fe5604290 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Wed, 29 Nov 2023 03:44:16 -0600 Subject: [PATCH 10/13] 869377m3u: Add comment regarding demo link load times to README (#376) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 
395aecf69..c47cf9a65 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,7 @@ To download any of these models, please [follow this link](https://uts.nlm.nih.g ## Demo A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT. +PS: This link can take a long time to load the first time around. The machine spins up as needed and spins down when inactive. ## Tutorials A guide on how to use MedCAT is available at [MedCAT Tutorials](https://github.com/CogStack/MedCATtutorials). Read more about MedCAT on [Towards Data Science](https://towardsdatascience.com/medcat-introduction-analyzing-electronic-health-records-e1c420afa13a). From d8473d9fabdc49eee3976f2d362433e28f36414b Mon Sep 17 00:00:00 2001 From: adam-sutton-1992 <60137864+adam-sutton-1992@users.noreply.github.com> Date: Thu, 30 Nov 2023 11:46:28 +0000 Subject: [PATCH 11/13] Added README.md documentation for CPU only installations (#365) * changed README.md to reflect installation options. * added setup script to demonstrate how wrapper could look for CPU installations * removed setup.sh as unnecessary for cpu only builds * Initial commit for merge_cdb method * Added indentation to make merge_cdb a class method * fixed syntax issues * more lint fixes * more lint fixes * bug fixes of merge_cdb * removed print statements * Added commentary on disk space usage of pytorch-gpu * removed merge_cdb from branch --------- Co-authored-by: adam-sutton-1992 --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index c47cf9a65..bf34f00c6 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,17 @@ To download any of these models, please [follow this link](https://uts.nlm.nih.g - **Paper**: [What’s in a Summary? 
Laying the Groundwork for Advances in Hospital-Course Summarization](https://www.aclweb.org/anthology/2021.naacl-main.382.pdf) - ([more...](https://github.com/CogStack/MedCAT/blob/master/media/news.md)) +## Installation +To install the latest version of MedCAT run the following command: +``` +pip install medcat +``` +Normal installations of MedCAT will install torch-gpu and all relevant dependencies (such as CUDA). This can require as much as 10 GB more disk space, which isn't required for CPU only usage. + +To install the latest version of MedCAT without torch GPU support run the following command: +``` +pip install medcat --extra-index-url https://download.pytorch.org/whl/cpu/ +``` ## Demo A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT. PS: This link can take a long time to load the first time around. The machine spins up as needed and spins down when inactive. From 76b75cc4e3558e9a48d1fe8aa43ba23621652a75 Mon Sep 17 00:00:00 2001 From: Mart Ratas Date: Thu, 30 Nov 2023 05:47:37 -0600 Subject: [PATCH 12/13] Cu 8692zguyq no preferred name (#367) * CU-8692zguyq: Slight simplification of minimum-name-length logic * CU-8692zguyq: Add some tests for prepare_name preprocessor * CU-8692zguyq: Add warning if no preferred name was added along a new CUI * CU-8692zguyq: Add additional warning messages when adding/training a new CUI with no preferred name * CU-8692zguyq: Make no preferred name warnings only run if name status is preferred * CU-8692zguyq: Add tests for no-preferred name warnings * CU-8692zguyq: Add Vocab.make_unigram_table to CAT tests * CU-8692zguyq: Move to built in asserting for logging instead of patching the method * CU-8692zguyq: Add workaround for assertNoLogs on python 3.8 and 3.9 --- medcat/cat.py | 4 ++ medcat/cdb.py | 15 ++++ medcat/preprocessing/cleaners.py | 3 +- tests/preprocessing/__init__.py | 0 tests/preprocessing/test_cleaners.py | 104 
+++++++++++++++++++++++++++ tests/test_cat.py | 55 +++++++++++++- 6 files changed, 178 insertions(+), 3 deletions(-) create mode 100644 tests/preprocessing/__init__.py create mode 100644 tests/preprocessing/test_cleaners.py diff --git a/medcat/cat.py b/medcat/cat.py index ef6099566..f49a25022 100644 --- a/medcat/cat.py +++ b/medcat/cat.py @@ -840,6 +840,10 @@ def add_and_train_concept(self, Refer to medcat.cat.cdb.CDB.add_concept """ names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config) + if not names and cui not in self.cdb.cui2preferred_name and name_status == 'P': + logger.warning("No names were able to be prepared in CAT.add_and_train_concept " + "method. As such no preferred name will be able to be specifeid. " + "The CUI: '%s' and raw name: '%s'", cui, name) # Only if not negative, otherwise do not add the new name if in fact it should not be detected if do_add_concept and not negative: self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description, diff --git a/medcat/cdb.py b/medcat/cdb.py index 1110a3b84..2ca8382a7 100644 --- a/medcat/cdb.py +++ b/medcat/cdb.py @@ -358,6 +358,21 @@ def _add_concept(self, if name_status == 'P' and cui not in self.cui2preferred_name: # Do not overwrite old preferred names self.cui2preferred_name[cui] = name_info['raw_name'] + elif names: + # if no name_info and names is NOT an empty dict + # this shouldn't really happen in the current setup + raise ValueError("Unknown state where there is no name_info, " + "yet the `names` dict is not empty (%s)", names) + elif name_status == 'P' and cui not in self.cui2preferred_name: + # this means names is an empty `names` dict + logger.warning("Did not manage to add a preferred name in `add_cui`. " + "Was trying to do so for cui: '%s'" + "This means that the `names` dict passed was empty. " + "This is _usually_ caused by either no name or too short " + "a name passed to the `prepare_name` method. 
" + "The minimum length is defined in: " + "'config.cdb_maker.min_letters_required' and " + "is currently set at %s", cui, self.config.cdb_maker['min_letters_required']) # Add other fields if full_build if full_build: diff --git a/medcat/preprocessing/cleaners.py b/medcat/preprocessing/cleaners.py index 18314d562..43e8098e2 100644 --- a/medcat/preprocessing/cleaners.py +++ b/medcat/preprocessing/cleaners.py @@ -48,7 +48,8 @@ def prepare_name(raw_name: str, nlp: Language, names: Dict, config: Config) -> D snames = set() name = config.general['separator'].join(tokens) - if not config.cdb_maker.get('min_letters_required', 0) or len(re.sub("[^A-Za-z]*", '', name)) >= config.cdb_maker.get('min_letters_required', 0): + min_letters = config.cdb_maker.get('min_letters_required', 0) + if not min_letters or len(re.sub("[^A-Za-z]*", '', name)) >= min_letters: if name not in names: sname = "" for token in tokens: diff --git a/tests/preprocessing/__init__.py b/tests/preprocessing/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/preprocessing/test_cleaners.py b/tests/preprocessing/test_cleaners.py new file mode 100644 index 000000000..b879d9ee6 --- /dev/null +++ b/tests/preprocessing/test_cleaners.py @@ -0,0 +1,104 @@ +from medcat.preprocessing.cleaners import prepare_name +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + +import logging, os + +import unittest + + +class BaseCDBMakerTests(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general['log_level'] = logging.DEBUG + config.general["spacy_model"] = "en_core_web_md" + cls.maker = CDBMaker(config) + csvs = [ + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb.csv'), + os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', '..', 'examples', 'cdb_2.csv') + ] + cls.cdb = cls.maker.prepare_csvs(csvs, full_build=True) + + +class BasePrepareNameTest(BaseCDBMakerTests): + raw_name = 
'raw' + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.do_prepare_name() + + # method called after setup, when raw_name has been specified + @classmethod + def do_prepare_name(cls) -> None: + cls.name = cls.cdb.config.general.separator.join(cls.raw_name.split()) + cls.names = prepare_name(cls.raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config) + + def _dict_has_key_val_type(self, d: dict, key, val_type): + self.assertIn(key, d) + self.assertIsInstance(d[key], val_type) + + def _names_has_key_val_type(self, key, val_type): + self._dict_has_key_val_type(self.names, key, val_type) + + def test_result_has_name(self): + self._names_has_key_val_type(self.name, dict) + + def test_name_info_has_tokens(self): + self._dict_has_key_val_type(self.names[self.name], 'tokens', list) + + def test_name_info_has_words_as_tokens(self): + name_info = self.names[self.name] + tokens = name_info['tokens'] + for word in self.raw_name.split(): + with self.subTest(word): + self.assertIn(word, tokens) + + +class NamePreparationTests_OneLetter(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "a" + # the minimum name length is defined by the following config option + # if I don't set this to 1 here, I would see the tests fail + # that would be because the result from `prepare_names` would be empty + cls.cdb.config.cdb_maker.min_letters_required = 1 + super().do_prepare_name() + + +class NamePreparationTests_TwoLetters(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "an" + super().do_prepare_name() + + +class NamePreparationTests_MultiToken(BasePrepareNameTest): + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.raw_name = "this raw name" + super().do_prepare_name() + + +class NamePreparationTests_Empty(BaseCDBMakerTests): + """In case of an empty name, I would expect the names dict + returned by `prepare_name` to be 
empty. + """ + empty_raw_name = '' + + @classmethod + def setUpClass(cls) -> None: + super().setUpClass() + cls.names = prepare_name(cls.empty_raw_name, cls.maker.pipe.spacy_nlp, {}, cls.cdb.config) + + def test_names_dict_is_empty(self): + self.assertEqual(len(self.names), 0) + self.assertEqual(self.names, {}) diff --git a/tests/test_cat.py b/tests/test_cat.py index cd33efbc7..acd337e71 100644 --- a/tests/test_cat.py +++ b/tests/test_cat.py @@ -4,10 +4,12 @@ import unittest import tempfile import shutil +import logging +import contextlib from transformers import AutoTokenizer from medcat.vocab import Vocab -from medcat.cdb import CDB -from medcat.cat import CAT +from medcat.cdb import CDB, logger as cdb_logger +from medcat.cat import CAT, logger as cat_logger from medcat.utils.checkpoint import Checkpoint from medcat.meta_cat import MetaCAT from medcat.config_meta_cat import ConfigMetaCAT @@ -20,6 +22,7 @@ class CATTests(unittest.TestCase): def setUpClass(cls) -> None: cls.cdb = CDB.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "cdb.dat")) cls.vocab = Vocab.load(os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "examples", "vocab.dat")) + cls.vocab.make_unigram_table() cls.cdb.config.general.spacy_model = "en_core_web_md" cls.cdb.config.ner.min_name_len = 2 cls.cdb.config.ner.upper_case_limit_len = 3 @@ -388,6 +391,54 @@ def test_hashing(self): cat = CAT.load_model_pack(os.path.join(save_dir_path.name, f"{full_model_pack_name}.zip")) self.assertEqual(cat.get_hash(), cat.config.version.id) + def _assertNoLogs(self, logger: logging.Logger, level: int): + if hasattr(self, 'assertNoLogs'): + return self.assertNoLogs(logger=logger, level=level) + else: + return self.__assertNoLogs(logger=logger, level=level) + + @contextlib.contextmanager + def __assertNoLogs(self, logger: logging.Logger, level: int): + try: + with self.assertLogs(logger, level) as captured_logs: + yield + except AssertionError: + return + if 
captured_logs: + raise AssertionError("Logs were found: {}".format(captured_logs)) + + def assertLogsDuringAddAndTrainConcept(self, logger: logging.Logger, log_level, + name: str, name_status: str, nr_of_calls: int): + cui = 'CUI-%d'%(hash(name) + id(name)) + with (self.assertLogs(logger=logger, level=log_level) + if nr_of_calls == 1 + else self._assertNoLogs(logger=logger, level=log_level)): + self.undertest.add_and_train_concept(cui, name, name_status=name_status) + + def test_add_and_train_concept_cat_nowarn_long_name(self): + long_name = 'a very long name' + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cdb_nowarn_long_name(self): + long_name = 'a very long name' + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=long_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cat_nowarn_short_name_not_pref(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cdb_nowarn_short_name_not_pref(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='', nr_of_calls=0) + + def test_add_and_train_concept_cat_warns_short_name(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cat_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1) + + def test_add_and_train_concept_cdb_warns_short_name(self): + short_name = 'a' + self.assertLogsDuringAddAndTrainConcept(cdb_logger, logging.WARNING, name=short_name, name_status='P', nr_of_calls=1) + class ModelWithTwoConfigsLoadTests(unittest.TestCase): From 7fddac0626382720c1194b465b1e6f975a00152f Mon Sep 17 00:00:00 2001 From: Xi Bai <82581439+baixiac@users.noreply.github.com> Date: Tue, 5 Dec 2023 17:36:28 +0000 Subject: [PATCH 13/13] Add trainer callbacks for 
Transformer NER (#377) CU-86938vf30 add trainer callbacks for Transformer NER --- medcat/ner/transformers_ner.py | 16 ++++++++-- tests/ner/test_transformers_ner.py | 50 ++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 2 deletions(-) create mode 100644 tests/ner/test_transformers_ner.py diff --git a/medcat/ner/transformers_ner.py b/medcat/ner/transformers_ner.py index 9623b1b93..227ccc083 100644 --- a/medcat/ner/transformers_ner.py +++ b/medcat/ner/transformers_ner.py @@ -1,6 +1,7 @@ import os import json import logging +import datasets from spacy.tokens import Doc from datetime import datetime from typing import Iterable, Iterator, Optional, Dict, List, cast, Union @@ -18,7 +19,7 @@ from transformers import Trainer, AutoModelForTokenClassification, AutoTokenizer from transformers import pipeline, TrainingArguments -import datasets +from transformers.trainer_callback import TrainerCallback # It should be safe to do this always, as all other multiprocessing #will be finished before data comes to meta_cat @@ -137,7 +138,12 @@ def merge_data_loaded(base, other): return out_path - def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=False, dataset=None, meta_requirements=None): + def train(self, + json_path: Union[str, list, None]=None, + ignore_extra_labels=False, + dataset=None, + meta_requirements=None, + trainer_callbacks: Optional[List[TrainerCallback]]=None): """Train or continue training a model give a json_path containing a MedCATtrainer export. It will continue training if an existing model is loaded or start new training if the model is blank/new. @@ -149,6 +155,9 @@ def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=Fals ignore_extra_labels: Makes only sense when an existing deid model was loaded and from the new data we want to ignore labels that did not exist in the old model. 
+ trainer_callbacks (List[TrainerCallback]): + A list of trainer callbacks for collecting metrics during the training at the client side. The + transformers Trainer object will be passed in when each callback is called. """ if dataset is None and json_path is not None: @@ -193,6 +202,9 @@ def train(self, json_path: Union[str, list, None]=None, ignore_extra_labels=Fals compute_metrics=lambda p: metrics(p, tokenizer=self.tokenizer, dataset=encoded_dataset['test'], verbose=self.config.general['verbose_metrics']), data_collator=data_collator, # type: ignore tokenizer=None) + if trainer_callbacks: + for callback in trainer_callbacks: + trainer.add_callback(callback(trainer)) trainer.train() # type: ignore diff --git a/tests/ner/test_transformers_ner.py b/tests/ner/test_transformers_ner.py new file mode 100644 index 000000000..de9eae32c --- /dev/null +++ b/tests/ner/test_transformers_ner.py @@ -0,0 +1,50 @@ +import os +import unittest +from spacy.lang.en import English +from spacy.tokens import Doc, Span +from transformers import TrainerCallback +from medcat.ner.transformers_ner import TransformersNER +from medcat.config import Config +from medcat.cdb_maker import CDBMaker + + +class TransformerNERTest(unittest.TestCase): + + @classmethod + def setUpClass(cls) -> None: + config = Config() + config.general["spacy_model"] = "en_core_web_md" + cdb_maker = CDBMaker(config) + cdb_csv = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "..", "examples", "cdb.csv") + cdb = cdb_maker.prepare_csvs([cdb_csv], full_build=True) + Doc.set_extension("ents", default=[], force=True) + Span.set_extension("confidence", default=-1, force=True) + Span.set_extension("id", default=0, force=True) + Span.set_extension("detected_name", default=None, force=True) + Span.set_extension("link_candidates", default=None, force=True) + Span.set_extension("cui", default=-1, force=True) + Span.set_extension("context_similarity", default=-1, force=True) + cls.undertest = TransformersNER(cdb) 
+ cls.undertest.create_eval_pipeline() + + def test_pipe(self): + doc = English().make_doc("\nPatient Name: John Smith\nAddress: 15 Maple Avenue\nCity: New York\nCC: Chronic back pain\n\nHX: Mr. Smith") + doc = next(self.undertest.pipe([doc])) + assert len(doc.ents) > 0, "No entities were recognised" + + def test_train(self): + tracker = unittest.mock.Mock() + class _DummyCallback(TrainerCallback): + def __init__(self, trainer) -> None: + self._trainer = trainer + def on_epoch_end(self, *args, **kwargs) -> None: + tracker.call() + + train_data = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "resources", "deid_train_data.json") + self.undertest.training_arguments.num_train_epochs = 1 + df, examples, dataset = self.undertest.train(train_data, trainer_callbacks=[_DummyCallback, _DummyCallback]) + assert "fp" in examples + assert "fn" in examples + assert dataset["train"].num_rows == 48 + assert dataset["test"].num_rows == 12 + self.assertEqual(tracker.call.call_count, 2)