Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CU-2exy49p: Make sure the cdb.add_concept really adds a concept or somehow make it clear #370

Merged
merged 2 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion medcat/cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -834,7 +834,7 @@ def add_and_train_concept(self,
names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config)
# Only if not negative, otherwise do not add the new name if in fact it should not be detected
if do_add_concept and not negative:
self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
full_build=full_build)

if spacy_entity is not None and spacy_doc is not None:
Expand Down
44 changes: 42 additions & 2 deletions medcat/cdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from medcat.utils.hasher import Hasher
from medcat.utils.matutils import unitvec
from medcat.utils.ml_utils import get_lr_linking
from medcat.utils.decorators import deprecated
from medcat.config import Config, weighted_average, workers
from medcat.utils.saving.serializer import CDBSerializer

Expand Down Expand Up @@ -213,8 +214,9 @@ def add_names(self, cui: str, names: Dict, name_status: str = 'A', full_build: b
# Name status must be one of the three
name_status = 'A'

self.add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build)
self._add_concept(cui=cui, names=names, ontologies=set(), name_status=name_status, type_ids=set(), description='', full_build=full_build)

@deprecated("Use `cdb._add_concept` as this will be removed in a future release.")
def add_concept(self,
adam-sutton-1992 marked this conversation as resolved.
Show resolved Hide resolved
cui: str,
names: Dict,
Expand All @@ -223,6 +225,43 @@ def add_concept(self,
type_ids: Set[str],
description: str,
full_build: bool = False) -> None:
"""
Deprecated: Use `cdb._add_concept` instead, as `add_concept` will be removed in a future release.

Add a concept to internal Concept Database (CDB). Depending on what you are providing
this will add a large number of properties for each concept.

Args:
cui (str):
Concept ID or unique identifier in this database. All concepts that have
the same CUI will be merged internally.
names (Dict[str, Dict]):
Names for this concept, i.e. the values that, if found in free text, can be linked to this concept.
Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}`
Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name'
ontologies (Set[str]):
ontologies in which the concept exists (e.g. SNOMEDCT, HPO)
name_status (str):
One of `P`, `N`, `A`
type_ids (Set[str]):
Semantic type identifier (have a look at TUIs in UMLS or SNOMED-CT)
description (str):
Description of this concept.
full_build (bool):
If True, the dictionary self.addl_info will also be populated. It contains a lot of extra information
about concepts, but can be very memory consuming. This is not necessary
for normal functioning of MedCAT (Default Value `False`).
"""
self._add_concept(cui, names, ontologies, name_status, type_ids, description, full_build)

def _add_concept(self,
adam-sutton-1992 marked this conversation as resolved.
Show resolved Hide resolved
cui: str,
names: Dict,
ontologies: set,
name_status: str,
type_ids: Set[str],
description: str,
full_build: bool = False) -> None:
"""Add a concept to internal Concept Database (CDB). Depending on what you are providing
this will add a large number of properties for each concept.

Expand All @@ -232,7 +271,8 @@ def add_concept(self,
the same CUI will be merged internally.
names (Dict[str, Dict]):
Names for this concept, i.e. the values that, if found in free text, can be linked to this concept.
Names is an dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}`
Names is a dict like: `{name: {'tokens': tokens, 'snames': snames, 'raw_name': raw_name}, ...}`
Names should be generated by helper function 'medcat.preprocessing.cleaners.prepare_name'
ontologies (Set[str]):
ontologies in which the concept exists (e.g. SNOMEDCT, HPO)
name_status (str):
Expand Down
2 changes: 1 addition & 1 deletion medcat/cdb_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def prepare_csvs(self,
if len(raw_name) >= self.config.cdb_maker['remove_parenthesis']:
prepare_name(raw_name, self.pipe.spacy_nlp, names, self.config)

self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids,
self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids,
description=description, full_build=full_build)
# DEBUG
logger.debug("\n\n**** Added\n CUI: %s\n Names: %s\n Ontologies: %s\n Name status: %s\n Type IDs: %s\n Description: %s\n Is full build: %s",
Expand Down
2 changes: 1 addition & 1 deletion tests/archive_tests/test_cdb_maker_archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def test_concept_similarity(self):
for i in range(500):
cui = "C" + str(i)
type_ids = {'T-' + str(i%10)}
cdb.add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.pipe.get_spacy_nlp(), {}, self.config), ontologies=set(),
cdb._add_concept(cui=cui, names=prepare_name('Name: ' + str(i), self.maker.pipe.get_spacy_nlp(), {}, self.config), ontologies=set(),
name_status='P', type_ids=type_ids, description='', full_build=True)

vectors = {}
Expand Down
6 changes: 3 additions & 3 deletions tests/utils/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ class CATHashingTestsWithChange(CATHashingTestsWithFakeHash):

def test_when_changes_do_calc(self):
with unittest.mock.patch.object(CDB, 'calculate_hash', return_value='abcd1234') as patch_method:
self.undertest.cdb.add_concept(**self.concept_kwargs)
self.undertest.cdb._add_concept(**self.concept_kwargs)
hash = self.undertest.get_hash()
self.assertIsInstance(hash, str)
patch_method.assert_called()
Expand All @@ -106,10 +106,10 @@ def test_default_cdb_not_dirty(self):
self.assertFalse(self.undertest.cdb.is_dirty)

def test_after_add_concept_is_dirty(self):
self.undertest.cdb.add_concept(**self.concept_kwargs)
self.undertest.cdb._add_concept(**self.concept_kwargs)
self.assertTrue(self.undertest.cdb.is_dirty)

def test_after_recalc_not_dirty(self):
self.undertest.cdb.add_concept(**self.concept_kwargs)
self.undertest.cdb._add_concept(**self.concept_kwargs)
self.undertest.get_hash()
self.assertFalse(self.undertest.cdb.is_dirty)
Loading