Skip to content

Commit

Permalink
Merge branch 'master' into CU-2e77a31-improve-print_stats
Browse files Browse the repository at this point in the history
  • Loading branch information
mart-r authored Dec 18, 2023
2 parents eb7655e + 22e4aec commit 4a490ce
Show file tree
Hide file tree
Showing 20 changed files with 623 additions and 43 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ jobs:
max-parallel: 4

steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand Down Expand Up @@ -48,13 +48,13 @@ jobs:

steps:
- name: Checkout master
uses: actions/checkout@v2
uses: actions/checkout@v4
with:
ref: 'master'
fetch-depth: 0

- name: Set up Python 3.9
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: 3.9

Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,13 @@ jobs:

steps:
- name: Checkout production
uses: actions/checkout@v2
uses: actions/checkout@v4
with:
ref: ${{ github.event.release.target_commitish }}
fetch-depth: 0

- name: Set up Python 3.9
uses: actions/setup-python@v2
uses: actions/setup-python@v4
with:
python-version: 3.9

Expand Down
12 changes: 12 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,20 @@ To download any of these models, please [follow this link](https://uts.nlm.nih.g
- **Paper**: [What’s in a Summary? Laying the Groundwork for Advances in Hospital-Course Summarization](https://www.aclweb.org/anthology/2021.naacl-main.382.pdf)
- ([more...](https://github.com/CogStack/MedCAT/blob/master/media/news.md))

## Installation
To install the latest version of MedCAT run the following command:
```
pip install medcat
```
Normal installations of MedCAT will install torch-gpu and all relevant dependencies (such as CUDA). This can require as much as 10 GB more disk space, which isn't required for CPU-only usage.

To install the latest version of MedCAT without torch GPU support run the following command:
```
pip install medcat --extra-index-url https://download.pytorch.org/whl/cpu/
```
## Demo
A demo application is available at [MedCAT](https://medcat.rosalind.kcl.ac.uk). This was trained on MIMIC-III and all of SNOMED-CT.
PS: This link can take a long time to load the first time around. The machine spins up as needed and spins down when inactive.

## Tutorials
A guide on how to use MedCAT is available at [MedCAT Tutorials](https://github.com/CogStack/MedCATtutorials). Read more about MedCAT on [Towards Data Science](https://towardsdatascience.com/medcat-introduction-analyzing-electronic-health-records-e1c420afa13a).
Expand Down
65 changes: 60 additions & 5 deletions medcat/cat.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,10 @@ def create_model_pack(self, save_dir_path: str, model_pack_name: str = DEFAULT_M
cdb_path = os.path.join(save_dir_path, "cdb.dat")
self.cdb.save(cdb_path, json_path)

# Save the config
config_path = os.path.join(save_dir_path, "config.json")
self.cdb.config.save(config_path)

# Save the Vocab
vocab_path = os.path.join(save_dir_path, "vocab.dat")
if self.vocab is not None:
Expand Down Expand Up @@ -361,6 +365,10 @@ def load_model_pack(cls,
logger.info('Loading model pack with %s', 'JSON format' if json_path else 'dill format')
cdb = CDB.load(cdb_path, json_path)

# load config
config_path = os.path.join(model_pack_path, "config.json")
cdb.load_config(config_path)

# TODO load addl_ner

# Modify the config to contain full path to spacy model
Expand Down Expand Up @@ -640,9 +648,13 @@ def add_and_train_concept(self,
Refer to medcat.cat.cdb.CDB.add_concept
"""
names = prepare_name(name, self.pipe.spacy_nlp, {}, self.config)
if not names and cui not in self.cdb.cui2preferred_name and name_status == 'P':
logger.warning("No names were able to be prepared in CAT.add_and_train_concept "
"method. As such no preferred name will be able to be specifeid. "
"The CUI: '%s' and raw name: '%s'", cui, name)
# Only if not negative, otherwise do not add the new name if in fact it should not be detected
if do_add_concept and not negative:
self.cdb.add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
self.cdb._add_concept(cui=cui, names=names, ontologies=ontologies, name_status=name_status, type_ids=type_ids, description=description,
full_build=full_build)

if spacy_entity is not None and spacy_doc is not None:
Expand Down Expand Up @@ -1135,19 +1147,42 @@ def _save_docs_to_file(self, docs: Iterable, annotated_ids: List[str], save_dir_
pickle.dump((annotated_ids, part_counter), open(annotated_ids_path, 'wb'))
return part_counter

@deprecated(message="Use `multiprocessing_batch_char_size` instead")
def multiprocessing(self,
data: Union[List[Tuple], Iterable[Tuple]],
nproc: int = 2,
batch_size_chars: int = 5000 * 1000,
only_cui: bool = False,
addl_info: List[str] = [],
addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'],
separate_nn_components: bool = True,
out_split_size_chars: Optional[int] = None,
save_dir_path: str = os.path.abspath(os.getcwd()),
min_free_memory=0.1) -> Dict:
return self.multiprocessing_batch_char_size(data=data, nproc=nproc,
batch_size_chars=batch_size_chars,
only_cui=only_cui, addl_info=addl_info,
separate_nn_components=separate_nn_components,
out_split_size_chars=out_split_size_chars,
save_dir_path=save_dir_path,
min_free_memory=min_free_memory)

def multiprocessing_batch_char_size(self,
data: Union[List[Tuple], Iterable[Tuple]],
nproc: int = 2,
batch_size_chars: int = 5000 * 1000,
only_cui: bool = False,
addl_info: List[str] = [],
separate_nn_components: bool = True,
out_split_size_chars: Optional[int] = None,
save_dir_path: str = os.path.abspath(os.getcwd()),
min_free_memory=0.1) -> Dict:
r"""Run multiprocessing for inference, if out_save_path and out_split_size_chars is used this will also continue annotating
documents if something is saved in that directory.
This method batches the data based on the number of characters as specified by user.
PS: This method is unlikely to work on a Windows machine.
Args:
data:
Iterator or array with format: [(id, text), (id, text), ...]
Expand Down Expand Up @@ -1331,15 +1366,35 @@ def _multiprocessing_batch(self,

return docs

def multiprocessing_pipe(self,
in_data: Union[List[Tuple], Iterable[Tuple]],
@deprecated(message="Use `multiprocessing_batch_docs_size` instead")
def multiprocessing_pipe(self, in_data: Union[List[Tuple], Iterable[Tuple]],
nproc: Optional[int] = None,
batch_size: Optional[int] = None,
only_cui: bool = False,
addl_info: List[str] = [],
return_dict: bool = True,
batch_factor: int = 2) -> Union[List[Tuple], Dict]:
"""Run multiprocessing NOT FOR TRAINING
return self.multiprocessing_batch_docs_size(in_data=in_data, nproc=nproc,
batch_size=batch_size,
only_cui=only_cui,
addl_info=addl_info,
return_dict=return_dict,
batch_factor=batch_factor)

def multiprocessing_batch_docs_size(self,
in_data: Union[List[Tuple], Iterable[Tuple]],
nproc: Optional[int] = None,
batch_size: Optional[int] = None,
only_cui: bool = False,
addl_info: List[str] = ['cui2icd10', 'cui2ontologies', 'cui2snomed'],
return_dict: bool = True,
batch_factor: int = 2) -> Union[List[Tuple], Dict]:
"""Run multiprocessing NOT FOR TRAINING.
This method batches the data based on the number of documents as specified by the user.
PS:
This method supports Windows.
Args:
in_data (Union[List[Tuple], Iterable[Tuple]]): List with format: [(id, text), (id, text), ...]
Expand Down
Loading

0 comments on commit 4a490ce

Please sign in to comment.