Merge branch 'main' into feat/ehrshot
Miking98 committed Apr 11, 2024
2 parents cd775d9 + 3c5562c commit ad2da15
Showing 10 changed files with 90 additions and 25 deletions.
10 changes: 5 additions & 5 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -2,26 +2,26 @@ exclude: '^tutorials/input'

repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.4.0
rev: v4.6.0
hooks:
- id: check-yaml
- id: end-of-file-fixer
- id: trailing-whitespace
- repo: https://github.com/psf/black
rev: 23.9.1
rev: 24.3.0
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
rev: 6.1.0
rev: 7.0.0
hooks:
- id: flake8
exclude: ^tutorials
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.5.1
rev: v1.9.0
hooks:
- id: mypy
exclude: ^tutorials
- repo: https://github.com/PyCQA/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
66 changes: 63 additions & 3 deletions README.md
@@ -18,16 +18,76 @@ We recommend users start with our [tutorial folder](https://github.com/som-shahl

# Installation

FEMR can be installed with the simple command ```pip install femr```.

If you are using deep learning, you need to also install xformers ```pip install xformers```.
```bash
pip install femr

# If you are using deep learning, also run...
pip install xformers
```
# Getting Started

The first step in using **FEMR** is to convert your patient data into [MEDS](https://github.com/Medical-Event-Data-Standard), the standard input format expected by the **FEMR** codebase.

The best way to do this is with the [ETLs provided by MEDS](https://github.com/Medical-Event-Data-Standard/meds_etl).

## OMOP Data

If you have data in the OMOP CDM format, follow these instructions:

1. Download your OMOP dataset to `[PATH_TO_SOURCE_OMOP]`.
2. Convert OMOP => MEDS using the following:
```bash
# Convert OMOP => MEDS data format
meds_etl_omop [PATH_TO_SOURCE_OMOP] [PATH_TO_OUTPUT_MEDS]
```

3. Use Hugging Face's `datasets` library to load the dataset in Python:
```python
import datasets
dataset = datasets.Dataset.from_parquet(PATH_TO_OUTPUT_MEDS + 'data/*')

# Print dataset stats
print(dataset)
>>> Dataset({
>>> features: ['patient_id', 'events'],
>>> num_rows: 6732
>>> })

# Print number of events in first patient in dataset
print(len(dataset[0]['events']))
>>> 2287
```
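As the `Dataset` printout above shows, a MEDS record pairs a `patient_id` with a time-ordered list of `events`. A minimal stdlib-only sketch of that shape (the `time` and `code` field names and the codes themselves are illustrative assumptions, not the authoritative MEDS schema):

```python
import datetime

# A hypothetical MEDS-style patient record: one patient_id plus a
# time-ordered list of events. Field names beyond patient_id/events
# are illustrative, not the authoritative schema.
patient = {
    "patient_id": 42,
    "events": [
        {"time": datetime.datetime(2020, 1, 1), "code": "ICD10CM/E11.9"},
        {"time": datetime.datetime(2020, 3, 15), "code": "LOINC/4548-4"},
    ],
}

# Mirrors the README's len(dataset[0]['events']) check
num_events = len(patient["events"])
print(num_events)  # prints 2
```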

## Stanford STARR-OMOP Data

If you are using the STARR-OMOP dataset from Stanford (which uses the OMOP CDM), we add an initial Stanford-specific preprocessing step. Otherwise this should be identical to the **OMOP Data** section. Follow these instructions:

1. Download your STARR-OMOP dataset to `[PATH_TO_SOURCE_OMOP]`.
2. Convert STARR-OMOP => MEDS using the following:
```bash
femr_stanford_omop_fixer [PATH_TO_SOURCE_OMOP] [PATH_TO_SOURCE_OMOP]

# Convert OMOP => MEDS data format
meds_etl_omop [PATH_TO_SOURCE_OMOP] [PATH_TO_OUTPUT_MEDS]
```

3. Use Hugging Face's `datasets` library to load the dataset in Python:
```python
import datasets
dataset = datasets.Dataset.from_parquet(PATH_TO_OUTPUT_MEDS + 'data/*')

# Print dataset stats
print(dataset)
>>> Dataset({
>>> features: ['patient_id', 'events'],
>>> num_rows: 6732
>>> })

# Print number of events in first patient in dataset
print(len(dataset[0]['events']))
>>> 2287
```

# Development

The following guides are for developers who want to contribute to **FEMR**.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -16,6 +16,7 @@ dependencies = [
"nptyping == 2.4.1",
"msgpack >= 1.0.5",
"meds == 0.1.3",
"meds_etl == 0.1.0",
"torch >= 2.1.2",
"transformers >= 4.25",
"datasets >= 2.15",
9 changes: 8 additions & 1 deletion src/femr/featurizers/core.py
@@ -1,4 +1,5 @@
"""Core featurizer functionality, shared across Featurizers."""

from __future__ import annotations

import collections
@@ -242,6 +243,7 @@ def preprocess_featurizers(
index: femr.index.PatientIndex,
labels: List[meds.Label],
num_proc: int = 1,
batch_size: int = 1000,
) -> None:
"""Preprocess `self.featurizers` on the provided set of labels."""

@@ -264,7 +266,7 @@ def preprocess_featurizers(
dataset,
functools.partial(_preprocess_map_func, label_map=label_map, featurizers=self.featurizers),
_preprocess_agg_func,
batch_size=1,
batch_size=batch_size,
num_proc=num_proc,
)

@@ -279,6 +281,7 @@ def featurize(
index: femr.index.PatientIndex,
labels: List[meds.Label],
num_proc: int = 1,
batch_size: int = 1000,
) -> Mapping[str, np.ndarray]:
"""
Apply a list of Featurizers (in sequence) to obtain a feature matrix for each Label for each patient.
@@ -306,7 +309,11 @@ def featurize(
dataset,
functools.partial(_features_map_func, label_map=label_map, featurizers=self.featurizers),
_features_agg_func,
batch_size=1,
batch_size=batch_size,
num_proc=num_proc,
)
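The hunks above raise the map/aggregate `batch_size` default from 1 to 1000, which cuts per-call overhead when featurizing large datasets. A generic, stdlib-only sketch of that batched map-then-aggregate pattern (the function `batched_map_agg` is our illustration, not FEMR's API):

```python
import functools

def batched_map_agg(items, map_func, agg_func, batch_size=1000):
    """Apply map_func to fixed-size batches, then fold the partial
    results together with agg_func. Larger batches mean fewer calls
    and less per-call overhead, which is why the diff raises the
    default from 1 to 1000."""
    partials = []
    batch = []
    for item in items:
        batch.append(item)
        if len(batch) == batch_size:
            partials.append(map_func(batch))
            batch = []
    if batch:  # flush the final, possibly short, batch
        partials.append(map_func(batch))
    return functools.reduce(agg_func, partials)

# Toy usage: sum of squares over 10 items, processed 4 at a time
result = batched_map_agg(
    range(10),
    lambda b: sum(x * x for x in b),
    lambda a, b: a + b,
    batch_size=4,
)
print(result)  # prints 285
```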

1 change: 1 addition & 0 deletions src/femr/labelers/core.py
@@ -1,4 +1,5 @@
"""Core labeling functionality/schemas, shared across all labeling functions."""

from __future__ import annotations

import datetime
1 change: 1 addition & 0 deletions src/femr/labelers/omop.py
@@ -1,4 +1,5 @@
"""meds.Labeling functions for OMOP data."""

from __future__ import annotations

from abc import abstractmethod
21 changes: 7 additions & 14 deletions src/femr/models/tasks.py
@@ -23,37 +23,30 @@ def __init__(self):
super().__init__()

@abc.abstractmethod
def get_task_config(self) -> femr.models.config.FEMRTaskConfig:
...
def get_task_config(self) -> femr.models.config.FEMRTaskConfig: ...

@abc.abstractmethod
def start_batch(self) -> None:
...
def start_batch(self) -> None: ...

@abc.abstractmethod
def start_patient(self, patient: meds.Patient, ontology: Optional[femr.ontology.Ontology]) -> None:
...
def start_patient(self, patient: meds.Patient, ontology: Optional[femr.ontology.Ontology]) -> None: ...

@abc.abstractmethod
def add_patient_labels(self, patient_label_offsets: List[int]) -> None:
...
def add_patient_labels(self, patient_label_offsets: List[int]) -> None: ...

@abc.abstractmethod
def needs_exact(self) -> bool:
...
def needs_exact(self) -> bool: ...

@abc.abstractmethod
def add_event(
self,
current_date: datetime.datetime,
next_date: Optional[datetime.datetime],
next_features: Optional[Sequence[int]],
) -> int:
...
) -> int: ...

@abc.abstractmethod
def get_batch_data(self) -> Mapping[str, np.ndarray]:
...
def get_batch_data(self) -> Mapping[str, np.ndarray]: ...

def cleanup(self, batch: Mapping[str, torch.Tensor]) -> Mapping[str, torch.Tensor]:
return batch
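The tasks.py hunk is purely Black 24.x restyling: the `...` stub bodies of abstract methods are collapsed onto the `def` line. A small runnable sketch of that pattern (the class and method names here are hypothetical, not FEMR's):

```python
import abc

class Task(abc.ABC):
    # Black >= 24 collapses stub bodies onto one line, as in the diff.
    @abc.abstractmethod
    def needs_exact(self) -> bool: ...

    @abc.abstractmethod
    def get_task_config(self) -> dict: ...

class ToyTask(Task):
    def needs_exact(self) -> bool:
        return True

    def get_task_config(self) -> dict:
        return {"task": "toy"}

task = ToyTask()
print(task.needs_exact())  # prints True
```

Note that the `...` bodies are ordinary `Ellipsis` expressions, so the one-line form is semantically identical to the two-line form the diff removes.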
3 changes: 2 additions & 1 deletion src/femr/models/transformer.py
@@ -12,6 +12,7 @@
import transformers
import xformers.ops
from torch import nn
from tqdm import tqdm

import femr.models.config
import femr.models.processor
@@ -368,7 +369,7 @@ def compute_features(
all_feature_times = []
all_representations = []

for batch in batches:
for batch in tqdm(batches, total=len(batches)):
batch = processor.collate([batch])["batch"]
with torch.no_grad():
_, result = model(batch, return_reprs=True)
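The transformer.py hunk wraps the batch loop in `tqdm` to report progress during feature computation. As a rough stdlib-only stand-in for the same idea (our `progress` helper is an illustration, far less featureful than the real tqdm):

```python
import sys

def progress(iterable, total=None):
    """Minimal tqdm stand-in: yields items unchanged while writing
    an 'i/total' counter to stderr."""
    if total is None:
        total = len(iterable)
    for i, item in enumerate(iterable, 1):
        sys.stderr.write(f"\r{i}/{total}")
        sys.stderr.flush()
        yield item
    sys.stderr.write("\n")

# Toy usage: "process" three batches while showing progress
batches = [[1, 2], [3, 4], [5]]
processed = [sum(b) for b in progress(batches)]
print(processed)  # prints [3, 7, 5]
```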
1 change: 1 addition & 0 deletions src/femr/transforms/__init__.py
@@ -1,4 +1,5 @@
"""A collection of general use transforms."""

import datetime
from typing import Any, Callable, Dict, List, Optional, Set, Tuple

2 changes: 1 addition & 1 deletion tutorials/5_CLMBR Featurization And Modeling.ipynb
@@ -136,7 +136,7 @@
"source": [
"# Data Splitting\n",
"\n",
"When using a pretrained CLMBR model, we have to be very careful to use the splits used for the original model"
"Your data split should be contained in a CSV with two columns: `patient_id` and `split_name`, where `patient_id` is the ID of the `meds.Patient` and `split_name` is `train` or `test`. When using a pretrained CLMBR model, be careful to reuse the exact splits used to train the original model."
]
},
{
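The notebook cell above describes the expected split file: a CSV with `patient_id` and `split_name` columns. A stdlib sketch of writing and reading such a table (the IDs and split assignments are made up for illustration):

```python
import csv
import io

# Hypothetical split table in the two-column shape the notebook
# describes: patient_id, split_name ('train' or 'test').
rows = [(1, "train"), (2, "train"), (3, "test")]

buf = io.StringIO()
writer = csv.writer(buf)
writer.writerow(["patient_id", "split_name"])
writer.writerows(rows)

# Read it back into a patient_id -> split_name lookup
buf.seek(0)
reader = csv.DictReader(buf)
splits = {int(r["patient_id"]): r["split_name"] for r in reader}

train_ids = sorted(pid for pid, s in splits.items() if s == "train")
print(train_ids)  # prints [1, 2]
```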
