From 66535e706e0e00446b77e0905ccdb5bd21d4de31 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 22:44:26 +0000 Subject: [PATCH 01/13] [pre-commit.ci] pre-commit autoupdate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/psf/black: 23.11.0 → 24.2.0](https://github.com/psf/black/compare/23.11.0...24.2.0) - [github.com/pre-commit/mirrors-prettier: v3.1.0 → v4.0.0-alpha.8](https://github.com/pre-commit/mirrors-prettier/compare/v3.1.0...v4.0.0-alpha.8) - [github.com/astral-sh/ruff-pre-commit: v0.1.6 → v0.2.1](https://github.com/astral-sh/ruff-pre-commit/compare/v0.1.6...v0.2.1) --- .pre-commit-config.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 94c745a..7082825 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,7 +7,7 @@ default_stages: minimum_pre_commit_version: 2.16.0 repos: - repo: https://github.com/psf/black - rev: "23.11.0" + rev: "24.2.0" hooks: - id: black - repo: https://github.com/asottile/blacken-docs @@ -15,7 +15,7 @@ repos: hooks: - id: blacken-docs - repo: https://github.com/pre-commit/mirrors-prettier - rev: v3.1.0 + rev: v4.0.0-alpha.8 hooks: - id: prettier # Newer versions of node don't work on systems that have an older version of GLIBC @@ -25,7 +25,7 @@ repos: # https://github.com/jupyterlab/jupyterlab/issues/12675 language_version: "17.9.1" - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.1.6 + rev: v0.2.1 hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] From 45d912ab1962d66255697b9a18408c492c5bf0c7 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 22:44:37 +0000 Subject: [PATCH 02/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ehrdata.py | 644 ++++++++++++++++++++++++++++------------------------- 1 file changed, 342 insertions(+), 302 deletions(-) diff --git a/ehrdata.py b/ehrdata.py index 80ae05a..f7a574b 100644 --- a/ehrdata.py +++ b/ehrdata.py @@ -1,64 +1,51 @@ -import awkward as ak -import numpy as np -import pandas as pd import csv -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -import ehrapy as ep -import scanpy as sc -from anndata import AnnData -import mudata as md -from mudata import MuData -from typing import List, Union, Literal, Optional -import os import glob -import dask.dataframe as dd -from thefuzz import process -import sys -from rich import print as rprint -import missingno as msno -import warnings import numbers import os -from pandas.tseries.offsets import DateOffset as Offset - -import anndata as ad -from collections.abc import Collection, Iterable, Mapping, Sequence -from enum import Enum +import warnings +from collections.abc import Sequence from functools import partial -from types import MappingProxyType -from typing import TYPE_CHECKING, Any, Callable, Literal, Union +from typing import List, Literal, Optional, Union +import awkward as ak +import dask.dataframe as dd +import ehrapy as ep +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd import scanpy as sc -from scanpy.plotting import DotPlot, MatrixPlot, StackedViolin +import seaborn as sns +from anndata import AnnData from matplotlib.axes import Axes +from pandas.tseries.offsets import DateOffset as Offset +from rich import print as 
rprint +from thefuzz import process - - -pth = 'auxillary_files/OMOP_CDMv5.4_Field_Level.csv' +pth = "auxillary_files/OMOP_CDMv5.4_Field_Level.csv" field_level = pd.read_csv(pth) -dtype_mapping = {'integer': "Int64", - 'Integer': "Int64", - 'float': float, - 'bigint': "Int64", - 'varchar(MAX)': str, - 'varchar(2000)': str, - 'varchar(1000)': str, - 'varchar(255)': str, - 'varchar(250)': str, - 'varchar(80)': str, - 'varchar(60)': str, - 'varchar(50)': str, - 'varchar(25)': str, - 'varchar(20)': str, - 'varchar(10)': str, - 'varchar(9)': str, - 'varchar(3)': str, - 'varchar(2)': str, - 'varchar(1)': str, - 'datetime': object, - 'date': object} +dtype_mapping = { + "integer": "Int64", + "Integer": "Int64", + "float": float, + "bigint": "Int64", + "varchar(MAX)": str, + "varchar(2000)": str, + "varchar(1000)": str, + "varchar(255)": str, + "varchar(250)": str, + "varchar(80)": str, + "varchar(60)": str, + "varchar(50)": str, + "varchar(25)": str, + "varchar(20)": str, + "varchar(10)": str, + "varchar(9)": str, + "varchar(3)": str, + "varchar(2)": str, + "varchar(1)": str, + "datetime": object, + "date": object, +} clinical_tables_columns = { "person": ["person_id", "year_of_birth", "gender_source_value"], "observation_period": [], @@ -138,7 +125,6 @@ def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6): Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities that don't score at least that similar to word are ignored. """ - if not n > 0: raise ValueError("n must be > 0: %r" % (n,)) if not 0.0 <= cutoff <= 1.0: @@ -166,8 +152,8 @@ def df_to_dict(df, key, value): def check_csv_has_only_header(file_path): - if file_path.endswith('csv'): - with open(file_path, 'r') as file: + if file_path.endswith("csv"): + with open(file_path) as file: reader = csv.reader(file) header = next(reader, None) # Read the header if header is not None: @@ -197,14 +183,14 @@ def __init__(self, folder_path, delimiter=None, make_filename_lowercase=True, us if make_filename_lowercase: new_filepath = os.path.join(self.base, file_path.split("/")[-1].lower()) if file_path != new_filepath: - warnings(f"Rename file [file_path] to [new_filepath]") + warnings("Rename file [file_path] to [new_filepath]") os.rename(file_path, new_filepath) self.filepath[file_name] = new_filepath else: self.filepath[file_name] = file_path self.check_with_omop_cdm() self.tables = list(self.filepath.keys()) - + """ if "concept" in self.tables: df_concept = dd.read_csv(self.filepath["concept"], usecols=vocabularies_tables_columns["concept"]) @@ -229,80 +215,85 @@ def format_tables(tables, max_line_length=80): yield line tables_str = "\n".join(format_tables(self.tables)) - return f'OMOP object ({os.path.basename(self.base)}) with {len(self.tables)} tables.\nTables: {tables_str}' + return f"OMOP object ({os.path.basename(self.base)}) with {len(self.tables)} tables.\nTables: {tables_str}" def set_path(self, table_name, file_path): # TODO move to init self.tables.append(table_name) self.filepath[table_name] = file_path - + def check_with_omop_cdm(self): for file_name, path in self.filepath.items(): if file_name not in set(field_level.cdmTableName): - raise KeyError(f"Table [{file_name}] is not defined in OMOP CDM v5.4! Please change the table name manually!") + raise KeyError( + f"Table [{file_name}] is not defined in OMOP CDM v5.4! Please change the table name manually!" 
+ ) # If not a single file, read the first one if not os.path.isfile(path): folder_walk = os.walk(path) first_file_in_folder = next(folder_walk)[2][0] path = os.path.join(path, first_file_in_folder) - - if path.endswith('csv'): - with open(path, "r") as f: + + if path.endswith("csv"): + with open(path) as f: dict_reader = csv.DictReader(f, delimiter=self.delimiter) columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith('parquet'): + columns = list(filter(None, columns)) + elif path.endswith("parquet"): df = dd.read_parquet(path) columns = list(df.columns) else: raise TypeError("Only support CSV and Parquet file!") columns_lowercase = [column.lower() for column in columns] - + invalid_column_name = [] for _, column in enumerate(columns_lowercase): - cdm_columns = set(field_level[field_level.cdmTableName == file_name]['cdmFieldName']) + cdm_columns = set(field_level[field_level.cdmTableName == file_name]["cdmFieldName"]) if column not in cdm_columns: invalid_column_name.append(column) if len(invalid_column_name) > 0: - print(f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}") + print( + f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}" + ) raise KeyError - - # TODO redo this using omop cdm csv file - def _get_column_types(self, - path: str = None, - filename: str = None): + # TODO redo this using omop cdm csv file + def _get_column_types(self, path: str = None, filename: str = None): column_types = {} # If not a single file, read the first one if not os.path.isfile(path): folder_walk = os.walk(path) first_file_in_folder = next(folder_walk)[2][0] path = os.path.join(path, first_file_in_folder) - - if path.endswith('csv'): - with open(path, "r") as f: + + if path.endswith("csv"): + with open(path) as f: dict_reader = csv.DictReader(f, delimiter=self.delimiter) columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith('parquet'): + columns = list(filter(None, columns)) + elif path.endswith("parquet"): df = dd.read_parquet(path) columns = list(df.columns) else: raise TypeError("Only support CSV and Parquet file!") columns_lowercase = [column.lower() for column in columns] for _, column in enumerate(columns_lowercase): - column_types[column] = dtype_mapping[field_level[(field_level.cdmTableName == filename) & (field_level.cdmFieldName == column)]['cdmDatatype'].values[0]] + column_types[column] = dtype_mapping[ + field_level[(field_level.cdmTableName == filename) & (field_level.cdmFieldName == column)][ + "cdmDatatype" + ].values[0] + ] return column_types - + def _read_table(self, path, dtype=None, parse_dates=None, index=None, usecols=None, use_dask=False, **kwargs): - + if use_dask: if not os.path.isfile(path): folder_walk = os.walk(path) filetype = next(folder_walk)[2][0].split(".")[-1] else: filetype = path.split(".")[-1] - if filetype == 'csv': + if filetype == "csv": if not os.path.isfile(path): path = f"{path}/*.csv" if usecols: @@ -310,7 +301,7 @@ def _read_table(self, path, dtype=None, parse_dates=None, index=None, usecols=No if parse_dates: parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} df = dd.read_csv(path, 
delimiter=self.delimiter, dtype=dtype, parse_dates=parse_dates, usecols=usecols) - elif filetype == 'parquet': + elif filetype == "parquet": if not os.path.isfile(path): path = f"{path}/*.parquet" if usecols: @@ -324,13 +315,13 @@ def _read_table(self, path, dtype=None, parse_dates=None, index=None, usecols=No if not os.path.isfile(path): raise TypeError("Only support reading a single file!") filetype = path.split(".")[-1] - if filetype == 'csv': + if filetype == "csv": if usecols: dtype = {key: dtype[key] for key in usecols if key in dtype} if parse_dates: parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} df = pd.read_csv(path, delimiter=self.delimiter, dtype=dtype, parse_dates=parse_dates, usecols=usecols) - elif filetype == 'parquet': + elif filetype == "parquet": df = pd.read_parquet(path, columns=usecols) else: raise TypeError("Only support CSV and Parquet file!") @@ -338,7 +329,7 @@ def _read_table(self, path, dtype=None, parse_dates=None, index=None, usecols=No if index: df = df.set_index(index) return df - + @property def clinical_tables(self): """ @@ -422,11 +413,13 @@ def load(self, level="stay_level", tables=["visit_occurrence", "person", "death" for table in tables: print(f"reading table [{table}]") - column_types = self._get_column_types(path = self.filepath[table], filename=table) - df = self._read_table(self.filepath[table], dtype=column_types, index='person_id') # TODO parse_dates = parse_dates + column_types = self._get_column_types(path=self.filepath[table], filename=table) + df = self._read_table( + self.filepath[table], dtype=column_types, index="person_id" + ) # TODO parse_dates = parse_dates if remove_empty_column: # TODO dask Support - #columns = [column for column in df.columns if not df[column].compute().isna().all()] + # columns = [column for column in df.columns if not df[column].compute().isna().all()] columns = [column for column in df.columns if not df[column].isna().all()] df = df.loc[:, columns] setattr(self, table, df) @@ -438,14 +431,14 @@ def load(self, level="stay_level", tables=["visit_occurrence", "person", "death" # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] # TODO dask Support joined_table = pd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how="left") - + joined_table = pd.merge(joined_table, self.death, left_index=True, right_index=True, how="left") - + # TODO dask Support - #joined_table = joined_table.compute() - - # TODO check this earlier - joined_table = joined_table.drop_duplicates(subset='visit_occurrence_id') + # joined_table = joined_table.compute() + + # TODO check this earlier + joined_table = joined_table.drop_duplicates(subset="visit_occurrence_id") joined_table = joined_table.set_index("visit_occurrence_id") # obs_only_list = list(self.joined_table.columns) # obs_only_list.remove('visit_occurrence_id') @@ -454,7 +447,7 @@ def load(self, level="stay_level", tables=["visit_occurrence", "person", "death" joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only ) # TODO this needs to be fixed because anndata set obs index as string by default - #adata.obs.index = adata.obs.index.astype(int) + # adata.obs.index = adata.obs.index.astype(int) """ for column in self.measurement.columns: @@ -463,14 +456,14 @@ def load(self, level="stay_level", tables=["visit_occurrence", "person", "death" for visit_occurrence_id in adata.obs.index: 
obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) adata.obsm[column]= ak.Array(obs_list) - + for column in self.drug_exposure.columns: if column != 'visit_occurrence_id': obs_list = [] for visit_occurrence_id in adata.obs.index: obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) adata.obsm[column]= ak.Array(obs_list) - + for column in self.observation.columns: if column != 'visit_occurrence_id': obs_list = [] @@ -480,7 +473,7 @@ def load(self, level="stay_level", tables=["visit_occurrence", "person", "death" """ return adata - + def feature_counts( self, source: Literal[ @@ -493,26 +486,27 @@ def feature_counts( "condition_occurrence", ], number=20, - key = None - ): - - if source == 'measurement': + key=None, + ): + + if source == "measurement": columns = ["value_as_number", "time", "visit_occurrence_id", "measurement_concept_id"] - elif source == 'observation': + elif source == "observation": columns = ["value_as_number", "value_as_string", "measurement_datetime"] - elif source == 'condition_occurrence': + elif source == "condition_occurrence": columns = None else: raise KeyError(f"Extracting data from {source} is not supported yet") - - column_types = self._get_column_types(path = self.filepath[source], filename=source) - df_source = self._read_table(self.filepath[source], dtype=column_types, usecols=[f"{source}_concept_id"], use_dask=True) + + column_types = self._get_column_types(path=self.filepath[source], filename=source) + df_source = self._read_table( + self.filepath[source], dtype=column_types, usecols=[f"{source}_concept_id"], use_dask=True + ) # TODO dask Support - #feature_counts = df_source[f"{source}_concept_id"].value_counts().compute()[0:number] + # feature_counts = df_source[f"{source}_concept_id"].value_counts().compute()[0:number] feature_counts = df_source[f"{source}_concept_id"].value_counts().compute() feature_counts = feature_counts.to_frame().reset_index(drop=False)[0:number] - feature_counts[f"{source}_concept_id_1"], feature_counts[f"{source}_concept_id_2"] = self.map_concept_id( feature_counts[f"{source}_concept_id"], verbose=False ) @@ -537,15 +531,17 @@ def map_concept_id(self, concept_id: Union[str, List], verbose=True): concept_id_1 = [] concept_id_2 = [] concept_id_mapped_not_found = [] - + if "concept_relationship" in self.tables: - column_types = self._get_column_types(path = self.filepath["concept_relationship"], filename="concept_relationship") - df_concept_relationship = self._read_csv( - self.filepath["concept_relationship"], dtype=column_types + column_types = self._get_column_types( + path=self.filepath["concept_relationship"], filename="concept_relationship" ) + df_concept_relationship = self._read_csv(self.filepath["concept_relationship"], dtype=column_types) # TODO dask Support - #df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], - df_concept_relationship.dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], + # df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], + df_concept_relationship.dropna( + subset=["concept_id_1", "concept_id_2", "relationship_id"], 
inplace=True + ) # , usecols=vocabularies_tables_columns["concept_relationship"], concept_relationship_dict = df_to_dict( df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Maps to"], key="concept_id_1", @@ -571,11 +567,13 @@ def map_concept_id(self, concept_id: Union[str, List], verbose=True): if len(concept_id_mapped_not_found) > 0: # warnings.warn(f"Couldn't find a map for concept {id} in concept_relationship table!") if verbose: - rprint(f"Couldn't find a map for concept {concept_id_mapped_not_found} in concept_relationship table!") + rprint( + f"Couldn't find a map for concept {concept_id_mapped_not_found} in concept_relationship table!" + ) else: concept_id_1 = concept_id concept_id_2 = concept_id - + if len(concept_id_1) == 1: return concept_id_1[0], concept_id_2[0] else: @@ -585,11 +583,13 @@ def get_concept_name(self, concept_id: Union[str, List], raise_error=False, verb if isinstance(concept_id, numbers.Integral): concept_id = [concept_id] - column_types = self._get_column_types(path = self.filepath["concept"], filename="concept") + column_types = self._get_column_types(path=self.filepath["concept"], filename="concept") df_concept = self._read_table(self.filepath["concept"], dtype=column_types) # TODO dask Support - #df_concept.compute().dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] - df_concept.dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] + # df_concept.compute().dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] + df_concept.dropna( + subset=["concept_id", "concept_name"], inplace=True, ignore_index=True + ) # usecols=vocabularies_tables_columns["concept"] concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") concept_name = [] concept_name_not_found = [] @@ -611,7 +611,7 @@ def get_concept_name(self, concept_id: Union[str, List], raise_error=False, verb return concept_name def extract_note(self, adata, source="note"): - column_types = self._get_column_types(path = self.filepath[source], filename=source) + column_types = self._get_column_types(path=self.filepath[source], filename=source) df_source = dd.read_csv(self.filepath[source], dtype=column_types) if columns is None: columns = df_source.columns @@ -631,7 +631,6 @@ def note_nlp_map( # Got some inspirations from: https://github.com/aws-samples/amazon-comprehend-medical-omop-notes-mapping pass - def get_feature_info( self, adata, @@ -648,7 +647,6 @@ def get_feature_info( key: str = None, ignore_not_shown_in_concept_table: bool = True, exact_match: bool = True, - verbose: bool = False, ): if key is None: @@ -670,7 +668,7 @@ def get_feature_info( # TODO support features name if "concept" in self.tables: - column_types = self._get_column_types(path = self.filepath["concept"], filename="concept") + column_types = self._get_column_types(path=self.filepath["concept"], filename="concept") df_concept = self._read_table(self.filepath["concept"], dtype=column_types).dropna( subset=["concept_id", "concept_name"] ) # usecols=vocabularies_tables_columns["concept"], @@ -744,10 +742,17 @@ def get_feature_info( "Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] you want to extarct" ) raise TypeError - - info_df = pd.concat([info_df, pd.DataFrame(data=[[feature_name, feature_id_1, feature_id_2]], columns=['feature_name', 
'feature_id_1', 'feature_id_2'])]) - - + + info_df = pd.concat( + [ + info_df, + pd.DataFrame( + data=[[feature_name, feature_id_1, feature_id_2]], + columns=["feature_name", "feature_id_1", "feature_id_2"], + ), + ] + ) + # feature_name_list.append(feature_name) # domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0]) # concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0]) @@ -764,8 +769,8 @@ def get_feature_info( rprint( f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match socre = [green]{match_score}." ) - if info_df[f"feature_id_1"].equals(info_df[f"feature_id_2"]): - info_df.drop(f"feature_id_2", axis=1, inplace=True) + if info_df["feature_id_1"].equals(info_df["feature_id_2"]): + info_df.drop("feature_id_2", axis=1, inplace=True) info_df = info_df.rename(columns={"feature_id_1": "feature_id"}) info_df = info_df.reset_index(drop=True) else: @@ -786,8 +791,10 @@ def get_feature_statistics( ], features: str or int or List[Union[str, int]] = None, level="stay_level", - value_col: str = 'value_source_value', - aggregation_methods: Union[Literal["min", "max", "mean", "std", "count"], List[Literal["min", "max", "mean", "std", "count"]]]=None, + value_col: str = "value_source_value", + aggregation_methods: Union[ + Literal["min", "max", "mean", "std", "count"], List[Literal["min", "max", "mean", "std", "count"]] + ] = None, add_aggregation_to_X: bool = True, verbose: bool = False, use_dask: bool = None, @@ -798,70 +805,69 @@ def get_feature_statistics( key = f"{source.split('_')[0]}_concept_id" else: raise KeyError(f"Extracting data from {source} is not supported yet") - - if source == 'measurement': - source_table_columns = ['visit_occurrence_id', 'measurement_datetime', key, value_col] - elif source == 'observation': - source_table_columns = ['visit_occurrence_id', "observation_datetime", key, value_col] - elif source == 'condition_occurrence': + + if source == "measurement": + source_table_columns = ["visit_occurrence_id", "measurement_datetime", key, value_col] + elif source == "observation": + source_table_columns = ["visit_occurrence_id", "observation_datetime", key, value_col] + elif source == "condition_occurrence": source_table_columns = None else: raise KeyError(f"Extracting data from {source} is not supported yet") if use_dask is None: use_dask = self.use_dask - source_column_types = self._get_column_types(path = self.filepath[source], filename=source) - df_source = self._read_table(self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask) + source_column_types = self._get_column_types(path=self.filepath[source], filename=source) + df_source = self._read_table( + self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask + ) info_df = self.get_feature_info(adata, source=source, features=features, verbose=False) - info_dict = info_df[['feature_id', 'feature_name']].set_index('feature_id').to_dict()['feature_name'] - + info_dict = info_df[["feature_id", "feature_name"]].set_index("feature_id").to_dict()["feature_name"] + # Select featrues df_source = df_source[df_source[key].isin(list(info_df.feature_id))] - #TODO Select time - #da_measurement = da_measurement[(da_measurement.time >= 0) & (da_measurement.time <= 48*60*60)] - #df_source[f'{source}_name'] = df_source[key].map(info_dict) + # TODO 
Select time + # da_measurement = da_measurement[(da_measurement.time >= 0) & (da_measurement.time <= 48*60*60)] + # df_source[f'{source}_name'] = df_source[key].map(info_dict) if aggregation_methods is None: aggregation_methods = ["min", "max", "mean", "std", "count"] - if level == 'stay_level': - result = df_source.groupby(['visit_occurrence_id', key]).agg({ - value_col: aggregation_methods}) - + if level == "stay_level": + result = df_source.groupby(["visit_occurrence_id", key]).agg({value_col: aggregation_methods}) + if use_dask: result = result.compute() result = result.reset_index(drop=False) result.columns = ["_".join(a) for a in result.columns.to_flat_index()] - result.columns = result.columns.str.removesuffix('_') - result.columns = result.columns.str.removeprefix(f'{value_col}_') - result[f'{source}_name'] = result[key].map(info_dict) + result.columns = result.columns.str.removesuffix("_") + result.columns = result.columns.str.removeprefix(f"{value_col}_") + result[f"{source}_name"] = result[key].map(info_dict) - df_statistics = result.pivot(index='visit_occurrence_id', - columns=f'{source}_name', - values=aggregation_methods) + df_statistics = result.pivot( + index="visit_occurrence_id", columns=f"{source}_name", values=aggregation_methods + ) df_statistics.columns = df_statistics.columns.swaplevel() df_statistics.columns = ["_".join(a) for a in df_statistics.columns.to_flat_index()] - # TODO sort_columns = True if sort_columns: new_column_order = [] for feature in features: - for suffix in (f'_{aggregation_method}' for aggregation_method in aggregation_methods): - col_name = f'{feature}{suffix}' + for suffix in (f"_{aggregation_method}" for aggregation_method in aggregation_methods): + col_name = f"{feature}{suffix}" if col_name in df_statistics.columns: new_column_order.append(col_name) df_statistics.columns = new_column_order - + df_statistics.index = df_statistics.index.astype(str) - - adata.obs = adata.obs.join(df_statistics, how='left') - + + adata.obs = adata.obs.join(df_statistics, how="left") + if add_aggregation_to_X: adata = ep.ad.move_to_x(adata, list(df_statistics.columns)) return adata - def extract_features( self, adata, @@ -880,44 +886,50 @@ def extract_features( verbose: Optional[bool] = True, use_dask: bool = None, ): - + if source in ["measurement", "observation", "specimen"]: key = f"{source}_concept_id" elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: key = f"{source.split('_')[0]}_concept_id" else: raise KeyError(f"Extracting data from {source} is not supported yet") - + if source_table_columns is None: - if source == 'measurement': - source_table_columns = ['visit_occurrence_id', 'measurement_datetime', 'value_as_number', key] - elif source == 'observation': - source_table_columns = ['visit_occurrence_id', "value_as_number", "value_as_string", "observation_datetime", key] - elif source == 'condition_occurrence': + if source == "measurement": + source_table_columns = ["visit_occurrence_id", "measurement_datetime", "value_as_number", key] + elif source == "observation": + source_table_columns = [ + "visit_occurrence_id", + "value_as_number", + "value_as_string", + "observation_datetime", + key, + ] + elif source == "condition_occurrence": source_table_columns = None else: raise KeyError(f"Extracting data from {source} is not supported yet") if use_dask is None: - use_dask = self.use_dask - + use_dask = self.use_dask # TODO load using Dask or Dask-Awkward # Load source table using dask - source_column_types = 
self._get_column_types(path = self.filepath[source], filename=source) - df_source = self._read_table(self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask) + source_column_types = self._get_column_types(path=self.filepath[source], filename=source) + df_source = self._read_table( + self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask + ) info_df = self.get_feature_info(adata, source=source, features=features, verbose=False) - info_dict = info_df[['feature_id', 'feature_name']].set_index('feature_id').to_dict()['feature_name'] - - + info_dict = info_df[["feature_id", "feature_name"]].set_index("feature_id").to_dict()["feature_name"] + # Select featrues df_source = df_source[df_source[key].isin(list(info_df.feature_id))] - - # TODO select time period - #df_source = df_source[(df_source.time >= 0) & (df_source.time <= 48*60*60)] - #da_measurement['measurement_name'] = da_measurement.measurement_concept_id.replace(info_dict) - + + # TODO select time period + # df_source = df_source[(df_source.time >= 0) & (df_source.time <= 48*60*60)] + # da_measurement['measurement_name'] = da_measurement.measurement_concept_id.replace(info_dict) + # TODO dask caching - """ + """ from dask.cache import Cache cache = Cache(2e9) cache.register() @@ -930,52 +942,61 @@ def extract_features( else: if dropna == True: df_source = df_source.dropna() - + # Preprocess steps outside the loop - unique_visit_occurrence_ids = set(adata.obs.index)#.astype(int)) - empty_entry = {source_table_column: [] for source_table_column in source_table_columns if source_table_column not in [key, 'visit_occurrence_id'] } - - # Filter data once, if possible - filtered_data = { - feature_id: df_source[df_source[key] == feature_id] - for feature_id in set(info_dict.keys()) + unique_visit_occurrence_ids = set(adata.obs.index) # .astype(int)) + empty_entry = { + source_table_column: [] + for source_table_column in source_table_columns + if source_table_column not in [key, "visit_occurrence_id"] } + # Filter data once, if possible + filtered_data = {feature_id: df_source[df_source[key] == feature_id] for feature_id in set(info_dict.keys())} + for feature_id in set(info_dict.keys()): df_feature = filtered_data[feature_id][list(set(source_table_columns) - set([key]))] grouped = df_feature.groupby("visit_occurrence_id") if verbose: print(f"Adding feature [{info_dict[feature_id]}] into adata.obsm") - + # Use set difference and intersection more efficiently feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) # Creating the array more efficiently - adata.obsm[info_dict[feature_id]] = ak.Array([ - grouped.get_group(visit_occurrence_id)[list(set(source_table_columns) - set([key, 'visit_occurrence_id']))].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry - for visit_occurrence_id in unique_visit_occurrence_ids - ]) + adata.obsm[info_dict[feature_id]] = ak.Array( + [ + ( + grouped.get_group(visit_occurrence_id)[ + list(set(source_table_columns) - set([key, "visit_occurrence_id"])) + ].to_dict(orient="list") + if visit_occurrence_id in feature_ids + else empty_entry + ) + for visit_occurrence_id in unique_visit_occurrence_ids + ] + ) return adata - - def drop_nan(self, - adata, - key: Union[str, List[str]], - slot: Union[str, None] = 'obsm', - ): + def drop_nan( + self, + adata, + key: Union[str, List[str]], + slot: Union[str, None] = "obsm", + ): if isinstance(key, str): key_list = [key] else: key_list = key - if 
slot == 'obsm': + if slot == "obsm": for key in key_list: ak_array = adata.obsm[key] - + # Update the combined mask based on the presence of None in each field for i, field in enumerate(ak_array.fields): field_mask = ak.is_none(ak.nan_to_none(ak_array[field]), axis=1) - if i==0: + if i == 0: combined_mask = ak.full_like(field_mask, fill_value=False, dtype=bool) combined_mask = combined_mask | field_mask ak_array = ak_array[~combined_mask] @@ -984,17 +1005,18 @@ def drop_nan(self, return adata # downsampling - def aggregate_timeseries_in_bins(self, - adata, - features: Union[str, List[str]], - slot: Union[str, None] = 'obsm', - value_key: str = 'value_as_number', - time_key: str = 'measurement_datetime', - time_binning_method: Literal["floor", "ceil", "round"] = "floor", - bin_size: Union[str, Offset] = 'h', - aggregation_method: Literal['median', 'mean', 'min', 'max'] = 'median', - time_upper_bound: int = 48# TODO - ): + def aggregate_timeseries_in_bins( + self, + adata, + features: Union[str, List[str]], + slot: Union[str, None] = "obsm", + value_key: str = "value_as_number", + time_key: str = "measurement_datetime", + time_binning_method: Literal["floor", "ceil", "round"] = "floor", + bin_size: Union[str, Offset] = "h", + aggregation_method: Literal["median", "mean", "min", "max"] = "median", + time_upper_bound: int = 48, # TODO + ): if isinstance(features, str): features_list = [features] @@ -1003,12 +1025,16 @@ def aggregate_timeseries_in_bins(self, # Ensure the time_binning_method provided is one of the expected methods if time_binning_method not in ["floor", "ceil", "round"]: - raise ValueError(f"time_binning_method {time_binning_method} is not supported. Choose from 'floor', 'ceil', or 'round'.") + raise ValueError( + f"time_binning_method {time_binning_method} is not supported. Choose from 'floor', 'ceil', or 'round'." + ) - if aggregation_method not in {'median', 'mean', 'min', 'max'}: - raise ValueError(f"aggregation_method {aggregation_method} is not supported. Choose from 'median', 'mean', 'min', or 'max'.") + if aggregation_method not in {"median", "mean", "min", "max"}: + raise ValueError( + f"aggregation_method {aggregation_method} is not supported. Choose from 'median', 'mean', 'min', or 'max'." 
+ ) - if slot == 'obsm': + if slot == "obsm": for feature in features_list: print(f"processing feature [{feature}]") df = self.to_dataframe(adata, features) @@ -1018,17 +1044,21 @@ def aggregate_timeseries_in_bins(self, df[time_key] = func(bin_size) else: # TODO need to take care of this if it doesn't follow omop standard - if bin_size == 'h': + if bin_size == "h": df[time_key] = df[time_key] / 3600 func = getattr(np, time_binning_method) df[time_key] = func(df[time_key]) - + df[time_key] = df[time_key].astype(str) # Adjust time values that are equal to the time_upper_bound - #df.loc[df[time_key] == time_upper_bound, time_key] = time_upper_bound - 1 - + # df.loc[df[time_key] == time_upper_bound, time_key] = time_upper_bound - 1 + # Group and aggregate data - df = df.groupby(["visit_occurrence_id", time_key])[value_key].agg(aggregation_method).reset_index(drop=False) + df = ( + df.groupby(["visit_occurrence_id", time_key])[value_key] + .agg(aggregation_method) + .reset_index(drop=False) + ) grouped = df.groupby("visit_occurrence_id") unique_visit_occurrence_ids = adata.obs.index @@ -1037,50 +1067,63 @@ def aggregate_timeseries_in_bins(self, # Efficiently use set difference and intersection feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) # Efficiently create the array - ak_array = ak.Array([ - grouped.get_group(visit_occurrence_id)[[value_key, time_key]].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry - for visit_occurrence_id in unique_visit_occurrence_ids - ]) + ak_array = ak.Array( + [ + ( + grouped.get_group(visit_occurrence_id)[[value_key, time_key]].to_dict(orient="list") + if visit_occurrence_id in feature_ids + else empty_entry + ) + for visit_occurrence_id in unique_visit_occurrence_ids + ] + ) adata.obsm[feature] = ak_array return adata - - def timeseries_discretizer(self, - adata, - key: Union[str, List[str]], - slot: Union[str, None] = 'obsm', - value_key: str = 'value_as_number', - time_key: str = 'measurement_datetime', - freq: str = 'hour', #TODO - time_limit: int = 48, #TODO - method: str = 'median' #TODO - ): - - pass - - - - def from_dataframe( + + def timeseries_discretizer( self, adata, - feature: str, - df + key: Union[str, List[str]], + slot: Union[str, None] = "obsm", + value_key: str = "value_as_number", + time_key: str = "measurement_datetime", + freq: str = "hour", # TODO + time_limit: int = 48, # TODO + method: str = "median", # TODO ): + + pass + + def from_dataframe(self, adata, feature: str, df): grouped = df.groupby("visit_occurrence_id") unique_visit_occurrence_ids = set(adata.obs.index) # Use set difference and intersection more efficiently feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) - empty_entry = {source_table_column: [] for source_table_column in set(df.columns) if source_table_column not in ['visit_occurrence_id'] } + empty_entry = { + source_table_column: [] + for source_table_column in set(df.columns) + if source_table_column not in ["visit_occurrence_id"] + } # Creating the array more efficiently - ak_array = ak.Array([ - grouped.get_group(visit_occurrence_id)[list(set(df.columns) - set(['visit_occurrence_id']))].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry - for visit_occurrence_id in unique_visit_occurrence_ids]) + ak_array = ak.Array( + [ + ( + grouped.get_group(visit_occurrence_id)[ + list(set(df.columns) - set(["visit_occurrence_id"])) + ].to_dict(orient="list") + if visit_occurrence_id in feature_ids + else empty_entry + 
) + for visit_occurrence_id in unique_visit_occurrence_ids + ] + ) adata.obsm[feature] = ak_array - + return adata - + # TODO add function to check feature and add concept # More IO functions def to_dataframe( @@ -1102,28 +1145,26 @@ def to_dataframe( df["entry"] = adata.obs.index[df["entry"]] df = df.rename(columns={"entry": "visit_occurrence_id"}) del df["subentry"] - for col in df.columns: - if col.endswith('time'): + for col in df.columns: + if col.endswith("time"): df[col] = pd.to_datetime(df[col]) - - df['feature_name'] = feature - df_concat = pd.concat([df_concat, df], axis= 0) - - - return df_concat + df["feature_name"] = feature + df_concat = pd.concat([df_concat, df], axis=0) + + return df_concat - def plot_timeseries(self, - adata, - visit_occurrence_id: int, - key: Union[str, List[str]], - slot: Union[str, None] = 'obsm', - value_key: str = 'value_as_number', - time_key: str = 'measurement_datetime', - x_label: str = None + def plot_timeseries( + self, + adata, + visit_occurrence_id: int, + key: Union[str, List[str]], + slot: Union[str, None] = "obsm", + value_key: str = "value_as_number", + time_key: str = "measurement_datetime", + x_label: str = None, ): - - + if isinstance(key, str): key_list = [key] else: @@ -1133,7 +1174,7 @@ def plot_timeseries(self, min_x = None max_x = None - if slot == 'obsm': + if slot == "obsm": fig, ax = plt.subplots(figsize=(20, 6)) # Scatter plot for i, key in enumerate(key_list): @@ -1145,30 +1186,27 @@ def plot_timeseries(self, if not x.empty: ax.scatter(x=x, y=y, label=key) ax.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=len(key_list), prop={"size": 14}) - + ax.plot(x, y) - if min_x is None or min_x > x.min(): min_x = x.min() if max_x is None or max_x < x.max(): max_x = x.max() - - + else: # Skip this iteration if x is empty continue - + if min_x is not None and max_x is not None: - + # Adapt this to input data # TODO step - #plt.xticks(np.arange(min_x, max_x, step=1)) + # plt.xticks(np.arange(min_x, max_x, step=1)) # Adapt this to input data plt.xlabel(x_label if x_label else "Hours since ICU admission") - - plt.show() + plt.show() def violin( self, @@ -1224,7 +1262,8 @@ def violin( **kwds: Are passed to :func:`~seaborn.violinplot`. - Returns: + Returns + ------- A :class:`~matplotlib.axes.Axes` object if `ax` is `None` else `None`. Example: @@ -1242,19 +1281,18 @@ def violin( Preview: .. 
image:: /_static/docstring_previews/violin.png """ - if obsm_key: df = self.to_dataframe(adata, features=obsm_key) df = df[["visit_occurrence_id", "value_as_number"]] - df = df.rename(columns = {"value_as_number": obsm_key}) - + df = df.rename(columns={"value_as_number": obsm_key}) + if groupby: - df = df.set_index('visit_occurrence_id').join(adata.obs[groupby].to_frame()).reset_index(drop=False) - adata = ep.ad.df_to_anndata(df, columns_obs_only=['visit_occurrence_id', groupby]) + df = df.set_index("visit_occurrence_id").join(adata.obs[groupby].to_frame()).reset_index(drop=False) + adata = ep.ad.df_to_anndata(df, columns_obs_only=["visit_occurrence_id", groupby]) else: - adata = ep.ad.df_to_anndata(df, columns_obs_only=['visit_occurrence_id']) - keys=obsm_key - + adata = ep.ad.df_to_anndata(df, columns_obs_only=["visit_occurrence_id"]) + keys = obsm_key + violin_partial = partial( sc.pl.violin, keys=keys, @@ -1273,10 +1311,10 @@ def violin( show=show, save=save, ax=ax, - **kwds,) - - return violin_partial(adata=adata, groupby=groupby) + **kwds, + ) + return violin_partial(adata=adata, groupby=groupby) def qc_lab_measurements( self, @@ -1301,7 +1339,7 @@ def qc_lab_measurements( if copy: adata = adata.copy() - preprocessing_dir = '/Users/xinyuezhang/ehrapy/ehrapy/preprocessing' + preprocessing_dir = "/Users/xinyuezhang/ehrapy/ehrapy/preprocessing" if reference_table is None: reference_table = pd.read_csv( f"{preprocessing_dir}/laboratory_reference_tables/laposata.tsv", sep="\t", index_col="Measurement" @@ -1355,7 +1393,7 @@ def qc_lab_measurements( actual_measurements = adata[:, measurement].layers[layer] else: if obsm_measurements: - actual_measurements = adata.obsm[measurement]['value_as_number'] + actual_measurements = adata.obsm[measurement]["value_as_number"] ak_measurements = adata.obsm[measurement] else: actual_measurements = adata[:, measurement].X @@ -1371,14 +1409,16 @@ def qc_lab_measurements( rprint(f"[bold blue]Using upperbound [green]{upperbound}") upperbound_check_results = actual_measurements < upperbound if isinstance(actual_measurements, ak.Array): - if action == 'remove': + if action == "remove": if verbose: - rprint(f"Removing {ak.count(actual_measurements) - ak.count(actual_measurements[upperbound_check_results])} outliers") + rprint( + f"Removing {ak.count(actual_measurements) - ak.count(actual_measurements[upperbound_check_results])} outliers" + ) adata.obsm[measurement] = ak_measurements[upperbound_check_results] else: - upperbound_check_results_array: np.ndarray = upperbound_check_results.copy() + upperbound_check_results_array: np.ndarray = upperbound_check_results.copy() adata.obs[f"{measurement} normal"] = upperbound_check_results_array - + elif ">" in check_str: lower_bound = float(check_str.replace(">", "")) if verbose: @@ -1386,7 +1426,7 @@ def qc_lab_measurements( lower_bound_check_results = actual_measurements > lower_bound if isinstance(actual_measurements, ak.Array): - if action == 'remove': + if action == "remove": adata.obsm[measurement] = ak_measurements[lower_bound_check_results] else: adata.obs[f"{measurement} normal"] = lower_bound_check_results_array @@ -1399,7 +1439,7 @@ def qc_lab_measurements( range_check_results = (actual_measurements >= min_value) & (actual_measurements <= max_value) if isinstance(actual_measurements, ak.Array): - if action == 'remove': + if action == "remove": adata.obsm[measurement] = ak_measurements[range_check_results] else: adata.obs[f"{measurement} normal"] = range_check_results_array From 
1163cd0b0e49ccde3ce0009e8a58679764251774 Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Wed, 14 Feb 2024 12:10:54 +0100 Subject: [PATCH 03/13] ehrdata refactoring --- ehrdata.py | 1449 -------------------- ehrdata/__init__.py | 7 + ehrdata/dt/__init__.py | 1 + ehrdata/dt/_omop.py | 130 ++ ehrdata/io/__init__.py | 1 + ehrdata/io/_omop.py | 55 + ehrdata/pl/__init__.py | 1 + ehrdata/pl/_omop.py | 59 + ehrdata/pp/__init__.py | 1 + ehrdata/pp/_omop.py | 110 ++ ehrdata/tl/__init__.py | 1 + ehrdata/tl/_omop.py | 48 + ehrdata/utils/OMOP_CDMv5.4_Field_Level.csv | 551 ++++++++ ehrdata/utils/omop_utils.py | 455 ++++++ src/ehrdata/__init__.py | 7 - src/ehrdata/pl/__init__.py | 1 - src/ehrdata/pl/basic.py | 63 - src/ehrdata/pp/__init__.py | 1 - src/ehrdata/pp/basic.py | 17 - src/ehrdata/tl/__init__.py | 1 - src/ehrdata/tl/basic.py | 17 - tests/test_basic.py | 4 +- 22 files changed, 1422 insertions(+), 1558 deletions(-) delete mode 100644 ehrdata.py create mode 100644 ehrdata/__init__.py create mode 100644 ehrdata/dt/__init__.py create mode 100644 ehrdata/dt/_omop.py create mode 100644 ehrdata/io/__init__.py create mode 100644 ehrdata/io/_omop.py create mode 100644 ehrdata/pl/__init__.py create mode 100644 ehrdata/pl/_omop.py create mode 100644 ehrdata/pp/__init__.py create mode 100644 ehrdata/pp/_omop.py create mode 100644 ehrdata/tl/__init__.py create mode 100644 ehrdata/tl/_omop.py create mode 100644 ehrdata/utils/OMOP_CDMv5.4_Field_Level.csv create mode 100644 ehrdata/utils/omop_utils.py delete mode 100644 src/ehrdata/__init__.py delete mode 100644 src/ehrdata/pl/__init__.py delete mode 100644 src/ehrdata/pl/basic.py delete mode 100644 src/ehrdata/pp/__init__.py delete mode 100644 src/ehrdata/pp/basic.py delete mode 100644 src/ehrdata/tl/__init__.py delete mode 100644 src/ehrdata/tl/basic.py diff --git a/ehrdata.py b/ehrdata.py deleted file mode 100644 index f7a574b..0000000 --- a/ehrdata.py +++ /dev/null @@ -1,1449 +0,0 @@ -import csv -import glob -import numbers -import os -import warnings -from collections.abc import Sequence -from functools import partial -from typing import List, Literal, Optional, Union - -import awkward as ak -import dask.dataframe as dd -import ehrapy as ep -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd -import scanpy as sc -import seaborn as sns -from anndata import AnnData -from matplotlib.axes import Axes -from pandas.tseries.offsets import DateOffset as Offset -from rich import print as rprint -from thefuzz import process - -pth = "auxillary_files/OMOP_CDMv5.4_Field_Level.csv" -field_level = pd.read_csv(pth) -dtype_mapping = { - "integer": "Int64", - "Integer": "Int64", - "float": float, - "bigint": "Int64", - "varchar(MAX)": str, - "varchar(2000)": str, - "varchar(1000)": str, - "varchar(255)": str, - "varchar(250)": str, - "varchar(80)": str, - "varchar(60)": str, - "varchar(50)": str, - "varchar(25)": str, - "varchar(20)": str, - "varchar(10)": str, - "varchar(9)": str, - "varchar(3)": str, - "varchar(2)": str, - "varchar(1)": str, - "datetime": object, - "date": object, -} -clinical_tables_columns = { - "person": ["person_id", "year_of_birth", "gender_source_value"], - "observation_period": [], - "death": ["person_id", "death_datetime"], - "visit_occurrence": ["visit_occurrence_id", "person_id", "visit_start_datetime", "visit_end_datetime"], - "visit_detail": [], - "condition_occurrence": [], - "drug_exposure": [ - "drug_exposure_id", - "person_id", - "visit_occurrence_id", - "drug_concept_id", - ], - "procedure_occurrence": 
["visit_occurrence_id", "person_id", "visit_start_datetime", "visit_end_datetime"], - "device_exposure": [], - "specimen": [], - "measurement": [ - "measurement_id", - "person_id", - "visit_occurrence_id", - "measurement_concept_id", - "measurement_datetime", - "value_as_number", - "unit_source_value", - ], - "observation": [ - "observation_id", - "person_id", - "observation_concept_id", - "observation_datetime", - "value_as_number", - "value_as_string", - ], - "note": [], - "note_nlp": [], - "fact_relationship": [], - "procedure_occurrence": [], -} -health_system_tables_columns = { - "location": [], - "care_site": ["care_site_id", "care_site_name"], - "provider": [], -} -vocabularies_tables_columns = { - "concept": [ - "concept_id", - "concept_name", - "domain_id", - "vocabulary_id", - "concept_class_id", - "standard_concept", - "concept_code", - ], - "vocabulary": [], - "domain": [], - "concept_class": [], - "concept_synonym": [], - "concept_relationship": ["concept_id_1", "concept_id_2", "relationship_id"], - "relationship": [], - "concept_ancestor": [], - "source_to_concept_map": [], - "drug_strength": [], -} - - -from difflib import SequenceMatcher -from heapq import nlargest as _nlargest - - -def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6): - """Use SequenceMatcher to return a list of the indexes of the best - "good enough" matches. word is a sequence for which close matches - are desired (typically a string). - possibilities is a dictionary of sequences. - Optional arg n (default 2) is the maximum number of close matches to - return. n must be > 0. - Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities - that don't score at least that similar to word are ignored. - """ - if not n > 0: - raise ValueError("n must be > 0: %r" % (n,)) - if not 0.0 <= cutoff <= 1.0: - raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) - result = [] - s = SequenceMatcher() - s.set_seq2(word) - for _, (key, value) in enumerate(possibilities.items()): - s.set_seq1(value) - if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff: - result.append((s.ratio(), value, key)) - - # Move the best scorers to head of list - result = _nlargest(n, result) - - # Strip scores for the best n matches - return [(value, key, score) for score, value, key in result] - - -def df_to_dict(df, key, value): - if isinstance(df, dd.DataFrame): - return pd.Series(df[value].compute().values, index=df[key].compute()).to_dict() - else: - return pd.Series(df[value].values, index=df[key]).to_dict() - - -def check_csv_has_only_header(file_path): - if file_path.endswith("csv"): - with open(file_path) as file: - reader = csv.reader(file) - header = next(reader, None) # Read the header - if header is not None: - second_row = next(reader, None) # Try to read the next row - return second_row is None # If there's no second row, return True - else: - return False # File is empty or not a valid CSV - else: - return False - - -class OMOP: - def __init__(self, folder_path, delimiter=None, make_filename_lowercase=True, use_dask=False): - self.base = folder_path - self.delimiter = delimiter - self.use_dask = use_dask - # TODO support also parquet and other formats - file_list = glob.glob(os.path.join(folder_path, "*.csv")) + glob.glob(os.path.join(folder_path, "*.parquet")) - self.loaded_tabel = None - self.filepath = {} - for file_path in file_list: - file_name = file_path.split("/")[-1].split(".")[0] - if check_csv_has_only_header(file_path): - pass - else: - # 
Rename the file - if make_filename_lowercase: - new_filepath = os.path.join(self.base, file_path.split("/")[-1].lower()) - if file_path != new_filepath: - warnings("Rename file [file_path] to [new_filepath]") - os.rename(file_path, new_filepath) - self.filepath[file_name] = new_filepath - else: - self.filepath[file_name] = file_path - self.check_with_omop_cdm() - self.tables = list(self.filepath.keys()) - - """ - if "concept" in self.tables: - df_concept = dd.read_csv(self.filepath["concept"], usecols=vocabularies_tables_columns["concept"]) - self.concept_id_to_name = dict(zip(df_concept['id'], df_concept['name'])) - self.concept_name_to_id = dict(zip(df_concept['name'], df_concept['id'])) - """ - - def __repr__(self) -> str: - # TODO this should be seperated by diff table categories - def format_tables(tables, max_line_length=80): - line = "" - for table in tables: - # Check if adding the next table would exceed the max line length - if len(line) + len(table) > max_line_length: - # Yield the current line and start a new one - yield line - line = table - else: - # Add the table to the current line - line += table if line == "" else ", " + table - # Yield the last line - yield line - - tables_str = "\n".join(format_tables(self.tables)) - return f"OMOP object ({os.path.basename(self.base)}) with {len(self.tables)} tables.\nTables: {tables_str}" - - def set_path(self, table_name, file_path): - # TODO move to init - self.tables.append(table_name) - self.filepath[table_name] = file_path - - def check_with_omop_cdm(self): - for file_name, path in self.filepath.items(): - if file_name not in set(field_level.cdmTableName): - raise KeyError( - f"Table [{file_name}] is not defined in OMOP CDM v5.4! Please change the table name manually!" - ) - # If not a single file, read the first one - if not os.path.isfile(path): - folder_walk = os.walk(path) - first_file_in_folder = next(folder_walk)[2][0] - path = os.path.join(path, first_file_in_folder) - - if path.endswith("csv"): - with open(path) as f: - dict_reader = csv.DictReader(f, delimiter=self.delimiter) - columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith("parquet"): - df = dd.read_parquet(path) - columns = list(df.columns) - else: - raise TypeError("Only support CSV and Parquet file!") - columns_lowercase = [column.lower() for column in columns] - - invalid_column_name = [] - for _, column in enumerate(columns_lowercase): - cdm_columns = set(field_level[field_level.cdmTableName == file_name]["cdmFieldName"]) - if column not in cdm_columns: - invalid_column_name.append(column) - if len(invalid_column_name) > 0: - print( - f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! 
Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}" - ) - raise KeyError - - # TODO redo this using omop cdm csv file - def _get_column_types(self, path: str = None, filename: str = None): - column_types = {} - # If not a single file, read the first one - if not os.path.isfile(path): - folder_walk = os.walk(path) - first_file_in_folder = next(folder_walk)[2][0] - path = os.path.join(path, first_file_in_folder) - - if path.endswith("csv"): - with open(path) as f: - dict_reader = csv.DictReader(f, delimiter=self.delimiter) - columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith("parquet"): - df = dd.read_parquet(path) - columns = list(df.columns) - else: - raise TypeError("Only support CSV and Parquet file!") - columns_lowercase = [column.lower() for column in columns] - for _, column in enumerate(columns_lowercase): - column_types[column] = dtype_mapping[ - field_level[(field_level.cdmTableName == filename) & (field_level.cdmFieldName == column)][ - "cdmDatatype" - ].values[0] - ] - return column_types - - def _read_table(self, path, dtype=None, parse_dates=None, index=None, usecols=None, use_dask=False, **kwargs): - - if use_dask: - if not os.path.isfile(path): - folder_walk = os.walk(path) - filetype = next(folder_walk)[2][0].split(".")[-1] - else: - filetype = path.split(".")[-1] - if filetype == "csv": - if not os.path.isfile(path): - path = f"{path}/*.csv" - if usecols: - dtype = {key: dtype[key] for key in usecols if key in dtype} - if parse_dates: - parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = dd.read_csv(path, delimiter=self.delimiter, dtype=dtype, parse_dates=parse_dates, usecols=usecols) - elif filetype == "parquet": - if not os.path.isfile(path): - path = f"{path}/*.parquet" - if usecols: - dtype = {key: dtype[key] for key in usecols if key in dtype} - if parse_dates: - parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = dd.read_parquet(path, dtype=dtype, parse_dates=parse_dates, columns=usecols) - else: - raise TypeError("Only support CSV and Parquet file!") - else: - if not os.path.isfile(path): - raise TypeError("Only support reading a single file!") - filetype = path.split(".")[-1] - if filetype == "csv": - if usecols: - dtype = {key: dtype[key] for key in usecols if key in dtype} - if parse_dates: - parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = pd.read_csv(path, delimiter=self.delimiter, dtype=dtype, parse_dates=parse_dates, usecols=usecols) - elif filetype == "parquet": - df = pd.read_parquet(path, columns=usecols) - else: - raise TypeError("Only support CSV and Parquet file!") - - if index: - df = df.set_index(index) - return df - - @property - def clinical_tables(self): - """ - A dictionary containing all of the ``Clinical`` OMOP CDM tables in the connected database. - """ - table_names = [ - "person", - "observation_period", - "specimen", - "death", - "visit_occurrence", - "visit_detail", - "procedure_occurrence", - "drug_exposure", - "device_exposure", - "condition_occurrence", - "measurement", - "note", - "note_nlp", - "observation", - "fact_relationship", - ] - return [table_name for table_name in self.tables if table_name in table_names] - - @property - def vocabularies_tables(self): - """ - A dictionary containing all of the ``Vocabularies`` OMOP CDM tables in the connected database. 
- """ - table_names = [ - "concept", - "vocabulary", - "domain", - "concept_class", - "concept_relationship", - "relationship", - "concept_synonym", - "concept_ancestor", - "source_to_concept_map", - "drug_strength", - ] - return [table_name for table_name in self.tables if table_name in table_names] - - @property - def metadata_tables(self): - """ - A dictionary containing all of the ``MetaData`` OMOP CDM tables in the connected database. - """ - table_names = ["cdm_source", "metadata"] - return [table_name for table_name in self.tables if table_name in table_names] - - @property - def health_system_tables(self): - """ - A dictionary containing all of the ``Health System`` OMOP CDM tables in the connected database. - """ - table_names = ["location", "care_site", "provider"] - return [table_name for table_name in self.tables if table_name in table_names] - - @property - def derived_elements_tables(self): - """ - A dictionary containing all of the ``Derived Elements`` OMOP CDM tables in the connected database. - """ - table_names = ["cohort", "cohort_definition", "drug_era", "dose_era", "condition_era"] - return [table_name for table_name in self.tables if table_name in table_names] - - @property - def health_economics_tables(self): - """ - A dictionary containing all of the ``Health Economics`` OMOP CDM tables in the connected database. - """ - table_names = ["payer_plan_period", "cost"] - return [table_name for table_name in self.tables if table_name in table_names] - - def load(self, level="stay_level", tables=["visit_occurrence", "person", "death"], remove_empty_column=True): - # TODO patient level and hospital level - if level == "stay_level": - index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} - # TODO Only support clinical_tables_columns - - for table in tables: - print(f"reading table [{table}]") - column_types = self._get_column_types(path=self.filepath[table], filename=table) - df = self._read_table( - self.filepath[table], dtype=column_types, index="person_id" - ) # TODO parse_dates = parse_dates - if remove_empty_column: - # TODO dask Support - # columns = [column for column in df.columns if not df[column].compute().isna().all()] - columns = [column for column in df.columns if not df[column].isna().all()] - df = df.loc[:, columns] - setattr(self, table, df) - - # concept_id_list = list(self.concept.concept_id) - # concept_name_list = list(self.concept.concept_id) - # concept_domain_id_list = list(set(self.concept.domain_id)) - - # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] - # TODO dask Support - joined_table = pd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how="left") - - joined_table = pd.merge(joined_table, self.death, left_index=True, right_index=True, how="left") - - # TODO dask Support - # joined_table = joined_table.compute() - - # TODO check this earlier - joined_table = joined_table.drop_duplicates(subset="visit_occurrence_id") - joined_table = joined_table.set_index("visit_occurrence_id") - # obs_only_list = list(self.joined_table.columns) - # obs_only_list.remove('visit_occurrence_id') - columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"])) - adata = ep.ad.df_to_anndata( - joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only - ) - # TODO this needs to be fixed because anndata set obs index as string by default - # adata.obs.index = 
adata.obs.index.astype(int) - - """ - for column in self.measurement.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.drug_exposure.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.observation.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - """ - - return adata - - def feature_counts( - self, - source: Literal[ - "observation", - "measurement", - "procedure_occurrence", - "specimen", - "device_exposure", - "drug_exposure", - "condition_occurrence", - ], - number=20, - key=None, - ): - - if source == "measurement": - columns = ["value_as_number", "time", "visit_occurrence_id", "measurement_concept_id"] - elif source == "observation": - columns = ["value_as_number", "value_as_string", "measurement_datetime"] - elif source == "condition_occurrence": - columns = None - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - column_types = self._get_column_types(path=self.filepath[source], filename=source) - df_source = self._read_table( - self.filepath[source], dtype=column_types, usecols=[f"{source}_concept_id"], use_dask=True - ) - # TODO dask Support - # feature_counts = df_source[f"{source}_concept_id"].value_counts().compute()[0:number] - feature_counts = df_source[f"{source}_concept_id"].value_counts().compute() - feature_counts = feature_counts.to_frame().reset_index(drop=False)[0:number] - - feature_counts[f"{source}_concept_id_1"], feature_counts[f"{source}_concept_id_2"] = self.map_concept_id( - feature_counts[f"{source}_concept_id"], verbose=False - ) - feature_counts["feature_name"] = self.get_concept_name(feature_counts[f"{source}_concept_id_1"]) - if feature_counts[f"{source}_concept_id_1"].equals(feature_counts[f"{source}_concept_id_2"]): - feature_counts.drop(f"{source}_concept_id_2", axis=1, inplace=True) - feature_counts.rename(columns={f"{source}_concept_id_1": f"{source}_concept_id"}) - feature_counts = feature_counts.reindex(columns=["feature_name", f"{source}_concept_id", "count"]) - else: - feature_counts = feature_counts.reindex( - columns=["feature_name", f"{source}_concept_id_1", f"{source}_concept_id_2", "count"] - ) - - ax = sns.barplot(feature_counts, x="feature_name", y="count") - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - plt.tight_layout() - return feature_counts - - def map_concept_id(self, concept_id: Union[str, List], verbose=True): - if isinstance(concept_id, numbers.Integral): - concept_id = [concept_id] - concept_id_1 = [] - concept_id_2 = [] - concept_id_mapped_not_found = [] - - if "concept_relationship" in self.tables: - column_types = self._get_column_types( - path=self.filepath["concept_relationship"], filename="concept_relationship" - ) - df_concept_relationship = self._read_csv(self.filepath["concept_relationship"], dtype=column_types) - # TODO dask Support - # 
df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], - df_concept_relationship.dropna( - subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True - ) # , usecols=vocabularies_tables_columns["concept_relationship"], - concept_relationship_dict = df_to_dict( - df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Maps to"], - key="concept_id_1", - value="concept_id_2", - ) - concept_relationship_dict_reverse = df_to_dict( - df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Mapped from"], - key="concept_id_1", - value="concept_id_2", - ) - for id in concept_id: - try: - concept_id_2.append(concept_relationship_dict[id]) - concept_id_1.append(id) - except KeyError: - try: - concept_id_1.append(concept_relationship_dict_reverse[id]) - concept_id_2.append(id) - except KeyError: - concept_id_1.append(id) - concept_id_2.append(id) - concept_id_mapped_not_found.append(id) - if len(concept_id_mapped_not_found) > 0: - # warnings.warn(f"Couldn't find a map for concept {id} in concept_relationship table!") - if verbose: - rprint( - f"Couldn't find a map for concept {concept_id_mapped_not_found} in concept_relationship table!" - ) - else: - concept_id_1 = concept_id - concept_id_2 = concept_id - - if len(concept_id_1) == 1: - return concept_id_1[0], concept_id_2[0] - else: - return concept_id_1, concept_id_2 - - def get_concept_name(self, concept_id: Union[str, List], raise_error=False, verbose=True): - if isinstance(concept_id, numbers.Integral): - concept_id = [concept_id] - - column_types = self._get_column_types(path=self.filepath["concept"], filename="concept") - df_concept = self._read_table(self.filepath["concept"], dtype=column_types) - # TODO dask Support - # df_concept.compute().dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] - df_concept.dropna( - subset=["concept_id", "concept_name"], inplace=True, ignore_index=True - ) # usecols=vocabularies_tables_columns["concept"] - concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") - concept_name = [] - concept_name_not_found = [] - for id in concept_id: - try: - concept_name.append(concept_dict[id]) - except KeyError: - concept_name.append(id) - concept_name_not_found.append(id) - if len(concept_name_not_found) > 0: - # warnings.warn(f"Couldn't find concept {id} in concept table!") - if verbose: - rprint(f"Couldn't find concept {concept_name_not_found} in concept table!") - if raise_error: - raise KeyError - if len(concept_name) == 1: - return concept_name[0] - else: - return concept_name - - def extract_note(self, adata, source="note"): - column_types = self._get_column_types(path=self.filepath[source], filename=source) - df_source = dd.read_csv(self.filepath[source], dtype=column_types) - if columns is None: - columns = df_source.columns - obs_dict = [ - { - column: list(df_source[df_source["visit_occurrence_id"] == int(visit_occurrence_id)][column]) - for column in columns - } - for visit_occurrence_id in adata.obs.index - ] - adata.obsm["note"] = ak.Array(obs_dict) - return adata - - def note_nlp_map( - self, - ): - # Got some inspirations from: https://github.com/aws-samples/amazon-comprehend-medical-omop-notes-mapping - pass - - def get_feature_info( - self, - adata, - source: Literal[ - "observation", - "measurement", - "procedure_occurrence", - 
"specimen", - "device_exposure", - "drug_exposure", - "condition_occurrence", - ], - features: str or int or List[Union[str, int]] = None, - key: str = None, - ignore_not_shown_in_concept_table: bool = True, - exact_match: bool = True, - verbose: bool = False, - ): - if key is None: - if source in ["measurement", "observation", "specimen"]: - key = f"{source}_concept_id" - elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: - key = f"{source.split('_')[0]}_concept_id" - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - if isinstance(features, str): - features = [features] - rprint(f"Trying to extarct the following features: {features}") - - # Input could be feature names/feature id (concept id) - # First convert all input feaure names into feature id. Map concept using CONCEPT_RELATIONSHIP table if required. - # Then try to extract feature data from source table using feature id. - - # TODO support features name - - if "concept" in self.tables: - column_types = self._get_column_types(path=self.filepath["concept"], filename="concept") - df_concept = self._read_table(self.filepath["concept"], dtype=column_types).dropna( - subset=["concept_id", "concept_name"] - ) # usecols=vocabularies_tables_columns["concept"], - concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") - - # TODO query this in the table - - feature_id_list = [] - feature_name_list = [] - domain_id_list = [] - concept_class_id_list = [] - concept_code_list = [] - - fetures_not_shown_in_concept_table = [] - - info_df = pd.DataFrame([]) - # Get feature id for each input, and check if each feature occurs in the concept table - for feature in features: - # if the input is feature ID - if isinstance(feature, numbers.Integral): - feature_id = feature - feature_id_1, feature_id_2 = self.map_concept_id(feature_id, verbose=False) - try: - feature_name = self.get_concept_name(feature_id_1, raise_error=True, verbose=False) - except KeyError: - if ignore_not_shown_in_concept_table: - fetures_not_shown_in_concept_table.append(feature) - continue - else: - rprint(f"Feature ID - [red]{feature_id_1}[/] could not be found in concept table") - raise - match_score = 1 - - # if the input is feature name - elif isinstance(feature, str): - # return a list of (value, key, score) - result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2) - - # if find 2 best matches - if len(result) == 2: - match_score = result[0][2] - - if match_score != 1: - if exact_match: - rprint( - f"Unable to find an exact match for [red]{feature}[/] in the concept table. Similar ones: 1) [red]{result[0][0]}[/] 2) [red]{result[1][0]}" - ) - raise ValueError - else: - if result[1][1] == 1: - rprint( - f"Found multiple exact matches for [red]{feature}[/] in the concept table: 1) concept id: [red]{result[0][1]}[/] 2) concept id: [red]{result[1][1]}[/]. It is better to specify concept id directly." 
- ) - raise ValueError - feature_name = feature - feature_id = result[0][1] - # if only find 1 match - else: - feature_name = result[0][0] - match_score = result[0][1] - feature_id = result[0][2] - if exact_match and match_score != 1: - rprint( - f"Unable to find an exact match for [red]{feature}[/] in the concept table Similar one is [red]{result[0][0]}" - ) - raise ValueError - feature_id_1, feature_id_2 = self.map_concept_id(feature_id) - - else: - rprint( - "Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] you want to extarct" - ) - raise TypeError - - info_df = pd.concat( - [ - info_df, - pd.DataFrame( - data=[[feature_name, feature_id_1, feature_id_2]], - columns=["feature_name", "feature_id_1", "feature_id_2"], - ), - ] - ) - - # feature_name_list.append(feature_name) - # domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0]) - # concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0]) - # concept_code_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_code"].reset_index(drop=True).compute()[0]) - - if verbose: - """ - if map_concept: - rprint( - f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, feature ID [green]{concept_id}[/] in concept relationship table, match socre = [green]{match_score}." - ) - else: - """ - rprint( - f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match socre = [green]{match_score}." - ) - if info_df["feature_id_1"].equals(info_df["feature_id_2"]): - info_df.drop("feature_id_2", axis=1, inplace=True) - info_df = info_df.rename(columns={"feature_id_1": "feature_id"}) - info_df = info_df.reset_index(drop=True) - else: - info_df = info_df.reset_index(drop=True) - return info_df - - def get_feature_statistics( - self, - adata, - source: Literal[ - "observation", - "measurement", - "procedure_occurrence", - "specimen", - "device_exposure", - "drug_exposure", - "condition_occurrence", - ], - features: str or int or List[Union[str, int]] = None, - level="stay_level", - value_col: str = "value_source_value", - aggregation_methods: Union[ - Literal["min", "max", "mean", "std", "count"], List[Literal["min", "max", "mean", "std", "count"]] - ] = None, - add_aggregation_to_X: bool = True, - verbose: bool = False, - use_dask: bool = None, - ): - if source in ["measurement", "observation", "specimen"]: - key = f"{source}_concept_id" - elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: - key = f"{source.split('_')[0]}_concept_id" - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - if source == "measurement": - source_table_columns = ["visit_occurrence_id", "measurement_datetime", key, value_col] - elif source == "observation": - source_table_columns = ["visit_occurrence_id", "observation_datetime", key, value_col] - elif source == "condition_occurrence": - source_table_columns = None - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - if use_dask is None: - use_dask = self.use_dask - source_column_types = self._get_column_types(path=self.filepath[source], filename=source) - df_source = self._read_table( - self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask - ) - info_df = self.get_feature_info(adata, 
source=source, features=features, verbose=False) - info_dict = info_df[["feature_id", "feature_name"]].set_index("feature_id").to_dict()["feature_name"] - - # Select featrues - df_source = df_source[df_source[key].isin(list(info_df.feature_id))] - # TODO Select time - # da_measurement = da_measurement[(da_measurement.time >= 0) & (da_measurement.time <= 48*60*60)] - # df_source[f'{source}_name'] = df_source[key].map(info_dict) - if aggregation_methods is None: - aggregation_methods = ["min", "max", "mean", "std", "count"] - if level == "stay_level": - result = df_source.groupby(["visit_occurrence_id", key]).agg({value_col: aggregation_methods}) - - if use_dask: - result = result.compute() - result = result.reset_index(drop=False) - result.columns = ["_".join(a) for a in result.columns.to_flat_index()] - result.columns = result.columns.str.removesuffix("_") - result.columns = result.columns.str.removeprefix(f"{value_col}_") - result[f"{source}_name"] = result[key].map(info_dict) - - df_statistics = result.pivot( - index="visit_occurrence_id", columns=f"{source}_name", values=aggregation_methods - ) - df_statistics.columns = df_statistics.columns.swaplevel() - df_statistics.columns = ["_".join(a) for a in df_statistics.columns.to_flat_index()] - - # TODO - sort_columns = True - if sort_columns: - new_column_order = [] - for feature in features: - for suffix in (f"_{aggregation_method}" for aggregation_method in aggregation_methods): - col_name = f"{feature}{suffix}" - if col_name in df_statistics.columns: - new_column_order.append(col_name) - - df_statistics.columns = new_column_order - - df_statistics.index = df_statistics.index.astype(str) - - adata.obs = adata.obs.join(df_statistics, how="left") - - if add_aggregation_to_X: - adata = ep.ad.move_to_x(adata, list(df_statistics.columns)) - return adata - - def extract_features( - self, - adata, - source: Literal[ - "observation", - "measurement", - "procedure_occurrence", - "specimen", - "device_exposure", - "drug_exposure", - "condition_occurrence", - ], - features: str or int or List[Union[str, int]] = None, - source_table_columns: Union[str, List[str]] = None, - dropna: Optional[bool] = True, - verbose: Optional[bool] = True, - use_dask: bool = None, - ): - - if source in ["measurement", "observation", "specimen"]: - key = f"{source}_concept_id" - elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: - key = f"{source.split('_')[0]}_concept_id" - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - if source_table_columns is None: - if source == "measurement": - source_table_columns = ["visit_occurrence_id", "measurement_datetime", "value_as_number", key] - elif source == "observation": - source_table_columns = [ - "visit_occurrence_id", - "value_as_number", - "value_as_string", - "observation_datetime", - key, - ] - elif source == "condition_occurrence": - source_table_columns = None - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - if use_dask is None: - use_dask = self.use_dask - - # TODO load using Dask or Dask-Awkward - # Load source table using dask - source_column_types = self._get_column_types(path=self.filepath[source], filename=source) - df_source = self._read_table( - self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask - ) - info_df = self.get_feature_info(adata, source=source, features=features, verbose=False) - info_dict = info_df[["feature_id", 
"feature_name"]].set_index("feature_id").to_dict()["feature_name"] - - # Select featrues - df_source = df_source[df_source[key].isin(list(info_df.feature_id))] - - # TODO select time period - # df_source = df_source[(df_source.time >= 0) & (df_source.time <= 48*60*60)] - # da_measurement['measurement_name'] = da_measurement.measurement_concept_id.replace(info_dict) - - # TODO dask caching - """ - from dask.cache import Cache - cache = Cache(2e9) - cache.register() - """ - if use_dask: - if dropna == True: - df_source = df_source.compute().dropna() - else: - df_source = df_source.compute() - else: - if dropna == True: - df_source = df_source.dropna() - - # Preprocess steps outside the loop - unique_visit_occurrence_ids = set(adata.obs.index) # .astype(int)) - empty_entry = { - source_table_column: [] - for source_table_column in source_table_columns - if source_table_column not in [key, "visit_occurrence_id"] - } - - # Filter data once, if possible - filtered_data = {feature_id: df_source[df_source[key] == feature_id] for feature_id in set(info_dict.keys())} - - for feature_id in set(info_dict.keys()): - df_feature = filtered_data[feature_id][list(set(source_table_columns) - set([key]))] - grouped = df_feature.groupby("visit_occurrence_id") - if verbose: - print(f"Adding feature [{info_dict[feature_id]}] into adata.obsm") - - # Use set difference and intersection more efficiently - feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) - - # Creating the array more efficiently - adata.obsm[info_dict[feature_id]] = ak.Array( - [ - ( - grouped.get_group(visit_occurrence_id)[ - list(set(source_table_columns) - set([key, "visit_occurrence_id"])) - ].to_dict(orient="list") - if visit_occurrence_id in feature_ids - else empty_entry - ) - for visit_occurrence_id in unique_visit_occurrence_ids - ] - ) - - return adata - - def drop_nan( - self, - adata, - key: Union[str, List[str]], - slot: Union[str, None] = "obsm", - ): - if isinstance(key, str): - key_list = [key] - else: - key_list = key - if slot == "obsm": - for key in key_list: - ak_array = adata.obsm[key] - - # Update the combined mask based on the presence of None in each field - for i, field in enumerate(ak_array.fields): - field_mask = ak.is_none(ak.nan_to_none(ak_array[field]), axis=1) - if i == 0: - combined_mask = ak.full_like(field_mask, fill_value=False, dtype=bool) - combined_mask = combined_mask | field_mask - ak_array = ak_array[~combined_mask] - adata.obsm[key] = ak_array - - return adata - - # downsampling - def aggregate_timeseries_in_bins( - self, - adata, - features: Union[str, List[str]], - slot: Union[str, None] = "obsm", - value_key: str = "value_as_number", - time_key: str = "measurement_datetime", - time_binning_method: Literal["floor", "ceil", "round"] = "floor", - bin_size: Union[str, Offset] = "h", - aggregation_method: Literal["median", "mean", "min", "max"] = "median", - time_upper_bound: int = 48, # TODO - ): - - if isinstance(features, str): - features_list = [features] - else: - features_list = features - - # Ensure the time_binning_method provided is one of the expected methods - if time_binning_method not in ["floor", "ceil", "round"]: - raise ValueError( - f"time_binning_method {time_binning_method} is not supported. Choose from 'floor', 'ceil', or 'round'." - ) - - if aggregation_method not in {"median", "mean", "min", "max"}: - raise ValueError( - f"aggregation_method {aggregation_method} is not supported. Choose from 'median', 'mean', 'min', or 'max'." 
- ) - - if slot == "obsm": - for feature in features_list: - print(f"processing feature [{feature}]") - df = self.to_dataframe(adata, features) - if pd.api.types.is_datetime64_any_dtype(df[time_key]): - func = getattr(df[time_key].dt, time_binning_method, None) - if func is not None: - df[time_key] = func(bin_size) - else: - # TODO need to take care of this if it doesn't follow omop standard - if bin_size == "h": - df[time_key] = df[time_key] / 3600 - func = getattr(np, time_binning_method) - df[time_key] = func(df[time_key]) - - df[time_key] = df[time_key].astype(str) - # Adjust time values that are equal to the time_upper_bound - # df.loc[df[time_key] == time_upper_bound, time_key] = time_upper_bound - 1 - - # Group and aggregate data - df = ( - df.groupby(["visit_occurrence_id", time_key])[value_key] - .agg(aggregation_method) - .reset_index(drop=False) - ) - grouped = df.groupby("visit_occurrence_id") - - unique_visit_occurrence_ids = adata.obs.index - empty_entry = {value_key: [], time_key: []} - - # Efficiently use set difference and intersection - feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) - # Efficiently create the array - ak_array = ak.Array( - [ - ( - grouped.get_group(visit_occurrence_id)[[value_key, time_key]].to_dict(orient="list") - if visit_occurrence_id in feature_ids - else empty_entry - ) - for visit_occurrence_id in unique_visit_occurrence_ids - ] - ) - adata.obsm[feature] = ak_array - - return adata - - def timeseries_discretizer( - self, - adata, - key: Union[str, List[str]], - slot: Union[str, None] = "obsm", - value_key: str = "value_as_number", - time_key: str = "measurement_datetime", - freq: str = "hour", # TODO - time_limit: int = 48, # TODO - method: str = "median", # TODO - ): - - pass - - def from_dataframe(self, adata, feature: str, df): - grouped = df.groupby("visit_occurrence_id") - unique_visit_occurrence_ids = set(adata.obs.index) - - # Use set difference and intersection more efficiently - feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) - empty_entry = { - source_table_column: [] - for source_table_column in set(df.columns) - if source_table_column not in ["visit_occurrence_id"] - } - - # Creating the array more efficiently - ak_array = ak.Array( - [ - ( - grouped.get_group(visit_occurrence_id)[ - list(set(df.columns) - set(["visit_occurrence_id"])) - ].to_dict(orient="list") - if visit_occurrence_id in feature_ids - else empty_entry - ) - for visit_occurrence_id in unique_visit_occurrence_ids - ] - ) - adata.obsm[feature] = ak_array - - return adata - - # TODO add function to check feature and add concept - # More IO functions - def to_dataframe( - self, - adata, - features: Union[str, List[str]], # TODO also support list of features - # patient str or List, # TODO also support subset of patients/visit - ): - # TODO - # can be viewed as patient level - only select some patient - # TODO change variable name here - if isinstance(features, str): - features = [features] - df_concat = pd.DataFrame([]) - for feature in features: - df = ak.to_dataframe(adata.obsm[feature]) - - df.reset_index(drop=False, inplace=True) - df["entry"] = adata.obs.index[df["entry"]] - df = df.rename(columns={"entry": "visit_occurrence_id"}) - del df["subentry"] - for col in df.columns: - if col.endswith("time"): - df[col] = pd.to_datetime(df[col]) - - df["feature_name"] = feature - df_concat = pd.concat([df_concat, df], axis=0) - - return df_concat - - def plot_timeseries( - self, - adata, - visit_occurrence_id: int, 
- key: Union[str, List[str]], - slot: Union[str, None] = "obsm", - value_key: str = "value_as_number", - time_key: str = "measurement_datetime", - x_label: str = None, - ): - - if isinstance(key, str): - key_list = [key] - else: - key_list = key - - # Initialize min_x and max_x - min_x = None - max_x = None - - if slot == "obsm": - fig, ax = plt.subplots(figsize=(20, 6)) - # Scatter plot - for i, key in enumerate(key_list): - df = self.to_dataframe(adata, key) - x = df[df.visit_occurrence_id == visit_occurrence_id][time_key] - y = df[df.visit_occurrence_id == visit_occurrence_id][value_key] - - # Check if x is empty - if not x.empty: - ax.scatter(x=x, y=y, label=key) - ax.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=len(key_list), prop={"size": 14}) - - ax.plot(x, y) - - if min_x is None or min_x > x.min(): - min_x = x.min() - if max_x is None or max_x < x.max(): - max_x = x.max() - - else: - # Skip this iteration if x is empty - continue - - if min_x is not None and max_x is not None: - - # Adapt this to input data - # TODO step - # plt.xticks(np.arange(min_x, max_x, step=1)) - # Adapt this to input data - plt.xlabel(x_label if x_label else "Hours since ICU admission") - - plt.show() - - def violin( - self, - adata: AnnData, - obsm_key: str = None, - keys: Union[str, Sequence[str]] = None, - groupby: Optional[str] = None, - log: Optional[bool] = False, - use_raw: Optional[bool] = None, - stripplot: bool = True, - jitter: Union[float, bool] = True, - size: int = 1, - layer: Optional[str] = None, - scale: Literal["area", "count", "width"] = "width", - order: Optional[Sequence[str]] = None, - multi_panel: Optional[bool] = None, - xlabel: str = "", - ylabel: Union[str, Sequence[str]] = None, - rotation: Optional[float] = None, - show: Optional[bool] = None, - save: Union[bool, str] = None, - ax: Optional[Axes] = None, - **kwds, - ): # pragma: no cover - """Violin plot. - - Wraps :func:`seaborn.violinplot` for :class:`~anndata.AnnData`. - - Args: - adata: :class:`~anndata.AnnData` object object containing all observations. - keys: Keys for accessing variables of `.var_names` or fields of `.obs`. - groupby: The key of the observation grouping to consider. - log: Plot on logarithmic axis. - use_raw: Whether to use `raw` attribute of `adata`. Defaults to `True` if `.raw` is present. - stripplot: Add a stripplot on top of the violin plot. See :func:`~seaborn.stripplot`. - jitter: Add jitter to the stripplot (only when stripplot is True) See :func:`~seaborn.stripplot`. - size: Size of the jitter points. - layer: Name of the AnnData object layer that wants to be plotted. By - default adata.raw.X is plotted. If `use_raw=False` is set, - then `adata.X` is plotted. If `layer` is set to a valid layer name, - then the layer is plotted. `layer` takes precedence over `use_raw`. - scale: The method used to scale the width of each violin. - If 'width' (the default), each violin will have the same width. - If 'area', each violin will have the same area. - If 'count', a violin’s width corresponds to the number of observations. - order: Order in which to show the categories. - multi_panel: Display keys in multiple panels also when `groupby is not None`. - xlabel: Label of the x axis. Defaults to `groupby` if `rotation` is `None`, otherwise, no label is shown. - ylabel: Label of the y axis. If `None` and `groupby` is `None`, defaults to `'value'`. - If `None` and `groubpy` is not `None`, defaults to `keys`. - rotation: Rotation of xtick labels. 
- {show_save_ax} - **kwds: - Are passed to :func:`~seaborn.violinplot`. - - Returns - ------- - A :class:`~matplotlib.axes.Axes` object if `ax` is `None` else `None`. - - Example: - .. code-block:: python - - import ehrapy as ep - - adata = ep.dt.mimic_2(encoded=True) - ep.pp.knn_impute(adata) - ep.pp.log_norm(adata, offset=1) - ep.pp.neighbors(adata) - ep.tl.leiden(adata, resolution=0.5, key_added="leiden_0_5") - ep.pl.violin(adata, keys=["age"], groupby="leiden_0_5") - - Preview: - .. image:: /_static/docstring_previews/violin.png - """ - if obsm_key: - df = self.to_dataframe(adata, features=obsm_key) - df = df[["visit_occurrence_id", "value_as_number"]] - df = df.rename(columns={"value_as_number": obsm_key}) - - if groupby: - df = df.set_index("visit_occurrence_id").join(adata.obs[groupby].to_frame()).reset_index(drop=False) - adata = ep.ad.df_to_anndata(df, columns_obs_only=["visit_occurrence_id", groupby]) - else: - adata = ep.ad.df_to_anndata(df, columns_obs_only=["visit_occurrence_id"]) - keys = obsm_key - - violin_partial = partial( - sc.pl.violin, - keys=keys, - log=log, - use_raw=use_raw, - stripplot=stripplot, - jitter=jitter, - size=size, - layer=layer, - scale=scale, - order=order, - multi_panel=multi_panel, - xlabel=xlabel, - ylabel=ylabel, - rotation=rotation, - show=show, - save=save, - ax=ax, - **kwds, - ) - - return violin_partial(adata=adata, groupby=groupby) - - def qc_lab_measurements( - self, - adata: AnnData, - reference_table: pd.DataFrame = None, - measurements: list[str] = None, - obsm_measurements: list[str] = None, - action: Literal["remove"] = "remove", - unit: Literal["traditional", "SI"] = None, - layer: str = None, - threshold: int = 20, - age_col: str = None, - age_range: str = None, - sex_col: str = None, - sex: str = None, - ethnicity_col: str = None, - ethnicity: str = None, - copy: bool = False, - verbose: bool = False, - ) -> AnnData: - - if copy: - adata = adata.copy() - - preprocessing_dir = "/Users/xinyuezhang/ehrapy/ehrapy/preprocessing" - if reference_table is None: - reference_table = pd.read_csv( - f"{preprocessing_dir}/laboratory_reference_tables/laposata.tsv", sep="\t", index_col="Measurement" - ) - if obsm_measurements: - measurements = obsm_measurements - for measurement in measurements: - best_column_match, score = process.extractOne( - query=measurement, choices=reference_table.index, score_cutoff=threshold - ) - if best_column_match is None: - rprint(f"[bold yellow]Unable to find a match for {measurement}") - continue - if verbose: - rprint( - f"[bold blue]Detected [green]{best_column_match}[blue] for [green]{measurement}[blue] with score [green]{score}." - ) - - reference_column = "SI Reference Interval" if unit == "SI" else "Traditional Reference Interval" - - # Fetch all non None columns from the reference statistics - not_none_columns = [col for col in [sex_col, age_col, ethnicity_col] if col is not None] - not_none_columns.append(reference_column) - reference_values = reference_table.loc[[best_column_match], not_none_columns] - - additional_columns = False - if sex_col or age_col or ethnicity_col: # check if additional columns were provided - additional_columns = True - - # Check if multiple reference values occur and no additional information is available: - if reference_values.shape[0] > 1 and additional_columns is False: - raise ValueError( - f"Several options for {best_column_match} reference value are available. Please specify sex, age or " - f"ethnicity columns and their values." 
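# A compact sketch of the reference-interval check done below, assuming interval strings
# of the three shapes handled by the original code: "<x" (upper bound only), ">x"
# (lower bound only) and "a-b" (closed range). The input values are made-up examples.
import numpy as np


def within_reference(values, interval: str) -> np.ndarray:
    values = np.asarray(values, dtype=float)
    interval = interval.strip()
    if interval.startswith("<"):
        return values < float(interval[1:])
    if interval.startswith(">"):
        return values > float(interval[1:])
    low, high = (float(part) for part in interval.split("-"))
    return (values >= low) & (values <= high)


within_reference([3.2, 5.8, 7.1], "3.5-7.0")  # -> array([False,  True, False])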
- ) - - try: - if age_col: - min_age, max_age = age_range.split("-") - reference_values = reference_values[ - (reference_values[age_col].str.split("-").str[0].astype(int) >= int(min_age)) - and (reference_values[age_col].str.split("-").str[1].astype(int) <= int(max_age)) - ] - if sex_col: - sexes = "U|M" if sex is None else sex - reference_values = reference_values[reference_values[sex_col].str.contains(sexes)] - if ethnicity_col: - reference_values = reference_values[reference_values[ethnicity_col].isin([ethnicity])] - - if layer is not None: - actual_measurements = adata[:, measurement].layers[layer] - else: - if obsm_measurements: - actual_measurements = adata.obsm[measurement]["value_as_number"] - ak_measurements = adata.obsm[measurement] - else: - actual_measurements = adata[:, measurement].X - except TypeError: - rprint(f"[bold yellow]Unable to find specified reference values for {measurement}.") - - check = reference_values[reference_column].values - check_str: str = np.array2string(check) - check_str = check_str.replace("[", "").replace("]", "").replace("'", "") - if "<" in check_str: - upperbound = float(check_str.replace("<", "")) - if verbose: - rprint(f"[bold blue]Using upperbound [green]{upperbound}") - upperbound_check_results = actual_measurements < upperbound - if isinstance(actual_measurements, ak.Array): - if action == "remove": - if verbose: - rprint( - f"Removing {ak.count(actual_measurements) - ak.count(actual_measurements[upperbound_check_results])} outliers" - ) - adata.obsm[measurement] = ak_measurements[upperbound_check_results] - else: - upperbound_check_results_array: np.ndarray = upperbound_check_results.copy() - adata.obs[f"{measurement} normal"] = upperbound_check_results_array - - elif ">" in check_str: - lower_bound = float(check_str.replace(">", "")) - if verbose: - rprint(f"[bold blue]Using lowerbound [green]{lower_bound}") - - lower_bound_check_results = actual_measurements > lower_bound - if isinstance(actual_measurements, ak.Array): - if action == "remove": - adata.obsm[measurement] = ak_measurements[lower_bound_check_results] - else: - adata.obs[f"{measurement} normal"] = lower_bound_check_results_array - lower_bound_check_results_array = lower_bound_check_results.copy() - else: # "-" range case - min_value = float(check_str.split("-")[0]) - max_value = float(check_str.split("-")[1]) - if verbose: - rprint(f"[bold blue]Using minimum of [green]{min_value}[blue] and maximum of [green]{max_value}") - - range_check_results = (actual_measurements >= min_value) & (actual_measurements <= max_value) - if isinstance(actual_measurements, ak.Array): - if action == "remove": - adata.obsm[measurement] = ak_measurements[range_check_results] - else: - adata.obs[f"{measurement} normal"] = range_check_results_array - range_check_results_array: np.ndarray = range_check_results.copy() - - if copy: - return adata diff --git a/ehrdata/__init__.py b/ehrdata/__init__.py new file mode 100644 index 0000000..73939b7 --- /dev/null +++ b/ehrdata/__init__.py @@ -0,0 +1,7 @@ +from importlib.metadata import version + +from . 
import dt, pl, pp, tl, io + +__all__ = ["dt", "pl", "pp", "tl", "io"] + +__version__ = "0.0.0" diff --git a/ehrdata/dt/__init__.py b/ehrdata/dt/__init__.py new file mode 100644 index 0000000..8a2b780 --- /dev/null +++ b/ehrdata/dt/__init__.py @@ -0,0 +1 @@ +from ehrdata.dt._omop import init_omop \ No newline at end of file diff --git a/ehrdata/dt/_omop.py b/ehrdata/dt/_omop.py new file mode 100644 index 0000000..2a9a605 --- /dev/null +++ b/ehrdata/dt/_omop.py @@ -0,0 +1,130 @@ +import os + + +import pandas as pd + +import ehrapy as ep +from pathlib import Path +from ehrdata.utils.omop_utils import * +from rich.console import Console +from rich.text import Text +import rich.repr +from rich import print as rprint +from typing import TYPE_CHECKING, Any, Callable, Literal, Union, List + + + + +def init_omop(folder_path, + delimiter=None, + make_filename_lowercase=True, + use_dask=False, + level: Literal["stay_level", "patient_level"] = "stay_level", + tables: Union[str, List[str]] = None, + remove_empty_column=True): + + + + filepath_dict = check_with_omop_cdm(folder_path=folder_path, delimiter=delimiter, make_filename_lowercase=make_filename_lowercase) + tables = list(filepath_dict.keys()) + adata_dict = {} + adata_dict['filepath_dict'] = filepath_dict + adata_dict['tables'] = tables + adata_dict['delimiter'] = delimiter + adata_dict['use_dask'] = use_dask + + + table_catalog_dict = get_table_catalog_dict() + + color_map = { + 'Clinical data': 'blue', + 'Health system data': 'green', + 'Health economics data': 'red', + 'Standardized derived elements': 'magenta', + 'Metadata': 'white', + 'Vocabulary': 'dark_orange' + } + # Object description + print_str = f'OMOP Database ([red]{os.path.basename(folder_path)}[/]) with {len(tables)} tables.\n' + + # Tables information + for key, value in table_catalog_dict.items(): + table_list = [table_name for table_name in tables if table_name in value] + if len(table_list) != 0: + print_str = print_str + f"[{color_map[key]}]{key} tables[/]: [black]{', '.join(table_list)}[/]\n" + #table_list_str = ', '.join(table_list) + + #text = Text(f"{key} tables: ", style=color_map[key]) + #text.append(table_list_str) + #yield None, f"{key} tables", "red" + rprint(print_str) + + tables = ['person', 'death', 'visit_occurrence'] + # TODO patient level and hospital level + if level == "stay_level": + index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} + # TODO Only support clinical_tables_columns + table_dict = {} + for table in tables: + print(f"reading table [{table}]") + column_types = get_column_types(adata_dict, table_name=table) + df = read_table(adata_dict, table_name=table, dtype=column_types, index='person_id') + if remove_empty_column: + # TODO dask Support + #columns = [column for column in df.columns if not df[column].compute().isna().all()] + columns = [column for column in df.columns if not df[column].isna().all()] + df = df.loc[:, columns] + table_dict[table] = df + + # concept_id_list = list(self.concept.concept_id) + # concept_name_list = list(self.concept.concept_id) + # concept_domain_id_list = list(set(self.concept.domain_id)) + + # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] + # TODO dask Support + joined_table = pd.merge(table_dict["visit_occurrence"], table_dict["person"], left_index=True, right_index=True, how="left") + + joined_table = pd.merge(joined_table, table_dict["death"], left_index=True, right_index=True, how="left") + + # TODO dask 
Support + #joined_table = joined_table.compute() + + # TODO check this earlier + joined_table = joined_table.drop_duplicates(subset='visit_occurrence_id') + joined_table = joined_table.set_index("visit_occurrence_id") + # obs_only_list = list(self.joined_table.columns) + # obs_only_list.remove('visit_occurrence_id') + columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"])) + adata = ep.ad.df_to_anndata( + joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only + ) + # TODO this needs to be fixed because anndata set obs index as string by default + #adata.obs.index = adata.obs.index.astype(int) + + """ + for column in self.measurement.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.drug_exposure.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.observation.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + """ + + adata.uns.update(adata_dict) + + return adata + diff --git a/ehrdata/io/__init__.py b/ehrdata/io/__init__.py new file mode 100644 index 0000000..aaf540f --- /dev/null +++ b/ehrdata/io/__init__.py @@ -0,0 +1 @@ +from ehrdata.io._omop import from_dataframe, to_dataframe \ No newline at end of file diff --git a/ehrdata/io/_omop.py b/ehrdata/io/_omop.py new file mode 100644 index 0000000..cb6331f --- /dev/null +++ b/ehrdata/io/_omop.py @@ -0,0 +1,55 @@ +from typing import List, Union, Literal, Optional +import awkward as ak +import pandas as pd + +def from_dataframe( + adata, + feature: str, + df +): + grouped = df.groupby("visit_occurrence_id") + unique_visit_occurrence_ids = set(adata.obs.index) + + # Use set difference and intersection more efficiently + feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) + empty_entry = {source_table_column: [] for source_table_column in set(df.columns) if source_table_column not in ['visit_occurrence_id'] } + + # Creating the array more efficiently + ak_array = ak.Array([ + grouped.get_group(visit_occurrence_id)[list(set(df.columns) - set(['visit_occurrence_id']))].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry + for visit_occurrence_id in unique_visit_occurrence_ids]) + adata.obsm[feature] = ak_array + + return adata + +# TODO add function to check feature and add concept +# More IO functions + +def to_dataframe( + adata, + features: Union[str, List[str]], # TODO also support list of features + # patient str or List, # TODO also support subset of patients/visit +): + # TODO + # can be viewed as patient level - only select some patient + # TODO change variable name here + if isinstance(features, str): + features = [features] + df_concat = pd.DataFrame([]) + for feature in features: + df = ak.to_dataframe(adata.obsm[feature]) + + df.reset_index(drop=False, inplace=True) + df["entry"] = adata.obs.index[df["entry"]] + df = 
df.rename(columns={"entry": "visit_occurrence_id"}) + del df["subentry"] + for col in df.columns: + if col.endswith('time'): + df[col] = pd.to_datetime(df[col]) + + df['feature_name'] = feature + df_concat = pd.concat([df_concat, df], axis= 0) + + + return df_concat + diff --git a/ehrdata/pl/__init__.py b/ehrdata/pl/__init__.py new file mode 100644 index 0000000..a1a7091 --- /dev/null +++ b/ehrdata/pl/__init__.py @@ -0,0 +1 @@ +from ehrdata.pl._omop import feature_counts \ No newline at end of file diff --git a/ehrdata/pl/_omop.py b/ehrdata/pl/_omop.py new file mode 100644 index 0000000..7565a68 --- /dev/null +++ b/ehrdata/pl/_omop.py @@ -0,0 +1,59 @@ +from typing import List, Union, Literal, Optional +from ehrdata.utils.omop_utils import * +from ehrdata.tl import get_concept_name +import seaborn as sns +import matplotlib.pyplot as plt + +# TODO allow users to pass features +def feature_counts( + adata, + source: Literal[ + "observation", + "measurement", + "procedure_occurrence", + "specimen", + "device_exposure", + "drug_exposure", + "condition_occurrence", + ], + number=20, + key = None +): + + if source == 'measurement': + columns = ["value_as_number", "time", "visit_occurrence_id", "measurement_concept_id"] + elif source == 'observation': + columns = ["value_as_number", "value_as_string", "measurement_datetime"] + elif source == 'condition_occurrence': + columns = None + else: + raise KeyError(f"Extracting data from {source} is not supported yet") + + filepath_dict = adata.uns['filepath_dict'] + tables = adata.uns['tables'] + + column_types = get_column_types(adata.uns, table_name=source) + df_source = read_table(adata.uns, table_name=source, dtype=column_types, usecols=[f"{source}_concept_id"]) + feature_counts = df_source[f"{source}_concept_id"].value_counts() + if adata.uns['use_dask']: + feature_counts = feature_counts.compute() + feature_counts = feature_counts.to_frame().reset_index(drop=False)[0:number] + + + feature_counts[f"{source}_concept_id_1"], feature_counts[f"{source}_concept_id_2"] = map_concept_id( + adata.uns, concept_id=feature_counts[f"{source}_concept_id"], verbose=False + ) + feature_counts["feature_name"] = get_concept_name(adata, concept_id=feature_counts[f"{source}_concept_id_1"]) + if feature_counts[f"{source}_concept_id_1"].equals(feature_counts[f"{source}_concept_id_2"]): + feature_counts.drop(f"{source}_concept_id_2", axis=1, inplace=True) + feature_counts.rename(columns={f"{source}_concept_id_1": f"{source}_concept_id"}) + feature_counts = feature_counts.reindex(columns=["feature_name", f"{source}_concept_id", "count"]) + else: + feature_counts = feature_counts.reindex( + columns=["feature_name", f"{source}_concept_id_1", f"{source}_concept_id_2", "count"] + ) + + ax = sns.barplot(feature_counts, x="feature_name", y="count") + ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") + plt.tight_layout() + return feature_counts \ No newline at end of file diff --git a/ehrdata/pp/__init__.py b/ehrdata/pp/__init__.py new file mode 100644 index 0000000..ac5fed4 --- /dev/null +++ b/ehrdata/pp/__init__.py @@ -0,0 +1 @@ +from ehrdata.pp._omop import get_feature_statistics diff --git a/ehrdata/pp/_omop.py b/ehrdata/pp/_omop.py new file mode 100644 index 0000000..7e86845 --- /dev/null +++ b/ehrdata/pp/_omop.py @@ -0,0 +1,110 @@ +from typing import List, Union, Literal, Optional +from ehrdata.utils.omop_utils import * +import ehrapy as ep +import warnings + +def get_feature_statistics( + adata, + source: Literal[ + "observation", + "measurement", + 
"procedure_occurrence", + "specimen", + "device_exposure", + "drug_exposure", + "condition_occurrence", + ], + features: Union[str, int , List[Union[str, int]]] = None, + level="stay_level", + value_col: str = None, + aggregation_methods: Union[Literal["min", "max", "mean", "std", "count"], List[Literal["min", "max", "mean", "std", "count"]]]=None, + add_aggregation_to_X: bool = True, + verbose: bool = False, + use_dask: bool = None, +): + if source in ["measurement", "observation", "specimen"]: + key = f"{source}_concept_id" + elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: + key = f"{source.split('_')[0]}_concept_id" + else: + raise KeyError(f"Extracting data from {source} is not supported yet") + + if source == 'measurement': + value_col = 'value_as_number' + warnings.warn(f"Extracting values from {value_col}. Value in measurement table could be saved in these columns: value_as_number, value_source_value.\nSpecify value_col to extract value from desired column.") + source_table_columns = ['visit_occurrence_id', 'measurement_datetime', key, value_col] + elif source == 'observation': + value_col = 'value_as_number' + warnings.warn(f"Extracting values from {value_col}. Value in observation table could be saved in these columns: value_as_number, value_as_string, value_source_value.\nSpecify value_col to extract value from desired column.") + source_table_columns = ['visit_occurrence_id', "observation_datetime", key, value_col] + elif source == 'condition_occurrence': + source_table_columns = None + else: + raise KeyError(f"Extracting data from {source} is not supported yet") + if isinstance(features, str): + features = [features] + rprint(f"Trying to extarct the following features: {features}") + + if use_dask is None: + use_dask = True + + column_types = get_column_types(adata.uns, table_name=source) + df_source = read_table(adata.uns, table_name=source, dtype=column_types, usecols=source_table_columns, use_dask=use_dask) + + info_df = get_feature_info(adata.uns, features=features, verbose=verbose) + info_dict = info_df[['feature_id', 'feature_name']].set_index('feature_id').to_dict()['feature_name'] + + # Select featrues + df_source = df_source[df_source[key].isin(list(info_df.feature_id))] + #TODO Select time + #da_measurement = da_measurement[(da_measurement.time >= 0) & (da_measurement.time <= 48*60*60)] + #df_source[f'{source}_name'] = df_source[key].map(info_dict) + if aggregation_methods is None: + aggregation_methods = ["min", "max", "mean", "std", "count"] + if level == 'stay_level': + result = df_source.groupby(['visit_occurrence_id', key]).agg({ + value_col: aggregation_methods}) + + if use_dask: + result = result.compute() + result = result.reset_index(drop=False) + result.columns = ["_".join(a) for a in result.columns.to_flat_index()] + result.columns = result.columns.str.removesuffix('_') + result.columns = result.columns.str.removeprefix(f'{value_col}_') + result[f'{source}_name'] = result[key].map(info_dict) + + df_statistics = result.pivot(index='visit_occurrence_id', + columns=f'{source}_name', + values=aggregation_methods) + df_statistics.columns = df_statistics.columns.swaplevel() + df_statistics.columns = ["_".join(a) for a in df_statistics.columns.to_flat_index()] + + + # TODO + sort_columns = True + if sort_columns: + new_column_order = [] + for feature in features: + for suffix in (f'_{aggregation_method}' for aggregation_method in aggregation_methods): + col_name = f'{feature}{suffix}' + if col_name in 
df_statistics.columns: + new_column_order.append(col_name) + + df_statistics.columns = new_column_order + + df_statistics.index = df_statistics.index.astype(str) + + adata.obs = pd.merge(adata.obs, df_statistics, how='left', left_index=True, right_index=True) + + if add_aggregation_to_X: + uns = adata.uns + obsm = adata.obsm + varm = adata.varm + layers = adata.layers + adata = ep.ad.move_to_x(adata, list(df_statistics.columns)) + adata.uns = uns + adata.obsm = obsm + adata.varm = varm + # It will change + # adata.layers = layers + return adata \ No newline at end of file diff --git a/ehrdata/tl/__init__.py b/ehrdata/tl/__init__.py new file mode 100644 index 0000000..83756fa --- /dev/null +++ b/ehrdata/tl/__init__.py @@ -0,0 +1 @@ +from ehrdata.tl._omop import get_concept_name \ No newline at end of file diff --git a/ehrdata/tl/_omop.py b/ehrdata/tl/_omop.py new file mode 100644 index 0000000..490bb04 --- /dev/null +++ b/ehrdata/tl/_omop.py @@ -0,0 +1,48 @@ +from ehrdata.utils.omop_utils import * #get_column_types, read_table, df_to_dict +from typing import List, Union, Literal, Optional, Dict +import numbers +from rich import print as rprint +from anndata import AnnData + +def get_concept_name( + adata: Union[AnnData, Dict], + concept_id: Union[str, List], + raise_error=False, + verbose=True): + + if isinstance(concept_id, numbers.Integral): + concept_id = [concept_id] + + if isinstance(adata, AnnData): + adata_dict = adata.uns + else: + adata_dict = adata + + column_types = get_column_types(adata_dict, table_name="concept") + df_concept = read_table(adata_dict, table_name="concept", dtype=column_types) + # TODO dask Support + #df_concept.compute().dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] + df_concept.dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] + concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") + concept_name = [] + concept_name_not_found = [] + for id in concept_id: + try: + concept_name.append(concept_dict[id]) + except KeyError: + concept_name.append(id) + concept_name_not_found.append(id) + if len(concept_name_not_found) > 0: + # warnings.warn(f"Couldn't find concept {id} in concept table!") + if verbose: + rprint(f"Couldn't find concept {concept_name_not_found} in concept table!") + if raise_error: + raise KeyError + if len(concept_name) == 1: + return concept_name[0] + else: + return concept_name + +# TODO +def get_concept_id(): + pass \ No newline at end of file diff --git a/ehrdata/utils/OMOP_CDMv5.4_Field_Level.csv b/ehrdata/utils/OMOP_CDMv5.4_Field_Level.csv new file mode 100644 index 0000000..86ea6a1 --- /dev/null +++ b/ehrdata/utils/OMOP_CDMv5.4_Field_Level.csv @@ -0,0 +1,551 @@ +cdmTableName,cdmFieldName,isRequired,cdmDatatype,userGuidance,etlConventions,isPrimaryKey,isForeignKey,fkTableName,fkFieldName,fkDomain,fkClass,unique DQ identifiers +person,person_id,Yes,integer,It is assumed that every person with a different unique identifier is in fact a different person and should be treated independently.,"Any person linkage that needs to occur to uniquely identify Persons ought to be done prior to writing this table. 
This identifier can be the original id from the source data provided if it is an integer, otherwise it can be an autogenerated number.",Yes,No,NA,NA,NA,NA,NA +person,gender_concept_id,Yes,integer,This field is meant to capture the biological sex at birth of the Person. This field should not be used to study gender identity issues.,Use the gender or sex value present in the data under the assumption that it is the biological sex at birth. If the source data captures gender identity it should be stored in the [OBSERVATION](https://ohdsi.github.io/CommonDataModel/cdm531.html#observation) table. [Accepted gender concepts](http://athena.ohdsi.org/search-terms/terms?domain=Gender&standardConcept=Standard&page=1&pageSize=15&query=),No,Yes,CONCEPT,CONCEPT_ID,Gender,NA,NA +person,year_of_birth,Yes,integer,Compute age using year_of_birth.,"For data sources with date of birth, the year should be extracted. For data sources where the year of birth is not available, the approximate year of birth could be derived based on age group categorization, if available.",No,No,NA,NA,NA,NA,NA +person,month_of_birth,No,integer,NA,"For data sources that provide the precise date of birth, the month should be extracted and stored in this field.",No,No,NA,NA,NA,NA,NA +person,day_of_birth,No,integer,NA,"For data sources that provide the precise date of birth, the day should be extracted and stored in this field.",No,No,NA,NA,NA,NA,NA +person,birth_datetime,No,datetime,NA,"This field is not required but highly encouraged. For data sources that provide the precise datetime of birth, that value should be stored in this field. If birth_datetime is not provided in the source, use the following logic to infer the date: If day_of_birth is null and month_of_birth is not null then use the first of the month in that year. If month_of_birth is null or if day_of_birth AND month_of_birth are both null and the person has records during their year of birth then use the date of the earliest record, otherwise use the 15th of June of that year. If time of birth is not given use midnight (00:00:0000).",No,No,NA,NA,NA,NA,NA +person,race_concept_id,Yes,integer,This field captures race or ethnic background of the person.,"Only use this field if you have information about race or ethnic background. The Vocabulary contains Concepts about the main races and ethnic backgrounds in a hierarchical system. Due to the imprecise nature of human races and ethnic backgrounds, this is not a perfect system. Mixed races are not supported. If a clear race or ethnic background cannot be established, use Concept_Id 0. [Accepted Race Concepts](http://athena.ohdsi.org/search-terms/terms?domain=Race&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Race,NA,NA +person,ethnicity_concept_id,Yes,integer,"This field captures Ethnicity as defined by the Office of Management and Budget (OMB) of the US Government: it distinguishes only between ""Hispanic"" and ""Not Hispanic"". Races and ethnic backgrounds are not stored here.",Only use this field if you have US-based data and a source of this information. Do not attempt to infer Ethnicity from the race or ethnic background of the Person. [Accepted ethnicity concepts](http://athena.ohdsi.org/search-terms/terms?domain=Ethnicity&standardConcept=Standard&page=1&pageSize=15&query=),No,Yes,CONCEPT,CONCEPT_ID,Ethnicity,NA,NA +person,location_id,No,integer,The location refers to the physical address of the person. 
This field should capture the last known location of the person.,"Put the location_id from the [LOCATION](https://ohdsi.github.io/CommonDataModel/cdm531.html#location) table here that represents the most granular location information for the person. This could represent anything from postal code or parts thereof, state, or county for example. Since many databases contain deidentified data, it is common that the precision of the location is reduced to prevent re-identification. This field should capture the last known location.",No,Yes,LOCATION,LOCATION_ID,NA,NA,NA +person,provider_id,No,integer,The Provider refers to the last known primary care provider (General Practitioner).,"Put the provider_id from the [PROVIDER](https://ohdsi.github.io/CommonDataModel/cdm531.html#provider) table of the last known general practitioner of the person. If there are multiple providers, it is up to the ETL to decide which to put here.",No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +person,care_site_id,No,integer,The Care Site refers to where the Provider typically provides the primary care.,NA,No,Yes,CARE_SITE,CARE_SITE_ID,NA,NA,NA +person,person_source_value,No,varchar(50),Use this field to link back to persons in the source data. This is typically used for error checking of ETL logic.,Some use cases require the ability to link back to persons in the source data. This field allows for the storing of the person value as it appears in the source. This field is not required but strongly recommended.,No,No,NA,NA,NA,NA,NA +person,gender_source_value,No,varchar(50),This field is used to store the biological sex of the person from the source data. It is not intended for use in standard analytics but for reference only.,Put the biological sex of the person as it appears in the source data.,No,No,NA,NA,NA,NA,NA +person,gender_source_concept_id,No,integer,"Due to the small number of options, this tends to be zero.","If the source data codes biological sex in a non-standard vocabulary, store the concept_id here.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +person,race_source_value,No,varchar(50),This field is used to store the race of the person from the source data. It is not intended for use in standard analytics but for reference only.,Put the race of the person as it appears in the source data.,No,No,NA,NA,NA,NA,NA +person,race_source_concept_id,No,integer,"Due to the small number of options, this tends to be zero.",If the source data codes race in an OMOP supported vocabulary store the concept_id here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +person,ethnicity_source_value,No,varchar(50),This field is used to store the ethnicity of the person from the source data. 
It is not intended for use in standard analytics but for reference only.,"If the person has an ethnicity other than the OMB standard of ""Hispanic"" or ""Not Hispanic"" store that value from the source data here.",No,No,NA,NA,NA,NA,NA +person,ethnicity_source_concept_id,No,integer,"Due to the small number of options, this tends to be zero.","If the source data codes ethnicity in an OMOP supported vocabulary, store the concept_id here.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +observation_period,observation_period_id,Yes,integer,A Person can have multiple discrete Observation Periods which are identified by the Observation_Period_Id.,Assign a unique observation_period_id to each discrete Observation Period for a Person.,Yes,No,NA,NA,NA,NA,NA +observation_period,person_id,Yes,integer,The Person ID of the PERSON record for which the Observation Period is recorded.,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +observation_period,observation_period_start_date,Yes,date,Use this date to determine the start date of the Observation Period.,"It is often the case that the idea of Observation Periods does not exist in source data. In those cases, the observation_period_start_date can be inferred as the earliest Event date available for the Person. In insurance claim data, the Observation Period can be considered as the time period the Person is enrolled with a payer. If a Person switches plans but stays with the same payer, and therefore capturing of data continues, that change would be captured in [PAYER_PLAN_PERIOD](https://ohdsi.github.io/CommonDataModel/cdm531.html#payer_plan_period).",No,No,NA,NA,NA,NA,NA +observation_period,observation_period_end_date,Yes,date,Use this date to determine the end date of the period for which we can assume that all events for a Person are recorded.,"It is often the case that the idea of Observation Periods does not exist in source data. In those cases, the observation_period_end_date can be inferred as the last Event date available for the Person. In insurance claim data, the Observation Period can be considered as the time period the Person is enrolled with a payer.",No,No,NA,NA,NA,NA,NA +observation_period,period_type_concept_id,Yes,integer,"This field can be used to determine the provenance of the Observation Period as in whether the period was determined from an insurance enrollment file, EHR healthcare encounters, or other sources.",Choose the observation_period_type_concept_id that best represents how the period was determined. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).,No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +visit_occurrence,visit_occurrence_id,Yes,integer,Use this to identify unique interactions between a person and the health care system. This identifier links across the other CDM event tables to associate events with a visit.,This should be populated by creating a unique identifier for each unique interaction between a person and the healthcare system where the person receives a medical good or service over a span of time.,Yes,No,NA,NA,NA,NA,NA +visit_occurrence,person_id,Yes,integer,NA,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +visit_occurrence,visit_concept_id,Yes,integer,"This field contains a concept id representing the kind of visit, like inpatient or outpatient. 
All concepts in this field should be standard and belong to the Visit domain.","Populate this field based on the kind of visit that took place for the person. For example this could be ""Inpatient Visit"", ""Outpatient Visit"", ""Ambulatory Visit"", etc. This table will contain standard concepts in the Visit domain. These concepts are arranged in a hierarchical structure to facilitate cohort definitions by rolling up to generally familiar Visits adopted in most healthcare systems worldwide. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Visit&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Visit,NA,NA +visit_occurrence,visit_start_date,Yes,date,"For inpatient visits, the start date is typically the admission date. For outpatient visits the start date and end date will be the same.","When populating VISIT_START_DATE, you should think about the patient experience to make decisions on how to define visits. In the case of an inpatient visit this should be the date the patient was admitted to the hospital or institution. In all other cases this should be the date of the patient-provider interaction.",No,No,NA,NA,NA,NA,NA +visit_occurrence,visit_start_datetime,No,datetime,NA,"If no time is given for the start date of a visit, set it to midnight (00:00:0000).",No,No,NA,NA,NA,NA,NA +visit_occurrence,visit_end_date,Yes,date,"For inpatient visits the end date is typically the discharge date. If a Person is still an inpatient in the hospital at the time of the data extract and does not have a visit_end_date, then set the visit_end_date to the date of the data pull.","Visit end dates are mandatory. If end dates are not provided in the source there are three ways in which to derive them: +- Outpatient Visit: visit_end_datetime = visit_start_datetime +- Emergency Room Visit: visit_end_datetime = visit_start_datetime +- Inpatient Visit: Usually there is information about discharge. If not, you should be able to derive the end date from the sudden decline of activity or from the absence of inpatient procedures/drugs. +- Non-hospital institution Visits: Particularly for claims data, if end dates are not provided assume the visit is for the duration of month that it occurs. +For Inpatient Visits ongoing at the date of ETL, put date of processing the data into visit_end_datetime and visit_type_concept_id with 32220 ""Still patient"" to identify the visit as incomplete. +- All other Visits: visit_end_datetime = visit_start_datetime. If this is a one-day visit the end date should match the start date.",No,No,NA,NA,NA,NA,NA +visit_occurrence,visit_end_datetime,No,datetime,"If a Person is still an inpatient in the hospital at the time of the data extract and does not have a visit_end_datetime, then set the visit_end_datetime to the datetime of the data pull.","If no time is given for the end date of a visit, set it to midnight (00:00:0000).",No,No,NA,NA,NA,NA,NA +visit_occurrence,visit_type_concept_id,Yes,Integer,"Use this field to understand the provenance of the visit record, or where the record comes from.","Populate this field based on the provenance of the visit record, as in whether it came from an EHR record or billing claim. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). 
A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).",No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +visit_occurrence,provider_id,No,integer,"There will only be one provider per visit record and the ETL document should clearly state how they were chosen (attending, admitting, etc.). If there are multiple providers associated with a visit in the source, this can be reflected in the event tables (CONDITION_OCCURRENCE, PROCEDURE_OCCURRENCE, etc.) or in the VISIT_DETAIL table.","If there are multiple providers associated with a visit, you will need to choose which one to put here. The additional providers can be stored in the [VISIT_DETAIL](https://ohdsi.github.io/CommonDataModel/cdm531.html#visit_detail) table.",No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +visit_occurrence,care_site_id,No,integer,This field provides information about the Care Site where the Visit took place.,There should only be one Care Site associated with a Visit.,No,Yes,CARE_SITE,CARE_SITE_ID,NA,NA,NA +visit_occurrence,visit_source_value,No,varchar(50),"This field houses the verbatim value from the source data representing the kind of visit that took place (inpatient, outpatient, emergency, etc.)","If there is information about the kind of visit in the source data that value should be stored here. If a visit is an amalgamation of visits from the source then use a hierarchy to choose the visit source value, such as IP -> ER-> OP. This should line up with the logic chosen to determine how visits are created.",No,No,NA,NA,NA,NA,NA +visit_occurrence,visit_source_concept_id,No,integer,NA,If the visit source value is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +visit_occurrence,admitted_from_concept_id,No,integer,"Use this field to determine where the patient was admitted from. This concept is part of the visit domain and can indicate if a patient was admitted to the hospital from a long-term care facility, for example.","If available, map the admitted_from_source_value to a standard concept in the visit domain. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Visit&standardConcept=Standard&page=1&pageSize=15&query=). If a person was admitted from home, set this to 0.",No,Yes,CONCEPT,CONCEPT_ID,Visit,NA,NA +visit_occurrence,admitted_from_source_value,No,varchar(50),NA,"This information may be called something different in the source data but the field is meant to contain a value indicating where a person was admitted from. Typically this applies only to visits that have a length of stay, like inpatient visits or long-term care visits.",No,No,NA,NA,NA,NA,NA +visit_occurrence,discharged_to_concept_id,No,integer,"Use this field to determine where the patient was discharged to after a visit. This concept is part of the visit domain and can indicate if a patient was transferred to another hospital or sent to a long-term care facility, for example. It is assumed that a person is discharged to home therefore there is not a standard concept id for ""home"". Use concept id = 0 when a person is discharged to home.","If available, map the discharged_to_source_value to a standard concept in the visit domain. 
[Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Visit&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Visit,NA,NA +visit_occurrence,discharged_to_source_value,No,varchar(50),NA,"This information may be called something different in the source data but the field is meant to contain a value indicating where a person was discharged to after a visit, as in they went home or were moved to long-term care. Typically this applies only to visits that have a length of stay of a day or more.",No,No,NA,NA,NA,NA,NA +visit_occurrence,preceding_visit_occurrence_id,No,integer,Use this field to find the visit that occurred for the person prior to the given visit. There could be a few days or a few years in between.,"This field can be used to link a visit immediately preceding the current visit. Note this is not symmetrical, and there is no such thing as a ""following_visit_id"".",No,Yes,VISIT_OCCURRENCE,VISIT_OCCURRENCE_ID,NA,NA,NA +visit_detail,visit_detail_id,Yes,integer,Use this to identify unique interactions between a person and the health care system. This identifier links across the other CDM event tables to associate events with a visit detail.,This should be populated by creating a unique identifier for each unique interaction between a person and the healthcare system where the person receives a medical good or service over a span of time.,Yes,No,NA,NA,NA,NA,NA +visit_detail,person_id,Yes,integer,NA,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +visit_detail,visit_detail_concept_id,Yes,integer,"This field contains a concept id representing the kind of visit detail, like inpatient or outpatient. All concepts in this field should be standard and belong to the Visit domain.","Populate this field based on the kind of visit that took place for the person. For example this could be ""Inpatient Visit"", ""Outpatient Visit"", ""Ambulatory Visit"", etc. This table will contain standard concepts in the Visit domain. These concepts are arranged in a hierarchical structure to facilitate cohort definitions by rolling up to generally familiar Visits adopted in most healthcare systems worldwide. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Visit&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Visit,NA,NA +visit_detail,visit_detail_start_date,Yes,date,This is the date of the start of the encounter. This may or may not be equal to the date of the Visit the Visit Detail is associated with.,"When populating VISIT_DETAIL_START_DATE, you should think about the patient experience to make decisions on how to define visits. Most likely this should be the date of the patient-provider interaction.",No,No,NA,NA,NA,NA,NA +visit_detail,visit_detail_start_datetime,No,datetime,NA,"If no time is given for the start date of a visit, set it to midnight (00:00:0000).",No,No,NA,NA,NA,NA,NA +visit_detail,visit_detail_end_date,Yes,date,"This the end date of the patient-provider interaction. If a Person is still an inpatient in the hospital at the time of the data extract and does not have a visit_end_date, then set the visit_end_date to the date of the data pull.","Visit Detail end dates are mandatory. If end dates are not provided in the source there are three ways in which to derive them:
+- Outpatient Visit Detail: visit_detail_end_datetime = visit_detail_start_datetime +- Emergency Room Visit Detail: visit_detail_end_datetime = visit_detail_start_datetime +- Inpatient Visit Detail: Usually there is information about discharge. If not, you should be able to derive the end date from the sudden decline of activity or from the absence of inpatient procedures/drugs. +- Non-hospital institution Visit Details: Particularly for claims data, if end dates are not provided assume the visit is for the duration of the month in which it occurs.
+For Inpatient Visit Details ongoing at the date of ETL, put date of processing the data into visit_detail_end_datetime and visit_detail_type_concept_id with 32220 ""Still patient"" to identify the visit as incomplete. +All other Visit Details: visit_detail_end_datetime = visit_detail_start_datetime.",No,No,NA,NA,NA,NA,NA +visit_detail,visit_detail_end_datetime,No,datetime,"If a Person is still an inpatient in the hospital at the time of the data extract and does not have a visit_end_datetime, then set the visit_end_datetime to the datetime of the data pull.","If no time is given for the end date of a visit, set it to midnight (00:00:0000).",No,No,NA,NA,NA,NA,NA +visit_detail,visit_detail_type_concept_id,Yes,integer,"Use this field to understand the provenance of the visit detail record, or where the record comes from.","Populate this field based on the provenance of the visit detail record, as in whether it came from an EHR record or billing claim. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).",No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +visit_detail,provider_id,No,integer,"There will only be one provider per **visit** record and the ETL document should clearly state how they were chosen (attending, admitting, etc.). This is a typical reason for leveraging the VISIT_DETAIL table as even though each VISIT_DETAIL record can only have one provider, there is no limit to the number of VISIT_DETAIL records that can be associated to a VISIT_OCCURRENCE record.",The additional providers associated to a Visit can be stored in this table where each VISIT_DETAIL record represents a different provider.,No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +visit_detail,care_site_id,No,integer,This field provides information about the Care Site where the Visit Detail took place.,There should only be one Care Site associated with a Visit Detail.,No,Yes,CARE_SITE,CARE_SITE_ID,NA,NA,NA +visit_detail,visit_detail_source_value,No,varchar(50),"This field houses the verbatim value from the source data representing the kind of visit detail that took place (inpatient, outpatient, emergency, etc.)","If there is information about the kind of visit detail in the source data that value should be stored here. If a visit is an amalgamation of visits from the source then use a hierarchy to choose the VISIT_DETAIL_SOURCE_VALUE, such as IP -> ER -> OP. This should line up with the logic chosen to determine how visits are created.",No,No,NA,NA,NA,NA,NA +visit_detail,visit_detail_source_concept_id,No,Integer,NA,If the VISIT_DETAIL_SOURCE_VALUE is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +visit_detail,admitted_from_concept_id,No,Integer,"Use this field to determine where the patient was admitted from. This concept is part of the visit domain and can indicate if a patient was admitted to the hospital from a long-term care facility, for example.","If available, map the admitted_from_source_value to a standard concept in the visit domain. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Visit&standardConcept=Standard&page=1&pageSize=15&query=).
If the person was admitted from home, set this to 0.",No,Yes,CONCEPT,CONCEPT_ID,Visit,NA,NA +visit_detail,admitted_from_source_value,No,varchar(50),NA,"This information may be called something different in the source data but the field is meant to contain a value indicating where a person was admitted from. Typically this applies only to visits that have a length of stay, like inpatient visits or long-term care visits.",No,No,NA,NA,NA,NA,NA +visit_detail,discharged_to_source_value,No,varchar(50),NA,"This information may be called something different in the source data but the field is meant to contain a value indicating where a person was discharged to after a visit, as in they went home or were moved to long-term care. Typically this applies only to visits that have a length of stay of a day or more.",No,No,NA,NA,NA,NA,NA +visit_detail,discharged_to_concept_id,No,integer,"Use this field to determine where the patient was discharged to after a visit. This concept is part of the visit domain and can indicate if a patient was transferred to another hospital or sent to a long-term care facility, for example. It is assumed that a person is discharged to home therefore there is not a standard concept id for ""home"". Use concept id = 0 when a person is discharged to home.","If available, map the DISCHARGE_TO_SOURCE_VALUE to a Standard Concept in the Visit domain. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Visit&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Visit,NA,NA +visit_detail,preceding_visit_detail_id,No,integer,Use this field to find the visit detail that occurred for the person prior to the given visit detail record. There could be a few days or a few years in between.,"The PRECEDING_VISIT_DETAIL_ID can be used to link a visit immediately preceding the current Visit Detail. Note this is not symmetrical, and there is no such thing as a ""following_visit_id"".",No,Yes,VISIT_DETAIL,VISIT_DETAIL_ID,NA,NA,NA +visit_detail,parent_visit_detail_id,No,integer,Use this field to find the visit detail that subsumes the given visit detail record. This is used in the case that a visit detail record needs to be nested beyond the VISIT_OCCURRENCE/VISIT_DETAIL relationship.,"If there are multiple nested levels to how Visits are represented in the source, the VISIT_DETAIL_PARENT_ID can be used to record this relationship.",No,Yes,VISIT_DETAIL,VISIT_DETAIL_ID,NA,NA,NA +visit_detail,visit_occurrence_id,Yes,integer,Use this field to link the VISIT_DETAIL record to its VISIT_OCCURRENCE.,Put the VISIT_OCCURRENCE_ID that subsumes the VISIT_DETAIL record here.,No,Yes,VISIT_OCCURRENCE,VISIT_OCCURRENCE_ID,NA,NA,NA +condition_occurrence,condition_occurrence_id,Yes,integer,The unique key given to a condition record for a person. Refer to the ETL for how duplicate conditions during the same visit were handled.,"Each instance of a condition present in the source data should be assigned this unique key. In some cases, a person can have multiple records of the same condition within the same visit. 
It is valid to keep these duplicates and assign them individual, unique, CONDITION_OCCURRENCE_IDs, though it is up to the ETL how they should be handled.",Yes,No,NA,NA,NA,NA,NA +condition_occurrence,person_id,Yes,integer,The PERSON_ID of the PERSON for whom the condition is recorded.,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +condition_occurrence,condition_concept_id,Yes,integer,"The CONDITION_CONCEPT_ID field is recommended for primary use in analyses, and must be used for network studies. This is the standard concept mapped from the source value which represents a condition","The CONCEPT_ID that the CONDITION_SOURCE_VALUE maps to. Only records whose source values map to concepts with a domain of ""Condition"" should go in this table. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Condition&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Condition,NA,NA +condition_occurrence,condition_start_date,Yes,date,Use this date to determine the start date of the condition,"Most often data sources do not have the idea of a start date for a condition. Rather, if a source only has one date associated with a condition record it is acceptable to use that date for both the CONDITION_START_DATE and the CONDITION_END_DATE.",No,No,NA,NA,NA,NA,NA +condition_occurrence,condition_start_datetime,No,datetime,NA,If a source does not specify datetime the convention is to set the time to midnight (00:00:0000),No,No,NA,NA,NA,NA,NA +condition_occurrence,condition_end_date,No,date,Use this date to determine the end date of the condition,"Most often data sources do not have the idea of a start date for a condition. Rather, if a source only has one date associated with a condition record it is acceptable to use that date for both the CONDITION_START_DATE and the CONDITION_END_DATE.",No,No,NA,NA,NA,NA,NA +condition_occurrence,condition_end_datetime,No,datetime,NA,If a source does not specify datetime the convention is to set the time to midnight (00:00:0000),No,No,NA,NA,NA,NA,NA +condition_occurrence,condition_type_concept_id,Yes,integer,"This field can be used to determine the provenance of the Condition record, as in whether the condition was from an EHR system, insurance claim, registry, or other sources.",Choose the CONDITION_TYPE_CONCEPT_ID that best represents the provenance of the record. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).,No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +condition_occurrence,condition_status_concept_id,No,integer,"This concept represents the point during the visit the diagnosis was given (admitting diagnosis, final diagnosis), whether the diagnosis was determined due to laboratory findings, if the diagnosis was exclusionary, or if it was a preliminary diagnosis, among others.","Choose the Concept in the Condition Status domain that best represents the point during the visit when the diagnosis was given. These can include admitting diagnosis, principal diagnosis, and secondary diagnosis. 
[Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Condition+Status&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Condition Status,NA,NA +condition_occurrence,stop_reason,No,varchar(20),The Stop Reason indicates why a Condition is no longer valid with respect to the purpose within the source data. Note that a Stop Reason does not necessarily imply that the condition is no longer occurring.,This information is often not populated in source data and it is a valid etl choice to leave it blank if the information does not exist.,No,No,NA,NA,NA,NA,NA +condition_occurrence,provider_id,No,integer,"The provider associated with condition record, e.g. the provider who made the diagnosis or the provider who recorded the symptom.","The ETL may need to make a choice as to which PROVIDER_ID to put here. Based on what is available this may or may not be different than the provider associated with the overall VISIT_OCCURRENCE record, for example the admitting vs attending physician on an EHR record.",No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +condition_occurrence,visit_occurrence_id,No,integer,The visit during which the condition occurred.,"Depending on the structure of the source data, this may have to be determined based on dates. If a CONDITION_START_DATE occurs within the start and end date of a Visit it is a valid ETL choice to choose the VISIT_OCCURRENCE_ID from the Visit that subsumes it, even if not explicitly stated in the data. While not required, an attempt should be made to locate the VISIT_OCCURRENCE_ID of the CONDITION_OCCURRENCE record.",No,Yes,VISIT_OCCURRENCE,VISIT_OCCURRENCE_ID,NA,NA,NA +condition_occurrence,visit_detail_id,No,integer,"The VISIT_DETAIL record during which the condition occurred. For example, if the person was in the ICU at the time of the diagnosis the VISIT_OCCURRENCE record would reflect the overall hospital stay and the VISIT_DETAIL record would reflect the ICU stay during the hospital visit.",Same rules apply as for the VISIT_OCCURRENCE_ID.,No,Yes,VISIT_DETAIL,VISIT_DETAIL_ID,NA,NA,NA +condition_occurrence,condition_source_value,No,varchar(50),"This field houses the verbatim value from the source data representing the condition that occurred. For example, this could be an ICD10 or Read code.",This code is mapped to a Standard Condition Concept in the Standardized Vocabularies and the original code is stored here for reference.,No,No,NA,NA,NA,NA,NA +condition_occurrence,condition_source_concept_id,No,integer,"This is the concept representing the condition source value and may not necessarily be standard. This field is discouraged from use in analysis because it is not required to contain Standard Concepts that are used across the OHDSI community, and should only be used when Standard Concepts do not adequately represent the source detail for the Condition necessary for a given analytic use case. Consider using CONDITION_CONCEPT_ID instead to enable standardized analytics that can be consistent across the network.",If the CONDITION_SOURCE_VALUE is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +condition_occurrence,condition_status_source_value,No,varchar(50),This field houses the verbatim value from the source data representing the condition status.,This information may be called something different in the source data but the field is meant to contain a value indicating when and how a diagnosis was given to a patient. 
This source value is mapped to a standard concept which is stored in the CONDITION_STATUS_CONCEPT_ID field.,No,No,NA,NA,NA,NA,NA +drug_exposure,drug_exposure_id,Yes,integer,The unique key given to records of drug dispensings or administrations for a person. Refer to the ETL for how duplicate drugs during the same visit were handled.,"Each instance of a drug dispensing or administration present in the source data should be assigned this unique key. In some cases, a person can have multiple records of the same drug within the same visit. It is valid to keep these duplicates and assign them individual, unique, DRUG_EXPOSURE_IDs, though it is up to the ETL how they should be handled.",Yes,No,NA,NA,NA,NA,NA +drug_exposure,person_id,Yes,integer,The PERSON_ID of the PERSON for whom the drug dispensing or administration is recorded. This may be a system generated code.,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +drug_exposure,drug_concept_id,Yes,integer,"The DRUG_CONCEPT_ID field is recommended for primary use in analyses, and must be used for network studies. This is the standard concept mapped from the source concept id which represents a drug product or molecule otherwise introduced to the body. The drug concepts can have a varying degree of information about drug strength and dose. This information is relevant in the context of quantity and administration information in the subsequent fields plus strength information from the DRUG_STRENGTH table, provided as part of the standard vocabulary download.","The CONCEPT_ID that the DRUG_SOURCE_VALUE maps to. The concept id should be derived either from mapping from the source concept id or by picking the drug concept representing the most amount of detail you have. Records whose source values map to standard concepts with a domain of Drug should go in this table. When the Drug Source Value of the code cannot be translated into Standard Drug Concept IDs, a Drug exposure entry is stored with only the corresponding SOURCE_CONCEPT_ID and DRUG_SOURCE_VALUE and a DRUG_CONCEPT_ID of 0. The Drug Concept with the most detailed content of information is preferred during the mapping process. These are indicated in the CONCEPT_CLASS_ID field of the Concept and are recorded in the following order of precedence: 'Branded Pack', 'Clinical Pack', 'Branded Drug', 'Clinical Drug', 'Branded Drug Component', 'Clinical Drug Component', 'Branded Drug Form', 'Clinical Drug Form', and only if no other information is available 'Ingredient'. Note: If only the drug class is known, the DRUG_CONCEPT_ID field should contain 0. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Drug&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Drug,NA,NA +drug_exposure,drug_exposure_start_date,Yes,date,Use this date to determine the start date of the drug record.,"Valid entries include a start date of a prescription, the date a prescription was filled, or the date on which a Drug administration was recorded. It is a valid ETL choice to use the date the drug was ordered as the DRUG_EXPOSURE_START_DATE.",No,No,NA,NA,NA,NA,NA +drug_exposure,drug_exposure_start_datetime,No,datetime,NA,"This is not required, though it is in v6. 
If a source does not specify datetime the convention is to set the time to midnight (00:00:0000)",No,No,NA,NA,NA,NA,NA +drug_exposure,drug_exposure_end_date,Yes,date,The DRUG_EXPOSURE_END_DATE denotes the day the drug exposure ended for the patient.,"If this information is not explicitly available in the data, infer the end date using the following methods:

1. Start first with duration or days supply using the calculation drug start date + days supply -1 day. 2. Use quantity divided by daily dose that you may obtain from the sig or a source field (or assumed daily dose of 1) for solid, indivisible drug products. If quantity represents ingredient amount, quantity divided by daily dose * concentration (from drug_strength) drug concept id tells you the dose form. 3. If it is an administration record, set drug end date equal to drug start date. If the record is a written prescription then set end date to start date + 29. If the record is a mail-order prescription set end date to start date + 89. The end date must be equal to or greater than the start date. Ibuprofen 20mg/mL oral solution concept tells us this is oral solution. Calculate duration as quantity (200 example) * daily dose (5mL) /concentration (20mg/mL) 200*5/20 = 50 days. [Examples by dose form](https://ohdsi.github.io/CommonDataModel/drug_dose.html)",No,No,NA,NA,NA,NA,NA +drug_exposure,drug_exposure_end_datetime,No,datetime,NA,"This is not required, though it is in v6. If a source does not specify datetime the convention is to set the time to midnight (00:00:0000)",No,No,NA,NA,NA,NA,NA +drug_exposure,verbatim_end_date,No,date,"This is the end date of the drug exposure as it appears in the source data, if it is given",Put the end date or discontinuation date as it appears from the source data or leave blank if unavailable.,No,No,NA,NA,NA,NA,NA +drug_exposure,drug_type_concept_id,Yes,integer,"You can use the TYPE_CONCEPT_ID to delineate between prescriptions written vs. prescriptions dispensed vs. medication history vs. patient-reported exposure, etc.","Choose the drug_type_concept_id that best represents the provenance of the record, for example whether it came from a record of a prescription written or physician administered drug. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).",No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +drug_exposure,stop_reason,No,varchar(20),"The reason a person stopped a medication as it is represented in the source. Reasons include regimen completed, changed, removed, etc. This field will be retired in v6.0.",This information is often not populated in source data and it is a valid etl choice to leave it blank if the information does not exist.,No,No,NA,NA,NA,NA,NA +drug_exposure,refills,No,integer,This is only filled in when the record is coming from a prescription written; this field is meant to represent intended refills at time of the prescription.,NA,No,No,NA,NA,NA,NA,NA +drug_exposure,quantity,No,float,NA,"To find the dose form of a drug the RELATIONSHIP table can be used where the relationship_id is 'Has dose form'. If liquid, quantity stands for the total amount dispensed or ordered of ingredient in the units given by the drug_strength table. If the unit from the source data does not align with the unit in the DRUG_STRENGTH table the quantity should be converted to the correct unit given in DRUG_STRENGTH. For clinical drugs with fixed dose forms (tablets etc.) the quantity is the number of units/tablets/capsules prescribed or dispensed (can be partial, but then only 1/2 or 1/3, not 0.01). For clinical drugs with divisible dose forms (injections) the quantity is the amount of ingredient the patient got.
For example, if the injection is 2mg/mL but the patient got 80mL then quantity is reported as 160. +Quantified clinical drugs with divisible dose forms (prefilled syringes), the quantity is the amount of ingredient similar to clinical drugs. Please see [how to calculate drug dose](https://ohdsi.github.io/CommonDataModel/drug_dose.html) for more information. +",No,No,NA,NA,NA,NA,NA +drug_exposure,days_supply,No,integer,NA,Days supply of the drug. This should be the verbatim days_supply as given on the prescription. If the drug is physician administered use duration end date if given or set to 1 as default if duration is not available.,No,No,NA,NA,NA,NA,NA +drug_exposure,sig,No,varchar(MAX),This is the verbatim instruction for the drug as written by the provider.,"Put the written out instructions for the drug as it is verbatim in the source, if available.",No,No,NA,NA,NA,NA,NA +drug_exposure,route_concept_id,No,integer,NA,The standard CONCEPT_ID that the ROUTE_SOURCE_VALUE maps to in the route domain.,No,Yes,CONCEPT,CONCEPT_ID,Route,NA,NA +drug_exposure,lot_number,No,varchar(50),NA,NA,No,No,NA,NA,NA,NA,NA +drug_exposure,provider_id,No,integer,"The Provider associated with drug record, e.g. the provider who wrote the prescription or the provider who administered the drug.","The ETL may need to make a choice as to which PROVIDER_ID to put here. Based on what is available this may or may not be different than the provider associated with the overall VISIT_OCCURRENCE record, for example the ordering vs administering physician on an EHR record.",No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +drug_exposure,visit_occurrence_id,No,integer,"The Visit during which the drug was prescribed, administered or dispensed.",To populate this field drug exposures must be explicitly initiated in the visit.,No,Yes,VISIT_OCCURRENCE,VISIT_OCCURRENCE_ID,NA,NA,NA +drug_exposure,visit_detail_id,No,integer,"The VISIT_DETAIL record during which the drug exposure occurred. For example, if the person was in the ICU at the time of the drug administration the VISIT_OCCURRENCE record would reflect the overall hospital stay and the VISIT_DETAIL record would reflect the ICU stay during the hospital visit.",Same rules apply as for the VISIT_OCCURRENCE_ID.,No,Yes,VISIT_DETAIL,VISIT_DETAIL_ID,NA,NA,NA +drug_exposure,drug_source_value,No,varchar(50),"This field houses the verbatim value from the source data representing the drug exposure that occurred. For example, this could be an NDC or Gemscript code.",This code is mapped to a Standard Drug Concept in the Standardized Vocabularies and the original code is stored here for reference.,No,No,NA,NA,NA,NA,NA +drug_exposure,drug_source_concept_id,No,integer,"This is the concept representing the drug source value and may not necessarily be standard. This field is discouraged from use in analysis because it is not required to contain Standard Concepts that are used across the OHDSI community, and should only be used when Standard Concepts do not adequately represent the source detail for the Drug necessary for a given analytic use case. 
Consider using DRUG_CONCEPT_ID instead to enable standardized analytics that can be consistent across the network.",If the DRUG_SOURCE_VALUE is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +drug_exposure,route_source_value,No,varchar(50),This field houses the verbatim value from the source data representing the drug route.,This information may be called something different in the source data but the field is meant to contain a value indicating when and how a drug was given to a patient. This source value is mapped to a standard concept which is stored in the ROUTE_CONCEPT_ID field.,No,No,NA,NA,NA,NA,NA +drug_exposure,dose_unit_source_value,No,varchar(50),This field houses the verbatim value from the source data representing the dose unit of the drug given.,This information may be called something different in the source data but the field is meant to contain a value indicating the unit of dosage of drug given to the patient. **This is an older column and will be deprecated in an upcoming version.**,No,No,NA,NA,NA,NA,NA +procedure_occurrence,procedure_occurrence_id,Yes,integer,The unique key given to a procedure record for a person. Refer to the ETL for how duplicate procedures during the same visit were handled.,"Each instance of a procedure occurrence in the source data should be assigned this unique key. In some cases, a person can have multiple records of the same procedure within the same visit. It is valid to keep these duplicates and assign them individual, unique, PROCEDURE_OCCURRENCE_IDs, though it is up to the ETL how they should be handled.",Yes,No,NA,NA,NA,NA,NA +procedure_occurrence,person_id,Yes,integer,The PERSON_ID of the PERSON for whom the procedure is recorded. This may be a system generated code.,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +procedure_occurrence,procedure_concept_id,Yes,integer,"The PROCEDURE_CONCEPT_ID field is recommended for primary use in analyses, and must be used for network studies. This is the standard concept mapped from the source value which represents a procedure","The CONCEPT_ID that the PROCEDURE_SOURCE_VALUE maps to. Only records whose source values map to standard concepts with a domain of ""Procedure"" should go in this table. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Procedure&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Procedure,NA,NA +procedure_occurrence,procedure_date,Yes,date,Use this date to determine the date the procedure started.,This is meant to be the **start date** of the procedure. It will be renamed in a future version to **PROCEDURE_START_DATE**.,No,No,NA,NA,NA,NA,NA +procedure_occurrence,procedure_datetime,No,datetime,NA,"If the procedure has a start time in the native date, use this field to house that information. This will be renamed in a future version to **PROCEDURE_START_DATETIME**.",No,No,NA,NA,NA,NA,NA +procedure_occurrence,procedure_end_date,No,date,Use this field to house the date that the procedure ended.,This is meant to be the end date of the procedure. 
It is not required and for most cases will be the same as the PROCEDURE_START_DATE.,No,No,NA,NA,NA,NA,NA +procedure_occurrence,procedure_end_datetime,No,datetime,Use this field to house the datetime that the procedure ended.,This is meant to house the end datetime of the procedure and will most often be used in conjunction with the procedure_start_datetime to determine the length of the procedure.,No,No,NA,NA,NA,NA,NA +procedure_occurrence,procedure_type_concept_id,Yes,integer,"This field can be used to determine the provenance of the Procedure record, as in whether the procedure was from an EHR system, insurance claim, registry, or other sources.","Choose the PROCEDURE_TYPE_CONCEPT_ID that best represents the provenance of the record, for example whether it came from an EHR record or billing claim. If a procedure is recorded as an EHR encounter, the PROCEDURE_TYPE_CONCEPT would be 'EHR encounter record'. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).",No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +procedure_occurrence,modifier_concept_id,No,integer,The modifiers are intended to give additional information about the procedure but as of now the vocabulary is under review.,"It is up to the ETL to choose how to map modifiers if they exist in source data. These concepts are typically distinguished by 'Modifier' concept classes (e.g., 'CPT4 Modifier' as part of the 'CPT4' vocabulary). If there is more than one modifier on a record, one should be chosen that pertains to the procedure rather than provider. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?conceptClass=CPT4+Modifier&conceptClass=HCPCS+Modifier&vocabulary=CPT4&vocabulary=HCPCS&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +procedure_occurrence,quantity,No,integer,"If the quantity value is omitted, a single procedure is assumed.","If a Procedure has a quantity of '0' in the source, this should default to '1' in the ETL. If there is a record in the source it can be assumed the exposure occurred at least once",No,No,NA,NA,NA,NA,NA +procedure_occurrence,provider_id,No,integer,"The provider associated with the procedure record, e.g. the provider who performed the Procedure.","The ETL may need to make a choice as to which PROVIDER_ID to put here. Based on what is available this may or may not be different than the provider associated with the overall VISIT_OCCURRENCE record, for example the admitting vs attending physician on an EHR record.",No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +procedure_occurrence,visit_occurrence_id,No,integer,The visit during which the procedure occurred.,"Depending on the structure of the source data, this may have to be determined based on dates. If a PROCEDURE_DATE occurs within the start and end date of a Visit it is a valid ETL choice to choose the VISIT_OCCURRENCE_ID from the Visit that subsumes it, even if not explicitly stated in the data. While not required, an attempt should be made to locate the VISIT_OCCURRENCE_ID of the PROCEDURE_OCCURRENCE record.",No,Yes,VISIT_OCCURRENCE,VISIT_OCCURRENCE_ID,NA,NA,NA +procedure_occurrence,visit_detail_id,No,integer,"The VISIT_DETAIL record during which the Procedure occurred. 
For example, if the Person was in the ICU at the time of the Procedure the VISIT_OCCURRENCE record would reflect the overall hospital stay and the VISIT_DETAIL record would reflect the ICU stay during the hospital visit.",Same rules apply as for the VISIT_OCCURRENCE_ID.,No,Yes,VISIT_DETAIL,VISIT_DETAIL_ID,NA,NA,NA +procedure_occurrence,procedure_source_value,No,varchar(50),"This field houses the verbatim value from the source data representing the procedure that occurred. For example, this could be an CPT4 or OPCS4 code.",Use this value to look up the source concept id and then map the source concept id to a standard concept id.,No,No,NA,NA,NA,NA,NA +procedure_occurrence,procedure_source_concept_id,No,integer,"This is the concept representing the procedure source value and may not necessarily be standard. This field is discouraged from use in analysis because it is not required to contain Standard Concepts that are used across the OHDSI community, and should only be used when Standard Concepts do not adequately represent the source detail for the Procedure necessary for a given analytic use case. Consider using PROCEDURE_CONCEPT_ID instead to enable standardized analytics that can be consistent across the network.",If the PROCEDURE_SOURCE_VALUE is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +procedure_occurrence,modifier_source_value,No,varchar(50),NA,The original modifier code from the source is stored here for reference.,No,No,NA,NA,NA,NA,NA +device_exposure,device_exposure_id,Yes,integer,The unique key given to records a person's exposure to a foreign physical object or instrument.,Each instance of an exposure to a foreign object or device present in the source data should be assigned this unique key.,Yes,No,NA,NA,NA,NA,NA +device_exposure,person_id,Yes,integer,NA,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +device_exposure,device_concept_id,Yes,integer,"The DEVICE_CONCEPT_ID field is recommended for primary use in analyses, and must be used for network studies. This is the standard concept mapped from the source concept id which represents a foreign object or instrument the person was exposed to.",The CONCEPT_ID that the DEVICE_SOURCE_VALUE maps to.,No,Yes,CONCEPT,CONCEPT_ID,Device,NA,NA +device_exposure,device_exposure_start_date,Yes,date,Use this date to determine the start date of the device record.,"Valid entries include a start date of a procedure to implant a device, the date of a prescription for a device, or the date of device administration.",No,No,NA,NA,NA,NA,NA +device_exposure,device_exposure_start_datetime,No,datetime,NA,"This is not required, though it is in v6. 
If a source does not specify datetime the convention is to set the time to midnight (00:00:0000)",No,No,NA,NA,NA,NA,NA +device_exposure,device_exposure_end_date,No,date,"The DEVICE_EXPOSURE_END_DATE denotes the day the device exposure ended for the patient, if given.",Put the end date or discontinuation date as it appears from the source data or leave blank if unavailable.,No,No,NA,NA,NA,NA,NA +device_exposure,device_exposure_end_datetime,No,datetime,NA,If a source does not specify datetime the convention is to set the time to midnight (00:00:0000),No,No,NA,NA,NA,NA,NA +device_exposure,device_type_concept_id,Yes,integer,"You can use the TYPE_CONCEPT_ID to denote the provenance of the record, as in whether the record is from administrative claims or EHR.","Choose the device_type_concept_id that best represents the provenance of the record, for example whether it came from a record of a prescription written or a physician-administered device. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).",No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +device_exposure,unique_device_id,No,varchar(255),"This is the Unique Device Identification (UDI-DI) number for devices regulated by the FDA, if given.","For medical devices that are regulated by the FDA, a Unique Device Identification (UDI) is provided if available in the data source and is recorded in the UNIQUE_DEVICE_ID field.",No,No,NA,NA,NA,NA,NA +device_exposure,production_id,No,varchar(255),This is the Production Identifier (UDI-PI) portion of the Unique Device Identification.,NA,No,No,NA,NA,NA,NA,NA +device_exposure,quantity,No,integer,NA,NA,No,No,NA,NA,NA,NA,NA +device_exposure,provider_id,No,integer,"The Provider associated with device record, e.g. the provider who wrote the prescription or the provider who implanted the device.",The ETL may need to make a choice as to which PROVIDER_ID to put here. Based on what is available this may or may not be different than the provider associated with the overall VISIT_OCCURRENCE record.,No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +device_exposure,visit_occurrence_id,No,integer,The Visit during which the device was prescribed or given.,To populate this field device exposures must be explicitly initiated in the visit.,No,Yes,VISIT_OCCURRENCE,VISIT_OCCURRENCE_ID,NA,NA,NA +device_exposure,visit_detail_id,No,integer,The Visit Detail during which the device was prescribed or given.,To populate this field device exposures must be explicitly initiated in the visit detail record.,No,Yes,VISIT_DETAIL,VISIT_DETAIL_ID,NA,NA,NA +device_exposure,device_source_value,No,varchar(50),"This field houses the verbatim value from the source data representing the device exposure that occurred. For example, this could be an NDC or Gemscript code.",This code is mapped to a Standard Device Concept in the Standardized Vocabularies and the original code is stored here for reference.,No,No,NA,NA,NA,NA,NA +device_exposure,device_source_concept_id,No,integer,"This is the concept representing the device source value and may not necessarily be standard.
This field is discouraged from use in analysis because it is not required to contain Standard Concepts that are used across the OHDSI community, and should only be used when Standard Concepts do not adequately represent the source detail for the Device necessary for a given analytic use case. Consider using DEVICE_CONCEPT_ID instead to enable standardized analytics that can be consistent across the network.",If the DEVICE_SOURCE_VALUE is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +device_exposure,unit_concept_id,No,integer,UNIT_SOURCE_VALUES should be mapped to a Standard Concept in the Unit domain that best represents the unit as given in the source data.,"There is no standardization requirement for units associated with DEVICE_CONCEPT_IDs, however, it is the responsibility of the ETL to choose the most plausible unit. If there is no unit associated with a Device record, set to NULL.",No,Yes,CONCEPT,CONCEPT_ID,Unit,NA,NA +device_exposure,unit_source_value,No,varchar(50),"This field houses the verbatim value from the source data representing the unit of the Device. For example, blood transfusions are considered devices and can be given in mL quantities.","This code is mapped to a Standard Condition Concept in the Standardized Vocabularies and the original code is stored here for reference. Using the blood transfusion example, blood transfusion is represented by the DEVICE_CONCEPT_ID and the unit (mL) would be housed in the UNIT_SOURCE_VALUE and mapped to a standard concept in the unit domain.",No,No,NA,NA,NA,NA,NA +device_exposure,unit_source_concept_id,No,integer,"This is the concept representing the UNIT_SOURCE_VALUE and may not necessarily be standard. This field is discouraged from use in analysis because it is not required to contain Standard Concepts that are used across the OHDSI community, and should only be used when Standard Concepts do not adequately represent the source detail for the Unit necessary for a given analytic use case. Consider using UNIT_CONCEPT_ID instead to enable standardized analytics that can be consistent across the network.",If the UNIT_SOURCE_VALUE is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +measurement,measurement_id,Yes,integer,The unique key given to a Measurement record for a Person. Refer to the ETL for how duplicate Measurements during the same Visit were handled.,"Each instance of a measurement present in the source data should be assigned this unique key. In some cases, a person can have multiple records of the same measurement within the same visit. It is valid to keep these duplicates and assign them individual, unique, MEASUREMENT_IDs, though it is up to the ETL how they should be handled.",Yes,No,NA,NA,NA,NA,NA +measurement,person_id,Yes,integer,The PERSON_ID of the Person for whom the Measurement is recorded. This may be a system generated code.,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +measurement,measurement_concept_id,Yes,integer,"The MEASUREMENT_CONCEPT_ID field is recommended for primary use in analyses, and must be used for network studies.","The CONCEPT_ID that the MEASUREMENT_SOURCE_CONCEPT_ID maps to. 
Only records whose SOURCE_CONCEPT_IDs map to Standard Concepts with a domain of ""Measurement"" should go in this table.",No,Yes,CONCEPT,CONCEPT_ID,Measurement,NA,NA +measurement,measurement_date,Yes,date,Use this date to determine the date of the measurement.,"If there are multiple dates in the source data associated with a record such as order_date, draw_date, and result_date, choose the one that is closest to the date the sample was drawn from the patient.",No,No,NA,NA,NA,NA,NA +measurement,measurement_datetime,No,datetime,NA,"This is not required, though it is in v6. If a source does not specify datetime the convention is to set the time to midnight (00:00:0000)",No,No,NA,NA,NA,NA,NA +measurement,measurement_time,No,varchar(10),NA,This is present for backwards compatibility and will be deprecated in an upcoming version.,No,No,NA,NA,NA,NA,NA +measurement,measurement_type_concept_id,Yes,integer,"This field can be used to determine the provenance of the Measurement record, as in whether the measurement was from an EHR system, insurance claim, registry, or other sources.","Choose the MEASUREMENT_TYPE_CONCEPT_ID that best represents the provenance of the record, for example whether it came from an EHR record or billing claim. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).",No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +measurement,operator_concept_id,No,integer,"The meaning of Concept [4172703](https://athena.ohdsi.org/search-terms/terms/4172703) for '=' is identical to omission of a OPERATOR_CONCEPT_ID value. Since the use of this field is rare, it's important when devising analyses to not to forget testing for the content of this field for values different from =.","Operators are <, <=, =, >=, > and these concepts belong to the 'Meas Value Operator' domain. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Meas+Value+Operator&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +measurement,value_as_number,No,float,"This is the numerical value of the Result of the Measurement, if available. Note that measurements such as blood pressures will be split into their component parts i.e. one record for systolic, one record for diastolic.","If there is a negative value coming from the source, set the VALUE_AS_NUMBER to NULL, with the exception of the following Measurements (listed as LOINC codes):
- [1925-7](https://athena.ohdsi.org/search-terms/terms/3003396) Base excess in Arterial blood by calculation - [1927-3](https://athena.ohdsi.org/search-terms/terms/3002032) Base excess in Venous blood by calculation - [8632-2](https://athena.ohdsi.org/search-terms/terms/3006277) QRS-Axis - [11555-0](https://athena.ohdsi.org/search-terms/terms/3012501) Base excess in Blood by calculation - [1926-5](https://athena.ohdsi.org/search-terms/terms/3003129) Base excess in Capillary blood by calculation - [28638-5](https://athena.ohdsi.org/search-terms/terms/3004959) Base excess in Arterial cord blood by calculation [28639-3](https://athena.ohdsi.org/search-terms/terms/3007435) Base excess in Venous cord blood by calculation",No,No,NA,NA,NA,NA,NA +measurement,value_as_concept_id,No,integer,If the raw data gives a categorial result for measurements those values are captured and mapped to standard concepts in the 'Meas Value' domain.,"If the raw data provides categorial results as well as continuous results for measurements, it is a valid ETL choice to preserve both values. The continuous value should go in the VALUE_AS_NUMBER field and the categorical value should be mapped to a standard concept in the 'Meas Value' domain and put in the VALUE_AS_CONCEPT_ID field. This is also the destination for the 'Maps to value' relationship.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +measurement,unit_concept_id,No,integer,"There is currently no recommended unit for individual measurements, i.e. it is not mandatory to represent Hemoglobin a1C measurements as a percentage. UNIT_SOURCE_VALUES should be mapped to a Standard Concept in the Unit domain that best represents the unit as given in the source data.","There is no standardization requirement for units associated with MEASUREMENT_CONCEPT_IDs, however, it is the responsibility of the ETL to choose the most plausible unit.",No,Yes,CONCEPT,CONCEPT_ID,Unit,NA,NA +measurement,range_low,No,float,Ranges have the same unit as the VALUE_AS_NUMBER. These ranges are provided by the source and should remain NULL if not given.,If reference ranges for upper and lower limit of normal as provided (typically by a laboratory) these are stored in the RANGE_HIGH and RANGE_LOW fields. This should be set to NULL if not provided.,No,No,NA,NA,NA,NA,NA +measurement,range_high,No,float,Ranges have the same unit as the VALUE_AS_NUMBER. These ranges are provided by the source and should remain NULL if not given.,If reference ranges for upper and lower limit of normal as provided (typically by a laboratory) these are stored in the RANGE_HIGH and RANGE_LOW fields. This should be set to NULL if not provided.,No,No,NA,NA,NA,NA,NA +measurement,provider_id,No,integer,"The provider associated with measurement record, e.g. the provider who ordered the test or the provider who recorded the result.",The ETL may need to make a choice as to which PROVIDER_ID to put here. Based on what is available this may or may not be different than the provider associated with the overall VISIT_OCCURRENCE record. For example the admitting vs attending physician on an EHR record.,No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +measurement,visit_occurrence_id,No,integer,The visit during which the Measurement occurred.,"Depending on the structure of the source data, this may have to be determined based on dates. If a MEASUREMENT_DATE occurs within the start and end date of a Visit it is a valid ETL choice to choose the VISIT_OCCURRENCE_ID from the visit that subsumes it, even if not explicitly stated in the data. 
While not required, an attempt should be made to locate the VISIT_OCCURRENCE_ID of the measurement record. If a measurement is related to a visit explicitly in the source data, it is possible that the result date of the Measurement falls outside of the bounds of the Visit dates.",No,Yes,VISIT_OCCURRENCE,VISIT_OCCURRENCE_ID,NA,NA,NA +measurement,visit_detail_id,No,integer,"The VISIT_DETAIL record during which the Measurement occurred. For example, if the Person was in the ICU at the time the VISIT_OCCURRENCE record would reflect the overall hospital stay and the VISIT_DETAIL record would reflect the ICU stay during the hospital visit.",Same rules apply as for the VISIT_OCCURRENCE_ID.,No,Yes,VISIT_DETAIL,VISIT_DETAIL_ID,NA,NA,NA +measurement,measurement_source_value,No,varchar(50),"This field houses the verbatim value from the source data representing the Measurement that occurred. For example, this could be an ICD10 or Read code.",This code is mapped to a Standard Measurement Concept in the Standardized Vocabularies and the original code is stored here for reference.,No,No,NA,NA,NA,NA,NA +measurement,measurement_source_concept_id,No,integer,"This is the concept representing the MEASUREMENT_SOURCE_VALUE and may not necessarily be standard. This field is discouraged from use in analysis because it is not required to contain Standard Concepts that are used across the OHDSI community, and should only be used when Standard Concepts do not adequately represent the source detail for the Measurement necessary for a given analytic use case. Consider using MEASUREMENT_CONCEPT_ID instead to enable standardized analytics that can be consistent across the network.",If the MEASUREMENT_SOURCE_VALUE is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +measurement,unit_source_value,No,varchar(50),This field houses the verbatim value from the source data representing the unit of the Measurement that occurred.,This code is mapped to a Standard Condition Concept in the Standardized Vocabularies and the original code is stored here for reference.,No,No,NA,NA,NA,NA,NA +measurement,unit_source_concept_id,No,integer,"""This is the concept representing the UNIT_SOURCE_VALUE and may not necessarily be standard. This field is discouraged from use in analysis because it is not required to contain Standard Concepts that are used across the OHDSI community, and should only be used when Standard Concepts do not adequately represent the source detail for the Measurement necessary for a given analytic use case. 
Consider using UNIT_CONCEPT_ID instead to enable standardized analytics that can be consistent across the network.""",If the UNIT_SOURCE_VALUE is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +measurement,value_source_value,No,varchar(50),This field houses the verbatim result value of the Measurement from the source data .,"If both a continuous and categorical result are given in the source data such that both VALUE_AS_NUMBER and VALUE_AS_CONCEPT_ID are both included, store the verbatim value that was mapped to VALUE_AS_CONCEPT_ID here.",No,No,NA,NA,NA,NA,NA +measurement,measurement_event_id,No,integer,"If the Measurement record is related to another record in the database, this field is the primary key of the linked record.","Put the primary key of the linked record, if applicable, here.",No,No,NA,NA,NA,NA,NA +measurement,meas_event_field_concept_id,No,integer,"If the Measurement record is related to another record in the database, this field is the CONCEPT_ID that identifies which table the primary key of the linked record came from.",Put the CONCEPT_ID that identifies which table and field the MEASUREMENT_EVENT_ID came from.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +observation,observation_id,Yes,integer,The unique key given to an Observation record for a Person. Refer to the ETL for how duplicate Observations during the same Visit were handled.,Each instance of an observation present in the source data should be assigned this unique key.,Yes,No,NA,NA,NA,NA,NA +observation,person_id,Yes,integer,The PERSON_ID of the Person for whom the Observation is recorded. This may be a system generated code.,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +observation,observation_concept_id,Yes,integer,"The OBSERVATION_CONCEPT_ID field is recommended for primary use in analyses, and must be used for network studies.","The CONCEPT_ID that the OBSERVATION_SOURCE_CONCEPT_ID maps to. There is no specified domain that the Concepts in this table must adhere to. The only rule is that records with Concepts in the Condition, Procedure, Drug, Measurement, or Device domains MUST go to the corresponding table.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +observation,observation_date,Yes,date,"The date of the Observation. Depending on what the Observation represents this could be the date of a lab test, the date of a survey, or the date a patient's family history was taken.",For some observations the ETL may need to make a choice as to which date to choose.,No,No,NA,NA,NA,NA,NA +observation,observation_datetime,No,datetime,NA,If no time is given set to midnight (00:00:00).,No,No,NA,NA,NA,NA,NA +observation,observation_type_concept_id,Yes,integer,"This field can be used to determine the provenance of the Observation record, as in whether the measurement was from an EHR system, insurance claim, registry, or other sources.","Choose the OBSERVATION_TYPE_CONCEPT_ID that best represents the provenance of the record, for example whether it came from an EHR record or billing claim. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).",No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +observation,value_as_number,No,float,"This is the numerical value of the Result of the Observation, if applicable and available. 
It is not expected that all Observations will have numeric results, rather, this field is here to house values should they exist.",NA,No,No,NA,NA,NA,NA,NA +observation,value_as_string,No,varchar(60),"This is the categorical value of the Result of the Observation, if applicable and available.",NA,No,No,NA,NA,NA,NA,NA +observation,value_as_concept_id,No,Integer,"It is possible that some records destined for the Observation table have two clinical ideas represented in one source code. This is common with ICD10 codes that describe a family history of some Condition, for example. In OMOP the Vocabulary breaks these two clinical ideas into two codes; one becomes the OBSERVATION_CONCEPT_ID and the other becomes the VALUE_AS_CONCEPT_ID. It is important when using the Observation table to keep this possibility in mind and to examine the VALUE_AS_CONCEPT_ID field for relevant information.","Note that the value of VALUE_AS_CONCEPT_ID may be provided through mapping from a source Concept which contains the content of the Observation. In those situations, the CONCEPT_RELATIONSHIP table in addition to the 'Maps to' record contains a second record with the relationship_id set to 'Maps to value'. For example, ICD10 [Z82.4](https://athena.ohdsi.org/search-terms/terms/45581076) 'Family history of ischaemic heart disease and other diseases of the circulatory system' has a 'Maps to' relationship to [4167217](https://athena.ohdsi.org/search-terms/terms/4167217) 'Family history of clinical finding' as well as a 'Maps to value' record to [134057](https://athena.ohdsi.org/search-terms/terms/134057) 'Disorder of cardiovascular system'.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +observation,qualifier_concept_id,No,integer,"This field contains all attributes specifying the clinical fact further, such as as degrees, severities, drug-drug interaction alerts etc.","Use your best judgement as to what Concepts to use here and if they are necessary to accurately represent the clinical record. There is no restriction on the domain of these Concepts, they just need to be Standard.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +observation,unit_concept_id,No,integer,There is currently no recommended unit for individual observation concepts. UNIT_SOURCE_VALUES should be mapped to a Standard Concept in the Unit domain that best represents the unit as given in the source data.,"There is no standardization requirement for units associated with OBSERVATION_CONCEPT_IDs, however, it is the responsibility of the ETL to choose the most plausible unit.",No,Yes,CONCEPT,CONCEPT_ID,Unit,NA,NA +observation,provider_id,No,integer,"The provider associated with the observation record, e.g. the provider who ordered the test or the provider who recorded the result.",The ETL may need to make a choice as to which PROVIDER_ID to put here. Based on what is available this may or may not be different than the provider associated with the overall VISIT_OCCURRENCE record. For example the admitting vs attending physician on an EHR record.,No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +observation,visit_occurrence_id,No,integer,The visit during which the Observation occurred.,"Depending on the structure of the source data, this may have to be determined based on dates. If an OBSERVATION_DATE occurs within the start and end date of a Visit it is a valid ETL choice to choose the VISIT_OCCURRENCE_ID from the visit that subsumes it, even if not explicitly stated in the data. While not required, an attempt should be made to locate the VISIT_OCCURRENCE_ID of the observation record. 
If an observation is related to a visit explicitly in the source data, it is possible that the result date of the Observation falls outside of the bounds of the Visit dates.",No,Yes,VISIT_OCCURRENCE,VISIT_OCCURRENCE_ID,NA,NA,NA +observation,visit_detail_id,No,integer,"The VISIT_DETAIL record during which the Observation occurred. For example, if the Person was in the ICU at the time the VISIT_OCCURRENCE record would reflect the overall hospital stay and the VISIT_DETAIL record would reflect the ICU stay during the hospital visit.",Same rules apply as for the VISIT_OCCURRENCE_ID.,No,Yes,VISIT_DETAIL,VISIT_DETAIL_ID,NA,NA,NA +observation,observation_source_value,No,varchar(50),"This field houses the verbatim value from the source data representing the Observation that occurred. For example, this could be an ICD10 or Read code.",This code is mapped to a Standard Concept in the Standardized Vocabularies and the original code is stored here for reference.,No,No,NA,NA,NA,NA,NA +observation,observation_source_concept_id,No,integer,"This is the concept representing the OBSERVATION_SOURCE_VALUE and may not necessarily be standard. This field is discouraged from use in analysis because it is not required to contain Standard Concepts that are used across the OHDSI community, and should only be used when Standard Concepts do not adequately represent the source detail for the Observation necessary for a given analytic use case. Consider using OBSERVATION_CONCEPT_ID instead to enable standardized analytics that can be consistent across the network.",If the OBSERVATION_SOURCE_VALUE is coded in the source data using an OMOP supported vocabulary put the concept id representing the source value here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +observation,unit_source_value,No,varchar(50),This field houses the verbatim value from the source data representing the unit of the Observation that occurred.,This code is mapped to a Standard Condition Concept in the Standardized Vocabularies and the original code is stored here for reference.,No,No,NA,NA,NA,NA,NA +observation,qualifier_source_value,No,varchar(50),This field houses the verbatim value from the source data representing the qualifier of the Observation that occurred.,This code is mapped to a Standard Condition Concept in the Standardized Vocabularies and the original code is stored here for reference.,No,No,NA,NA,NA,NA,NA +observation,value_source_value,No,varchar(50),This field houses the verbatim result value of the Observation from the source data. Do not get confused with the Observation_source_value which captures source value of the observation mapped to observation_concept_id. This field is the observation result value from the source.,"If the observation_source_value was a question, for example, or an observation that requires a result then this field is the answer/ result from the source data. Store the verbatim value that represents the result of the observation_source_value.",No,No,NA,NA,NA,NA,NA +observation,observation_event_id,No,integer,"If the Observation record is related to another record in the database, this field is the primary key of the linked record.","Put the primary key of the linked record, if applicable, here. 
See the [ETL Conventions for the OBSERVATION](https://ohdsi.github.io/CommonDataModel/cdm60.html#observation) table for more details.",No,No,NA,NA,NA,NA,NA +observation,obs_event_field_concept_id,No,integer,"If the Observation record is related to another record in the database, this field is the CONCEPT_ID that identifies which table the primary key of the linked record came from.",Put the CONCEPT_ID that identifies which table and field the OBSERVATION_EVENT_ID came from.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +death,person_id,Yes,integer,NA,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +death,death_date,Yes,date,The date the person was deceased.,"If the precise date include day or month is not known or not allowed, December is used as the default month, and the last day of the month the default day.",No,No,NA,NA,NA,NA,NA +death,death_datetime,No,datetime,NA,If not available set time to midnight (00:00:00),No,No,NA,NA,NA,NA,NA +death,death_type_concept_id,No,integer,"This is the provenance of the death record, i.e., where it came from. It is possible that an administrative claims database would source death information from a government file so do not assume the Death Type is the same as the Visit Type, etc.",Use the type concept that be reflects the source of the death record. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).,No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +death,cause_concept_id,No,integer,"This is the Standard Concept representing the Person's cause of death, if available.","There is no specified domain for this concept, just choose the Standard Concept Id that best represents the person's cause of death.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +death,cause_source_value,No,varchar(50),NA,"If available, put the source code representing the cause of death here.",No,No,NA,NA,NA,NA,NA +death,cause_source_concept_id,No,integer,NA,If the cause of death was coded using a Vocabulary present in the OMOP Vocabularies put the CONCEPT_ID representing the cause of death here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +note,note_id,Yes,integer,A unique identifier for each note.,NA,Yes,No,NA,NA,NA,NA,NA +note,person_id,Yes,integer,NA,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +note,note_date,Yes,date,The date the note was recorded.,NA,No,No,NA,NA,NA,NA,NA +note,note_datetime,No,datetime,NA,If time is not given set the time to midnight.,No,No,NA,NA,NA,NA,NA +note,note_type_concept_id,Yes,integer,The provenance of the note. Most likely this will be EHR.,"Put the source system of the note, as in EHR record. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?standardConcept=Standard&domain=Type+Concept&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).",No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +note,note_class_concept_id,Yes,integer,"A Standard Concept Id representing the HL7 LOINC +Document Type Vocabulary classification of the note.",Map the note classification to a Standard Concept. For more information see the ETL Conventions in the description of the NOTE table. 
[Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?standardConcept=Standard&conceptClass=Doc+Kind&conceptClass=Doc+Role&conceptClass=Doc+Setting&conceptClass=Doc+Subject+Matter&conceptClass=Doc+Type+of+Service&domain=Meas+Value&page=1&pageSize=15&query=). This Concept can alternatively be represented by concepts with the relationship 'Kind of (LOINC)' to [706391](https://athena.ohdsi.org/search-terms/terms/706391) (Note).,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +note,note_title,No,varchar(250),The title of the note.,NA,No,No,NA,NA,NA,NA,NA +note,note_text,Yes,varchar(MAX),The content of the note.,NA,No,No,NA,NA,NA,NA,NA +note,encoding_concept_id,Yes,integer,This is the Concept representing the character encoding type.,"Put the Concept Id that represents the encoding character type here. Currently the only option is UTF-8 ([32678](https://athena.ohdsi.org/search-terms/terms/32678)). It the note is encoded in any other type, like ASCII then put 0.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +note,language_concept_id,Yes,integer,The language of the note.,Use Concepts that are descendants of the concept [4182347](https://athena.ohdsi.org/search-terms/terms/4182347) (World Languages).,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +note,provider_id,No,integer,The Provider who wrote the note.,The ETL may need to make a determination on which provider to put here.,No,Yes,PROVIDER,PROVIDER_ID,NA,NA,NA +note,visit_occurrence_id,No,integer,The Visit during which the note was written.,NA,No,Yes,VISIT_OCCURRENCE,VISIT_OCCURRENCE_ID,NA,NA,NA +note,visit_detail_id,No,integer,The Visit Detail during which the note was written.,NA,No,Yes,VISIT_DETAIL,VISIT_DETAIL_ID,NA,NA,NA +note,note_source_value,No,varchar(50),NA,The source value mapped to the NOTE_CLASS_CONCEPT_ID.,No,No,NA,NA,NA,NA,NA +note,note_event_id,No,integer,"If the Note record is related to another record in the database, this field is the primary key of the linked record.","Put the primary key of the linked record, if applicable, here.",No,No,NA,NA,NA,NA,NA +note,note_event_field_concept_id,No,integer,"If the Note record is related to another record in the database, this field is the CONCEPT_ID that identifies which table the primary key of the linked record came from.",Put the CONCEPT_ID that identifies which table and field the NOTE_EVENT_ID came from.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +note_nlp,note_nlp_id,Yes,integer,A unique identifier for the NLP record.,NA,Yes,No,NA,NA,NA,NA,NA +note_nlp,note_id,Yes,integer,This is the NOTE_ID for the NOTE record the NLP record is associated to.,NA,No,No,NA,NA,NA,NA,NA +note_nlp,section_concept_id,No,integer,NA,"The SECTION_CONCEPT_ID should be used to represent the note section contained in the NOTE_NLP record. These concepts can be found as parts of document panels and are based on the type of note written, i.e. a discharge summary. 
These panels can be found as concepts with the relationship 'Subsumes' to CONCEPT_ID [45875957](https://athena.ohdsi.org/search-terms/terms/45875957).",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +note_nlp,snippet,No,varchar(250),A small window of text surrounding the term,NA,No,No,NA,NA,NA,NA,NA +note_nlp,"""offset""",No,varchar(50),Character offset of the extracted term in the input note,NA,No,No,NA,NA,NA,NA,NA +note_nlp,lexical_variant,Yes,varchar(250),Raw text extracted from the NLP tool.,NA,No,No,NA,NA,NA,NA,NA +note_nlp,note_nlp_concept_id,No,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +note_nlp,note_nlp_source_concept_id,No,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +note_nlp,nlp_system,No,varchar(250),NA,Name and version of the NLP system that extracted the term. Useful for data provenance.,No,No,NA,NA,NA,NA,NA +note_nlp,nlp_date,Yes,date,The date of the note processing.,NA,No,No,NA,NA,NA,NA,NA +note_nlp,nlp_datetime,No,datetime,The date and time of the note processing.,NA,No,No,NA,NA,NA,NA,NA +note_nlp,term_exists,No,varchar(1),NA,"Term_exists is defined as a flag that indicates if the patient actually has or had the condition. Any of the following modifiers would make Term_exists false: +Negation = true +Subject = [anything other than the patient] +Conditional = true/li> +Rule_out = true +Uncertain = very low certainty or any lower certainties +A complete lack of modifiers would make Term_exists true. +",No,No,NA,NA,NA,NA,NA +note_nlp,term_temporal,No,varchar(50),NA,"Term_temporal is to indicate if a condition is present or just in the past. The following would be past:

+- History = true
+- Concept_date = anything before the time of the report",No,No,NA,NA,NA,NA,NA
+note_nlp,term_modifiers,No,varchar(2000),NA,"For the modifiers that are there, they would have to have these values:

+- Negation = false
+- Subject = patient
+- Conditional = false
+- Rule_out = false
+- Uncertain = true or high or moderate or even low (could argue about low). Term_modifiers will concatenate all modifiers for different types of entities (conditions, drugs, labs etc) into one string. Lab values will be saved as one of the modifiers.",No,No,NA,NA,NA,NA,NA
+specimen,specimen_id,Yes,integer,Unique identifier for each specimen.,NA,Yes,No,NA,NA,NA,NA,NA
+specimen,person_id,Yes,integer,The person from whom the specimen is collected.,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA
+specimen,specimen_concept_id,Yes,integer,NA,The standard CONCEPT_ID that the SPECIMEN_SOURCE_VALUE maps to in the specimen domain. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Specimen&standardConcept=Standard&page=1&pageSize=15&query=),No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA
+specimen,specimen_type_concept_id,Yes,integer,NA,"Put the source of the specimen record, as in an EHR system. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?standardConcept=Standard&domain=Type+Concept&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).",No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA
+specimen,specimen_date,Yes,date,The date the specimen was collected.,NA,No,No,NA,NA,NA,NA,NA
+specimen,specimen_datetime,No,datetime,NA,NA,No,No,NA,NA,NA,NA,NA
+specimen,quantity,No,float,The amount of specimen collected from the person.,NA,No,No,NA,NA,NA,NA,NA
+specimen,unit_concept_id,No,integer,The unit for the quantity of the specimen.,Map the UNIT_SOURCE_VALUE to a Standard Concept in the Unit domain. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Unit&standardConcept=Standard&page=1&pageSize=15&query=),No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA
+specimen,anatomic_site_concept_id,No,integer,This is the site on the body where the specimen is from.,Map the ANATOMIC_SITE_SOURCE_VALUE to a Standard Concept in the Spec Anatomic Site domain.
This should be coded at the lowest level of granularity [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?standardConcept=Standard&domain=Spec+Anatomic+Site&conceptClass=Body+Structure&page=4&pageSize=15&query=),No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +specimen,disease_status_concept_id,No,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +specimen,specimen_source_id,No,varchar(50),This is the identifier for the specimen from the source system.,NA,No,No,NA,NA,NA,NA,NA +specimen,specimen_source_value,No,varchar(50),NA,NA,No,No,NA,NA,NA,NA,NA +specimen,unit_source_value,No,varchar(50),NA,"This unit for the quantity of the specimen, as represented in the source.",No,No,NA,NA,NA,NA,NA +specimen,anatomic_site_source_value,No,varchar(50),NA,"This is the site on the body where the specimen was taken from, as represented in the source.",No,No,NA,NA,NA,NA,NA +specimen,disease_status_source_value,No,varchar(50),NA,NA,No,No,NA,NA,NA,NA,NA +fact_relationship,domain_concept_id_1,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +fact_relationship,fact_id_1,Yes,integer,NA,NA,No,No,NA,NA,NA,NA,NA +fact_relationship,domain_concept_id_2,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +fact_relationship,fact_id_2,Yes,integer,NA,NA,No,No,NA,NA,NA,NA,NA +fact_relationship,relationship_concept_id,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +location,location_id,Yes,integer,The unique key given to a unique Location.,Each instance of a Location in the source data should be assigned this unique key.,Yes,No,NA,NA,NA,NA,NA +location,address_1,No,varchar(50),This is the first line of the address.,NA,No,No,NA,NA,NA,NA,NA +location,address_2,No,varchar(50),This is the second line of the address,NA,No,No,NA,NA,NA,NA,NA +location,city,No,varchar(50),NA,NA,No,No,NA,NA,NA,NA,NA +location,state,No,varchar(2),NA,NA,No,No,NA,NA,NA,NA,NA +location,zip,No,varchar(9),NA,"Zip codes are handled as strings of up to 9 characters length. For US addresses, these represent either a 3-digit abbreviated Zip code as provided by many sources for patient protection reasons, the full 5-digit Zip or the 9-digit (ZIP + 4) codes. Unless for specific reasons analytical methods should expect and utilize only the first 3 digits. For international addresses, different rules apply.",No,No,NA,NA,NA,NA,NA +location,county,No,varchar(20),NA,NA,No,No,NA,NA,NA,NA,NA +location,location_source_value,No,varchar(50),NA,"Put the verbatim value for the location here, as it shows up in the source.",No,No,NA,NA,NA,NA,NA +location,country_concept_id,No,integer,The Concept Id representing the country. Values should conform to the [Geography](https://athena.ohdsi.org/search-terms/terms?domain=Geography&standardConcept=Standard&page=1&pageSize=15&query=&boosts) domain.,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +location,country_source_value,No,varchar(80),The name of the country.,NA,No,No,NA,NA,NA,NA,NA +location,latitude,No,float,NA,Must be between -90 and 90.,No,No,NA,NA,NA,NA,NA +location,longitude,No,float,NA,Must be between -180 and 180.,No,No,NA,NA,NA,NA,NA +care_site,care_site_id,Yes,integer,NA,"Assign an ID to each combination of a location and nature of the site - the latter could be the Place of Service, name or another characteristic in your source data.",Yes,No,NA,NA,NA,NA,NA +care_site,care_site_name,No,varchar(255),The name of the care_site as it appears in the source data,NA,No,No,NA,NA,NA,NA,NA +care_site,place_of_service_concept_id,No,integer,"This is a high-level way of characterizing a Care Site. 
Typically, however, Care Sites can provide care in multiple settings (inpatient, outpatient, etc.) and this granularity should be reflected in the visit.","Choose the concept in the visit domain that best represents the setting in which healthcare is provided in the Care Site. If most visits in a Care Site are Inpatient, then the place_of_service_concept_id should represent Inpatient. If information is present about a unique Care Site (e.g. Pharmacy) then a Care Site record should be created. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Visit&standardConcept=Standard&page=2&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +care_site,location_id,No,integer,The location_id from the LOCATION table representing the physical location of the care_site.,NA,No,Yes,LOCATION,LOCATION_ID,NA,NA,NA +care_site,care_site_source_value,No,varchar(50),The identifier of the care_site as it appears in the source data. This could be an identifier separate from the name of the care_site.,NA,No,No,NA,NA,NA,NA,NA +care_site,place_of_service_source_value,No,varchar(50),NA,Put the place of service of the care_site as it appears in the source data.,No,No,NA,NA,NA,NA,NA +provider,provider_id,Yes,integer,It is assumed that every provider with a different unique identifier is in fact a different person and should be treated independently.,"This identifier can be the original id from the source data provided it is an integer, otherwise it can be an autogenerated number.",Yes,No,NA,NA,NA,NA,NA +provider,provider_name,No,varchar(255),NA,"This field is not necessary as it is not necessary to have the actual identity of the Provider. Rather, the idea is to uniquely and anonymously identify providers of care across the database.",No,No,NA,NA,NA,NA,NA +provider,npi,No,varchar(20),This is the National Provider Number issued to health care providers in the US by the Centers for Medicare and Medicaid Services (CMS).,NA,No,No,NA,NA,NA,NA,NA +provider,dea,No,varchar(20),"This is the identifier issued by the DEA, a US federal agency, that allows a provider to write prescriptions for controlled substances.",NA,No,No,NA,NA,NA,NA,NA +provider,specialty_concept_id,No,integer,"This field either represents the most common specialty that occurs in the data or the most specific concept that represents all specialties listed, should the provider have more than one. This includes physician specialties such as internal medicine, emergency medicine, etc. and allied health professionals such as nurses, midwives, and pharmacists.","If a Provider has more than one Specialty, there are two options: 1. Choose a concept_id which is a common ancestor to the multiple specialties, or, 2. Choose the specialty that occurs most often for the provider. Concepts in this field should be Standard with a domain of Provider. [Accepted Concepts](http://athena.ohdsi.org/search-terms/terms?domain=Provider&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +provider,care_site_id,No,integer,This is the CARE_SITE_ID for the location that the provider primarily practices in.,"If a Provider has more than one Care Site, the main or most often exerted CARE_SITE_ID should be recorded.",No,Yes,CARE_SITE,CARE_SITE_ID,NA,NA,NA +provider,year_of_birth,No,integer,NA,NA,No,No,NA,NA,NA,NA,NA +provider,gender_concept_id,No,integer,This field represents the recorded gender of the provider in the source data.,"If given, put a concept from the gender domain representing the recorded gender of the provider. 
[Accepted Concepts](http://athena.ohdsi.org/search-terms/terms?domain=Gender&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,Gender,NA,NA +provider,provider_source_value,No,varchar(50),Use this field to link back to providers in the source data. This is typically used for error checking of ETL logic.,Some use cases require the ability to link back to providers in the source data. This field allows for the storing of the provider identifier as it appears in the source.,No,No,NA,NA,NA,NA,NA +provider,specialty_source_value,No,varchar(50),"This is the kind of provider or specialty as it appears in the source data. This includes physician specialties such as internal medicine, emergency medicine, etc. and allied health professionals such as nurses, midwives, and pharmacists.",Put the kind of provider as it appears in the source data. This field is up to the discretion of the ETL-er as to whether this should be the coded value from the source or the text description of the lookup value.,No,No,NA,NA,NA,NA,NA +provider,specialty_source_concept_id,No,integer,This is often zero as many sites use proprietary codes to store physician speciality.,If the source data codes provider specialty in an OMOP supported vocabulary store the concept_id here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +provider,gender_source_value,No,varchar(50),This is provider's gender as it appears in the source data.,Put the provider's gender as it appears in the source data. This field is up to the discretion of the ETL-er as to whether this should be the coded value from the source or the text description of the lookup value.,No,No,NA,NA,NA,NA,NA +provider,gender_source_concept_id,No,integer,This is often zero as many sites use proprietary codes to store provider gender.,If the source data codes provider gender in an OMOP supported vocabulary store the concept_id here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +payer_plan_period,payer_plan_period_id,Yes,integer,"A unique identifier for each unique combination of a Person, Payer, Plan, and Period of time.",NA,Yes,No,NA,NA,NA,NA,NA +payer_plan_period,person_id,Yes,integer,The Person covered by the Plan.,"A single Person can have multiple, overlapping, PAYER_PLAN_PERIOD records",No,Yes,PERSON,PERSON_ID,NA,NA,NA +payer_plan_period,payer_plan_period_start_date,Yes,date,Start date of Plan coverage.,NA,No,No,NA,NA,NA,NA,NA +payer_plan_period,payer_plan_period_end_date,Yes,date,End date of Plan coverage.,NA,No,No,NA,NA,NA,NA,NA +payer_plan_period,payer_concept_id,No,integer,This field represents the organization who reimburses the provider which administers care to the Person.,"Map the Payer directly to a standard CONCEPT_ID. If one does not exists please contact the vocabulary team. There is no global controlled vocabulary available for this information. The point is to stratify on this information and identify if Persons have the same payer, though the name of the Payer is not necessary. 
[Accepted Concepts](http://athena.ohdsi.org/search-terms/terms?domain=Payer&standardConcept=Standard&page=1&pageSize=15&query=).",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +payer_plan_period,payer_source_value,No,varchar(50),This is the Payer as it appears in the source data.,NA,No,No,NA,NA,NA,NA,NA +payer_plan_period,payer_source_concept_id,No,integer,NA,If the source data codes the Payer in an OMOP supported vocabulary store the concept_id here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +payer_plan_period,plan_concept_id,No,integer,This field represents the specific health benefit Plan the Person is enrolled in.,Map the Plan directly to a standard CONCEPT_ID. If one does not exists please contact the vocabulary team. There is no global controlled vocabulary available for this information. The point is to stratify on this information and identify if Persons have the same health benefit Plan though the name of the Plan is not necessary. [Accepted Concepts](http://athena.ohdsi.org/search-terms/terms?domain=Plan&standardConcept=Standard&page=1&pageSize=15&query=).,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +payer_plan_period,plan_source_value,No,varchar(50),This is the health benefit Plan of the Person as it appears in the source data.,NA,No,No,NA,NA,NA,NA,NA +payer_plan_period,plan_source_concept_id,No,integer,NA,If the source data codes the Plan in an OMOP supported vocabulary store the concept_id here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +payer_plan_period,sponsor_concept_id,No,integer,"This field represents the sponsor of the Plan who finances the Plan. This includes self-insured, small group health plan and large group health plan.",Map the sponsor directly to a standard CONCEPT_ID. If one does not exists please contact the vocabulary team. There is no global controlled vocabulary available for this information. The point is to stratify on this information and identify if Persons have the same sponsor though the name of the sponsor is not necessary. [Accepted Concepts](http://athena.ohdsi.org/search-terms/terms?domain=Sponsor&standardConcept=Standard&page=1&pageSize=15&query=).,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +payer_plan_period,sponsor_source_value,No,varchar(50),The Plan sponsor as it appears in the source data.,NA,No,No,NA,NA,NA,NA,NA +payer_plan_period,sponsor_source_concept_id,No,integer,NA,If the source data codes the sponsor in an OMOP supported vocabulary store the concept_id here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +payer_plan_period,family_source_value,No,varchar(50),The common identifier for all people (often a family) that covered by the same policy.,Often these are the common digits of the enrollment id of the policy members.,No,No,NA,NA,NA,NA,NA +payer_plan_period,stop_reason_concept_id,No,integer,"This field represents the reason the Person left the Plan, if known.",Map the stop reason directly to a standard CONCEPT_ID. If one does not exists please contact the vocabulary team. There is no global controlled vocabulary available for this information. 
[Accepted Concepts](http://athena.ohdsi.org/search-terms/terms?domain=Plan+Stop+Reason&standardConcept=Standard&page=1&pageSize=15&query=).,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +payer_plan_period,stop_reason_source_value,No,varchar(50),The Plan stop reason as it appears in the source data.,NA,No,No,NA,NA,NA,NA,NA +payer_plan_period,stop_reason_source_concept_id,No,integer,NA,If the source data codes the stop reason in an OMOP supported vocabulary store the concept_id here.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +cost,cost_id,Yes,integer,NA,NA,Yes,No,NA,NA,NA,NA,NA +cost,cost_event_id,Yes,integer,NA,NA,No,No,NA,NA,NA,NA,NA +cost,cost_domain_id,Yes,varchar(20),NA,NA,No,Yes,DOMAIN,DOMAIN_ID,NA,NA,NA +cost,cost_type_concept_id,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +cost,currency_concept_id,No,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +cost,total_charge,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,total_cost,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,total_paid,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,paid_by_payer,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,paid_by_patient,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,paid_patient_copay,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,paid_patient_coinsurance,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,paid_patient_deductible,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,paid_by_primary,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,paid_ingredient_cost,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,paid_dispensing_fee,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,payer_plan_period_id,No,integer,NA,NA,No,No,NA,NA,NA,NA,NA +cost,amount_allowed,No,float,NA,NA,No,No,NA,NA,NA,NA,NA +cost,revenue_code_concept_id,No,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +cost,revenue_code_source_value,No,varchar(50),Revenue codes are a method to charge for a class of procedures and conditions in the U.S. hospital system.,NA,No,No,NA,NA,NA,NA,NA +cost,drg_concept_id,No,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +cost,drg_source_value,No,varchar(3),Diagnosis Related Groups are US codes used to classify hospital cases into one of approximately 500 groups.,NA,No,No,NA,NA,NA,NA,NA +drug_era,drug_era_id,Yes,integer,NA,NA,Yes,No,NA,NA,NA,NA,NA +drug_era,person_id,Yes,integer,NA,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +drug_era,drug_concept_id,Yes,integer,The Concept Id representing the specific drug ingredient.,NA,No,Yes,CONCEPT,CONCEPT_ID,Drug,Ingredient,NA +drug_era,drug_era_start_date,Yes,date,NA,"The Drug Era Start Date is the start date of the first Drug Exposure for a given ingredient, with at least 31 days since the previous exposure.",No,No,NA,NA,NA,NA,NA +drug_era,drug_era_end_date,Yes,date,NA,"The Drug Era End Date is the end date of the last Drug Exposure. The End Date of each Drug Exposure is either taken from the field drug_exposure_end_date or, as it is typically not available, inferred using the following rules: +For pharmacy prescription data, the date when the drug was dispensed plus the number of days of supply are used to extrapolate the End Date for the Drug Exposure. Depending on the country-specific healthcare system, this supply information is either explicitly provided in the day_supply field or inferred from package size or similar information. +For Procedure Drugs, usually the drug is administered on a single date (i.e., the administration date). 
+A standard Persistence Window of 30 days (gap, slack) is permitted between two subsequent such extrapolated DRUG_EXPOSURE records to be considered to be merged into a single Drug Era.",No,No,NA,NA,NA,NA,NA +drug_era,drug_exposure_count,No,integer,NA,NA,No,No,NA,NA,NA,NA,NA +drug_era,gap_days,No,integer,NA,"The Gap Days determine how many total drug-free days are observed between all Drug Exposure events that contribute to a DRUG_ERA record. It is assumed that the drugs are ""not stockpiled"" by the patient, i.e. that if a new drug prescription or refill is observed (a new DRUG_EXPOSURE record is written), the remaining supply from the previous events is abandoned. The difference between Persistence Window and Gap Days is that the former is the maximum drug-free time allowed between two subsequent DRUG_EXPOSURE records, while the latter is the sum of actual drug-free days for the given Drug Era under the above assumption of non-stockpiling.",No,No,NA,NA,NA,NA,NA +dose_era,dose_era_id,Yes,integer,NA,NA,Yes,No,NA,NA,NA,NA,NA +dose_era,person_id,Yes,integer,NA,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +dose_era,drug_concept_id,Yes,integer,The Concept Id representing the specific drug ingredient.,NA,No,Yes,CONCEPT,CONCEPT_ID,Drug,Ingredient,NA +dose_era,unit_concept_id,Yes,integer,The Concept Id representing the unit of the specific drug ingredient.,NA,No,Yes,CONCEPT,CONCEPT_ID,Unit,NA,NA +dose_era,dose_value,Yes,float,The numeric value of the dosage of the drug_ingredient.,NA,No,No,NA,NA,NA,NA,NA +dose_era,dose_era_start_date,Yes,date,"The date the Person started on the specific dosage, with at least 31 days since any prior exposure.",NA,No,No,NA,NA,NA,NA,NA +dose_era,dose_era_end_date,Yes,date,NA,The date the Person was no longer exposed to the dosage of the specific drug ingredient. An era is ended if there are 31 days or more between dosage records.,No,No,NA,NA,NA,NA,NA +condition_era,condition_era_id,Yes,integer,NA,NA,Yes,No,NA,NA,NA,NA,NA +condition_era,person_id,Yes,integer,NA,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +condition_era,condition_concept_id,Yes,integer,The Concept Id representing the Condition.,NA,No,Yes,CONCEPT,CONCEPT_ID,Condition,NA,NA +condition_era,condition_era_start_date,Yes,date,"The start date for the Condition Era +constructed from the individual +instances of Condition Occurrences. +It is the start date of the very first +chronologically recorded instance of +the condition with at least 31 days since any prior record of the same Condition.",NA,No,No,NA,NA,NA,NA,NA +condition_era,condition_era_end_date,Yes,date,"The end date for the Condition Era +constructed from the individual +instances of Condition Occurrences. +It is the end date of the final +continuously recorded instance of the +Condition.",NA,No,No,NA,NA,NA,NA,NA +condition_era,condition_occurrence_count,No,integer,"The number of individual Condition +Occurrences used to construct the +condition era.",NA,No,No,NA,NA,NA,NA,NA +episode,episode_id,Yes,integer,A unique identifier for each Episode.,NA,Yes,No,NA,NA,NA,NA,NA +episode,person_id,Yes,integer,The PERSON_ID of the PERSON for whom the episode is recorded.,NA,No,Yes,PERSON,PERSON_ID,NA,NA,NA +episode,episode_concept_id,Yes,integer,"The EPISODE_CONCEPT_ID represents the kind abstraction related to the disease phase, outcome or treatment.","Choose a concept in the Episode domain that best represents the ongoing disease phase, outcome, or treatment. Please see [article] for cancers and [article] for non-cancers describing how these are defined. 
[Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Episode&page=1&pageSize=15&query=)",No,Yes,CONCEPT,CONCEPT_ID,Episode,NA,NA +episode,episode_start_date,Yes,date,The date when the Episode beings.,Please see [article] for how to define an Episode start date.,No,No,NA,NA,NA,NA,NA +episode,episode_start_datetime,No,datetime,The date and time when the Episode begins.,NA,No,No,NA,NA,NA,NA,NA +episode,episode_end_date,No,date,The date when the instance of the Episode is considered to have ended.,Please see [article] for how to define an Episode end date.,No,No,NA,NA,NA,NA,NA +episode,episode_end_datetime,No,datetime,The date when the instance of the Episode is considered to have ended.,NA,No,No,NA,NA,NA,NA,NA +episode,episode_parent_id,No,integer,Use this field to find the Episode that subsumes the given Episode record. This is used in the case that an Episode are nested into each other.,"If there are multiple nested levels to how Episodes are represented, the EPISODE_PARENT_ID can be used to record this relationship.",No,No,NA,NA,NA,NA,NA +episode,episode_number,No,integer,"For sequences of episodes, this is used to indicate the order the episodes occurred. For example, lines of treatment could be indicated here.",Please see [article] for the details of how to count episodes.,No,No,NA,NA,NA,NA,NA +episode,episode_object_concept_id,Yes,integer,"A Standard Concept representing the disease phase, outcome, or other abstraction of which the episode consists. For example, if the EPISODE_CONCEPT_ID is [treatment regimen](https://athena.ohdsi.org/search-terms/terms/32531) then the EPISODE_OBJECT_CONCEPT_ID should contain the chemotherapy regimen concept, like [Afatinib monotherapy](https://athena.ohdsi.org/search-terms/terms/35804392).",Episode entries from the 'Disease Episode' concept class should have an episode_object_concept_id that comes from the Condition domain. Episode entries from the 'Treatment Episode' concept class should have an episode_object_concept_id that scome from the 'Procedure' domain or 'Regimen' concept class.,No,Yes,CONCEPT,CONCEPT_ID,"Procedure, Regimen",NA,NA +episode,episode_type_concept_id,Yes,integer,"This field can be used to determine the provenance of the Episode record, as in whether the episode was from an EHR system, insurance claim, registry, or other sources.",Choose the EPISODE_TYPE_CONCEPT_ID that best represents the provenance of the record. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?domain=Type+Concept&standardConcept=Standard&page=1&pageSize=15&query=). A more detailed explanation of each Type Concept can be found on the [vocabulary wiki](https://github.com/OHDSI/Vocabulary-v5.0/wiki/Vocab.-TYPE_CONCEPT).,No,Yes,CONCEPT,CONCEPT_ID,Type Concept,NA,NA +episode,episode_source_value,No,varchar(50),The source code for the Episdoe as it appears in the source data. This code is mapped to a Standard Condition Concept in the Standardized Vocabularies and the original code is stored here for reference.,NA,No,No,NA,NA,NA,NA,NA +episode,episode_source_concept_id,No,integer,A foreign key to a Episode Concept that refers to the code used in the source.,Given that the Episodes are user-defined it is unlikely that there will be a Source Concept available. 
If that is the case then set this field to zero.,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +episode_event,episode_id,Yes,integer,Use this field to link the EPISODE_EVENT record to its EPISODE.,Put the EPISODE_ID that subsumes the EPISODE_EVENT record here.,No,Yes,EPISODE,EPISODE_ID,NA,NA,NA +episode_event,event_id,Yes,integer,"This field is the primary key of the linked record in the database. For example, if the Episode Event is a Condition Occurrence, then the CONDITION_OCCURRENCE_ID of the linked record goes in this field.",Put the primary key of the linked record here.,No,No,NA,NA,NA,NA,NA +episode_event,episode_event_field_concept_id,Yes,integer,This field is the CONCEPT_ID that identifies which table the primary key of the linked record came from.,Put the CONCEPT_ID that identifies which table and field the EVENT_ID came from. [Accepted Concepts](https://athena.ohdsi.org/search-terms/terms?vocabulary=CDM&conceptClass=Field&page=1&pageSize=15&query=),No,Yes,CONCEPT,CONCEPT_ID,Metadata,NA,NA +metadata,metadata_id,Yes,integer,The unique key given to a Metadata record.,Attribute value is auto-generated,Yes,No,NA,NA,NA,NA,NA +metadata,metadata_concept_id,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +metadata,metadata_type_concept_id,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +metadata,name,Yes,varchar(250),NA,NA,No,No,NA,NA,NA,NA,NA +metadata,value_as_string,No,varchar(250),NA,NA,No,No,NA,NA,NA,NA,NA +metadata,value_as_concept_id,No,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +metadata,value_as_number,No,float,"This is the numerical value of the result of the Metadata, if applicable and available. It is not expected that all Metadata will have numeric results, rather, this field is here to house values should they exist.",NA,No,No,NA,NA,NA,NA,NA +metadata,metadata_date,No,date,NA,NA,No,No,NA,NA,NA,NA,NA +metadata,metadata_datetime,No,datetime,NA,NA,No,No,NA,NA,NA,NA,NA +cdm_source,cdm_source_name,Yes,varchar(255),The name of the CDM instance.,NA,No,No,NA,NA,NA,NA,NA +cdm_source,cdm_source_abbreviation,Yes,varchar(25),The abbreviation of the CDM instance.,NA,No,No,NA,NA,NA,NA,NA +cdm_source,cdm_holder,Yes,varchar(255),The holder of the CDM instance.,NA,No,No,NA,NA,NA,NA,NA +cdm_source,source_description,No,varchar(MAX),The description of the CDM instance.,NA,No,No,NA,NA,NA,NA,NA +cdm_source,source_documentation_reference,No,varchar(255),NA,NA,No,No,NA,NA,NA,NA,NA +cdm_source,cdm_etl_reference,No,varchar(255),NA,Put the link to the CDM version used.,No,No,NA,NA,NA,NA,NA +cdm_source,source_release_date,Yes,date,The release date of the source data.,NA,No,No,NA,NA,NA,NA,NA +cdm_source,cdm_release_date,Yes,date,The release data of the CDM instance.,NA,No,No,NA,NA,NA,NA,NA +cdm_source,cdm_version,No,varchar(10),NA,NA,No,No,NA,NA,NA,NA,NA +cdm_source,cdm_version_concept_id,Yes,integer,The Concept Id representing the version of the CDM.,You can find all concepts that represent the CDM versions using the query: SELECT * FROM CONCEPT WHERE VOCABULARY_ID = 'CDM' AND CONCEPT_CLASS = 'CDM',No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +cdm_source,vocabulary_version,Yes,varchar(20),NA,You can find the version of your Vocabulary using the query: SELECT vocabulary_version from vocabulary where vocabulary_id = 'None',No,No,NA,NA,NA,NA,NA +concept,concept_id,Yes,integer,A unique identifier for each Concept across all domains.,NA,Yes,No,NA,NA,NA,NA,NA +concept,concept_name,Yes,varchar(255),"An unambiguous, meaningful and descriptive name for the Concept.",NA,No,No,NA,NA,NA,NA,NA 
+concept,domain_id,Yes,varchar(20),A foreign key to the [DOMAIN](https://ohdsi.github.io/CommonDataModel/cdm531.html#domain) table the Concept belongs to.,NA,No,Yes,DOMAIN,DOMAIN_ID,NA,NA,NA +concept,vocabulary_id,Yes,varchar(20),"A foreign key to the [VOCABULARY](https://ohdsi.github.io/CommonDataModel/cdm531.html#vocabulary) +table indicating from which source the +Concept has been adapted.",NA,No,Yes,VOCABULARY,VOCABULARY_ID,NA,NA,NA +concept,concept_class_id,Yes,varchar(20),"The attribute or concept class of the +Concept. Examples are 'Clinical Drug', +'Ingredient', 'Clinical Finding' etc.",NA,No,Yes,CONCEPT_CLASS,CONCEPT_CLASS_ID,NA,NA,NA +concept,standard_concept,No,varchar(1),"This flag determines where a Concept is +a Standard Concept, i.e. is used in the +data, a Classification Concept, or a +non-standard Source Concept. The +allowable values are 'S' (Standard +Concept) and 'C' (Classification +Concept), otherwise the content is NULL.",NA,No,No,NA,NA,NA,NA,NA +concept,concept_code,Yes,varchar(50),"The concept code represents the identifier +of the Concept in the source vocabulary, +such as SNOMED-CT concept IDs, +RxNorm RXCUIs etc. Note that concept +codes are not unique across vocabularies.",NA,No,No,NA,NA,NA,NA,NA +concept,valid_start_date,Yes,date,"The date when the Concept was first +recorded. The default value is +1-Jan-1970, meaning, the Concept has no +(known) date of inception.",NA,No,No,NA,NA,NA,NA,NA +concept,valid_end_date,Yes,date,"The date when the Concept became +invalid because it was deleted or +superseded (updated) by a new concept. +The default value is 31-Dec-2099, +meaning, the Concept is valid until it +becomes deprecated.",NA,No,No,NA,NA,NA,NA,NA +concept,invalid_reason,No,varchar(1),"Reason the Concept was invalidated. +Possible values are D (deleted), U +(replaced with an update) or NULL when +valid_end_date has the default value.",NA,No,No,NA,NA,NA,NA,NA +vocabulary,vocabulary_id,Yes,varchar(20),"A unique identifier for each Vocabulary, such +as ICD9CM, SNOMED, Visit.",NA,Yes,No,NA,NA,NA,NA,NA +vocabulary,vocabulary_name,Yes,varchar(255),"The name describing the vocabulary, for +example, International Classification of +Diseases, Ninth Revision, Clinical +Modification, Volume 1 and 2 (NCHS) etc.",NA,No,No,NA,NA,NA,NA,NA +vocabulary,vocabulary_reference,No,varchar(255),"External reference to documentation or +available download of the about the +vocabulary.",NA,No,No,NA,NA,NA,NA,NA +vocabulary,vocabulary_version,No,varchar(255),"Version of the Vocabulary as indicated in +the source.",NA,No,No,NA,NA,NA,NA,NA +vocabulary,vocabulary_concept_id,Yes,integer,A Concept that represents the Vocabulary the VOCABULARY record belongs to.,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +domain,domain_id,Yes,varchar(20),A unique key for each domain.,NA,Yes,No,NA,NA,NA,NA,NA +domain,domain_name,Yes,varchar(255),"The name describing the Domain, e.g. +Condition, Procedure, Measurement +etc.",NA,No,No,NA,NA,NA,NA,NA +domain,domain_concept_id,Yes,integer,A Concept representing the Domain Concept the DOMAIN record belongs to.,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +concept_class,concept_class_id,Yes,varchar(20),A unique key for each class.,NA,Yes,No,NA,NA,NA,NA,NA +concept_class,concept_class_name,Yes,varchar(255),"The name describing the Concept Class, e.g. 
+Clinical Finding, Ingredient, etc.",NA,No,No,NA,NA,NA,NA,NA +concept_class,concept_class_concept_id,Yes,integer,A Concept that represents the Concept Class.,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +concept_relationship,concept_id_1,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +concept_relationship,concept_id_2,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +concept_relationship,relationship_id,Yes,varchar(20),The relationship between CONCEPT_ID_1 and CONCEPT_ID_2. Please see the [Vocabulary Conventions](https://ohdsi.github.io/CommonDataModel/dataModelConventions.html#concept_relationships). for more information.,NA,No,Yes,RELATIONSHIP,RELATIONSHIP_ID,NA,NA,NA +concept_relationship,valid_start_date,Yes,date,The date when the relationship is first recorded.,NA,No,No,NA,NA,NA,NA,NA +concept_relationship,valid_end_date,Yes,date,The date when the relationship is invalidated.,NA,No,No,NA,NA,NA,NA,NA +concept_relationship,invalid_reason,No,varchar(1),"Reason the relationship was invalidated. Possible values are 'D' (deleted), 'U' (updated) or NULL.",NA,No,No,NA,NA,NA,NA,NA +relationship,relationship_id,Yes,varchar(20),"The type of relationship captured by the +relationship record.",NA,Yes,No,NA,NA,NA,NA,NA +relationship,relationship_name,Yes,varchar(255),NA,NA,No,No,NA,NA,NA,NA,NA +relationship,is_hierarchical,Yes,varchar(1),"Defines whether a relationship defines +concepts into classes or hierarchies. Values +are 1 for hierarchical relationship or 0 if not.",NA,No,No,NA,NA,NA,NA,NA +relationship,defines_ancestry,Yes,varchar(1),"Defines whether a hierarchical relationship +contributes to the concept_ancestor table. +These are subsets of the hierarchical +relationships. Valid values are 1 or 0.",NA,No,No,NA,NA,NA,NA,NA +relationship,reverse_relationship_id,Yes,varchar(20),"The identifier for the relationship used to +define the reverse relationship between two +concepts.",NA,No,No,NA,NA,NA,NA,NA +relationship,relationship_concept_id,Yes,integer,"A foreign key that refers to an identifier in +the [CONCEPT](https://ohdsi.github.io/CommonDataModel/cdm531.html#concept) table for the unique +relationship concept.",NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +concept_synonym,concept_id,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +concept_synonym,concept_synonym_name,Yes,varchar(1000),NA,NA,No,No,NA,NA,NA,NA,NA +concept_synonym,language_concept_id,Yes,integer,NA,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +concept_ancestor,ancestor_concept_id,Yes,integer,"The Concept Id for the higher-level concept +that forms the ancestor in the relationship.",NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +concept_ancestor,descendant_concept_id,Yes,integer,"The Concept Id for the lower-level concept +that forms the descendant in the +relationship.",NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +concept_ancestor,min_levels_of_separation,Yes,integer,"The minimum separation in number of +levels of hierarchy between ancestor and +descendant concepts. This is an attribute +that is used to simplify hierarchic analysis.",NA,No,No,NA,NA,NA,NA,NA +concept_ancestor,max_levels_of_separation,Yes,integer,"The maximum separation in number of +levels of hierarchy between ancestor and +descendant concepts. 
This is an attribute +that is used to simplify hierarchic analysis.",NA,No,No,NA,NA,NA,NA,NA +source_to_concept_map,source_code,Yes,varchar(50),"The source code being translated +into a Standard Concept.",NA,No,No,NA,NA,NA,NA,NA +source_to_concept_map,source_concept_id,Yes,integer,"A foreign key to the Source +Concept that is being translated +into a Standard Concept.","This is either 0 or should be a number above 2 billion, which are the Concepts reserved for site-specific codes and mappings.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +source_to_concept_map,source_vocabulary_id,Yes,varchar(20),"A foreign key to the +VOCABULARY table defining the +vocabulary of the source code that +is being translated to a Standard +Concept.",NA,No,No,NA,NA,NA,NA,NA +source_to_concept_map,source_code_description,No,varchar(255),"An optional description for the +source code. This is included as a +convenience to compare the +description of the source code to +the name of the concept.",NA,No,No,NA,NA,NA,NA,NA +source_to_concept_map,target_concept_id,Yes,integer,"The target Concept +to which the source code is being +mapped.",NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +source_to_concept_map,target_vocabulary_id,Yes,varchar(20),The Vocabulary of the target Concept.,NA,No,Yes,VOCABULARY,VOCABULARY_ID,NA,NA,NA +source_to_concept_map,valid_start_date,Yes,date,"The date when the mapping +instance was first recorded.",NA,No,No,NA,NA,NA,NA,NA +source_to_concept_map,valid_end_date,Yes,date,"The date when the mapping +instance became invalid because it +was deleted or superseded +(updated) by a new relationship. +Default value is 31-Dec-2099.",NA,No,No,NA,NA,NA,NA,NA +source_to_concept_map,invalid_reason,No,varchar(1),"Reason the mapping instance was invalidated. Possible values are D (deleted), U (replaced with an update) or NULL when valid_end_date has the default value.",NA,No,No,NA,NA,NA,NA,NA +drug_strength,drug_concept_id,Yes,integer,The Concept representing the Branded Drug or Clinical Drug Product.,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +drug_strength,ingredient_concept_id,Yes,integer,The Concept representing the active ingredient contained within the drug product.,"Combination Drugs will have more than one record in this table, one for each active Ingredient.",No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +drug_strength,amount_value,No,float,The numeric value or the amount of active ingredient contained within the drug product.,NA,No,No,NA,NA,NA,NA,NA +drug_strength,amount_unit_concept_id,No,integer,The Concept representing the Unit of measure for the amount of active ingredient contained within the drug product.,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +drug_strength,numerator_value,No,float,The concentration of the active ingredient contained within the drug product.,NA,No,No,NA,NA,NA,NA,NA +drug_strength,numerator_unit_concept_id,No,integer,The Concept representing the Unit of measure for the concentration of active ingredient.,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +drug_strength,denominator_value,No,float,"The amount of total liquid (or other divisible product, such as ointment, gel, spray, etc.).",NA,No,No,NA,NA,NA,NA,NA +drug_strength,denominator_unit_concept_id,No,integer,The Concept representing the denominator unit for the concentration of active ingredient.,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +drug_strength,box_size,No,integer,The number of units of Clinical Branded Drug or Quantified Clinical or Branded Drug contained in a box as dispensed to the patient.,NA,No,No,NA,NA,NA,NA,NA 
+drug_strength,valid_start_date,Yes,date,"The date when the Concept was first +recorded. The default value is +1-Jan-1970.",NA,No,No,NA,NA,NA,NA,NA +drug_strength,valid_end_date,Yes,date,The date when then Concept became invalid.,NA,No,No,NA,NA,NA,NA,NA +drug_strength,invalid_reason,No,varchar(1),"Reason the concept was invalidated. Possible values are D (deleted), U (replaced with an update) or NULL when valid_end_date has the default value.",NA,No,No,NA,NA,NA,NA,NA +cohort,cohort_definition_id,Yes,integer,NA,NA,No,No,NA,NA,NA,NA,NA +cohort,subject_id,Yes,integer,NA,NA,No,No,NA,NA,NA,NA,NA +cohort,cohort_start_date,Yes,date,NA,NA,No,No,NA,NA,NA,NA,NA +cohort,cohort_end_date,Yes,date,NA,NA,No,No,NA,NA,NA,NA,NA +cohort_definition,cohort_definition_id,Yes,integer,"This is the identifier given to the cohort, usually by the ATLAS application",NA,No,No,NA,NA,NA,NA,NA +cohort_definition,cohort_definition_name,Yes,varchar(255),A short description of the cohort,NA,No,No,NA,NA,NA,NA,NA +cohort_definition,cohort_definition_description,No,varchar(MAX),A complete description of the cohort.,NA,No,No,NA,NA,NA,NA,NA +cohort_definition,definition_type_concept_id,Yes,integer,Type defining what kind of Cohort Definition the record represents and how the syntax may be executed.,NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +cohort_definition,cohort_definition_syntax,No,varchar(MAX),Syntax or code to operationalize the Cohort Definition.,NA,No,No,NA,NA,NA,NA,NA +cohort_definition,subject_concept_id,Yes,integer,"This field contains a Concept that represents the domain of the subjects that are members of the cohort (e.g., Person, Provider, Visit).",NA,No,Yes,CONCEPT,CONCEPT_ID,NA,NA,NA +cohort_definition,cohort_initiation_date,No,date,A date to indicate when the Cohort was initiated in the COHORT table.,NA,No,No,NA,NA,NA,NA,NA diff --git a/ehrdata/utils/omop_utils.py b/ehrdata/utils/omop_utils.py new file mode 100644 index 0000000..b6a796b --- /dev/null +++ b/ehrdata/utils/omop_utils.py @@ -0,0 +1,455 @@ +import pandas as pd +import os +import csv +import warnings +import dask.dataframe as dd +from pathlib import Path +from typing import List, Union, Literal, Optional, Dict +import numbers +from rich import print as rprint +import glob +from difflib import SequenceMatcher +from heapq import nlargest as _nlargest + + +def get_table_catalog_dict(): + table_catalog_dict = {} + table_catalog_dict['Clinical data'] = [ + "person", + "observation_period", + "specimen", + "death", + "visit_occurrence", + "visit_detail", + "procedure_occurrence", + "drug_exposure", + "device_exposure", + "condition_occurrence", + "measurement", + "note", + "note_nlp", + "observation", + "fact_relationship", + ] + + table_catalog_dict["Health system data"] = ["location", "care_site", "provider"] + table_catalog_dict["Health economics data"] = ["payer_plan_period", "cost"] + table_catalog_dict["Standardized derived elements"] = ["cohort", "cohort_definition", "drug_era", "dose_era", "condition_era"] + table_catalog_dict["Metadata"] = ["cdm_source", "metadata"] + table_catalog_dict["Vocabulary"] = [ + "concept", + "vocabulary", + "domain", + "concept_class", + "concept_relationship", + "relationship", + "concept_synonym", + "concept_ancestor", + "source_to_concept_map", + "drug_strength", + ] + return table_catalog_dict + +def get_dtype_mapping(): + dtype_mapping = {'integer': "Int64", + 'Integer': "Int64", + 'float': float, + 'bigint': "Int64", + 'varchar(MAX)': str, + 'varchar(2000)': str, + 'varchar(1000)': str, + 'varchar(255)': str, + 
'varchar(250)': str,
+                     'varchar(80)': str,
+                     'varchar(60)': str,
+                     'varchar(50)': str,
+                     'varchar(25)': str,
+                     'varchar(20)': str,
+                     'varchar(10)': str,
+                     'varchar(9)': str,
+                     'varchar(3)': str,
+                     'varchar(2)': str,
+                     'varchar(1)': str,
+                     'datetime': object,
+                     'date': object}
+
+    return dtype_mapping
+
+def get_omop_cdm_field_level():
+    pth = f"{Path(__file__).resolve().parent}/OMOP_CDMv5.4_Field_Level.csv"
+    df = pd.read_csv(pth)
+    return df
+
+def check_with_omop_cdm(
+    delimiter,
+    folder_path: str,
+    make_filename_lowercase=True):
+
+    print("Checking if your data adheres to the OMOP Common Data Model (CDM) version 5.4 standards.")
+    filepath_list = glob.glob(os.path.join(folder_path, "*.csv")) + glob.glob(os.path.join(folder_path, "*.parquet"))
+    filepath_dict = {}
+    for path in filepath_list:
+        if os.path.isfile(path):
+            is_single_file = True
+        else:
+            is_single_file = False
+
+        # TODO support table stored in a folder
+        """
+        # If not a single file, only check the first one's column names
+        if not os.path.isfile(path):
+            folder_walk = os.walk(path)
+            first_file_in_folder = next(folder_walk)[2][0]
+            file = os.path.join(path, first_file_in_folder)
+            is_single_file = False
+        """
+        if is_single_file and not check_csv_has_only_header(path):
+
+            # Make filename into lowercase
+            if make_filename_lowercase:
+                new_path = os.path.join(folder_path, path.split("/")[-1].lower())
+                if path != new_path:
+                    warnings.warn(f"Rename file [{path}] to [{new_path}]")
+                    os.rename(path, new_path)
+                    path = new_path
+
+            # check if table name adheres to the OMOP CDM
+            file_name = os.path.basename(path).split(".")[0]
+            field_level = get_omop_cdm_field_level()
+            if file_name not in set(field_level.cdmTableName):
+                raise KeyError(f"Table [{file_name}] is not defined in OMOP CDM v5.4! Please change the table name manually!")
+
+            # check if column names adhere to the OMOP CDM
+            if path.endswith('csv'):
+                with open(path, "r") as f:
+                    dict_reader = csv.DictReader(f, delimiter=delimiter)
+                    columns = dict_reader.fieldnames
+                    columns = list(filter(None, columns))
+            elif path.endswith('parquet'):
+                df = dd.read_parquet(path)
+                columns = list(df.columns)
+            else:
+                raise TypeError("Only support CSV and Parquet file!")
+
+            invalid_column_name = []
+            for _, column in enumerate(columns):
+                cdm_columns = set(field_level[field_level.cdmTableName == file_name]['cdmFieldName'])
+                if column not in cdm_columns:
+                    invalid_column_name.append(column)
+            if len(invalid_column_name) > 0:
+                print(f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! 
Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}") + raise KeyError + + filepath_dict[file_name] = path + return filepath_dict + +def check_csv_has_only_header(file_path): + if file_path.endswith('csv'): + with open(file_path, 'r') as file: + reader = csv.reader(file) + header = next(reader, None) + if header is not None: + second_row = next(reader, None) + return second_row is None + else: + return False + else: + return False + +def get_column_types(adata_dict, + table_name: str = None): + + path = adata_dict['filepath_dict'][table_name] + column_types = {} + # If not a single file, read the first one + if not os.path.isfile(path): + folder_walk = os.walk(path) + first_file_in_folder = next(folder_walk)[2][0] + path = os.path.join(path, first_file_in_folder) + + if path.endswith('csv'): + with open(path, "r") as f: + dict_reader = csv.DictReader(f, delimiter=adata_dict['delimiter']) + columns = dict_reader.fieldnames + columns = list(filter(None, columns)) + elif path.endswith('parquet'): + df = dd.read_parquet(path) + columns = list(df.columns) + else: + raise TypeError("Only support CSV and Parquet file!") + columns_lowercase = [column.lower() for column in columns] + for _, column in enumerate(columns_lowercase): + dtype_mapping = get_dtype_mapping() + field_level = get_omop_cdm_field_level() + column_types[column] = dtype_mapping[field_level[(field_level.cdmTableName == table_name) & (field_level.cdmFieldName == column)]['cdmDatatype'].values[0]] + return column_types + + +def get_primary_key(table_name): + field_level = get_omop_cdm_field_level() + primary_key = field_level[(field_level.cdmTableName == table_name) & (field_level.isPrimaryKey == 'Yes')]['cdmFieldName'].values[0] + return primary_key + +def read_table(adata_dict, table_name: str = None, dtype=None, parse_dates=None, index=None, usecols=None, use_dask=None): + + if not use_dask: + use_dask = adata_dict['use_dask'] + path = adata_dict['filepath_dict'][table_name] + if use_dask: + if not os.path.isfile(path): + folder_walk = os.walk(path) + filetype = next(folder_walk)[2][0].split(".")[-1] + else: + filetype = path.split(".")[-1] + if filetype == 'csv': + if not os.path.isfile(path): + path = f"{path}/*.csv" + if usecols: + dtype = {key: dtype[key] for key in usecols if key in dtype} + if parse_dates: + parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} + df = dd.read_csv(path, delimiter=adata_dict['delimiter'], dtype=dtype, parse_dates=parse_dates, usecols=usecols) + elif filetype == 'parquet': + if not os.path.isfile(path): + path = f"{path}/*.parquet" + if usecols: + dtype = {key: dtype[key] for key in usecols if key in dtype} + if parse_dates: + parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} + df = dd.read_parquet(path, dtype=dtype, parse_dates=parse_dates, columns=usecols) + else: + raise TypeError("Only support CSV and Parquet file!") + else: + if not os.path.isfile(path): + raise TypeError("Only support reading a single file!") + filetype = path.split(".")[-1] + if filetype == 'csv': + if usecols: + dtype = {key: dtype[key] for key in usecols if key in dtype} + if parse_dates: + parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} + df = pd.read_csv(path, delimiter=adata_dict['delimiter'], dtype=dtype, parse_dates=parse_dates, usecols=usecols) + elif filetype == 'parquet': + df = pd.read_parquet(path, columns=usecols) + + else: + 
raise TypeError("Only support CSV and Parquet file!") + + + if index: + df = df.set_index(index) + return df + + +def map_concept_id( + adata_dict, + concept_id: Union[str, List], + verbose=True): + + filepath_dict = adata_dict['filepath_dict'] + tables = adata_dict['tables'] + delimiter = adata_dict['delimiter'] + + if isinstance(concept_id, numbers.Integral): + concept_id = [concept_id] + concept_id_1 = [] + concept_id_2 = [] + concept_id_mapped_not_found = [] + + if "concept_relationship" in tables: + column_types = get_column_types(adata_dict, table_name="concept_relationship") + df_concept_relationship = pd.read_csv( + filepath_dict["concept_relationship"], dtype=column_types + ) + # TODO dask Support + #df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], + df_concept_relationship.dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], + concept_relationship_dict = df_to_dict( + df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Maps to"], + key="concept_id_1", + value="concept_id_2", + ) + concept_relationship_dict_reverse = df_to_dict( + df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Mapped from"], + key="concept_id_1", + value="concept_id_2", + ) + for id in concept_id: + try: + concept_id_2.append(concept_relationship_dict[id]) + concept_id_1.append(id) + except KeyError: + try: + concept_id_1.append(concept_relationship_dict_reverse[id]) + concept_id_2.append(id) + except KeyError: + concept_id_1.append(id) + concept_id_2.append(id) + concept_id_mapped_not_found.append(id) + if len(concept_id_mapped_not_found) > 0: + # warnings.warn(f"Couldn't find a map for concept {id} in concept_relationship table!") + if verbose: + rprint(f"Couldn't find a map for concept {concept_id_mapped_not_found} in concept_relationship table!") + else: + concept_id_1 = concept_id + concept_id_2 = concept_id + + if len(concept_id_1) == 1: + return concept_id_1[0], concept_id_2[0] + else: + return concept_id_1, concept_id_2 + + +def df_to_dict(df, key, value): + if isinstance(df, dd.DataFrame): + return pd.Series(df[value].compute().values, index=df[key].compute()).to_dict() + else: + return pd.Series(df[value].values, index=df[key]).to_dict() + + +def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6): + """Use SequenceMatcher to return a list of the indexes of the best + "good enough" matches. word is a sequence for which close matches + are desired (typically a string). + possibilities is a dictionary of sequences. + Optional arg n (default 2) is the maximum number of close matches to + return. n must be > 0. + Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities + that don't score at least that similar to word are ignored. 
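+
+    Illustrative sketch of the expected call shape; the identifiers below are
+    made up for the example, any mapping of identifier -> name works:
+
+        possibilities = {11: "Systolic blood pressure", 22: "Body weight"}
+        get_close_matches_using_dict("Systolic blood pressure", possibilities, n=1, cutoff=0.6)
+        # -> [("Systolic blood pressure", 11, 1.0)]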
+ """ + + if not n > 0: + raise ValueError("n must be > 0: %r" % (n,)) + if not 0.0 <= cutoff <= 1.0: + raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) + result = [] + s = SequenceMatcher() + s.set_seq2(word) + for _, (key, value) in enumerate(possibilities.items()): + s.set_seq1(value) + if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff: + result.append((s.ratio(), value, key)) + + # Move the best scorers to head of list + result = _nlargest(n, result) + + # Strip scores for the best n matches + return [(value, key, score) for score, value, key in result] + + + +def get_feature_info( + adata_dict: Dict, + features: Union[str, int, List[Union[str, int]]] = None, + ignore_not_shown_in_concept_table: bool = True, + exact_match: bool = True, + verbose: bool = True, +): + + if "concept" in adata_dict['tables']: + column_types = get_column_types(adata_dict, table_name="concept") + + df_concept = read_table(adata_dict, table_name="concept", dtype=column_types).dropna( + subset=["concept_id", "concept_name"] + ) # usecols=vocabularies_tables_columns["concept"], + #concept_dict = df_to_dict(df=df_concept, key="concept_name", value="concept_id") + + fetures_not_shown_in_concept_table = [] + + info_df = pd.DataFrame([]) + if isinstance(features, str): + features = [features] + # Get feature id for each input, and check if each feature occurs in the concept table + for feature in features: + # if the input is feature ID + if isinstance(feature, numbers.Integral): + feature_id = feature + feature_id_1, feature_id_2 = map_concept_id(adata_dict=adata_dict, concept_id=feature_id, verbose=False) + try: + feature_name = df_concept[df_concept['concept_id'] == feature_id_1]['concept_name'].values[0] + except KeyError: + if ignore_not_shown_in_concept_table: + fetures_not_shown_in_concept_table.append(feature) + continue + else: + rprint(f"Feature ID - [red]{feature_id_1}[/] could not be found in concept table") + raise + match_1_ratio = 100 + + # if the input is feature name + elif isinstance(feature, str): + # return a list of (value, key, score) + #result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2) + from thefuzz import process + + # the thefuzz match returns a list of tuples of (matched string, match ratio) + result = process.extract(feature, list(df_concept['concept_name'].values), limit=2) + + + match_1 = result[0] + match_1_name = match_1[0] + match_1_ratio = match_1[1] + # Most of the case: if find 2 best matches + if len(result) == 2: + + match_2 = result[1] + match_2_name = match_2[0] + match_2_ratio = match_2[1] + + if match_1_ratio != 100: + if exact_match: + rprint( + f"Unable to find an exact match for [blue]{feature}[/] in the concept table.\nSimilar ones: 1) [blue]{match_1_name}[/] with match ratio [red]{match_1_ratio}[/] 2) [blue]{match_2_name}[/] with match ratio [red]{match_2_ratio}[/]" + ) + raise ValueError + else: + if match_2_ratio == 100: + match_1_id = df_concept[df_concept['concept_name'] == match_1_name]['concept_id'].values[0] + match_2_id = df_concept[df_concept['concept_name'] == match_2_name]['concept_id'].values[0] + rprint( + f"Found multiple exact matches for [blue]{feature}[/] in the concept table.\n1) concept id: [blue]{match_1_id}[/] 2) concept id: [blue]{match_2_id}[/]. Please specify concept_id directly." 
+ ) + raise ValueError + + + # Very rare: if only find 1 match + else: + if exact_match and match_1_ratio != 1: + rprint( + f"Unable to find an exact match for [red]{feature}[/] in the concept table. Similiar one: [blue]{match_1_name}[/] with match ratio [red]{match_1_ratio}[/]" + ) + raise ValueError + + feature_name = match_1_name + feature_id = df_concept[df_concept['concept_name'] == feature_name]['concept_id'].values[0] + feature_id_1, feature_id_2 = map_concept_id(adata_dict=adata_dict, concept_id=feature_id, verbose=False) + + else: + rprint( + f"Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] that you want to extarct" + ) + raise TypeError + + info_df = pd.concat([info_df, pd.DataFrame(data=[[feature_name, feature_id_1, feature_id_2]], columns=['feature_name', 'feature_id_1', 'feature_id_2'])]) + + + # feature_name_list.append(feature_name) + # domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0]) + # concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0]) + # concept_code_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_code"].reset_index(drop=True).compute()[0]) + + if verbose: + rprint( + f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match ratio = [green]{match_1_ratio}." + ) + + if info_df[f"feature_id_1"].equals(info_df[f"feature_id_2"]): + info_df.drop(f"feature_id_2", axis=1, inplace=True) + info_df = info_df.rename(columns={"feature_id_1": "feature_id"}) + info_df = info_df.reset_index(drop=True) + else: + info_df = info_df.reset_index(drop=True) + return info_df \ No newline at end of file diff --git a/src/ehrdata/__init__.py b/src/ehrdata/__init__.py deleted file mode 100644 index 709eac7..0000000 --- a/src/ehrdata/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from importlib.metadata import version - -from . import pl, pp, tl - -__all__ = ["pl", "pp", "tl"] - -__version__ = version("ehrdata") diff --git a/src/ehrdata/pl/__init__.py b/src/ehrdata/pl/__init__.py deleted file mode 100644 index c2315dd..0000000 --- a/src/ehrdata/pl/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .basic import BasicClass, basic_plot diff --git a/src/ehrdata/pl/basic.py b/src/ehrdata/pl/basic.py deleted file mode 100644 index ed390ef..0000000 --- a/src/ehrdata/pl/basic.py +++ /dev/null @@ -1,63 +0,0 @@ -from anndata import AnnData - - -def basic_plot(adata: AnnData) -> int: - """Generate a basic plot for an AnnData object. - - Parameters - ---------- - adata - The AnnData object to preprocess. - - Returns - ------- - Some integer value. - """ - print("Import matplotlib and implement a plotting function here.") - return 0 - - -class BasicClass: - """A basic class. - - Parameters - ---------- - adata - The AnnData object to preprocess. - """ - - my_attribute: str = "Some attribute." - my_other_attribute: int = 0 - - def __init__(self, adata: AnnData): - print("Implement a class here.") - - def my_method(self, param: int) -> int: - """A basic method. - - Parameters - ---------- - param - A parameter. - - Returns - ------- - Some integer value. - """ - print("Implement a method here.") - return 0 - - def my_other_method(self, param: str) -> str: - """Another basic method. - - Parameters - ---------- - param - A parameter. - - Returns - ------- - Some integer value. 
- """ - print("Implement a method here.") - return "" diff --git a/src/ehrdata/pp/__init__.py b/src/ehrdata/pp/__init__.py deleted file mode 100644 index 5e7e293..0000000 --- a/src/ehrdata/pp/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .basic import basic_preproc diff --git a/src/ehrdata/pp/basic.py b/src/ehrdata/pp/basic.py deleted file mode 100644 index 5db1ec0..0000000 --- a/src/ehrdata/pp/basic.py +++ /dev/null @@ -1,17 +0,0 @@ -from anndata import AnnData - - -def basic_preproc(adata: AnnData) -> int: - """Run a basic preprocessing on the AnnData object. - - Parameters - ---------- - adata - The AnnData object to preprocess. - - Returns - ------- - Some integer value. - """ - print("Implement a preprocessing function here.") - return 0 diff --git a/src/ehrdata/tl/__init__.py b/src/ehrdata/tl/__init__.py deleted file mode 100644 index 95a32cd..0000000 --- a/src/ehrdata/tl/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .basic import basic_tool diff --git a/src/ehrdata/tl/basic.py b/src/ehrdata/tl/basic.py deleted file mode 100644 index d215ade..0000000 --- a/src/ehrdata/tl/basic.py +++ /dev/null @@ -1,17 +0,0 @@ -from anndata import AnnData - - -def basic_tool(adata: AnnData) -> int: - """Run a tool on the AnnData object. - - Parameters - ---------- - adata - The AnnData object to preprocess. - - Returns - ------- - Some integer value. - """ - print("Implement a tool to run on the AnnData object.") - return 0 diff --git a/tests/test_basic.py b/tests/test_basic.py index 4ac1cbe..e61e5c0 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,10 +1,10 @@ import pytest -import ehrdata +import ehrdata_source def test_package_has_version(): - assert ehrdata.__version__ is not None + assert ehrdata_source.__version__ is not None @pytest.mark.skip(reason="This decorator should be removed when test passes.") From d20a3b0a2c6bb1eb799ea44d62df58dd06e63732 Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Wed, 14 Feb 2024 12:13:26 +0100 Subject: [PATCH 04/13] update --- ehrdata_source.py | 1313 ++++++++++++++++++++++++++++++++++++++++++++ omop_conversion.py | 246 +++++++++ 2 files changed, 1559 insertions(+) create mode 100644 ehrdata_source.py create mode 100644 omop_conversion.py diff --git a/ehrdata_source.py b/ehrdata_source.py new file mode 100644 index 0000000..11eceba --- /dev/null +++ b/ehrdata_source.py @@ -0,0 +1,1313 @@ +import awkward as ak +import numpy as np +import pandas as pd +import csv +import pandas as pd +import matplotlib.pyplot as plt +import seaborn as sns +import ehrapy as ep +import scanpy as sc +from anndata import AnnData +import mudata as md +from mudata import MuData +from typing import List, Union, Literal, Optional +import os +import glob +import dask.dataframe as dd +from thefuzz import process +import sys +from rich import print as rprint +import missingno as msno +import warnings +import numbers +import os +from pandas.tseries.offsets import DateOffset as Offset + +import anndata as ad +from collections.abc import Collection, Iterable, Mapping, Sequence +from enum import Enum +from functools import partial +from types import MappingProxyType +from typing import TYPE_CHECKING, Any, Callable, Literal, Union + +import scanpy as sc +from scanpy.plotting import DotPlot, MatrixPlot, StackedViolin +from matplotlib.axes import Axes + +from difflib import SequenceMatcher +from heapq import nlargest as _nlargest + + +pth = 'auxillary_files/OMOP_CDMv5.4_Field_Level.csv' +field_level = pd.read_csv(pth) +dtype_mapping = {'integer': "Int64", + 'Integer': "Int64", + 
'float': float, + 'bigint': "Int64", + 'varchar(MAX)': str, + 'varchar(2000)': str, + 'varchar(1000)': str, + 'varchar(255)': str, + 'varchar(250)': str, + 'varchar(80)': str, + 'varchar(60)': str, + 'varchar(50)': str, + 'varchar(25)': str, + 'varchar(20)': str, + 'varchar(10)': str, + 'varchar(9)': str, + 'varchar(3)': str, + 'varchar(2)': str, + 'varchar(1)': str, + 'datetime': object, + 'date': object} + + + +def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6): + """Use SequenceMatcher to return a list of the indexes of the best + "good enough" matches. word is a sequence for which close matches + are desired (typically a string). + possibilities is a dictionary of sequences. + Optional arg n (default 2) is the maximum number of close matches to + return. n must be > 0. + Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities + that don't score at least that similar to word are ignored. + """ + + if not n > 0: + raise ValueError("n must be > 0: %r" % (n,)) + if not 0.0 <= cutoff <= 1.0: + raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) + result = [] + s = SequenceMatcher() + s.set_seq2(word) + for _, (key, value) in enumerate(possibilities.items()): + s.set_seq1(value) + if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff: + result.append((s.ratio(), value, key)) + + # Move the best scorers to head of list + result = _nlargest(n, result) + + # Strip scores for the best n matches + return [(value, key, score) for score, value, key in result] + + +def df_to_dict(df, key, value): + if isinstance(df, dd.DataFrame): + return pd.Series(df[value].compute().values, index=df[key].compute()).to_dict() + else: + return pd.Series(df[value].values, index=df[key]).to_dict() + + +def check_csv_has_only_header(file_path): + if file_path.endswith('csv'): + with open(file_path, 'r') as file: + reader = csv.reader(file) + header = next(reader, None) # Read the header + if header is not None: + second_row = next(reader, None) # Try to read the next row + return second_row is None # If there's no second row, return True + else: + return False # File is empty or not a valid CSV + else: + return False + + +class OMOP: + def __init__(self, folder_path, delimiter=None, make_filename_lowercase=True, use_dask=False): + self.base = folder_path + self.delimiter = delimiter + self.use_dask = use_dask + # TODO support also parquet and other formats + file_list = glob.glob(os.path.join(folder_path, "*.csv")) + glob.glob(os.path.join(folder_path, "*.parquet")) + self.loaded_tabel = None + self.filepath = {} + for file_path in file_list: + file_name = file_path.split("/")[-1].split(".")[0] + if check_csv_has_only_header(file_path): + pass + else: + # Rename the file + if make_filename_lowercase: + new_filepath = os.path.join(self.base, file_path.split("/")[-1].lower()) + if file_path != new_filepath: + warnings(f"Rename file [file_path] to [new_filepath]") + os.rename(file_path, new_filepath) + self.filepath[file_name] = new_filepath + else: + self.filepath[file_name] = file_path + self.check_with_omop_cdm() + self.tables = list(self.filepath.keys()) + + """ + if "concept" in self.tables: + df_concept = dd.read_csv(self.filepath["concept"], usecols=vocabularies_tables_columns["concept"]) + self.concept_id_to_name = dict(zip(df_concept['id'], df_concept['name'])) + self.concept_name_to_id = dict(zip(df_concept['name'], df_concept['id'])) + """ + + def __repr__(self) -> str: + # TODO this should be seperated by diff table categories + 
def format_tables(tables, max_line_length=80): + line = "" + for table in tables: + # Check if adding the next table would exceed the max line length + if len(line) + len(table) > max_line_length: + # Yield the current line and start a new one + yield line + line = table + else: + # Add the table to the current line + line += table if line == "" else ", " + table + # Yield the last line + yield line + + tables_str = "\n".join(format_tables(self.tables)) + return f'OMOP object ({os.path.basename(self.base)}) with {len(self.tables)} tables.\nTables: {tables_str}' + + def set_path(self, table_name, file_path): + # TODO move to init + self.tables.append(table_name) + self.filepath[table_name] = file_path + + def check_with_omop_cdm(self): + for file_name, path in self.filepath.items(): + if file_name not in set(field_level.cdmTableName): + raise KeyError(f"Table [{file_name}] is not defined in OMOP CDM v5.4! Please change the table name manually!") + # If not a single file, read the first one + if not os.path.isfile(path): + folder_walk = os.walk(path) + first_file_in_folder = next(folder_walk)[2][0] + path = os.path.join(path, first_file_in_folder) + + if path.endswith('csv'): + with open(path, "r") as f: + dict_reader = csv.DictReader(f, delimiter=self.delimiter) + columns = dict_reader.fieldnames + columns = list(filter(None, columns)) + elif path.endswith('parquet'): + df = dd.read_parquet(path) + columns = list(df.columns) + else: + raise TypeError("Only support CSV and Parquet file!") + columns_lowercase = [column.lower() for column in columns] + + invalid_column_name = [] + for _, column in enumerate(columns_lowercase): + cdm_columns = set(field_level[field_level.cdmTableName == file_name]['cdmFieldName']) + if column not in cdm_columns: + invalid_column_name.append(column) + if len(invalid_column_name) > 0: + print(f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! 
Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}") + raise KeyError + + + + def _get_column_types(self, + path: str = None, + filename: str = None): + column_types = {} + # If not a single file, read the first one + if not os.path.isfile(path): + folder_walk = os.walk(path) + first_file_in_folder = next(folder_walk)[2][0] + path = os.path.join(path, first_file_in_folder) + + if path.endswith('csv'): + with open(path, "r") as f: + dict_reader = csv.DictReader(f, delimiter=self.delimiter) + columns = dict_reader.fieldnames + columns = list(filter(None, columns)) + elif path.endswith('parquet'): + df = dd.read_parquet(path) + columns = list(df.columns) + else: + raise TypeError("Only support CSV and Parquet file!") + columns_lowercase = [column.lower() for column in columns] + for _, column in enumerate(columns_lowercase): + column_types[column] = dtype_mapping[field_level[(field_level.cdmTableName == filename) & (field_level.cdmFieldName == column)]['cdmDatatype'].values[0]] + return column_types + + def _read_table(self, path, dtype=None, parse_dates=None, index=None, usecols=None, use_dask=False, **kwargs): + + if use_dask: + if not os.path.isfile(path): + folder_walk = os.walk(path) + filetype = next(folder_walk)[2][0].split(".")[-1] + else: + filetype = path.split(".")[-1] + if filetype == 'csv': + if not os.path.isfile(path): + path = f"{path}/*.csv" + if usecols: + dtype = {key: dtype[key] for key in usecols if key in dtype} + if parse_dates: + parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} + df = dd.read_csv(path, delimiter=self.delimiter, dtype=dtype, parse_dates=parse_dates, usecols=usecols) + elif filetype == 'parquet': + if not os.path.isfile(path): + path = f"{path}/*.parquet" + if usecols: + dtype = {key: dtype[key] for key in usecols if key in dtype} + if parse_dates: + parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} + df = dd.read_parquet(path, dtype=dtype, parse_dates=parse_dates, columns=usecols) + else: + raise TypeError("Only support CSV and Parquet file!") + else: + if not os.path.isfile(path): + raise TypeError("Only support reading a single file!") + filetype = path.split(".")[-1] + if filetype == 'csv': + if usecols: + dtype = {key: dtype[key] for key in usecols if key in dtype} + if parse_dates: + parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} + df = pd.read_csv(path, delimiter=self.delimiter, dtype=dtype, parse_dates=parse_dates, usecols=usecols) + elif filetype == 'parquet': + df = pd.read_parquet(path, columns=usecols) + else: + raise TypeError("Only support CSV and Parquet file!") + + if index: + df = df.set_index(index) + return df + + # TODO redo this using omop cdm csv file + @property + def table_catalog(self): + """ + A dictionary containing all of the ``Clinical`` OMOP CDM tables in the connected database. 
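+
+        Illustrative shape of the dictionary that is built (category -> table names;
+        only a few of the actual entries are shown here):
+
+            {
+                "Clinical data": ["person", "visit_occurrence", "measurement", ...],
+                "Vocabulary": ["concept", "concept_relationship", ...],
+            }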
+ """ + table_catalog_dict = {} + table_catalog_dict['Clinical data'] = [ + "person", + "observation_period", + "specimen", + "death", + "visit_occurrence", + "visit_detail", + "procedure_occurrence", + "drug_exposure", + "device_exposure", + "condition_occurrence", + "measurement", + "note", + "note_nlp", + "observation", + "fact_relationship", + ] + + table_catalog_dict["Health system data"] = ["location", "care_site", "provider"] + table_catalog_dict["Health economics data"] = ["payer_plan_period", "cost"] + table_catalog_dict["Standardized derived elements"] = ["cohort", "cohort_definition", "drug_era", "dose_era", "condition_era"] + table_catalog_dict["Metadata"] = ["cdm_source", "metadata"] + table_catalog_dict["Vocabulary"] = [ + "concept", + "vocabulary", + "domain", + "concept_class", + "concept_relationship", + "relationship", + "concept_synonym", + "concept_ancestor", + "source_to_concept_map", + "drug_strength", + ] + self._table_catalog_dict = table_catalog_dict + + def load(self, level="stay_level", tables=["visit_occurrence", "person", "death"], remove_empty_column=True): + # TODO patient level and hospital level + if level == "stay_level": + index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} + # TODO Only support clinical_tables_columns + + for table in tables: + print(f"reading table [{table}]") + column_types = self._get_column_types(path = self.filepath[table], filename=table) + df = self._read_table(self.filepath[table], dtype=column_types, index='person_id') # TODO parse_dates = parse_dates + if remove_empty_column: + # TODO dask Support + #columns = [column for column in df.columns if not df[column].compute().isna().all()] + columns = [column for column in df.columns if not df[column].isna().all()] + df = df.loc[:, columns] + setattr(self, table, df) + + # concept_id_list = list(self.concept.concept_id) + # concept_name_list = list(self.concept.concept_id) + # concept_domain_id_list = list(set(self.concept.domain_id)) + + # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] + # TODO dask Support + joined_table = pd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how="left") + + joined_table = pd.merge(joined_table, self.death, left_index=True, right_index=True, how="left") + + # TODO dask Support + #joined_table = joined_table.compute() + + # TODO check this earlier + joined_table = joined_table.drop_duplicates(subset='visit_occurrence_id') + joined_table = joined_table.set_index("visit_occurrence_id") + # obs_only_list = list(self.joined_table.columns) + # obs_only_list.remove('visit_occurrence_id') + columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"])) + adata = ep.ad.df_to_anndata( + joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only + ) + # TODO this needs to be fixed because anndata set obs index as string by default + #adata.obs.index = adata.obs.index.astype(int) + + """ + for column in self.measurement.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.drug_exposure.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + 
obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.observation.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + """ + + return adata + + def feature_counts( + self, + source: Literal[ + "observation", + "measurement", + "procedure_occurrence", + "specimen", + "device_exposure", + "drug_exposure", + "condition_occurrence", + ], + number=20, + key = None + ): + + if source == 'measurement': + columns = ["value_as_number", "time", "visit_occurrence_id", "measurement_concept_id"] + elif source == 'observation': + columns = ["value_as_number", "value_as_string", "measurement_datetime"] + elif source == 'condition_occurrence': + columns = None + else: + raise KeyError(f"Extracting data from {source} is not supported yet") + + column_types = self._get_column_types(path = self.filepath[source], filename=source) + df_source = self._read_table(self.filepath[source], dtype=column_types, usecols=[f"{source}_concept_id"], use_dask=True) + # TODO dask Support + #feature_counts = df_source[f"{source}_concept_id"].value_counts().compute()[0:number] + feature_counts = df_source[f"{source}_concept_id"].value_counts().compute() + feature_counts = feature_counts.to_frame().reset_index(drop=False)[0:number] + + + feature_counts[f"{source}_concept_id_1"], feature_counts[f"{source}_concept_id_2"] = self.map_concept_id( + feature_counts[f"{source}_concept_id"], verbose=False + ) + feature_counts["feature_name"] = self.get_concept_name(feature_counts[f"{source}_concept_id_1"]) + if feature_counts[f"{source}_concept_id_1"].equals(feature_counts[f"{source}_concept_id_2"]): + feature_counts.drop(f"{source}_concept_id_2", axis=1, inplace=True) + feature_counts.rename(columns={f"{source}_concept_id_1": f"{source}_concept_id"}) + feature_counts = feature_counts.reindex(columns=["feature_name", f"{source}_concept_id", "count"]) + else: + feature_counts = feature_counts.reindex( + columns=["feature_name", f"{source}_concept_id_1", f"{source}_concept_id_2", "count"] + ) + + ax = sns.barplot(feature_counts, x="feature_name", y="count") + ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") + plt.tight_layout() + return feature_counts + + def map_concept_id(self, concept_id: Union[str, List], verbose=True): + if isinstance(concept_id, numbers.Integral): + concept_id = [concept_id] + concept_id_1 = [] + concept_id_2 = [] + concept_id_mapped_not_found = [] + + if "concept_relationship" in self.tables: + column_types = self._get_column_types(path = self.filepath["concept_relationship"], filename="concept_relationship") + df_concept_relationship = self._read_csv( + self.filepath["concept_relationship"], dtype=column_types + ) + # TODO dask Support + #df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], + df_concept_relationship.dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], + concept_relationship_dict = df_to_dict( + df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Maps to"], + key="concept_id_1", + value="concept_id_2", 
+ ) + concept_relationship_dict_reverse = df_to_dict( + df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Mapped from"], + key="concept_id_1", + value="concept_id_2", + ) + for id in concept_id: + try: + concept_id_2.append(concept_relationship_dict[id]) + concept_id_1.append(id) + except KeyError: + try: + concept_id_1.append(concept_relationship_dict_reverse[id]) + concept_id_2.append(id) + except KeyError: + concept_id_1.append(id) + concept_id_2.append(id) + concept_id_mapped_not_found.append(id) + if len(concept_id_mapped_not_found) > 0: + # warnings.warn(f"Couldn't find a map for concept {id} in concept_relationship table!") + if verbose: + rprint(f"Couldn't find a map for concept {concept_id_mapped_not_found} in concept_relationship table!") + else: + concept_id_1 = concept_id + concept_id_2 = concept_id + + if len(concept_id_1) == 1: + return concept_id_1[0], concept_id_2[0] + else: + return concept_id_1, concept_id_2 + + def get_concept_name(self, concept_id: Union[str, List], raise_error=False, verbose=True): + if isinstance(concept_id, numbers.Integral): + concept_id = [concept_id] + + column_types = self._get_column_types(path = self.filepath["concept"], filename="concept") + df_concept = self._read_table(self.filepath["concept"], dtype=column_types) + # TODO dask Support + #df_concept.compute().dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] + df_concept.dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] + concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") + concept_name = [] + concept_name_not_found = [] + for id in concept_id: + try: + concept_name.append(concept_dict[id]) + except KeyError: + concept_name.append(id) + concept_name_not_found.append(id) + if len(concept_name_not_found) > 0: + # warnings.warn(f"Couldn't find concept {id} in concept table!") + if verbose: + rprint(f"Couldn't find concept {concept_name_not_found} in concept table!") + if raise_error: + raise KeyError + if len(concept_name) == 1: + return concept_name[0] + else: + return concept_name + + def extract_note(self, adata, source="note"): + column_types = self._get_column_types(path = self.filepath[source], filename=source) + df_source = dd.read_csv(self.filepath[source], dtype=column_types) + if columns is None: + columns = df_source.columns + obs_dict = [ + { + column: list(df_source[df_source["visit_occurrence_id"] == int(visit_occurrence_id)][column]) + for column in columns + } + for visit_occurrence_id in adata.obs.index + ] + adata.obsm["note"] = ak.Array(obs_dict) + return adata + + def note_nlp_map( + self, + ): + # Got some inspirations from: https://github.com/aws-samples/amazon-comprehend-medical-omop-notes-mapping + pass + + + def get_feature_info( + self, + adata, + source: Literal[ + "observation", + "measurement", + "procedure_occurrence", + "specimen", + "device_exposure", + "drug_exposure", + "condition_occurrence", + ], + features: str or int or List[Union[str, int]] = None, + key: str = None, + ignore_not_shown_in_concept_table: bool = True, + exact_match: bool = True, + + verbose: bool = False, + ): + if key is None: + if source in ["measurement", "observation", "specimen"]: + key = f"{source}_concept_id" + elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: + key = f"{source.split('_')[0]}_concept_id" + else: + raise 
KeyError(f"Extracting data from {source} is not supported yet") + + if isinstance(features, str): + features = [features] + rprint(f"Trying to extarct the following features: {features}") + + # Input could be feature names/feature id (concept id) + # First convert all input feaure names into feature id. Map concept using CONCEPT_RELATIONSHIP table if required. + # Then try to extract feature data from source table using feature id. + + # TODO support features name + + if "concept" in self.tables: + column_types = self._get_column_types(path = self.filepath["concept"], filename="concept") + df_concept = self._read_table(self.filepath["concept"], dtype=column_types).dropna( + subset=["concept_id", "concept_name"] + ) # usecols=vocabularies_tables_columns["concept"], + concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") + + # TODO query this in the table + + feature_id_list = [] + feature_name_list = [] + domain_id_list = [] + concept_class_id_list = [] + concept_code_list = [] + + fetures_not_shown_in_concept_table = [] + + info_df = pd.DataFrame([]) + # Get feature id for each input, and check if each feature occurs in the concept table + for feature in features: + # if the input is feature ID + if isinstance(feature, numbers.Integral): + feature_id = feature + feature_id_1, feature_id_2 = self.map_concept_id(feature_id, verbose=False) + try: + feature_name = self.get_concept_name(feature_id_1, raise_error=True, verbose=False) + except KeyError: + if ignore_not_shown_in_concept_table: + fetures_not_shown_in_concept_table.append(feature) + continue + else: + rprint(f"Feature ID - [red]{feature_id_1}[/] could not be found in concept table") + raise + match_score = 1 + + # if the input is feature name + elif isinstance(feature, str): + # return a list of (value, key, score) + result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2) + + # if find 2 best matches + if len(result) == 2: + match_score = result[0][2] + + if match_score != 1: + if exact_match: + rprint( + f"Unable to find an exact match for [red]{feature}[/] in the concept table. Similar ones: 1) [red]{result[0][0]}[/] 2) [red]{result[1][0]}" + ) + raise ValueError + else: + if result[1][1] == 1: + rprint( + f"Found multiple exact matches for [red]{feature}[/] in the concept table: 1) concept id: [red]{result[0][1]}[/] 2) concept id: [red]{result[1][1]}[/]. It is better to specify concept id directly." 
+ ) + raise ValueError + feature_name = feature + feature_id = result[0][1] + # if only find 1 match + else: + feature_name = result[0][0] + match_score = result[0][1] + feature_id = result[0][2] + if exact_match and match_score != 1: + rprint( + f"Unable to find an exact match for [red]{feature}[/] in the concept table Similar one is [red]{result[0][0]}" + ) + raise ValueError + feature_id_1, feature_id_2 = self.map_concept_id(feature_id) + + else: + rprint( + "Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] you want to extarct" + ) + raise TypeError + + info_df = pd.concat([info_df, pd.DataFrame(data=[[feature_name, feature_id_1, feature_id_2]], columns=['feature_name', 'feature_id_1', 'feature_id_2'])]) + + + # feature_name_list.append(feature_name) + # domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0]) + # concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0]) + # concept_code_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_code"].reset_index(drop=True).compute()[0]) + + if verbose: + """ + if map_concept: + rprint( + f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, feature ID [green]{concept_id}[/] in concept relationship table, match socre = [green]{match_score}." + ) + else: + """ + rprint( + f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match socre = [green]{match_score}." + ) + if info_df[f"feature_id_1"].equals(info_df[f"feature_id_2"]): + info_df.drop(f"feature_id_2", axis=1, inplace=True) + info_df = info_df.rename(columns={"feature_id_1": "feature_id"}) + info_df = info_df.reset_index(drop=True) + else: + info_df = info_df.reset_index(drop=True) + return info_df + + def get_feature_statistics( + self, + adata, + source: Literal[ + "observation", + "measurement", + "procedure_occurrence", + "specimen", + "device_exposure", + "drug_exposure", + "condition_occurrence", + ], + features: str or int or List[Union[str, int]] = None, + level="stay_level", + value_col: str = 'value_source_value', + aggregation_methods: Union[Literal["min", "max", "mean", "std", "count"], List[Literal["min", "max", "mean", "std", "count"]]]=None, + add_aggregation_to_X: bool = True, + verbose: bool = False, + use_dask: bool = None, + ): + if source in ["measurement", "observation", "specimen"]: + key = f"{source}_concept_id" + elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: + key = f"{source.split('_')[0]}_concept_id" + else: + raise KeyError(f"Extracting data from {source} is not supported yet") + + if source == 'measurement': + source_table_columns = ['visit_occurrence_id', 'measurement_datetime', key, value_col] + elif source == 'observation': + source_table_columns = ['visit_occurrence_id', "observation_datetime", key, value_col] + elif source == 'condition_occurrence': + source_table_columns = None + else: + raise KeyError(f"Extracting data from {source} is not supported yet") + + if use_dask is None: + use_dask = self.use_dask + source_column_types = self._get_column_types(path = self.filepath[source], filename=source) + df_source = self._read_table(self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask) + info_df = self.get_feature_info(adata, source=source, features=features, 
verbose=False) + info_dict = info_df[['feature_id', 'feature_name']].set_index('feature_id').to_dict()['feature_name'] + + # Select featrues + df_source = df_source[df_source[key].isin(list(info_df.feature_id))] + #TODO Select time + #da_measurement = da_measurement[(da_measurement.time >= 0) & (da_measurement.time <= 48*60*60)] + #df_source[f'{source}_name'] = df_source[key].map(info_dict) + if aggregation_methods is None: + aggregation_methods = ["min", "max", "mean", "std", "count"] + if level == 'stay_level': + result = df_source.groupby(['visit_occurrence_id', key]).agg({ + value_col: aggregation_methods}) + + if use_dask: + result = result.compute() + result = result.reset_index(drop=False) + result.columns = ["_".join(a) for a in result.columns.to_flat_index()] + result.columns = result.columns.str.removesuffix('_') + result.columns = result.columns.str.removeprefix(f'{value_col}_') + result[f'{source}_name'] = result[key].map(info_dict) + + df_statistics = result.pivot(index='visit_occurrence_id', + columns=f'{source}_name', + values=aggregation_methods) + df_statistics.columns = df_statistics.columns.swaplevel() + df_statistics.columns = ["_".join(a) for a in df_statistics.columns.to_flat_index()] + + + # TODO + sort_columns = True + if sort_columns: + new_column_order = [] + for feature in features: + for suffix in (f'_{aggregation_method}' for aggregation_method in aggregation_methods): + col_name = f'{feature}{suffix}' + if col_name in df_statistics.columns: + new_column_order.append(col_name) + + df_statistics.columns = new_column_order + + df_statistics.index = df_statistics.index.astype(str) + + adata.obs = pd.merge(adata.obs, df_statistics, how='left', left_index=True, right_index=True) + + if add_aggregation_to_X: + adata = ep.ad.move_to_x(adata, list(df_statistics.columns)) + return adata + + + def extract_features( + self, + adata, + source: Literal[ + "observation", + "measurement", + "procedure_occurrence", + "specimen", + "device_exposure", + "drug_exposure", + "condition_occurrence", + ], + features: str or int or List[Union[str, int]] = None, + source_table_columns: Union[str, List[str]] = None, + dropna: Optional[bool] = True, + verbose: Optional[bool] = True, + use_dask: bool = None, + ): + + if source in ["measurement", "observation", "specimen"]: + key = f"{source}_concept_id" + elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: + key = f"{source.split('_')[0]}_concept_id" + else: + raise KeyError(f"Extracting data from {source} is not supported yet") + + if source_table_columns is None: + if source == 'measurement': + source_table_columns = ['visit_occurrence_id', 'measurement_datetime', 'value_as_number', key] + elif source == 'observation': + source_table_columns = ['visit_occurrence_id', "value_as_number", "value_as_string", "observation_datetime", key] + elif source == 'condition_occurrence': + source_table_columns = None + else: + raise KeyError(f"Extracting data from {source} is not supported yet") + if use_dask is None: + use_dask = self.use_dask + + + # TODO load using Dask or Dask-Awkward + # Load source table using dask + source_column_types = self._get_column_types(path = self.filepath[source], filename=source) + df_source = self._read_table(self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask) + info_df = self.get_feature_info(adata, source=source, features=features, verbose=False) + info_dict = info_df[['feature_id', 
'feature_name']].set_index('feature_id').to_dict()['feature_name'] + + + # Select featrues + df_source = df_source[df_source[key].isin(list(info_df.feature_id))] + + # TODO select time period + #df_source = df_source[(df_source.time >= 0) & (df_source.time <= 48*60*60)] + #da_measurement['measurement_name'] = da_measurement.measurement_concept_id.replace(info_dict) + + # TODO dask caching + """ + from dask.cache import Cache + cache = Cache(2e9) + cache.register() + """ + if use_dask: + if dropna == True: + df_source = df_source.compute().dropna() + else: + df_source = df_source.compute() + else: + if dropna == True: + df_source = df_source.dropna() + + # Preprocess steps outside the loop + unique_visit_occurrence_ids = set(adata.obs.index.astype(int)) + empty_entry = {source_table_column: [] for source_table_column in source_table_columns if source_table_column not in [key, 'visit_occurrence_id'] } + + # Filter data once, if possible + filtered_data = { + feature_id: df_source[df_source[key] == feature_id] + for feature_id in set(info_dict.keys()) + } + + for feature_id in set(info_dict.keys()): + df_feature = filtered_data[feature_id][list(set(source_table_columns) - set([key]))] + grouped = df_feature.groupby("visit_occurrence_id") + if verbose: + print(f"Adding feature [{info_dict[feature_id]}] into adata.obsm") + + # Use set difference and intersection more efficiently + feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) + + # Creating the array more efficiently + adata.obsm[info_dict[feature_id]] = ak.Array([ + grouped.get_group(visit_occurrence_id)[list(set(source_table_columns) - set([key, 'visit_occurrence_id']))].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry + for visit_occurrence_id in unique_visit_occurrence_ids + ]) + + return adata + + + def drop_nan(self, + adata, + key: Union[str, List[str]], + slot: Union[str, None] = 'obsm', + ): + if isinstance(key, str): + key_list = [key] + else: + key_list = key + if slot == 'obsm': + for key in key_list: + ak_array = adata.obsm[key] + + # Update the combined mask based on the presence of None in each field + for i, field in enumerate(ak_array.fields): + field_mask = ak.is_none(ak.nan_to_none(ak_array[field]), axis=1) + if i==0: + combined_mask = ak.full_like(field_mask, fill_value=False, dtype=bool) + combined_mask = combined_mask | field_mask + ak_array = ak_array[~combined_mask] + adata.obsm[key] = ak_array + + return adata + + # downsampling + def aggregate_timeseries_in_bins(self, + adata, + features: Union[str, List[str]], + slot: Union[str, None] = 'obsm', + value_key: str = 'value_as_number', + time_key: str = 'measurement_datetime', + time_binning_method: Literal["floor", "ceil", "round"] = "floor", + bin_size: Union[str, Offset] = 'h', + aggregation_method: Literal['median', 'mean', 'min', 'max'] = 'median', + time_upper_bound: int = 48# TODO + ): + + if isinstance(features, str): + features_list = [features] + else: + features_list = features + + # Ensure the time_binning_method provided is one of the expected methods + if time_binning_method not in ["floor", "ceil", "round"]: + raise ValueError(f"time_binning_method {time_binning_method} is not supported. Choose from 'floor', 'ceil', or 'round'.") + + if aggregation_method not in {'median', 'mean', 'min', 'max'}: + raise ValueError(f"aggregation_method {aggregation_method} is not supported. 
Choose from 'median', 'mean', 'min', or 'max'.") + + if slot == 'obsm': + for feature in features_list: + print(f"processing feature [{feature}]") + df = self.to_dataframe(adata, features) + if pd.api.types.is_datetime64_any_dtype(df[time_key]): + func = getattr(df[time_key].dt, time_binning_method, None) + if func is not None: + df[time_key] = func(bin_size) + else: + # TODO need to take care of this if it doesn't follow omop standard + if bin_size == 'h': + df[time_key] = df[time_key] / 3600 + func = getattr(np, time_binning_method) + df[time_key] = func(df[time_key]) + + df[time_key] = df[time_key].astype(str) + # Adjust time values that are equal to the time_upper_bound + #df.loc[df[time_key] == time_upper_bound, time_key] = time_upper_bound - 1 + + # Group and aggregate data + df = df.groupby(["visit_occurrence_id", time_key])[value_key].agg(aggregation_method).reset_index(drop=False) + grouped = df.groupby("visit_occurrence_id") + + unique_visit_occurrence_ids = adata.obs.index + empty_entry = {value_key: [], time_key: []} + + # Efficiently use set difference and intersection + feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) + # Efficiently create the array + ak_array = ak.Array([ + grouped.get_group(visit_occurrence_id)[[value_key, time_key]].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry + for visit_occurrence_id in unique_visit_occurrence_ids + ]) + adata.obsm[feature] = ak_array + + return adata + + def timeseries_discretizer(self, + adata, + key: Union[str, List[str]], + slot: Union[str, None] = 'obsm', + value_key: str = 'value_as_number', + time_key: str = 'measurement_datetime', + freq: str = 'hour', #TODO + time_limit: int = 48, #TODO + method: str = 'median' #TODO + ): + + pass + + + + def from_dataframe( + self, + adata, + feature: str, + df + ): + grouped = df.groupby("visit_occurrence_id") + unique_visit_occurrence_ids = set(adata.obs.index) + + # Use set difference and intersection more efficiently + feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) + empty_entry = {source_table_column: [] for source_table_column in set(df.columns) if source_table_column not in ['visit_occurrence_id'] } + + # Creating the array more efficiently + ak_array = ak.Array([ + grouped.get_group(visit_occurrence_id)[list(set(df.columns) - set(['visit_occurrence_id']))].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry + for visit_occurrence_id in unique_visit_occurrence_ids]) + adata.obsm[feature] = ak_array + + return adata + + # TODO add function to check feature and add concept + # More IO functions + def to_dataframe( + self, + adata, + features: Union[str, List[str]], # TODO also support list of features + # patient str or List, # TODO also support subset of patients/visit + ): + # TODO + # can be viewed as patient level - only select some patient + # TODO change variable name here + if isinstance(features, str): + features = [features] + df_concat = pd.DataFrame([]) + for feature in features: + df = ak.to_dataframe(adata.obsm[feature]) + + df.reset_index(drop=False, inplace=True) + df["entry"] = adata.obs.index[df["entry"]] + df = df.rename(columns={"entry": "visit_occurrence_id"}) + del df["subentry"] + for col in df.columns: + if col.endswith('time'): + df[col] = pd.to_datetime(df[col]) + + df['feature_name'] = feature + df_concat = pd.concat([df_concat, df], axis= 0) + + + return df_concat + + + def plot_timeseries(self, + adata, + visit_occurrence_id: int, + key: 
Union[str, List[str]], + slot: Union[str, None] = 'obsm', + value_key: str = 'value_as_number', + time_key: str = 'measurement_datetime', + x_label: str = None + ): + + + if isinstance(key, str): + key_list = [key] + else: + key_list = key + + # Initialize min_x and max_x + min_x = None + max_x = None + + if slot == 'obsm': + fig, ax = plt.subplots(figsize=(20, 6)) + # Scatter plot + for i, key in enumerate(key_list): + df = self.to_dataframe(adata, key) + x = df[df.visit_occurrence_id == visit_occurrence_id][time_key] + y = df[df.visit_occurrence_id == visit_occurrence_id][value_key] + + # Check if x is empty + if not x.empty: + ax.scatter(x=x, y=y, label=key) + ax.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=len(key_list), prop={"size": 14}) + + ax.plot(x, y) + + + if min_x is None or min_x > x.min(): + min_x = x.min() + if max_x is None or max_x < x.max(): + max_x = x.max() + + + else: + # Skip this iteration if x is empty + continue + + if min_x is not None and max_x is not None: + + # Adapt this to input data + # TODO step + #plt.xticks(np.arange(min_x, max_x, step=1)) + # Adapt this to input data + plt.xlabel(x_label if x_label else "Hours since ICU admission") + + plt.show() + + + def violin( + self, + adata: AnnData, + obsm_key: str = None, + keys: Union[str, Sequence[str]] = None, + groupby: Optional[str] = None, + log: Optional[bool] = False, + use_raw: Optional[bool] = None, + stripplot: bool = True, + jitter: Union[float, bool] = True, + size: int = 1, + layer: Optional[str] = None, + scale: Literal["area", "count", "width"] = "width", + order: Optional[Sequence[str]] = None, + multi_panel: Optional[bool] = None, + xlabel: str = "", + ylabel: Union[str, Sequence[str]] = None, + rotation: Optional[float] = None, + show: Optional[bool] = None, + save: Union[bool, str] = None, + ax: Optional[Axes] = None, + **kwds, + ): # pragma: no cover + """Violin plot. + + Wraps :func:`seaborn.violinplot` for :class:`~anndata.AnnData`. + + Args: + adata: :class:`~anndata.AnnData` object object containing all observations. + keys: Keys for accessing variables of `.var_names` or fields of `.obs`. + groupby: The key of the observation grouping to consider. + log: Plot on logarithmic axis. + use_raw: Whether to use `raw` attribute of `adata`. Defaults to `True` if `.raw` is present. + stripplot: Add a stripplot on top of the violin plot. See :func:`~seaborn.stripplot`. + jitter: Add jitter to the stripplot (only when stripplot is True) See :func:`~seaborn.stripplot`. + size: Size of the jitter points. + layer: Name of the AnnData object layer that wants to be plotted. By + default adata.raw.X is plotted. If `use_raw=False` is set, + then `adata.X` is plotted. If `layer` is set to a valid layer name, + then the layer is plotted. `layer` takes precedence over `use_raw`. + scale: The method used to scale the width of each violin. + If 'width' (the default), each violin will have the same width. + If 'area', each violin will have the same area. + If 'count', a violin’s width corresponds to the number of observations. + order: Order in which to show the categories. + multi_panel: Display keys in multiple panels also when `groupby is not None`. + xlabel: Label of the x axis. Defaults to `groupby` if `rotation` is `None`, otherwise, no label is shown. + ylabel: Label of the y axis. If `None` and `groupby` is `None`, defaults to `'value'`. + If `None` and `groubpy` is not `None`, defaults to `keys`. + rotation: Rotation of xtick labels. 
+ {show_save_ax} + **kwds: + Are passed to :func:`~seaborn.violinplot`. + + Returns: + A :class:`~matplotlib.axes.Axes` object if `ax` is `None` else `None`. + + Example: + .. code-block:: python + + import ehrapy as ep + + adata = ep.dt.mimic_2(encoded=True) + ep.pp.knn_impute(adata) + ep.pp.log_norm(adata, offset=1) + ep.pp.neighbors(adata) + ep.tl.leiden(adata, resolution=0.5, key_added="leiden_0_5") + ep.pl.violin(adata, keys=["age"], groupby="leiden_0_5") + + Preview: + .. image:: /_static/docstring_previews/violin.png + """ + + if obsm_key: + df = self.to_dataframe(adata, features=obsm_key) + df = df[["visit_occurrence_id", "value_as_number"]] + df = df.rename(columns = {"value_as_number": obsm_key}) + + if groupby: + df = df.set_index('visit_occurrence_id').join(adata.obs[groupby].to_frame()).reset_index(drop=False) + adata = ep.ad.df_to_anndata(df, columns_obs_only=['visit_occurrence_id', groupby]) + else: + adata = ep.ad.df_to_anndata(df, columns_obs_only=['visit_occurrence_id']) + keys=obsm_key + + violin_partial = partial( + sc.pl.violin, + keys=keys, + log=log, + use_raw=use_raw, + stripplot=stripplot, + jitter=jitter, + size=size, + layer=layer, + scale=scale, + order=order, + multi_panel=multi_panel, + xlabel=xlabel, + ylabel=ylabel, + rotation=rotation, + show=show, + save=save, + ax=ax, + **kwds,) + + return violin_partial(adata=adata, groupby=groupby) + + + def qc_lab_measurements( + self, + adata: AnnData, + reference_table: pd.DataFrame = None, + measurements: list[str] = None, + obsm_measurements: list[str] = None, + action: Literal["remove"] = "remove", + unit: Literal["traditional", "SI"] = None, + layer: str = None, + threshold: int = 20, + age_col: str = None, + age_range: str = None, + sex_col: str = None, + sex: str = None, + ethnicity_col: str = None, + ethnicity: str = None, + copy: bool = False, + verbose: bool = False, + ) -> AnnData: + + if copy: + adata = adata.copy() + + preprocessing_dir = '/Users/xinyuezhang/ehrapy/ehrapy/preprocessing' + if reference_table is None: + reference_table = pd.read_csv( + f"{preprocessing_dir}/laboratory_reference_tables/laposata.tsv", sep="\t", index_col="Measurement" + ) + if obsm_measurements: + measurements = obsm_measurements + for measurement in measurements: + best_column_match, score = process.extractOne( + query=measurement, choices=reference_table.index, score_cutoff=threshold + ) + if best_column_match is None: + rprint(f"[bold yellow]Unable to find a match for {measurement}") + continue + if verbose: + rprint( + f"[bold blue]Detected [green]{best_column_match}[blue] for [green]{measurement}[blue] with score [green]{score}." + ) + + reference_column = "SI Reference Interval" if unit == "SI" else "Traditional Reference Interval" + + # Fetch all non None columns from the reference statistics + not_none_columns = [col for col in [sex_col, age_col, ethnicity_col] if col is not None] + not_none_columns.append(reference_column) + reference_values = reference_table.loc[[best_column_match], not_none_columns] + + additional_columns = False + if sex_col or age_col or ethnicity_col: # check if additional columns were provided + additional_columns = True + + # Check if multiple reference values occur and no additional information is available: + if reference_values.shape[0] > 1 and additional_columns is False: + raise ValueError( + f"Several options for {best_column_match} reference value are available. Please specify sex, age or " + f"ethnicity columns and their values." 
+ ) + + try: + if age_col: + min_age, max_age = age_range.split("-") + reference_values = reference_values[ + (reference_values[age_col].str.split("-").str[0].astype(int) >= int(min_age)) + and (reference_values[age_col].str.split("-").str[1].astype(int) <= int(max_age)) + ] + if sex_col: + sexes = "U|M" if sex is None else sex + reference_values = reference_values[reference_values[sex_col].str.contains(sexes)] + if ethnicity_col: + reference_values = reference_values[reference_values[ethnicity_col].isin([ethnicity])] + + if layer is not None: + actual_measurements = adata[:, measurement].layers[layer] + else: + if obsm_measurements: + actual_measurements = adata.obsm[measurement]['value_as_number'] + ak_measurements = adata.obsm[measurement] + else: + actual_measurements = adata[:, measurement].X + except TypeError: + rprint(f"[bold yellow]Unable to find specified reference values for {measurement}.") + + check = reference_values[reference_column].values + check_str: str = np.array2string(check) + check_str = check_str.replace("[", "").replace("]", "").replace("'", "") + if "<" in check_str: + upperbound = float(check_str.replace("<", "")) + if verbose: + rprint(f"[bold blue]Using upperbound [green]{upperbound}") + upperbound_check_results = actual_measurements < upperbound + if isinstance(actual_measurements, ak.Array): + if action == 'remove': + if verbose: + rprint(f"Removing {ak.count(actual_measurements) - ak.count(actual_measurements[upperbound_check_results])} outliers") + adata.obsm[measurement] = ak_measurements[upperbound_check_results] + else: + upperbound_check_results_array: np.ndarray = upperbound_check_results.copy() + adata.obs[f"{measurement} normal"] = upperbound_check_results_array + + elif ">" in check_str: + lower_bound = float(check_str.replace(">", "")) + if verbose: + rprint(f"[bold blue]Using lowerbound [green]{lower_bound}") + + lower_bound_check_results = actual_measurements > lower_bound + if isinstance(actual_measurements, ak.Array): + if action == 'remove': + adata.obsm[measurement] = ak_measurements[lower_bound_check_results] + else: + adata.obs[f"{measurement} normal"] = lower_bound_check_results_array + lower_bound_check_results_array = lower_bound_check_results.copy() + else: # "-" range case + min_value = float(check_str.split("-")[0]) + max_value = float(check_str.split("-")[1]) + if verbose: + rprint(f"[bold blue]Using minimum of [green]{min_value}[blue] and maximum of [green]{max_value}") + + range_check_results = (actual_measurements >= min_value) & (actual_measurements <= max_value) + if isinstance(actual_measurements, ak.Array): + if action == 'remove': + adata.obsm[measurement] = ak_measurements[range_check_results] + else: + adata.obs[f"{measurement} normal"] = range_check_results_array + range_check_results_array: np.ndarray = range_check_results.copy() + + if copy: + return adata diff --git a/omop_conversion.py b/omop_conversion.py new file mode 100644 index 0000000..7cb9951 --- /dev/null +++ b/omop_conversion.py @@ -0,0 +1,246 @@ +import os +import glob + +import pandas as pd + +import ehrapy as ep +from pathlib import Path +from .utils.omop_utils import * +from rich.console import Console +from rich.text import Text +import rich.repr +from rich import print as rprint +from typing import TYPE_CHECKING, Any, Callable, Literal, Union, List + +@rich.repr.auto(angular=True) +class OMOP: + def __init__(self, folder_path, delimiter=None, make_filename_lowercase=True, use_dask=False): + self.base = folder_path + self.delimiter = delimiter + 
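# Illustrative sketch of the reference-interval handling in qc_lab_measurements above:
# it distinguishes "<x", ">x", and "a-b" interval strings and turns each into a boolean
# "within normal range" mask. A minimal, self-contained version of that parsing step
# could look like the following; parse_reference_interval and the sample values are
# hypothetical and used only for illustration.
import numpy as np

def parse_reference_interval(interval: str, values: np.ndarray) -> np.ndarray:
    """Return a boolean mask marking values inside the reference interval."""
    interval = interval.strip()
    if interval.startswith("<"):
        return values < float(interval[1:])    # e.g. "<200"
    if interval.startswith(">"):
        return values > float(interval[1:])    # e.g. ">40"
    low, high = (float(part) for part in interval.split("-"))
    return (values >= low) & (values <= high)  # e.g. "3.5-5.0"

print(parse_reference_interval("3.5-5.0", np.array([3.0, 4.2, 5.6])))  # [False  True False]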
self.use_dask = use_dask + filepath_list = glob.glob(os.path.join(folder_path, "*.csv")) + glob.glob(os.path.join(folder_path, "*.parquet")) + self.loaded_tabel = None + + self.filepath_dict = check_with_omop_cdm(filepath_list, base=self.base, delimiter=self.delimiter, make_filename_lowercase=make_filename_lowercase) + self.tables = list(self.filepath_dict.keys()) + + ''' + def __repr__(self) -> str: + print_str = f'OMOP object ({os.path.basename(self.base)}) with {len(self.tables)} tables.\nTables:\n' + table_catalog_dict = get_table_catalog_dict() + for _, (key, value) in enumerate(table_catalog_dict.items()): + table_list = [table_name for table_name in self.tables if table_name in value] + if len(table_list) != 0: + print_str = print_str + f"{key} tables: {', '.join(table_list)}\n" + return print_str + ''' + + def __rich_repr__(self): + console = Console() + table_catalog_dict = get_table_catalog_dict() + color_map = { + 'Clinical data': 'blue', + 'Health system data': 'green', + 'Health economics data': 'red', + 'Standardized derived elements': 'magenta', + 'Metadata': 'white', + 'Vocabulary': 'dark_orange' + } + # Object description + print_str = f'OMOP object ([red]{os.path.basename(self.base)}[/]) with {len(self.tables)} tables.\n' + + # Tables information + for key, value in table_catalog_dict.items(): + table_list = [table_name for table_name in self.tables if table_name in value] + if len(table_list) != 0: + print_str = print_str + f"[{color_map[key]}]{key} tables[/]: [black]{', '.join(table_list)}[/]\n" + #table_list_str = ', '.join(table_list) + + #text = Text(f"{key} tables: ", style=color_map[key]) + #text.append(table_list_str) + #yield None, f"{key} tables", "red" + console.print(print_str) + yield None + + + #TODO + def new_load(self, + level: Literal["stay_level", "patient_level"] = "stay_level", + tables: Union[str, List[str]] = None, + remove_empty_column=True): + + table_catalog_dict = get_table_catalog_dict() + if not tables: + tables = self.table + + for table in self.table: + # Load Clinical data tables + if table in table_catalog_dict['Clinical data']: + # in patient level + if table in ["person", "death"]: + column_types = get_column_types(path = self.filepath_dict[table], delimiter=self.delimiter, filename=table) + df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') + elif table in ["visit_occurrence_id"]: + column_types = get_column_types(path = self.filepath_dict[table], delimiter=self.delimiter, filename=table) + df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') + else: + warnings(f"Please use extract_features function to extract features from table {table}") + continue + elif table in table_catalog_dict["Health system data"]: + column_types = get_column_types(path = self.filepath_dict[table], delimiter=self.delimiter, filename=table) + df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') + + + + + # Load Health system data tables + + # Load Health economics data tables + + # Load Standardized derived elements tables + + # Load Metadata tables + + # Load Vocabulary tables + + + # TODO patient level and hospital level + if level == "stay_level": + index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} + # TODO Only support clinical_tables_columns + + for table in tables: + print(f"reading table [{table}]") + column_types = get_column_types(path = 
self.filepath_dict[table], delimiter=self.delimiter, filename=table) + df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') + if remove_empty_column: + # TODO dask Support + #columns = [column for column in df.columns if not df[column].compute().isna().all()] + columns = [column for column in df.columns if not df[column].isna().all()] + df = df.loc[:, columns] + setattr(self, table, df) + + # concept_id_list = list(self.concept.concept_id) + # concept_name_list = list(self.concept.concept_id) + # concept_domain_id_list = list(set(self.concept.domain_id)) + + # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] + # TODO dask Support + joined_table = pd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how="left") + + joined_table = pd.merge(joined_table, self.death, left_index=True, right_index=True, how="left") + + # TODO dask Support + #joined_table = joined_table.compute() + + # TODO check this earlier + joined_table = joined_table.drop_duplicates(subset='visit_occurrence_id') + joined_table = joined_table.set_index("visit_occurrence_id") + # obs_only_list = list(self.joined_table.columns) + # obs_only_list.remove('visit_occurrence_id') + columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"])) + adata = ep.ad.df_to_anndata( + joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only + ) + # TODO this needs to be fixed because anndata set obs index as string by default + #adata.obs.index = adata.obs.index.astype(int) + + """ + for column in self.measurement.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.drug_exposure.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.observation.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + """ + + return adata + + def load(self, + level: Literal["stay_level", "patient_level"] = "stay_level", + tables: Union[str, List[str]] = None, + remove_empty_column=True): + + if not tables: + tables = ['person', 'death', 'visit_occurrence'] + # TODO patient level and hospital level + if level == "stay_level": + index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} + # TODO Only support clinical_tables_columns + + for table in tables: + print(f"reading table [{table}]") + column_types = get_column_types(path = self.filepath_dict[table], delimiter=self.delimiter, table_name=table) + df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') + if remove_empty_column: + # TODO dask Support + #columns = [column for column in df.columns if not df[column].compute().isna().all()] + columns = [column for column in df.columns if not df[column].isna().all()] + df = df.loc[:, columns] 
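# Illustrative sketch of the remove_empty_column step above: only columns holding at
# least one non-missing value are kept. The same effect on a plain pandas DataFrame,
# using a tiny made-up table (df_example is a hypothetical name):
import pandas as pd

df_example = pd.DataFrame(
    {"person_id": [1, 2], "death_datetime": [pd.NaT, pd.NaT], "gender_source_value": ["F", "M"]}
)
kept = [column for column in df_example.columns if not df_example[column].isna().all()]
print(df_example.loc[:, kept].columns.tolist())  # ['person_id', 'gender_source_value']
# pandas' df.dropna(axis=1, how="all") gives the same result in one call.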
+ setattr(self, table, df) + + # concept_id_list = list(self.concept.concept_id) + # concept_name_list = list(self.concept.concept_id) + # concept_domain_id_list = list(set(self.concept.domain_id)) + + # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] + # TODO dask Support + joined_table = pd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how="left") + + joined_table = pd.merge(joined_table, self.death, left_index=True, right_index=True, how="left") + + # TODO dask Support + #joined_table = joined_table.compute() + + # TODO check this earlier + joined_table = joined_table.drop_duplicates(subset='visit_occurrence_id') + joined_table = joined_table.set_index("visit_occurrence_id") + # obs_only_list = list(self.joined_table.columns) + # obs_only_list.remove('visit_occurrence_id') + columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"])) + adata = ep.ad.df_to_anndata( + joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only + ) + # TODO this needs to be fixed because anndata set obs index as string by default + #adata.obs.index = adata.obs.index.astype(int) + + """ + for column in self.measurement.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.drug_exposure.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.observation.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + """ + + return adata + + \ No newline at end of file From d59b6bf8b3f7f9b84da591764e5cd50143422a42 Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Thu, 15 Feb 2024 08:42:54 +0100 Subject: [PATCH 05/13] update --- ehrdata/dt/_omop.py | 6 ++++++ pyproject.toml | 1 + tests/test_basic.py | 4 ++-- 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/ehrdata/dt/_omop.py b/ehrdata/dt/_omop.py index 2a9a605..e155ba0 100644 --- a/ehrdata/dt/_omop.py +++ b/ehrdata/dt/_omop.py @@ -125,6 +125,12 @@ def init_omop(folder_path, """ adata.uns.update(adata_dict) + elif level == "patient_level": + # TODO patient level + # Each row in anndata would be a patient + pass + else: + raise ValueError("level should be 'stay_level' or 'patient_level'") return adata diff --git a/pyproject.toml b/pyproject.toml index 4b73f92..d180631 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ urls.Source = "https://github.com/theislab/ehrdata" urls.Home-page = "https://github.com/theislab/ehrdata" dependencies = [ "anndata", + "awkward", # for debug logging (referenced from the issue template) "session-info" ] diff --git a/tests/test_basic.py b/tests/test_basic.py index e61e5c0..6cbb48b 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,10 +1,10 @@ import pytest -import ehrdata_source +import ehrdata as ehr def test_package_has_version(): - assert 
ehrdata_source.__version__ is not None + assert ehr.__version__ is not None @pytest.mark.skip(reason="This decorator should be removed when test passes.") From 64acfccf27cfe710053c4cbdaa7427bc140dc439 Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Thu, 15 Feb 2024 08:49:35 +0100 Subject: [PATCH 06/13] update --- pyproject.toml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index d180631..4d87a12 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,16 @@ urls.Source = "https://github.com/theislab/ehrdata" urls.Home-page = "https://github.com/theislab/ehrdata" dependencies = [ "anndata", + "ehrapy", + "scanpy", "awkward", + "matplotlib", + "pandas", + "dask", + "thefuzz", + "rich", + "seaborn", + # for debug logging (referenced from the issue template) "session-info" ] From 9865cc161161df017ff60eb46bddfcc3837ac015 Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Thu, 15 Feb 2024 10:15:41 +0100 Subject: [PATCH 07/13] minor fix --- ehrdata/dt/__init__.py | 1 - ehrdata/dt/_omop.py | 136 -------------------------------- ehrdata/io/__init__.py | 2 +- ehrdata/io/_omop.py | 174 ++++++++++++++++++++++++++++++++++++----- 4 files changed, 154 insertions(+), 159 deletions(-) delete mode 100644 ehrdata/dt/__init__.py delete mode 100644 ehrdata/dt/_omop.py diff --git a/ehrdata/dt/__init__.py b/ehrdata/dt/__init__.py deleted file mode 100644 index 8a2b780..0000000 --- a/ehrdata/dt/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from ehrdata.dt._omop import init_omop \ No newline at end of file diff --git a/ehrdata/dt/_omop.py b/ehrdata/dt/_omop.py deleted file mode 100644 index e155ba0..0000000 --- a/ehrdata/dt/_omop.py +++ /dev/null @@ -1,136 +0,0 @@ -import os - - -import pandas as pd - -import ehrapy as ep -from pathlib import Path -from ehrdata.utils.omop_utils import * -from rich.console import Console -from rich.text import Text -import rich.repr -from rich import print as rprint -from typing import TYPE_CHECKING, Any, Callable, Literal, Union, List - - - - -def init_omop(folder_path, - delimiter=None, - make_filename_lowercase=True, - use_dask=False, - level: Literal["stay_level", "patient_level"] = "stay_level", - tables: Union[str, List[str]] = None, - remove_empty_column=True): - - - - filepath_dict = check_with_omop_cdm(folder_path=folder_path, delimiter=delimiter, make_filename_lowercase=make_filename_lowercase) - tables = list(filepath_dict.keys()) - adata_dict = {} - adata_dict['filepath_dict'] = filepath_dict - adata_dict['tables'] = tables - adata_dict['delimiter'] = delimiter - adata_dict['use_dask'] = use_dask - - - table_catalog_dict = get_table_catalog_dict() - - color_map = { - 'Clinical data': 'blue', - 'Health system data': 'green', - 'Health economics data': 'red', - 'Standardized derived elements': 'magenta', - 'Metadata': 'white', - 'Vocabulary': 'dark_orange' - } - # Object description - print_str = f'OMOP Database ([red]{os.path.basename(folder_path)}[/]) with {len(tables)} tables.\n' - - # Tables information - for key, value in table_catalog_dict.items(): - table_list = [table_name for table_name in tables if table_name in value] - if len(table_list) != 0: - print_str = print_str + f"[{color_map[key]}]{key} tables[/]: [black]{', '.join(table_list)}[/]\n" - #table_list_str = ', '.join(table_list) - - #text = Text(f"{key} tables: ", style=color_map[key]) - #text.append(table_list_str) - #yield None, f"{key} tables", "red" - rprint(print_str) - - tables = ['person', 'death', 'visit_occurrence'] - # TODO patient level and hospital 
level - if level == "stay_level": - index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} - # TODO Only support clinical_tables_columns - table_dict = {} - for table in tables: - print(f"reading table [{table}]") - column_types = get_column_types(adata_dict, table_name=table) - df = read_table(adata_dict, table_name=table, dtype=column_types, index='person_id') - if remove_empty_column: - # TODO dask Support - #columns = [column for column in df.columns if not df[column].compute().isna().all()] - columns = [column for column in df.columns if not df[column].isna().all()] - df = df.loc[:, columns] - table_dict[table] = df - - # concept_id_list = list(self.concept.concept_id) - # concept_name_list = list(self.concept.concept_id) - # concept_domain_id_list = list(set(self.concept.domain_id)) - - # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] - # TODO dask Support - joined_table = pd.merge(table_dict["visit_occurrence"], table_dict["person"], left_index=True, right_index=True, how="left") - - joined_table = pd.merge(joined_table, table_dict["death"], left_index=True, right_index=True, how="left") - - # TODO dask Support - #joined_table = joined_table.compute() - - # TODO check this earlier - joined_table = joined_table.drop_duplicates(subset='visit_occurrence_id') - joined_table = joined_table.set_index("visit_occurrence_id") - # obs_only_list = list(self.joined_table.columns) - # obs_only_list.remove('visit_occurrence_id') - columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"])) - adata = ep.ad.df_to_anndata( - joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only - ) - # TODO this needs to be fixed because anndata set obs index as string by default - #adata.obs.index = adata.obs.index.astype(int) - - """ - for column in self.measurement.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.drug_exposure.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.observation.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - """ - - adata.uns.update(adata_dict) - elif level == "patient_level": - # TODO patient level - # Each row in anndata would be a patient - pass - else: - raise ValueError("level should be 'stay_level' or 'patient_level'") - - return adata - diff --git a/ehrdata/io/__init__.py b/ehrdata/io/__init__.py index aaf540f..297c0be 100644 --- a/ehrdata/io/__init__.py +++ b/ehrdata/io/__init__.py @@ -1 +1 @@ -from ehrdata.io._omop import from_dataframe, to_dataframe \ No newline at end of file +from ehrdata.io._omop import from_dataframe, init_omop, to_dataframe diff --git a/ehrdata/io/_omop.py b/ehrdata/io/_omop.py index cb6331f..4e82578 100644 --- a/ehrdata/io/_omop.py +++ b/ehrdata/io/_omop.py @@ -1,33 +1,167 @@ -from typing 
import List, Union, Literal, Optional +import os +from typing import Literal, Union + import awkward as ak +import ehrapy as ep import pandas as pd +from rich import print as rprint -def from_dataframe( - adata, - feature: str, - df +from ehrdata.utils.omop_utils import check_with_omop_cdm, get_column_types, get_table_catalog_dict, read_table + + +def init_omop( + folder_path, + delimiter=None, + make_filename_lowercase=True, + use_dask=False, + level: Literal["stay_level", "patient_level"] = "stay_level", + tables: Union[str, list[str]] = None, + remove_empty_column=True, ): + filepath_dict = check_with_omop_cdm( + folder_path=folder_path, delimiter=delimiter, make_filename_lowercase=make_filename_lowercase + ) + tables = list(filepath_dict.keys()) + adata_dict = {} + adata_dict["filepath_dict"] = filepath_dict + adata_dict["tables"] = tables + adata_dict["delimiter"] = delimiter + adata_dict["use_dask"] = use_dask + + table_catalog_dict = get_table_catalog_dict() + + color_map = { + "Clinical data": "blue", + "Health system data": "green", + "Health economics data": "red", + "Standardized derived elements": "magenta", + "Metadata": "white", + "Vocabulary": "dark_orange", + } + # Object description + print_str = f"OMOP Database ([red]{os.path.basename(folder_path)}[/]) with {len(tables)} tables.\n" + + # Tables information + for key, value in table_catalog_dict.items(): + table_list = [table_name for table_name in tables if table_name in value] + if len(table_list) != 0: + print_str = print_str + f"[{color_map[key]}]{key} tables[/]: [black]{', '.join(table_list)}[/]\n" + # table_list_str = ', '.join(table_list) + + # text = Text(f"{key} tables: ", style=color_map[key]) + # text.append(table_list_str) + # yield None, f"{key} tables", "red" + rprint(print_str) + + tables = ["person", "death", "visit_occurrence"] + # TODO patient level and hospital level + if level == "stay_level": + # index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} + # TODO Only support clinical_tables_columns + table_dict = {} + for table in tables: + print(f"reading table [{table}]") + column_types = get_column_types(adata_dict, table_name=table) + df = read_table(adata_dict, table_name=table, dtype=column_types, index="person_id") + if remove_empty_column: + # TODO dask Support + # columns = [column for column in df.columns if not df[column].compute().isna().all()] + columns = [column for column in df.columns if not df[column].isna().all()] + df = df.loc[:, columns] + table_dict[table] = df + + # concept_id_list = list(self.concept.concept_id) + # concept_name_list = list(self.concept.concept_id) + # concept_domain_id_list = list(set(self.concept.domain_id)) + + # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] + # TODO dask Support + joined_table = pd.merge( + table_dict["visit_occurrence"], table_dict["person"], left_index=True, right_index=True, how="left" + ) + + joined_table = pd.merge(joined_table, table_dict["death"], left_index=True, right_index=True, how="left") + + # TODO dask Support + # joined_table = joined_table.compute() + + # TODO check this earlier + joined_table = joined_table.drop_duplicates(subset="visit_occurrence_id") + joined_table = joined_table.set_index("visit_occurrence_id") + # obs_only_list = list(self.joined_table.columns) + # obs_only_list.remove('visit_occurrence_id') + columns_obs_only = list(set(joined_table.columns) - {"year_of_birth", "gender_source_value"}) + adata = 
ep.ad.df_to_anndata(joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only) + # TODO this needs to be fixed because anndata set obs index as string by default + # adata.obs.index = adata.obs.index.astype(int) + + """ + for column in self.measurement.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.drug_exposure.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + + for column in self.observation.columns: + if column != 'visit_occurrence_id': + obs_list = [] + for visit_occurrence_id in adata.obs.index: + obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) + adata.obsm[column]= ak.Array(obs_list) + """ + + adata.uns.update(adata_dict) + elif level == "patient_level": + # TODO patient level + # Each row in anndata would be a patient + pass + else: + raise ValueError("level should be 'stay_level' or 'patient_level'") + + return adata + + +def from_dataframe(adata, feature: str, df): grouped = df.groupby("visit_occurrence_id") unique_visit_occurrence_ids = set(adata.obs.index) # Use set difference and intersection more efficiently feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) - empty_entry = {source_table_column: [] for source_table_column in set(df.columns) if source_table_column not in ['visit_occurrence_id'] } - + empty_entry = { + source_table_column: [] + for source_table_column in set(df.columns) + if source_table_column not in ["visit_occurrence_id"] + } + columns_in_ak_array = list(set(df.columns) - {"visit_occurrence_id"}) # Creating the array more efficiently - ak_array = ak.Array([ - grouped.get_group(visit_occurrence_id)[list(set(df.columns) - set(['visit_occurrence_id']))].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry - for visit_occurrence_id in unique_visit_occurrence_ids]) + ak_array = ak.Array( + [ + grouped.get_group(visit_occurrence_id)[columns_in_ak_array].to_dict(orient="list") + if visit_occurrence_id in feature_ids + else empty_entry + for visit_occurrence_id in unique_visit_occurrence_ids + ] + ) adata.obsm[feature] = ak_array - + return adata - + + # TODO add function to check feature and add concept # More IO functions + def to_dataframe( adata, - features: Union[str, List[str]], # TODO also support list of features + features: Union[str, list[str]], # TODO also support list of features # patient str or List, # TODO also support subset of patients/visit ): # TODO @@ -43,13 +177,11 @@ def to_dataframe( df["entry"] = adata.obs.index[df["entry"]] df = df.rename(columns={"entry": "visit_occurrence_id"}) del df["subentry"] - for col in df.columns: - if col.endswith('time'): + for col in df.columns: + if col.endswith("time"): df[col] = pd.to_datetime(df[col]) - - df['feature_name'] = feature - df_concat = pd.concat([df_concat, df], axis= 0) - - - return df_concat + df["feature_name"] = feature + df_concat = pd.concat([df_concat, df], axis=0) + + return df_concat From 012b30aaffcc4dac71cfce3f98b5a7f6d519b650 Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Thu, 
15 Feb 2024 11:22:41 +0100 Subject: [PATCH 08/13] Refactor code in _omop.py files --- ehrdata/pl/_omop.py | 42 ++-- ehrdata/pp/_omop.py | 93 ++++---- ehrdata/tl/_omop.py | 29 +-- ehrdata/utils/omop_utils.py | 450 +++++++++++++++++++++++------------- 4 files changed, 380 insertions(+), 234 deletions(-) diff --git a/ehrdata/pl/_omop.py b/ehrdata/pl/_omop.py index 7565a68..7ec676d 100644 --- a/ehrdata/pl/_omop.py +++ b/ehrdata/pl/_omop.py @@ -1,8 +1,11 @@ -from typing import List, Union, Literal, Optional -from ehrdata.utils.omop_utils import * -from ehrdata.tl import get_concept_name -import seaborn as sns +from typing import Literal + import matplotlib.pyplot as plt +import seaborn as sns + +from ehrdata.tl import get_concept_name +from ehrdata.utils.omop_utils import get_column_types, map_concept_id, read_table + # TODO allow users to pass features def feature_counts( @@ -17,29 +20,24 @@ def feature_counts( "condition_occurrence", ], number=20, - key = None -): - - if source == 'measurement': - columns = ["value_as_number", "time", "visit_occurrence_id", "measurement_concept_id"] - elif source == 'observation': - columns = ["value_as_number", "value_as_string", "measurement_datetime"] - elif source == 'condition_occurrence': - columns = None - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - filepath_dict = adata.uns['filepath_dict'] - tables = adata.uns['tables'] - + key=None, +): + # if source == 'measurement': + # columns = ["value_as_number", "time", "visit_occurrence_id", "measurement_concept_id"] + # elif source == 'observation': + # columns = ["value_as_number", "value_as_string", "measurement_datetime"] + # elif source == 'condition_occurrence': + # columns = None + # else: + # raise KeyError(f"Extracting data from {source} is not supported yet") + column_types = get_column_types(adata.uns, table_name=source) df_source = read_table(adata.uns, table_name=source, dtype=column_types, usecols=[f"{source}_concept_id"]) feature_counts = df_source[f"{source}_concept_id"].value_counts() - if adata.uns['use_dask']: + if adata.uns["use_dask"]: feature_counts = feature_counts.compute() feature_counts = feature_counts.to_frame().reset_index(drop=False)[0:number] - feature_counts[f"{source}_concept_id_1"], feature_counts[f"{source}_concept_id_2"] = map_concept_id( adata.uns, concept_id=feature_counts[f"{source}_concept_id"], verbose=False ) @@ -56,4 +54,4 @@ def feature_counts( ax = sns.barplot(feature_counts, x="feature_name", y="count") ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") plt.tight_layout() - return feature_counts \ No newline at end of file + return feature_counts diff --git a/ehrdata/pp/_omop.py b/ehrdata/pp/_omop.py index 7e86845..928dd14 100644 --- a/ehrdata/pp/_omop.py +++ b/ehrdata/pp/_omop.py @@ -1,7 +1,12 @@ -from typing import List, Union, Literal, Optional -from ehrdata.utils.omop_utils import * -import ehrapy as ep import warnings +from typing import Literal, Union + +import ehrapy as ep +import pandas as pd +from rich import print as rprint + +from ehrdata.utils.omop_utils import get_column_types, get_feature_info, read_table + def get_feature_statistics( adata, @@ -14,10 +19,12 @@ def get_feature_statistics( "drug_exposure", "condition_occurrence", ], - features: Union[str, int , List[Union[str, int]]] = None, + features: Union[str, int, list[Union[str, int]]] = None, level="stay_level", value_col: str = None, - aggregation_methods: Union[Literal["min", "max", "mean", "std", "count"], List[Literal["min", "max", 
"mean", "std", "count"]]]=None, + aggregation_methods: Union[ + Literal["min", "max", "mean", "std", "count"], list[Literal["min", "max", "mean", "std", "count"]] + ] = None, add_aggregation_to_X: bool = True, verbose: bool = False, use_dask: bool = None, @@ -28,16 +35,22 @@ def get_feature_statistics( key = f"{source.split('_')[0]}_concept_id" else: raise KeyError(f"Extracting data from {source} is not supported yet") - - if source == 'measurement': - value_col = 'value_as_number' - warnings.warn(f"Extracting values from {value_col}. Value in measurement table could be saved in these columns: value_as_number, value_source_value.\nSpecify value_col to extract value from desired column.") - source_table_columns = ['visit_occurrence_id', 'measurement_datetime', key, value_col] - elif source == 'observation': - value_col = 'value_as_number' - warnings.warn(f"Extracting values from {value_col}. Value in observation table could be saved in these columns: value_as_number, value_as_string, value_source_value.\nSpecify value_col to extract value from desired column.") - source_table_columns = ['visit_occurrence_id', "observation_datetime", key, value_col] - elif source == 'condition_occurrence': + + if source == "measurement": + value_col = "value_as_number" + warnings.warn( + f"Extracting values from {value_col}. Value in measurement table could be saved in these columns: value_as_number, value_source_value.\nSpecify value_col to extract value from desired column.", + stacklevel=2, + ) + source_table_columns = ["visit_occurrence_id", "measurement_datetime", key, value_col] + elif source == "observation": + value_col = "value_as_number" + warnings.warn( + f"Extracting values from {value_col}. Value in observation table could be saved in these columns: value_as_number, value_as_string, value_source_value.\nSpecify value_col to extract value from desired column.", + stacklevel=2, + ) + source_table_columns = ["visit_occurrence_id", "observation_datetime", key, value_col] + elif source == "condition_occurrence": source_table_columns = None else: raise KeyError(f"Extracting data from {source} is not supported yet") @@ -49,62 +62,60 @@ def get_feature_statistics( use_dask = True column_types = get_column_types(adata.uns, table_name=source) - df_source = read_table(adata.uns, table_name=source, dtype=column_types, usecols=source_table_columns, use_dask=use_dask) - + df_source = read_table( + adata.uns, table_name=source, dtype=column_types, usecols=source_table_columns, use_dask=use_dask + ) + info_df = get_feature_info(adata.uns, features=features, verbose=verbose) - info_dict = info_df[['feature_id', 'feature_name']].set_index('feature_id').to_dict()['feature_name'] - + info_dict = info_df[["feature_id", "feature_name"]].set_index("feature_id").to_dict()["feature_name"] + # Select featrues df_source = df_source[df_source[key].isin(list(info_df.feature_id))] - #TODO Select time - #da_measurement = da_measurement[(da_measurement.time >= 0) & (da_measurement.time <= 48*60*60)] - #df_source[f'{source}_name'] = df_source[key].map(info_dict) + # TODO Select time + # da_measurement = da_measurement[(da_measurement.time >= 0) & (da_measurement.time <= 48*60*60)] + # df_source[f'{source}_name'] = df_source[key].map(info_dict) if aggregation_methods is None: aggregation_methods = ["min", "max", "mean", "std", "count"] - if level == 'stay_level': - result = df_source.groupby(['visit_occurrence_id', key]).agg({ - value_col: aggregation_methods}) - + if level == "stay_level": + result = 
df_source.groupby(["visit_occurrence_id", key]).agg({value_col: aggregation_methods}) + if use_dask: result = result.compute() result = result.reset_index(drop=False) result.columns = ["_".join(a) for a in result.columns.to_flat_index()] - result.columns = result.columns.str.removesuffix('_') - result.columns = result.columns.str.removeprefix(f'{value_col}_') - result[f'{source}_name'] = result[key].map(info_dict) + result.columns = result.columns.str.removesuffix("_") + result.columns = result.columns.str.removeprefix(f"{value_col}_") + result[f"{source}_name"] = result[key].map(info_dict) - df_statistics = result.pivot(index='visit_occurrence_id', - columns=f'{source}_name', - values=aggregation_methods) + df_statistics = result.pivot(index="visit_occurrence_id", columns=f"{source}_name", values=aggregation_methods) df_statistics.columns = df_statistics.columns.swaplevel() df_statistics.columns = ["_".join(a) for a in df_statistics.columns.to_flat_index()] - # TODO sort_columns = True if sort_columns: new_column_order = [] for feature in features: - for suffix in (f'_{aggregation_method}' for aggregation_method in aggregation_methods): - col_name = f'{feature}{suffix}' + for suffix in (f"_{aggregation_method}" for aggregation_method in aggregation_methods): + col_name = f"{feature}{suffix}" if col_name in df_statistics.columns: new_column_order.append(col_name) df_statistics.columns = new_column_order - + df_statistics.index = df_statistics.index.astype(str) - - adata.obs = pd.merge(adata.obs, df_statistics, how='left', left_index=True, right_index=True) - + + adata.obs = pd.merge(adata.obs, df_statistics, how="left", left_index=True, right_index=True) + if add_aggregation_to_X: uns = adata.uns obsm = adata.obsm varm = adata.varm - layers = adata.layers + # layers = adata.layers adata = ep.ad.move_to_x(adata, list(df_statistics.columns)) adata.uns = uns adata.obsm = obsm adata.varm = varm # It will change # adata.layers = layers - return adata \ No newline at end of file + return adata diff --git a/ehrdata/tl/_omop.py b/ehrdata/tl/_omop.py index 490bb04..01c6de3 100644 --- a/ehrdata/tl/_omop.py +++ b/ehrdata/tl/_omop.py @@ -1,28 +1,28 @@ -from ehrdata.utils.omop_utils import * #get_column_types, read_table, df_to_dict -from typing import List, Union, Literal, Optional, Dict import numbers -from rich import print as rprint +from typing import Union + from anndata import AnnData +from rich import print as rprint + +from ehrdata.utils.omop_utils import df_to_dict, get_column_types, read_table -def get_concept_name( - adata: Union[AnnData, Dict], - concept_id: Union[str, List], - raise_error=False, - verbose=True): - + +def get_concept_name(adata: Union[AnnData, dict], concept_id: Union[str, list], raise_error=False, verbose=True): if isinstance(concept_id, numbers.Integral): concept_id = [concept_id] - + if isinstance(adata, AnnData): adata_dict = adata.uns else: adata_dict = adata - + column_types = get_column_types(adata_dict, table_name="concept") df_concept = read_table(adata_dict, table_name="concept", dtype=column_types) # TODO dask Support - #df_concept.compute().dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] - df_concept.dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] + # df_concept.compute().dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] + 
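# Illustrative sketch of the aggregation pattern used in get_feature_statistics above:
# aggregate values per visit and per concept, flatten the MultiIndex columns, then pivot
# to one wide row per visit. The concept ids and values below are made up for the example.
import pandas as pd

df_src = pd.DataFrame(
    {
        "visit_occurrence_id": [1, 1, 1, 2, 2],
        "measurement_concept_id": [3012888, 3012888, 3027018, 3012888, 3027018],
        "value_as_number": [80.0, 90.0, 120.0, 70.0, 110.0],
    }
)
agg = (
    df_src.groupby(["visit_occurrence_id", "measurement_concept_id"])
    .agg({"value_as_number": ["min", "max", "mean"]})
    .reset_index()
)
agg.columns = ["_".join(col).rstrip("_") for col in agg.columns.to_flat_index()]
wide = agg.pivot(
    index="visit_occurrence_id",
    columns="measurement_concept_id",
    values=["value_as_number_min", "value_as_number_max", "value_as_number_mean"],
)
print(wide.shape)  # (2, 6): two visits x (three statistics for each of two concepts)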
df_concept.dropna( + subset=["concept_id", "concept_name"], inplace=True, ignore_index=True + ) # usecols=vocabularies_tables_columns["concept"] concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") concept_name = [] concept_name_not_found = [] @@ -43,6 +43,7 @@ def get_concept_name( else: return concept_name + # TODO def get_concept_id(): - pass \ No newline at end of file + pass diff --git a/ehrdata/utils/omop_utils.py b/ehrdata/utils/omop_utils.py index b6a796b..137be3c 100644 --- a/ehrdata/utils/omop_utils.py +++ b/ehrdata/utils/omop_utils.py @@ -1,20 +1,25 @@ -import pandas as pd -import os import csv +import glob +import numbers +import os import warnings -import dask.dataframe as dd from pathlib import Path -from typing import List, Union, Literal, Optional, Dict -import numbers +from typing import Union + +import dask.dataframe as dd +import pandas as pd from rich import print as rprint -import glob -from difflib import SequenceMatcher -from heapq import nlargest as _nlargest def get_table_catalog_dict(): + """Get the table catalog dictionary of the OMOP CDM v5.4 + + Returns + ------- + Dictionary: a dictionary of the table catalog. The key is the category of the table, and the value is a list of table names + """ table_catalog_dict = {} - table_catalog_dict['Clinical data'] = [ + table_catalog_dict["Clinical data"] = [ "person", "observation_period", "specimen", @@ -34,7 +39,13 @@ def get_table_catalog_dict(): table_catalog_dict["Health system data"] = ["location", "care_site", "provider"] table_catalog_dict["Health economics data"] = ["payer_plan_period", "cost"] - table_catalog_dict["Standardized derived elements"] = ["cohort", "cohort_definition", "drug_era", "dose_era", "condition_era"] + table_catalog_dict["Standardized derived elements"] = [ + "cohort", + "cohort_definition", + "drug_era", + "dose_era", + "condition_era", + ] table_catalog_dict["Metadata"] = ["cdm_source", "metadata"] table_catalog_dict["Vocabulary"] = [ "concept", @@ -50,42 +61,68 @@ def get_table_catalog_dict(): ] return table_catalog_dict + def get_dtype_mapping(): - dtype_mapping = {'integer': "Int64", - 'Integer': "Int64", - 'float': float, - 'bigint': "Int64", - 'varchar(MAX)': str, - 'varchar(2000)': str, - 'varchar(1000)': str, - 'varchar(255)': str, - 'varchar(250)': str, - 'varchar(80)': str, - 'varchar(60)': str, - 'varchar(50)': str, - 'varchar(25)': str, - 'varchar(20)': str, - 'varchar(10)': str, - 'varchar(9)': str, - 'varchar(3)': str, - 'varchar(2)': str, - 'varchar(1)': str, - 'datetime': object, - 'date': object} - + """Get the data type mapping of the OMOP CDM v5.4 + + Returns + ------- + Dictionary: a dictionary of the data type mapping from OMOP CDM v5.4 to Python + """ + dtype_mapping = { + "integer": "Int64", + "Integer": "Int64", + "float": float, + "bigint": "Int64", + "varchar(MAX)": str, + "varchar(2000)": str, + "varchar(1000)": str, + "varchar(255)": str, + "varchar(250)": str, + "varchar(80)": str, + "varchar(60)": str, + "varchar(50)": str, + "varchar(25)": str, + "varchar(20)": str, + "varchar(10)": str, + "varchar(9)": str, + "varchar(3)": str, + "varchar(2)": str, + "varchar(1)": str, + "datetime": object, + "date": object, + } + return dtype_mapping + def get_omop_cdm_field_level(): + """Get the field level table sof the OMOP CDM v5.4 + + Returns + ------- + Pandas DataFrame + """ pth = f"{Path(__file__).resolve().parent}/OMOP_CDMv5.4_Field_Level.csv" df = pd.read_csv(pth) return df -def check_with_omop_cdm( - delimiter, - folder_path=str, - 
make_filename_lowercase=True): - - + +def check_with_omop_cdm(folder_path: str, delimiter: str = None, make_filename_lowercase: bool = True) -> dict: + """Check if the data adheres to the OMOP Common Data Model (CDM) version 5.4 standards + + Check if the table name and column names adhere to the OMOP CDM v5.4 + + Args: + folder_path (str): The path of the folder containing the OMOP data + delimiter (str, optional): The delimiter of the CSV file. Defaults to None. + make_filename_lowercase (bool, optional): Whether to make the filename into lowercase. Defaults to True. + + Returns + ------- + dict: a dictionary of the table path. The key is the table name, and the value is the path of the table + """ + # TODO check if each column's data type adheres to the OMOP CDM print("Checking if your data adheres to the OMOP Common Data Model (CDM) version 5.4 standards.") filepath_list = glob.glob(os.path.join(folder_path, "*.csv")) + glob.glob(os.path.join(folder_path, "*.parquet")) filepath_dict = {} @@ -94,7 +131,7 @@ def check_with_omop_cdm( is_single_file = True else: is_single_file = False - + # TODO support table stored in a folder """ # If not a single file, only check the first one's column names @@ -105,7 +142,6 @@ def check_with_omop_cdm( is_single_file = False """ if is_single_file and not check_csv_has_only_header(path): - # Make filename into lowercase if make_filename_lowercase: new_path = os.path.join(folder_path, path.split("/")[-1].lower()) @@ -113,42 +149,54 @@ def check_with_omop_cdm( warnings(f"Rename file [{path}] to [{new_path}]") os.rename(path, new_path) path = new_path - + # check if table name adheres to the OMOP CDM file_name = os.path.basename(path).split(".")[0] field_level = get_omop_cdm_field_level() if file_name not in set(field_level.cdmTableName): - raise KeyError(f"Table [{file_name}] is not defined in OMOP CDM v5.4! Please change the table name manually!") - - - + raise KeyError( + f"Table [{file_name}] is not defined in OMOP CDM v5.4! Please change the table name manually!" + ) + # check if column names adhere to the OMOP CDM - if path.endswith('csv'): - with open(path, "r") as f: + if path.endswith("csv"): + with open(path) as f: dict_reader = csv.DictReader(f, delimiter=delimiter) columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith('parquet'): + columns = list(filter(None, columns)) + elif path.endswith("parquet"): df = dd.read_parquet(path) columns = list(df.columns) else: raise TypeError("Only support CSV and Parquet file!") - + invalid_column_name = [] for _, column in enumerate(columns): - cdm_columns = set(field_level[field_level.cdmTableName == file_name]['cdmFieldName']) + cdm_columns = set(field_level[field_level.cdmTableName == file_name]["cdmFieldName"]) if column not in cdm_columns: invalid_column_name.append(column) if len(invalid_column_name) > 0: - print(f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}") + print( + f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! 
Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}" + ) raise KeyError - + filepath_dict[file_name] = path return filepath_dict - -def check_csv_has_only_header(file_path): - if file_path.endswith('csv'): - with open(file_path, 'r') as file: + + +def check_csv_has_only_header(file_path: str) -> bool: + """Check if the CSV file has only header + + Args: + file_path (str): The path of the CSV file + + Returns + ------- + bool: True if the CSV file has only header, False otherwise + """ + if file_path.endswith("csv"): + with open(file_path) as file: reader = csv.reader(file) header = next(reader, None) if header is not None: @@ -158,24 +206,33 @@ def check_csv_has_only_header(file_path): return False else: return False - -def get_column_types(adata_dict, - table_name: str = None): - - path = adata_dict['filepath_dict'][table_name] + + +def get_column_types(adata_dict: dict, table_name: str) -> dict: + """Get the column types of the table + + Args: + adata_dict (dict): a dictionary containing filepath_dict and delimiter information + table_name (str): Table name in OMOP CDM v5.4. + + Returns + ------- + dict: a dictionary of the column types. The key is the column name, and the value is the column type + """ + path = adata_dict["filepath_dict"][table_name] column_types = {} # If not a single file, read the first one if not os.path.isfile(path): folder_walk = os.walk(path) first_file_in_folder = next(folder_walk)[2][0] path = os.path.join(path, first_file_in_folder) - - if path.endswith('csv'): - with open(path, "r") as f: - dict_reader = csv.DictReader(f, delimiter=adata_dict['delimiter']) + + if path.endswith("csv"): + with open(path) as f: + dict_reader = csv.DictReader(f, delimiter=adata_dict["delimiter"]) columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith('parquet'): + columns = list(filter(None, columns)) + elif path.endswith("parquet"): df = dd.read_parquet(path) columns = list(df.columns) else: @@ -184,35 +241,75 @@ def get_column_types(adata_dict, for _, column in enumerate(columns_lowercase): dtype_mapping = get_dtype_mapping() field_level = get_omop_cdm_field_level() - column_types[column] = dtype_mapping[field_level[(field_level.cdmTableName == table_name) & (field_level.cdmFieldName == column)]['cdmDatatype'].values[0]] + column_types[column] = dtype_mapping[ + field_level[(field_level.cdmTableName == table_name) & (field_level.cdmFieldName == column)][ + "cdmDatatype" + ].values[0] + ] return column_types -def get_primary_key(table_name): +def get_primary_key(table_name: str) -> str: + """Get the primary key of the table + + Args: + table_name (str, optional): Table name in OMOP CDM v5.4. 
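# Illustrative sketch of how get_primary_key (and get_column_types) query the OMOP CDM
# v5.4 field-level table by filtering on cdmTableName / isPrimaryKey. field_level_example
# below is a hypothetical three-row slice of that table:
import pandas as pd

field_level_example = pd.DataFrame(
    {
        "cdmTableName": ["person", "person", "visit_occurrence"],
        "cdmFieldName": ["person_id", "year_of_birth", "visit_occurrence_id"],
        "isPrimaryKey": ["Yes", "No", "Yes"],
        "cdmDatatype": ["integer", "integer", "integer"],
    }
)
primary_key = field_level_example[
    (field_level_example.cdmTableName == "person") & (field_level_example.isPrimaryKey == "Yes")
]["cdmFieldName"].values[0]
print(primary_key)  # person_id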
+ + Returns + ------- + str: the primary key of the table + """ field_level = get_omop_cdm_field_level() - primary_key = field_level[(field_level.cdmTableName == table_name) & (field_level.isPrimaryKey == 'Yes')]['cdmFieldName'].values[0] + primary_key = field_level[(field_level.cdmTableName == table_name) & (field_level.isPrimaryKey == "Yes")][ + "cdmFieldName" + ].values[0] return primary_key - -def read_table(adata_dict, table_name: str = None, dtype=None, parse_dates=None, index=None, usecols=None, use_dask=None): - + + +def read_table( + adata_dict: dict, + table_name: str, + dtype: dict = None, + parse_dates: Union[list[str], str] = None, + index: str = None, + usecols: Union[list[str], str] = None, + use_dask: bool = None, +) -> Union[pd.DataFrame, dd.DataFrame]: + """Read the table either in CSV or Parquet format using pandas or dask + + Args: + adata_dict (dict): a dictionary containing filepath_dict, delimiter, use_dask, tables information + table_name (str, optional): Table name in OMOP CDM v5.4. + dtype (dict, optional): Data type of the columns. Defaults to None. + parse_dates (Union[List[str], str], optional): Columns to parse as dates. Defaults to None. + index (str, optional): set the index of the DataFrame. Defaults to None. + usecols (Union[List[str], str], optional): Columns to read. Defaults to None. + use_dask (bool, optional): Whether to use dask. It is recommended to use dask when the table is large. Defaults to None. + + Returns + ------- + Union[pd.DataFrame, dd.DataFrame]: a pandas or dask DataFrame + """ if not use_dask: - use_dask = adata_dict['use_dask'] - path = adata_dict['filepath_dict'][table_name] + use_dask = adata_dict["use_dask"] + path = adata_dict["filepath_dict"][table_name] if use_dask: if not os.path.isfile(path): folder_walk = os.walk(path) filetype = next(folder_walk)[2][0].split(".")[-1] else: filetype = path.split(".")[-1] - if filetype == 'csv': + if filetype == "csv": if not os.path.isfile(path): path = f"{path}/*.csv" if usecols: dtype = {key: dtype[key] for key in usecols if key in dtype} if parse_dates: parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = dd.read_csv(path, delimiter=adata_dict['delimiter'], dtype=dtype, parse_dates=parse_dates, usecols=usecols) - elif filetype == 'parquet': + df = dd.read_csv( + path, delimiter=adata_dict["delimiter"], dtype=dtype, parse_dates=parse_dates, usecols=usecols + ) + elif filetype == "parquet": if not os.path.isfile(path): path = f"{path}/*.parquet" if usecols: @@ -226,47 +323,59 @@ def read_table(adata_dict, table_name: str = None, dtype=None, parse_dates=None, if not os.path.isfile(path): raise TypeError("Only support reading a single file!") filetype = path.split(".")[-1] - if filetype == 'csv': + if filetype == "csv": if usecols: dtype = {key: dtype[key] for key in usecols if key in dtype} if parse_dates: parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = pd.read_csv(path, delimiter=adata_dict['delimiter'], dtype=dtype, parse_dates=parse_dates, usecols=usecols) - elif filetype == 'parquet': + df = pd.read_csv( + path, delimiter=adata_dict["delimiter"], dtype=dtype, parse_dates=parse_dates, usecols=usecols + ) + elif filetype == "parquet": df = pd.read_parquet(path, columns=usecols) - + else: raise TypeError("Only support CSV and Parquet file!") - - + if index: df = df.set_index(index) return df def map_concept_id( - adata_dict, - concept_id: Union[str, List], - verbose=True): - - filepath_dict = adata_dict['filepath_dict'] - 
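# Illustrative sketch of the dispatch in read_table above: the dtype mapping is narrowed
# to the requested columns, then either dask or pandas reads the CSV. read_csv_table and
# the commented call are hypothetical, shown only to condense the pattern.
import dask.dataframe as dd
import pandas as pd

def read_csv_table(path, delimiter=",", dtype=None, usecols=None, use_dask=False):
    if usecols and dtype:
        # keep only the dtypes of the columns that will actually be read
        dtype = {key: dtype[key] for key in usecols if key in dtype}
    if use_dask:
        return dd.read_csv(path, delimiter=delimiter, dtype=dtype, usecols=usecols)
    return pd.read_csv(path, delimiter=delimiter, dtype=dtype, usecols=usecols)

# df = read_csv_table("measurement.csv", dtype={"measurement_concept_id": "Int64"},
#                     usecols=["visit_occurrence_id", "measurement_concept_id"], use_dask=True)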
tables = adata_dict['tables'] - delimiter = adata_dict['delimiter'] - + adata_dict: dict, concept_id: Union[str, list[int]], verbose: bool = True +) -> tuple[list[int], list[int]]: + """Map between concept_id_1 and concept_id_2 using concept_relationship table + + Args: + adata_dict (dict): a dictionary containing filepath_dict, delimiter, tables information. + concept_id (Union[str, list[int]]): It could be a single concept_id or a list of concept_id. + verbose (bool, optional): Defaults to True. + + Returns + ------- + Tuple[list[int], list[int]]: a tuple of list of concept_id_1 and list of concept_id_2. If no map is found, the concept_id_1 and concept_id_2 will be the same. + """ + filepath_dict = adata_dict["filepath_dict"] + tables = adata_dict["tables"] + delimiter = adata_dict["delimiter"] + if isinstance(concept_id, numbers.Integral): concept_id = [concept_id] concept_id_1 = [] concept_id_2 = [] concept_id_mapped_not_found = [] - + if "concept_relationship" in tables: column_types = get_column_types(adata_dict, table_name="concept_relationship") df_concept_relationship = pd.read_csv( - filepath_dict["concept_relationship"], dtype=column_types + filepath_dict["concept_relationship"], dtype=column_types, delimiter=delimiter ) # TODO dask Support - #df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], - df_concept_relationship.dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], + # df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], + df_concept_relationship.dropna( + subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True + ) # , usecols=vocabularies_tables_columns["concept_relationship"], concept_relationship_dict = df_to_dict( df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Maps to"], key="concept_id_1", @@ -296,67 +405,90 @@ def map_concept_id( else: concept_id_1 = concept_id concept_id_2 = concept_id - + if len(concept_id_1) == 1: return concept_id_1[0], concept_id_2[0] else: return concept_id_1, concept_id_2 - - -def df_to_dict(df, key, value): + + +def df_to_dict(df: pd.DataFrame, key: str, value: str) -> dict: + """Convert a DataFrame to a dictionary + + Args: + df (pd.DataFrame): a DataFrame + key (str): the column name to be used as the key of the dictionary + value (str): the column name to be used as the value of the dictionary + + Returns + ------- + dict: a dictionary + """ if isinstance(df, dd.DataFrame): return pd.Series(df[value].compute().values, index=df[key].compute()).to_dict() else: return pd.Series(df[value].values, index=df[key]).to_dict() -def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6): - """Use SequenceMatcher to return a list of the indexes of the best - "good enough" matches. word is a sequence for which close matches - are desired (typically a string). - possibilities is a dictionary of sequences. - Optional arg n (default 2) is the maximum number of close matches to - return. n must be > 0. - Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities - that don't score at least that similar to word are ignored. 
- """ - - if not n > 0: - raise ValueError("n must be > 0: %r" % (n,)) - if not 0.0 <= cutoff <= 1.0: - raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) - result = [] - s = SequenceMatcher() - s.set_seq2(word) - for _, (key, value) in enumerate(possibilities.items()): - s.set_seq1(value) - if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff: - result.append((s.ratio(), value, key)) - - # Move the best scorers to head of list - result = _nlargest(n, result) +# def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6): +# """Use SequenceMatcher to return a list of the indexes of the best +# "good enough" matches. word is a sequence for which close matches +# are desired (typically a string). +# possibilities is a dictionary of sequences. +# Optional arg n (default 2) is the maximum number of close matches to +# return. n must be > 0. +# Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities +# that don't score at least that similar to word are ignored. +# """ +# if not n > 0: +# raise ValueError("n must be > 0: %r" % (n,)) +# if not 0.0 <= cutoff <= 1.0: +# raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) +# result = [] +# s = SequenceMatcher() +# s.set_seq2(word) +# for _, (key, value) in enumerate(possibilities.items()): +# s.set_seq1(value) +# if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff: +# result.append((s.ratio(), value, key)) - # Strip scores for the best n matches - return [(value, key, score) for score, value, key in result] +# # Move the best scorers to head of list +# result = _nlargest(n, result) +# # Strip scores for the best n matches +# return [(value, key, score) for score, value, key in result] def get_feature_info( - adata_dict: Dict, - features: Union[str, int, List[Union[str, int]]] = None, + adata_dict: dict, + features: Union[str, int, list[Union[str, int]]] = None, ignore_not_shown_in_concept_table: bool = True, exact_match: bool = True, verbose: bool = True, -): +) -> pd.DataFrame: + """Get the feature information from the concept table + + Args: + adata_dict (dict): a dictionary containing filepath_dict, delimiter, tables information. + features (Union[str, int, list[Union[str, int]]], optional): a feature name or a feature id. Defaults to None. + ignore_not_shown_in_concept_table (bool, optional): If True, it will ignore the features that are not shown in the concept table. Defaults to True. + exact_match (bool, optional): If True, it will only return the exact match if the feature name is input. Defaults to True. + verbose (bool, optional): Defaults to True. 
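+
+    Example
+    -------
+        Illustrative only; the feature name is a placeholder and it assumes ``adata_dict``
+        contains a "concept" table.
+
+        >>> info_df = get_feature_info(adata_dict, features=["Body temperature"], exact_match=True)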
- if "concept" in adata_dict['tables']: + Returns + ------- + pd.DataFrame: a DataFrame containing the feature information + """ + if "concept" in adata_dict["tables"]: column_types = get_column_types(adata_dict, table_name="concept") - + df_concept = read_table(adata_dict, table_name="concept", dtype=column_types).dropna( subset=["concept_id", "concept_name"] ) # usecols=vocabularies_tables_columns["concept"], - #concept_dict = df_to_dict(df=df_concept, key="concept_name", value="concept_id") - + # concept_dict = df_to_dict(df=df_concept, key="concept_name", value="concept_id") + else: + rprint("concept table is not found in the OMOP CDM v5.4!") + raise ValueError fetures_not_shown_in_concept_table = [] info_df = pd.DataFrame([]) @@ -369,7 +501,7 @@ def get_feature_info( feature_id = feature feature_id_1, feature_id_2 = map_concept_id(adata_dict=adata_dict, concept_id=feature_id, verbose=False) try: - feature_name = df_concept[df_concept['concept_id'] == feature_id_1]['concept_name'].values[0] + feature_name = df_concept[df_concept["concept_id"] == feature_id_1]["concept_name"].values[0] except KeyError: if ignore_not_shown_in_concept_table: fetures_not_shown_in_concept_table.append(feature) @@ -382,23 +514,21 @@ def get_feature_info( # if the input is feature name elif isinstance(feature, str): # return a list of (value, key, score) - #result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2) + # result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2) from thefuzz import process - - # the thefuzz match returns a list of tuples of (matched string, match ratio) - result = process.extract(feature, list(df_concept['concept_name'].values), limit=2) + # the thefuzz match returns a list of tuples of (matched string, match ratio) + result = process.extract(feature, list(df_concept["concept_name"].values), limit=2) match_1 = result[0] match_1_name = match_1[0] match_1_ratio = match_1[1] # Most of the case: if find 2 best matches if len(result) == 2: - match_2 = result[1] match_2_name = match_2[0] match_2_ratio = match_2[1] - + if match_1_ratio != 100: if exact_match: rprint( @@ -407,14 +537,13 @@ def get_feature_info( raise ValueError else: if match_2_ratio == 100: - match_1_id = df_concept[df_concept['concept_name'] == match_1_name]['concept_id'].values[0] - match_2_id = df_concept[df_concept['concept_name'] == match_2_name]['concept_id'].values[0] + match_1_id = df_concept[df_concept["concept_name"] == match_1_name]["concept_id"].values[0] + match_2_id = df_concept[df_concept["concept_name"] == match_2_name]["concept_id"].values[0] rprint( f"Found multiple exact matches for [blue]{feature}[/] in the concept table.\n1) concept id: [blue]{match_1_id}[/] 2) concept id: [blue]{match_2_id}[/]. Please specify concept_id directly." ) raise ValueError - - + # Very rare: if only find 1 match else: if exact_match and match_1_ratio != 1: @@ -422,20 +551,27 @@ def get_feature_info( f"Unable to find an exact match for [red]{feature}[/] in the concept table. 
Similiar one: [blue]{match_1_name}[/] with match ratio [red]{match_1_ratio}[/]" ) raise ValueError - + feature_name = match_1_name - feature_id = df_concept[df_concept['concept_name'] == feature_name]['concept_id'].values[0] + feature_id = df_concept[df_concept["concept_name"] == feature_name]["concept_id"].values[0] feature_id_1, feature_id_2 = map_concept_id(adata_dict=adata_dict, concept_id=feature_id, verbose=False) - + else: rprint( - f"Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] that you want to extarct" + "Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] that you want to extarct" ) raise TypeError - - info_df = pd.concat([info_df, pd.DataFrame(data=[[feature_name, feature_id_1, feature_id_2]], columns=['feature_name', 'feature_id_1', 'feature_id_2'])]) - - + + info_df = pd.concat( + [ + info_df, + pd.DataFrame( + data=[[feature_name, feature_id_1, feature_id_2]], + columns=["feature_name", "feature_id_1", "feature_id_2"], + ), + ] + ) + # feature_name_list.append(feature_name) # domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0]) # concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0]) @@ -446,10 +582,10 @@ def get_feature_info( f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match ratio = [green]{match_1_ratio}." ) - if info_df[f"feature_id_1"].equals(info_df[f"feature_id_2"]): - info_df.drop(f"feature_id_2", axis=1, inplace=True) + if info_df["feature_id_1"].equals(info_df["feature_id_2"]): + info_df.drop("feature_id_2", axis=1, inplace=True) info_df = info_df.rename(columns={"feature_id_1": "feature_id"}) info_df = info_df.reset_index(drop=True) else: info_df = info_df.reset_index(drop=True) - return info_df \ No newline at end of file + return info_df From c750333eead0af1f138267d6e64fb1e678cdc546 Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Thu, 15 Feb 2024 11:26:05 +0100 Subject: [PATCH 09/13] Remove unused import in __init__.py --- ehrdata/__init__.py | 4 +- ehrdata/utils/omop_utils.py | 591 ------------------------------------ 2 files changed, 2 insertions(+), 593 deletions(-) delete mode 100644 ehrdata/utils/omop_utils.py diff --git a/ehrdata/__init__.py b/ehrdata/__init__.py index 73939b7..06b7e30 100644 --- a/ehrdata/__init__.py +++ b/ehrdata/__init__.py @@ -1,7 +1,7 @@ from importlib.metadata import version -from . import dt, pl, pp, tl, io +from . import io, pl, pp, tl -__all__ = ["dt", "pl", "pp", "tl", "io"] +__all__ = ["pl", "pp", "tl", "io"] __version__ = "0.0.0" diff --git a/ehrdata/utils/omop_utils.py b/ehrdata/utils/omop_utils.py deleted file mode 100644 index 137be3c..0000000 --- a/ehrdata/utils/omop_utils.py +++ /dev/null @@ -1,591 +0,0 @@ -import csv -import glob -import numbers -import os -import warnings -from pathlib import Path -from typing import Union - -import dask.dataframe as dd -import pandas as pd -from rich import print as rprint - - -def get_table_catalog_dict(): - """Get the table catalog dictionary of the OMOP CDM v5.4 - - Returns - ------- - Dictionary: a dictionary of the table catalog. 
The key is the category of the table, and the value is a list of table names - """ - table_catalog_dict = {} - table_catalog_dict["Clinical data"] = [ - "person", - "observation_period", - "specimen", - "death", - "visit_occurrence", - "visit_detail", - "procedure_occurrence", - "drug_exposure", - "device_exposure", - "condition_occurrence", - "measurement", - "note", - "note_nlp", - "observation", - "fact_relationship", - ] - - table_catalog_dict["Health system data"] = ["location", "care_site", "provider"] - table_catalog_dict["Health economics data"] = ["payer_plan_period", "cost"] - table_catalog_dict["Standardized derived elements"] = [ - "cohort", - "cohort_definition", - "drug_era", - "dose_era", - "condition_era", - ] - table_catalog_dict["Metadata"] = ["cdm_source", "metadata"] - table_catalog_dict["Vocabulary"] = [ - "concept", - "vocabulary", - "domain", - "concept_class", - "concept_relationship", - "relationship", - "concept_synonym", - "concept_ancestor", - "source_to_concept_map", - "drug_strength", - ] - return table_catalog_dict - - -def get_dtype_mapping(): - """Get the data type mapping of the OMOP CDM v5.4 - - Returns - ------- - Dictionary: a dictionary of the data type mapping from OMOP CDM v5.4 to Python - """ - dtype_mapping = { - "integer": "Int64", - "Integer": "Int64", - "float": float, - "bigint": "Int64", - "varchar(MAX)": str, - "varchar(2000)": str, - "varchar(1000)": str, - "varchar(255)": str, - "varchar(250)": str, - "varchar(80)": str, - "varchar(60)": str, - "varchar(50)": str, - "varchar(25)": str, - "varchar(20)": str, - "varchar(10)": str, - "varchar(9)": str, - "varchar(3)": str, - "varchar(2)": str, - "varchar(1)": str, - "datetime": object, - "date": object, - } - - return dtype_mapping - - -def get_omop_cdm_field_level(): - """Get the field level table sof the OMOP CDM v5.4 - - Returns - ------- - Pandas DataFrame - """ - pth = f"{Path(__file__).resolve().parent}/OMOP_CDMv5.4_Field_Level.csv" - df = pd.read_csv(pth) - return df - - -def check_with_omop_cdm(folder_path: str, delimiter: str = None, make_filename_lowercase: bool = True) -> dict: - """Check if the data adheres to the OMOP Common Data Model (CDM) version 5.4 standards - - Check if the table name and column names adhere to the OMOP CDM v5.4 - - Args: - folder_path (str): The path of the folder containing the OMOP data - delimiter (str, optional): The delimiter of the CSV file. Defaults to None. - make_filename_lowercase (bool, optional): Whether to make the filename into lowercase. Defaults to True. - - Returns - ------- - dict: a dictionary of the table path. 
The key is the table name, and the value is the path of the table - """ - # TODO check if each column's data type adheres to the OMOP CDM - print("Checking if your data adheres to the OMOP Common Data Model (CDM) version 5.4 standards.") - filepath_list = glob.glob(os.path.join(folder_path, "*.csv")) + glob.glob(os.path.join(folder_path, "*.parquet")) - filepath_dict = {} - for path in filepath_list: - if os.path.isfile(path): - is_single_file = True - else: - is_single_file = False - - # TODO support table stored in a folder - """ - # If not a single file, only check the first one's column names - if not os.path.isfile(path): - folder_walk = os.walk(path) - first_file_in_folder = next(folder_walk)[2][0] - file = os.path.join(path, first_file_in_folder) - is_single_file = False - """ - if is_single_file and not check_csv_has_only_header(path): - # Make filename into lowercase - if make_filename_lowercase: - new_path = os.path.join(folder_path, path.split("/")[-1].lower()) - if path != new_path: - warnings(f"Rename file [{path}] to [{new_path}]") - os.rename(path, new_path) - path = new_path - - # check if table name adheres to the OMOP CDM - file_name = os.path.basename(path).split(".")[0] - field_level = get_omop_cdm_field_level() - if file_name not in set(field_level.cdmTableName): - raise KeyError( - f"Table [{file_name}] is not defined in OMOP CDM v5.4! Please change the table name manually!" - ) - - # check if column names adhere to the OMOP CDM - if path.endswith("csv"): - with open(path) as f: - dict_reader = csv.DictReader(f, delimiter=delimiter) - columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith("parquet"): - df = dd.read_parquet(path) - columns = list(df.columns) - else: - raise TypeError("Only support CSV and Parquet file!") - - invalid_column_name = [] - for _, column in enumerate(columns): - cdm_columns = set(field_level[field_level.cdmTableName == file_name]["cdmFieldName"]) - if column not in cdm_columns: - invalid_column_name.append(column) - if len(invalid_column_name) > 0: - print( - f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}" - ) - raise KeyError - - filepath_dict[file_name] = path - return filepath_dict - - -def check_csv_has_only_header(file_path: str) -> bool: - """Check if the CSV file has only header - - Args: - file_path (str): The path of the CSV file - - Returns - ------- - bool: True if the CSV file has only header, False otherwise - """ - if file_path.endswith("csv"): - with open(file_path) as file: - reader = csv.reader(file) - header = next(reader, None) - if header is not None: - second_row = next(reader, None) - return second_row is None - else: - return False - else: - return False - - -def get_column_types(adata_dict: dict, table_name: str) -> dict: - """Get the column types of the table - - Args: - adata_dict (dict): a dictionary containing filepath_dict and delimiter information - table_name (str): Table name in OMOP CDM v5.4. - - Returns - ------- - dict: a dictionary of the column types. 
The key is the column name, and the value is the column type - """ - path = adata_dict["filepath_dict"][table_name] - column_types = {} - # If not a single file, read the first one - if not os.path.isfile(path): - folder_walk = os.walk(path) - first_file_in_folder = next(folder_walk)[2][0] - path = os.path.join(path, first_file_in_folder) - - if path.endswith("csv"): - with open(path) as f: - dict_reader = csv.DictReader(f, delimiter=adata_dict["delimiter"]) - columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith("parquet"): - df = dd.read_parquet(path) - columns = list(df.columns) - else: - raise TypeError("Only support CSV and Parquet file!") - columns_lowercase = [column.lower() for column in columns] - for _, column in enumerate(columns_lowercase): - dtype_mapping = get_dtype_mapping() - field_level = get_omop_cdm_field_level() - column_types[column] = dtype_mapping[ - field_level[(field_level.cdmTableName == table_name) & (field_level.cdmFieldName == column)][ - "cdmDatatype" - ].values[0] - ] - return column_types - - -def get_primary_key(table_name: str) -> str: - """Get the primary key of the table - - Args: - table_name (str, optional): Table name in OMOP CDM v5.4. - - Returns - ------- - str: the primary key of the table - """ - field_level = get_omop_cdm_field_level() - primary_key = field_level[(field_level.cdmTableName == table_name) & (field_level.isPrimaryKey == "Yes")][ - "cdmFieldName" - ].values[0] - return primary_key - - -def read_table( - adata_dict: dict, - table_name: str, - dtype: dict = None, - parse_dates: Union[list[str], str] = None, - index: str = None, - usecols: Union[list[str], str] = None, - use_dask: bool = None, -) -> Union[pd.DataFrame, dd.DataFrame]: - """Read the table either in CSV or Parquet format using pandas or dask - - Args: - adata_dict (dict): a dictionary containing filepath_dict, delimiter, use_dask, tables information - table_name (str, optional): Table name in OMOP CDM v5.4. - dtype (dict, optional): Data type of the columns. Defaults to None. - parse_dates (Union[List[str], str], optional): Columns to parse as dates. Defaults to None. - index (str, optional): set the index of the DataFrame. Defaults to None. - usecols (Union[List[str], str], optional): Columns to read. Defaults to None. - use_dask (bool, optional): Whether to use dask. It is recommended to use dask when the table is large. Defaults to None. 
- - Returns - ------- - Union[pd.DataFrame, dd.DataFrame]: a pandas or dask DataFrame - """ - if not use_dask: - use_dask = adata_dict["use_dask"] - path = adata_dict["filepath_dict"][table_name] - if use_dask: - if not os.path.isfile(path): - folder_walk = os.walk(path) - filetype = next(folder_walk)[2][0].split(".")[-1] - else: - filetype = path.split(".")[-1] - if filetype == "csv": - if not os.path.isfile(path): - path = f"{path}/*.csv" - if usecols: - dtype = {key: dtype[key] for key in usecols if key in dtype} - if parse_dates: - parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = dd.read_csv( - path, delimiter=adata_dict["delimiter"], dtype=dtype, parse_dates=parse_dates, usecols=usecols - ) - elif filetype == "parquet": - if not os.path.isfile(path): - path = f"{path}/*.parquet" - if usecols: - dtype = {key: dtype[key] for key in usecols if key in dtype} - if parse_dates: - parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = dd.read_parquet(path, dtype=dtype, parse_dates=parse_dates, columns=usecols) - else: - raise TypeError("Only support CSV and Parquet file!") - else: - if not os.path.isfile(path): - raise TypeError("Only support reading a single file!") - filetype = path.split(".")[-1] - if filetype == "csv": - if usecols: - dtype = {key: dtype[key] for key in usecols if key in dtype} - if parse_dates: - parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = pd.read_csv( - path, delimiter=adata_dict["delimiter"], dtype=dtype, parse_dates=parse_dates, usecols=usecols - ) - elif filetype == "parquet": - df = pd.read_parquet(path, columns=usecols) - - else: - raise TypeError("Only support CSV and Parquet file!") - - if index: - df = df.set_index(index) - return df - - -def map_concept_id( - adata_dict: dict, concept_id: Union[str, list[int]], verbose: bool = True -) -> tuple[list[int], list[int]]: - """Map between concept_id_1 and concept_id_2 using concept_relationship table - - Args: - adata_dict (dict): a dictionary containing filepath_dict, delimiter, tables information. - concept_id (Union[str, list[int]]): It could be a single concept_id or a list of concept_id. - verbose (bool, optional): Defaults to True. - - Returns - ------- - Tuple[list[int], list[int]]: a tuple of list of concept_id_1 and list of concept_id_2. If no map is found, the concept_id_1 and concept_id_2 will be the same. 
- """ - filepath_dict = adata_dict["filepath_dict"] - tables = adata_dict["tables"] - delimiter = adata_dict["delimiter"] - - if isinstance(concept_id, numbers.Integral): - concept_id = [concept_id] - concept_id_1 = [] - concept_id_2 = [] - concept_id_mapped_not_found = [] - - if "concept_relationship" in tables: - column_types = get_column_types(adata_dict, table_name="concept_relationship") - df_concept_relationship = pd.read_csv( - filepath_dict["concept_relationship"], dtype=column_types, delimiter=delimiter - ) - # TODO dask Support - # df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], - df_concept_relationship.dropna( - subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True - ) # , usecols=vocabularies_tables_columns["concept_relationship"], - concept_relationship_dict = df_to_dict( - df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Maps to"], - key="concept_id_1", - value="concept_id_2", - ) - concept_relationship_dict_reverse = df_to_dict( - df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Mapped from"], - key="concept_id_1", - value="concept_id_2", - ) - for id in concept_id: - try: - concept_id_2.append(concept_relationship_dict[id]) - concept_id_1.append(id) - except KeyError: - try: - concept_id_1.append(concept_relationship_dict_reverse[id]) - concept_id_2.append(id) - except KeyError: - concept_id_1.append(id) - concept_id_2.append(id) - concept_id_mapped_not_found.append(id) - if len(concept_id_mapped_not_found) > 0: - # warnings.warn(f"Couldn't find a map for concept {id} in concept_relationship table!") - if verbose: - rprint(f"Couldn't find a map for concept {concept_id_mapped_not_found} in concept_relationship table!") - else: - concept_id_1 = concept_id - concept_id_2 = concept_id - - if len(concept_id_1) == 1: - return concept_id_1[0], concept_id_2[0] - else: - return concept_id_1, concept_id_2 - - -def df_to_dict(df: pd.DataFrame, key: str, value: str) -> dict: - """Convert a DataFrame to a dictionary - - Args: - df (pd.DataFrame): a DataFrame - key (str): the column name to be used as the key of the dictionary - value (str): the column name to be used as the value of the dictionary - - Returns - ------- - dict: a dictionary - """ - if isinstance(df, dd.DataFrame): - return pd.Series(df[value].compute().values, index=df[key].compute()).to_dict() - else: - return pd.Series(df[value].values, index=df[key]).to_dict() - - -# def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6): -# """Use SequenceMatcher to return a list of the indexes of the best -# "good enough" matches. word is a sequence for which close matches -# are desired (typically a string). -# possibilities is a dictionary of sequences. -# Optional arg n (default 2) is the maximum number of close matches to -# return. n must be > 0. -# Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities -# that don't score at least that similar to word are ignored. 
-# """ -# if not n > 0: -# raise ValueError("n must be > 0: %r" % (n,)) -# if not 0.0 <= cutoff <= 1.0: -# raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) -# result = [] -# s = SequenceMatcher() -# s.set_seq2(word) -# for _, (key, value) in enumerate(possibilities.items()): -# s.set_seq1(value) -# if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff: -# result.append((s.ratio(), value, key)) - -# # Move the best scorers to head of list -# result = _nlargest(n, result) - -# # Strip scores for the best n matches -# return [(value, key, score) for score, value, key in result] - - -def get_feature_info( - adata_dict: dict, - features: Union[str, int, list[Union[str, int]]] = None, - ignore_not_shown_in_concept_table: bool = True, - exact_match: bool = True, - verbose: bool = True, -) -> pd.DataFrame: - """Get the feature information from the concept table - - Args: - adata_dict (dict): a dictionary containing filepath_dict, delimiter, tables information. - features (Union[str, int, list[Union[str, int]]], optional): a feature name or a feature id. Defaults to None. - ignore_not_shown_in_concept_table (bool, optional): If True, it will ignore the features that are not shown in the concept table. Defaults to True. - exact_match (bool, optional): If True, it will only return the exact match if the feature name is input. Defaults to True. - verbose (bool, optional): Defaults to True. - - Returns - ------- - pd.DataFrame: a DataFrame containing the feature information - """ - if "concept" in adata_dict["tables"]: - column_types = get_column_types(adata_dict, table_name="concept") - - df_concept = read_table(adata_dict, table_name="concept", dtype=column_types).dropna( - subset=["concept_id", "concept_name"] - ) # usecols=vocabularies_tables_columns["concept"], - # concept_dict = df_to_dict(df=df_concept, key="concept_name", value="concept_id") - else: - rprint("concept table is not found in the OMOP CDM v5.4!") - raise ValueError - fetures_not_shown_in_concept_table = [] - - info_df = pd.DataFrame([]) - if isinstance(features, str): - features = [features] - # Get feature id for each input, and check if each feature occurs in the concept table - for feature in features: - # if the input is feature ID - if isinstance(feature, numbers.Integral): - feature_id = feature - feature_id_1, feature_id_2 = map_concept_id(adata_dict=adata_dict, concept_id=feature_id, verbose=False) - try: - feature_name = df_concept[df_concept["concept_id"] == feature_id_1]["concept_name"].values[0] - except KeyError: - if ignore_not_shown_in_concept_table: - fetures_not_shown_in_concept_table.append(feature) - continue - else: - rprint(f"Feature ID - [red]{feature_id_1}[/] could not be found in concept table") - raise - match_1_ratio = 100 - - # if the input is feature name - elif isinstance(feature, str): - # return a list of (value, key, score) - # result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2) - from thefuzz import process - - # the thefuzz match returns a list of tuples of (matched string, match ratio) - result = process.extract(feature, list(df_concept["concept_name"].values), limit=2) - - match_1 = result[0] - match_1_name = match_1[0] - match_1_ratio = match_1[1] - # Most of the case: if find 2 best matches - if len(result) == 2: - match_2 = result[1] - match_2_name = match_2[0] - match_2_ratio = match_2[1] - - if match_1_ratio != 100: - if exact_match: - rprint( - f"Unable to find an exact match for [blue]{feature}[/] in the concept 
table.\nSimilar ones: 1) [blue]{match_1_name}[/] with match ratio [red]{match_1_ratio}[/] 2) [blue]{match_2_name}[/] with match ratio [red]{match_2_ratio}[/]" - ) - raise ValueError - else: - if match_2_ratio == 100: - match_1_id = df_concept[df_concept["concept_name"] == match_1_name]["concept_id"].values[0] - match_2_id = df_concept[df_concept["concept_name"] == match_2_name]["concept_id"].values[0] - rprint( - f"Found multiple exact matches for [blue]{feature}[/] in the concept table.\n1) concept id: [blue]{match_1_id}[/] 2) concept id: [blue]{match_2_id}[/]. Please specify concept_id directly." - ) - raise ValueError - - # Very rare: if only find 1 match - else: - if exact_match and match_1_ratio != 1: - rprint( - f"Unable to find an exact match for [red]{feature}[/] in the concept table. Similiar one: [blue]{match_1_name}[/] with match ratio [red]{match_1_ratio}[/]" - ) - raise ValueError - - feature_name = match_1_name - feature_id = df_concept[df_concept["concept_name"] == feature_name]["concept_id"].values[0] - feature_id_1, feature_id_2 = map_concept_id(adata_dict=adata_dict, concept_id=feature_id, verbose=False) - - else: - rprint( - "Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] that you want to extarct" - ) - raise TypeError - - info_df = pd.concat( - [ - info_df, - pd.DataFrame( - data=[[feature_name, feature_id_1, feature_id_2]], - columns=["feature_name", "feature_id_1", "feature_id_2"], - ), - ] - ) - - # feature_name_list.append(feature_name) - # domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0]) - # concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0]) - # concept_code_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_code"].reset_index(drop=True).compute()[0]) - - if verbose: - rprint( - f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match ratio = [green]{match_1_ratio}." 
- ) - - if info_df["feature_id_1"].equals(info_df["feature_id_2"]): - info_df.drop("feature_id_2", axis=1, inplace=True) - info_df = info_df.rename(columns={"feature_id_1": "feature_id"}) - info_df = info_df.reset_index(drop=True) - else: - info_df = info_df.reset_index(drop=True) - return info_df From af7be5b09b3fcb149b7b72eb049b7a3800d29de1 Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Thu, 15 Feb 2024 11:30:52 +0100 Subject: [PATCH 10/13] Update import statements in _omop.py files --- ehrdata/io/_omop.py | 2 +- ehrdata/pl/_omop.py | 2 +- ehrdata/pp/_omop.py | 2 +- ehrdata/tl/_omop.py | 2 +- ehrdata/utils/_omop_utils.py | 591 +++++++++++++++++++++++++++++++++++ 5 files changed, 595 insertions(+), 4 deletions(-) create mode 100644 ehrdata/utils/_omop_utils.py diff --git a/ehrdata/io/_omop.py b/ehrdata/io/_omop.py index 4e82578..0425dc8 100644 --- a/ehrdata/io/_omop.py +++ b/ehrdata/io/_omop.py @@ -6,7 +6,7 @@ import pandas as pd from rich import print as rprint -from ehrdata.utils.omop_utils import check_with_omop_cdm, get_column_types, get_table_catalog_dict, read_table +from ehrdata.utils._omop_utils import check_with_omop_cdm, get_column_types, get_table_catalog_dict, read_table def init_omop( diff --git a/ehrdata/pl/_omop.py b/ehrdata/pl/_omop.py index 7ec676d..889d433 100644 --- a/ehrdata/pl/_omop.py +++ b/ehrdata/pl/_omop.py @@ -4,7 +4,7 @@ import seaborn as sns from ehrdata.tl import get_concept_name -from ehrdata.utils.omop_utils import get_column_types, map_concept_id, read_table +from ehrdata.utils._omop_utils import get_column_types, map_concept_id, read_table # TODO allow users to pass features diff --git a/ehrdata/pp/_omop.py b/ehrdata/pp/_omop.py index 928dd14..09f6c9b 100644 --- a/ehrdata/pp/_omop.py +++ b/ehrdata/pp/_omop.py @@ -5,7 +5,7 @@ import pandas as pd from rich import print as rprint -from ehrdata.utils.omop_utils import get_column_types, get_feature_info, read_table +from ehrdata.utils._omop_utils import get_column_types, get_feature_info, read_table def get_feature_statistics( diff --git a/ehrdata/tl/_omop.py b/ehrdata/tl/_omop.py index 01c6de3..0e3ba10 100644 --- a/ehrdata/tl/_omop.py +++ b/ehrdata/tl/_omop.py @@ -4,7 +4,7 @@ from anndata import AnnData from rich import print as rprint -from ehrdata.utils.omop_utils import df_to_dict, get_column_types, read_table +from ehrdata.utils._omop_utils import df_to_dict, get_column_types, read_table def get_concept_name(adata: Union[AnnData, dict], concept_id: Union[str, list], raise_error=False, verbose=True): diff --git a/ehrdata/utils/_omop_utils.py b/ehrdata/utils/_omop_utils.py new file mode 100644 index 0000000..137be3c --- /dev/null +++ b/ehrdata/utils/_omop_utils.py @@ -0,0 +1,591 @@ +import csv +import glob +import numbers +import os +import warnings +from pathlib import Path +from typing import Union + +import dask.dataframe as dd +import pandas as pd +from rich import print as rprint + + +def get_table_catalog_dict(): + """Get the table catalog dictionary of the OMOP CDM v5.4 + + Returns + ------- + Dictionary: a dictionary of the table catalog. 
The key is the category of the table, and the value is a list of table names + """ + table_catalog_dict = {} + table_catalog_dict["Clinical data"] = [ + "person", + "observation_period", + "specimen", + "death", + "visit_occurrence", + "visit_detail", + "procedure_occurrence", + "drug_exposure", + "device_exposure", + "condition_occurrence", + "measurement", + "note", + "note_nlp", + "observation", + "fact_relationship", + ] + + table_catalog_dict["Health system data"] = ["location", "care_site", "provider"] + table_catalog_dict["Health economics data"] = ["payer_plan_period", "cost"] + table_catalog_dict["Standardized derived elements"] = [ + "cohort", + "cohort_definition", + "drug_era", + "dose_era", + "condition_era", + ] + table_catalog_dict["Metadata"] = ["cdm_source", "metadata"] + table_catalog_dict["Vocabulary"] = [ + "concept", + "vocabulary", + "domain", + "concept_class", + "concept_relationship", + "relationship", + "concept_synonym", + "concept_ancestor", + "source_to_concept_map", + "drug_strength", + ] + return table_catalog_dict + + +def get_dtype_mapping(): + """Get the data type mapping of the OMOP CDM v5.4 + + Returns + ------- + Dictionary: a dictionary of the data type mapping from OMOP CDM v5.4 to Python + """ + dtype_mapping = { + "integer": "Int64", + "Integer": "Int64", + "float": float, + "bigint": "Int64", + "varchar(MAX)": str, + "varchar(2000)": str, + "varchar(1000)": str, + "varchar(255)": str, + "varchar(250)": str, + "varchar(80)": str, + "varchar(60)": str, + "varchar(50)": str, + "varchar(25)": str, + "varchar(20)": str, + "varchar(10)": str, + "varchar(9)": str, + "varchar(3)": str, + "varchar(2)": str, + "varchar(1)": str, + "datetime": object, + "date": object, + } + + return dtype_mapping + + +def get_omop_cdm_field_level(): + """Get the field level table sof the OMOP CDM v5.4 + + Returns + ------- + Pandas DataFrame + """ + pth = f"{Path(__file__).resolve().parent}/OMOP_CDMv5.4_Field_Level.csv" + df = pd.read_csv(pth) + return df + + +def check_with_omop_cdm(folder_path: str, delimiter: str = None, make_filename_lowercase: bool = True) -> dict: + """Check if the data adheres to the OMOP Common Data Model (CDM) version 5.4 standards + + Check if the table name and column names adhere to the OMOP CDM v5.4 + + Args: + folder_path (str): The path of the folder containing the OMOP data + delimiter (str, optional): The delimiter of the CSV file. Defaults to None. + make_filename_lowercase (bool, optional): Whether to make the filename into lowercase. Defaults to True. + + Returns + ------- + dict: a dictionary of the table path. 
The key is the table name, and the value is the path of the table + """ + # TODO check if each column's data type adheres to the OMOP CDM + print("Checking if your data adheres to the OMOP Common Data Model (CDM) version 5.4 standards.") + filepath_list = glob.glob(os.path.join(folder_path, "*.csv")) + glob.glob(os.path.join(folder_path, "*.parquet")) + filepath_dict = {} + for path in filepath_list: + if os.path.isfile(path): + is_single_file = True + else: + is_single_file = False + + # TODO support table stored in a folder + """ + # If not a single file, only check the first one's column names + if not os.path.isfile(path): + folder_walk = os.walk(path) + first_file_in_folder = next(folder_walk)[2][0] + file = os.path.join(path, first_file_in_folder) + is_single_file = False + """ + if is_single_file and not check_csv_has_only_header(path): + # Make filename into lowercase + if make_filename_lowercase: + new_path = os.path.join(folder_path, path.split("/")[-1].lower()) + if path != new_path: + warnings(f"Rename file [{path}] to [{new_path}]") + os.rename(path, new_path) + path = new_path + + # check if table name adheres to the OMOP CDM + file_name = os.path.basename(path).split(".")[0] + field_level = get_omop_cdm_field_level() + if file_name not in set(field_level.cdmTableName): + raise KeyError( + f"Table [{file_name}] is not defined in OMOP CDM v5.4! Please change the table name manually!" + ) + + # check if column names adhere to the OMOP CDM + if path.endswith("csv"): + with open(path) as f: + dict_reader = csv.DictReader(f, delimiter=delimiter) + columns = dict_reader.fieldnames + columns = list(filter(None, columns)) + elif path.endswith("parquet"): + df = dd.read_parquet(path) + columns = list(df.columns) + else: + raise TypeError("Only support CSV and Parquet file!") + + invalid_column_name = [] + for _, column in enumerate(columns): + cdm_columns = set(field_level[field_level.cdmTableName == file_name]["cdmFieldName"]) + if column not in cdm_columns: + invalid_column_name.append(column) + if len(invalid_column_name) > 0: + print( + f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}" + ) + raise KeyError + + filepath_dict[file_name] = path + return filepath_dict + + +def check_csv_has_only_header(file_path: str) -> bool: + """Check if the CSV file has only header + + Args: + file_path (str): The path of the CSV file + + Returns + ------- + bool: True if the CSV file has only header, False otherwise + """ + if file_path.endswith("csv"): + with open(file_path) as file: + reader = csv.reader(file) + header = next(reader, None) + if header is not None: + second_row = next(reader, None) + return second_row is None + else: + return False + else: + return False + + +def get_column_types(adata_dict: dict, table_name: str) -> dict: + """Get the column types of the table + + Args: + adata_dict (dict): a dictionary containing filepath_dict and delimiter information + table_name (str): Table name in OMOP CDM v5.4. + + Returns + ------- + dict: a dictionary of the column types. 
The key is the column name, and the value is the column type + """ + path = adata_dict["filepath_dict"][table_name] + column_types = {} + # If not a single file, read the first one + if not os.path.isfile(path): + folder_walk = os.walk(path) + first_file_in_folder = next(folder_walk)[2][0] + path = os.path.join(path, first_file_in_folder) + + if path.endswith("csv"): + with open(path) as f: + dict_reader = csv.DictReader(f, delimiter=adata_dict["delimiter"]) + columns = dict_reader.fieldnames + columns = list(filter(None, columns)) + elif path.endswith("parquet"): + df = dd.read_parquet(path) + columns = list(df.columns) + else: + raise TypeError("Only support CSV and Parquet file!") + columns_lowercase = [column.lower() for column in columns] + for _, column in enumerate(columns_lowercase): + dtype_mapping = get_dtype_mapping() + field_level = get_omop_cdm_field_level() + column_types[column] = dtype_mapping[ + field_level[(field_level.cdmTableName == table_name) & (field_level.cdmFieldName == column)][ + "cdmDatatype" + ].values[0] + ] + return column_types + + +def get_primary_key(table_name: str) -> str: + """Get the primary key of the table + + Args: + table_name (str, optional): Table name in OMOP CDM v5.4. + + Returns + ------- + str: the primary key of the table + """ + field_level = get_omop_cdm_field_level() + primary_key = field_level[(field_level.cdmTableName == table_name) & (field_level.isPrimaryKey == "Yes")][ + "cdmFieldName" + ].values[0] + return primary_key + + +def read_table( + adata_dict: dict, + table_name: str, + dtype: dict = None, + parse_dates: Union[list[str], str] = None, + index: str = None, + usecols: Union[list[str], str] = None, + use_dask: bool = None, +) -> Union[pd.DataFrame, dd.DataFrame]: + """Read the table either in CSV or Parquet format using pandas or dask + + Args: + adata_dict (dict): a dictionary containing filepath_dict, delimiter, use_dask, tables information + table_name (str, optional): Table name in OMOP CDM v5.4. + dtype (dict, optional): Data type of the columns. Defaults to None. + parse_dates (Union[List[str], str], optional): Columns to parse as dates. Defaults to None. + index (str, optional): set the index of the DataFrame. Defaults to None. + usecols (Union[List[str], str], optional): Columns to read. Defaults to None. + use_dask (bool, optional): Whether to use dask. It is recommended to use dask when the table is large. Defaults to None. 
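+
+    Example
+    -------
+        Illustrative only; it assumes ``adata_dict`` was built by ``init_omop`` and that a
+        "person" table is listed in ``adata_dict["tables"]``.
+
+        >>> column_types = get_column_types(adata_dict, table_name="person")
+        >>> df_person = read_table(adata_dict, table_name="person", dtype=column_types)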
+ + Returns + ------- + Union[pd.DataFrame, dd.DataFrame]: a pandas or dask DataFrame + """ + if not use_dask: + use_dask = adata_dict["use_dask"] + path = adata_dict["filepath_dict"][table_name] + if use_dask: + if not os.path.isfile(path): + folder_walk = os.walk(path) + filetype = next(folder_walk)[2][0].split(".")[-1] + else: + filetype = path.split(".")[-1] + if filetype == "csv": + if not os.path.isfile(path): + path = f"{path}/*.csv" + if usecols: + dtype = {key: dtype[key] for key in usecols if key in dtype} + if parse_dates: + parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} + df = dd.read_csv( + path, delimiter=adata_dict["delimiter"], dtype=dtype, parse_dates=parse_dates, usecols=usecols + ) + elif filetype == "parquet": + if not os.path.isfile(path): + path = f"{path}/*.parquet" + if usecols: + dtype = {key: dtype[key] for key in usecols if key in dtype} + if parse_dates: + parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} + df = dd.read_parquet(path, dtype=dtype, parse_dates=parse_dates, columns=usecols) + else: + raise TypeError("Only support CSV and Parquet file!") + else: + if not os.path.isfile(path): + raise TypeError("Only support reading a single file!") + filetype = path.split(".")[-1] + if filetype == "csv": + if usecols: + dtype = {key: dtype[key] for key in usecols if key in dtype} + if parse_dates: + parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} + df = pd.read_csv( + path, delimiter=adata_dict["delimiter"], dtype=dtype, parse_dates=parse_dates, usecols=usecols + ) + elif filetype == "parquet": + df = pd.read_parquet(path, columns=usecols) + + else: + raise TypeError("Only support CSV and Parquet file!") + + if index: + df = df.set_index(index) + return df + + +def map_concept_id( + adata_dict: dict, concept_id: Union[str, list[int]], verbose: bool = True +) -> tuple[list[int], list[int]]: + """Map between concept_id_1 and concept_id_2 using concept_relationship table + + Args: + adata_dict (dict): a dictionary containing filepath_dict, delimiter, tables information. + concept_id (Union[str, list[int]]): It could be a single concept_id or a list of concept_id. + verbose (bool, optional): Defaults to True. + + Returns + ------- + Tuple[list[int], list[int]]: a tuple of list of concept_id_1 and list of concept_id_2. If no map is found, the concept_id_1 and concept_id_2 will be the same. 
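+
+    Example
+    -------
+        Illustrative only; the concept ids are placeholders and it assumes a
+        "concept_relationship" table is available in ``adata_dict["tables"]``.
+
+        >>> concept_id_1, concept_id_2 = map_concept_id(adata_dict, concept_id=[12345, 67890], verbose=False)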
+ """ + filepath_dict = adata_dict["filepath_dict"] + tables = adata_dict["tables"] + delimiter = adata_dict["delimiter"] + + if isinstance(concept_id, numbers.Integral): + concept_id = [concept_id] + concept_id_1 = [] + concept_id_2 = [] + concept_id_mapped_not_found = [] + + if "concept_relationship" in tables: + column_types = get_column_types(adata_dict, table_name="concept_relationship") + df_concept_relationship = pd.read_csv( + filepath_dict["concept_relationship"], dtype=column_types, delimiter=delimiter + ) + # TODO dask Support + # df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], + df_concept_relationship.dropna( + subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True + ) # , usecols=vocabularies_tables_columns["concept_relationship"], + concept_relationship_dict = df_to_dict( + df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Maps to"], + key="concept_id_1", + value="concept_id_2", + ) + concept_relationship_dict_reverse = df_to_dict( + df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Mapped from"], + key="concept_id_1", + value="concept_id_2", + ) + for id in concept_id: + try: + concept_id_2.append(concept_relationship_dict[id]) + concept_id_1.append(id) + except KeyError: + try: + concept_id_1.append(concept_relationship_dict_reverse[id]) + concept_id_2.append(id) + except KeyError: + concept_id_1.append(id) + concept_id_2.append(id) + concept_id_mapped_not_found.append(id) + if len(concept_id_mapped_not_found) > 0: + # warnings.warn(f"Couldn't find a map for concept {id} in concept_relationship table!") + if verbose: + rprint(f"Couldn't find a map for concept {concept_id_mapped_not_found} in concept_relationship table!") + else: + concept_id_1 = concept_id + concept_id_2 = concept_id + + if len(concept_id_1) == 1: + return concept_id_1[0], concept_id_2[0] + else: + return concept_id_1, concept_id_2 + + +def df_to_dict(df: pd.DataFrame, key: str, value: str) -> dict: + """Convert a DataFrame to a dictionary + + Args: + df (pd.DataFrame): a DataFrame + key (str): the column name to be used as the key of the dictionary + value (str): the column name to be used as the value of the dictionary + + Returns + ------- + dict: a dictionary + """ + if isinstance(df, dd.DataFrame): + return pd.Series(df[value].compute().values, index=df[key].compute()).to_dict() + else: + return pd.Series(df[value].values, index=df[key]).to_dict() + + +# def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6): +# """Use SequenceMatcher to return a list of the indexes of the best +# "good enough" matches. word is a sequence for which close matches +# are desired (typically a string). +# possibilities is a dictionary of sequences. +# Optional arg n (default 2) is the maximum number of close matches to +# return. n must be > 0. +# Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities +# that don't score at least that similar to word are ignored. 
+# """ +# if not n > 0: +# raise ValueError("n must be > 0: %r" % (n,)) +# if not 0.0 <= cutoff <= 1.0: +# raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) +# result = [] +# s = SequenceMatcher() +# s.set_seq2(word) +# for _, (key, value) in enumerate(possibilities.items()): +# s.set_seq1(value) +# if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff: +# result.append((s.ratio(), value, key)) + +# # Move the best scorers to head of list +# result = _nlargest(n, result) + +# # Strip scores for the best n matches +# return [(value, key, score) for score, value, key in result] + + +def get_feature_info( + adata_dict: dict, + features: Union[str, int, list[Union[str, int]]] = None, + ignore_not_shown_in_concept_table: bool = True, + exact_match: bool = True, + verbose: bool = True, +) -> pd.DataFrame: + """Get the feature information from the concept table + + Args: + adata_dict (dict): a dictionary containing filepath_dict, delimiter, tables information. + features (Union[str, int, list[Union[str, int]]], optional): a feature name or a feature id. Defaults to None. + ignore_not_shown_in_concept_table (bool, optional): If True, it will ignore the features that are not shown in the concept table. Defaults to True. + exact_match (bool, optional): If True, it will only return the exact match if the feature name is input. Defaults to True. + verbose (bool, optional): Defaults to True. + + Returns + ------- + pd.DataFrame: a DataFrame containing the feature information + """ + if "concept" in adata_dict["tables"]: + column_types = get_column_types(adata_dict, table_name="concept") + + df_concept = read_table(adata_dict, table_name="concept", dtype=column_types).dropna( + subset=["concept_id", "concept_name"] + ) # usecols=vocabularies_tables_columns["concept"], + # concept_dict = df_to_dict(df=df_concept, key="concept_name", value="concept_id") + else: + rprint("concept table is not found in the OMOP CDM v5.4!") + raise ValueError + fetures_not_shown_in_concept_table = [] + + info_df = pd.DataFrame([]) + if isinstance(features, str): + features = [features] + # Get feature id for each input, and check if each feature occurs in the concept table + for feature in features: + # if the input is feature ID + if isinstance(feature, numbers.Integral): + feature_id = feature + feature_id_1, feature_id_2 = map_concept_id(adata_dict=adata_dict, concept_id=feature_id, verbose=False) + try: + feature_name = df_concept[df_concept["concept_id"] == feature_id_1]["concept_name"].values[0] + except KeyError: + if ignore_not_shown_in_concept_table: + fetures_not_shown_in_concept_table.append(feature) + continue + else: + rprint(f"Feature ID - [red]{feature_id_1}[/] could not be found in concept table") + raise + match_1_ratio = 100 + + # if the input is feature name + elif isinstance(feature, str): + # return a list of (value, key, score) + # result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2) + from thefuzz import process + + # the thefuzz match returns a list of tuples of (matched string, match ratio) + result = process.extract(feature, list(df_concept["concept_name"].values), limit=2) + + match_1 = result[0] + match_1_name = match_1[0] + match_1_ratio = match_1[1] + # Most of the case: if find 2 best matches + if len(result) == 2: + match_2 = result[1] + match_2_name = match_2[0] + match_2_ratio = match_2[1] + + if match_1_ratio != 100: + if exact_match: + rprint( + f"Unable to find an exact match for [blue]{feature}[/] in the concept 
table.\nSimilar ones: 1) [blue]{match_1_name}[/] with match ratio [red]{match_1_ratio}[/] 2) [blue]{match_2_name}[/] with match ratio [red]{match_2_ratio}[/]" + ) + raise ValueError + else: + if match_2_ratio == 100: + match_1_id = df_concept[df_concept["concept_name"] == match_1_name]["concept_id"].values[0] + match_2_id = df_concept[df_concept["concept_name"] == match_2_name]["concept_id"].values[0] + rprint( + f"Found multiple exact matches for [blue]{feature}[/] in the concept table.\n1) concept id: [blue]{match_1_id}[/] 2) concept id: [blue]{match_2_id}[/]. Please specify concept_id directly." + ) + raise ValueError + + # Very rare: if only find 1 match + else: + if exact_match and match_1_ratio != 1: + rprint( + f"Unable to find an exact match for [red]{feature}[/] in the concept table. Similiar one: [blue]{match_1_name}[/] with match ratio [red]{match_1_ratio}[/]" + ) + raise ValueError + + feature_name = match_1_name + feature_id = df_concept[df_concept["concept_name"] == feature_name]["concept_id"].values[0] + feature_id_1, feature_id_2 = map_concept_id(adata_dict=adata_dict, concept_id=feature_id, verbose=False) + + else: + rprint( + "Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] that you want to extarct" + ) + raise TypeError + + info_df = pd.concat( + [ + info_df, + pd.DataFrame( + data=[[feature_name, feature_id_1, feature_id_2]], + columns=["feature_name", "feature_id_1", "feature_id_2"], + ), + ] + ) + + # feature_name_list.append(feature_name) + # domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0]) + # concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0]) + # concept_code_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_code"].reset_index(drop=True).compute()[0]) + + if verbose: + rprint( + f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match ratio = [green]{match_1_ratio}." 
+ ) + + if info_df["feature_id_1"].equals(info_df["feature_id_2"]): + info_df.drop("feature_id_2", axis=1, inplace=True) + info_df = info_df.rename(columns={"feature_id_1": "feature_id"}) + info_df = info_df.reset_index(drop=True) + else: + info_df = info_df.reset_index(drop=True) + return info_df From ad0213df38f87f2ae902df9ab722c56fbb2e0f6a Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Thu, 15 Feb 2024 11:36:46 +0100 Subject: [PATCH 11/13] remove files --- ehrdata_source.py | 1313 -------------------------------------------- omop_conversion.py | 246 --------- 2 files changed, 1559 deletions(-) delete mode 100644 ehrdata_source.py delete mode 100644 omop_conversion.py diff --git a/ehrdata_source.py b/ehrdata_source.py deleted file mode 100644 index 11eceba..0000000 --- a/ehrdata_source.py +++ /dev/null @@ -1,1313 +0,0 @@ -import awkward as ak -import numpy as np -import pandas as pd -import csv -import pandas as pd -import matplotlib.pyplot as plt -import seaborn as sns -import ehrapy as ep -import scanpy as sc -from anndata import AnnData -import mudata as md -from mudata import MuData -from typing import List, Union, Literal, Optional -import os -import glob -import dask.dataframe as dd -from thefuzz import process -import sys -from rich import print as rprint -import missingno as msno -import warnings -import numbers -import os -from pandas.tseries.offsets import DateOffset as Offset - -import anndata as ad -from collections.abc import Collection, Iterable, Mapping, Sequence -from enum import Enum -from functools import partial -from types import MappingProxyType -from typing import TYPE_CHECKING, Any, Callable, Literal, Union - -import scanpy as sc -from scanpy.plotting import DotPlot, MatrixPlot, StackedViolin -from matplotlib.axes import Axes - -from difflib import SequenceMatcher -from heapq import nlargest as _nlargest - - -pth = 'auxillary_files/OMOP_CDMv5.4_Field_Level.csv' -field_level = pd.read_csv(pth) -dtype_mapping = {'integer': "Int64", - 'Integer': "Int64", - 'float': float, - 'bigint': "Int64", - 'varchar(MAX)': str, - 'varchar(2000)': str, - 'varchar(1000)': str, - 'varchar(255)': str, - 'varchar(250)': str, - 'varchar(80)': str, - 'varchar(60)': str, - 'varchar(50)': str, - 'varchar(25)': str, - 'varchar(20)': str, - 'varchar(10)': str, - 'varchar(9)': str, - 'varchar(3)': str, - 'varchar(2)': str, - 'varchar(1)': str, - 'datetime': object, - 'date': object} - - - -def get_close_matches_using_dict(word, possibilities, n=2, cutoff=0.6): - """Use SequenceMatcher to return a list of the indexes of the best - "good enough" matches. word is a sequence for which close matches - are desired (typically a string). - possibilities is a dictionary of sequences. - Optional arg n (default 2) is the maximum number of close matches to - return. n must be > 0. - Optional arg cutoff (default 0.6) is a float in [0, 1]. Possibilities - that don't score at least that similar to word are ignored. 
- """ - - if not n > 0: - raise ValueError("n must be > 0: %r" % (n,)) - if not 0.0 <= cutoff <= 1.0: - raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff,)) - result = [] - s = SequenceMatcher() - s.set_seq2(word) - for _, (key, value) in enumerate(possibilities.items()): - s.set_seq1(value) - if s.real_quick_ratio() >= cutoff and s.quick_ratio() >= cutoff and s.ratio() >= cutoff: - result.append((s.ratio(), value, key)) - - # Move the best scorers to head of list - result = _nlargest(n, result) - - # Strip scores for the best n matches - return [(value, key, score) for score, value, key in result] - - -def df_to_dict(df, key, value): - if isinstance(df, dd.DataFrame): - return pd.Series(df[value].compute().values, index=df[key].compute()).to_dict() - else: - return pd.Series(df[value].values, index=df[key]).to_dict() - - -def check_csv_has_only_header(file_path): - if file_path.endswith('csv'): - with open(file_path, 'r') as file: - reader = csv.reader(file) - header = next(reader, None) # Read the header - if header is not None: - second_row = next(reader, None) # Try to read the next row - return second_row is None # If there's no second row, return True - else: - return False # File is empty or not a valid CSV - else: - return False - - -class OMOP: - def __init__(self, folder_path, delimiter=None, make_filename_lowercase=True, use_dask=False): - self.base = folder_path - self.delimiter = delimiter - self.use_dask = use_dask - # TODO support also parquet and other formats - file_list = glob.glob(os.path.join(folder_path, "*.csv")) + glob.glob(os.path.join(folder_path, "*.parquet")) - self.loaded_tabel = None - self.filepath = {} - for file_path in file_list: - file_name = file_path.split("/")[-1].split(".")[0] - if check_csv_has_only_header(file_path): - pass - else: - # Rename the file - if make_filename_lowercase: - new_filepath = os.path.join(self.base, file_path.split("/")[-1].lower()) - if file_path != new_filepath: - warnings(f"Rename file [file_path] to [new_filepath]") - os.rename(file_path, new_filepath) - self.filepath[file_name] = new_filepath - else: - self.filepath[file_name] = file_path - self.check_with_omop_cdm() - self.tables = list(self.filepath.keys()) - - """ - if "concept" in self.tables: - df_concept = dd.read_csv(self.filepath["concept"], usecols=vocabularies_tables_columns["concept"]) - self.concept_id_to_name = dict(zip(df_concept['id'], df_concept['name'])) - self.concept_name_to_id = dict(zip(df_concept['name'], df_concept['id'])) - """ - - def __repr__(self) -> str: - # TODO this should be seperated by diff table categories - def format_tables(tables, max_line_length=80): - line = "" - for table in tables: - # Check if adding the next table would exceed the max line length - if len(line) + len(table) > max_line_length: - # Yield the current line and start a new one - yield line - line = table - else: - # Add the table to the current line - line += table if line == "" else ", " + table - # Yield the last line - yield line - - tables_str = "\n".join(format_tables(self.tables)) - return f'OMOP object ({os.path.basename(self.base)}) with {len(self.tables)} tables.\nTables: {tables_str}' - - def set_path(self, table_name, file_path): - # TODO move to init - self.tables.append(table_name) - self.filepath[table_name] = file_path - - def check_with_omop_cdm(self): - for file_name, path in self.filepath.items(): - if file_name not in set(field_level.cdmTableName): - raise KeyError(f"Table [{file_name}] is not defined in OMOP CDM v5.4! 
Please change the table name manually!") - # If not a single file, read the first one - if not os.path.isfile(path): - folder_walk = os.walk(path) - first_file_in_folder = next(folder_walk)[2][0] - path = os.path.join(path, first_file_in_folder) - - if path.endswith('csv'): - with open(path, "r") as f: - dict_reader = csv.DictReader(f, delimiter=self.delimiter) - columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith('parquet'): - df = dd.read_parquet(path) - columns = list(df.columns) - else: - raise TypeError("Only support CSV and Parquet file!") - columns_lowercase = [column.lower() for column in columns] - - invalid_column_name = [] - for _, column in enumerate(columns_lowercase): - cdm_columns = set(field_level[field_level.cdmTableName == file_name]['cdmFieldName']) - if column not in cdm_columns: - invalid_column_name.append(column) - if len(invalid_column_name) > 0: - print(f"Column {invalid_column_name} is not defined in Table [{file_name}] in OMOP CDM v5.4! Please change the column name manually!\nFor more information, please refer to: https://ohdsi.github.io/CommonDataModel/cdm54.html#{file_name.upper()}") - raise KeyError - - - - def _get_column_types(self, - path: str = None, - filename: str = None): - column_types = {} - # If not a single file, read the first one - if not os.path.isfile(path): - folder_walk = os.walk(path) - first_file_in_folder = next(folder_walk)[2][0] - path = os.path.join(path, first_file_in_folder) - - if path.endswith('csv'): - with open(path, "r") as f: - dict_reader = csv.DictReader(f, delimiter=self.delimiter) - columns = dict_reader.fieldnames - columns = list(filter(None, columns)) - elif path.endswith('parquet'): - df = dd.read_parquet(path) - columns = list(df.columns) - else: - raise TypeError("Only support CSV and Parquet file!") - columns_lowercase = [column.lower() for column in columns] - for _, column in enumerate(columns_lowercase): - column_types[column] = dtype_mapping[field_level[(field_level.cdmTableName == filename) & (field_level.cdmFieldName == column)]['cdmDatatype'].values[0]] - return column_types - - def _read_table(self, path, dtype=None, parse_dates=None, index=None, usecols=None, use_dask=False, **kwargs): - - if use_dask: - if not os.path.isfile(path): - folder_walk = os.walk(path) - filetype = next(folder_walk)[2][0].split(".")[-1] - else: - filetype = path.split(".")[-1] - if filetype == 'csv': - if not os.path.isfile(path): - path = f"{path}/*.csv" - if usecols: - dtype = {key: dtype[key] for key in usecols if key in dtype} - if parse_dates: - parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = dd.read_csv(path, delimiter=self.delimiter, dtype=dtype, parse_dates=parse_dates, usecols=usecols) - elif filetype == 'parquet': - if not os.path.isfile(path): - path = f"{path}/*.parquet" - if usecols: - dtype = {key: dtype[key] for key in usecols if key in dtype} - if parse_dates: - parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = dd.read_parquet(path, dtype=dtype, parse_dates=parse_dates, columns=usecols) - else: - raise TypeError("Only support CSV and Parquet file!") - else: - if not os.path.isfile(path): - raise TypeError("Only support reading a single file!") - filetype = path.split(".")[-1] - if filetype == 'csv': - if usecols: - dtype = {key: dtype[key] for key in usecols if key in dtype} - if parse_dates: - parse_dates = {key: parse_dates[key] for key in usecols if key in parse_dates} - df = pd.read_csv(path, 
delimiter=self.delimiter, dtype=dtype, parse_dates=parse_dates, usecols=usecols) - elif filetype == 'parquet': - df = pd.read_parquet(path, columns=usecols) - else: - raise TypeError("Only support CSV and Parquet file!") - - if index: - df = df.set_index(index) - return df - - # TODO redo this using omop cdm csv file - @property - def table_catalog(self): - """ - A dictionary containing all of the ``Clinical`` OMOP CDM tables in the connected database. - """ - table_catalog_dict = {} - table_catalog_dict['Clinical data'] = [ - "person", - "observation_period", - "specimen", - "death", - "visit_occurrence", - "visit_detail", - "procedure_occurrence", - "drug_exposure", - "device_exposure", - "condition_occurrence", - "measurement", - "note", - "note_nlp", - "observation", - "fact_relationship", - ] - - table_catalog_dict["Health system data"] = ["location", "care_site", "provider"] - table_catalog_dict["Health economics data"] = ["payer_plan_period", "cost"] - table_catalog_dict["Standardized derived elements"] = ["cohort", "cohort_definition", "drug_era", "dose_era", "condition_era"] - table_catalog_dict["Metadata"] = ["cdm_source", "metadata"] - table_catalog_dict["Vocabulary"] = [ - "concept", - "vocabulary", - "domain", - "concept_class", - "concept_relationship", - "relationship", - "concept_synonym", - "concept_ancestor", - "source_to_concept_map", - "drug_strength", - ] - self._table_catalog_dict = table_catalog_dict - - def load(self, level="stay_level", tables=["visit_occurrence", "person", "death"], remove_empty_column=True): - # TODO patient level and hospital level - if level == "stay_level": - index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} - # TODO Only support clinical_tables_columns - - for table in tables: - print(f"reading table [{table}]") - column_types = self._get_column_types(path = self.filepath[table], filename=table) - df = self._read_table(self.filepath[table], dtype=column_types, index='person_id') # TODO parse_dates = parse_dates - if remove_empty_column: - # TODO dask Support - #columns = [column for column in df.columns if not df[column].compute().isna().all()] - columns = [column for column in df.columns if not df[column].isna().all()] - df = df.loc[:, columns] - setattr(self, table, df) - - # concept_id_list = list(self.concept.concept_id) - # concept_name_list = list(self.concept.concept_id) - # concept_domain_id_list = list(set(self.concept.domain_id)) - - # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] - # TODO dask Support - joined_table = pd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how="left") - - joined_table = pd.merge(joined_table, self.death, left_index=True, right_index=True, how="left") - - # TODO dask Support - #joined_table = joined_table.compute() - - # TODO check this earlier - joined_table = joined_table.drop_duplicates(subset='visit_occurrence_id') - joined_table = joined_table.set_index("visit_occurrence_id") - # obs_only_list = list(self.joined_table.columns) - # obs_only_list.remove('visit_occurrence_id') - columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"])) - adata = ep.ad.df_to_anndata( - joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only - ) - # TODO this needs to be fixed because anndata set obs index as string by default - #adata.obs.index = adata.obs.index.astype(int) - - """ - for column in 
self.measurement.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.drug_exposure.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.observation.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - """ - - return adata - - def feature_counts( - self, - source: Literal[ - "observation", - "measurement", - "procedure_occurrence", - "specimen", - "device_exposure", - "drug_exposure", - "condition_occurrence", - ], - number=20, - key = None - ): - - if source == 'measurement': - columns = ["value_as_number", "time", "visit_occurrence_id", "measurement_concept_id"] - elif source == 'observation': - columns = ["value_as_number", "value_as_string", "measurement_datetime"] - elif source == 'condition_occurrence': - columns = None - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - column_types = self._get_column_types(path = self.filepath[source], filename=source) - df_source = self._read_table(self.filepath[source], dtype=column_types, usecols=[f"{source}_concept_id"], use_dask=True) - # TODO dask Support - #feature_counts = df_source[f"{source}_concept_id"].value_counts().compute()[0:number] - feature_counts = df_source[f"{source}_concept_id"].value_counts().compute() - feature_counts = feature_counts.to_frame().reset_index(drop=False)[0:number] - - - feature_counts[f"{source}_concept_id_1"], feature_counts[f"{source}_concept_id_2"] = self.map_concept_id( - feature_counts[f"{source}_concept_id"], verbose=False - ) - feature_counts["feature_name"] = self.get_concept_name(feature_counts[f"{source}_concept_id_1"]) - if feature_counts[f"{source}_concept_id_1"].equals(feature_counts[f"{source}_concept_id_2"]): - feature_counts.drop(f"{source}_concept_id_2", axis=1, inplace=True) - feature_counts.rename(columns={f"{source}_concept_id_1": f"{source}_concept_id"}) - feature_counts = feature_counts.reindex(columns=["feature_name", f"{source}_concept_id", "count"]) - else: - feature_counts = feature_counts.reindex( - columns=["feature_name", f"{source}_concept_id_1", f"{source}_concept_id_2", "count"] - ) - - ax = sns.barplot(feature_counts, x="feature_name", y="count") - ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right") - plt.tight_layout() - return feature_counts - - def map_concept_id(self, concept_id: Union[str, List], verbose=True): - if isinstance(concept_id, numbers.Integral): - concept_id = [concept_id] - concept_id_1 = [] - concept_id_2 = [] - concept_id_mapped_not_found = [] - - if "concept_relationship" in self.tables: - column_types = self._get_column_types(path = self.filepath["concept_relationship"], filename="concept_relationship") - df_concept_relationship = self._read_csv( - self.filepath["concept_relationship"], dtype=column_types - ) - # TODO dask Support - #df_concept_relationship.compute().dropna(subset=["concept_id_1", "concept_id_2", 
"relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], - df_concept_relationship.dropna(subset=["concept_id_1", "concept_id_2", "relationship_id"], inplace=True) # , usecols=vocabularies_tables_columns["concept_relationship"], - concept_relationship_dict = df_to_dict( - df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Maps to"], - key="concept_id_1", - value="concept_id_2", - ) - concept_relationship_dict_reverse = df_to_dict( - df=df_concept_relationship[df_concept_relationship["relationship_id"] == "Mapped from"], - key="concept_id_1", - value="concept_id_2", - ) - for id in concept_id: - try: - concept_id_2.append(concept_relationship_dict[id]) - concept_id_1.append(id) - except KeyError: - try: - concept_id_1.append(concept_relationship_dict_reverse[id]) - concept_id_2.append(id) - except KeyError: - concept_id_1.append(id) - concept_id_2.append(id) - concept_id_mapped_not_found.append(id) - if len(concept_id_mapped_not_found) > 0: - # warnings.warn(f"Couldn't find a map for concept {id} in concept_relationship table!") - if verbose: - rprint(f"Couldn't find a map for concept {concept_id_mapped_not_found} in concept_relationship table!") - else: - concept_id_1 = concept_id - concept_id_2 = concept_id - - if len(concept_id_1) == 1: - return concept_id_1[0], concept_id_2[0] - else: - return concept_id_1, concept_id_2 - - def get_concept_name(self, concept_id: Union[str, List], raise_error=False, verbose=True): - if isinstance(concept_id, numbers.Integral): - concept_id = [concept_id] - - column_types = self._get_column_types(path = self.filepath["concept"], filename="concept") - df_concept = self._read_table(self.filepath["concept"], dtype=column_types) - # TODO dask Support - #df_concept.compute().dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] - df_concept.dropna(subset=["concept_id", "concept_name"], inplace=True, ignore_index=True) # usecols=vocabularies_tables_columns["concept"] - concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") - concept_name = [] - concept_name_not_found = [] - for id in concept_id: - try: - concept_name.append(concept_dict[id]) - except KeyError: - concept_name.append(id) - concept_name_not_found.append(id) - if len(concept_name_not_found) > 0: - # warnings.warn(f"Couldn't find concept {id} in concept table!") - if verbose: - rprint(f"Couldn't find concept {concept_name_not_found} in concept table!") - if raise_error: - raise KeyError - if len(concept_name) == 1: - return concept_name[0] - else: - return concept_name - - def extract_note(self, adata, source="note"): - column_types = self._get_column_types(path = self.filepath[source], filename=source) - df_source = dd.read_csv(self.filepath[source], dtype=column_types) - if columns is None: - columns = df_source.columns - obs_dict = [ - { - column: list(df_source[df_source["visit_occurrence_id"] == int(visit_occurrence_id)][column]) - for column in columns - } - for visit_occurrence_id in adata.obs.index - ] - adata.obsm["note"] = ak.Array(obs_dict) - return adata - - def note_nlp_map( - self, - ): - # Got some inspirations from: https://github.com/aws-samples/amazon-comprehend-medical-omop-notes-mapping - pass - - - def get_feature_info( - self, - adata, - source: Literal[ - "observation", - "measurement", - "procedure_occurrence", - "specimen", - "device_exposure", - "drug_exposure", - "condition_occurrence", - ], - features: str or 
int or List[Union[str, int]] = None, - key: str = None, - ignore_not_shown_in_concept_table: bool = True, - exact_match: bool = True, - - verbose: bool = False, - ): - if key is None: - if source in ["measurement", "observation", "specimen"]: - key = f"{source}_concept_id" - elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: - key = f"{source.split('_')[0]}_concept_id" - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - if isinstance(features, str): - features = [features] - rprint(f"Trying to extarct the following features: {features}") - - # Input could be feature names/feature id (concept id) - # First convert all input feaure names into feature id. Map concept using CONCEPT_RELATIONSHIP table if required. - # Then try to extract feature data from source table using feature id. - - # TODO support features name - - if "concept" in self.tables: - column_types = self._get_column_types(path = self.filepath["concept"], filename="concept") - df_concept = self._read_table(self.filepath["concept"], dtype=column_types).dropna( - subset=["concept_id", "concept_name"] - ) # usecols=vocabularies_tables_columns["concept"], - concept_dict = df_to_dict(df=df_concept, key="concept_id", value="concept_name") - - # TODO query this in the table - - feature_id_list = [] - feature_name_list = [] - domain_id_list = [] - concept_class_id_list = [] - concept_code_list = [] - - fetures_not_shown_in_concept_table = [] - - info_df = pd.DataFrame([]) - # Get feature id for each input, and check if each feature occurs in the concept table - for feature in features: - # if the input is feature ID - if isinstance(feature, numbers.Integral): - feature_id = feature - feature_id_1, feature_id_2 = self.map_concept_id(feature_id, verbose=False) - try: - feature_name = self.get_concept_name(feature_id_1, raise_error=True, verbose=False) - except KeyError: - if ignore_not_shown_in_concept_table: - fetures_not_shown_in_concept_table.append(feature) - continue - else: - rprint(f"Feature ID - [red]{feature_id_1}[/] could not be found in concept table") - raise - match_score = 1 - - # if the input is feature name - elif isinstance(feature, str): - # return a list of (value, key, score) - result = get_close_matches_using_dict(feature, concept_dict, n=2, cutoff=0.2) - - # if find 2 best matches - if len(result) == 2: - match_score = result[0][2] - - if match_score != 1: - if exact_match: - rprint( - f"Unable to find an exact match for [red]{feature}[/] in the concept table. Similar ones: 1) [red]{result[0][0]}[/] 2) [red]{result[1][0]}" - ) - raise ValueError - else: - if result[1][1] == 1: - rprint( - f"Found multiple exact matches for [red]{feature}[/] in the concept table: 1) concept id: [red]{result[0][1]}[/] 2) concept id: [red]{result[1][1]}[/]. It is better to specify concept id directly." 
- ) - raise ValueError - feature_name = feature - feature_id = result[0][1] - # if only find 1 match - else: - feature_name = result[0][0] - match_score = result[0][1] - feature_id = result[0][2] - if exact_match and match_score != 1: - rprint( - f"Unable to find an exact match for [red]{feature}[/] in the concept table Similar one is [red]{result[0][0]}" - ) - raise ValueError - feature_id_1, feature_id_2 = self.map_concept_id(feature_id) - - else: - rprint( - "Please input either [red]feature name (string)[/] or [red]feature id (integer)[/] you want to extarct" - ) - raise TypeError - - info_df = pd.concat([info_df, pd.DataFrame(data=[[feature_name, feature_id_1, feature_id_2]], columns=['feature_name', 'feature_id_1', 'feature_id_2'])]) - - - # feature_name_list.append(feature_name) - # domain_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "domain_id"].reset_index(drop=True).compute()[0]) - # concept_class_id_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_class_id"].reset_index(drop=True).compute()[0]) - # concept_code_list.append(df_concept.loc[df_concept["concept_id"] == feature_id, "concept_code"].reset_index(drop=True).compute()[0]) - - if verbose: - """ - if map_concept: - rprint( - f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, feature ID [green]{concept_id}[/] in concept relationship table, match socre = [green]{match_score}." - ) - else: - """ - rprint( - f"Detected: feature [green]{feature_name}[/], feature ID [green]{feature_id}[/] in concept table, match socre = [green]{match_score}." - ) - if info_df[f"feature_id_1"].equals(info_df[f"feature_id_2"]): - info_df.drop(f"feature_id_2", axis=1, inplace=True) - info_df = info_df.rename(columns={"feature_id_1": "feature_id"}) - info_df = info_df.reset_index(drop=True) - else: - info_df = info_df.reset_index(drop=True) - return info_df - - def get_feature_statistics( - self, - adata, - source: Literal[ - "observation", - "measurement", - "procedure_occurrence", - "specimen", - "device_exposure", - "drug_exposure", - "condition_occurrence", - ], - features: str or int or List[Union[str, int]] = None, - level="stay_level", - value_col: str = 'value_source_value', - aggregation_methods: Union[Literal["min", "max", "mean", "std", "count"], List[Literal["min", "max", "mean", "std", "count"]]]=None, - add_aggregation_to_X: bool = True, - verbose: bool = False, - use_dask: bool = None, - ): - if source in ["measurement", "observation", "specimen"]: - key = f"{source}_concept_id" - elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: - key = f"{source.split('_')[0]}_concept_id" - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - if source == 'measurement': - source_table_columns = ['visit_occurrence_id', 'measurement_datetime', key, value_col] - elif source == 'observation': - source_table_columns = ['visit_occurrence_id', "observation_datetime", key, value_col] - elif source == 'condition_occurrence': - source_table_columns = None - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - if use_dask is None: - use_dask = self.use_dask - source_column_types = self._get_column_types(path = self.filepath[source], filename=source) - df_source = self._read_table(self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask) - info_df = self.get_feature_info(adata, source=source, features=features, 
verbose=False) - info_dict = info_df[['feature_id', 'feature_name']].set_index('feature_id').to_dict()['feature_name'] - - # Select featrues - df_source = df_source[df_source[key].isin(list(info_df.feature_id))] - #TODO Select time - #da_measurement = da_measurement[(da_measurement.time >= 0) & (da_measurement.time <= 48*60*60)] - #df_source[f'{source}_name'] = df_source[key].map(info_dict) - if aggregation_methods is None: - aggregation_methods = ["min", "max", "mean", "std", "count"] - if level == 'stay_level': - result = df_source.groupby(['visit_occurrence_id', key]).agg({ - value_col: aggregation_methods}) - - if use_dask: - result = result.compute() - result = result.reset_index(drop=False) - result.columns = ["_".join(a) for a in result.columns.to_flat_index()] - result.columns = result.columns.str.removesuffix('_') - result.columns = result.columns.str.removeprefix(f'{value_col}_') - result[f'{source}_name'] = result[key].map(info_dict) - - df_statistics = result.pivot(index='visit_occurrence_id', - columns=f'{source}_name', - values=aggregation_methods) - df_statistics.columns = df_statistics.columns.swaplevel() - df_statistics.columns = ["_".join(a) for a in df_statistics.columns.to_flat_index()] - - - # TODO - sort_columns = True - if sort_columns: - new_column_order = [] - for feature in features: - for suffix in (f'_{aggregation_method}' for aggregation_method in aggregation_methods): - col_name = f'{feature}{suffix}' - if col_name in df_statistics.columns: - new_column_order.append(col_name) - - df_statistics.columns = new_column_order - - df_statistics.index = df_statistics.index.astype(str) - - adata.obs = pd.merge(adata.obs, df_statistics, how='left', left_index=True, right_index=True) - - if add_aggregation_to_X: - adata = ep.ad.move_to_x(adata, list(df_statistics.columns)) - return adata - - - def extract_features( - self, - adata, - source: Literal[ - "observation", - "measurement", - "procedure_occurrence", - "specimen", - "device_exposure", - "drug_exposure", - "condition_occurrence", - ], - features: str or int or List[Union[str, int]] = None, - source_table_columns: Union[str, List[str]] = None, - dropna: Optional[bool] = True, - verbose: Optional[bool] = True, - use_dask: bool = None, - ): - - if source in ["measurement", "observation", "specimen"]: - key = f"{source}_concept_id" - elif source in ["device_exposure", "procedure_occurrence", "drug_exposure", "condition_occurrence"]: - key = f"{source.split('_')[0]}_concept_id" - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - - if source_table_columns is None: - if source == 'measurement': - source_table_columns = ['visit_occurrence_id', 'measurement_datetime', 'value_as_number', key] - elif source == 'observation': - source_table_columns = ['visit_occurrence_id', "value_as_number", "value_as_string", "observation_datetime", key] - elif source == 'condition_occurrence': - source_table_columns = None - else: - raise KeyError(f"Extracting data from {source} is not supported yet") - if use_dask is None: - use_dask = self.use_dask - - - # TODO load using Dask or Dask-Awkward - # Load source table using dask - source_column_types = self._get_column_types(path = self.filepath[source], filename=source) - df_source = self._read_table(self.filepath[source], dtype=source_column_types, usecols=source_table_columns, use_dask=use_dask) - info_df = self.get_feature_info(adata, source=source, features=features, verbose=False) - info_dict = info_df[['feature_id', 
'feature_name']].set_index('feature_id').to_dict()['feature_name'] - - - # Select featrues - df_source = df_source[df_source[key].isin(list(info_df.feature_id))] - - # TODO select time period - #df_source = df_source[(df_source.time >= 0) & (df_source.time <= 48*60*60)] - #da_measurement['measurement_name'] = da_measurement.measurement_concept_id.replace(info_dict) - - # TODO dask caching - """ - from dask.cache import Cache - cache = Cache(2e9) - cache.register() - """ - if use_dask: - if dropna == True: - df_source = df_source.compute().dropna() - else: - df_source = df_source.compute() - else: - if dropna == True: - df_source = df_source.dropna() - - # Preprocess steps outside the loop - unique_visit_occurrence_ids = set(adata.obs.index.astype(int)) - empty_entry = {source_table_column: [] for source_table_column in source_table_columns if source_table_column not in [key, 'visit_occurrence_id'] } - - # Filter data once, if possible - filtered_data = { - feature_id: df_source[df_source[key] == feature_id] - for feature_id in set(info_dict.keys()) - } - - for feature_id in set(info_dict.keys()): - df_feature = filtered_data[feature_id][list(set(source_table_columns) - set([key]))] - grouped = df_feature.groupby("visit_occurrence_id") - if verbose: - print(f"Adding feature [{info_dict[feature_id]}] into adata.obsm") - - # Use set difference and intersection more efficiently - feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) - - # Creating the array more efficiently - adata.obsm[info_dict[feature_id]] = ak.Array([ - grouped.get_group(visit_occurrence_id)[list(set(source_table_columns) - set([key, 'visit_occurrence_id']))].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry - for visit_occurrence_id in unique_visit_occurrence_ids - ]) - - return adata - - - def drop_nan(self, - adata, - key: Union[str, List[str]], - slot: Union[str, None] = 'obsm', - ): - if isinstance(key, str): - key_list = [key] - else: - key_list = key - if slot == 'obsm': - for key in key_list: - ak_array = adata.obsm[key] - - # Update the combined mask based on the presence of None in each field - for i, field in enumerate(ak_array.fields): - field_mask = ak.is_none(ak.nan_to_none(ak_array[field]), axis=1) - if i==0: - combined_mask = ak.full_like(field_mask, fill_value=False, dtype=bool) - combined_mask = combined_mask | field_mask - ak_array = ak_array[~combined_mask] - adata.obsm[key] = ak_array - - return adata - - # downsampling - def aggregate_timeseries_in_bins(self, - adata, - features: Union[str, List[str]], - slot: Union[str, None] = 'obsm', - value_key: str = 'value_as_number', - time_key: str = 'measurement_datetime', - time_binning_method: Literal["floor", "ceil", "round"] = "floor", - bin_size: Union[str, Offset] = 'h', - aggregation_method: Literal['median', 'mean', 'min', 'max'] = 'median', - time_upper_bound: int = 48# TODO - ): - - if isinstance(features, str): - features_list = [features] - else: - features_list = features - - # Ensure the time_binning_method provided is one of the expected methods - if time_binning_method not in ["floor", "ceil", "round"]: - raise ValueError(f"time_binning_method {time_binning_method} is not supported. Choose from 'floor', 'ceil', or 'round'.") - - if aggregation_method not in {'median', 'mean', 'min', 'max'}: - raise ValueError(f"aggregation_method {aggregation_method} is not supported. 
Choose from 'median', 'mean', 'min', or 'max'.") - - if slot == 'obsm': - for feature in features_list: - print(f"processing feature [{feature}]") - df = self.to_dataframe(adata, features) - if pd.api.types.is_datetime64_any_dtype(df[time_key]): - func = getattr(df[time_key].dt, time_binning_method, None) - if func is not None: - df[time_key] = func(bin_size) - else: - # TODO need to take care of this if it doesn't follow omop standard - if bin_size == 'h': - df[time_key] = df[time_key] / 3600 - func = getattr(np, time_binning_method) - df[time_key] = func(df[time_key]) - - df[time_key] = df[time_key].astype(str) - # Adjust time values that are equal to the time_upper_bound - #df.loc[df[time_key] == time_upper_bound, time_key] = time_upper_bound - 1 - - # Group and aggregate data - df = df.groupby(["visit_occurrence_id", time_key])[value_key].agg(aggregation_method).reset_index(drop=False) - grouped = df.groupby("visit_occurrence_id") - - unique_visit_occurrence_ids = adata.obs.index - empty_entry = {value_key: [], time_key: []} - - # Efficiently use set difference and intersection - feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) - # Efficiently create the array - ak_array = ak.Array([ - grouped.get_group(visit_occurrence_id)[[value_key, time_key]].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry - for visit_occurrence_id in unique_visit_occurrence_ids - ]) - adata.obsm[feature] = ak_array - - return adata - - def timeseries_discretizer(self, - adata, - key: Union[str, List[str]], - slot: Union[str, None] = 'obsm', - value_key: str = 'value_as_number', - time_key: str = 'measurement_datetime', - freq: str = 'hour', #TODO - time_limit: int = 48, #TODO - method: str = 'median' #TODO - ): - - pass - - - - def from_dataframe( - self, - adata, - feature: str, - df - ): - grouped = df.groupby("visit_occurrence_id") - unique_visit_occurrence_ids = set(adata.obs.index) - - # Use set difference and intersection more efficiently - feature_ids = unique_visit_occurrence_ids.intersection(grouped.groups.keys()) - empty_entry = {source_table_column: [] for source_table_column in set(df.columns) if source_table_column not in ['visit_occurrence_id'] } - - # Creating the array more efficiently - ak_array = ak.Array([ - grouped.get_group(visit_occurrence_id)[list(set(df.columns) - set(['visit_occurrence_id']))].to_dict(orient='list') if visit_occurrence_id in feature_ids else empty_entry - for visit_occurrence_id in unique_visit_occurrence_ids]) - adata.obsm[feature] = ak_array - - return adata - - # TODO add function to check feature and add concept - # More IO functions - def to_dataframe( - self, - adata, - features: Union[str, List[str]], # TODO also support list of features - # patient str or List, # TODO also support subset of patients/visit - ): - # TODO - # can be viewed as patient level - only select some patient - # TODO change variable name here - if isinstance(features, str): - features = [features] - df_concat = pd.DataFrame([]) - for feature in features: - df = ak.to_dataframe(adata.obsm[feature]) - - df.reset_index(drop=False, inplace=True) - df["entry"] = adata.obs.index[df["entry"]] - df = df.rename(columns={"entry": "visit_occurrence_id"}) - del df["subentry"] - for col in df.columns: - if col.endswith('time'): - df[col] = pd.to_datetime(df[col]) - - df['feature_name'] = feature - df_concat = pd.concat([df_concat, df], axis= 0) - - - return df_concat - - - def plot_timeseries(self, - adata, - visit_occurrence_id: int, - key: 
Union[str, List[str]], - slot: Union[str, None] = 'obsm', - value_key: str = 'value_as_number', - time_key: str = 'measurement_datetime', - x_label: str = None - ): - - - if isinstance(key, str): - key_list = [key] - else: - key_list = key - - # Initialize min_x and max_x - min_x = None - max_x = None - - if slot == 'obsm': - fig, ax = plt.subplots(figsize=(20, 6)) - # Scatter plot - for i, key in enumerate(key_list): - df = self.to_dataframe(adata, key) - x = df[df.visit_occurrence_id == visit_occurrence_id][time_key] - y = df[df.visit_occurrence_id == visit_occurrence_id][value_key] - - # Check if x is empty - if not x.empty: - ax.scatter(x=x, y=y, label=key) - ax.legend(loc=9, bbox_to_anchor=(0.5, -0.1), ncol=len(key_list), prop={"size": 14}) - - ax.plot(x, y) - - - if min_x is None or min_x > x.min(): - min_x = x.min() - if max_x is None or max_x < x.max(): - max_x = x.max() - - - else: - # Skip this iteration if x is empty - continue - - if min_x is not None and max_x is not None: - - # Adapt this to input data - # TODO step - #plt.xticks(np.arange(min_x, max_x, step=1)) - # Adapt this to input data - plt.xlabel(x_label if x_label else "Hours since ICU admission") - - plt.show() - - - def violin( - self, - adata: AnnData, - obsm_key: str = None, - keys: Union[str, Sequence[str]] = None, - groupby: Optional[str] = None, - log: Optional[bool] = False, - use_raw: Optional[bool] = None, - stripplot: bool = True, - jitter: Union[float, bool] = True, - size: int = 1, - layer: Optional[str] = None, - scale: Literal["area", "count", "width"] = "width", - order: Optional[Sequence[str]] = None, - multi_panel: Optional[bool] = None, - xlabel: str = "", - ylabel: Union[str, Sequence[str]] = None, - rotation: Optional[float] = None, - show: Optional[bool] = None, - save: Union[bool, str] = None, - ax: Optional[Axes] = None, - **kwds, - ): # pragma: no cover - """Violin plot. - - Wraps :func:`seaborn.violinplot` for :class:`~anndata.AnnData`. - - Args: - adata: :class:`~anndata.AnnData` object object containing all observations. - keys: Keys for accessing variables of `.var_names` or fields of `.obs`. - groupby: The key of the observation grouping to consider. - log: Plot on logarithmic axis. - use_raw: Whether to use `raw` attribute of `adata`. Defaults to `True` if `.raw` is present. - stripplot: Add a stripplot on top of the violin plot. See :func:`~seaborn.stripplot`. - jitter: Add jitter to the stripplot (only when stripplot is True) See :func:`~seaborn.stripplot`. - size: Size of the jitter points. - layer: Name of the AnnData object layer that wants to be plotted. By - default adata.raw.X is plotted. If `use_raw=False` is set, - then `adata.X` is plotted. If `layer` is set to a valid layer name, - then the layer is plotted. `layer` takes precedence over `use_raw`. - scale: The method used to scale the width of each violin. - If 'width' (the default), each violin will have the same width. - If 'area', each violin will have the same area. - If 'count', a violin’s width corresponds to the number of observations. - order: Order in which to show the categories. - multi_panel: Display keys in multiple panels also when `groupby is not None`. - xlabel: Label of the x axis. Defaults to `groupby` if `rotation` is `None`, otherwise, no label is shown. - ylabel: Label of the y axis. If `None` and `groupby` is `None`, defaults to `'value'`. - If `None` and `groubpy` is not `None`, defaults to `keys`. - rotation: Rotation of xtick labels. 
- {show_save_ax} - **kwds: - Are passed to :func:`~seaborn.violinplot`. - - Returns: - A :class:`~matplotlib.axes.Axes` object if `ax` is `None` else `None`. - - Example: - .. code-block:: python - - import ehrapy as ep - - adata = ep.dt.mimic_2(encoded=True) - ep.pp.knn_impute(adata) - ep.pp.log_norm(adata, offset=1) - ep.pp.neighbors(adata) - ep.tl.leiden(adata, resolution=0.5, key_added="leiden_0_5") - ep.pl.violin(adata, keys=["age"], groupby="leiden_0_5") - - Preview: - .. image:: /_static/docstring_previews/violin.png - """ - - if obsm_key: - df = self.to_dataframe(adata, features=obsm_key) - df = df[["visit_occurrence_id", "value_as_number"]] - df = df.rename(columns = {"value_as_number": obsm_key}) - - if groupby: - df = df.set_index('visit_occurrence_id').join(adata.obs[groupby].to_frame()).reset_index(drop=False) - adata = ep.ad.df_to_anndata(df, columns_obs_only=['visit_occurrence_id', groupby]) - else: - adata = ep.ad.df_to_anndata(df, columns_obs_only=['visit_occurrence_id']) - keys=obsm_key - - violin_partial = partial( - sc.pl.violin, - keys=keys, - log=log, - use_raw=use_raw, - stripplot=stripplot, - jitter=jitter, - size=size, - layer=layer, - scale=scale, - order=order, - multi_panel=multi_panel, - xlabel=xlabel, - ylabel=ylabel, - rotation=rotation, - show=show, - save=save, - ax=ax, - **kwds,) - - return violin_partial(adata=adata, groupby=groupby) - - - def qc_lab_measurements( - self, - adata: AnnData, - reference_table: pd.DataFrame = None, - measurements: list[str] = None, - obsm_measurements: list[str] = None, - action: Literal["remove"] = "remove", - unit: Literal["traditional", "SI"] = None, - layer: str = None, - threshold: int = 20, - age_col: str = None, - age_range: str = None, - sex_col: str = None, - sex: str = None, - ethnicity_col: str = None, - ethnicity: str = None, - copy: bool = False, - verbose: bool = False, - ) -> AnnData: - - if copy: - adata = adata.copy() - - preprocessing_dir = '/Users/xinyuezhang/ehrapy/ehrapy/preprocessing' - if reference_table is None: - reference_table = pd.read_csv( - f"{preprocessing_dir}/laboratory_reference_tables/laposata.tsv", sep="\t", index_col="Measurement" - ) - if obsm_measurements: - measurements = obsm_measurements - for measurement in measurements: - best_column_match, score = process.extractOne( - query=measurement, choices=reference_table.index, score_cutoff=threshold - ) - if best_column_match is None: - rprint(f"[bold yellow]Unable to find a match for {measurement}") - continue - if verbose: - rprint( - f"[bold blue]Detected [green]{best_column_match}[blue] for [green]{measurement}[blue] with score [green]{score}." - ) - - reference_column = "SI Reference Interval" if unit == "SI" else "Traditional Reference Interval" - - # Fetch all non None columns from the reference statistics - not_none_columns = [col for col in [sex_col, age_col, ethnicity_col] if col is not None] - not_none_columns.append(reference_column) - reference_values = reference_table.loc[[best_column_match], not_none_columns] - - additional_columns = False - if sex_col or age_col or ethnicity_col: # check if additional columns were provided - additional_columns = True - - # Check if multiple reference values occur and no additional information is available: - if reference_values.shape[0] > 1 and additional_columns is False: - raise ValueError( - f"Several options for {best_column_match} reference value are available. Please specify sex, age or " - f"ethnicity columns and their values." 
- ) - - try: - if age_col: - min_age, max_age = age_range.split("-") - reference_values = reference_values[ - (reference_values[age_col].str.split("-").str[0].astype(int) >= int(min_age)) - and (reference_values[age_col].str.split("-").str[1].astype(int) <= int(max_age)) - ] - if sex_col: - sexes = "U|M" if sex is None else sex - reference_values = reference_values[reference_values[sex_col].str.contains(sexes)] - if ethnicity_col: - reference_values = reference_values[reference_values[ethnicity_col].isin([ethnicity])] - - if layer is not None: - actual_measurements = adata[:, measurement].layers[layer] - else: - if obsm_measurements: - actual_measurements = adata.obsm[measurement]['value_as_number'] - ak_measurements = adata.obsm[measurement] - else: - actual_measurements = adata[:, measurement].X - except TypeError: - rprint(f"[bold yellow]Unable to find specified reference values for {measurement}.") - - check = reference_values[reference_column].values - check_str: str = np.array2string(check) - check_str = check_str.replace("[", "").replace("]", "").replace("'", "") - if "<" in check_str: - upperbound = float(check_str.replace("<", "")) - if verbose: - rprint(f"[bold blue]Using upperbound [green]{upperbound}") - upperbound_check_results = actual_measurements < upperbound - if isinstance(actual_measurements, ak.Array): - if action == 'remove': - if verbose: - rprint(f"Removing {ak.count(actual_measurements) - ak.count(actual_measurements[upperbound_check_results])} outliers") - adata.obsm[measurement] = ak_measurements[upperbound_check_results] - else: - upperbound_check_results_array: np.ndarray = upperbound_check_results.copy() - adata.obs[f"{measurement} normal"] = upperbound_check_results_array - - elif ">" in check_str: - lower_bound = float(check_str.replace(">", "")) - if verbose: - rprint(f"[bold blue]Using lowerbound [green]{lower_bound}") - - lower_bound_check_results = actual_measurements > lower_bound - if isinstance(actual_measurements, ak.Array): - if action == 'remove': - adata.obsm[measurement] = ak_measurements[lower_bound_check_results] - else: - adata.obs[f"{measurement} normal"] = lower_bound_check_results_array - lower_bound_check_results_array = lower_bound_check_results.copy() - else: # "-" range case - min_value = float(check_str.split("-")[0]) - max_value = float(check_str.split("-")[1]) - if verbose: - rprint(f"[bold blue]Using minimum of [green]{min_value}[blue] and maximum of [green]{max_value}") - - range_check_results = (actual_measurements >= min_value) & (actual_measurements <= max_value) - if isinstance(actual_measurements, ak.Array): - if action == 'remove': - adata.obsm[measurement] = ak_measurements[range_check_results] - else: - adata.obs[f"{measurement} normal"] = range_check_results_array - range_check_results_array: np.ndarray = range_check_results.copy() - - if copy: - return adata diff --git a/omop_conversion.py b/omop_conversion.py deleted file mode 100644 index 7cb9951..0000000 --- a/omop_conversion.py +++ /dev/null @@ -1,246 +0,0 @@ -import os -import glob - -import pandas as pd - -import ehrapy as ep -from pathlib import Path -from .utils.omop_utils import * -from rich.console import Console -from rich.text import Text -import rich.repr -from rich import print as rprint -from typing import TYPE_CHECKING, Any, Callable, Literal, Union, List - -@rich.repr.auto(angular=True) -class OMOP: - def __init__(self, folder_path, delimiter=None, make_filename_lowercase=True, use_dask=False): - self.base = folder_path - self.delimiter = delimiter - 
self.use_dask = use_dask - filepath_list = glob.glob(os.path.join(folder_path, "*.csv")) + glob.glob(os.path.join(folder_path, "*.parquet")) - self.loaded_tabel = None - - self.filepath_dict = check_with_omop_cdm(filepath_list, base=self.base, delimiter=self.delimiter, make_filename_lowercase=make_filename_lowercase) - self.tables = list(self.filepath_dict.keys()) - - ''' - def __repr__(self) -> str: - print_str = f'OMOP object ({os.path.basename(self.base)}) with {len(self.tables)} tables.\nTables:\n' - table_catalog_dict = get_table_catalog_dict() - for _, (key, value) in enumerate(table_catalog_dict.items()): - table_list = [table_name for table_name in self.tables if table_name in value] - if len(table_list) != 0: - print_str = print_str + f"{key} tables: {', '.join(table_list)}\n" - return print_str - ''' - - def __rich_repr__(self): - console = Console() - table_catalog_dict = get_table_catalog_dict() - color_map = { - 'Clinical data': 'blue', - 'Health system data': 'green', - 'Health economics data': 'red', - 'Standardized derived elements': 'magenta', - 'Metadata': 'white', - 'Vocabulary': 'dark_orange' - } - # Object description - print_str = f'OMOP object ([red]{os.path.basename(self.base)}[/]) with {len(self.tables)} tables.\n' - - # Tables information - for key, value in table_catalog_dict.items(): - table_list = [table_name for table_name in self.tables if table_name in value] - if len(table_list) != 0: - print_str = print_str + f"[{color_map[key]}]{key} tables[/]: [black]{', '.join(table_list)}[/]\n" - #table_list_str = ', '.join(table_list) - - #text = Text(f"{key} tables: ", style=color_map[key]) - #text.append(table_list_str) - #yield None, f"{key} tables", "red" - console.print(print_str) - yield None - - - #TODO - def new_load(self, - level: Literal["stay_level", "patient_level"] = "stay_level", - tables: Union[str, List[str]] = None, - remove_empty_column=True): - - table_catalog_dict = get_table_catalog_dict() - if not tables: - tables = self.table - - for table in self.table: - # Load Clinical data tables - if table in table_catalog_dict['Clinical data']: - # in patient level - if table in ["person", "death"]: - column_types = get_column_types(path = self.filepath_dict[table], delimiter=self.delimiter, filename=table) - df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') - elif table in ["visit_occurrence_id"]: - column_types = get_column_types(path = self.filepath_dict[table], delimiter=self.delimiter, filename=table) - df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') - else: - warnings(f"Please use extract_features function to extract features from table {table}") - continue - elif table in table_catalog_dict["Health system data"]: - column_types = get_column_types(path = self.filepath_dict[table], delimiter=self.delimiter, filename=table) - df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') - - - - - # Load Health system data tables - - # Load Health economics data tables - - # Load Standardized derived elements tables - - # Load Metadata tables - - # Load Vocabulary tables - - - # TODO patient level and hospital level - if level == "stay_level": - index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} - # TODO Only support clinical_tables_columns - - for table in tables: - print(f"reading table [{table}]") - column_types = get_column_types(path = 
self.filepath_dict[table], delimiter=self.delimiter, filename=table) - df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') - if remove_empty_column: - # TODO dask Support - #columns = [column for column in df.columns if not df[column].compute().isna().all()] - columns = [column for column in df.columns if not df[column].isna().all()] - df = df.loc[:, columns] - setattr(self, table, df) - - # concept_id_list = list(self.concept.concept_id) - # concept_name_list = list(self.concept.concept_id) - # concept_domain_id_list = list(set(self.concept.domain_id)) - - # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] - # TODO dask Support - joined_table = pd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how="left") - - joined_table = pd.merge(joined_table, self.death, left_index=True, right_index=True, how="left") - - # TODO dask Support - #joined_table = joined_table.compute() - - # TODO check this earlier - joined_table = joined_table.drop_duplicates(subset='visit_occurrence_id') - joined_table = joined_table.set_index("visit_occurrence_id") - # obs_only_list = list(self.joined_table.columns) - # obs_only_list.remove('visit_occurrence_id') - columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"])) - adata = ep.ad.df_to_anndata( - joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only - ) - # TODO this needs to be fixed because anndata set obs index as string by default - #adata.obs.index = adata.obs.index.astype(int) - - """ - for column in self.measurement.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.drug_exposure.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.observation.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - """ - - return adata - - def load(self, - level: Literal["stay_level", "patient_level"] = "stay_level", - tables: Union[str, List[str]] = None, - remove_empty_column=True): - - if not tables: - tables = ['person', 'death', 'visit_occurrence'] - # TODO patient level and hospital level - if level == "stay_level": - index = {"visit_occurrence": "visit_occurrence_id", "person": "person_id", "death": "person_id"} - # TODO Only support clinical_tables_columns - - for table in tables: - print(f"reading table [{table}]") - column_types = get_column_types(path = self.filepath_dict[table], delimiter=self.delimiter, table_name=table) - df = read_table(self.filepath_dict[table], delimiter=self.delimiter, dtype=column_types, index='person_id') - if remove_empty_column: - # TODO dask Support - #columns = [column for column in df.columns if not df[column].compute().isna().all()] - columns = [column for column in df.columns if not df[column].isna().all()] - df = df.loc[:, columns] 
- setattr(self, table, df) - - # concept_id_list = list(self.concept.concept_id) - # concept_name_list = list(self.concept.concept_id) - # concept_domain_id_list = list(set(self.concept.domain_id)) - - # self.loaded_tabel = ['visit_occurrence', 'person', 'death', 'measurement', 'observation', 'drug_exposure'] - # TODO dask Support - joined_table = pd.merge(self.visit_occurrence, self.person, left_index=True, right_index=True, how="left") - - joined_table = pd.merge(joined_table, self.death, left_index=True, right_index=True, how="left") - - # TODO dask Support - #joined_table = joined_table.compute() - - # TODO check this earlier - joined_table = joined_table.drop_duplicates(subset='visit_occurrence_id') - joined_table = joined_table.set_index("visit_occurrence_id") - # obs_only_list = list(self.joined_table.columns) - # obs_only_list.remove('visit_occurrence_id') - columns_obs_only = list(set(joined_table.columns) - set(["year_of_birth", "gender_source_value"])) - adata = ep.ad.df_to_anndata( - joined_table, index_column="visit_occurrence_id", columns_obs_only=columns_obs_only - ) - # TODO this needs to be fixed because anndata set obs index as string by default - #adata.obs.index = adata.obs.index.astype(int) - - """ - for column in self.measurement.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.measurement[self.measurement['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.drug_exposure.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.drug_exposure[self.drug_exposure['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - - for column in self.observation.columns: - if column != 'visit_occurrence_id': - obs_list = [] - for visit_occurrence_id in adata.obs.index: - obs_list.append(list(self.observation[self.observation['visit_occurrence_id'] == int(visit_occurrence_id)][column])) - adata.obsm[column]= ak.Array(obs_list) - """ - - return adata - - \ No newline at end of file From fa1d026f788b41d739a7d7b99682cefebed52f4a Mon Sep 17 00:00:00 2001 From: Xinyue Zhang Date: Thu, 15 Feb 2024 11:39:25 +0100 Subject: [PATCH 12/13] Fix import statements in __init__.py files --- ehrdata/pl/__init__.py | 2 +- ehrdata/tl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ehrdata/pl/__init__.py b/ehrdata/pl/__init__.py index a1a7091..4a54f81 100644 --- a/ehrdata/pl/__init__.py +++ b/ehrdata/pl/__init__.py @@ -1 +1 @@ -from ehrdata.pl._omop import feature_counts \ No newline at end of file +from ehrdata.pl._omop import feature_counts diff --git a/ehrdata/tl/__init__.py b/ehrdata/tl/__init__.py index 83756fa..9dd676d 100644 --- a/ehrdata/tl/__init__.py +++ b/ehrdata/tl/__init__.py @@ -1 +1 @@ -from ehrdata.tl._omop import get_concept_name \ No newline at end of file +from ehrdata.tl._omop import get_concept_name From 50f2bb6999aa5335dc55d21d57987fdd015662bb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 15 Feb 2024 10:56:46 +0000 Subject: [PATCH 13/13] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ehrdata/io/_omop.py | 8 +++++--- tests/test_basic.py | 3 +-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ehrdata/io/_omop.py 
b/ehrdata/io/_omop.py index 0425dc8..a536008 100644 --- a/ehrdata/io/_omop.py +++ b/ehrdata/io/_omop.py @@ -144,9 +144,11 @@ def from_dataframe(adata, feature: str, df): # Creating the array more efficiently ak_array = ak.Array( [ - grouped.get_group(visit_occurrence_id)[columns_in_ak_array].to_dict(orient="list") - if visit_occurrence_id in feature_ids - else empty_entry + ( + grouped.get_group(visit_occurrence_id)[columns_in_ak_array].to_dict(orient="list") + if visit_occurrence_id in feature_ids + else empty_entry + ) for visit_occurrence_id in unique_visit_occurrence_ids ] ) diff --git a/tests/test_basic.py b/tests/test_basic.py index 6cbb48b..4552089 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,6 +1,5 @@ -import pytest - import ehrdata as ehr +import pytest def test_package_has_version():
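
As an illustration of the per-visit nesting that the `from_dataframe` hunk above reformats, the following minimal sketch builds the same kind of awkward Array from a toy long-format table. Only the groupby / get_group / to_dict(orient="list") pattern is taken from the patch; the data values, the hard-coded visit ids, and the helper names value_columns, visits_with_data and visit_id are invented for illustration.

import awkward as ak
import pandas as pd

# Toy long-format measurement table (made-up values, OMOP-style column names).
df = pd.DataFrame(
    {
        "visit_occurrence_id": [1, 1, 2],
        "value_as_number": [7.1, 7.4, 5.0],
        "measurement_datetime": ["2020-01-01 08:00", "2020-01-01 09:00", "2020-01-02 10:00"],
    }
)

unique_visit_occurrence_ids = [1, 2, 3]  # e.g. adata.obs.index cast to int
value_columns = [c for c in df.columns if c != "visit_occurrence_id"]
grouped = df.groupby("visit_occurrence_id")
visits_with_data = set(grouped.groups.keys())
empty_entry = {column: [] for column in value_columns}  # visits with no rows get empty lists

ak_array = ak.Array(
    [
        (
            grouped.get_group(visit_id)[value_columns].to_dict(orient="list")
            if visit_id in visits_with_data
            else empty_entry
        )
        for visit_id in unique_visit_occurrence_ids
    ]
)

# One record of lists per visit; visit 3 has no measurements, so its lists are empty.
print(ak_array.tolist())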