Skip to content

Commit

Permalink
update docs
Browse files Browse the repository at this point in the history
  • Loading branch information
xinyuejohn committed May 28, 2024
1 parent 15c66d8 commit 3a40c50
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 108 deletions.
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@
autosummary_generate = True
autodoc_member_order = "groupwise"
default_role = "literal"
napoleon_google_docstring = False
napoleon_numpy_docstring = True
napoleon_google_docstring = True
napoleon_numpy_docstring = False
napoleon_include_init_with_doc = False
napoleon_use_rtype = True # having a separate entry generally helps readability
napoleon_use_param = True
Expand Down
58 changes: 30 additions & 28 deletions ehrdata/io/_omop.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,11 @@ def init_omop(
load_tables: Optional[Union[str, list[str], tuple[str], Literal["auto"]]] = None,
remove_empty_column: bool = True,
) -> AnnData:
"""Initialize an OMOP database, load tables and create anndata object
"""Initialize an OMOP database, load tables, and create anndata object.
Args:
folder_path: Path to the folder containing the OMOP CDM tables
delimiter: If data is in csv format, delimiter can be specified. Defaults to ','.
folder_path: Path to the folder containing the OMOP CDM tables.
delimiter: If data is in CSV format, delimiter can be specified. Defaults to ','.
make_filename_lowercase: If True, the filename will be converted to lowercase. Defaults to True.
use_dask: If True, dask will be used to read the tables. For large tables, it is recommended to use dask. Defaults to False.
level: For stay level, each row in anndata would be a visit_occurrence. For patient level, each row in anndata would be a patient. Defaults to "stay_level".
Expand All @@ -41,7 +41,7 @@ def init_omop(
Returns:
AnnData: Anndata object
AnnData: Anndata object.
"""
if delimiter is None:
delimiter = ","
Expand Down Expand Up @@ -270,20 +270,20 @@ def extract_features(
verbose: Optional[bool] = True,
use_dask: bool = None,
) -> AnnData:
"""Extract features from OMOP CDM tables and add them to .obsm of anndata object
"""Extract features from OMOP CDM tables and add them to .obsm of anndata object.
Args:
adata (AnnData): Anndata object
source (Literal[ "observation", "measurement", "procedure_occurrence", "specimen", "device_exposure", "drug_exposure", "condition_occurrence", ]): source table name. Defaults to None.
features (Union[str, int, list[Union[str, int]]], optional): feature_id or feature_name, or list of feature_id or feature_name. Defaults to None.
source_table_columns (Union[str, list[str]], optional): columns to be extracted from source table. If None, all columns will be extracted. Defaults to None.
dropna (Optional[bool], optional): drop rows with missing values. Defaults to True.
verbose (Optional[bool], optional): print progress. Defaults to True.
use_dask (bool, optional): If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
adata: Anndata object.
source: Source table name. One of {"observation", "measurement", "procedure_occurrence", "specimen", "device_exposure", "drug_exposure", "condition_occurrence"}.
features: A `feature_id` or `feature_name`, or a list of them. Defaults to None.
source_table_columns: Columns to be extracted from source table. If None, all columns will be extracted. Defaults to None.
dropna: Drop rows with missing values. Defaults to True.
verbose: Print progress. Defaults to True.
use_dask: If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
Returns:
AnnData: Anndata object
Anndata object.
"""
if source in ["measurement", "observation", "specimen"]:
key = f"{source}_concept_id"
Expand Down Expand Up @@ -365,16 +365,16 @@ def extract_note(
use_dask: bool = None,
columns: Optional[list[str]] = None,
) -> AnnData:
"""Extract note from OMOP CDM Note table and add them to .obsm of anndata object
"""Extract note from OMOP CDM Note table and add them to .obsm of anndata object.
Args:
adata (AnnData): Anndata object
use_dask (bool, optional): If True, dask will be used to read the tables. For large tables, it is recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
columns (Optional[list[str]], optional): columns to be extracted from note table. If None, all columns will be extracted. Defaults to None.
adata: Anndata object.
use_dask: If True, dask will be used to read the tables. For large tables, it is recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
columns: Columns to be extracted from note table. If None, all columns will be extracted. Defaults to None.
Returns:
AnnData: Anndata object
Anndata object.
"""
if use_dask is None:
use_dask = use_dask = adata.uns["use_dask"]
Expand All @@ -395,16 +395,16 @@ def extract_note(


def from_dataframe(adata: AnnData, feature: str, df: pd.DataFrame) -> AnnData:
"""Add data from a dataframe to .obsm of anndata object
"""Add data from a dataframe to .obsm of anndata object.
Args:
adata (AnnData): Anndata object
feature (str): feature name. It will be used as the key in .obsm
df: dataframe containing the data. It should have a column named 'visit_occurrence_id'
adata: Anndata object.
feature: Feature name. It will be used as the key in .obsm.
df: Dataframe containing the data. It should have a column named 'visit_occurrence_id'.
Returns:
AnnData: Anndata object with an awkward array in obsm[feature]
Anndata object with an awkward array in obsm[feature].
"""
# Add new rows for those visit_occurrence_id that don't have any data
new_row_dict = {col: [] for col in df.columns}
Expand Down Expand Up @@ -439,16 +439,18 @@ def to_dataframe(
keep_na: Optional[bool] = False,
convert_to_datetime: Optional[bool] = True,
) -> DataFrame:
"""Convert data in .obsm of anndata object to dataframe
"""Convert data in .obsm of anndata object to dataframe.
Args:
adata (AnnData): Anndata object with data in .obsm
features (Union[str, list[str]]): feature name or list of feature names
visit_occurrence_id (Optional[Union[str, list[str]]], optional): visit_occurrence_id or list of visit_occurrence_id. If None, all visit_occurrence_id will be selected. Defaults to None.
adata: Anndata object with data in .obsm.
features: Feature name or list of feature names.
visit_occurrence_id: A single `visit_occurrence_id` or a list of them. If None, all `visit_occurrence_id` values are selected. Defaults to None.
keep_na: Keep rows with missing values. Defaults to False.
convert_to_datetime: Convert datetime columns to datetime dtype. Defaults to True.
Returns:
dataframe containing the data
Dataframe containing the data.
"""
# TODO
# can be viewed as patient level - only select some patient
Expand Down
85 changes: 30 additions & 55 deletions ehrdata/pl/_omop.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ def feature_counts(
"""Plot feature counts for a given source table and return a dataframe with feature names and counts.
Args:
adata (AnnData): Anndata object
source (Literal[ "observation", "measurement", "procedure_occurrence", "specimen", "device_exposure", "drug_exposure", "condition_occurrence", ]): source table name. Defaults to None.
number (int, optional): Number of top features to plot. Defaults to 20.
use_dask (bool, optional): If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
adata: Anndata object.
source: Source table name.
number: Number of top features to plot. Defaults to 20.
use_dask: If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
Returns:
Dataframe with feature names and counts
Dataframe with feature names and counts.
"""
path = adata.uns["filepath_dict"][source]
if isinstance(path, list):
Expand Down Expand Up @@ -97,20 +97,18 @@ def plot_timeseries(
"""Plot timeseries data using data from adata.obsm.
Args:
adata (AnnData): Anndata object
visit_occurrence_id (int): visit_occurrence_id to plot
key (Union[str, list[str]]): feature key or list of keys in adata.obsm to plot
slot (Union[str, None], optional): Slot to use. Defaults to "obsm".
value_key (str, optional): key in awkward array in adata.obsm to be used as value. Defaults to "value_as_number".
time_key (str, optional): key in awkward array in adata.obsm to be used as time. Defaults to "measurement_datetime".
from_time (Optional[str], optional): Start time for the plot. Defaults to None.
to_time (Optional[str], optional): End time for the plot. Defaults to None.
x_label (str, optional): x labe name. Defaults to None.
y_label (str, optional): y label name. Defaults to None.
title (str, optional): title of the plot. Defaults to None.
show (Optional[bool], optional): Show the plot, do not return axis.
adata: Anndata object.
visit_occurrence_id: Visit_occurrence_id to plot.
key: Feature key or list of keys in adata.obsm to plot.
slot: Slot to use. Defaults to "obsm".
value_key: Key in awkward array in adata.obsm to be used as value. Defaults to "value_as_number".
time_key: Key in awkward array in adata.obsm to be used as time. Defaults to "measurement_datetime".
from_time: Start time for the plot. Defaults to None.
to_time: End time for the plot. Defaults to None.
x_label: x label name. Defaults to None.
y_label: y label name. Defaults to None.
title: Title of the plot. Defaults to None.
show: Show the plot, do not return axis.
"""
if isinstance(key, str):
key_list = [key]
Expand Down Expand Up @@ -193,54 +191,31 @@ def violin(
): # pragma: no cover
"""Violin plot.
Wraps :func:`seaborn.violinplot` for :class:`~anndata.AnnData`.
Args:
adata: :class:`~anndata.AnnData` object object containing all observations.
obsm_key: feature key or list of keys in adata.obsm to plot
adata: AnnData object containing all observations.
obsm_key: Feature key or list of keys in adata.obsm to plot.
keys: Keys for accessing variables of `.var_names` or fields of `.obs`.
groupby: The key of the observation grouping to consider.
log: Plot on logarithmic axis.
use_raw: Whether to use `raw` attribute of `adata`. Defaults to `True` if `.raw` is present.
stripplot: Add a stripplot on top of the violin plot. See :func:`~seaborn.stripplot`.
jitter: Add jitter to the stripplot (only when stripplot is True) See :func:`~seaborn.stripplot`.
stripplot: Add a stripplot on top of the violin plot.
jitter: Add jitter to the stripplot (only when stripplot is True).
size: Size of the jitter points.
layer: Name of the AnnData object layer that wants to be plotted. By
default adata.raw.X is plotted. If `use_raw=False` is set,
then `adata.X` is plotted. If `layer` is set to a valid layer name,
then the layer is plotted. `layer` takes precedence over `use_raw`.
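layer: Name of the AnnData object layer to plot. By default `adata.raw.X` is plotted; if `use_raw=False`, `adata.X` is plotted instead. `layer` takes precedence over `use_raw`.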
scale: The method used to scale the width of each violin.
If 'width' (the default), each violin will have the same width.
If 'area', each violin will have the same area.
If 'count', a violin’s width corresponds to the number of observations.
order: Order in which to show the categories.
multi_panel: Display keys in multiple panels also when `groupby is not None`.
xlabel: Label of the x axis. Defaults to `groupby` if `rotation` is `None`, otherwise, no label is shown.
ylabel: Label of the y axis. If `None` and `groupby` is `None`, defaults to `'value'`.
If `None` and `groubpy` is not `None`, defaults to `keys`.
multi_panel: Display keys in multiple panels also when `groupby` is not None.
xlabel: Label of the x axis.
ylabel: Label of the y axis.
rotation: Rotation of xtick labels.
{show_save_ax}
**kwds:
Are passed to :func:`~seaborn.violinplot`.
show: Show the plot, do not return axis.
save: Save the plot to file.
ax: Matplotlib Axes object to use. If not passed, uses the current Axes instance.
**kwds: Additional keyword arguments passed to seaborn.violinplot.
Returns:
A :class:`~matplotlib.axes.Axes` object if `ax` is `None` else `None`.
Example:
.. code-block:: python
import ehrapy as ep
adata = ep.dt.mimic_2(encoded=True)
ep.pp.knn_impute(adata)
ep.pp.log_norm(adata, offset=1)
ep.pp.neighbors(adata)
ep.tl.leiden(adata, resolution=0.5, key_added="leiden_0_5")
ep.pl.violin(adata, keys=["age"], groupby="leiden_0_5")
Preview:
.. image:: /_static/docstring_previews/violin.png
A matplotlib Axes object if ax is None, else None.
"""
if obsm_key:
df = to_dataframe(adata, features=obsm_key)
Expand Down
33 changes: 22 additions & 11 deletions ehrdata/pp/_omop.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,22 +35,22 @@ def get_feature_statistics(
verbose: bool = False,
use_dask: bool = None,
) -> AnnData:
"""Calculate statistics for the specified features from the OMOP tables and adds them to the AnnData object.
"""Calculate statistics for the specified features from the OMOP tables and add them to the AnnData object.
Args:
adata (AnnData): Anndata object
source (Literal[ "observation", "measurement", "procedure_occurrence", "specimen", "device_exposure", "drug_exposure", "condition_occurrence", ]): source table name. Defaults to None.
features (Union[str, int, list[Union[str, int]]], optional): concept_id or concept_name, or list of concept_id or concept_name. Defaults to None.
level (Literal["stay_level", "patient_level"], optional): For stay level, statistics are calculated for each stay. For patient level, statistics are calculated for each patient. It should be aligned with the setting of the adata object. Defaults to "stay_level".
value_col (str, optional): column name in source table to extract value from. Defaults to None.
aggregation_methods (Union[ Literal["min", "max", "mean", "std", "count"], list[Literal["min", "max", "mean", "std", "count"]] ], optional): aggregation methods to calculate statistics. Defaults to ["min", "max", "mean", "std", "count"].
add_aggregation_to_X (bool, optional): add the calculated statistics to adata.X. If False, the statistics will be added to adata.obs. Defaults to True.
verbose (bool, optional): print verbose information. Defaults to False.
use_dask (bool, optional): If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
adata: Anndata object.
source: Source table name.
features: Concept ID or concept name, or list of concept IDs or concept names.
level: Determines whether statistics are calculated for each stay or each patient.
value_col: Column name in the source table to extract value from.
aggregation_methods: Aggregation methods to calculate statistics.
add_aggregation_to: Determines where the calculated statistics are added.
verbose: Prints verbose information.
use_dask: Determines whether to use Dask for reading tables.
Returns:
AnnData: Anndata object with added statistics either in adata.obs (if add_aggregation_to_X=False) or adata.X (if add_aggregation_to_X=True)
Anndata object with added statistics either in adata.obs (if add_aggregation_to='obs') or adata.X (if add_aggregation_to='X').
"""
if source in ["measurement", "observation", "specimen"]:
key = f"{source}_concept_id"
Expand Down Expand Up @@ -356,6 +356,17 @@ def drop_nan(
key: Union[str, list[str]],
slot: Union[str, None] = "obsm",
):
"""Remove observations with NaN values from the AnnData object.
Args:
adata: Annotated data matrix.
key: Key or list of keys representing the data to be checked for NaN values.
slot: Slot to check for NaN values. Defaults to "obsm".
Returns:
AnnData: Annotated data matrix with NaN observations removed.
"""
if isinstance(key, str):
key_list = [key]
else:
Expand Down
15 changes: 3 additions & 12 deletions ehrdata/tl/_omop.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,13 +78,13 @@ def aggregate_timeseries_in_bins(
Args:
adata (AnnData): Anndata object
features (Union[str, list[str]]): concept_id or concept_name, or list of concept_id or concept_name. Defaults to None.
features (Union[str, list[str]]): concept_id or concept_name, or list of concept_id or concept_name. Defaults to None.
slot (Union[str, None], optional): Slot to read the data. Defaults to "obsm".
value_key (str, optional): key in awkward array in adata.obsm to be used as value. Defaults to "value_as_number".
time_key (str, optional): key in awkward array in adata.obsm to be used as time. Defaults to "measurement_datetime".
time_binning_method (Literal["floor", "ceil", "round"], optional): Time binning method. Defaults to "floor".
time_binning_method (Literal["floor", "ceil", "round"], optional): Time binning method. Defaults to "floor".
bin_size (Union[str, Offset], optional): Time bin size. Defaults to "h".
aggregation_method (Literal["median", "mean", "min", "max"], optional): Aggregation method. Defaults to "median".
aggregation_method (Literal["median", "mean", "min", "max"], optional): Aggregation method. Defaults to "median".
Returns:
Expand Down Expand Up @@ -133,12 +133,3 @@ def aggregate_timeseries_in_bins(
adata = from_dataframe(adata, feature, df)

return adata


# TODO
def note_nlp_map(
self,
):
# Got some inspirations from: https://github.com/aws-samples/amazon-comprehend-medical-omop-notes-mapping
# connect with existing functions
pass

0 comments on commit 3a40c50

Please sign in to comment.