update docs

theislab · May 28, 2024 · 3a40c50 · 3a40c50
1 parent 15c66d8
commit 3a40c50
Show file tree

Hide file tree

Showing 5 changed files with 87 additions and 108 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -64,8 +64,8 @@
 autosummary_generate = True
 autodoc_member_order = "groupwise"
 default_role = "literal"
-napoleon_google_docstring = False
-napoleon_numpy_docstring = True
+napoleon_google_docstring = True
+napoleon_numpy_docstring = False
 napoleon_include_init_with_doc = False
 napoleon_use_rtype = True  # having a separate entry generally helps readability
 napoleon_use_param = True

diff --git a/ehrdata/io/_omop.py b/ehrdata/io/_omop.py
@@ -28,11 +28,11 @@ def init_omop(
     load_tables: Optional[Union[str, list[str], tuple[str], Literal["auto"]]] = None,
     remove_empty_column: bool = True,
 ) -> AnnData:
-    """Initialize an OMOP database, load tables and create anndata object
+    """Initialize an OMOP database, load tables, and create anndata object.
 
     Args:
-        folder_path: Path to the folder containing the OMOP CDM tables
-        delimiter: If data is in csv format, delimiter can be specified. Defaults to ','.
+        folder_path: Path to the folder containing the OMOP CDM tables.
+        delimiter: If data is in CSV format, delimiter can be specified. Defaults to ','.
         make_filename_lowercase: If True, the filename will be converted to lowercase. Defaults to True.
         use_dask: If True, dask will be used to read the tables. For large tables, it is recommended to use dask. Defaults to False.
         level: For stay level, each row in anndata would be a visit_occurrence. For patient level, each row in anndata would be a patient. Defaults to "stay_level".
@@ -41,7 +41,7 @@ def init_omop(
 
     Returns
     -------
-        AnnData: Anndata object
+        AnnData: Anndata object.
     """
     if delimiter is None:
         delimiter = ","
@@ -270,20 +270,20 @@ def extract_features(
     verbose: Optional[bool] = True,
     use_dask: bool = None,
 ) -> AnnData:
-    """Extract features from OMOP CDM tables and add them to .obsm of anndata object
+    """Extract features from OMOP CDM tables and add them to .obsm of anndata object.
 
     Args:
-        adata (AnnData): Anndata object
-        source (Literal[ &quot;observation&quot;, &quot;measurement&quot;, &quot;procedure_occurrence&quot;, &quot;specimen&quot;, &quot;device_exposure&quot;, &quot;drug_exposure&quot;, &quot;condition_occurrence&quot;, ]): source table name. Defaults to None.
-        features (Union[str, int, list[Union[str, int]]], optional): feature_id or feature_name, or list of feature_id or feature_name. Defaults to None.
-        source_table_columns (Union[str, list[str]], optional): columns to be extracted from source table. If None, all columns will be extracted. Defaults to None.
-        dropna (Optional[bool], optional): drop rows with missing values. Defaults to True.
-        verbose (Optional[bool], optional): print progress. Defaults to True.
-        use_dask (bool, optional): If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns[&quot;use_dask&quot;]. Defaults to None.
+        adata: Anndata object.
+        source: Source table name. One of {"observation", "measurement", "procedure_occurrence", "specimen", "device_exposure", "drug_exposure", "condition_occurrence"}.
+        features: A feature_id or feature_name, or a list of them. Defaults to None.
+        source_table_columns: Columns to be extracted from source table. If None, all columns will be extracted. Defaults to None.
+        dropna: Drop rows with missing values. Defaults to True.
+        verbose: Print progress. Defaults to True.
+        use_dask: If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
 
     Returns
     -------
-        AnnData: Anndata object
+        Anndata object.
     """
     if source in ["measurement", "observation", "specimen"]:
         key = f"{source}_concept_id"
@@ -365,16 +365,16 @@ def extract_note(
     use_dask: bool = None,
     columns: Optional[list[str]] = None,
 ) -> AnnData:
-    """Extract note from OMOP CDM Note table and add them to .obsm of anndata object
+    """Extract note from OMOP CDM Note table and add them to .obsm of anndata object.
 
     Args:
-        adata (AnnData): Anndata object
-        use_dask (bool, optional): If True, dask will be used to read the tables. For large tables, it is recommended to use dask. If None, it will be set to adata.uns[&quot;use_dask&quot;]. Defaults to None.
-        columns (Optional[list[str]], optional): columns to be extracted from note table. If None, all columns will be extracted. Defaults to None.
+        adata: Anndata object.
+        use_dask: If True, dask will be used to read the tables. For large tables, it is recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
+        columns: Columns to be extracted from note table. If None, all columns will be extracted. Defaults to None.
 
     Returns
     -------
-        AnnData: Anndata object
+        Anndata object.
     """
     if use_dask is None:
         use_dask = use_dask = adata.uns["use_dask"]
@@ -395,16 +395,16 @@ def extract_note(
 
 
 def from_dataframe(adata: AnnData, feature: str, df: pd.DataFrame) -> AnnData:
-    """Add data from a dataframe to .obsm of anndata object
+    """Add data from a dataframe to .obsm of anndata object.
 
     Args:
-        adata (AnnData): Anndata object
-        feature (str): feature name. It will be used as the key in .obsm
-        df: dataframe containing the data. It should have a column named 'visit_occurrence_id'
+        adata: Anndata object.
+        feature: Feature name. It will be used as the key in .obsm.
+        df: Dataframe containing the data. It should have a column named 'visit_occurrence_id'.
 
     Returns
     -------
-        AnnData: Anndata object with an awkward array in obsm[feature]
+        Anndata object with an awkward array in obsm[feature].
     """
     # Add new rows for those visit_occurrence_id that don't have any data
     new_row_dict = {col: [] for col in df.columns}
@@ -439,16 +439,18 @@ def to_dataframe(
     keep_na: Optional[bool] = False,
     convert_to_datetime: Optional[bool] = True,
 ) -> DataFrame:
-    """Convert data in .obsm of anndata object to dataframe
+    """Convert data in .obsm of anndata object to dataframe.
 
     Args:
-        adata (AnnData): Anndata object with data in .obsm
-        features (Union[str, list[str]]): feature name or list of feature names
-        visit_occurrence_id (Optional[Union[str, list[str]]], optional): visit_occurrence_id or list of visit_occurrence_id. If None, all visit_occurrence_id will be selected. Defaults to None.
+        adata: Anndata object with data in .obsm.
+        features: Feature name or list of feature names.
+        visit_occurrence_id: A visit_occurrence_id or a list of them. If None, all visit_occurrence_id values are selected. Defaults to None.
+        keep_na: Keep rows with missing values. Defaults to False.
+        convert_to_datetime: Convert datetime columns to datetime dtype. Defaults to True.
 
     Returns
     -------
-        dataframe containing the data
+        Dataframe containing the data.
     """
     # TODO
     # can be viewed as patient level - only select some patient

diff --git a/ehrdata/pl/_omop.py b/ehrdata/pl/_omop.py
@@ -34,14 +34,14 @@ def feature_counts(
     """Plot feature counts for a given source table and return a dataframe with feature names and counts.
 
     Args:
-        adata (AnnData): Anndata object
-        source (Literal[ &quot;observation&quot;, &quot;measurement&quot;, &quot;procedure_occurrence&quot;, &quot;specimen&quot;, &quot;device_exposure&quot;, &quot;drug_exposure&quot;, &quot;condition_occurrence&quot;, ]): source table name. Defaults to None.
-        number (int, optional): Number of top features to plot. Defaults to 20.
-        use_dask (bool, optional): If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns[&quot;use_dask&quot;]. Defaults to None.
+        adata: Anndata object.
+        source: Source table name.
+        number: Number of top features to plot. Defaults to 20.
+        use_dask: If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns["use_dask"]. Defaults to None.
 
     Returns
     -------
-        Dataframe with feature names and counts
+        Dataframe with feature names and counts.
     """
     path = adata.uns["filepath_dict"][source]
     if isinstance(path, list):
@@ -97,20 +97,18 @@ def plot_timeseries(
     """Plot timeseries data using data from adata.obsm.
 
     Args:
-        adata (AnnData): Anndata object
-        visit_occurrence_id (int): visit_occurrence_id to plot
-        key (Union[str, list[str]]): feature key or list of keys in adata.obsm to plot
-        slot (Union[str, None], optional): Slot to use. Defaults to &quot;obsm&quot;.
-        value_key (str, optional): key in awkward array in adata.obsm to be used as value. Defaults to "value_as_number".
-        time_key (str, optional): key in awkward array in adata.obsm to be used as time. Defaults to "measurement_datetime".
-        from_time (Optional[str], optional): Start time for the plot. Defaults to None.
-        to_time (Optional[str], optional): End time for the plot. Defaults to None.
-        x_label (str, optional): x labe name. Defaults to None.
-        y_label (str, optional): y label name. Defaults to None.
-        title (str, optional): title of the plot. Defaults to None.
-
-        show (Optional[bool], optional): Show the plot, do not return axis.
-
+        adata: Anndata object.
+        visit_occurrence_id: The visit_occurrence_id to plot.
+        key: Feature key or list of keys in adata.obsm to plot.
+        slot: Slot to use. Defaults to "obsm".
+        value_key: Key in awkward array in adata.obsm to be used as value. Defaults to "value_as_number".
+        time_key: Key in awkward array in adata.obsm to be used as time. Defaults to "measurement_datetime".
+        from_time: Start time for the plot. Defaults to None.
+        to_time: End time for the plot. Defaults to None.
+        x_label: x label name. Defaults to None.
+        y_label: y label name. Defaults to None.
+        title: Title of the plot. Defaults to None.
+        show: Show the plot, do not return axis.
     """
     if isinstance(key, str):
         key_list = [key]
@@ -193,54 +191,31 @@ def violin(
 ):  # pragma: no cover
     """Violin plot.
 
-    Wraps :func:`seaborn.violinplot` for :class:`~anndata.AnnData`.
-
     Args:
-        adata: :class:`~anndata.AnnData` object object containing all observations.
-        obsm_key: feature key or list of keys in adata.obsm to plot
+        adata: AnnData object containing all observations.
+        obsm_key: Feature key or list of keys in adata.obsm to plot.
         keys: Keys for accessing variables of `.var_names` or fields of `.obs`.
         groupby: The key of the observation grouping to consider.
         log: Plot on logarithmic axis.
         use_raw: Whether to use `raw` attribute of `adata`. Defaults to `True` if `.raw` is present.
-        stripplot: Add a stripplot on top of the violin plot. See :func:`~seaborn.stripplot`.
-        jitter: Add jitter to the stripplot (only when stripplot is True) See :func:`~seaborn.stripplot`.
+        stripplot: Add a stripplot on top of the violin plot.
+        jitter: Add jitter to the stripplot (only when stripplot is True).
         size: Size of the jitter points.
-        layer: Name of the AnnData object layer that wants to be plotted. By
-            default adata.raw.X is plotted. If `use_raw=False` is set,
-            then `adata.X` is plotted. If `layer` is set to a valid layer name,
-            then the layer is plotted. `layer` takes precedence over `use_raw`.
+        layer: Name of the AnnData layer to plot.
         scale: The method used to scale the width of each violin.
-            If 'width' (the default), each violin will have the same width.
-            If 'area', each violin will have the same area.
-            If 'count', a violin’s width corresponds to the number of observations.
         order: Order in which to show the categories.
-        multi_panel: Display keys in multiple panels also when `groupby is not None`.
-        xlabel: Label of the x axis. Defaults to `groupby` if `rotation` is `None`, otherwise, no label is shown.
-        ylabel: Label of the y axis. If `None` and `groupby` is `None`, defaults to `'value'`.
-                If `None` and `groubpy` is not `None`, defaults to `keys`.
+        multi_panel: Display keys in multiple panels also when `groupby` is not None.
+        xlabel: Label of the x axis.
+        ylabel: Label of the y axis.
         rotation: Rotation of xtick labels.
-        {show_save_ax}
-        **kwds:
-            Are passed to :func:`~seaborn.violinplot`.
+        show: Show the plot, do not return axis.
+        save: Save the plot to file.
+        ax: Matplotlib Axes object to use. If not passed, uses the current Axes instance.
+        **kwds: Additional keyword arguments passed to seaborn.violinplot.
 
     Returns
     -------
-        A :class:`~matplotlib.axes.Axes` object if `ax` is `None` else `None`.
-
-    Example:
-        .. code-block:: python
-
-            import ehrapy as ep
-
-            adata = ep.dt.mimic_2(encoded=True)
-            ep.pp.knn_impute(adata)
-            ep.pp.log_norm(adata, offset=1)
-            ep.pp.neighbors(adata)
-            ep.tl.leiden(adata, resolution=0.5, key_added="leiden_0_5")
-            ep.pl.violin(adata, keys=["age"], groupby="leiden_0_5")
-
-    Preview:
-        .. image:: /_static/docstring_previews/violin.png
+        A matplotlib Axes object if ax is None, else None.
     """
     if obsm_key:
         df = to_dataframe(adata, features=obsm_key)

diff --git a/ehrdata/pp/_omop.py b/ehrdata/pp/_omop.py
@@ -35,22 +35,22 @@ def get_feature_statistics(
     verbose: bool = False,
     use_dask: bool = None,
 ) -> AnnData:
-    """Calculate statistics for the specified features from the OMOP tables and adds them to the AnnData object.
+    """Calculates statistics for specified features from the OMOP tables and adds them to the AnnData object.
 
     Args:
-        adata (AnnData): Anndata object
-        source (Literal[ &quot;observation&quot;, &quot;measurement&quot;, &quot;procedure_occurrence&quot;, &quot;specimen&quot;, &quot;device_exposure&quot;, &quot;drug_exposure&quot;, &quot;condition_occurrence&quot;, ]): source table name. Defaults to None.
-        features (Union[str, int, list[Union[str, int]]], optional): concept_id or concept_name, or list of concept_id or concept_name. Defaults to None.
-        level (Literal[&quot;stay_level&quot;, &quot;patient_level&quot;], optional): For stay level, statistics are calculated for each stay. For patient level, statistics are calculated for each patient. It should be aligned with the setting of the adata object. Defaults to &quot;stay_level&quot;.
-        value_col (str, optional): column name in source table to extract value from. Defaults to None.
-        aggregation_methods (Union[ Literal[&quot;min&quot;, &quot;max&quot;, &quot;mean&quot;, &quot;std&quot;, &quot;count&quot;], list[Literal[&quot;min&quot;, &quot;max&quot;, &quot;mean&quot;, &quot;std&quot;, &quot;count&quot;]] ], optional): aggregation methods to calculate statistics. Defaults to [&quot;min&quot;, &quot;max&quot;, &quot;mean&quot;, &quot;std&quot;, &quot;count&quot;].
-        add_aggregation_to_X (bool, optional): add the calculated statistics to adata.X. If False, the statistics will be added to adata.obs. Defaults to True.
-        verbose (bool, optional): print verbose information. Defaults to False.
-        use_dask (bool, optional): If True, dask will be used to read the tables. For large tables, it is highly recommended to use dask. If None, it will be set to adata.uns[&quot;use_dask&quot;]. Defaults to None.
+        adata: Anndata object.
+        source: Source table name.
+        features: Concept ID or concept name, or list of concept IDs or concept names.
+        level: Determines whether statistics are calculated for each stay or each patient.
+        value_col: Column name in the source table to extract value from.
+        aggregation_methods: Aggregation methods to calculate statistics.
+        add_aggregation_to: Determines where the calculated statistics are added.
+        verbose: Prints verbose information.
+        use_dask: Determines whether to use Dask for reading tables.
 
     Returns
     -------
-        AnnData: Anndata object with added statistics either in adata.obs (if add_aggregation_to_X=False) or adata.X (if add_aggregation_to_X=True)
+        Anndata object with added statistics either in adata.obs (if add_aggregation_to='obs') or adata.X (if add_aggregation_to='X').
     """
     if source in ["measurement", "observation", "specimen"]:
         key = f"{source}_concept_id"
@@ -356,6 +356,17 @@ def drop_nan(
     key: Union[str, list[str]],
     slot: Union[str, None] = "obsm",
 ):
+    """Remove observations with NaN values from the AnnData object.
+
+    Args:
+        adata (AnnData): Annotated data matrix.
+        key (Union[str, List[str]]): Key or list of keys representing the data to be checked for NaN values.
+        slot (Optional[str], optional): Slot to check for NaN values. Defaults to "obsm".
+
+    Returns
+    -------
+        AnnData: Annotated data matrix with NaN observations removed.
+    """
     if isinstance(key, str):
         key_list = [key]
     else:

diff --git a/ehrdata/tl/_omop.py b/ehrdata/tl/_omop.py
@@ -78,13 +78,13 @@ def aggregate_timeseries_in_bins(
 
     Args:
         adata (AnnData): Anndata object
-        features (Union[str, list[str]]):  concept_id or concept_name, or list of concept_id or concept_name. Defaults to None.
+        features (Union[str, list[str]]): concept_id or concept_name, or list of concept_id or concept_name. Defaults to None.
         slot (Union[str, None], optional): Slot to read the data. Defaults to "obsm".
         value_key (str, optional): key in awkward array in adata.obsm to be used as value. Defaults to "value_as_number".
         time_key (str, optional): key in awkward array in adata.obsm to be used as time. Defaults to "measurement_datetime".
-        time_binning_method (Literal[&quot;floor&quot;, &quot;ceil&quot;, &quot;round&quot;], optional): Time binning method. Defaults to "floor".
+        time_binning_method (Literal["floor", "ceil", "round"], optional): Time binning method. Defaults to "floor".
         bin_size (Union[str, Offset], optional): Time bin size. Defaults to "h".
-        aggregation_method (Literal[&quot;median&quot;, &quot;mean&quot;, &quot;min&quot;, &quot;max&quot;], optional): Aggregation method. Defaults to "median".
+        aggregation_method (Literal["median", "mean", "min", "max"], optional): Aggregation method. Defaults to "median".
 
     Returns
     -------
@@ -133,12 +133,3 @@ def aggregate_timeseries_in_bins(
             adata = from_dataframe(adata, feature, df)
 
     return adata
-
-
-# TODO
-def note_nlp_map(
-    self,
-):
-    # Got some inspirations from: https://github.com/aws-samples/amazon-comprehend-medical-omop-notes-mapping
-    # connect with existing functions
-    pass