From 16f46e23dd50476915891940862398a29b0902e2 Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Thu, 1 Feb 2024 22:03:19 +0900 Subject: [PATCH 01/11] impl dodge wip --- whitecanvas/canvas/_base.py | 47 +- whitecanvas/canvas/_grid.py | 10 + whitecanvas/canvas/dataframe/__init__.py | 6 +- whitecanvas/canvas/dataframe/_base.py | 184 +++ whitecanvas/canvas/dataframe/_both_cat.py | 91 ++ whitecanvas/canvas/dataframe/_feature_cat.py | 400 +++++++ whitecanvas/canvas/dataframe/_one_cat.py | 633 ++++++++++ whitecanvas/canvas/dataframe/_plot.py | 1094 ------------------ whitecanvas/canvas/dataframe/_utils.py | 132 --- whitecanvas/layers/group/band_collection.py | 89 +- whitecanvas/layers/tabular/__init__.py | 2 + whitecanvas/layers/tabular/_box_like.py | 361 ++---- whitecanvas/layers/tabular/_dataframe.py | 126 +- whitecanvas/layers/tabular/_df_compat.py | 9 +- whitecanvas/layers/tabular/_jitter.py | 127 +- whitecanvas/layers/tabular/_plans.py | 4 +- whitecanvas/layers/tabular/_shared.py | 4 +- 17 files changed, 1597 insertions(+), 1722 deletions(-) create mode 100644 whitecanvas/canvas/dataframe/_base.py create mode 100644 whitecanvas/canvas/dataframe/_both_cat.py create mode 100644 whitecanvas/canvas/dataframe/_feature_cat.py create mode 100644 whitecanvas/canvas/dataframe/_one_cat.py delete mode 100644 whitecanvas/canvas/dataframe/_plot.py delete mode 100644 whitecanvas/canvas/dataframe/_utils.py diff --git a/whitecanvas/canvas/_base.py b/whitecanvas/canvas/_base.py index e0b78061..f400e644 100644 --- a/whitecanvas/canvas/_base.py +++ b/whitecanvas/canvas/_base.py @@ -8,6 +8,7 @@ Iterable, Iterator, Literal, + Sequence, TypeVar, overload, ) @@ -56,6 +57,7 @@ from typing_extensions import Concatenate, ParamSpec, Self _P = ParamSpec("_P") + _DF = TypeVar("_DF") _L = TypeVar("_L", bound=_l.Layer) _L0 = TypeVar("_L0", _l.Bars, _l.Band) @@ -307,7 +309,14 @@ def update_axes( self.y.label.color = color return self - def cat(self, data, update_labels: bool = True) -> 
_df.DataFramePlotter: + def cat( + self, + data: _DF, + *, + x: str | None = None, + y: str | None = None, + update_labels: bool = True, + ) -> _df.FeatureCatPlotter[Self, _DF]: """ Categorize input data for plotting. @@ -328,9 +337,43 @@ def cat(self, data, update_labels: bool = True) -> _df.DataFramePlotter: CategorizedPlot Plotter object. """ - plotter = _df.DataFramePlotter(self, data, update_label=update_labels) + plotter = _df.FeatureCatPlotter(self, data, x, y, update_label=update_labels) return plotter + def cat_x( + self, + data: _DF, + *, + x: str | Sequence[str] | None = None, + y: str | None = None, + update_labels: bool = True, + ) -> _df.OneAxisCatPlotter[Self, _DF]: + return _df.OneAxisCatPlotter( + self, data, x, y, Orientation.VERTICAL, update_labels + ) + + def cat_y( + self, + data: _DF, + *, + x: str | None = None, + y: str | Sequence[str] | None = None, + update_labels: bool = True, + ) -> _df.OneAxisCatPlotter[Self, _DF]: + return _df.OneAxisCatPlotter( + self, data, y, x, Orientation.HORIZONTAL, update_labels + ) + + def cat_xy( + self, + data: _DF, + *, + x: str | Sequence[str] | None = None, + y: str | Sequence[str] | None = None, + update_labels: bool = True, + ) -> _df.BothAxesCatPlotter[Self, _DF]: + return _df.BothAxesCatPlotter(self, data, x, y, update_labels) + def stack_over(self, layer: _L0) -> StackOverPlotter[Self, _L0]: """ Stack new data over the existing layer. 
diff --git a/whitecanvas/canvas/_grid.py b/whitecanvas/canvas/_grid.py index 60081852..edcd0aa5 100644 --- a/whitecanvas/canvas/_grid.py +++ b/whitecanvas/canvas/_grid.py @@ -251,6 +251,16 @@ def _repr_png_(self): return file_obj.read() return None + def _ipython_display_(self, *args: Any, **kwargs: Any) -> Any: + if hasattr(self._backend_object, "_ipython_display_"): + return self._backend_object._ipython_display_(*args, **kwargs) + raise NotImplementedError() + + def _repr_mimebundle_(self, *args: Any, **kwargs: Any) -> dict: + if hasattr(self._backend_object, "_repr_mimebundle_"): + return self._backend_object._repr_mimebundle_(*args, **kwargs) + raise NotImplementedError() + class CanvasVGrid(CanvasGrid): @override diff --git a/whitecanvas/canvas/dataframe/__init__.py b/whitecanvas/canvas/dataframe/__init__.py index 7491ce65..4cb006ad 100644 --- a/whitecanvas/canvas/dataframe/__init__.py +++ b/whitecanvas/canvas/dataframe/__init__.py @@ -1,3 +1,5 @@ -from whitecanvas.canvas.dataframe._plot import DataFramePlotter +from whitecanvas.canvas.dataframe._both_cat import BothAxesCatPlotter +from whitecanvas.canvas.dataframe._feature_cat import FeatureCatPlotter +from whitecanvas.canvas.dataframe._one_cat import OneAxisCatPlotter -__all__ = ["DataFramePlotter"] +__all__ = ["FeatureCatPlotter", "BothAxesCatPlotter", "OneAxisCatPlotter"] diff --git a/whitecanvas/canvas/dataframe/_base.py b/whitecanvas/canvas/dataframe/_base.py new file mode 100644 index 00000000..0a0af604 --- /dev/null +++ b/whitecanvas/canvas/dataframe/_base.py @@ -0,0 +1,184 @@ +from __future__ import annotations + +import itertools +import weakref +from typing import ( + TYPE_CHECKING, + Generic, + Iterator, + Literal, + Sequence, + TypeVar, + Union, +) + +import numpy as np + +from whitecanvas._exceptions import ReferenceDeletedError +from whitecanvas.layers.tabular import _utils + +if TYPE_CHECKING: + from typing_extensions import Self + + from whitecanvas.canvas._base import CanvasBase + from 
whitecanvas.layers.tabular._dataframe import DataFrameWrapper + +_C = TypeVar("_C", bound="CanvasBase") +_DF = TypeVar("_DF") +NStr = Union[str, Sequence[str]] +AggMethods = Literal["min", "max", "mean", "median", "sum", "std"] + + +class BaseCatPlotter(Generic[_C, _DF]): + def __init__( + self, + canvas: _C, + df: _DF, + ): + self._canvas_ref = weakref.ref(canvas) + self._df = df + + def _canvas(self) -> _C: + canvas = self._canvas_ref() + if canvas is None: + raise ReferenceDeletedError("Canvas has been deleted.") + return canvas + + def __enter__(self) -> Self: + return self + + def __exit__(self, *args) -> None: + pass + + +class CatIterator(Generic[_DF]): + def __init__( + self, + df: DataFrameWrapper[_DF], + offsets: tuple[str, ...], + full: bool = True, + ): + self._df = df + self._offsets = offsets + self._full = full + self._cat_map_cache = {} + + @property + def df(self) -> DataFrameWrapper[_DF]: + return self._df + + @property + def offsets(self) -> tuple[str, ...]: + return self._offsets + + def category_map(self, columns: tuple[str, ...] | None = None) -> dict[tuple, int]: + """Calculate how to map category columns to integers.""" + if columns is None: + key = self._offsets + else: + key = tuple(columns) + if key in self._cat_map_cache: + return self._cat_map_cache[key] + if self._full: + each_uni = [_utils.unique(self._df[c], axis=None) for c in key] + _map = {uni: i for i, uni in enumerate(itertools.product(*each_uni))} + else: + group_keys = [sl for sl, _ in self._df.group_by(key)] + labels = np.array(group_keys, dtype=object) + each_uni = [_utils.unique(_l, axis=None) for _l in labels.T] + exists = set(group_keys) + i = 0 + for uni in itertools.product(*each_uni): + if uni not in exists: + continue + _map[uni] = i + i += 1 + self._cat_map_cache[key] = _map + return _map + + def iter_arrays( + self, + by: tuple[str, ...], + dodge: tuple[str, ...] 
| None = None, + ) -> Iterator[tuple[tuple, float, DataFrameWrapper[_DF]]]: + if dodge is None: + dodge = () + if set(self._offsets) > set(by): + raise ValueError( + f"offsets must be a subset of by, got offsets={self._offsets!r} and " + f"by={by!r}" + ) + indices = [by.index(d) for d in self._offsets] + _map = self.category_map(self._offsets) + if not dodge: + for sl, group in self._df.group_by(by): + key = tuple(sl[i] for i in indices) + yield sl, _map[key], group + else: + if set(self._offsets) & set(dodge): + raise ValueError( + f"offsets and dodge must be disjoint, got offsets={self._offsets!r}" + f" and dodge={dodge!r}" + ) + inv_indices = [by.index(d) for d in dodge] + _res_map = self.category_map(dodge) + _nres = len(_res_map) + _width = 0.8 + dmax = (_nres - 1) / 2 / _nres * _width + dd = np.linspace(-dmax, dmax, _nres) + for sl, group in self._df.group_by(by): + key = tuple(sl[i] for i in indices) + res = tuple(sl[i] for i in inv_indices) + yield sl, dd[_res_map[res]] + _map[key], group + + def prep_arrays( + self, + by: tuple[str, ...], + value: str, + dodge: tuple[str, ...] | None = None, + ) -> tuple[list[float], list[np.ndarray], list[tuple]]: + x = [] + arrays = [] + categories = [] + for sl, offset, group in self.iter_arrays(by, dodge): + x.append(offset) + arrays.append(group[value]) + categories.append(sl) + return x, arrays, categories + + def prep_position_map( + self, + by: tuple[str], + dodge: tuple[str, ...] | None = None, + ) -> dict[tuple, float]: + out = {} + for sl, offset, _ in self.iter_arrays(by, dodge): + out[sl] = offset + return out + + def axis_ticks(self) -> tuple[list[float], list[str]]: + pos = [] + labels = [] + for k, v in self.category_map(self._offsets).items(): + pos.append(v) + labels.append("\n".join(map(str, k))) + return pos, labels + + def axis_label(self) -> str: + return "/".join(self._offsets) + + def zoom_factor(self, dodge: tuple[str, ...] 
| None = None) -> float: + """Return the zoom factor for the given dodge.""" + if dodge: + _res_map = self.category_map(dodge) + _nres = len(_res_map) + if _nres == 1: + return 1.0 + _width = 0.8 + dmax = (_nres - 1) / 2 / _nres * _width + return 2 * dmax / (_nres - 1) + else: + return 1.0 + + def categories(self) -> list[tuple]: + return list(self.category_map(self._offsets).keys()) diff --git a/whitecanvas/canvas/dataframe/_both_cat.py b/whitecanvas/canvas/dataframe/_both_cat.py new file mode 100644 index 00000000..e4aa1414 --- /dev/null +++ b/whitecanvas/canvas/dataframe/_both_cat.py @@ -0,0 +1,91 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Sequence, + TypeVar, +) + +from whitecanvas.canvas.dataframe._base import BaseCatPlotter +from whitecanvas.layers import tabular as _lt +from whitecanvas.types import ColormapType + +if TYPE_CHECKING: + from whitecanvas.canvas._base import CanvasBase + + NStr = str | Sequence[str] + +_C = TypeVar("_C", bound="CanvasBase") +_DF = TypeVar("_DF") + + +class BothAxesCatPlotter(BaseCatPlotter[_C, _DF]): + def __init__( + self, + canvas: _C, + df: _DF, + x: str | tuple[str, ...], + y: str | tuple[str, ...], + update_label: bool = False, + ): + super().__init__(canvas, df, update_label) + self._x = x + self._y = y + if update_label: + self._update_xy_label(x, y) + + def _update_xy_label( + self, + x: str | tuple[str, ...], + y: str | tuple[str, ...], + ) -> None: + """Update the x and y labels using the column names""" + canvas = self._canvas() + if not isinstance(x, str): + x = "/".join(x) + if not isinstance(y, str): + y = "/".join(y) + canvas.x.label.text = x + canvas.y.label.text = y + + def add_heatmap( + self, + value: str, + *, + cmap: ColormapType = "inferno", + clim: tuple[float, float] | None = None, + name: str | None = None, + fill: float = 0, + ) -> _lt.DFHeatmap[_DF]: + canvas = self._canvas() + layer = _lt.DFHeatmap.build_heatmap( + self._df, self._x, self._y, value, cmap=cmap, 
clim=clim, name=name, + fill=fill, backend=canvas._get_backend(), + ) # fmt: skip + if self._update_label: + canvas.x.ticks.set_labels(*layer._generate_xticks()) + canvas.y.ticks.set_labels(*layer._generate_yticks()) + return canvas.add_layer(layer) + + +# TODO: add this in agg plotter +# def add_heatmap( +# self, +# value: str, +# *, +# cmap: ColormapType = "inferno", +# clim: tuple[float, float] | None = None, +# name: str | None = None, +# fill: float = 0, +# ) -> _lt.DFHeatmap[_DF]: +# canvas = self._canvas() +# df = parse(self._df) +# df_agg = self._aggregate(df, (x, y), value) +# layer = _lt.DFHeatmap.build_heatmap( +# df_agg, x, y, value, cmap=cmap, clim=clim, name=name, fill=fill, +# backend=canvas._get_backend(), +# ) # fmt: skip +# if self._update_label: +# canvas.x.ticks.set_labels(*layer._generate_xticks()) +# canvas.y.ticks.set_labels(*layer._generate_yticks()) +# return canvas.add_layer(layer) diff --git a/whitecanvas/canvas/dataframe/_feature_cat.py b/whitecanvas/canvas/dataframe/_feature_cat.py new file mode 100644 index 00000000..e5c4c7cd --- /dev/null +++ b/whitecanvas/canvas/dataframe/_feature_cat.py @@ -0,0 +1,400 @@ +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Sequence, + TypeVar, +) + +from whitecanvas.canvas.dataframe._base import BaseCatPlotter +from whitecanvas.layers import tabular as _lt +from whitecanvas.layers.tabular._dataframe import parse +from whitecanvas.types import ArrayLike1D, ColormapType, Orientation + +if TYPE_CHECKING: + from whitecanvas.canvas._base import CanvasBase + + NStr = str | Sequence[str] + +_C = TypeVar("_C", bound="CanvasBase") +_DF = TypeVar("_DF") + + +class FeatureCatPlotter(BaseCatPlotter[_C, _DF]): + """ + Categorical plotter that categorizes the data by features (color, style etc.) 
+ """ + + def __init__( + self, + canvas: _C, + df: _DF, + x: str | None, + y: str | None, + update_label: bool = False, + ): + super().__init__(canvas, df, update_label) + self._x = x + self._y = y + self._update_label = update_label + if update_label: + self._update_xy_label(x, y) + + def _get_x(self) -> str: + if self._x is None: + raise ValueError("Column for x-axis is not set") + return self._x + + def _get_y(self) -> str: + if self._y is None: + raise ValueError("Column for y-axis is not set") + return self._y + + def _update_xy_label(self, x: str | None, y: str | None) -> None: + """Update the x and y labels using the column names""" + canvas = self._canvas() + if isinstance(x, str): + canvas.x.label.text = x + if isinstance(y, str): + canvas.y.label.text = y + + def along_x(self) -> FeatureCatPlotter[_C, _DF]: + return self.__class__( + self._canvas(), self._df, self._get_x(), None, self._update_label + ) + + def along_y(self) -> FeatureCatPlotter[_C, _DF]: + return self.__class__( + self._canvas(), self._df, None, self._get_y(), self._update_label + ) + + def add_line( + self, + *, + name: str | None = None, + color: NStr | None = None, + width: str | None = None, + style: NStr | None = None, + ) -> _lt.DFLines[_DF]: + """ + Add a categorical line plot. + + >>> ### Use "time" column as x-axis and "value" column as y-axis + >>> canvas.cat(df).add_line("time", "value") + + >>> ### Multiple lines colored by column "group" + >>> canvas.cat(df).add_line("time", "value", color="group") + + >>> ### Multiple lines styled by column "group" + >>> canvas.cat(df).add_line("time", "value", style="group") + + Parameters + ---------- + name : str, optional + Name of the layer. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + width : str, optional + Column name for line width. Must be numerical. + style : str or sequence of str, optional + Column name(s) for styling the lines. Must be categorical. 
+ + Returns + ------- + WrappedLines + Line collection layer. + """ + canvas = self._canvas() + layer = _lt.DFLines.from_table( + self._df, self._get_x(), self._get_y(), name=name, color=color, width=width, + style=style, backend=canvas._get_backend(), + ) # fmt: skip + if color is not None and not layer._color_by.is_const(): + layer.with_color(layer._color_by.by, palette=canvas._color_palette) + elif color is None: + layer.with_color(canvas._color_palette.next()) + return canvas.add_layer(layer) + + def add_markers( + self, + *, + name: str | None = None, + color: NStr | None = None, + hatch: NStr | None = None, + size: str | None = None, + symbol: NStr | None = None, + ) -> _lt.DFMarkers[_DF]: + """ + Add a categorical marker plot. + + >>> ### Use "time" column as x-axis and "value" column as y-axis + >>> canvas.cat(df).add_markers("time", "value") + + >>> ### Multiple markers colored by column "group" + >>> canvas.cat(df).add_markers("time", "value", color="group") + + >>> ### Multiple markers with hatches determined by column "group" + >>> canvas.cat(df).add_markers("time", "value", style="group") + + >>> ### Multiple markers with symbols determined by "group" + >>> canvas.cat(df).add_markers("time", "value", symbol="group") + + Parameters + ---------- + x : str + Column name for x-axis. + y : str + Column name for y-axis. + name : str, optional + Name of the layer. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + hatch : str or sequence of str, optional + Column name(s) for hatches. Must be categorical. + size : str, optional + Column name for marker size. Must be numerical. + symbol : str or sequence of str, optional + Column name(s) for symbols. Must be categorical. + + Returns + ------- + WrappedMarkers + Marker collection layer. 
+ """ + canvas = self._canvas() + df = parse(self._df) + layer = _lt.DFMarkers( + df, self._get_x(), self._get_y(), name=name, color=color, hatch=hatch, + size=size, symbol=symbol, backend=canvas._get_backend(), + ) # fmt: skip + if color is not None and not layer._color_by.is_const(): + layer.with_color(layer._color_by.by, palette=canvas._color_palette) + elif color is None: + layer.with_color(canvas._color_palette.next()) + return canvas.add_layer(layer) + + def add_bar( + self, + *, + name: str | None = None, + color: NStr | None = None, + hatch: NStr | None = None, + extent: float = 0.8, + ) -> _lt.DFBars[_DF]: + """ + Add a categorical bar plot. + + >>> ### Use "time" column as x-axis and "value" column as y-axis + >>> canvas.cat(df).add_bar("time", "value") + + >>> ### Multiple bars colored by column "group" + >>> canvas.cat(df).add_bar("time", "value", color="group") + + >>> ### Multiple bars with hatches determined by column "group" + >>> canvas.cat(df).add_bar("time", "value", hatch="group") + + Parameters + ---------- + x : str + Column name for x-axis. + y : str + Column name for y-axis. + name : str, optional + Name of the layer. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + hatch : str or sequence of str, optional + Column name(s) for hatches. Must be categorical. + extent : float, optional + Width of the bars. Usually in range (0, 1]. + + Returns + ------- + WrappedBars + Bar collection layer. 
+ """ + canvas = self._canvas() + layer = _lt.DFBars.from_table( + self._df, self._get_x(), self._get_y(), name=name, color=color, hatch=hatch, + extent=extent, backend=canvas._get_backend(), + ) # fmt: skip + if color is not None and not layer._color_by.is_const(): + layer.with_color(layer._color_by.by, palette=canvas._color_palette) + elif color is None: + layer.with_color(canvas._color_palette.next()) + return canvas.add_layer(layer) + + def add_hist2d( + self, + *, + cmap: ColormapType = "inferno", + name: str | None = None, + bins: int | tuple[int, int] = 10, + range: tuple[tuple[float, float], tuple[float, float]] | None = None, + density: bool = False, + ): + """Add 2-D histogram of given columns.""" + canvas = self._canvas() + layer = _lt.DFHeatmap.build_hist( + self._df, self._get_x(), self._get_y(), cmap=cmap, name=name, bins=bins, + range=range, density=density, backend=canvas._get_backend(), + ) # fmt: skip + return canvas.add_layer(layer) + + def add_pointplot( + self, + *, + name: str | None = None, + color: NStr | None = None, + hatch: NStr | None = None, + size: float | None = None, + capsize: float = 0.15, + ): + canvas = self._canvas() + layer = _lt.DFPointPlot2D( + parse(self._df), self._get_x(), self._get_y(), name=name, color=color, + hatch=hatch, size=size, capsize=capsize, backend=canvas._get_backend(), + ) # fmt: skip + return canvas.add_layer(layer) + + def add_hist( + self, + *, + bins: int | ArrayLike1D = 10, + range: tuple[float, float] | None = None, + density: bool = False, + name: str | None = None, + color: NStr | None = None, + hatch: NStr | None = None, + ): + raise NotImplementedError + + def add_hist_line( + self, + *, + bins: int | ArrayLike1D = 10, + range: tuple[float, float] | None = None, + density: bool = False, + name: str | None = None, + color: NStr | None = None, + width: str | None = None, + style: NStr | None = None, + ): + """ + Add lines representing histograms. 
+ + >>> ### Use "value" column as x-axis + >>> canvas.cat(df).add_line_hist("value", bins=8, density=True) + + >>> ### Multiple histograms colored by column "group" + >>> canvas.cat(df).add_line_hist("value", color="group") + + Parameters + ---------- + bins : int or array-like, default 10 + If an integer, the number of bins. If an array, the bin edges. + range : (float, float), default None + If provided, the lower and upper range of the bins. + density : bool, default False + If True, the total area of the histogram will be normalized to 1. + name : str, optional + Name of the layer. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + width : str, optional + Column name for line width. Must be numerical. + style : str or sequence of str, optional + Column name(s) for styling the lines. Must be categorical. + + Returns + ------- + WrappedLines + Line collection layer. + """ + canvas = self._canvas() + x0, orient = self._column_and_orient() + layer = _lt.DFLines.build_hist( + self._df, x0, bins=bins, range=range, density=density, name=name, + orient=orient, color=color, width=width, style=style, + backend=canvas._get_backend(), + ) # fmt: skip + if color is not None and not layer._color_by.is_const(): + layer.with_color(layer._color_by.by, palette=canvas._color_palette) + elif color is None: + layer.with_color(canvas._color_palette.next()) + if self._update_label: + ax_label = "density" if density else "count" + if orient.is_vertical: + canvas.y.label.text = ax_label + else: + canvas.x.label.text = ax_label + return canvas.add_layer(layer) + + def add_kde( + self, + *, + band_width: float | None = None, + name: str | None = None, + color: NStr | None = None, + width: str | None = None, + style: NStr | None = None, + ): + """ + Add lines representing kernel density estimation. 
+ + >>> ### Use "value" column as x-axis + >>> canvas.cat(df).add_kde("value") + + >>> ### Multiple KDEs colored by column "group" + >>> canvas.cat(df).add_kde("value", color="group") + + Parameters + ---------- + band_width : float, default None + Bandwidth of the kernel density estimation. If None, use Scott's rule. + name : str, optional + Name of the layer. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + width : str, optional + Column name for line width. Must be numerical. + style : str or sequence of str, optional + Column name(s) for styling the lines. Must be categorical. + + Returns + ------- + WrappedLines + Line collection layer. + """ + canvas = self._canvas() + x0, orient = self._column_and_orient() + layer = _lt.DFLines.build_kde( + self._df, x0, band_width=band_width, name=name, + orient=orient, color=color, width=width, style=style, + backend=canvas._get_backend(), + ) # fmt: skip + if color is not None and not layer._color_by.is_const(): + layer.with_color(layer._color_by.by, palette=canvas._color_palette) + elif color is None: + layer.with_color(canvas._color_palette.next()) + if self._update_label: + ax_label = "density" + if orient.is_vertical: + canvas.y.label.text = ax_label + else: + canvas.x.label.text = ax_label + return canvas.add_layer(layer) + + def _column_and_orient(self) -> tuple[str, Orientation]: + if self._x is None and self._y is None: + raise ValueError("Column for either x- or y-axis must be set") + elif self._x is not None and self._y is not None: + raise ValueError("Only one of x- or y-axis can be set") + elif self._x is not None: + return self._x, Orientation.VERTICAL + else: + return self._y, Orientation.HORIZONTAL + + +class FeatureCatAggPlotter: + ... 
diff --git a/whitecanvas/canvas/dataframe/_one_cat.py b/whitecanvas/canvas/dataframe/_one_cat.py new file mode 100644 index 00000000..4cfcac4b --- /dev/null +++ b/whitecanvas/canvas/dataframe/_one_cat.py @@ -0,0 +1,633 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Sequence, TypeVar + +from whitecanvas import theme +from whitecanvas.canvas.dataframe._base import AggMethods, BaseCatPlotter, CatIterator +from whitecanvas.layers import tabular as _lt +from whitecanvas.layers.tabular import _jitter, _shared, parse +from whitecanvas.types import ColorType, Hatch, Orientation, Symbol + +if TYPE_CHECKING: + from whitecanvas.canvas._base import CanvasBase + from whitecanvas.layers.tabular._box_like import _BoxLikeMixin + from whitecanvas.layers.tabular._dataframe import DataFrameWrapper + + NStr = str | Sequence[str] + +_C = TypeVar("_C", bound="CanvasBase") +_DF = TypeVar("_DF") + + +class OneAxisCatPlotter(BaseCatPlotter[_C, _DF]): + def __init__( + self, + canvas: _C, + df: _DF, + offset: str | tuple[str, ...], + value: str | None, + orient: Orientation, + update_label: bool = False, + ): + super().__init__(canvas, df) + if isinstance(offset, str): + offset = (offset,) + self._offset = offset + self._cat_iter = CatIterator(parse(df), offset) + self._value = value + self._orient = orient + self._update_label = update_label + if update_label: + if value is not None: + self._update_axis_labels(value) + pos, label = self._cat_iter.axis_ticks() + if self._orient.is_vertical: + canvas.x.ticks.set_labels(pos, label) + else: + canvas.y.ticks.set_labels(pos, label) + + def __repr__(self) -> str: + return ( + f"{type(self).__name__}(offset={self._offset!r}, value={self._value!r}, " + f"orient={self._orient!r})" + ) + + def _update_axis_labels(self, value_label: str) -> None: + """Update the x and y labels using the column names""" + canvas = self._canvas() + offset_label = self._cat_iter.axis_label() + if self._orient.is_vertical: + 
canvas.x.label.text = offset_label + canvas.y.label.text = value_label + else: + canvas.x.label.text = value_label + canvas.y.label.text = offset_label + + def _get_value(self) -> str: + if self._value is None: + raise ValueError("Value column is not specified.") + return self._value + + def _update_xy_ticks(self, pos, label): + """Update the x or y ticks to categorical ticks""" + canvas = self._canvas() + if self._orient.is_vertical: + canvas.x.ticks.set_labels(pos, label) + else: + canvas.y.ticks.set_labels(pos, label) + + ### 1-D categorical ### + + def add_violinplot( + self, + *, + name: str | None = None, + color: NStr | None = None, + hatch: NStr | None = None, + dodge: NStr | bool | None = None, + extent: float = 0.8, + shape: str = "both", + ) -> _lt.DFViolinPlot[_DF]: + """ + Add a categorical violin plot. + + >>> ### Use "species" column as categories and "weight" column as values. + >>> canvas.cat_x(df, x="species", y="weight").add_violinplot() + + >>> ### Color by column "region" with dodging. + >>> offset = ["species", "region"] # categories that offset will be added + >>> canvas.cat(df).add_violinplot(offset, "weight", color="region") + + Parameters + ---------- + name : str, optional + Name of the layer. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + hatch : str or sequence of str, optional + Column name(s) for hatches. Must be categorical. + extent : float, default 0.8 + Width of the violins. Usually in range (0, 1]. + shape : str, default "both" + Shape of the violins. Can be "both", "left", or "right". + + + Returns + ------- + WrappedViolinPlot + Violin plot layer. 
+ """ + canvas = self._canvas() + layer = _lt.DFViolinPlot( + self._cat_iter, self._get_value(), name=name, color=color, hatch=hatch, + dodge=dodge, extent=extent, shape=shape, orient=self._orient, + backend=canvas._get_backend(), + ) # fmt: skip + self._post_add_boxlike(layer, color) + return canvas.add_layer(layer) + + def add_boxplot( + self, + *, + color: NStr | None = None, + hatch: NStr | None = None, + dodge: NStr | bool | None = None, + name: str | None = None, + capsize: float = 0.1, + extent: float = 0.8, + ) -> _lt.DFBoxPlot[_DF]: + """ + Add a categorical box plot. + + >>> ### Use "species" column as categories and "weight" column as values. + >>> canvas.cat(df).add_boxplot("species", "weight") + + >>> ### Color by column "region" with dodging. + >>> offset = ["species", "region"] # categories that offset will be added + >>> canvas.cat(df).add_boxplot(offset, "weight", color="region") + + Parameters + ---------- + offset : tuple of str + Column name(s) for x-axis. + value : str + Column name for y-axis. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + hatch : str or sequence of str, optional + Column name(s) for hatches. Must be categorical. + name : str, optional + Name of the layer. + orient : str, default "vertical" + Orientation of the violins. Can be "vertical" or "horizontal". + capsize : float, default 0.1 + Length of the caps as a fraction of the width of the box. + extent : float, default 0.8 + Width of the violins. Usually in range (0, 1]. + + Returns + ------- + WrappedBoxPlot + Box plot layer. 
+ """ + canvas = self._canvas() + layer = _lt.DFBoxPlot( + self._cat_iter, self._get_value(), name=name, color=color, hatch=hatch, + dodge=dodge, orient=self._orient, capsize=capsize, extent=extent, + backend=canvas._get_backend(), + ) # fmt: skip + self._post_add_boxlike(layer, color) + return canvas.add_layer(layer) + + def add_pointplot( + self, + *, + color: NStr | None = None, + hatch: NStr | None = None, + dodge: NStr | bool | None = None, + name: str | None = None, + capsize: float = 0.1, + ) -> _lt.DFPointPlot[_DF]: + """ + Add a categorical point plot (markers with error bars). + + >>> ### Use "species" column as categories and "weight" column as values. + >>> canvas.cat(df).add_pointplot("species", "weight") + + >>> ### Color by column "region" with dodging. + >>> offset = ["species", "region"] # categories that offset will be added + >>> canvas.cat(df).add_pointplot(offset, "weight", color="region") + + The default estimator and errors are mean and standard deviation. To change + them, use `est_by_*` and `err_by_*` methods. + + >>> ### Use standard error x 2 (~95%) as error bars. + >>> canvas.cat(df).add_pointplot("species", "weight").err_by_se(scale=2.0) + + Parameters + ---------- + offset : tuple of str + Column name(s) for x-axis. + value : str + Column name for y-axis. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + hatch : str or sequence of str, optional + Column name(s) for hatches. Must be categorical. + name : str, optional + Name of the layer. + orient : str, default "vertical" + Orientation of the violins. Can be "vertical" or "horizontal". + capsize : float, default 0.1 + Length of the caps as a fraction of the width of the box. + + Returns + ------- + WrappedPointPlot + Point plot layer. 
+ """ + canvas = self._canvas() + layer = _lt.DFPointPlot( + self._cat_iter, self._get_value(), name=name, color=color, hatch=hatch, + dodge=dodge, orient=self._orient, capsize=capsize, + backend=canvas._get_backend(), + ) # fmt: skip + self._post_add_boxlike(layer, color) + return canvas.add_layer(layer) + + def add_barplot( + self, + *, + color: NStr | None = None, + hatch: NStr | None = None, + dodge: NStr | bool | None = None, + name: str | None = None, + capsize: float = 0.1, + extent: float = 0.8, + ) -> _lt.DFBarPlot[_DF]: + """ + Add a categorical bar plot (bars with error bars). + + >>> ### Use "species" column as categories and "weight" column as values. + >>> canvas.cat(df).add_barplot("species", "weight") + + >>> ### Color by column "region" with dodging. + >>> offset = ["species", "region"] # categories that offset will be added + >>> canvas.cat(df).add_barplot(offset, "weight", color="region") + + The default estimator and errors are mean and standard deviation. To change + them, use `est_by_*` and `err_by_*` methods. + + >>> ### Use standard error x 2 (~95%) as error bars. + >>> canvas.cat(df).add_barplot("species", "weight").err_by_se(scale=2.0) + + Parameters + ---------- + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + hatch : str or sequence of str, optional + Column name(s) for hatches. Must be categorical. + name : str, optional + Name of the layer. + capsize : float, default 0.1 + Length of the caps as a fraction of the width of the box. + extent : float, default 0.8 + Width of the violins. Usually in range (0, 1]. + + Returns + ------- + WrappedBarPlot + Bar plot layer. 
+        """
+        canvas = self._canvas()
+        layer = _lt.DFBarPlot(
+            self._cat_iter, self._get_value(), name=name, color=color, hatch=hatch,
+            dodge=dodge, orient=self._orient, capsize=capsize, extent=extent,
+            backend=canvas._get_backend(),
+        )  # fmt: skip
+        self._post_add_boxlike(layer, color)
+        return canvas.add_layer(layer)
+
+    def _post_add_boxlike(self, layer: _BoxLikeMixin, color) -> None:
+        """
+        Finalize the colors of a freshly built box-like layer.
+
+        Parameters
+        ----------
+        layer : _BoxLikeMixin
+            Layer to update in place.
+        color : str, sequence of str or None
+            The ``color`` argument received by the calling ``add_*`` method.
+        """
+        canvas = self._canvas()
+        # Color varies by column: let the layer draw from the canvas palette.
+        if color is not None and not layer._color_by.is_const():
+            layer.with_color_palette(canvas._color_palette)
+        # No color requested: consume the next single color of the palette.
+        elif color is None:
+            layer.with_color(canvas._color_palette.next())
+        # Otherwise a constant color was given and the layer is left as built.
+
+    def add_stripplot(
+        self,
+        *,
+        color: NStr | None = None,
+        hatch: NStr | None = None,
+        symbol: NStr | None = None,
+        size: str | None = None,
+        dodge: NStr | bool | None = None,
+        name: str | None = None,
+        extent: float = 0.5,
+        seed: int | None = 0,
+    ) -> _lt.DFMarkerGroups[_DF]:
+        """
+        Add a categorical strip plot.
+
+        The categories, the value column and the orientation are taken from the
+        plotter itself, so every argument here is keyword-only.
+
+        >>> ### Use "species" column as categories and "weight" column as values.
+        >>> canvas.cat_x(df, x="species", y="weight").add_stripplot()
+
+        >>> ### Color by column "region" with dodging.
+        >>> canvas.cat_x(df, x="species", y="weight").add_stripplot(
+        ...     color="region", dodge=True
+        ... )
+
+        Parameters
+        ----------
+        color : str or sequence of str, optional
+            Column name(s) for coloring the lines. Must be categorical.
+        hatch : str or sequence of str, optional
+            Column name(s) for hatches. Must be categorical.
+        symbol : str or sequence of str, optional
+            Column name(s) for symbols. Must be categorical.
+        size : str, optional
+            Column name for marker size. Must be numerical.
+        dodge : str, sequence of str or bool, optional
+            Column name(s) to dodge by. If True, the columns given to ``color``
+            and ``hatch`` are used.
+        name : str, optional
+            Name of the layer.
+        extent : float, default 0.5
+            Extent of the jitter, scaled by the dodge zoom factor before it is
+            given to the uniform jitter. Usually in range (0, 1].
+        seed : int, optional
+            Random seed for jittering.
+
+        Returns
+        -------
+        DFMarkerGroups
+            Marker collection layer.
+        """
+        canvas = self._canvas()
+        symbol = theme._default("markers.symbol", symbol)
+        size = theme._default("markers.size", size)
+
+        df = parse(self._df)
+        # Resolve which columns split the data and which of them are dodged.
+        splitby, dodge = _splitby_dodge(df, self._offset, color, hatch, dodge)
+        _map = self._cat_iter.prep_position_map(splitby, dodge)
+        # Scale the requested extent by the iterator's zoom factor for `dodge`.
+        _extent = self._cat_iter.zoom_factor(dodge) * extent
+        xj = _jitter.UniformJitter(splitby, _map, extent=_extent, seed=seed)
+        yj = _jitter.IdentityJitter(self._get_value()).check(df)
+        # The jitters are built for the vertical case; swap them otherwise.
+        if not self._orient.is_vertical:
+            xj, yj = yj, xj
+        layer = _lt.DFMarkerGroups(
+            df, xj, yj, name=name, color=color, hatch=hatch, orient=self._orient,
+            symbol=symbol, size=size, backend=canvas._get_backend(),
+        )  # fmt: skip
+        # Color by column -> use the canvas palette; no color -> next palette color.
+        if color is not None and not layer._color_by.is_const():
+            layer.with_color(layer._color_by.by, palette=canvas._color_palette)
+        elif color is None:
+            layer.with_color(canvas._color_palette.next())
+        return canvas.add_layer(layer)
+
+    def add_swarmplot(
+        self,
+        *,
+        color: NStr | None = None,
+        hatch: NStr | None = None,
+        symbol: NStr | None = None,
+        size: str | None = None,
+        dodge: NStr | bool | None = None,
+        name: str | None = None,
+        extent: float = 0.8,
+        sort: bool = False,
+    ) -> _lt.DFMarkerGroups[_DF]:
+        """
+        Add a categorical swarm plot.
+
+        The categories, the value column and the orientation are taken from the
+        plotter itself, so every argument here is keyword-only.
+
+        >>> ### Use "species" column as categories and "weight" column as values.
+        >>> canvas.cat_x(df, x="species", y="weight").add_swarmplot()
+
+        >>> ### Color by column "region" with dodging.
+        >>> canvas.cat_x(df, x="species", y="weight").add_swarmplot(
+        ...     color="region", dodge=True
+        ... )
+
+        Parameters
+        ----------
+        color : str or sequence of str, optional
+            Column name(s) for coloring the lines. Must be categorical.
+        hatch : str or sequence of str, optional
+            Column name(s) for hatches. Must be categorical.
+        symbol : str or sequence of str, optional
+            Column name(s) for symbols. Must be categorical.
+        size : str, optional
+            Column name for marker size. Must be numerical.
+        dodge : str, sequence of str or bool, optional
+            Column name(s) to dodge by. If True, the columns given to ``color``
+            and ``hatch`` are used.
+        name : str, optional
+            Name of the layer.
+        extent : float, default 0.8
+            Extent of the swarm, scaled by the dodge zoom factor before it is
+            given to the swarm jitter. Usually in range (0, 1].
+        sort : bool, default False
+            Whether to sort the data by value.
+
+        Returns
+        -------
+        DFMarkerGroups
+            Marker collection layer.
+        """
+        canvas = self._canvas()
+        symbol = theme._default("markers.symbol", symbol)
+        size = theme._default("markers.size", size)
+        df = parse(self._df)
+        # Resolve which columns split the data and which of them are dodged.
+        splitby, dodge = _splitby_dodge(df, self._offset, color, hatch, dodge)
+        _map = self._cat_iter.prep_position_map(splitby, dodge)
+        _extent = self._cat_iter.zoom_factor(dodge) * extent
+
+        val = self._get_value()
+        if sort:
+            df = df.sort(val)
+        # The swarm jitter is given the full value range of the column.
+        lims = df[val].min(), df[val].max()
+        xj = _jitter.SwarmJitter(splitby, _map, val, limits=lims, extent=_extent)
+        yj = _jitter.IdentityJitter(val).check(df)
+        # The jitters are built for the vertical case; swap them otherwise.
+        if not self._orient.is_vertical:
+            xj, yj = yj, xj
+        layer = _lt.DFMarkerGroups(
+            df, xj, yj, name=name, color=color, hatch=hatch, orient=self._orient,
+            symbol=symbol, size=size, backend=canvas._get_backend(),
+        )  # fmt: skip
+        if color is not None and not layer._color_by.is_const():
+            layer.with_color(layer._color_by.by, palette=canvas._color_palette)
+        elif color is None:
+            layer.with_color(canvas._color_palette.next())
+        return canvas.add_layer(layer)
+
+    def add_countplot(
+        self,
+        *,
+        color: NStr | None = None,
+        hatch: NStr | None = None,
+        name: str | None = None,
+        extent: float = 0.8,
+    ) -> _lt.DFBars[_DF]:
+        """
+        Add a categorical count plot.
+
+        >>> ### Count for each category in column "species".
+        >>> canvas.cat_x(df, x="species").add_countplot()
+
+        >>> ### Color by column "region".
+        >>> canvas.cat_x(df, x="species").add_countplot(color="region")
+
+        Parameters
+        ----------
+        color : str or sequence of str, optional
+            Column name(s) for coloring the lines. Must be categorical.
+        hatch : str or sequence of str, optional
+            Column name(s) for hatches.
Must be categorical.
+        name : str, optional
+            Name of the layer.
+        extent : float, default 0.8
+            Width of the bars. Usually in range (0, 1].
+
+        Returns
+        -------
+        DFBars
+            Bar collection layer.
+        """
+        canvas = self._canvas()
+        layer = _lt.DFBars.build_count(
+            self._df, self._offset, color=color, hatch=hatch, orient=self._orient,
+            extent=extent, name=name, backend=canvas._get_backend(),
+        )  # fmt: skip
+        if color is not None and not layer._color_by.is_const():
+            layer.with_color(layer._color_by.by, palette=canvas._color_palette)
+        elif color is None:
+            layer.with_color(canvas._color_palette.next())
+        if self._update_label:
+            self._update_axis_labels("count")
+        return canvas.add_layer(layer)
+
+    def agg(self, method: AggMethods = "mean") -> OneAxisCatAggPlotter[_C, _DF]:
+        """
+        Return a plotter that draws the aggregated value of each category.
+
+        Parameters
+        ----------
+        method : AggMethods, default "mean"
+            Aggregation method. It is stored on the returned plotter and later
+            handed to ``DataFrameWrapper.agg_by``.
+
+        Returns
+        -------
+        OneAxisCatAggPlotter
+            Plotter sharing the canvas, category iterator, offset, value column
+            and orientation of this plotter.
+        """
+        # Pass the category iterator, not the raw data frame: the constructor
+        # reads ``cat_iter._df`` and the ``add_*`` methods of the returned
+        # plotter call ``cat_iter.category_map()``.
+        return OneAxisCatAggPlotter(
+            self._canvas(),
+            self._cat_iter,
+            offset=self._offset,
+            value=self._get_value(),
+            method=method,
+            orient=self._orient,
+        )
+
+
+class OneAxisCatAggPlotter(BaseCatPlotter[_C, _DF]):
+    """Plotter that adds layers built from per-category aggregated values."""
+
+    def __init__(
+        self,
+        canvas: _C,
+        cat_iter: CatIterator[_DF],
+        offset: str | tuple[str, ...],
+        value: str,
+        method: AggMethods,
+        orient: Orientation,
+    ):
+        """
+        Parameters
+        ----------
+        canvas : CanvasBase
+            Canvas the layers are added to.
+        cat_iter : CatIterator
+            Category iterator; its ``_df`` is used as the data of this plotter.
+        offset : str or tuple of str
+            Column name(s) that define the categories.
+        value : str
+            Column name of the values to aggregate.
+        method : AggMethods
+            Aggregation method.
+        orient : Orientation
+            Orientation of the plot.
+        """
+        super().__init__(canvas, cat_iter._df)
+        self._offset = offset
+        self._value = value
+        self._agg_method = method
+        self._orient = orient
+        self._cat_iter = cat_iter
+
+    def add_line(
+        self,
+        *,
+        name: str | None = None,
+        color: NStr | None = None,
+        width: str | None = None,
+        style: NStr | None = None,
+    ) -> _lt.DFLines[_DF]:
+        """
+        Add line that connect the aggregated values.
+
+        >>> canvas.cat_x(df, x="time", y="value").agg("mean").add_line()
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the layer.
+        color : str or sequence of str, optional
+            Column name(s) for coloring the lines. Must be categorical.
+        width : str, optional
+            Column name for line width. Must be numerical.
+        style : str or sequence of str, optional
+            Column name(s) for styling the lines. Must be categorical.
+
+        Returns
+        -------
+        DFLines
+            Line collection layer.
+        """
+        canvas = self._canvas()
+        df = parse(self._df)
+        # Aggregate per unique combination of the offset, color and style columns.
+        _joined = _shared.join_columns(self._offset, color, style, source=df)
+        df_agg = self._aggregate(df, _joined, self._value)
+        xj = _jitter.CategoricalJitter(self._offset, self._cat_iter.category_map())
+        yj = _jitter.IdentityJitter(self._value).check(df_agg)
+        # The jitters are built for the vertical case; swap them otherwise.
+        if not self._orient.is_vertical:
+            xj, yj = yj, xj
+        layer = _lt.DFLines.from_table(
+            df_agg, xj, yj, name=name, color=color, width=width, style=style,
+            backend=canvas._get_backend(),
+        )  # fmt: skip
+        # Color by column -> use the canvas palette; no color -> next palette color.
+        if color is not None and not layer._color_by.is_const():
+            layer.with_color(color, palette=canvas._color_palette)
+        elif color is None:
+            layer.with_color(canvas._color_palette.next())
+        return canvas.add_layer(layer)
+
+    def add_markers(
+        self,
+        *,
+        name: str | None = None,
+        color: NStr | ColorType | None = None,
+        hatch: NStr | Hatch | None = None,
+        size: str | float | None = None,
+        symbol: NStr | Symbol | None = None,
+    ) -> _lt.DFMarkers[_DF]:
+        """
+        Add markers that represent the aggregated values.
+
+        >>> canvas.cat_x(df, x="time", y="value").agg("mean").add_markers()
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the layer.
+        color : str or sequence of str, optional
+            Column name(s) for coloring the lines. Must be categorical.
+        hatch : str or sequence of str, optional
+            Column name(s) for hatches. Must be categorical.
+        size : str, optional
+            Column name for marker size. Must be numerical.
+        symbol : str or sequence of str, optional
+            Column name(s) for symbols. Must be categorical.
+
+        Returns
+        -------
+        DFMarkers
+            Marker collection layer.
+        """
+        canvas = self._canvas()
+        df = parse(self._df)
+        _joined = _shared.join_columns(self._offset, color, hatch, symbol, source=df)
+        df_agg = self._aggregate(df, _joined, self._value)
+        xj = _jitter.CategoricalJitter(self._offset, self._cat_iter.category_map())
+        yj = _jitter.IdentityJitter(self._value).check(df_agg)
+        if not self._orient.is_vertical:
+            xj, yj = yj, xj
+        layer = _lt.DFMarkers(
+            df_agg, xj, yj, name=name, color=color, hatch=hatch, size=size,
+            symbol=symbol, backend=canvas._get_backend(),
+        )  # fmt: skip
+        if color is not None and not layer._color_by.is_const():
+            layer.with_color(color, palette=canvas._color_palette)
+        elif color is None:
+            layer.with_color(canvas._color_palette.next())
+        return canvas.add_layer(layer)
+
+    def _aggregate(
+        self,
+        df: DataFrameWrapper,
+        by: tuple[str, ...],
+        on: str,
+    ) -> DataFrameWrapper[_DF]:
+        """Aggregate column `on` of `df` grouped by `by` with the stored method."""
+        return df.agg_by(by, on, self._agg_method)
+
+
+def _splitby_dodge(
+    source: DataFrameWrapper[_DF],
+    offset: str | tuple[str, ...],
+    color: str | tuple[str, ...] | None = None,
+    hatch: str | tuple[str, ...] | None = None,
+    dodge: str | tuple[str, ...] | bool | None = None,
+) -> tuple[tuple[str, ...], tuple[str, ...] | None]:
+    """
+    Resolve the columns that split the data and the columns that are dodged.
+
+    Parameters
+    ----------
+    source : DataFrameWrapper
+        Data frame the column names are joined against.
+    offset : str or tuple of str
+        Column name(s) that define the categories.
+    color, hatch : str or tuple of str, optional
+        Styling columns; used as the dodge columns when ``dodge`` is True.
+    dodge : str, sequence of str, bool or None
+        Explicit dodge column(s), True to dodge by ``color`` and ``hatch``, or
+        False/None for no dodging.
+
+    Returns
+    -------
+    (tuple of str, tuple of str or None)
+        The joined offset and dodge columns, and the normalized dodge columns
+        (None when dodging was not requested).
+    """
+    if isinstance(offset, str):
+        offset = (offset,)
+    if isinstance(dodge, bool):
+        # True -> dodge by the styling columns. False means "no dodging" and is
+        # mapped to None, the value the callers pass by default, instead of
+        # leaking the bool into `join_columns` and the return value.
+        dodge = _shared.join_columns(color, hatch, source=source) if dodge else None
+    elif isinstance(dodge, str):
+        dodge = (dodge,)
+    elif dodge is not None:
+        # Any other sequence (e.g. a list) is normalized to the annotated tuple.
+        dodge = tuple(dodge)
+    splitby = _shared.join_columns(offset, dodge, source=source)
+    return splitby, dodge
diff --git a/whitecanvas/canvas/dataframe/_plot.py b/whitecanvas/canvas/dataframe/_plot.py deleted file mode 100644 index fe710d6f..00000000 --- a/whitecanvas/canvas/dataframe/_plot.py +++ /dev/null @@ -1,1094 +0,0 @@ -from __future__ import annotations - -import weakref -from typing import ( - TYPE_CHECKING, - Generic, - Sequence, - TypeVar, - Union, -) - -import numpy as np - -from whitecanvas import theme -from whitecanvas._exceptions import ReferenceDeletedError -from whitecanvas.canvas.dataframe._utils import PlotArg -from whitecanvas.layers import tabular as _lt -from whitecanvas.layers.tabular._dataframe import parse -from whitecanvas.types import ( - ArrayLike1D, - ColormapType, - ColorType, - Hatch, - Orientation, - Symbol, -) - -if TYPE_CHECKING: - from whitecanvas.canvas._base import CanvasBase - from whitecanvas.layers.tabular._box_like import _BoxLikeMixin - from whitecanvas.layers.tabular._dataframe import DataFrameWrapper - -_C = TypeVar("_C", bound="CanvasBase") -_DF = TypeVar("_DF") -NStr = Union[str, Sequence[str]] -_Orientation = Union[str, Orientation] - - -class _Plotter(Generic[_C, _DF]): - def __init__( - self, - canvas: _C, - df: _DF, - update_label: bool = False, - ): - self._canvas_ref = weakref.ref(canvas) - self._df = df - self._update_label = update_label - - def _canvas(self) -> _C: - canvas = self._canvas_ref() - if canvas is None: - raise ReferenceDeletedError("Canvas has been deleted.") - return canvas - - def _update_xy_label( - self, - x: str | tuple[str, ...], - y: str | tuple[str, ...], - orient: Orientation = Orientation.VERTICAL, - ) -> None: - """Update the x and y labels 
using the column names""" - canvas = self._canvas() - if not isinstance(x, str): - x = "/".join(x) - if not isinstance(y, str): - y = "/".join(y) - if orient.is_vertical: - canvas.x.label.text = x - canvas.y.label.text = y - else: - canvas.x.label.text = y - canvas.y.label.text = x - - def _update_xy_ticks(self, pos, label, orient: Orientation = Orientation.VERTICAL): - """Update the x or y ticks to categorical ticks""" - canvas = self._canvas() - if orient.is_vertical: - canvas.x.ticks.set_labels(pos, label) - else: - canvas.y.ticks.set_labels(pos, label) - - -class DataFramePlotter(_Plotter[_C, _DF]): - ### 0-D categorical ### - def add_line( - self, - x: str, - y: str, - *, - name: str | None = None, - color: NStr | None = None, - width: str | None = None, - style: NStr | None = None, - ) -> _lt.DFLines[_DF]: - """ - Add a categorical line plot. - - >>> ### Use "time" column as x-axis and "value" column as y-axis - >>> canvas.cat(df).add_line("time", "value") - - >>> ### Multiple lines colored by column "group" - >>> canvas.cat(df).add_line("time", "value", color="group") - - >>> ### Multiple lines styled by column "group" - >>> canvas.cat(df).add_line("time", "value", style="group") - - Parameters - ---------- - x : str - Column name for x-axis. - y : str - Column name for y-axis. - name : str, optional - Name of the layer. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - width : str, optional - Column name for line width. Must be numerical. - style : str or sequence of str, optional - Column name(s) for styling the lines. Must be categorical. - - Returns - ------- - WrappedLines - Line collection layer. 
- """ - canvas = self._canvas() - layer = _lt.DFLines.from_table( - self._df, x, y, name=name, color=color, width=width, style=style, - backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - self._update_xy_label(x, y) - return canvas.add_layer(layer) - - def add_markers( - self, - x: str, - y: str, - *, - name: str | None = None, - color: NStr | None = None, - hatch: NStr | None = None, - size: str | None = None, - symbol: NStr | None = None, - ) -> _lt.DFMarkers[_DF]: - """ - Add a categorical marker plot. - - >>> ### Use "time" column as x-axis and "value" column as y-axis - >>> canvas.cat(df).add_markers("time", "value") - - >>> ### Multiple markers colored by column "group" - >>> canvas.cat(df).add_markers("time", "value", color="group") - - >>> ### Multiple markers with hatches determined by column "group" - >>> canvas.cat(df).add_markers("time", "value", style="group") - - >>> ### Multiple markers with symbols determined by "group" - >>> canvas.cat(df).add_markers("time", "value", symbol="group") - - Parameters - ---------- - x : str - Column name for x-axis. - y : str - Column name for y-axis. - name : str, optional - Name of the layer. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - size : str, optional - Column name for marker size. Must be numerical. - symbol : str or sequence of str, optional - Column name(s) for symbols. Must be categorical. - - Returns - ------- - WrappedMarkers - Marker collection layer. 
- """ - canvas = self._canvas() - layer = _lt.DFMarkers.from_table( - self._df, x, y, name=name, color=color, hatch=hatch, size=size, - symbol=symbol, backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - self._update_xy_label(x, y) - return canvas.add_layer(layer) - - def add_bar( - self, - x: str, - y: str, - *, - name: str | None = None, - color: NStr | None = None, - hatch: NStr | None = None, - extent: float = 0.8, - ) -> _lt.DFBars[_DF]: - """ - Add a categorical bar plot. - - >>> ### Use "time" column as x-axis and "value" column as y-axis - >>> canvas.cat(df).add_bar("time", "value") - - >>> ### Multiple bars colored by column "group" - >>> canvas.cat(df).add_bar("time", "value", color="group") - - >>> ### Multiple bars with hatches determined by column "group" - >>> canvas.cat(df).add_bar("time", "value", hatch="group") - - Parameters - ---------- - x : str - Column name for x-axis. - y : str - Column name for y-axis. - name : str, optional - Name of the layer. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - extent : float, optional - Width of the bars. Usually in range (0, 1]. - - Returns - ------- - WrappedBars - Bar collection layer. 
- """ - canvas = self._canvas() - layer = _lt.DFBars.from_table( - self._df, x, y, name=name, color=color, hatch=hatch, extent=extent, - backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - self._update_xy_label(x, y) - return canvas.add_layer(layer) - - def add_hist_line( - self, - x: str, - *, - bins: int | ArrayLike1D = 10, - range: tuple[float, float] | None = None, - density: bool = False, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - color: NStr | None = None, - width: str | None = None, - style: NStr | None = None, - ): - """ - Add lines representing histograms. - - >>> ### Use "value" column as x-axis - >>> canvas.cat(df).add_line_hist("value", bins=8, density=True) - - >>> ### Multiple histograms colored by column "group" - >>> canvas.cat(df).add_line_hist("value", color="group") - - Parameters - ---------- - x : str - Column name for x-axis. - bins : int or array-like, default 10 - If an integer, the number of bins. If an array, the bin edges. - range : (float, float), default None - If provided, the lower and upper range of the bins. - density : bool, default False - If True, the total area of the histogram will be normalized to 1. - name : str, optional - Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - width : str, optional - Column name for line width. Must be numerical. - style : str or sequence of str, optional - Column name(s) for styling the lines. Must be categorical. - - Returns - ------- - WrappedLines - Line collection layer. 
- """ - canvas = self._canvas() - layer = _lt.DFLines.build_hist( - self._df, x, bins=bins, range=range, density=density, name=name, - orient=orient, color=color, width=width, style=style, - backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - y = "density" if density else "count" - self._update_xy_label(x, y, orient) - return canvas.add_layer(layer) - - def add_kde( - self, - value: str, - *, - band_width: float | None = None, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - color: NStr | None = None, - width: str | None = None, - style: NStr | None = None, - ): - """ - Add lines representing kernel density estimation. - - >>> ### Use "value" column as x-axis - >>> canvas.cat(df).add_kde("value") - - >>> ### Multiple KDEs colored by column "group" - >>> canvas.cat(df).add_kde("value", color="group") - - Parameters - ---------- - value : str - Column name for x-axis. - band_width : float, default None - Bandwidth of the kernel density estimation. If None, use Scott's rule. - name : str, optional - Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - width : str, optional - Column name for line width. Must be numerical. - style : str or sequence of str, optional - Column name(s) for styling the lines. Must be categorical. - - Returns - ------- - WrappedLines - Line collection layer. 
- """ - canvas = self._canvas() - layer = _lt.DFLines.build_kde( - self._df, value, band_width=band_width, name=name, - orient=orient, color=color, width=width, style=style, - backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - self._update_xy_label(value, "density", orient) - return canvas.add_layer(layer) - - def add_hist2d( - self, - x: str, - y: str, - *, - cmap: ColormapType = "inferno", - name: str | None = None, - bins: int | tuple[int, int] = 10, - range: tuple[tuple[float, float], tuple[float, float]] | None = None, - density: bool = False, - ): - """Add 2-D histogram of given columns.""" - canvas = self._canvas() - layer = _lt.DFHeatmap.build_hist( - self._df, x, y, cmap=cmap, name=name, bins=bins, range=range, - density=density, backend=canvas._get_backend(), - ) # fmt: skip - if self._update_label: - self._update_xy_label(x, y) - return canvas.add_layer(layer) - - ### 1-D categorical ### - - def add_violinplot( - self, - offset: tuple[str, ...], - value: str, - *, - color: NStr | None = None, - hatch: NStr | None = None, - extent: float = 0.8, - shape: str = "both", - name: str | None = None, - orient: _Orientation = Orientation.VERTICAL, - ) -> _lt.DFViolinPlot[_DF]: - """ - Add a categorical violin plot. - - >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_violinplot("species", "weight") - - >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_violinplot(offset, "weight", color="region") - - Parameters - ---------- - offset : tuple of str - Column name(s) for x-axis. - value : str - Column name for y-axis. - color : str or sequence of str, optional - Column name(s) for coloring the lines. 
Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - extent : float, default 0.8 - Width of the violins. Usually in range (0, 1]. - shape : str, default "both" - Shape of the violins. Can be "both", "left", or "right". - name : str, optional - Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". - - Returns - ------- - WrappedViolinPlot - Violin plot layer. - """ - canvas = self._canvas() - orient = Orientation.parse(orient) - layer = _lt.DFViolinPlot.from_table( - self._df, offset, value, name=name, color=color, hatch=hatch, extent=extent, - shape=shape, orient=orient, backend=canvas._get_backend(), - ) # fmt: skip - self._post_add_boxlike(layer, color, orient, value) - return canvas.add_layer(layer) - - def add_boxplot( - self, - offset: tuple[str, ...], - value: str, - *, - color: NStr | None = None, - hatch: NStr | None = None, - name: str | None = None, - orient: _Orientation = Orientation.VERTICAL, - capsize: float = 0.1, - extent: float = 0.8, - ) -> _lt.DFBoxPlot[_DF]: - """ - Add a categorical box plot. - - >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_boxplot("species", "weight") - - >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_boxplot(offset, "weight", color="region") - - Parameters - ---------- - offset : tuple of str - Column name(s) for x-axis. - value : str - Column name for y-axis. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - name : str, optional - Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". 
- capsize : float, default 0.1 - Length of the caps as a fraction of the width of the box. - extent : float, default 0.8 - Width of the violins. Usually in range (0, 1]. - - Returns - ------- - WrappedBoxPlot - Box plot layer. - """ - canvas = self._canvas() - orient = Orientation.parse(orient) - layer = _lt.DFBoxPlot.from_table( - self._df, offset, value, name=name, color=color, hatch=hatch, orient=orient, - capsize=capsize, extent=extent, backend=canvas._get_backend(), - ) # fmt: skip - self._post_add_boxlike(layer, color, orient, value) - return canvas.add_layer(layer) - - def add_pointplot( - self, - offset: tuple[str, ...], - value: str, - *, - color: NStr | None = None, - hatch: NStr | None = None, - name: str | None = None, - orient: _Orientation = Orientation.VERTICAL, - capsize: float = 0.1, - ) -> _lt.DFPointPlot[_DF]: - """ - Add a categorical point plot (markers with error bars). - - >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_pointplot("species", "weight") - - >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_pointplot(offset, "weight", color="region") - - The default estimator and errors are mean and standard deviation. To change - them, use `est_by_*` and `err_by_*` methods. - - >>> ### Use standard error x 2 (~95%) as error bars. - >>> canvas.cat(df).add_pointplot("species", "weight").err_by_se(scale=2.0) - - Parameters - ---------- - offset : tuple of str - Column name(s) for x-axis. - value : str - Column name for y-axis. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - name : str, optional - Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". 
- capsize : float, default 0.1 - Length of the caps as a fraction of the width of the box. - - Returns - ------- - WrappedPointPlot - Point plot layer. - """ - canvas = self._canvas() - orient = Orientation.parse(orient) - layer = _lt.DFPointPlot.from_table( - self._df, offset, value, name=name, color=color, hatch=hatch, orient=orient, - capsize=capsize, backend=canvas._get_backend(), - ) # fmt: skip - self._post_add_boxlike(layer, color, orient, value) - return canvas.add_layer(layer) - - def add_barplot( - self, - offset: tuple[str, ...], - value: str, - *, - color: NStr | None = None, - hatch: NStr | None = None, - name: str | None = None, - orient: _Orientation = Orientation.VERTICAL, - capsize: float = 0.1, - extent: float = 0.8, - ) -> _lt.DFBarPlot[_DF]: - """ - Add a categorical bar plot (bars with error bars). - - >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_barplot("species", "weight") - - >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_barplot(offset, "weight", color="region") - - The default estimator and errors are mean and standard deviation. To change - them, use `est_by_*` and `err_by_*` methods. - - >>> ### Use standard error x 2 (~95%) as error bars. - >>> canvas.cat(df).add_barplot("species", "weight").err_by_se(scale=2.0) - - Parameters - ---------- - offset : tuple of str - Column name(s) for x-axis. - value : str - Column name for y-axis. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - name : str, optional - Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". - capsize : float, default 0.1 - Length of the caps as a fraction of the width of the box. 
- extent : float, default 0.8 - Width of the violins. Usually in range (0, 1]. - - Returns - ------- - WrappedBarPlot - Bar plot layer. - """ - canvas = self._canvas() - orient = Orientation.parse(orient) - layer = _lt.DFBarPlot.from_table( - self._df, offset, value, name=name, color=color, hatch=hatch, orient=orient, - capsize=capsize, extent=extent, backend=canvas._get_backend(), - ) # fmt: skip - self._post_add_boxlike(layer, color, orient, value) - return canvas.add_layer(layer) - - def _post_add_boxlike(self, layer: _BoxLikeMixin, color, orient, value: str): - canvas = self._canvas() - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - pos, labels, offset_labels = layer._generate_labels() - self._update_xy_ticks(pos, labels, orient=orient) - self._update_xy_label(offset_labels, value, orient=orient) - - def add_stripplot( - self, - offset: tuple[str, ...], - value: str, - *, - color: NStr | None = None, - hatch: NStr | None = None, - symbol: NStr | None = None, - size: str | None = None, - name: str | None = None, - orient: _Orientation = Orientation.VERTICAL, - extent: float = 0.5, - seed: int | None = 0, - ) -> _lt.DFMarkerGroups[_DF]: - """ - Add a categorical strip plot. - - >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_stripplot("species", "weight") - - >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_stripplot(offset, "weight", color="region") - - Parameters - ---------- - offset : tuple of str - Column name(s) for x-axis. - value : str - Column name for y-axis. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. 
- hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - symbol : str or sequence of str, optional - Column name(s) for symbols. Must be categorical. - size : str, optional - Column name for marker size. Must be numerical. - name : str, optional - Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". - extent : float, default 0.5 - Width of the violins. Usually in range (0, 1]. - seed : int, optional - Random seed for jittering. - - Returns - ------- - WrappedMarkerGroups - Marker collection layer. - """ - canvas = self._canvas() - orient = Orientation.parse(orient) - symbol = theme._default("markers.symbol", symbol) - size = theme._default("markers.size", size) - layer = _lt.DFMarkers.build_stripplot( - self._df, offset, value, name=name, color=color, hatch=hatch, symbol=symbol, - size=size, orient=orient, extent=extent, seed=seed, - backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - pos, labels = layer._generate_labels() - self._update_xy_ticks(pos, labels, orient=orient) - self._update_xy_label(offset, value, orient=orient) - return canvas.add_layer(layer) - - def add_swarmplot( - self, - offset: NStr, - value: str, - *, - color: NStr | None = None, - hatch: NStr | None = None, - symbol: NStr | None = None, - size: str | None = None, - name: str | None = None, - orient: _Orientation = Orientation.VERTICAL, - extent: float = 0.8, - sort: bool = False, - ) -> _lt.DFMarkerGroups[_DF]: - """ - Add a categorical swarm plot. - - >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_swarmplot("species", "weight") - - >>> ### Color by column "region" with dodging. 
- >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_swarmplot(offset, "weight", color="region") - - Parameters - ---------- - offset : tuple of str - Column name(s) for x-axis. - value : str - Column name for y-axis. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - symbol : str or sequence of str, optional - Column name(s) for symbols. Must be categorical. - size : str, optional - Column name for marker size. Must be numerical. - name : str, optional - Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". - extent : float, default 0.8 - Width of the violins. Usually in range (0, 1]. - sort : bool, default False - Whether to sort the data by value. - - Returns - ------- - WrappedMarkerGroups - Marker collection layer. - """ - canvas = self._canvas() - orient = Orientation.parse(orient) - symbol = theme._default("markers.symbol", symbol) - size = theme._default("markers.size", size) - layer = _lt.DFMarkers.build_swarmplot( - self._df, offset, value, name=name, color=color, hatch=hatch, symbol=symbol, - size=size, orient=orient, extent=extent, sort=sort, - backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - pos, labels = layer._generate_labels() - self._update_xy_ticks(pos, labels, orient=orient) - self._update_xy_label(offset, value, orient=orient) - return canvas.add_layer(layer) - - def add_countplot( - self, - offset: NStr, - *, - color: NStr | None = None, - hatch: NStr | None = None, - name: str | None = None, - orient: _Orientation = Orientation.VERTICAL, - extent: float = 0.8, - ) 
-> _lt.DFBars[_DF]: - """ - Add a categorical count plot. - - >>> ### Count for each category in column "species". - >>> canvas.cat(df).add_countplot("species") - - >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_countplot(offset, color="region") - - Parameters - ---------- - offset : tuple of str - Column name(s) for x-axis. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - name : str, optional - Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". - extent : float, default 0.8 - Width of the violins. Usually in range (0, 1]. - - Returns - ------- - WrappedBars - Bar collection layer. - """ - canvas = self._canvas() - orient = Orientation.parse(orient) - layer = _lt.DFBars.build_count( - self._df, offset, color=color, hatch=hatch, orient=orient, extent=extent, - name=name, backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - self._update_xy_label(offset, "count", orient=orient) - return canvas.add_layer(layer) - - ### 2-D categorical ### - - def add_heatmap( - self, - x: str, - y: str, - value: str, - *, - cmap: ColormapType = "inferno", - clim: tuple[float, float] | None = None, - name: str | None = None, - fill: float = 0, - ) -> _lt.DFHeatmap[_DF]: - canvas = self._canvas() - layer = _lt.DFHeatmap.build_heatmap( - self._df, x, y, value, cmap=cmap, clim=clim, name=name, fill=fill, - backend=canvas._get_backend(), - ) # fmt: skip - if self._update_label: - self._update_xy_label(x, y) - 
canvas.x.ticks.set_labels(*layer._generate_xticks()) - canvas.y.ticks.set_labels(*layer._generate_yticks()) - return canvas.add_layer(layer) - - def add_pointplot2d( - self, - x: str, - y: str, - *, - name: str | None = None, - color: NStr | None = None, - hatch: NStr | None = None, - size: float | None = None, - capsize: float = 0.15, - ): - canvas = self._canvas() - layer = _lt.DFPointPlot2D( - parse(self._df), x, y, name=name, color=color, hatch=hatch, size=size, - capsize=capsize, backend=canvas._get_backend(), - ) # fmt: skip - if self._update_label: - self._update_xy_label(x, y) - return canvas.add_layer(layer) - - ### Aggregation ### - - def mean(self, orient: _Orientation = "vertical") -> DataFrameAggPlotter[_C, _DF]: - """Return a mean-plotter.""" - return self._agg_plotter("mean", orient) - - def std(self, orient: _Orientation = "vertical") -> DataFrameAggPlotter[_C, _DF]: - """Return a std-plotter.""" - return self._agg_plotter("std", orient) - - def median(self, orient: _Orientation = "vertical") -> DataFrameAggPlotter[_C, _DF]: - """Return a median-plotter.""" - return self._agg_plotter("median", orient) - - def min(self, orient: _Orientation = "vertical") -> DataFrameAggPlotter[_C, _DF]: - """Return a min-plotter.""" - return self._agg_plotter("min", orient) - - def max(self, orient: _Orientation = "vertical") -> DataFrameAggPlotter[_C, _DF]: - """Return a max-plotter.""" - return self._agg_plotter("max", orient) - - def sum(self, orient: _Orientation = "vertical") -> DataFrameAggPlotter[_C, _DF]: - """Return a sum-plotter.""" - return self._agg_plotter("sum", orient) - - def _agg_plotter( - self, - method: str, - orient: str | Orientation, - ) -> DataFrameAggPlotter[_C, _DF]: - return DataFrameAggPlotter( - self._canvas(), - self._df, - self._update_label, - method=method, - orient=Orientation.parse(orient), - ) - - -class DataFrameAggPlotter(_Plotter[_C, _DF]): - def __init__( - self, - canvas: _C, - df: _DF, - update_label: bool, - method: str, - 
orient: Orientation, - ): - super().__init__(canvas, df, update_label) - self._agg_method = method - self._orient = orient - - def add_line( - self, - x: str, - y: str, - *, - name: str | None = None, - color: NStr | None = None, - width: str | None = None, - style: NStr | None = None, - ) -> _lt.DFLines[_DF]: - """ - Add line that connect the aggregated values. - - >>> canvas.cat(df).mean().add_line("time", "value") - - Parameters - ---------- - x : str - Column name for x-axis. - y : str - Column name for y-axis. - name : str, optional - Name of the layer. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - width : str, optional - Column name for line width. Must be numerical. - style : str or sequence of str, optional - Column name(s) for styling the lines. Must be categorical. - - Returns - ------- - WrappedLines - Line collection layer. - """ - canvas = self._canvas() - df = parse(self._df) - keys = list(df.iter_keys()) - _color = PlotArg.from_color(keys, color) - _style = PlotArg.from_style(keys, style) - df_agg = self._aggregate(df, self._concat_tuple(x, y, _color, _style), y) - layer = _lt.DFLines.from_table( - df_agg, x, y, name=name, color=color, width=width, style=style, - backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(_color.value, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - self._update_xy_label(x, y) - return canvas.add_layer(layer) - - def add_markers( - self, - x: str, - y: str, - *, - name: str | None = None, - color: NStr | ColorType | None = None, - hatch: NStr | Hatch | None = None, - size: np.str_ | float | None = None, - symbol: NStr | Symbol | None = None, - ) -> _lt.DFMarkers[_DF]: - """ - Add markers that represent the aggregated values. 
- - >>> canvas.cat(df).mean().add_markers("time", "value") - - Parameters - ---------- - x : str - Column name for x-axis. - y : str - Column name for y-axis. - name : str, optional - Name of the layer. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - size : str, optional - Column name for marker size. Must be numerical. - symbol : str or sequence of str, optional - Column name(s) for symbols. Must be categorical. - - Returns - ------- - WrappedMarkers - Marker collection layer. - """ - canvas = self._canvas() - df = parse(self._df) - keys = list(df.iter_keys()) - _color = PlotArg.from_color(keys, color) - _hatch = PlotArg.from_hatch(keys, hatch) - _symbol = PlotArg.from_symbol(keys, symbol) - df_agg = self._aggregate( - df, self._concat_tuple(x, y, _color, _hatch, _symbol), y - ) - layer = _lt.DFMarkers.from_table( - df_agg, x, y, name=name, color=color, hatch=hatch, size=size, - symbol=symbol, backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(_color.value, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - self._update_xy_label(x, y) - return canvas.add_layer(layer) - - def add_heatmap( - self, - x: str, - y: str, - value: str, - *, - cmap: ColormapType = "inferno", - clim: tuple[float, float] | None = None, - name: str | None = None, - fill: float = 0, - ) -> _lt.DFHeatmap[_DF]: - canvas = self._canvas() - df = parse(self._df) - df_agg = self._aggregate(df, (x, y), value) - layer = _lt.DFHeatmap.build_heatmap( - df_agg, x, y, value, cmap=cmap, clim=clim, name=name, fill=fill, - backend=canvas._get_backend(), - ) # fmt: skip - if self._update_label: - self._update_xy_label(x, y) - canvas.x.ticks.set_labels(*layer._generate_xticks()) - 
canvas.y.ticks.set_labels(*layer._generate_yticks()) - return canvas.add_layer(layer) - - def _aggregate( - self, - df: DataFrameWrapper, - by: tuple[str, ...], - on: str, - ) -> DataFrameWrapper[_DF]: - return df.agg_by(by, on, self._agg_method) - - def _concat_tuple(self, x, y, *args: PlotArg) -> tuple: - """ - Concatenate the arguments into a tuple. - - This method may return a tuple of str or other types such as Symbol, Color, etc. - """ - out = [] - if self._orient.is_vertical: - out.append(x) - else: - out.append(y) - for a in args: - if not a.is_column: - continue - elif isinstance(val := a.value, str): - out.append(val) - else: - out.extend(val) - return out diff --git a/whitecanvas/canvas/dataframe/_utils.py b/whitecanvas/canvas/dataframe/_utils.py deleted file mode 100644 index 6dd164d4..00000000 --- a/whitecanvas/canvas/dataframe/_utils.py +++ /dev/null @@ -1,132 +0,0 @@ -from __future__ import annotations - -from cmap import Color - -from whitecanvas.types import Hatch, LineStyle, Symbol - - -def _sequence_of_column_name(keys: list[str], value) -> bool: - if isinstance(value, str): - return False - if hasattr(value, "__iter__"): - for each in value: - if not isinstance(each, str): - return False - if each not in keys: - return False - return True - return False - - -class PlotArg: - def __init__(self, value, is_column: bool): - self._value = value - self._is_column = is_column - - @property - def value(self): - """The value of the argument.""" - return self._value - - @property - def is_column(self) -> bool: - """True if the value is a column name.""" - return self._is_column - - @classmethod - def from_color(cls, keys: list[str], color) -> PlotArg: - if color is None: - return PlotArg(None, False) - if isinstance(color, str): - if color in keys: - return PlotArg([color], True) - else: - return PlotArg.from_color(keys, Color(color)) - elif _sequence_of_column_name(keys, color): - return PlotArg(list(color), True) - else: - try: - col = Color(color) - 
except Exception: - raise ValueError( - f"'color' must be one of the column names {keys!r}, color-like " - "or sequence of them." - ) - return PlotArg(col, False) - - @classmethod - def from_symbol(cls, keys: list[str], symbol) -> PlotArg: - if symbol is None: - return PlotArg(None, False) - if isinstance(symbol, str): - if symbol in keys: - return PlotArg([symbol], True) - else: - return PlotArg.from_symbol(keys, Symbol(symbol)) - elif _sequence_of_column_name(keys, symbol): - return PlotArg(list(symbol), True) - else: - try: - sym = Symbol(symbol) - except Exception: - raise ValueError( - f"'symbol' must be one of the column names {keys!r}, symbol-like " - "or sequence of them." - ) - return PlotArg(sym, False) - - @classmethod - def from_hatch(cls, keys: list[str], hatch) -> PlotArg: - if hatch is None: - return PlotArg(None, False) - if isinstance(hatch, str): - if hatch in keys: - return PlotArg([hatch], True) - else: - return PlotArg.from_hatch(keys, Hatch(hatch)) - elif _sequence_of_column_name(keys, hatch): - return PlotArg(list(hatch), True) - else: - try: - htch = Hatch(hatch) - except Exception: - raise ValueError( - f"'hatch' must be one of the column names {keys!r}, hatch-like " - "or sequence of them." - ) from None - return PlotArg(htch, False) - - @classmethod - def from_style(cls, keys: list[str], style) -> PlotArg: - if style is None: - return PlotArg(None, False) - if isinstance(style, str): - if style in keys: - return PlotArg([style], True) - else: - return PlotArg.from_style(keys, LineStyle(style)) - elif _sequence_of_column_name(keys, style): - return PlotArg(list(style), True) - else: - try: - stl = LineStyle(style) - except Exception: - raise ValueError( - f"'style' must be one of the column names {keys!r}, style-like " - "or sequence of them." 
- ) from None - return PlotArg(stl, False) - - @classmethod - def from_scalar(cls, keys: list[str], value) -> PlotArg: - if value is None: - return PlotArg(None, False) - if isinstance(value, str): - if value in keys: - return PlotArg([value], True) - else: - raise ValueError(f"Not a valid column name: {value!r}") - elif _sequence_of_column_name(keys, value): - return PlotArg(list(value), True) - else: - return PlotArg(float(value), False) diff --git a/whitecanvas/layers/group/band_collection.py b/whitecanvas/layers/group/band_collection.py index df01865b..d5f00493 100644 --- a/whitecanvas/layers/group/band_collection.py +++ b/whitecanvas/layers/group/band_collection.py @@ -114,45 +114,10 @@ def from_arrays( kde_band_width: float | str = "scott", backend: str | Backend | None = None, ): - from whitecanvas.utils.kde import gaussian_kde - ori = Orientation.parse(orient) - if extent <= 0: - raise ValueError(f"extent must be positive, got {extent}") - x, data = check_array_input(x, data) - xyy_values: list[XYYData] = [] - for offset, values in zip(x, data): - arr = as_array_1d(values) - kde = gaussian_kde(arr, bw_method=kde_band_width) - - sigma = np.sqrt(kde.covariance[0, 0]) - pad = sigma * 2.5 - x_ = np.linspace(arr.min() - pad, arr.max() + pad, 100) - y = kde(x_) - if shape in ("both", "left"): - y0 = -y + offset - else: - y0 = np.zeros_like(y) + offset - if shape in ("both", "right"): - y1 = y + offset - else: - y1 = np.zeros_like(y) + offset - - data = XYYData(x_, y0, y1) - xyy_values.append(data) - - half_widths: list[float] = [] - for xyy in xyy_values: - half_width = np.max(np.abs(xyy.ydiff)) - if shape == "both": - half_width /= 2 - half_widths.append(half_width) - factor = extent / np.max(half_widths) / 2 - new_vals: list[XYYData] = [] - for xyy, xoffset in zip(xyy_values, x): - y0 = (xyy.y0 - xoffset) * factor + xoffset - y1 = (xyy.y1 - xoffset) * factor + xoffset - new_vals.append(XYYData(xyy.x, y0, y1)) + new_vals = cls._convert_data( + x, data, 
shape=shape, extent=extent, kde_band_width=kde_band_width + ) return cls( new_vals, name=name, @@ -256,3 +221,51 @@ def set_datasets( y0 = (xyy.y0 - xoffset) * factor + xoffset y1 = (xyy.y1 - xoffset) * factor + xoffset band.data = XYYData(xyy.x, y0, y1) + + @staticmethod + def _convert_data( + x: list[float], + data: list[ArrayLike], + shape: Literal["both", "left", "right"] = "both", + extent: float = 0.5, + kde_band_width: float | str = "scott", + ): + from whitecanvas.utils.kde import gaussian_kde + + if extent <= 0: + raise ValueError(f"extent must be positive, got {extent}") + x, data = check_array_input(x, data) + xyy_values: list[XYYData] = [] + for offset, values in zip(x, data): + arr = as_array_1d(values) + kde = gaussian_kde(arr, bw_method=kde_band_width) + + sigma = np.sqrt(kde.covariance[0, 0]) + pad = sigma * 2.5 + x_ = np.linspace(arr.min() - pad, arr.max() + pad, 100) + y = kde(x_) + if shape in ("both", "left"): + y0 = -y + offset + else: + y0 = np.zeros_like(y) + offset + if shape in ("both", "right"): + y1 = y + offset + else: + y1 = np.zeros_like(y) + offset + + data = XYYData(x_, y0, y1) + xyy_values.append(data) + + half_widths: list[float] = [] + for xyy in xyy_values: + half_width = np.max(np.abs(xyy.ydiff)) + if shape == "both": + half_width /= 2 + half_widths.append(half_width) + factor = extent / np.max(half_widths) / 2 + new_vals: list[XYYData] = [] + for xyy, xoffset in zip(xyy_values, x): + y0 = (xyy.y0 - xoffset) * factor + xoffset + y1 = (xyy.y1 - xoffset) * factor + xoffset + new_vals.append(XYYData(xyy.x, y0, y1)) + return new_vals diff --git a/whitecanvas/layers/tabular/__init__.py b/whitecanvas/layers/tabular/__init__.py index ecf6cd77..3265dd7d 100644 --- a/whitecanvas/layers/tabular/__init__.py +++ b/whitecanvas/layers/tabular/__init__.py @@ -12,6 +12,7 @@ DFMarkers, DFPointPlot2D, ) +from whitecanvas.layers.tabular._df_compat import parse __all__ = [ "DFBarPlot", @@ -24,4 +25,5 @@ "DFBoxPlot", "DFHeatmap", "DFPointPlot2D", + 
"parse", ] diff --git a/whitecanvas/layers/tabular/_box_like.py b/whitecanvas/layers/tabular/_box_like.py index 8f14bffc..330466db 100644 --- a/whitecanvas/layers/tabular/_box_like.py +++ b/whitecanvas/layers/tabular/_box_like.py @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, TypeVar +from typing import TYPE_CHECKING, Callable, Generic, TypeVar import numpy as np from cmap import Color @@ -13,7 +13,7 @@ from whitecanvas.layers import group as _lg from whitecanvas.layers.tabular import _plans as _p from whitecanvas.layers.tabular import _shared -from whitecanvas.layers.tabular._df_compat import DataFrameWrapper, parse +from whitecanvas.layers.tabular._df_compat import DataFrameWrapper from whitecanvas.types import ( ColorType, Hatch, @@ -24,79 +24,98 @@ if TYPE_CHECKING: from typing_extensions import Self + from whitecanvas.canvas.dataframe._base import CatIterator + _FE = _mixin.AbstractFaceEdgeMixin[_mixin.FaceNamespace, _mixin.EdgeNamespace] _DF = TypeVar("_DF") +def _splitby_dodge( + source: DataFrameWrapper[_DF], + offset: str | tuple[str, ...], + color: str | tuple[str, ...] | None = None, + hatch: str | tuple[str, ...] | None = None, + dodge: str | tuple[str, ...] 
| bool | None = None, +) -> tuple[tuple[str, ...], tuple[str, ...]]: + if isinstance(offset, str): + offset = (offset,) + if isinstance(dodge, bool) and dodge: + dodge = _shared.join_columns(color, hatch, source=source) + elif isinstance(dodge, str): + dodge = (dodge,) + splitby = _shared.join_columns(offset, color, hatch, dodge, source=source) + return splitby, dodge + + +def _norm_color_hatch( + color, + hatch, + cat: CatIterator[_DF], +) -> tuple[_p.ColorPlan, _p.HatchPlan]: + color_cov = _shared.ColumnOrValue(color, cat.df) + if color_cov.is_column: + color_by = _p.ColorPlan.from_palette(color_cov.columns) + elif color_cov.value is not None: + color_by = _p.ColorPlan.from_const(Color(color_cov.value)) + else: + color_by = _p.ColorPlan.default() + hatch_cov = _shared.ColumnOrValue(hatch, cat.df) + if hatch_cov.is_column: + hatch_by = _p.HatchPlan.new(hatch_cov.columns) + elif hatch_cov.value is not None: + hatch_by = _p.HatchPlan.from_const(Hatch(hatch_cov.value)) + else: + hatch_by = _p.HatchPlan.default() + return color_by, hatch_by + + class _BoxLikeMixin: _source: DataFrameWrapper[_DF] def __init__( self, - source: DataFrameWrapper[_DF], - offset: str | tuple[str, ...], - value: str, - color: str | tuple[str, ...] | None = None, - hatch: str | tuple[str, ...] 
| None = None, + categories: list[tuple], + splitby: tuple[str, ...], + color_by: _p.ColorPlan, + hatch_by: _p.HatchPlan, ): - if isinstance(offset, str): - offset = (offset,) - splitby = _shared.join_columns(offset, color, hatch, source=source) - self._y = value self._splitby = splitby - self._color_by = _p.ColorPlan.default() - self._hatch_by = _p.HatchPlan.default() - self._offset_by = _p.OffsetPlan.default().more_by(*offset) - self._source = source - - @property - def color(self) -> _p.ColorPlan | _p.ColormapPlan: - """Return the object describing how the plot is colored.""" - return self._color_by - - @property - def hatch(self) -> _p.HatchPlan: - """Return the object describing how the plot is hatched.""" - return self._hatch_by + self._categories = categories + self._color_by = color_by + self._hatch_by = hatch_by + self._get_base().face.color = color_by.generate(self._categories, self._splitby) + self._get_base().face.hatch = hatch_by.generate(self._categories, self._splitby) def _get_base(self) -> _FE: """Just for typing.""" return self._base_layer - def with_color(self, by: str | Iterable[str], palette=None) -> Self: - cov = _shared.ColumnOrValue(by, self._source) - if cov.is_column: - if set(cov.columns) > set(self._splitby): - raise ValueError(f"Cannot color by a column other than {self._splitby}") - other_by = _shared.unique_tuple(self._offset_by.by, self._hatch_by.by) - by_all = _shared.unique_tuple(cov.columns, other_by) - color_by = _p.ColorPlan.from_palette(cov.columns, palette=palette) - self._splitby = by_all - _, self._labels = self._generate_datasets() - else: - color_by = _p.ColorPlan.from_const(Color(cov.value)) - self._get_base().face.color = color_by.generate(self._labels, self._splitby) + def with_color_palette(self, palette) -> Self: + if self._color_by.is_const(): + raise ValueError("Cannot redraw color for a constant color") + color_by = _p.ColorPlan.from_palette(self._color_by.by, palette=palette) + self._get_base().face.color = 
color_by.generate(self._categories, self._splitby) self._color_by = color_by return self - def with_hatch( - self, - by: str | Iterable[str], - choices=None, - ) -> Self: - cov = _shared.ColumnOrValue(by, self._source) - if cov.is_column: - if set(cov.columns) > set(self._splitby): - raise ValueError(f"Cannot color by a column other than {self._splitby}") - other_by = _shared.unique_tuple(self._offset_by.by, self._color_by.by) - by_all = _shared.unique_tuple(other_by, cov.columns) - hatch_by = _p.HatchPlan.new(cov.columns, values=choices) - self._splitby = by_all - _, self._labels = self._generate_datasets() - else: - hatch_by = _p.HatchPlan.from_const(Hatch(cov.value)) - self._get_base().face.hatch = hatch_by.generate(self._labels, self._splitby) + def with_color(self, color: ColorType) -> Self: + color_by = _p.ColorPlan.from_const(Color(color)) + self._get_base().face.color = color_by.generate(self._categories, self._splitby) + self._color_by = color_by + return self + + def with_hatch_palette(self, choices) -> Self: + if self._hatch_by.is_const(): + raise ValueError("Cannot redraw hatch for a constant hatch") + hatch_by = _p.HatchPlan.new(self._hatch_by.by, values=choices) + self._get_base().face.hatch = hatch_by.generate(self._categories, self._splitby) + self._hatch_by = hatch_by + return self + + def with_hatch(self, hatch: str | Hatch) -> Self: + hatch_by = _p.HatchPlan.from_const(Hatch(hatch)) + self._get_base().face.hatch = hatch_by.generate(self._categories, self._splitby) self._hatch_by = hatch_by return self @@ -111,47 +130,6 @@ def with_edge( self._get_base().with_edge(color=color, width=width, style=style, alpha=alpha) return self - def _generate_datasets(self) -> tuple[list[np.ndarray], list[tuple[Any, ...]]]: - datasets = [] - unique_sl: list[tuple[Any, ...]] = [] - for sl, df in self._source.group_by(self._splitby): - unique_sl.append(sl) - datasets.append(df[self._y]) - return datasets, unique_sl - - def _generate_labels(self): - """Generate the 
tick positions, labels and the axis label.""" - _agged_by = _shared.unique_tuple(self._color_by.by, self._hatch_by.by) - _nagged = 0 - for each in reversed(self._offset_by.by): - if each in _agged_by: - _nagged += 1 - else: - break - - # If all the offset columns are redundantly categorized by color or hatch, - # then all the labels should be shown. - if _nagged == len(self._offset_by.by): - _nagged = 0 - - # group positions by aggregated labels - label_to_pos: dict[str, list[float]] = {} - for p, lbl in self._offset_by.iter_ticks(self._labels, self._splitby): - label_agged = "\n".join(lbl[: len(lbl) - _nagged]) - if label_agged in label_to_pos: - label_to_pos[label_agged].append(p) - else: - label_to_pos[label_agged] = [p] - # compute the mean position for each aggregated label - pos: list[float] = [] - labels: list[str] = [] - for label, pos_list in label_to_pos.items(): - pos.append(np.mean(pos_list)) - labels.append(label) - - offset_labels = self._offset_by.by[: len(self._offset_by.by) - _nagged] - return pos, labels, offset_labels - class DFViolinPlot( _shared.DataFrameLayerWrapper[_lg.ViolinPlot, _DF], @@ -160,53 +138,27 @@ class DFViolinPlot( ): def __init__( self, - source: DataFrameWrapper[_DF], - offset: str | tuple[str, ...], + cat: CatIterator[_DF], value: str, - *, color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] | None = None, + dodge: str | tuple[str, ...] 
| bool | None = None, name: str | None = None, orient: Orientation = Orientation.VERTICAL, extent: float = 0.8, shape: str = "both", backend: str | Backend | None = None, ): - if isinstance(offset, str): - offset = (offset,) - _BoxLikeMixin.__init__(self, source, offset, value, color, hatch) - arrays, self._labels = self._generate_datasets() - x = self._offset_by.generate(self._labels, self._splitby) + _splitby, dodge = _splitby_dodge(cat.df, cat.offsets, color, hatch, dodge) + x, arr, categories = cat.prep_arrays(_splitby, value, dodge=dodge) + _extent = cat.zoom_factor(dodge=dodge) * extent + color_by, hatch_by = _norm_color_hatch(color, hatch, cat) base = _lg.ViolinPlot.from_arrays( - x, arrays, name=name, orient=orient, shape=shape, extent=extent, + x, arr, name=name, orient=orient, shape=shape, extent=_extent, backend=backend, ) # fmt: skip - super().__init__(base, source) - if color is not None: - self.with_color(color) - if hatch is not None: - self.with_hatch(hatch) - - @classmethod - def from_table( - cls, - df: _DF, - offset: tuple[str, ...], - value: str, - color: str | None = None, - hatch: str | None = None, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - extent: float = 0.8, - shape: str = "both", - backend: str | Backend | None = None, - ) -> DFViolinPlot[_DF]: - src = parse(df) - self = DFViolinPlot( - src, offset, value, orient=orient, name=name, extent=extent, - color=color, hatch=hatch, shape=shape, backend=backend - ) # fmt: skip - return self + super().__init__(base, cat.df) + _BoxLikeMixin.__init__(self, categories, _splitby, color_by, hatch_by) @property def orient(self) -> Orientation: @@ -230,57 +182,28 @@ class DFBoxPlot( ): def __init__( self, - source: DataFrameWrapper[_DF], - offset: str | tuple[str, ...], + cat: CatIterator[_DF], value: str, - *, color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] | None = None, + dodge: str | tuple[str, ...] 
| bool | None = None, name: str | None = None, orient: Orientation = Orientation.VERTICAL, - capsize: float = 0.1, extent: float = 0.8, + capsize: float = 0.1, backend: str | Backend | None = None, ): - _BoxLikeMixin.__init__(self, source, offset, value, color, hatch) - arrays, self._labels = self._generate_datasets() - x = self._offset_by.generate(self._labels, self._splitby) + _splitby, dodge = _splitby_dodge(cat.df, cat.offsets, color, hatch, dodge) + x, arr, categories = cat.prep_arrays(_splitby, value, dodge=dodge) + _extent = cat.zoom_factor(dodge=dodge) * extent + _capsize = cat.zoom_factor(dodge=dodge) * capsize + color_by, hatch_by = _norm_color_hatch(color, hatch, cat) base = _lg.BoxPlot.from_arrays( - x, - arrays, - name=name, - orient=orient, - capsize=capsize, - extent=extent, + x, arr, name=name, orient=orient, capsize=_capsize, extent=_extent, backend=backend, - ) - super().__init__(base, source) - base.with_edge(color=theme.get_theme().foreground_color) - if color is not None: - self.with_color(color) - if hatch is not None: - self.with_hatch(hatch) - - @classmethod - def from_table( - cls, - df: _DF, - offset: tuple[str, ...], - value: str, - color: str | None = None, - hatch: str | None = None, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - capsize: float = 0.1, - extent: float = 0.8, - backend: str | Backend | None = None, - ) -> DFBoxPlot[_DF]: - src = parse(df) - self = DFBoxPlot( - src, offset, value, orient=orient, name=name, color=color, hatch=hatch, - capsize=capsize, extent=extent, backend=backend ) # fmt: skip - return self + super().__init__(base, cat.df) + _BoxLikeMixin.__init__(self, categories, _splitby, color_by, hatch_by) @property def orient(self) -> Orientation: @@ -350,7 +273,7 @@ def err_func(x): return self._update_error(err_func) def _update_estimate(self, est_func: Callable[[np.ndarray], float]) -> Self: - arrays, _ = self._generate_datasets() + arrays = self._get_arrays() est = 
[est_func(arr) for arr in arrays] self._set_estimation_values(est) return self @@ -359,7 +282,7 @@ def _update_error( self, err_func: Callable[[np.ndarray], tuple[float, float]], ) -> Self: - arrays, _ = self._generate_datasets() + arrays = self._get_arrays() err_low = [] err_high = [] for arr in arrays: @@ -375,50 +298,28 @@ class DFPointPlot( ): def __init__( self, - source: DataFrameWrapper[_DF], - offset: str | tuple[str, ...], + cat: CatIterator[_DF], value: str, - *, color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] | None = None, + dodge: str | tuple[str, ...] | bool | None = None, name: str | None = None, orient: Orientation = Orientation.VERTICAL, capsize: float = 0.1, backend: str | Backend | None = None, ): - _BoxLikeMixin.__init__(self, source, offset, value, color, hatch) - arrays, self._labels = self._generate_datasets() - x = self._offset_by.generate(self._labels, self._splitby) + _splitby, dodge = _splitby_dodge(cat.df, cat.offsets, color, hatch, dodge) + x, arr, categories = cat.prep_arrays(_splitby, value, dodge=dodge) + _capsize = cat.zoom_factor(dodge=dodge) * capsize + color_by, hatch_by = _norm_color_hatch(color, hatch, cat) base = _lg.LabeledPlot.from_arrays( - x, arrays, name=name, orient=orient, capsize=capsize, backend=backend, + x, arr, name=name, orient=orient, capsize=_capsize, backend=backend, ) # fmt: skip - super().__init__(base, source) + self._arrays = arr + super().__init__(base, cat.df) + _BoxLikeMixin.__init__(self, categories, _splitby, color_by, hatch_by) base.with_edge(color=theme.get_theme().foreground_color) self._orient = orient - if color is not None: - self.with_color(color) - if hatch is not None: - self.with_hatch(hatch) - - @classmethod - def from_table( - cls, - df: _DF, - offset: tuple[str, ...], - value: str, - color: str | None = None, - hatch: str | None = None, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - capsize: float = 0.1, - backend: str | Backend | 
None = None, - ) -> DFPointPlot[_DF]: - src = parse(df) - self = DFPointPlot( - src, offset, value, orient=orient, name=name, color=color, hatch=hatch, - capsize=capsize, backend=backend - ) # fmt: skip - return self @property def orient(self) -> Orientation: @@ -437,6 +338,9 @@ def with_shift( base.set_data(data.x, data.y + shift) return self + def _get_arrays(self) -> list[np.ndarray]: + return self._arrays + def _set_estimation_values(self, est): if self.orient.is_vertical: self._base_layer.set_data(ydata=est) @@ -456,58 +360,39 @@ class DFBarPlot( ): def __init__( self, - source: DataFrameWrapper[_DF], - offset: str | tuple[str, ...], + cat: CatIterator[_DF], value: str, - *, color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] | None = None, + dodge: str | tuple[str, ...] | bool | None = None, name: str | None = None, orient: Orientation = Orientation.VERTICAL, capsize: float = 0.1, extent: float = 0.8, backend: str | Backend | None = None, ): - _BoxLikeMixin.__init__(self, source, offset, value, color, hatch) - arrays, self._labels = self._generate_datasets() - x = self._offset_by.generate(self._labels, self._splitby) + _splitby, dodge = _splitby_dodge(cat.df, cat.offsets, color, hatch, dodge) + x, arr, categories = cat.prep_arrays(_splitby, value, dodge=dodge) + _extent = cat.zoom_factor(dodge=dodge) * extent + _capsize = cat.zoom_factor(dodge=dodge) * capsize + color_by, hatch_by = _norm_color_hatch(color, hatch, cat) base = _lg.LabeledBars.from_arrays( - x, arrays, name=name, orient=orient, capsize=capsize, extent=extent, + x, arr, name=name, orient=orient, capsize=_capsize, extent=_extent, backend=backend, ) # fmt: skip - super().__init__(base, source) + self._arrays = arr + super().__init__(base, cat.df) + _BoxLikeMixin.__init__(self, categories, _splitby, color_by, hatch_by) base.with_edge(color=theme.get_theme().foreground_color) self._orient = orient - if color is not None: - self.with_color(color) - if hatch is not None: - 
self.with_hatch(hatch) - - @classmethod - def from_table( - cls, - df: _DF, - offset: tuple[str, ...], - value: str, - color: str | None = None, - hatch: str | None = None, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - capsize: float = 0.1, - extent: float = 0.8, - backend: str | Backend | None = None, - ) -> DFBarPlot[_DF]: - src = parse(df) - self = DFBarPlot( - src, offset, value, orient=orient, name=name, color=color, hatch=hatch, - capsize=capsize, extent=extent, backend=backend, - ) # fmt: skip - return self @property def orient(self) -> Orientation: return self._base_layer.bars.orient + def _get_arrays(self) -> list[np.ndarray]: + return self._arrays + def _set_estimation_values(self, est): if self.orient.is_vertical: self._base_layer.set_data(ydata=est) diff --git a/whitecanvas/layers/tabular/_dataframe.py b/whitecanvas/layers/tabular/_dataframe.py index 07c557ce..19ff7619 100644 --- a/whitecanvas/layers/tabular/_dataframe.py +++ b/whitecanvas/layers/tabular/_dataframe.py @@ -64,24 +64,31 @@ def __init__( @classmethod def from_table( cls, - df: _DF, - x: str, - y: str, + df: DataFrameWrapper[_DF], + x: str | _jitter.JitterBase, + y: str | _jitter.JitterBase, color: str | None = None, width: str | None = None, style: str | None = None, name: str | None = None, backend: str | Backend | None = None, ) -> DFLines[_DF]: - src = parse(df) - splitby = _shared.join_columns(color, style, source=src) + splitby = _shared.join_columns(color, style, source=df) segs = [] labels: list[tuple[Any, ...]] = [] - for sl, df in src.group_by(splitby): + if isinstance(x, _jitter.JitterBase): + xj = x + else: + xj = _jitter.IdentityJitter(x) + if isinstance(y, _jitter.JitterBase): + yj = y + else: + yj = _jitter.IdentityJitter(y) + for sl, sub in df.group_by(splitby): labels.append(sl) - segs.append(np.column_stack([df[x], df[y]])) + segs.append(np.column_stack([xj.map(sub), yj.map(sub)])) return DFLines( - src, segs, labels, name=name, 
color=color, width=width, style=style, + df, segs, labels, name=name, color=color, width=width, style=style, backend=backend, ) # fmt: skip @@ -281,109 +288,6 @@ def __init__( if size is not None: self.with_size(size) - def _generate_labels(self): - pos, labels = self._x.generate_labels(self._source) - return pos, ["\n".join(str(_l) for _l in lbl) for lbl in labels] - - @property - def symbol(self) -> _p.SymbolPlan: - return self._symbol_by - - @property - def size(self) -> _p.SizePlan: - return self._size_by - - @property - def color(self) -> _p.ColorPlan: - return self._color_by - - @property - def hatch(self) -> _p.HatchPlan: - return self._hatch_by - - @property - def width(self) -> _p.WidthPlan: - return self._width_by - - @classmethod - def from_table( - cls, - df: _DF, - x: str, - y: str, - *, - color: str | tuple[str, ...] | None = None, - hatch: str | tuple[str, ...] | None = None, - symbol: str | tuple[str, ...] | None = None, - size: str | None = None, - name: str | None = None, - backend: str | Backend | None = None, - ) -> DFMarkers[_DF]: - src = parse(df) - xj = _jitter.identity_or_categorical(src, x) - yj = _jitter.identity_or_categorical(src, y) - return DFMarkers( - src, xj, yj, name=name, color=color, hatch=hatch, symbol=symbol, - size=size, backend=backend, - ) # fmt: skip - - @classmethod - def build_stripplot( - cls, - df: _DF, - label: str, - value: str, - *, - color: str | tuple[str, ...] | None = None, - hatch: str | tuple[str, ...] | None = None, - symbol: str | tuple[str, ...] 
| None = None, - size: str | None = None, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - extent: float = 0.8, - seed: int | None = 0, - backend: str | Backend | None = None, - ) -> DFMarkerGroups[_DF]: - src = parse(df) - xj = _jitter.UniformJitter(label, extent=extent, seed=seed) - yj = _jitter.identity_or_categorical(src, value) - if not Orientation.parse(orient).is_vertical: - xj, yj = yj, xj - return DFMarkerGroups( - src, xj, yj, name=name, color=color, hatch=hatch, orient=orient, - symbol=symbol, size=size, backend=backend, - ) # fmt: skip - - @classmethod - def build_swarmplot( - cls, - df: _DF, - label: str, - value: str, - *, - color: str | tuple[str, ...] | None = None, - hatch: str | tuple[str, ...] | None = None, - symbol: str | tuple[str, ...] | None = None, - size: str | None = None, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - extent: float = 0.8, - sort: bool = False, - backend: str | Backend | None = None, - ) -> DFMarkerGroups[_DF]: - src = parse(df) - if sort: - src = src.sort(value) - lims = src[value].min(), src[value].max() - xj = _jitter.SwarmJitter(label, value, limits=lims, extent=extent) - yj = _jitter.identity_or_categorical(src, value) - if not Orientation.parse(orient).is_vertical: - xj, yj = yj, xj - return DFMarkerGroups( - src, xj, yj, name=name, color=color, hatch=hatch, orient=orient, - symbol=symbol, size=size, backend=backend, - ) # fmt: skip - @overload def with_color(self, value: ColorType) -> Self: ... 
diff --git a/whitecanvas/layers/tabular/_df_compat.py b/whitecanvas/layers/tabular/_df_compat.py index 5e8689d3..b9c92c70 100644 --- a/whitecanvas/layers/tabular/_df_compat.py +++ b/whitecanvas/layers/tabular/_df_compat.py @@ -24,7 +24,7 @@ def __repr__(self) -> str: return f"{type(self).__name__} of {self._data!r}" def __len__(self) -> int: - such_as = next(iter(self.iter_values()), None) + such_as = next(self.iter_values(), None) if such_as is None: return 0 else: @@ -32,11 +32,11 @@ def __len__(self) -> int: @property def shape(self) -> tuple[int, int]: - such_as = next(iter(self.iter_values()), None) + such_as = next(self.iter_values(), None) if such_as is None: return 0, 0 else: - return such_as.size, len(self.iter_keys()) + return such_as.size, len(self.columns) def get_native(self) -> _T: return self._data @@ -45,6 +45,9 @@ def get_native(self) -> _T: def __getitem__(self, item: str) -> NDArray[np.generic]: ... + def __contains__(self, item: str) -> bool: + return item in self.iter_keys() + @abstractmethod def iter_keys(self) -> Iterator[str]: ... 
diff --git a/whitecanvas/layers/tabular/_jitter.py b/whitecanvas/layers/tabular/_jitter.py index 07ac5813..23dcae4e 100644 --- a/whitecanvas/layers/tabular/_jitter.py +++ b/whitecanvas/layers/tabular/_jitter.py @@ -1,6 +1,5 @@ from __future__ import annotations -import itertools from abc import ABC, abstractmethod from typing import TypeVar @@ -8,24 +7,15 @@ from numpy.typing import NDArray from whitecanvas.layers.tabular._df_compat import DataFrameWrapper -from whitecanvas.layers.tabular._plans import OffsetPlan -from whitecanvas.layers.tabular._utils import unique _DF = TypeVar("_DF") class JitterBase(ABC): @abstractmethod - def map(self, src: DataFrameWrapper[_DF]) -> np.ndarray: + def map(self, src: DataFrameWrapper[_DF]) -> NDArray[np.floating]: """Map the source data to jittered data.""" - @abstractmethod - def generate_labels( - self, - src: DataFrameWrapper[_DF], - ) -> tuple[NDArray[np.floating], list[tuple[str, ...]]]: - """Generate labels for the jittered data.""" - class IdentityJitter(JitterBase): """No jittering.""" @@ -35,62 +25,37 @@ def __init__(self, by: str): raise TypeError(f"Only str is allowed, got {type(by)}") self._by = by - def map(self, src: DataFrameWrapper[_DF]) -> np.ndarray: + def map(self, src: DataFrameWrapper[_DF]) -> NDArray[np.floating]: return src[self._by] - def generate_labels( - self, - src: DataFrameWrapper[_DF], - ) -> tuple[NDArray[np.floating], list[tuple[str, ...]]]: - """Generate labels for the jittered data.""" - return _map_x_and_label([src[b] for b in self._by]) + def check(self, src: DataFrameWrapper[_DF]) -> IdentityJitter: + if self._by not in src: + raise ValueError(f"Column {self._by} not found in the data frame.") + if src[self._by].dtype.kind not in "iufb": + raise ValueError(f"Column {self._by} is not numeric.") + return self class CategoricalLikeJitter(JitterBase): - def __init__(self, by: str | tuple[str, ...]): + def __init__(self, by: str | tuple[str, ...], mapping: dict[tuple, float]): self._by = 
_tuple(by) + self._mapping = mapping - def generate_labels( - self, - src: DataFrameWrapper[_DF], - ) -> tuple[NDArray[np.floating], list[tuple[str, ...]]]: - """Generate labels for the jittered data.""" - return _map_x_and_label([src[b] for b in self._by]) + def _map(self, src: DataFrameWrapper[_DF]) -> NDArray[np.floating]: + # only map the categorical data to real numbers + args = [src[b] for b in self._by] + out = np.zeros(len(src), dtype=np.float32) + for row, pos in self._mapping.items(): + sl = np.all(np.column_stack([a == r for a, r in zip(args, row)]), axis=1) + out[sl] = pos + return out class CategoricalJitter(CategoricalLikeJitter): """Jitter for categorical data.""" - def map(self, src: DataFrameWrapper[_DF]) -> np.ndarray: - # only map the categorical data to real numbers - return _map_x([src[b] for b in self._by]) - - -def identity_or_categorical( - df: DataFrameWrapper[_DF], - by: str | tuple[str, ...], -) -> JitterBase: - """ - Return either IdentityJitter or CategoricalJitter depending on the data type. - - Parameters - ---------- - df : DataFrameWrapper - The source data. - by : str | tuple[str, ...] - Column(s) to be used for the x-axis. 
- """ - if isinstance(by, str): - series = df[by] - if series.dtype.kind in "iuf": - return IdentityJitter(by) - else: - return CategoricalJitter((by,)) - else: - if len(by) == 1: - return identity_or_categorical(df, by[0]) - else: - return CategoricalJitter(by) + def map(self, src: DataFrameWrapper[_DF]) -> NDArray[np.floating]: + return self._map(src) class UniformJitter(CategoricalLikeJitter): @@ -99,17 +64,18 @@ class UniformJitter(CategoricalLikeJitter): def __init__( self, by: str | tuple[str, ...], + mapping: dict[tuple, float], extent: float = 0.8, seed: int | None = 0, ): - super().__init__(by) + super().__init__(by, mapping) self._rng = np.random.default_rng(seed) self._extent = extent - def map(self, src: DataFrameWrapper[_DF]) -> np.ndarray: + def map(self, src: DataFrameWrapper[_DF]) -> NDArray[np.floating]: w = self._extent jitter = self._rng.uniform(-w / 2, w / 2, size=len(src)) - return _map_x([src[b] for b in self._by]) + jitter + return self._map(src) + jitter class SwarmJitter(CategoricalLikeJitter): @@ -118,16 +84,17 @@ class SwarmJitter(CategoricalLikeJitter): def __init__( self, by: str | tuple[str, ...], + mapping: dict[tuple, float], value: str, limits: tuple[float, float], extent: float = 0.8, ): - super().__init__(by) + super().__init__(by, mapping) self._value = value self._extent = extent self._limits = limits - def map(self, src: DataFrameWrapper[_DF]) -> np.ndarray: + def map(self, src: DataFrameWrapper[_DF]) -> NDArray[np.floating]: values = src[self._value] vmin, vmax = self._limits nbin = 25 @@ -146,7 +113,7 @@ def map(self, src: DataFrameWrapper[_DF]) -> np.ndarray: offset_max = np.abs(offset_pre).max() width_default = dv * offset_max offsets = offset_pre / offset_max * min(self._extent / 2, width_default) - out = _map_x([src[b] for b in self._by]) + offsets + out = self._map(src) + offsets return out @@ -154,41 +121,3 @@ def _tuple(x) -> tuple[str, ...]: if isinstance(x, str): return (x,) return tuple(x) - - -def _map_x(args: 
list[np.ndarray]) -> NDArray[np.floating]: - """ - Map the input data to x-axis values. - - >>> _map_x([["a", "a", "b", "b"], ["u", "v", "u", "v"]]) # [0, 1, 2, 3] - >>> _map_x([["p", "q", "r", "r", "q"]]) # [0, 1, 2, 2, 1] - """ - by_all = tuple(str(i) for i in range(len(args))) - plan = OffsetPlan.default().more_by(*by_all) - each_unique = [unique(a, axis=None) for a in args] - labels = list(itertools.product(*each_unique)) - offsets = np.asarray(plan.generate(labels, by_all)) - out = np.zeros_like(args[0], dtype=np.float32) - for i, row in enumerate(labels): - sl = np.all(np.column_stack([a == r for a, r in zip(args, row)]), axis=1) - out[sl] = offsets[i] - return out - - -def _map_x_and_label( - args: list[np.ndarray], -) -> tuple[NDArray[np.floating], list[tuple[str, ...]]]: - """ - Map the input data to x-axis values and generate labels. - - >>> _map_x_and_label([["a", "a", "b", "b"], ["u", "v", "u", "v"]]) - >>> # [0, 1, 2, 3], [("a", "u"), ("a", "v"), ("b", "u"), ("b", "v")] - >>> _map_x_and_label([["p", "q", "r", "r", "q"]]) - >>> # [0, 1, 2], [("p",), ("q",), ("r",)] - """ - by_all = tuple(str(i) for i in range(len(args))) - plan = OffsetPlan.default().more_by(*by_all) - each_unique = [unique(a, axis=None) for a in args] - labels = list(itertools.product(*each_unique)) - offsets = np.asarray(plan.generate(labels, by_all)) - return offsets, labels diff --git a/whitecanvas/layers/tabular/_plans.py b/whitecanvas/layers/tabular/_plans.py index 3a8597d0..2f642001 100644 --- a/whitecanvas/layers/tabular/_plans.py +++ b/whitecanvas/layers/tabular/_plans.py @@ -53,7 +53,7 @@ class OffsetPolicy(ABC): @abstractmethod def get(self, interval: int) -> float: - """Get 1D array for offsets""" + """Get increment of position for given interval from the previous position.""" def with_shift(self, val: float) -> CompositeOffsetPolicy: return CompositeOffsetPolicy([self, ConstOffset(val)]) @@ -269,7 +269,7 @@ def from_const(cls, value: _V) -> Self: def is_const(self) -> bool: 
"""Return True if the plan is a constant plan.""" - return len(self.values) == 1 + return len(self.by) == 0 def generate( self, diff --git a/whitecanvas/layers/tabular/_shared.py b/whitecanvas/layers/tabular/_shared.py index 334b6d5e..e4cd61b0 100644 --- a/whitecanvas/layers/tabular/_shared.py +++ b/whitecanvas/layers/tabular/_shared.py @@ -73,7 +73,9 @@ def join_columns( continue cv = ColumnOrValue(obj, source) if cv.is_column: - out.extend(cv.columns) + for each in cv.columns: + if each not in out: + out.append(each) return tuple(out) From b11e941ac9ff60cffa3b8e6b81582cd0bdb5cc93 Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Fri, 2 Feb 2024 01:05:15 +0900 Subject: [PATCH 02/11] many bug fixes --- examples/raincloud_plot.py | 12 +- examples/superplot.py | 40 +++ whitecanvas/canvas/_base.py | 20 +- whitecanvas/canvas/dataframe/__init__.py | 13 +- whitecanvas/canvas/dataframe/_base.py | 4 +- whitecanvas/canvas/dataframe/_both_cat.py | 2 +- whitecanvas/canvas/dataframe/_feature_cat.py | 18 +- whitecanvas/canvas/dataframe/_one_cat.py | 256 +++++++++++++------ whitecanvas/layers/_primitive/line.py | 2 +- whitecanvas/layers/tabular/_box_like.py | 12 +- whitecanvas/layers/tabular/_dataframe.py | 112 ++++---- whitecanvas/layers/tabular/_df_compat.py | 16 +- whitecanvas/theme/_dataclasses.py | 2 +- 13 files changed, 340 insertions(+), 169 deletions(-) create mode 100644 examples/superplot.py diff --git a/examples/raincloud_plot.py b/examples/raincloud_plot.py index 02853291..8fe6cec0 100644 --- a/examples/raincloud_plot.py +++ b/examples/raincloud_plot.py @@ -11,16 +11,14 @@ def main(): url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv" df = pd.read_csv(url) - x = "species" - y = "sepal_width" - cat_plt = canvas.cat(df) + cat_plt = canvas.cat_x(df, x="species", y="sepal_width") cat_plt.add_stripplot( - x, y, color=x, extent=0.3 + color="species", extent=0.3 ).with_edge(color="#3F3F00").with_shift(-0.3) - cat_plt.add_boxplot(x, y, color=x, 
extent=0.3) - cat_plt.mean().add_markers(x, y, color="black", size=10, symbol="+") + cat_plt.add_boxplot(color="species", extent=0.3) + cat_plt.mean().add_markers(color="black", size=10, symbol="+") cat_plt.add_violinplot( - x, y, color=x, extent=0.3, shape="right" + color="species", extent=0.3, shape="right" ).with_edge(color="#3F3F00").with_shift(0.2) canvas.show(block=True) diff --git a/examples/superplot.py b/examples/superplot.py new file mode 100644 index 00000000..6b2cbfe1 --- /dev/null +++ b/examples/superplot.py @@ -0,0 +1,40 @@ +import numpy as np + +from whitecanvas import new_canvas + + +def rand(mean: float, n: int) -> list[float]: + """Generate random data.""" + return np.random.normal(loc=mean, scale=mean / 4, size=n).tolist() + +def main(): + # generate some random data + np.random.seed(174623) + data = { + "label": ["Control"] * 50 + ["Treatment"] * 50, + "value": rand(1.1, 15) + rand(1.4, 20) + rand(0.9, 15) + rand(3.3, 15) + rand(2.9, 20) + rand(3.8, 15), + "replicate": [1] * 15 + [2] * 20 + [3] * 15 + [1] * 15 + [2] * 20 + [3] * 15, + } + + canvas = new_canvas("matplotlib:qt") + cat_plt = canvas.cat_x(data, x="label", y="value") + + # plot all the raw data + cat_plt.add_swarmplot(color="replicate", size=8) + + # plot the mean of each replicate + cat_plt.mean_for_each("replicate").add_markers( + color="replicate", size=18, symbol="D" + ) + + # plot the mean of all the data for control and treatment + cat_plt.mean().add_markers(color="black", size=20, symbol="+") + + # plot the mean of replicate means + cat_plt.mean_for_each("replicate").mean().add_markers( + color="black", size=30, symbol="_" + ) + canvas.show(block=True) + +if __name__ == "__main__": + main() diff --git a/whitecanvas/canvas/_base.py b/whitecanvas/canvas/_base.py index f400e644..46820144 100644 --- a/whitecanvas/canvas/_base.py +++ b/whitecanvas/canvas/_base.py @@ -316,7 +316,7 @@ def cat( x: str | None = None, y: str | None = None, update_labels: bool = True, - ) -> 
_df.FeatureCatPlotter[Self, _DF]: + ) -> _df.CatPlotter[Self, _DF]: """ Categorize input data for plotting. @@ -337,7 +337,7 @@ def cat( CategorizedPlot Plotter object. """ - plotter = _df.FeatureCatPlotter(self, data, x, y, update_label=update_labels) + plotter = _df.CatPlotter(self, data, x, y, update_label=update_labels) return plotter def cat_x( @@ -347,10 +347,8 @@ def cat_x( x: str | Sequence[str] | None = None, y: str | None = None, update_labels: bool = True, - ) -> _df.OneAxisCatPlotter[Self, _DF]: - return _df.OneAxisCatPlotter( - self, data, x, y, Orientation.VERTICAL, update_labels - ) + ) -> _df.XCatPlotter[Self, _DF]: + return _df.XCatPlotter(self, data, x, y, update_labels) def cat_y( self, @@ -359,10 +357,8 @@ def cat_y( x: str | None = None, y: str | Sequence[str] | None = None, update_labels: bool = True, - ) -> _df.OneAxisCatPlotter[Self, _DF]: - return _df.OneAxisCatPlotter( - self, data, y, x, Orientation.HORIZONTAL, update_labels - ) + ) -> _df.YCatPlotter[Self, _DF]: + return _df.YCatPlotter(self, data, y, x, update_labels) def cat_xy( self, @@ -371,8 +367,8 @@ def cat_xy( x: str | Sequence[str] | None = None, y: str | Sequence[str] | None = None, update_labels: bool = True, - ) -> _df.BothAxesCatPlotter[Self, _DF]: - return _df.BothAxesCatPlotter(self, data, x, y, update_labels) + ) -> _df.XYCatPlotter[Self, _DF]: + return _df.XYCatPlotter(self, data, x, y, update_labels) def stack_over(self, layer: _L0) -> StackOverPlotter[Self, _L0]: """ diff --git a/whitecanvas/canvas/dataframe/__init__.py b/whitecanvas/canvas/dataframe/__init__.py index 4cb006ad..f8365657 100644 --- a/whitecanvas/canvas/dataframe/__init__.py +++ b/whitecanvas/canvas/dataframe/__init__.py @@ -1,5 +1,10 @@ -from whitecanvas.canvas.dataframe._both_cat import BothAxesCatPlotter -from whitecanvas.canvas.dataframe._feature_cat import FeatureCatPlotter -from whitecanvas.canvas.dataframe._one_cat import OneAxisCatPlotter +from whitecanvas.canvas.dataframe._both_cat import 
XYCatPlotter +from whitecanvas.canvas.dataframe._feature_cat import CatPlotter +from whitecanvas.canvas.dataframe._one_cat import XCatPlotter, YCatPlotter -__all__ = ["FeatureCatPlotter", "BothAxesCatPlotter", "OneAxisCatPlotter"] +__all__ = [ + "CatPlotter", + "XCatPlotter", + "YCatPlotter", + "XYCatPlotter", +] diff --git a/whitecanvas/canvas/dataframe/_base.py b/whitecanvas/canvas/dataframe/_base.py index 0a0af604..2b26b74a 100644 --- a/whitecanvas/canvas/dataframe/_base.py +++ b/whitecanvas/canvas/dataframe/_base.py @@ -15,7 +15,7 @@ import numpy as np from whitecanvas._exceptions import ReferenceDeletedError -from whitecanvas.layers.tabular import _utils +from whitecanvas.layers.tabular import _utils, parse if TYPE_CHECKING: from typing_extensions import Self @@ -36,7 +36,7 @@ def __init__( df: _DF, ): self._canvas_ref = weakref.ref(canvas) - self._df = df + self._df = parse(df) def _canvas(self) -> _C: canvas = self._canvas_ref() diff --git a/whitecanvas/canvas/dataframe/_both_cat.py b/whitecanvas/canvas/dataframe/_both_cat.py index e4aa1414..a0c8c628 100644 --- a/whitecanvas/canvas/dataframe/_both_cat.py +++ b/whitecanvas/canvas/dataframe/_both_cat.py @@ -19,7 +19,7 @@ _DF = TypeVar("_DF") -class BothAxesCatPlotter(BaseCatPlotter[_C, _DF]): +class XYCatPlotter(BaseCatPlotter[_C, _DF]): def __init__( self, canvas: _C, diff --git a/whitecanvas/canvas/dataframe/_feature_cat.py b/whitecanvas/canvas/dataframe/_feature_cat.py index e5c4c7cd..7e6fbcd7 100644 --- a/whitecanvas/canvas/dataframe/_feature_cat.py +++ b/whitecanvas/canvas/dataframe/_feature_cat.py @@ -8,7 +8,7 @@ from whitecanvas.canvas.dataframe._base import BaseCatPlotter from whitecanvas.layers import tabular as _lt -from whitecanvas.layers.tabular._dataframe import parse +from whitecanvas.layers.tabular import _jitter from whitecanvas.types import ArrayLike1D, ColormapType, Orientation if TYPE_CHECKING: @@ -20,7 +20,7 @@ _DF = TypeVar("_DF") -class FeatureCatPlotter(BaseCatPlotter[_C, _DF]): +class 
CatPlotter(BaseCatPlotter[_C, _DF]): """ Categorical plotter that categorizes the data by features (color, style etc.) """ @@ -33,7 +33,7 @@ def __init__( y: str | None, update_label: bool = False, ): - super().__init__(canvas, df, update_label) + super().__init__(canvas, df) self._x = x self._y = y self._update_label = update_label @@ -58,12 +58,12 @@ def _update_xy_label(self, x: str | None, y: str | None) -> None: if isinstance(y, str): canvas.y.label.text = y - def along_x(self) -> FeatureCatPlotter[_C, _DF]: + def along_x(self) -> CatPlotter[_C, _DF]: return self.__class__( self._canvas(), self._df, self._get_x(), None, self._update_label ) - def along_y(self) -> FeatureCatPlotter[_C, _DF]: + def along_y(self) -> CatPlotter[_C, _DF]: return self.__class__( self._canvas(), self._df, None, self._get_y(), self._update_label ) @@ -162,9 +162,10 @@ def add_markers( Marker collection layer. """ canvas = self._canvas() - df = parse(self._df) + xj = _jitter.IdentityJitter(self._get_x()) + yj = _jitter.IdentityJitter(self._get_y()) layer = _lt.DFMarkers( - df, self._get_x(), self._get_y(), name=name, color=color, hatch=hatch, + self._df, xj, yj, name=name, color=color, hatch=hatch, size=size, symbol=symbol, backend=canvas._get_backend(), ) # fmt: skip if color is not None and not layer._color_by.is_const(): @@ -252,7 +253,7 @@ def add_pointplot( ): canvas = self._canvas() layer = _lt.DFPointPlot2D( - parse(self._df), self._get_x(), self._get_y(), name=name, color=color, + self._df, self._get_x(), self._get_y(), name=name, color=color, hatch=hatch, size=size, capsize=capsize, backend=canvas._get_backend(), ) # fmt: skip return canvas.add_layer(layer) @@ -267,6 +268,7 @@ def add_hist( color: NStr | None = None, hatch: NStr | None = None, ): + # TODO: implement this raise NotImplementedError def add_hist_line( diff --git a/whitecanvas/canvas/dataframe/_one_cat.py b/whitecanvas/canvas/dataframe/_one_cat.py index 4cfcac4b..94dceb67 100644 --- 
a/whitecanvas/canvas/dataframe/_one_cat.py +++ b/whitecanvas/canvas/dataframe/_one_cat.py @@ -1,14 +1,16 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Sequence, TypeVar +from typing import TYPE_CHECKING, Generic, Sequence, TypeVar from whitecanvas import theme from whitecanvas.canvas.dataframe._base import AggMethods, BaseCatPlotter, CatIterator from whitecanvas.layers import tabular as _lt -from whitecanvas.layers.tabular import _jitter, _shared, parse +from whitecanvas.layers.tabular import _jitter, _shared from whitecanvas.types import ColorType, Hatch, Orientation, Symbol if TYPE_CHECKING: + from typing_extensions import Self + from whitecanvas.canvas._base import CanvasBase from whitecanvas.layers.tabular._box_like import _BoxLikeMixin from whitecanvas.layers.tabular._dataframe import DataFrameWrapper @@ -19,23 +21,84 @@ _DF = TypeVar("_DF") +class _Aggregator(Generic[_C, _DF]): + def __init__(self, method: str, plotter: OneAxisCatPlotter[_C, _DF] = None): + self._method = method + self._plotter = plotter + + def __get__(self, ins: _C, owner) -> Self: + return _Aggregator(self._method, ins) + + def __repr__(self) -> str: + return f"Aggregator<{self._method}>" + + def __call__(self) -> OneAxisCatAggPlotter[_C, _DF]: + """Aggregate the values before plotting it.""" + plotter = self._plotter + if plotter is None: + raise TypeError("Cannot call this method from a class.") + if self._method == "size": + value = "size" + elif plotter._value is None: + raise ValueError("Value column is not specified.") + else: + value = plotter._value + return OneAxisCatAggPlotter( + plotter._canvas(), + plotter._cat_iter, + offset=plotter._offset, + value=value, + method=self._method, + orient=plotter._orient, + ) + + +class _GroupAggregator(Generic[_C, _DF]): + def __init__(self, method: str, plotter: OneAxisCatPlotter[_C, _DF] = None): + self._method = method + self._plotter = plotter + + def __get__(self, ins: _C, owner) -> Self: + return 
_GroupAggregator(self._method, ins) + + def __repr__(self) -> str: + return f"GroupAggregator<{self._method}>" + + def __call__(self, by: str | tuple[str, ...]) -> OneAxisCatPlotter[_C, _DF]: + """Aggregate the values for each group before plotting it.""" + plotter = self._plotter + if isinstance(by, str): + by = (by,) + elif len(by) == 0: + raise ValueError("No column is specified for grouping.") + return type(plotter)( + plotter._canvas(), + plotter._df.agg_by((*plotter._offset, *by), plotter._value, self._method), + offset=plotter._offset, + value=plotter._value, + update_label=plotter._update_label, + ) + + class OneAxisCatPlotter(BaseCatPlotter[_C, _DF]): + _orient: Orientation + def __init__( self, canvas: _C, df: _DF, - offset: str | tuple[str, ...], + offset: str | tuple[str, ...] | None, value: str | None, - orient: Orientation, update_label: bool = False, ): super().__init__(canvas, df) if isinstance(offset, str): offset = (offset,) + elif offset is None: + offset = () self._offset = offset - self._cat_iter = CatIterator(parse(df), offset) + self._cat_iter = CatIterator(self._df, offset) self._value = value - self._orient = orient self._update_label = update_label if update_label: if value is not None: @@ -84,7 +147,7 @@ def add_violinplot( name: str | None = None, color: NStr | None = None, hatch: NStr | None = None, - dodge: NStr | bool | None = None, + dodge: NStr | bool = True, extent: float = 0.8, shape: str = "both", ) -> _lt.DFViolinPlot[_DF]: @@ -131,7 +194,7 @@ def add_boxplot( *, color: NStr | None = None, hatch: NStr | None = None, - dodge: NStr | bool | None = None, + dodge: NStr | bool = True, name: str | None = None, capsize: float = 0.1, extent: float = 0.8, @@ -184,7 +247,7 @@ def add_pointplot( *, color: NStr | None = None, hatch: NStr | None = None, - dodge: NStr | bool | None = None, + dodge: NStr | bool = True, name: str | None = None, capsize: float = 0.1, ) -> _lt.DFPointPlot[_DF]: @@ -240,7 +303,7 @@ def add_barplot( *, color: NStr 
| None = None, hatch: NStr | None = None, - dodge: NStr | bool | None = None, + dodge: NStr | bool = True, name: str | None = None, capsize: float = 0.1, extent: float = 0.8, @@ -302,7 +365,7 @@ def add_stripplot( hatch: NStr | None = None, symbol: NStr | None = None, size: str | None = None, - dodge: NStr | bool | None = None, + dodge: NStr | bool = False, name: str | None = None, extent: float = 0.5, seed: int | None = 0, @@ -343,7 +406,7 @@ def add_stripplot( symbol = theme._default("markers.symbol", symbol) size = theme._default("markers.size", size) - df = parse(self._df) + df = self._df splitby, dodge = _splitby_dodge(df, self._offset, color, hatch, dodge) _map = self._cat_iter.prep_position_map(splitby, dodge) _extent = self._cat_iter.zoom_factor(dodge) * extent @@ -361,6 +424,21 @@ def add_stripplot( layer.with_color(canvas._color_palette.next()) return canvas.add_layer(layer) + def add_markers( + self, + *, + name: str | None = None, + color: NStr | None = None, + hatch: NStr | None = None, + symbol: NStr | None = None, + size: str | None = None, + dodge: NStr | bool = False, + ) -> _lt.DFMarkerGroups[_DF]: + return self.add_stripplot( + color=color, hatch=hatch, symbol=symbol, size=size, dodge=dodge, + extent=0, seed=0, name=name, + ) # fmt: skip + def add_swarmplot( self, *, @@ -368,7 +446,7 @@ def add_swarmplot( hatch: NStr | None = None, symbol: NStr | None = None, size: str | None = None, - dodge: NStr | bool | None = None, + dodge: NStr | bool = False, name: str | None = None, extent: float = 0.8, sort: bool = False, @@ -408,7 +486,7 @@ def add_swarmplot( canvas = self._canvas() symbol = theme._default("markers.symbol", symbol) size = theme._default("markers.size", size) - df = parse(self._df) + df = self._df splitby, dodge = _splitby_dodge(df, self._offset, color, hatch, dodge) _map = self._cat_iter.prep_position_map(splitby, dodge) _extent = self._cat_iter.zoom_factor(dodge) * extent @@ -431,62 +509,20 @@ def add_swarmplot( 
layer.with_color(canvas._color_palette.next()) return canvas.add_layer(layer) - def add_countplot( - self, - *, - color: NStr | None = None, - hatch: NStr | None = None, - name: str | None = None, - extent: float = 0.8, - ) -> _lt.DFBars[_DF]: - """ - Add a categorical count plot. - - >>> ### Count for each category in column "species". - >>> canvas.cat(df).add_countplot("species") - - >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_countplot(offset, color="region") - - Parameters - ---------- - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - name : str, optional - Name of the layer. - extent : float, default 0.8 - Width of the violins. Usually in range (0, 1]. - - Returns - ------- - WrappedBars - Bar collection layer. - """ - canvas = self._canvas() - layer = _lt.DFBars.build_count( - self._df, self._offset, color=color, hatch=hatch, orient=self._orient, - extent=extent, name=name, backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - if self._update_label: - self._update_axis_labels("count") - return canvas.add_layer(layer) + mean = _Aggregator("mean") + median = _Aggregator("median") + min = _Aggregator("min") + max = _Aggregator("max") + std = _Aggregator("std") + sum = _Aggregator("sum") + count = _Aggregator("size") - def agg(self, method: AggMethods = "mean") -> OneAxisCatAggPlotter[_C, _DF]: - return OneAxisCatAggPlotter( - self._canvas(), - self._df, - offset=self._offset, - value=self._get_value(), - method=method, - orient=self._orient, - ) + mean_for_each = _GroupAggregator("mean") + median_for_each = 
_GroupAggregator("median") + min_for_each = _GroupAggregator("min") + max_for_each = _GroupAggregator("max") + std_for_each = _GroupAggregator("std") + sum_for_each = _GroupAggregator("sum") class OneAxisCatAggPlotter(BaseCatPlotter[_C, _DF]): @@ -536,7 +572,7 @@ def add_line( Line collection layer. """ canvas = self._canvas() - df = parse(self._df) + df = self._df _joined = _shared.join_columns(self._offset, color, style, source=df) df_agg = self._aggregate(df, _joined, self._value) xj = _jitter.CategoricalJitter(self._offset, self._cat_iter.category_map()) @@ -590,7 +626,7 @@ def add_markers( Marker collection layer. """ canvas = self._canvas() - df = parse(self._df) + df = self._df _joined = _shared.join_columns(self._offset, color, hatch, symbol, source=df) df_agg = self._aggregate(df, _joined, self._value) xj = _jitter.CategoricalJitter(self._offset, self._cat_iter.category_map()) @@ -607,13 +643,77 @@ def add_markers( layer.with_color(canvas._color_palette.next()) return canvas.add_layer(layer) + def add_bars( + self, + *, + name: str | None = None, + color: NStr | ColorType | None = None, + hatch: NStr | Hatch | None = None, + extent: float = 0.8, + ) -> _lt.DFBars[_DF]: + """ + Add bars that represent the aggregated values. + + >>> canvas.cat(df).mean().add_bars("time", "value") + + Parameters + ---------- + x : str + Column name for x-axis. + y : str + Column name for y-axis. + name : str, optional + Name of the layer. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. + hatch : str or sequence of str, optional + Column name(s) for hatches. Must be categorical. + width : str, optional + Column name for bar width. Must be numerical. + + Returns + ------- + WrappedBars + Bar collection layer. 
+ """ + canvas = self._canvas() + df = self._df + _joined = _shared.join_columns(self._offset, color, hatch, source=df) + df_agg = self._aggregate(df, _joined, self._value) + xj = _jitter.CategoricalJitter(self._offset, self._cat_iter.category_map()) + yj = _jitter.IdentityJitter(self._value).check(df_agg) + if not self._orient.is_vertical: + xj, yj = yj, xj + layer = _lt.DFBars.from_table( + df_agg, xj, yj, name=name, color=color, hatch=hatch, extent=extent, + backend=canvas._get_backend(), + ) # fmt: skip + if color is not None and not layer._color_by.is_const(): + layer.with_color(color, palette=canvas._color_palette) + elif color is None: + layer.with_color(canvas._color_palette.next()) + return canvas.add_layer(layer) + def _aggregate( self, df: DataFrameWrapper, by: tuple[str, ...], on: str, ) -> DataFrameWrapper[_DF]: - return df.agg_by(by, on, self._agg_method) + if self._agg_method == "size": + return df.value_count(by) + else: + if on is None: + raise ValueError("Value column is not specified.") + return df.agg_by(by, on, self._agg_method) + + +class XCatPlotter(OneAxisCatPlotter[_C, _DF]): + _orient = Orientation.VERTICAL + + +class YCatPlotter(OneAxisCatPlotter[_C, _DF]): + _orient = Orientation.HORIZONTAL def _splitby_dodge( @@ -621,13 +721,19 @@ def _splitby_dodge( offset: str | tuple[str, ...], color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] | None = None, - dodge: str | tuple[str, ...] | bool | None = None, + dodge: str | tuple[str, ...] 
| bool = False, ) -> tuple[tuple[str, ...], tuple[str, ...]]: if isinstance(offset, str): offset = (offset,) - if isinstance(dodge, bool) and dodge: - dodge = _shared.join_columns(color, hatch, source=source) + if isinstance(dodge, bool): + if dodge: + _all = _shared.join_columns(color, hatch, source=source) + dodge = tuple(c for c in _all if c not in offset) + else: + dodge = () elif isinstance(dodge, str): dodge = (dodge,) + else: + dodge = tuple(dodge) splitby = _shared.join_columns(offset, dodge, source=source) return splitby, dodge diff --git a/whitecanvas/layers/_primitive/line.py b/whitecanvas/layers/_primitive/line.py index 085fb8d0..49d9673b 100644 --- a/whitecanvas/layers/_primitive/line.py +++ b/whitecanvas/layers/_primitive/line.py @@ -144,7 +144,7 @@ def __init__( width: float = 1, alpha: float = 1.0, style: LineStyle | str = LineStyle.SOLID, - antialias: bool = False, + antialias: bool = True, backend: Backend | str | None = None, ): xdata, ydata = normalize_xy(xdata, ydata) diff --git a/whitecanvas/layers/tabular/_box_like.py b/whitecanvas/layers/tabular/_box_like.py index 330466db..12965607 100644 --- a/whitecanvas/layers/tabular/_box_like.py +++ b/whitecanvas/layers/tabular/_box_like.py @@ -36,14 +36,20 @@ def _splitby_dodge( offset: str | tuple[str, ...], color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] | None = None, - dodge: str | tuple[str, ...] | bool | None = None, + dodge: str | tuple[str, ...] 
| bool = False, ) -> tuple[tuple[str, ...], tuple[str, ...]]: if isinstance(offset, str): offset = (offset,) - if isinstance(dodge, bool) and dodge: - dodge = _shared.join_columns(color, hatch, source=source) + if isinstance(dodge, bool): + if dodge: + _all = _shared.join_columns(color, hatch, source=source) + dodge = tuple(c for c in _all if c not in offset) + else: + dodge = () elif isinstance(dodge, str): dodge = (dodge,) + else: + dodge = tuple(dodge) splitby = _shared.join_columns(offset, color, hatch, dodge, source=source) return splitby, dodge diff --git a/whitecanvas/layers/tabular/_dataframe.py b/whitecanvas/layers/tabular/_dataframe.py index 19ff7619..e130aa99 100644 --- a/whitecanvas/layers/tabular/_dataframe.py +++ b/whitecanvas/layers/tabular/_dataframe.py @@ -8,6 +8,7 @@ from cmap import Color, Colormap from whitecanvas import layers as _l +from whitecanvas import theme from whitecanvas.backend import Backend from whitecanvas.layers import _mixin from whitecanvas.layers import group as _lg @@ -287,6 +288,8 @@ def __init__( self.with_symbol(symbol) if size is not None: self.with_size(size) + else: + self.with_size(theme.get_theme().markers.size) @overload def with_color(self, value: ColorType) -> Self: @@ -359,10 +362,10 @@ def with_edge_colormap( self._edge_color_by = color_by return self - def with_hatch(self, by: str | Iterable[str], choices=None) -> Self: + def with_hatch(self, by: str | Iterable[str], palette=None) -> Self: cov = _shared.ColumnOrValue(by, self._source) if cov.is_column: - hatch_by = _p.HatchPlan.new(cov.columns, values=choices) + hatch_by = _p.HatchPlan.new(cov.columns, values=palette) else: hatch_by = _p.HatchPlan.from_const(Hatch(cov.value)) hatches = hatch_by.map(self._source) @@ -474,9 +477,9 @@ class DFBars( def __init__( self, source: DataFrameWrapper[_DF], - offset: str, - value: str, - *, + x, + y, + labels: list[tuple[Any, ...]], color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] 
| None = None, name: str | None = None, @@ -484,27 +487,15 @@ def __init__( extent: float = 0.8, backend: str | Backend | None = None, ): - if isinstance(offset, str): - offset = (offset,) - splitby = _shared.join_columns(offset, color, hatch, source=source) - unique_sl: list[tuple[Any, ...]] = [] - values = [] - for sl, df in source.group_by(splitby): - unique_sl.append(sl) - series = df[value] - if len(series) != 1: - raise ValueError(f"More than one value found for category {sl!r}.") - values.append(series[0]) - + splitby = _shared.join_columns(color, hatch, source=source) self._color_by = _p.ColorPlan.default() - self._hatch_by = _p.HatchPlan.default() - self._offset_by = _p.OffsetPlan.default().more_by(*offset) - self._labels = unique_sl + self._width_by = _p.WidthPlan.default() + self._style_by = _p.StylePlan.default() + self._labels = labels self._splitby = splitby - x = self._offset_by.generate(self._labels, splitby) base = _l.Bars( - x, values, name=name, orient=orient, extent=extent, backend=backend + x, y, name=name, orient=orient, extent=extent, backend=backend ).with_face_multi() super().__init__(base, source) if color is not None: @@ -513,55 +504,70 @@ def __init__( self.with_hatch(hatch) @classmethod - def from_table( + def from_cat( cls, - df: _DF, - x: str, - y: str, - *, + df: DataFrameWrapper[_DF], + offset: str | tuple[str, ...], + value: str, color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] 
| None = None, name: str | None = None, + orient: Orientation = Orientation.VERTICAL, extent: float = 0.8, backend: str | Backend | None = None, - ) -> DFBars[_DF]: - src = parse(df) - return DFBars( - src, x, y, name=name, color=color, hatch=hatch, extent=extent, - backend=backend + ): + if isinstance(offset, str): + offset = (offset,) + splitby = _shared.join_columns(offset, color, hatch, source=df) + labels: list[tuple[Any, ...]] = [] + values = [] + for sl, sub in df.group_by(splitby): + labels.append(sl) + series = sub[value] + if len(series) != 1: + raise ValueError(f"More than one value found for category {sl!r}.") + values.append(series[0]) + return cls( + df, offset, value, labels, name=name, color=color, hatch=hatch, + orient=orient, extent=extent, backend=backend ) # fmt: skip @classmethod - def build_count( + def from_table( cls, - df: _DF, - offset: str, + df: DataFrameWrapper[_DF], + x: str | _jitter.JitterBase, + y: str | _jitter.JitterBase, *, color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] 
| None = None, name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, extent: float = 0.8, backend: str | Backend | None = None, ) -> DFBars[_DF]: - src = parse(df) - splitby = _shared.join_columns(offset, color, hatch, source=src) - new_src = src.value_count(splitby) + splitby = _shared.join_columns(color, hatch, source=df) + labels: list[tuple[Any, ...]] = [] + if isinstance(x, _jitter.JitterBase): + xj = x + else: + xj = _jitter.IdentityJitter(x) + if isinstance(y, _jitter.JitterBase): + yj = y + else: + yj = _jitter.IdentityJitter(y) + xs = [] + ys = [] + for sl, sub in df.group_by(splitby): + labels.append(sl) + xs.append(xj.map(sub)) + ys.append(yj.map(sub)) + x0 = np.concatenate(xs) + y0 = np.concatenate(ys) return DFBars( - new_src, offset, "size", name=name, color=color, hatch=hatch, - orient=orient, extent=extent, backend=backend + df, x0, y0, labels, name=name, color=color, hatch=hatch, extent=extent, + backend=backend ) # fmt: skip - @property - def color(self) -> _p.ColorPlan: - """Return the color plan object.""" - return self._color_by - - @property - def hatch(self) -> _p.HatchPlan: - """Return the hatch plan object.""" - return self._hatch_by - def with_color(self, by: str | Iterable[str] | ColorType, palette=None) -> Self: cov = _shared.ColumnOrValue(by, self._source) if cov.is_column: @@ -570,7 +576,7 @@ def with_color(self, by: str | Iterable[str] | ColorType, palette=None) -> Self: color_by = _p.ColorPlan.from_palette(cov.columns, palette=palette) else: color_by = _p.ColorPlan.from_const(Color(cov.value)) - self._base_layer.face.color = color_by.generate(self._labels, self._splitby) + self._base_layer.face.color = color_by.map(self._source) self._color_by = color_by return self @@ -582,7 +588,7 @@ def with_hatch(self, by: str | Iterable[str], choices=None) -> Self: hatch_by = _p.HatchPlan.new(cov.columns, values=choices) else: hatch_by = _p.HatchPlan.from_const(Hatch(cov.value)) - self._base_layer.face.hatch = 
hatch_by.generate(self._labels, self._splitby) + self._base_layer.face.hatch = hatch_by.map(self._source) self._hatch_by = hatch_by return self diff --git a/whitecanvas/layers/tabular/_df_compat.py b/whitecanvas/layers/tabular/_df_compat.py index b9c92c70..d93c9bfd 100644 --- a/whitecanvas/layers/tabular/_df_compat.py +++ b/whitecanvas/layers/tabular/_df_compat.py @@ -126,6 +126,9 @@ def filter( return DictWrapper({k: v[sl] for k, v in self._data.items()}) def group_by(self, by: tuple[str, ...]) -> Iterator[tuple[tuple[Any, ...], Self]]: + if by == (): + yield (), self + return observed = set() for row in zip(*[self._data[b] for b in by]): if row in observed: @@ -181,6 +184,9 @@ def filter( return PandasWrapper(self._data[sers]) def group_by(self, by: tuple[str, ...]) -> Iterator[tuple[tuple[Any, ...], Self]]: + if by == (): + yield (), self + return for sl, sub in self._data.groupby(list(by), observed=True): yield sl, PandasWrapper(sub) @@ -228,6 +234,9 @@ def filter( return PolarsWrapper(df) def group_by(self, by: tuple[str, ...]) -> Iterator[tuple[tuple[Any, ...], Self]]: + if by == (): + yield (), self + return for sl, sub in self._data.group_by(by, maintain_order=True): yield sl, PolarsWrapper(sub) @@ -238,7 +247,7 @@ def agg_by(self, by: tuple[str, ...], on: str, method: str) -> Self: return PolarsWrapper(self._data.group_by(by, maintain_order=True).agg(expr)) def value_count(self, by: tuple[str, ...]) -> Self: - return ( + return PolarsWrapper( self._data.group_by(by, maintain_order=True) .count() .rename({"count": "size"}) @@ -270,6 +279,9 @@ def filter( return PyArrowWrapper(df) def group_by(self, by: tuple[str, ...]) -> Iterator[tuple[tuple[Any, ...], Self]]: + if by == (): + yield (), self + return for sl, sub in self._data.group_by(by, maintain_order=True): yield sl, PyArrowWrapper(sub) @@ -284,7 +296,7 @@ def agg_by(self, by: tuple[str, ...], on: str, method: str) -> Self: ) def value_count(self, by: tuple[str, ...]) -> Self: - return ( + return 
PyArrowWrapper( self._data.group_by(by, maintain_order=True) .count() .rename_columns([*by, "size"]) diff --git a/whitecanvas/theme/_dataclasses.py b/whitecanvas/theme/_dataclasses.py index 35b50181..b9e92848 100644 --- a/whitecanvas/theme/_dataclasses.py +++ b/whitecanvas/theme/_dataclasses.py @@ -90,7 +90,7 @@ class Line(_BaseModel): class Markers(_BaseModel): """Markers of points.""" - size: int = _field(8) + size: float = _field(8.0) hatch: Hatch = _field(Hatch.SOLID) symbol: Symbol = _field(Symbol.CIRCLE) From f11d13da993b702b4f079fdf2e4502e09380308c Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Fri, 2 Feb 2024 18:09:05 +0900 Subject: [PATCH 03/11] more bug fixes --- .../{categorical_axis.md => cat_num.md} | 2 +- docs/categorical/index.md | 16 +- docs/categorical/lines_and_markers.md | 44 ----- docs/categorical/num_num.md | 97 ++++++++++ mkdocs.yml | 4 +- tests/test_categorical.py | 58 +++--- whitecanvas/canvas/_base.py | 23 +-- whitecanvas/canvas/_imageref.py | 21 ++- whitecanvas/canvas/dataframe/_both_cat.py | 169 ++++++++++++++---- whitecanvas/canvas/dataframe/_feature_cat.py | 153 ++++++++-------- whitecanvas/canvas/dataframe/_one_cat.py | 17 +- whitecanvas/layers/tabular/_dataframe.py | 117 +++--------- whitecanvas/layers/tabular/_df_compat.py | 23 ++- 13 files changed, 456 insertions(+), 288 deletions(-) rename docs/categorical/{categorical_axis.md => cat_num.md} (99%) delete mode 100644 docs/categorical/lines_and_markers.md create mode 100644 docs/categorical/num_num.md diff --git a/docs/categorical/categorical_axis.md b/docs/categorical/cat_num.md similarity index 99% rename from docs/categorical/categorical_axis.md rename to docs/categorical/cat_num.md index 772c6eed..e8d85113 100644 --- a/docs/categorical/categorical_axis.md +++ b/docs/categorical/cat_num.md @@ -1,4 +1,4 @@ -# Categorical Axis +# Categorical × Numerical Data There are several plots that use categorical axis. 
Examples are: diff --git a/docs/categorical/index.md b/docs/categorical/index.md index 89dbe9ab..f5039dac 100644 --- a/docs/categorical/index.md +++ b/docs/categorical/index.md @@ -6,10 +6,18 @@ support for high-level categorical plotting methods that use DataFrame objects a In `whitecanvas`, similar functions are provided, but these methods do not depend on any external plotting libraries or DataFrames, and are more flexible in some cases. -## The `cat` Method +## The Categorical Plotters -The `cat` method converts a tabular data into a categorical plotter. Currently, -following objects are allowed as input: +Methods starting with "cat" return categorical plotters. Methods include: + +- `cat` ... plotter for numerical data in x/y-axis categorized by such as color. +- `cat_x` ... plotter for categorical data in x-axis. +- `cat_y` ... plotter for categorical data in y-axis. +- `cat_xy` ... plotter for categorical data in both x- and y-axis. + +These methods need a tabular data and the names of the columns that will be used as the +x and y values. +Currently, following objects are allowed as the tabular data input: - `dict` of array-like objects - `pandas.DataFrame` @@ -32,6 +40,6 @@ df = { "value": rng.normal(size=130), } -canvas.cat(df).add_stripplot("label", "value").with_edge(color="black") +canvas.cat_x(df, x="label", y="value").add_stripplot().with_edge(color="black") canvas.show() ``` diff --git a/docs/categorical/lines_and_markers.md b/docs/categorical/lines_and_markers.md deleted file mode 100644 index 0ac88435..00000000 --- a/docs/categorical/lines_and_markers.md +++ /dev/null @@ -1,44 +0,0 @@ -# Categorical Lines and Markers - -Line plot and scatter plot use numerical values for both x and y axes. In this case, -the plot is categorized by such as color, marker symbol, etc. 
- -``` python -from whitecanvas import new_canvas - -# sample data -df = { - "label": ["A"] * 5 + ["B"] * 5, - "x": [0, 1, 2, 3, 4, 0, 1, 2, 3, 4], - "y": [3, 1, 2, 4, 3, 5, 3, 3, 1, 2], -} -``` - -By setting `color=` to one of the column name, lines are split by the column and -different colors are used for each group. - -``` python -#!name: categorical_add_line_color -canvas = new_canvas("matplotlib") -canvas.cat(df).add_line("x", "y", color="label") -canvas.show() -``` - -By setting `style=`, different line styles are used instead. In the following example, -`color="black"` means that all the lines should be the same color (black). - -``` python -#!name: categorical_add_line_style -canvas = new_canvas("matplotlib") -canvas.cat(df).add_line("x", "y", color="black", style="label") -canvas.show() -``` - -In the case of markers, you can use symbols to distinguish groups. - -``` python -#!name: categorical_add_markers_symbol -canvas = new_canvas("matplotlib") -canvas.cat(df).add_markers("x", "y", symbol="label") -canvas.show() -``` diff --git a/docs/categorical/num_num.md b/docs/categorical/num_num.md new file mode 100644 index 00000000..c88bf135 --- /dev/null +++ b/docs/categorical/num_num.md @@ -0,0 +1,97 @@ +# Numerical × Numerical Data + +## Categorical Lines and Markers + +Line plot and scatter plot use numerical values for both x and y axes. In this case, +the plot is categorized by such as color, marker symbol, etc. + +``` python +from whitecanvas import new_canvas + +# sample data +df = { + "label": ["A"] * 5 + ["B"] * 5, + "x": [0, 1, 2, 3, 4, 0, 1, 2, 3, 4], + "y": [3, 1, 2, 4, 3, 5, 3, 3, 1, 2], +} +``` + +By setting `color=` to one of the column name, lines are split by the column and +different colors are used for each group. + +``` python +#!name: categorical_add_line_color +canvas = new_canvas("matplotlib") +canvas.cat(df, "x", "y").add_line(color="label") +canvas.show() +``` + +By setting `style=`, different line styles are used instead. 
In the following example, +`color="black"` means that all the lines should be the same color (black). + +``` python +#!name: categorical_add_line_style +canvas = new_canvas("matplotlib") +canvas.cat(df, "x", "y").add_line(color="black", style="label") +canvas.show() +``` + +In the case of markers, you can use symbols to distinguish groups. + +``` python +#!name: categorical_add_markers_symbol +canvas = new_canvas("matplotlib") +canvas.cat(df, "x", "y").add_markers(symbol="label") +canvas.show() +``` + +## Distribution of Numerical Data + +There are several ways to visualize the distribution of numerical data. + +- Histogram +- Kernel Density Estimation (KDE) + +These representations only use one array of numerical data. Therefore, either `x` or `y` should be empty in the `cat` method. + +``` python +import numpy as np + +rng = np.random.default_rng(12345) + +# sample data +df = { + "label": ["A"] * 60 + ["B"] * 30 + ["C"] * 40, + "value": rng.normal(size=130), +} +``` + +`x="value"` means that the x-axis being "value" and the y-axis being the count. +Arguments forwards to the `histogram` method of `numpy`. + +``` python +#!name: cat_hist_x +canvas = new_canvas("matplotlib") +canvas.cat(df, x="value").add_hist(bins=10) +canvas.show() +``` + +To transpose the histogram, use `y="value"`. + +``` python +#!name: cat_hist_y +canvas = new_canvas("matplotlib") +canvas.cat(df, y="value").add_hist(bins=10) +canvas.show() +``` + +If both `x` and `y` are set, the plotter cannot determine which axis to use. To tell +the plotter which axis to use, call `along_x()` or `along_y()` to restrict the +dimension. 
+ +``` python +canvas = new_canvas("matplotlib") +# canvas.cat(df, x="label", y="value").add_hist(bins=10) # This will raise an error +canvas.cat(df, x="label", y="value").along_x().add_hist(bins=10) +canvas.show() +``` diff --git a/mkdocs.yml b/mkdocs.yml index 76c403a6..d137731e 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -33,8 +33,8 @@ nav: - Working with the Backend Objects: canvas/native_objects.md - Categorical Plot: - Overview: categorical/index.md - - Categorical Lines and Markers: categorical/lines_and_markers.md - - Categorical Axis: categorical/categorical_axis.md + - Numerical × Numerical Data: categorical/num_num.md + - Categorical × Numerical Data: categorical/cat_num.md plugins: diff --git a/tests/test_categorical.py b/tests/test_categorical.py index dacd368a..78ecf7dc 100644 --- a/tests/test_categorical.py +++ b/tests/test_categorical.py @@ -4,20 +4,41 @@ from ._utils import assert_color_array_equal import pytest +def test_cat(backend: str): + canvas = new_canvas(backend=backend) + rng = np.random.default_rng(1642) + df = { + "x": rng.normal(size=30), + "y": rng.normal(size=30), + "label": np.repeat(["A", "B", "C"], 10), + } + canvas.cat(df, "x", "y").add_line() + canvas.cat(df, "x", "y").add_line(color="label") + canvas.cat(df, "x", "y").add_markers() + canvas.cat(df, "x", "y").add_markers(color="label") + canvas.cat(df, "x", "y").add_markers(hatch="label") + canvas.cat(df, "x", "y").add_hist2d(bins=(5, 4)) + canvas.cat(df, "x", "y").along_x().add_hist_line(bins=5) + canvas.cat(df, "x", "y").along_x().add_hist_line(bins=5, color="label") + canvas.cat(df, "x", "y").along_y().add_hist_line(bins=6) + canvas.cat(df, "x", "y").along_y().add_hist_line(bins=6, color="label") + @pytest.mark.parametrize("orient", ["v", "h"]) def test_cat_plots(backend: str, orient: str): canvas = new_canvas(backend=backend) df = { - "x": np.arange(30), "y": np.arange(30), "label": np.repeat(["A", "B", "C"], 10), + "c": ["P", "Q"] * 15, } - - 
canvas.cat(df).add_stripplot("label", "y", orient=orient) - canvas.cat(df).add_swarmplot("label", "y", orient=orient) - canvas.cat(df).add_boxplot("label", "y", orient=orient) - canvas.cat(df).add_violinplot("label", "y", orient=orient) - canvas.cat(df).add_countplot("label", orient=orient) + if orient == "v": + cat_plt = canvas.cat_x(df, "label", "y") + else: + cat_plt = canvas.cat_y(df, "y", "label") + cat_plt.add_stripplot(color="c") + cat_plt.add_swarmplot(color="c") + cat_plt.add_boxplot(color="c") + cat_plt.add_violinplot(color="c") def test_colored_plots(backend: str): canvas = new_canvas(backend=backend) @@ -27,8 +48,8 @@ def test_colored_plots(backend: str): "label": np.repeat(["A", "B", "C"], 10), } - canvas.cat(df).add_markers("x", "y", color="label") - canvas.cat(df).add_line("x", "y", color="label") + canvas.cat(df, "x", "y").add_markers(color="label") + canvas.cat(df, "x", "y").add_line(color="label") def test_markers(backend: str): canvas = new_canvas(backend=backend) @@ -40,26 +61,26 @@ def test_markers(backend: str): "label1": ["One"] * 10 + ["Two"] * 20, } - _c = canvas.cat(df) - out = _c.add_markers("x", "y", color="label0", size="size", symbol="label1") + _c = canvas.cat(df, "x", "y") + out = _c.add_markers(color="label0", size="size", symbol="label1") assert len(set(out._base_layer.symbol[:10])) == 1 assert len(set(out._base_layer.symbol[10:])) == 1 - out = _c.add_markers("x", "y", color="label1", size="size", hatch="label0") + out = _c.add_markers(color="label1", size="size", hatch="label0") assert len(set(out._base_layer.face.hatch[:10])) == 1 assert len(set(out._base_layer.face.hatch[10:20])) == 1 assert len(set(out._base_layer.face.hatch[20:])) == 1 - out = _c.add_markers("x", "y", color="label1").with_edge(color="label0") + out = _c.add_markers(color="label1").with_edge(color="label0") assert len(np.unique(out._base_layer.edge.color[:10], axis=0)) == 1 assert len(np.unique(out._base_layer.edge.color[10:20], axis=0)) == 1 assert 
len(np.unique(out._base_layer.edge.color[20:], axis=0)) == 1 # test scalar color - out = _c.add_markers("x", "y", color="black") + out = _c.add_markers(color="black") assert_color_array_equal(out._base_layer.face.color, "black") - out = _c.add_markers("x", "y", color="transparent").with_edge_colormap("size") + out = _c.add_markers(color="transparent").with_edge_colormap("size") def test_heatmap(backend: str): canvas = new_canvas(backend=backend) @@ -68,7 +89,7 @@ def test_heatmap(backend: str): "y": ["P", "P", "Q", "Q", "R", "R"], "z": [1, 2, 3, 4, 5, 6], } - im = canvas.cat(df).add_heatmap("x", "y", value="z") + im = canvas.cat_xy(df, "x", "y").first().add_heatmap(value="z") canvas.imref(im).add_text() df = { @@ -76,9 +97,6 @@ def test_heatmap(backend: str): "y": ["P", "Q", "Q", "Q", "P", "Q"], "z": [1.1, 2.1, 3.4, 6.4, 1.1, 6.8], } - with pytest.raises(ValueError): - # has duplication - canvas.cat(df).add_heatmap("x", "y", value="z") - im = canvas.cat(df).mean().add_heatmap("x", "y", value="z", fill=-1) + im = canvas.cat_xy(df, "x", "y").mean().add_heatmap(value="z", fill=-1) canvas.imref(im).add_text(fmt=".1f") assert im.clim == (1.1, 6.6) diff --git a/whitecanvas/canvas/_base.py b/whitecanvas/canvas/_base.py index 46820144..089affc6 100644 --- a/whitecanvas/canvas/_base.py +++ b/whitecanvas/canvas/_base.py @@ -312,9 +312,9 @@ def update_axes( def cat( self, data: _DF, - *, x: str | None = None, y: str | None = None, + *, update_labels: bool = True, ) -> _df.CatPlotter[Self, _DF]: """ @@ -343,9 +343,9 @@ def cat( def cat_x( self, data: _DF, - *, x: str | Sequence[str] | None = None, y: str | None = None, + *, update_labels: bool = True, ) -> _df.XCatPlotter[Self, _DF]: return _df.XCatPlotter(self, data, x, y, update_labels) @@ -353,9 +353,9 @@ def cat_x( def cat_y( self, data: _DF, - *, x: str | None = None, y: str | Sequence[str] | None = None, + *, update_labels: bool = True, ) -> _df.YCatPlotter[Self, _DF]: return _df.YCatPlotter(self, data, y, x, 
update_labels) @@ -363,9 +363,9 @@ def cat_y( def cat_xy( self, data: _DF, + x: str | Sequence[str], + y: str | Sequence[str], *, - x: str | Sequence[str] | None = None, - y: str | Sequence[str] | None = None, update_labels: bool = True, ) -> _df.XYCatPlotter[Self, _DF]: return _df.XYCatPlotter(self, data, x, y, update_labels) @@ -760,7 +760,8 @@ def add_hist2d( cmap: ColormapType = "inferno", name: str | None = None, bins: int | tuple[int, int] = 10, - range: tuple[tuple[float, float], tuple[float, float]] | None = None, + rangex: tuple[float, float] | None = None, + rangey: tuple[float, float] | None = None, density: bool = False, ) -> _l.Image: """ @@ -786,8 +787,10 @@ def add_hist2d( bins : int or tuple[int, int], optional Bins of the histogram of X/Y dimension respectively. If an integer is given, it will be used for both dimensions. - range : (2, 2) array-like, optional - Range in which histogram will be built. + rangex : (float, float), optional + Range of x values in which histogram will be built. + rangey : (float, float), optional + Range of y values in which histogram will be built. density : bool, default False If True, values of the histogram will be normalized so that the total intensity of the histogram will be 1. @@ -798,8 +801,8 @@ def add_hist2d( Image layer representing the 2D histogram. 
""" layer = _l.Image.build_hist( - x, y, bins=bins, range=range, density=density, name=name, cmap=cmap, - backend=self._get_backend(), + x, y, bins=bins, range=(rangex, rangey), density=density, name=name, + cmap=cmap, backend=self._get_backend(), ) # fmt: skip return self.add_layer(layer) diff --git a/whitecanvas/canvas/_imageref.py b/whitecanvas/canvas/_imageref.py index d148b0bf..909f62a1 100644 --- a/whitecanvas/canvas/_imageref.py +++ b/whitecanvas/canvas/_imageref.py @@ -61,6 +61,7 @@ def add_text( size: int = 8, color_rule: ColorType | Callable[[np.ndarray], ColorType] | None = None, fmt: str = "", + text_invalid: str | None = None, ) -> Texts[_mixin.MonoFace, _mixin.MonoEdge, _mixin.MultiFont]: """ Add text annotation to each pixel of the image. @@ -130,14 +131,18 @@ def _color_rule(x: NDArray[np.number]) -> NDArray[np.float32]: fmt_style = "{}" for iy, y in enumerate(ys): for ix, x in enumerate(xs): - texts.append(fmt_style.format(img_data[iy, ix])) + if np.isfinite(img_data[iy, ix]): + text = fmt_style.format(img_data[iy, ix]) + else: + if text_invalid is None: + text = repr(img_data[iy, ix]) + else: + text = text_invalid + texts.append(text) xdata.append(x) ydata.append(y) colors.append(_color_rule(img_color[iy, ix])) - return canvas.add_text( - xdata, - ydata, - texts, - size=size, - anchor="center", - ).with_font_multi(color=np.stack(colors, axis=0)) + return ( + canvas.add_text(xdata, ydata, texts, size=size, anchor="center") + .with_font_multi(color=np.stack(colors, axis=0)) + ) # fmt: skip diff --git a/whitecanvas/canvas/dataframe/_both_cat.py b/whitecanvas/canvas/dataframe/_both_cat.py index a0c8c628..6e49fb3c 100644 --- a/whitecanvas/canvas/dataframe/_both_cat.py +++ b/whitecanvas/canvas/dataframe/_both_cat.py @@ -2,16 +2,22 @@ from typing import ( TYPE_CHECKING, + Generic, Sequence, TypeVar, ) -from whitecanvas.canvas.dataframe._base import BaseCatPlotter +import numpy as np + +from whitecanvas.canvas.dataframe._base import BaseCatPlotter, 
CatIterator from whitecanvas.layers import tabular as _lt from whitecanvas.types import ColormapType if TYPE_CHECKING: + from typing_extensions import Self + from whitecanvas.canvas._base import CanvasBase + from whitecanvas.layers.tabular._dataframe import DataFrameWrapper NStr = str | Sequence[str] @@ -19,6 +25,34 @@ _DF = TypeVar("_DF") +class _XYAggregator(Generic[_C, _DF]): + def __init__(self, method: str, plotter: XYCatPlotter[_C, _DF] = None): + self._method = method + self._plotter = plotter + + def __get__(self, ins: _C, owner) -> Self: + return _XYAggregator(self._method, ins) + + def __repr__(self) -> str: + return f"XYAggregator<{self._method}>" + + def __call__(self) -> XYCatAggPlotter[_C, _DF]: + """Aggregate the values before plotting it.""" + plotter = self._plotter + if plotter is None: + raise TypeError("Cannot call this method from a class.") + if plotter._x is None or plotter._y is None: + raise ValueError("Value column is not specified.") + return XYCatAggPlotter( + plotter._canvas(), + plotter._cat_iter_x, + plotter._cat_iter_y, + x=plotter._x, + y=plotter._y, + method=self._method, + ) + + class XYCatPlotter(BaseCatPlotter[_C, _DF]): def __init__( self, @@ -28,11 +62,19 @@ def __init__( y: str | tuple[str, ...], update_label: bool = False, ): - super().__init__(canvas, df, update_label) - self._x = x - self._y = y + super().__init__(canvas, df) + if isinstance(x, str): + x = (x,) + if isinstance(y, str): + y = (y,) + self._x: tuple[str, ...] = x + self._y: tuple[str, ...] 
= y + self._update_label = update_label + self._cat_iter_x = CatIterator(self._df, x) + self._cat_iter_y = CatIterator(self._df, y) if update_label: self._update_xy_label(x, y) + self._update_axis_labels() def _update_xy_label( self, @@ -48,6 +90,38 @@ def _update_xy_label( canvas.x.label.text = x canvas.y.label.text = y + def _update_axis_labels(self) -> None: + """Update the x and y labels using the column names""" + canvas = self._canvas() + canvas.x.ticks.set_labels(*self._cat_iter_x.axis_ticks()) + canvas.y.ticks.set_labels(*self._cat_iter_y.axis_ticks()) + + mean = _XYAggregator("mean") + median = _XYAggregator("median") + sum = _XYAggregator("sum") + min = _XYAggregator("min") + max = _XYAggregator("max") + count = _XYAggregator("size") + first = _XYAggregator("first") + + +class XYCatAggPlotter(BaseCatPlotter[_C, _DF]): + def __init__( + self, + canvas: _C, + cat_iter_x: CatIterator[_DF], + cat_iter_y: CatIterator[_DF], + x: str | tuple[str, ...], + y: str | tuple[str, ...], + method: str, + ): + super().__init__(canvas, cat_iter_x.df) + self._cat_iter_x = cat_iter_x + self._cat_iter_y = cat_iter_y + self._x = x + self._y = y + self._agg_method = method + def add_heatmap( self, value: str, @@ -57,35 +131,66 @@ def add_heatmap( name: str | None = None, fill: float = 0, ) -> _lt.DFHeatmap[_DF]: + """ + Add a heatmap whose color represents the value of the aggregated data. + + Parameters + ---------- + value : str + Column name to use as the value. + cmap : colormap-like, default "inferno" + Colormap to use for the heatmap. + clim : (float, float), optional + Color limits for the colormap. If not specified, the limits are calculated + from the data min/max. + name : str, optional + Name of the layer. + fill : float, optional + Value to fill for the cells that do not have any data. This value will not + be considered when calculating the color limits. + + Returns + ------- + DFHeatmap + Dataframe bound heatmap layer. 
+ """ canvas = self._canvas() - layer = _lt.DFHeatmap.build_heatmap( - self._df, self._x, self._y, value, cmap=cmap, clim=clim, name=name, - fill=fill, backend=canvas._get_backend(), + df = self._df + by_both = (*self._x, *self._y) + nx = len(self._x) + df_agg = self._aggregate(df, by_both, value) + map_x = self._cat_iter_x.prep_position_map(self._x) + map_y = self._cat_iter_y.prep_position_map(self._y) + dtype = df[value].dtype + if dtype.kind not in "fiub": + raise ValueError(f"Column {value!r} is not numeric.") + arr = np.full((len(map_y), len(map_x)), fill, dtype=dtype) + for sl, sub in df_agg.group_by(by_both): + xval, yval = sl[:nx], sl[nx:] + vals = sub[value] + if vals.size == 1: + arr[map_y[yval], map_x[xval]] = vals[0] + else: + raise ValueError(f"More than one value found for {sl!r}.") + if clim is None: + # `fill` may be outside the range of the data, so calculate clim here. + clim = df_agg[value].min(), df_agg[value].max() + layer = _lt.DFHeatmap.from_array( + df_agg, arr, name=name, cmap=cmap, clim=clim, backend=canvas._get_backend(), ) # fmt: skip - if self._update_label: - canvas.x.ticks.set_labels(*layer._generate_xticks()) - canvas.y.ticks.set_labels(*layer._generate_yticks()) return canvas.add_layer(layer) - -# TODO: add this in agg plotter -# def add_heatmap( -# self, -# value: str, -# *, -# cmap: ColormapType = "inferno", -# clim: tuple[float, float] | None = None, -# name: str | None = None, -# fill: float = 0, -# ) -> _lt.DFHeatmap[_DF]: -# canvas = self._canvas() -# df = parse(self._df) -# df_agg = self._aggregate(df, (x, y), value) -# layer = _lt.DFHeatmap.build_heatmap( -# df_agg, x, y, value, cmap=cmap, clim=clim, name=name, fill=fill, -# backend=canvas._get_backend(), -# ) # fmt: skip -# if self._update_label: -# canvas.x.ticks.set_labels(*layer._generate_xticks()) -# canvas.y.ticks.set_labels(*layer._generate_yticks()) -# return canvas.add_layer(layer) + def _aggregate( + self, + df: DataFrameWrapper[_DF], + by: tuple[str, ...], + on: 
str, + ) -> DataFrameWrapper[_DF]: + if self._agg_method == "size": + return df.value_count(by) + elif self._agg_method == "first": + return df.value_first(by, on) + else: + if on is None: + raise ValueError("Value column is not specified.") + return df.agg_by(by, on, self._agg_method) diff --git a/whitecanvas/canvas/dataframe/_feature_cat.py b/whitecanvas/canvas/dataframe/_feature_cat.py index 7e6fbcd7..7ec50387 100644 --- a/whitecanvas/canvas/dataframe/_feature_cat.py +++ b/whitecanvas/canvas/dataframe/_feature_cat.py @@ -59,14 +59,17 @@ def _update_xy_label(self, x: str | None, y: str | None) -> None: canvas.y.label.text = y def along_x(self) -> CatPlotter[_C, _DF]: - return self.__class__( - self._canvas(), self._df, self._get_x(), None, self._update_label - ) + """Return the same plotter but with only x-axis set.""" + return self._copy_like(self._get_x(), None, self._update_label) def along_y(self) -> CatPlotter[_C, _DF]: - return self.__class__( - self._canvas(), self._df, None, self._get_y(), self._update_label - ) + """Return the same plotter but with only y-axis set.""" + return self._copy_like(None, self._get_y(), self._update_label) + + def _copy_like(self, x, y, update_label): + out = self.__class__(self._canvas(), self._df, x, y, False) + out._update_label = update_label + return out def add_line( self, @@ -80,13 +83,13 @@ def add_line( Add a categorical line plot. 
>>> ### Use "time" column as x-axis and "value" column as y-axis - >>> canvas.cat(df).add_line("time", "value") + >>> canvas.cat(df, "time", "value").add_line() >>> ### Multiple lines colored by column "group" - >>> canvas.cat(df).add_line("time", "value", color="group") + >>> canvas.cat(df, "time", "value").add_line(color="group") >>> ### Multiple lines styled by column "group" - >>> canvas.cat(df).add_line("time", "value", style="group") + >>> canvas.cat(df, "time", "value").add_line(style="group") Parameters ---------- @@ -101,7 +104,7 @@ def add_line( Returns ------- - WrappedLines + DFLines Line collection layer. """ canvas = self._canvas() @@ -128,23 +131,22 @@ def add_markers( Add a categorical marker plot. >>> ### Use "time" column as x-axis and "value" column as y-axis - >>> canvas.cat(df).add_markers("time", "value") + >>> canvas.cat(df, "time", "value").add_markers() >>> ### Multiple markers colored by column "group" - >>> canvas.cat(df).add_markers("time", "value", color="group") + >>> canvas.cat(df, "time", "value").add_markers(color="group") + + >>> ### Change marker size according to "weight" column + >>> canvas.cat(df, "time", "value").add_markers(size="weight") >>> ### Multiple markers with hatches determined by column "group" - >>> canvas.cat(df).add_markers("time", "value", style="group") + >>> canvas.cat(df, "time", "value").add_markers(hatch="group") >>> ### Multiple markers with symbols determined by "group" - >>> canvas.cat(df).add_markers("time", "value", symbol="group") + >>> canvas.cat(df, "time", "value").add_markers(symbol="group") Parameters ---------- - x : str - Column name for x-axis. - y : str - Column name for y-axis. name : str, optional Name of the layer. color : str or sequence of str, optional @@ -158,7 +160,7 @@ def add_markers( Returns ------- - WrappedMarkers + DFMarkers Marker collection layer. 
""" canvas = self._canvas() @@ -174,71 +176,48 @@ def add_markers( layer.with_color(canvas._color_palette.next()) return canvas.add_layer(layer) - def add_bar( + def add_hist2d( self, *, + cmap: ColormapType = "inferno", name: str | None = None, - color: NStr | None = None, - hatch: NStr | None = None, - extent: float = 0.8, - ) -> _lt.DFBars[_DF]: + bins: int | tuple[int, int] = 10, + rangex: tuple[float, float] | None = None, + rangey: tuple[float, float] | None = None, + density: bool = False, + ): """ - Add a categorical bar plot. - - >>> ### Use "time" column as x-axis and "value" column as y-axis - >>> canvas.cat(df).add_bar("time", "value") - - >>> ### Multiple bars colored by column "group" - >>> canvas.cat(df).add_bar("time", "value", color="group") + Add 2-D histogram of given x/y columns. - >>> ### Multiple bars with hatches determined by column "group" - >>> canvas.cat(df).add_bar("time", "value", hatch="group") + >>> ### Use "tip" column as x-axis and "total_bill" column as y-axis + >>> canvas.cat(df, "tip", "total_bill").add_hist2d() Parameters ---------- - x : str - Column name for x-axis. - y : str - Column name for y-axis. + cmap : colormap-like, default "inferno" + Colormap to use for the heatmap. name : str, optional Name of the layer. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - hatch : str or sequence of str, optional - Column name(s) for hatches. Must be categorical. - extent : float, optional - Width of the bars. Usually in range (0, 1]. + bins : int or tuple[int, int], default 10 + If int, the number of bins for both x and y. If tuple, the number of bins + for x and y respectively. + rangex : (float, float), optional + Range of x values in which histogram will be built. + rangey : (float, float), optional + Range of y values in which histogram will be built. 
+ density : bool, default False + If True, the result is the value of the probability density function at the + bin, normalized such that the integral over the range is 1. Returns ------- - WrappedBars - Bar collection layer. + DFHeatmap + Dataframe bound heatmap layer. """ canvas = self._canvas() - layer = _lt.DFBars.from_table( - self._df, self._get_x(), self._get_y(), name=name, color=color, hatch=hatch, - extent=extent, backend=canvas._get_backend(), - ) # fmt: skip - if color is not None and not layer._color_by.is_const(): - layer.with_color(layer._color_by.by, palette=canvas._color_palette) - elif color is None: - layer.with_color(canvas._color_palette.next()) - return canvas.add_layer(layer) - - def add_hist2d( - self, - *, - cmap: ColormapType = "inferno", - name: str | None = None, - bins: int | tuple[int, int] = 10, - range: tuple[tuple[float, float], tuple[float, float]] | None = None, - density: bool = False, - ): - """Add 2-D histogram of given columns.""" - canvas = self._canvas() layer = _lt.DFHeatmap.build_hist( self._df, self._get_x(), self._get_y(), cmap=cmap, name=name, bins=bins, - range=range, density=density, backend=canvas._get_backend(), + range=(rangex, rangey), density=density, backend=canvas._get_backend(), ) # fmt: skip return canvas.add_layer(layer) @@ -251,6 +230,36 @@ def add_pointplot( size: float | None = None, capsize: float = 0.15, ): + """ + Add 2-D point plot. + + >>> ### Use "time" column as x-axis and "value" column as y-axis + >>> canvas.cat(df, "time", "value").add_pointplot() + + >>> ### Multiple point plots colored by column "group" + >>> canvas.cat(df, "time", "value").add_pointplot(color="group") + + >>> ### Multiple point plots with hatches determined by column "group" + >>> canvas.cat(df, "time", "value").add_pointplot(hatch="group") + + Parameters + ---------- + name : str, optional + Name of the layer. + color : str or sequence of str, optional + Column name(s) for coloring the lines. Must be categorical. 
+ hatch : str or sequence of str, optional + Column name(s) for hatches. Must be categorical. + size : float, optional + Size of the points. + capsize : float, default 0.15 + Size of the cap on the error bars. + + Returns + ------- + DFPointPlot2D + Point plot layer. + """ canvas = self._canvas() layer = _lt.DFPointPlot2D( self._df, self._get_x(), self._get_y(), name=name, color=color, @@ -286,10 +295,10 @@ def add_hist_line( Add lines representing histograms. >>> ### Use "value" column as x-axis - >>> canvas.cat(df).add_line_hist("value", bins=8, density=True) + >>> canvas.cat(df, x="value").add_line_hist(bins=8, density=True) >>> ### Multiple histograms colored by column "group" - >>> canvas.cat(df).add_line_hist("value", color="group") + >>> canvas.cat(df, x="value").add_line_hist(color="group") Parameters ---------- @@ -345,10 +354,10 @@ def add_kde( Add lines representing kernel density estimation. >>> ### Use "value" column as x-axis - >>> canvas.cat(df).add_kde("value") + >>> canvas.cat(df, x="value").add_kde() >>> ### Multiple KDEs colored by column "group" - >>> canvas.cat(df).add_kde("value", color="group") + >>> canvas.cat(df, x="value").add_kde(color="group") Parameters ---------- @@ -398,5 +407,5 @@ def _column_and_orient(self) -> tuple[str, Orientation]: return self._y, Orientation.HORIZONTAL -class FeatureCatAggPlotter: +class CatAggPlotter: ...
diff --git a/whitecanvas/canvas/dataframe/_one_cat.py b/whitecanvas/canvas/dataframe/_one_cat.py index 94dceb67..e87dd0aa 100644 --- a/whitecanvas/canvas/dataframe/_one_cat.py +++ b/whitecanvas/canvas/dataframe/_one_cat.py @@ -6,7 +6,7 @@ from whitecanvas.canvas.dataframe._base import AggMethods, BaseCatPlotter, CatIterator from whitecanvas.layers import tabular as _lt from whitecanvas.layers.tabular import _jitter, _shared -from whitecanvas.types import ColorType, Hatch, Orientation, Symbol +from whitecanvas.types import ColormapType, ColorType, Hatch, Orientation, Symbol if TYPE_CHECKING: from typing_extensions import Self @@ -96,7 +96,7 @@ def __init__( offset = (offset,) elif offset is None: offset = () - self._offset = offset + self._offset: tuple[str, ...] = offset self._cat_iter = CatIterator(self._df, offset) self._value = value self._update_label = update_label @@ -509,6 +509,15 @@ def add_swarmplot( layer.with_color(canvas._color_palette.next()) return canvas.add_layer(layer) + def add_hist_heatmap( + self, + cmap: ColormapType = "inferno", + clim: tuple[float, float] | None = None, + ) -> _lt.DFHeatmap[_DF]: + # TODO: implement this + raise NotImplementedError + + # aggregators and group aggregators mean = _Aggregator("mean") median = _Aggregator("median") min = _Aggregator("min") @@ -516,6 +525,7 @@ def add_swarmplot( std = _Aggregator("std") sum = _Aggregator("sum") count = _Aggregator("size") + first = _Aggregator("first") mean_for_each = _GroupAggregator("mean") median_for_each = _GroupAggregator("median") @@ -523,6 +533,7 @@ def add_swarmplot( max_for_each = _GroupAggregator("max") std_for_each = _GroupAggregator("std") sum_for_each = _GroupAggregator("sum") + first_for_each = _GroupAggregator("first") class OneAxisCatAggPlotter(BaseCatPlotter[_C, _DF]): @@ -702,6 +713,8 @@ def _aggregate( ) -> DataFrameWrapper[_DF]: if self._agg_method == "size": return df.value_count(by) + elif self._agg_method == "first": + return df.value_first(by, on) else: if 
on is None: raise ValueError("Value column is not specified.") diff --git a/whitecanvas/layers/tabular/_dataframe.py b/whitecanvas/layers/tabular/_dataframe.py index e130aa99..1f17de45 100644 --- a/whitecanvas/layers/tabular/_dataframe.py +++ b/whitecanvas/layers/tabular/_dataframe.py @@ -15,7 +15,6 @@ from whitecanvas.layers.tabular import _jitter, _shared from whitecanvas.layers.tabular import _plans as _p from whitecanvas.layers.tabular._df_compat import DataFrameWrapper, parse -from whitecanvas.layers.tabular._utils import unique from whitecanvas.types import ( ArrayLike1D, ColormapType, @@ -479,7 +478,6 @@ def __init__( source: DataFrameWrapper[_DF], x, y, - labels: list[tuple[Any, ...]], color: str | tuple[str, ...] | None = None, hatch: str | tuple[str, ...] | None = None, name: str | None = None, @@ -489,9 +487,7 @@ def __init__( ): splitby = _shared.join_columns(color, hatch, source=source) self._color_by = _p.ColorPlan.default() - self._width_by = _p.WidthPlan.default() self._style_by = _p.StylePlan.default() - self._labels = labels self._splitby = splitby base = _l.Bars( @@ -503,35 +499,6 @@ def __init__( if hatch is not None: self.with_hatch(hatch) - @classmethod - def from_cat( - cls, - df: DataFrameWrapper[_DF], - offset: str | tuple[str, ...], - value: str, - color: str | tuple[str, ...] | None = None, - hatch: str | tuple[str, ...] 
| None = None, - name: str | None = None, - orient: Orientation = Orientation.VERTICAL, - extent: float = 0.8, - backend: str | Backend | None = None, - ): - if isinstance(offset, str): - offset = (offset,) - splitby = _shared.join_columns(offset, color, hatch, source=df) - labels: list[tuple[Any, ...]] = [] - values = [] - for sl, sub in df.group_by(splitby): - labels.append(sl) - series = sub[value] - if len(series) != 1: - raise ValueError(f"More than one value found for category {sl!r}.") - values.append(series[0]) - return cls( - df, offset, value, labels, name=name, color=color, hatch=hatch, - orient=orient, extent=extent, backend=backend - ) # fmt: skip - @classmethod def from_table( cls, @@ -543,10 +510,10 @@ def from_table( hatch: str | tuple[str, ...] | None = None, name: str | None = None, extent: float = 0.8, + orient: Orientation = Orientation.VERTICAL, backend: str | Backend | None = None, ) -> DFBars[_DF]: splitby = _shared.join_columns(color, hatch, source=df) - labels: list[tuple[Any, ...]] = [] if isinstance(x, _jitter.JitterBase): xj = x else: @@ -557,17 +524,32 @@ def from_table( yj = _jitter.IdentityJitter(y) xs = [] ys = [] - for sl, sub in df.group_by(splitby): - labels.append(sl) + for _, sub in df.group_by(splitby): xs.append(xj.map(sub)) ys.append(yj.map(sub)) x0 = np.concatenate(xs) y0 = np.concatenate(ys) return DFBars( - df, x0, y0, labels, name=name, color=color, hatch=hatch, extent=extent, - backend=backend + df, x0, y0, name=name, color=color, hatch=hatch, extent=extent, + orient=orient, backend=backend, ) # fmt: skip + @classmethod + def build_hist( + cls, + df: DataFrameWrapper[_DF], + bins: int | ArrayLike1D, + density: bool = False, + range: tuple[float, float] | None = None, + color: str | tuple[str, ...] | None = None, + hatch: str | tuple[str, ...] | None = None, + name: str | None = None, + extent: float = 0.8, + orient: Orientation = Orientation.VERTICAL, + backend: str | Backend | None = None, + ) -> DFBars[_DF]: + ... 
+ def with_color(self, by: str | Iterable[str] | ColorType, palette=None) -> Self: cov = _shared.ColumnOrValue(by, self._source) if cov.is_column: @@ -594,17 +576,6 @@ def with_hatch(self, by: str | Iterable[str], choices=None) -> Self: class DFHeatmap(_shared.DataFrameLayerWrapper[_l.Image, _DF], Generic[_DF]): - def __init__( - self, - base: _l.Image, - source: DataFrameWrapper[_DF], - xticks: list[str] | None = None, - yticks: list[str] | None = None, - ): - super().__init__(base, source) - self._xticks = xticks - self._yticks = yticks - @property def cmap(self) -> Colormap: return self._base_layer.cmap @@ -648,54 +619,16 @@ def build_hist( return cls(base, src) @classmethod - def build_heatmap( + def from_array( cls, - df: _DF, - x: str, - y: str, - value: str, + src: DataFrameWrapper[_DF], + arr: np.ndarray, name: str | None = None, cmap: ColormapType = "gray", clim: tuple[float | None, float | None] | None = None, - fill=0, backend: Backend | str | None = None, - ) -> Self: - src = parse(df) - xnunique = unique(src[x], axis=None) - ynunique = unique(src[y], axis=None) - dtype = src[value].dtype - if dtype.kind not in "fiub": - raise ValueError(f"Column {value!r} is not numeric.") - arr = np.full((ynunique.size, xnunique.size), fill, dtype=dtype) - xmap = {x: i for i, x in enumerate(xnunique)} - ymap = {y: i for i, y in enumerate(ynunique)} - for sl, sub in src.group_by((x, y)): - xval, yval = sl - vals = sub[value] - if vals.size == 1: - arr[ymap[yval], xmap[xval]] = sub[value][0] - else: - raise ValueError(f"More than one value found for {sl!r}.") - if clim is None: - # `fill` may be outside the range of the data, so calculate clim here. 
- clim = src[value].min(), src[value].max() - base = _l.Image(arr, name=name, cmap=cmap, clim=clim, backend=backend) - return cls( - base, - src, - xticks=[str(_x) for _x in xnunique], - yticks=[str(_y) for _y in ynunique], - ) - - def _generate_xticks(self): - if self._xticks is None: - return None - return np.arange(len(self._xticks)), self._xticks - - def _generate_yticks(self): - if self._yticks is None: - return None - return np.arange(len(self._yticks)), self._yticks + ) -> DFHeatmap[_DF]: + return cls(_l.Image(arr, name=name, cmap=cmap, clim=clim, backend=backend), src) class DFPointPlot2D(_shared.DataFrameLayerWrapper[_lg.LabeledPlot, _DF], Generic[_DF]): diff --git a/whitecanvas/layers/tabular/_df_compat.py b/whitecanvas/layers/tabular/_df_compat.py index d93c9bfd..3efc7831 100644 --- a/whitecanvas/layers/tabular/_df_compat.py +++ b/whitecanvas/layers/tabular/_df_compat.py @@ -86,7 +86,11 @@ def agg_by(self, by: tuple[str, ...], on: str, method: str) -> Self: @abstractmethod def value_count(self, by: tuple[str, ...]) -> Self: - ... 
+ """Return the count of each group.""" + + @abstractmethod + def value_first(self, by: tuple[str, ...], on: str) -> Self: + """Return the first value of a column for each group.""" @property def columns(self) -> list[str]: @@ -155,6 +159,14 @@ def value_count(self, by: tuple[str, ...]) -> Self: out["size"].append(len(sub[by[0]])) return DictWrapper({k: np.array(v) for k, v in out.items()}) + def value_first(self, by: tuple[str, ...], on: str) -> Self: + out = {k: [] for k in [*by, on]} + for sl, sub in self.group_by(by): + for b, s in zip(by, sl): + out[b].append(s) + out[on].append(sub[on][0]) + return DictWrapper({k: np.array(v) for k, v in out.items()}) + class PandasWrapper(DataFrameWrapper["pd.DataFrame"]): def __getitem__(self, item: str) -> np.ndarray: @@ -201,6 +213,9 @@ def value_count(self, by: tuple[str, ...]) -> Self: rows.append((*sl, len(sub))) return PandasWrapper(pd.DataFrame(rows, columns=[*by, "size"])) + def value_first(self, by: tuple[str, ...], on: str) -> Self: + return PandasWrapper(self._data.groupby(list(by)).first().reset_index()) + class PolarsWrapper(DataFrameWrapper["pl.DataFrame"]): def __getitem__(self, item: str) -> np.ndarray: @@ -253,6 +268,9 @@ def value_count(self, by: tuple[str, ...]) -> Self: .rename({"count": "size"}) ) + def value_first(self, by: tuple[str, ...], on: str) -> Self: + return PolarsWrapper(self._data.group_by(by, maintain_order=True).first()) + class PyArrowWrapper(DataFrameWrapper["pa.Table"]): def __getitem__(self, item: str) -> np.ndarray: @@ -302,6 +320,9 @@ def value_count(self, by: tuple[str, ...]) -> Self: .rename_columns([*by, "size"]) ) + def value_first(self, by: tuple[str, ...], on: str) -> Self: + return PyArrowWrapper(self._data.group_by(by, maintain_order=True).first()) + def parse(data: Any) -> DataFrameWrapper: """Parse a data object into a DataFrameWrapper.""" From 8699be09ea89ef479d3ee70489f263611b2f0568 Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Fri, 2 Feb 2024 21:45:36 +0900 Subject: 
[PATCH 04/11] reimplement histogram --- tests/test_categorical.py | 8 +- whitecanvas/canvas/_base.py | 90 +------ whitecanvas/canvas/dataframe/_feature_cat.py | 64 +---- whitecanvas/layers/_primitive/line.py | 29 --- whitecanvas/layers/group/__init__.py | 5 +- whitecanvas/layers/group/hist.py | 243 +++++++++++++++++++ whitecanvas/layers/tabular/__init__.py | 2 + whitecanvas/layers/tabular/_dataframe.py | 206 +++++++++------- whitecanvas/utils/hist.py | 97 ++++++++ 9 files changed, 491 insertions(+), 253 deletions(-) create mode 100644 whitecanvas/layers/group/hist.py create mode 100644 whitecanvas/utils/hist.py diff --git a/tests/test_categorical.py b/tests/test_categorical.py index 78ecf7dc..fce8f0a3 100644 --- a/tests/test_categorical.py +++ b/tests/test_categorical.py @@ -18,10 +18,10 @@ def test_cat(backend: str): canvas.cat(df, "x", "y").add_markers(color="label") canvas.cat(df, "x", "y").add_markers(hatch="label") canvas.cat(df, "x", "y").add_hist2d(bins=(5, 4)) - canvas.cat(df, "x", "y").along_x().add_hist_line(bins=5) - canvas.cat(df, "x", "y").along_x().add_hist_line(bins=5, color="label") - canvas.cat(df, "x", "y").along_y().add_hist_line(bins=6) - canvas.cat(df, "x", "y").along_y().add_hist_line(bins=6, color="label") + # canvas.cat(df, "x", "y").along_x().add_hist(bins=5) + # canvas.cat(df, "x", "y").along_x().add_hist(bins=5, color="label") + # canvas.cat(df, "x", "y").along_y().add_hist(bins=6) + # canvas.cat(df, "x", "y").along_y().add_hist(bins=6, color="label") @pytest.mark.parametrize("orient", ["v", "h"]) def test_cat_plots(backend: str, orient: str): diff --git a/whitecanvas/canvas/_base.py b/whitecanvas/canvas/_base.py index 089affc6..43e66225 100644 --- a/whitecanvas/canvas/_base.py +++ b/whitecanvas/canvas/_base.py @@ -635,14 +635,15 @@ def add_hist( data: ArrayLike1D, *, bins: int | ArrayLike1D = 10, - range: tuple[float, float] | None = None, - density: bool = False, + limits: tuple[float, float] | None = None, name: str | None = None, + 
shape: Literal["step", "polygon", "bars"] = "bars", + kind: Literal["count", "density", "frequency", "percent"] = "count", orient: str | Orientation = Orientation.VERTICAL, color: ColorType | None = None, - alpha: float = 1.0, - hatch: str | Hatch | None = None, - ) -> _l.Bars: + width: float | None = None, + style: LineStyle | str | None = None, + ) -> _lg.Histogram: """ Add data as a histogram. @@ -655,9 +656,9 @@ def add_hist( bins : int or 1D array-like, default 10 Bins of the histogram. This parameter will directly be passed to `np.histogram`. - range : (float, float), optional - Range in which histogram will be built. This parameter will - directly be passed to `np.histogram`. + limits : (float, float), optional + Limits in which histogram will be built. This parameter is equivalent to + the `range` parameter of `np.histogram`. density : bool, default False If True, heights of bars will be normalized so that the total area of the histogram will be 1. This parameter will directly @@ -668,10 +669,6 @@ def add_hist( Orientation of the bars. color : color-like, optional Color of the bars. - alpha : float, default 1.0 - Alpha channel of the bars. - hatch : str or FacePattern, optional - Pattern of the bar faces. Use the theme default if not specified.
Returns ------- @@ -680,74 +677,11 @@ def add_hist( """ name = self._coerce_name("histogram", name) color = self._generate_colors(color) - hatch = theme._default("bars.hatch", hatch) - layer = _l.Bars.from_histogram( - data, bins=bins, range=range, density=density, name=name, color=color, - orient=orient, alpha=alpha, hatch=hatch, backend=self._get_backend(), - ) # fmt: skip - return self.add_layer(layer) - - def add_hist_line( - self, - data: ArrayLike1D, - *, - bins: int | ArrayLike1D = 10, - range: tuple[float, float] | None = None, - density: bool = False, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - color: ColorType | None = None, - width: float | None = None, - style: LineStyle | str | None = None, - alpha: float = 1.0, - antialias: bool = True, - ) -> _l.Line: - """ - Add a line plot of the histogram. - - >>> canvas.add_hist_line(np.random.normal(size=100), bins=12) - - Parameters - ---------- - data : array-like - 1D Array of data. - bins : int or 1D array-like, default 10 - Bins of the histogram. This parameter will directly be passed - to `np.histogram`. - range : (float, float), optional - Range in which histogram will be built. This parameter will - directly be passed to `np.histogram`. - density : bool, default False - If True, heights of bars will be normalized so that the total - area of the histogram will be 1. This parameter will directly - be passed to `np.histogram`. - name : str, optional - Name of the layer. - orient : str or Orientation, default Orientation.VERTICAL - Orientation of the bars. - color : color-like, optional - Color of the bars. - width : float, optional - Line width. Use the theme default if not specified. - style : str or LineStyle, optional - Line style. Use the theme default if not specified. - alpha : float, default 1.0 - Alpha channel of the line. - antialias : bool, default True - Antialiasing of the line. - - Returns - ------- - Line - The line layer that represents the histogram. 
- """ - name = self._coerce_name("histogram", name) - color = self._generate_colors(color) width = theme._default("line.width", width) style = theme._default("line.style", style) - layer = _l.Line.build_hist( - data, bins=bins, density=density, range=range, orient=orient, name=name, - color=color, width=width, style=style, alpha=alpha, antialias=antialias, + layer = _lg.Histogram.from_array( + data, bins=bins, limits=limits, shape=shape, kind=kind, name=name, + color=color, width=width, style=style, orient=orient, backend=self._get_backend(), ) # fmt: skip return self.add_layer(layer) diff --git a/whitecanvas/canvas/dataframe/_feature_cat.py b/whitecanvas/canvas/dataframe/_feature_cat.py index 7ec50387..b002671a 100644 --- a/whitecanvas/canvas/dataframe/_feature_cat.py +++ b/whitecanvas/canvas/dataframe/_feature_cat.py @@ -6,6 +6,7 @@ TypeVar, ) +from whitecanvas import theme from whitecanvas.canvas.dataframe._base import BaseCatPlotter from whitecanvas.layers import tabular as _lt from whitecanvas.layers.tabular import _jitter @@ -108,6 +109,7 @@ def add_line( Line collection layer. 
""" canvas = self._canvas() + width = theme._default("line.width", width) layer = _lt.DFLines.from_table( self._df, self._get_x(), self._get_y(), name=name, color=color, width=width, style=style, backend=canvas._get_backend(), @@ -271,61 +273,19 @@ def add_hist( self, *, bins: int | ArrayLike1D = 10, - range: tuple[float, float] | None = None, - density: bool = False, - name: str | None = None, - color: NStr | None = None, - hatch: NStr | None = None, - ): - # TODO: implement this - raise NotImplementedError - - def add_hist_line( - self, - *, - bins: int | ArrayLike1D = 10, - range: tuple[float, float] | None = None, - density: bool = False, + limits: tuple[float, float] | None = None, + kind: str = "count", + shape: str = "bars", name: str | None = None, color: NStr | None = None, - width: str | None = None, + width: float | None = None, style: NStr | None = None, ): - """ - Add lines representing histograms. - - >>> ### Use "value" column as x-axis - >>> canvas.cat(df, x="value").add_line_hist(bins=8, density=True) - - >>> ### Multiple histograms colored by column "group" - >>> canvas.cat(df, x="value").add_line_hist(color="group") - - Parameters - ---------- - bins : int or array-like, default 10 - If an integer, the number of bins. If an array, the bin edges. - range : (float, float), default None - If provided, the lower and upper range of the bins. - density : bool, default False - If True, the total area of the histogram will be normalized to 1. - name : str, optional - Name of the layer. - color : str or sequence of str, optional - Column name(s) for coloring the lines. Must be categorical. - width : str, optional - Column name for line width. Must be numerical. - style : str or sequence of str, optional - Column name(s) for styling the lines. Must be categorical. - - Returns - ------- - WrappedLines - Line collection layer. 
- """ canvas = self._canvas() + width = theme._default("line.width", width) x0, orient = self._column_and_orient() - layer = _lt.DFLines.build_hist( - self._df, x0, bins=bins, range=range, density=density, name=name, + layer = _lt.DFHistograms.from_table( + self._df, x0, bins=bins, limits=limits, kind=kind, shape=shape, name=name, orient=orient, color=color, width=width, style=style, backend=canvas._get_backend(), ) # fmt: skip @@ -334,11 +294,11 @@ def add_hist_line( elif color is None: layer.with_color(canvas._color_palette.next()) if self._update_label: - ax_label = "density" if density else "count" if orient.is_vertical: - canvas.y.label.text = ax_label + canvas.y.label.text = kind else: - canvas.x.label.text = ax_label + canvas.x.label.text = kind + return canvas.add_layer(layer) def add_kde( diff --git a/whitecanvas/layers/_primitive/line.py b/whitecanvas/layers/_primitive/line.py index 49d9673b..051dca44 100644 --- a/whitecanvas/layers/_primitive/line.py +++ b/whitecanvas/layers/_primitive/line.py @@ -426,35 +426,6 @@ def with_text( name=self.name, ) - @classmethod - def build_hist( - cls, - data: ArrayLike1D, - *, - bins: int | ArrayLike1D = 10, - density: bool = False, - range: tuple[float, float] | None = None, - orient: str | Orientation = Orientation.VERTICAL, - name: str | None = None, - color: ColorType = "blue", - alpha: float = 1.0, - width: float = 1.0, - style: LineStyle | str = LineStyle.SOLID, - antialias: bool = True, - backend: Backend | str | None = None, - ): - """Construct a line from a histogram.""" - data = as_array_1d(data) - counts, edges = np.histogram(data, bins, density=density, range=range) - xdata = np.concatenate(list(zip(edges[:-1], edges[1:]))) - ydata = np.concatenate(list(zip(counts, counts))) - if not Orientation.parse(orient).is_vertical: - xdata, ydata = ydata, xdata - return Line( - xdata, ydata, name=name, color=color, alpha=alpha, width=width, - style=style, antialias=antialias, backend=backend, - ) # fmt: skip - 
@classmethod def build_cdf( cls, diff --git a/whitecanvas/layers/group/__init__.py b/whitecanvas/layers/group/__init__.py index 7d02ce91..59cfd964 100644 --- a/whitecanvas/layers/group/__init__.py +++ b/whitecanvas/layers/group/__init__.py @@ -1,7 +1,8 @@ -from whitecanvas.layers.group._collections import LayerTuple +from whitecanvas.layers.group._collections import LayerCollectionBase, LayerTuple from whitecanvas.layers.group.band_collection import BandCollection, ViolinPlot from whitecanvas.layers.group.boxplot import BoxPlot from whitecanvas.layers.group.graph import Graph +from whitecanvas.layers.group.hist import Histogram from whitecanvas.layers.group.labeled import ( LabeledBars, LabeledLine, @@ -32,4 +33,6 @@ "Graph", "StemPlot", "LayerTuple", + "Histogram", + "LayerCollectionBase", ] diff --git a/whitecanvas/layers/group/hist.py b/whitecanvas/layers/group/hist.py new file mode 100644 index 00000000..c6767cbe --- /dev/null +++ b/whitecanvas/layers/group/hist.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +from enum import Enum +from typing import overload + +import numpy as np +from numpy.typing import NDArray + +from whitecanvas.backend import Backend +from whitecanvas.layers._primitive import Band, Line +from whitecanvas.layers.group._collections import LayerContainer +from whitecanvas.types import ArrayLike1D, ColorType, LineStyle, Orientation +from whitecanvas.utils.hist import get_hist_edges, histograms +from whitecanvas.utils.normalize import as_array_1d + + +class HistogramShape(Enum): + step = "step" + polygon = "polygon" + bars = "bars" + + +class HistogramKind(Enum): + count = "count" + density = "density" + frequency = "frequency" + percent = "percent" + + +class Histogram(LayerContainer): + def __init__( + self, + data: NDArray[np.number], + edges: NDArray[np.number], + limits: tuple[float, float] | None, + line: Line, + fill: Band, + shape: HistogramShape = HistogramShape.bars, + kind: HistogramKind = HistogramKind.count, + name: 
str | None = None, + ): + if name is None: + name = "histogram" + super().__init__([line, fill], name=name) + self._data = data + self._shape = shape + self._kind = kind + self._edges = edges + self._limits = limits + + @property + def data(self) -> NDArray[np.number]: + """The data used to plot the histogram.""" + return self._data + + @data.setter + def data(self, data: NDArray[np.number]): + data = as_array_1d(data) + xdata, ydata = _calculate_xy( + data, self._edges, self._shape, self._kind, self._limits, clip=True + ) # fmt: skip + self._update_internal(xdata, ydata) + self._data = data + + def _update_internal(self, xdata: NDArray[np.number], ydata: NDArray[np.number]): + if self.orient.is_vertical: + self.line.data = xdata, ydata + else: + self.line.data = ydata, xdata + self.fill.data = xdata, np.zeros_like(ydata), ydata + + @property + def line(self) -> Line: + """The line layer.""" + return self._children[0] + + @property + def fill(self) -> Band: + """The fill layer.""" + return self._children[1] + + @property + def orient(self) -> Orientation: + return self.fill.orient + + @property + def shape(self) -> HistogramShape: + """The shape of the histogram.""" + return self._shape + + @shape.setter + def shape(self, shape: str | HistogramShape): + shape = HistogramShape(shape) + xdata, ydata, _ = _calculate_xy( + self._data, self._edges, shape, self._kind, self._limits + ) # fmt: skip + self._update_internal(xdata, ydata) + self._shape = shape + + @property + def kind(self) -> HistogramKind: + """The kind of the histogram.""" + return self._kind + + @kind.setter + def kind(self, kind: str | HistogramKind): + kind = HistogramKind(kind) + xdata, ydata, _ = _calculate_xy( + self._data, self._edges, self._shape, kind, self._limits + ) # fmt: skip + self._update_internal(xdata, ydata) + self._kind = kind + + @property + def limits(self) -> tuple[float, float] | None: + """The limits of the histogram.""" + return self._limits + + @limits.setter + def limits(self, 
limits: tuple[float, float] | None): + xdata, ydata, _ = _calculate_xy( + self._data, self._edges, self._shape, self._kind, limits + ) + self._update_internal(xdata, ydata) + self._limits = limits + + @property + def edges(self) -> NDArray[np.number]: + """The edges of the histogram.""" + return self._edges + + @edges.setter + def edges(self, edges: NDArray[np.number]): + edges = as_array_1d(edges) + xdata, ydata, _ = _calculate_xy( + self._data, edges, self._shape, self._kind, self._limits + ) + self._update_internal(xdata, ydata) + self._edges = edges + + @property + def color(self) -> NDArray[np.float32]: + return self.line.color + + @color.setter + def color(self, color: ColorType): + self.line.color = color + self.fill.face.update(color=color, alpha=0.2) + + @overload + def update_edges(self, bins: int, limits: tuple[float, float] | None = None): + ... + + @overload + def update_edges(self, edges: NDArray[np.number]): + ... + + def update_edges(self, bins, limits=None): + """ + Update the edges of the histogram. 
+ + >>> hist.update_edges(20, limits=(0, 10)) # uniform bins + >>> hist.update_edges([0, 2, 3, 5]) # non-uniform bins + """ + if limits is not None and not isinstance(bins, (int, np.number)): + raise TypeError("bins must be an integer when limits are specified.") + edges = get_hist_edges([self._data], bins, limits) + self.edges = edges + + @classmethod + def from_array( + cls, + data: NDArray[np.number], + shape: HistogramShape = HistogramShape.bars, + kind: HistogramKind = HistogramKind.count, + name: str | None = None, + bins: int = 10, + limits: tuple[float, float] | None = None, + color: ColorType = "black", + style: str | LineStyle = LineStyle.SOLID, + width: float = 1.0, + orient: str | Orientation = "vertical", + backend: str | Backend | None = None, + ) -> Histogram: + """Create a histogram from an array.""" + shape = HistogramShape(shape) + kind = HistogramKind(kind) + ori = Orientation.parse(orient) + xdata, ydata, edges = _calculate_xy(data, bins, shape, kind, limits) + if ori.is_vertical: + line = Line( + xdata, ydata, color=color, style=style, width=width, backend=backend + ) # fmt: skip + else: + line = Line( + ydata, xdata, color=color, style=style, width=width, backend=backend + ) + fill = Band( + xdata, np.zeros_like(ydata), ydata, color=color, alpha=0.2, orient=ori, + backend=backend, + ) # fmt: skip + return cls(data, edges, limits, line, fill, shape, kind, name=name) + + +def _calculate_xy( + data, + bins: int | ArrayLike1D, + shape: HistogramShape, + kind: HistogramKind, + limits: tuple[float, float] | None = None, + clip: bool = True, +) -> tuple[NDArray[np.number], NDArray[np.number], NDArray[np.number]]: + if clip and limits is not None: + data = np.clip(data, *limits) + hist = histograms([data], bins, limits) + shape = HistogramShape(shape) + kind = HistogramKind(kind) + if kind is HistogramKind.count: + heights = hist.counts[0] + elif kind is HistogramKind.density: + heights = hist.density()[0] + elif kind is HistogramKind.frequency: + 
heights = hist.frequency()[0] + elif kind is HistogramKind.percent: + heights = hist.percent()[0] + else: + raise ValueError(f"Unknown kind {kind!r}.") + + if shape is HistogramShape.step: + xdata = np.repeat(hist.edges, 2) + ydata = np.concatenate([[0], np.repeat(heights, 2), [0]]) + elif shape is HistogramShape.polygon: + centers = hist.centers() + xdata = np.concatenate([[centers[0]], centers, [centers[-1]]]) + ydata = np.concatenate([[0], heights, [0]]) + elif shape is HistogramShape.bars: + edges = hist.edges + xdata = np.repeat(edges, 3)[1:-1] + ydata = np.zeros_like(xdata) + ydata[1::3] = ydata[2::3] = heights + else: + raise ValueError(f"Unknown shape {shape!r}.") + return xdata, ydata, hist.edges diff --git a/whitecanvas/layers/tabular/__init__.py b/whitecanvas/layers/tabular/__init__.py index 3265dd7d..325ad2e0 100644 --- a/whitecanvas/layers/tabular/__init__.py +++ b/whitecanvas/layers/tabular/__init__.py @@ -7,6 +7,7 @@ from whitecanvas.layers.tabular._dataframe import ( DFBars, DFHeatmap, + DFHistograms, DFLines, DFMarkerGroups, DFMarkers, @@ -24,6 +25,7 @@ "DFBars", "DFBoxPlot", "DFHeatmap", + "DFHistograms", "DFPointPlot2D", "parse", ] diff --git a/whitecanvas/layers/tabular/_dataframe.py b/whitecanvas/layers/tabular/_dataframe.py index 1f17de45..47764626 100644 --- a/whitecanvas/layers/tabular/_dataframe.py +++ b/whitecanvas/layers/tabular/_dataframe.py @@ -25,6 +25,7 @@ Symbol, _Void, ) +from whitecanvas.utils.hist import histograms if TYPE_CHECKING: from typing_extensions import Self @@ -41,14 +42,13 @@ def __init__( segs: list[np.ndarray], labels: list[tuple[Any, ...]], color: _Cols | None = None, - width: str | None = None, + width: float = 1.0, style: _Cols | None = None, name: str | None = None, backend: str | Backend | None = None, ): splitby = _shared.join_columns(color, style, source=source) self._color_by = _p.ColorPlan.default() - self._width_by = _p.WidthPlan.default() self._style_by = _p.StylePlan.default() self._labels = labels 
self._splitby = splitby @@ -56,8 +56,7 @@ def __init__( super().__init__(base, source) if color is not None: self.with_color(color) - if isinstance(width, str): - self.with_width(width) + self.with_width(width) if style is not None: self.with_style(style) @@ -68,7 +67,7 @@ def from_table( x: str | _jitter.JitterBase, y: str | _jitter.JitterBase, color: str | None = None, - width: str | None = None, + width: float | None = None, style: str | None = None, name: str | None = None, backend: str | Backend | None = None, @@ -129,60 +128,6 @@ def build_kde( backend=backend, ) # fmt: skip - @classmethod - def build_hist( - cls, - df: _DF, - value: str, - bins: int | ArrayLike1D = 10, - density: bool = False, - range: tuple[float, float] | None = None, - color: str | None = None, - width: str | None = None, - style: str | None = None, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - backend: str | Backend | None = None, - ) -> DFLines[_DF]: - src = parse(df) - splitby = _shared.join_columns(color, style, source=src) - ori = Orientation.parse(orient) - segs = [] - labels: list[tuple[Any, ...]] = [] - for sl, df in src.group_by(splitby): - labels.append(sl) - each = df[value] - counts, edges = np.histogram(each, bins=bins, density=density, range=range) - x = np.empty(2 * counts.size + 2, dtype=np.float32) - y = np.empty(2 * counts.size + 2, dtype=np.float32) - x[0] = edges[0] - x[-1] = edges[-1] - y[0] = y[-1] = 0 - x[1:-1:2] = edges[:-1] - x[2:-1:2] = edges[1:] - y[1:-1:2] = counts - y[2:-1:2] = counts - if ori.is_vertical: - segs.append(np.column_stack([x, y])) - else: - segs.append(np.column_stack([y, x])) - return DFLines( - src, segs, labels, name=name, color=color, width=width, style=style, - backend=backend, - ) # fmt: skip - - @property - def color(self) -> _p.ColorPlan: - return self._color_by - - @property - def width(self) -> _p.WidthPlan: - return self._width_by - - @property - def style(self) -> _p.StylePlan: - return 
self._style_by - @overload def with_color(self, value: ColorType) -> Self: ... @@ -207,21 +152,8 @@ def with_color(self, by, /, palette=None): self._color_by = color_by return self - @overload def with_width(self, value: float) -> Self: - ... - - @overload - def with_width(self, by: str, limits=None) -> Self: - ... - - def with_width(self, by, /, limits=None) -> Self: - if isinstance(by, str): - width_by = _p.WidthPlan.from_range(by, limits=limits) - else: - width_by = _p.WidthPlan.from_const(float(by)) - self._base_layer.width = width_by.map(self._source) - self._width_by = width_by + self._base_layer.width = value return self def with_style(self, by: str | Iterable[str], styles=None) -> Self: @@ -534,22 +466,6 @@ def from_table( orient=orient, backend=backend, ) # fmt: skip - @classmethod - def build_hist( - cls, - df: DataFrameWrapper[_DF], - bins: int | ArrayLike1D, - density: bool = False, - range: tuple[float, float] | None = None, - color: str | tuple[str, ...] | None = None, - hatch: str | tuple[str, ...] | None = None, - name: str | None = None, - extent: float = 0.8, - orient: Orientation = Orientation.VERTICAL, - backend: str | Backend | None = None, - ) -> DFBars[_DF]: - ... 
- def with_color(self, by: str | Iterable[str] | ColorType, palette=None) -> Self: cov = _shared.ColumnOrValue(by, self._source) if cov.is_column: @@ -657,3 +573,115 @@ def __init__( if size is not None: base.markers.size = size super().__init__(base, source) + + +class DFHistograms( + _shared.DataFrameLayerWrapper[_lg.LayerCollectionBase[_lg.Histogram], _DF], + Generic[_DF], +): + def __init__( + self, + source: DataFrameWrapper[_DF], + base: _lg.LayerCollectionBase[_lg.Histogram], + labels: list[tuple[Any, ...]], + color: _Cols | None = None, + width: str | None = None, + style: _Cols | None = None, + ): + splitby = _shared.join_columns(color, style, source=source) + self._color_by = _p.ColorPlan.default() + self._width_by = _p.WidthPlan.default() + self._style_by = _p.StylePlan.default() + self._labels = labels + self._splitby = splitby + super().__init__(base, source) + if color is not None: + self.with_color(color) + if isinstance(width, str): + self.with_width(width) + if style is not None: + self.with_style(style) + + @classmethod + def from_table( + cls, + df: DataFrameWrapper[_DF], + value: str, + bins: int | ArrayLike1D, + limits: tuple[float, float] | None = None, + kind="count", + shape="bars", + color: str | None = None, + width: float = 1.0, + style: str | None = None, + name: str | None = None, + orient: str | Orientation = Orientation.VERTICAL, + backend: str | Backend | None = None, + ) -> DFHistograms[_DF]: + splitby = _shared.join_columns(color, style, source=df) + ori = Orientation.parse(orient) + arrays: list[np.ndarray] = [] + labels: list[tuple] = [] + for sl, sub in df.group_by(splitby): + labels.append(sl) + arrays.append(sub[value]) + hist = histograms(arrays, bins, limits) + + layers = [] + for arr in arrays: + each_layer = _lg.Histogram.from_array( + arr, + kind=kind, + bins=hist.edges, + limits=limits, + width=width, + orient=ori, + shape=shape, + backend=backend, + ) + layers.append(each_layer) + base = _lg.LayerCollectionBase(layers, 
name=name) + return cls(df, base, labels, color=color, width=width, style=style) + + @overload + def with_color(self, value: ColorType) -> Self: + ... + + @overload + def with_color( + self, + by: str | Iterable[str], + palette: ColormapType | None = None, + ) -> Self: + ... + + def with_color(self, by, /, palette=None): + cov = _shared.ColumnOrValue(by, self._source) + if cov.is_column: + if set(cov.columns) > set(self._splitby): + raise ValueError(f"Cannot color by a column other than {self._splitby}") + color_by = _p.ColorPlan.from_palette(cov.columns, palette=palette) + else: + color_by = _p.ColorPlan.from_const(Color(cov.value)) + for i, col in enumerate(color_by.generate(self._labels, self._splitby)): + self._base_layer[i].color = col + self._color_by = color_by + return self + + def with_width(self, value: float) -> Self: + for hist in self._base_layer: + hist.line.width = value + return self + + def with_style(self, by: str | Iterable[str], styles=None) -> Self: + cov = _shared.ColumnOrValue(by, self._source) + if cov.is_column: + if set(cov.columns) > set(self._splitby): + raise ValueError(f"Cannot style by a column other than {self._splitby}") + style_by = _p.StylePlan.new(cov.columns, values=styles) + else: + style_by = _p.StylePlan.from_const(LineStyle(cov.value)) + for i, st in enumerate(style_by.generate(self._labels, self._splitby)): + self._base_layer[i].style = st + self._style_by = style_by + return self diff --git a/whitecanvas/utils/hist.py b/whitecanvas/utils/hist.py new file mode 100644 index 00000000..15d30cbb --- /dev/null +++ b/whitecanvas/utils/hist.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from typing import NamedTuple + +import numpy as np +from numpy.typing import NDArray + +from whitecanvas.utils.normalize import as_array_1d + + +class Histogram(NamedTuple): + edges: NDArray[np.number] + width: float + counts: NDArray[np.integer] + + def density(self) -> NDArray[np.number]: + return self.frequency_scaled(self.width) + 
+ def frequency(self) -> NDArray[np.number]: + return self.frequency_scaled(1) + + def percent(self) -> NDArray[np.number]: + return self.frequency_scaled(100) + + def scaled(self, scale: float) -> NDArray[np.number]: + return self.counts / scale + + def frequency_scaled(self, scale: float) -> NDArray[np.number]: + return self.counts / self.counts.sum() / scale + + +class HistogramTuple(NamedTuple): + edges: NDArray[np.number] + width: float + counts: list[NDArray[np.integer]] + + def density(self) -> list[NDArray[np.number]]: + return self.frequency_scaled(self.width) + + def frequency(self) -> list[NDArray[np.number]]: + return self.frequency_scaled(1) + + def percent(self) -> list[NDArray[np.number]]: + return self.frequency_scaled(100) + + def scaled(self, scale: float) -> list[NDArray[np.number]]: + out: list[NDArray[np.number]] = [] + for arr in self.counts: + scaled = arr / scale + out.append(scaled) + return out + + def frequency_scaled(self, scale: float) -> list[NDArray[np.number]]: + out: list[NDArray[np.number]] = [] + for arr in self.counts: + density_scaled = arr / arr.sum() / scale + out.append(density_scaled) + return out + + def centers(self) -> NDArray[np.number]: + return (self.edges[:-1] + self.edges[1:]) / 2 + + +def get_hist_edges( + arrays: list[NDArray[np.number]], + bins: int | NDArray[np.number], + range: tuple[float, float] | None = None, +) -> NDArray[np.number]: + if range is None: + total = np.concatenate(arrays) + value_min = total.min() + value_max = total.max() + else: + value_min, value_max = range + if value_min >= value_max: + raise ValueError("max must be larger than min in range parameter") + if isinstance(bins, (int, np.integer)): + nbins = bins.__index__() + if nbins < 1: + raise ValueError("bins should be a positive integer") + edges = np.linspace(value_min, value_max, nbins + 1) + else: + edges = as_array_1d(bins) + if np.diff(edges).min() <= 0: + raise ValueError("bin edges must increase monotonically") + return edges + + 
+def histograms( + arrays: list[NDArray[np.number]], + bins: int, + range: tuple[float, float] | None = None, +) -> HistogramTuple: + edges = get_hist_edges(arrays, bins, range) + width = edges[1] - edges[0] + counts = [np.histogram(arr, edges)[0] for arr in arrays] + return HistogramTuple(edges, width, counts) From b0aaad21b216a488c82053d0fc245a3b0a2eb4d7 Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Sun, 4 Feb 2024 21:01:36 +0900 Subject: [PATCH 05/11] update hist, kde --- tests/test_categorical.py | 8 +- whitecanvas/backend/plotly/canvas.py | 22 +- whitecanvas/backend/vispy/band.py | 5 +- whitecanvas/canvas/_base.py | 24 +- whitecanvas/canvas/dataframe/_feature_cat.py | 3 +- whitecanvas/layers/_primitive/band.py | 33 -- whitecanvas/layers/group/__init__.py | 3 +- whitecanvas/layers/group/hist.py | 243 ------------ whitecanvas/layers/group/line_collection.py | 2 +- whitecanvas/layers/group/line_fill.py | 382 +++++++++++++++++++ whitecanvas/layers/tabular/__init__.py | 2 + whitecanvas/layers/tabular/_dataframe.py | 102 ++++- 12 files changed, 529 insertions(+), 300 deletions(-) delete mode 100644 whitecanvas/layers/group/hist.py create mode 100644 whitecanvas/layers/group/line_fill.py diff --git a/tests/test_categorical.py b/tests/test_categorical.py index fce8f0a3..1ebe99e6 100644 --- a/tests/test_categorical.py +++ b/tests/test_categorical.py @@ -18,10 +18,10 @@ def test_cat(backend: str): canvas.cat(df, "x", "y").add_markers(color="label") canvas.cat(df, "x", "y").add_markers(hatch="label") canvas.cat(df, "x", "y").add_hist2d(bins=(5, 4)) - # canvas.cat(df, "x", "y").along_x().add_hist(bins=5) - # canvas.cat(df, "x", "y").along_x().add_hist(bins=5, color="label") - # canvas.cat(df, "x", "y").along_y().add_hist(bins=6) - # canvas.cat(df, "x", "y").along_y().add_hist(bins=6, color="label") + canvas.cat(df, "x", "y").along_x().add_hist(bins=5) + canvas.cat(df, "x", "y").along_x().add_hist(bins=5, color="label") + canvas.cat(df, "x", 
"y").along_y().add_hist(bins=6) + canvas.cat(df, "x", "y").along_y().add_hist(bins=6, color="label") @pytest.mark.parametrize("orient", ["v", "h"]) def test_cat_plots(backend: str, orient: str): diff --git a/whitecanvas/backend/plotly/canvas.py b/whitecanvas/backend/plotly/canvas.py index 924e65f5..dcc5446b 100644 --- a/whitecanvas/backend/plotly/canvas.py +++ b/whitecanvas/backend/plotly/canvas.py @@ -1,6 +1,7 @@ from __future__ import annotations import sys +import warnings import weakref from typing import Callable @@ -76,7 +77,26 @@ def _plt_get_ylabel(self): def _plt_reorder_layers(self, layers: list[PlotlyLayer]): model_to_idx_map = {id(layer._props): i for i, layer in enumerate(layers)} first, *data = self._fig._data - self._fig._data = [first] + [data[model_to_idx_map[id(r)]] for r in data] + try: + self._fig._data = [first] + [data[model_to_idx_map[id(r)]] for r in data] + except KeyError: + # sometimes fails, so just warn + not_found = [] + for r in data: + if id(r) not in model_to_idx_map: + not_found.append(r) + keys = list(model_to_idx_map.keys()) + warnings.warn( + f"Layers {not_found!r} not found in the ID keys {keys!r}.", + UserWarning, + stacklevel=2, + ) + if len(self._fig._data) != len(data) + 1: + warnings.warn( + "Number of layers changed", + UserWarning, + stacklevel=2, + ) def _plt_get_aspect_ratio(self) -> float | None: """Get aspect ratio of canvas""" diff --git a/whitecanvas/backend/vispy/band.py b/whitecanvas/backend/vispy/band.py index c40d7996..90f7b3b3 100644 --- a/whitecanvas/backend/vispy/band.py +++ b/whitecanvas/backend/vispy/band.py @@ -20,7 +20,10 @@ def __init__(self, t, ydata0, ydata1, orient: Orientation): bw = np.stack([ydata1[::-1], t[::-1]], axis=1) verts = np.concatenate([fw, bw], axis=0) self._edge_style = LineStyle.SOLID - super().__init__(verts, border_width=0) + try: + super().__init__(verts, border_width=0) + except Exception: + super().__init__(verts, border_width=0, triangulate=False) self.unfreeze() self._t = t 
self._y0 = ydata0 diff --git a/whitecanvas/canvas/_base.py b/whitecanvas/canvas/_base.py index 43e66225..2f7ba05e 100644 --- a/whitecanvas/canvas/_base.py +++ b/whitecanvas/canvas/_base.py @@ -1156,8 +1156,8 @@ def add_kde( orient: str | Orientation = Orientation.VERTICAL, band_width: float | Literal["scott", "silverman"] = "scott", color: ColorType | None = None, - alpha: float = 1.0, - hatch: str | Hatch = Hatch.SOLID, + width: float | None = None, + style: LineStyle | str | None = None, ) -> _l.Band: """ Add data as a band layer representing kernel density estimation (KDE). @@ -1184,23 +1184,19 @@ def add_kde( Returns ------- - Band - The band layer representing KDE. + Kde + The KDE layer. """ name = self._coerce_name(_l.Band, name) color = self._generate_colors(color) + width = theme._default("line.width", width) + style = theme._default("line.style", style) - layer = _l.Band.from_kde( - data, - bottom, - name=name, - band_width=band_width, - orient=orient, - color=color, - alpha=alpha, - hatch=hatch, + layer = _lg.Kde.from_array( + data, bottom=bottom, scale=1, band_width=band_width, name=name, + orient=orient, color=color, width=width, style=style, backend=self._get_backend(), - ) + ) # fmt: skip return self.add_layer(layer) @overload diff --git a/whitecanvas/canvas/dataframe/_feature_cat.py b/whitecanvas/canvas/dataframe/_feature_cat.py index b002671a..3abe26ee 100644 --- a/whitecanvas/canvas/dataframe/_feature_cat.py +++ b/whitecanvas/canvas/dataframe/_feature_cat.py @@ -338,8 +338,9 @@ def add_kde( Line collection layer. 
""" canvas = self._canvas() + width = theme._default("line.width", width) x0, orient = self._column_and_orient() - layer = _lt.DFLines.build_kde( + layer = _lt.DFKde.from_table( self._df, x0, band_width=band_width, name=name, orient=orient, color=color, width=width, style=style, backend=canvas._get_backend(), diff --git a/whitecanvas/layers/_primitive/band.py b/whitecanvas/layers/_primitive/band.py index 67cc9103..4f1de83d 100644 --- a/whitecanvas/layers/_primitive/band.py +++ b/whitecanvas/layers/_primitive/band.py @@ -2,8 +2,6 @@ from typing import Any -import numpy as np - from whitecanvas.backend import Backend from whitecanvas.layers._base import DataBoundLayer from whitecanvas.layers._mixin import FaceEdgeMixin @@ -90,34 +88,3 @@ def set_data( edge_high: ArrayLike1D | None = None, ): self.data = t, edge_low, edge_high - - @classmethod - def from_kde( - cls, - data: ArrayLike1D, - bottom: float = 0.0, - *, - name: str | None = None, - band_width: float | None = None, - color: ColorType = "blue", - alpha: float = 1.0, - hatch: str | Hatch = Hatch.SOLID, - orient: str | Orientation = Orientation.VERTICAL, - backend: Backend | str | None = None, - ): - from whitecanvas.utils.kde import gaussian_kde - - data = as_array_1d(data) - kde = gaussian_kde(data, bw_method=band_width) - - sigma = np.sqrt(kde.covariance[0, 0]) - pad = sigma * 2.5 - x = np.linspace(data.min() - pad, data.max() + pad, 100) - y1 = kde(x) - y0 = np.full_like(y1, bottom) - self = cls( - x, y0, y1, name=name, orient=orient, color=color, alpha=alpha, - hatch=hatch, backend=backend, - ) # fmt: skip - self._band_type = "kde" - return self diff --git a/whitecanvas/layers/group/__init__.py b/whitecanvas/layers/group/__init__.py index 59cfd964..45c027bf 100644 --- a/whitecanvas/layers/group/__init__.py +++ b/whitecanvas/layers/group/__init__.py @@ -2,7 +2,6 @@ from whitecanvas.layers.group.band_collection import BandCollection, ViolinPlot from whitecanvas.layers.group.boxplot import BoxPlot from 
whitecanvas.layers.group.graph import Graph -from whitecanvas.layers.group.hist import Histogram from whitecanvas.layers.group.labeled import ( LabeledBars, LabeledLine, @@ -11,6 +10,7 @@ ) from whitecanvas.layers.group.line_band import LineBand from whitecanvas.layers.group.line_collection import LineCollection +from whitecanvas.layers.group.line_fill import Histogram, Kde from whitecanvas.layers.group.line_markers import Plot from whitecanvas.layers.group.marker_collection import MarkerCollection from whitecanvas.layers.group.stemplot import StemPlot @@ -34,5 +34,6 @@ "StemPlot", "LayerTuple", "Histogram", + "Kde", "LayerCollectionBase", ] diff --git a/whitecanvas/layers/group/hist.py b/whitecanvas/layers/group/hist.py deleted file mode 100644 index c6767cbe..00000000 --- a/whitecanvas/layers/group/hist.py +++ /dev/null @@ -1,243 +0,0 @@ -from __future__ import annotations - -from enum import Enum -from typing import overload - -import numpy as np -from numpy.typing import NDArray - -from whitecanvas.backend import Backend -from whitecanvas.layers._primitive import Band, Line -from whitecanvas.layers.group._collections import LayerContainer -from whitecanvas.types import ArrayLike1D, ColorType, LineStyle, Orientation -from whitecanvas.utils.hist import get_hist_edges, histograms -from whitecanvas.utils.normalize import as_array_1d - - -class HistogramShape(Enum): - step = "step" - polygon = "polygon" - bars = "bars" - - -class HistogramKind(Enum): - count = "count" - density = "density" - frequency = "frequency" - percent = "percent" - - -class Histogram(LayerContainer): - def __init__( - self, - data: NDArray[np.number], - edges: NDArray[np.number], - limits: tuple[float, float] | None, - line: Line, - fill: Band, - shape: HistogramShape = HistogramShape.bars, - kind: HistogramKind = HistogramKind.count, - name: str | None = None, - ): - if name is None: - name = "histogram" - super().__init__([line, fill], name=name) - self._data = data - self._shape = shape - 
self._kind = kind - self._edges = edges - self._limits = limits - - @property - def data(self) -> NDArray[np.number]: - """The data used to plot the histogram.""" - return self._data - - @data.setter - def data(self, data: NDArray[np.number]): - data = as_array_1d(data) - xdata, ydata = _calculate_xy( - data, self._edges, self._shape, self._kind, self._limits, clip=True - ) # fmt: skip - self._update_internal(xdata, ydata) - self._data = data - - def _update_internal(self, xdata: NDArray[np.number], ydata: NDArray[np.number]): - if self.orient.is_vertical: - self.line.data = xdata, ydata - else: - self.line.data = ydata, xdata - self.fill.data = xdata, np.zeros_like(ydata), ydata - - @property - def line(self) -> Line: - """The line layer.""" - return self._children[0] - - @property - def fill(self) -> Band: - """The fill layer.""" - return self._children[1] - - @property - def orient(self) -> Orientation: - return self.fill.orient - - @property - def shape(self) -> HistogramShape: - """The shape of the histogram.""" - return self._shape - - @shape.setter - def shape(self, shape: str | HistogramShape): - shape = HistogramShape(shape) - xdata, ydata, _ = _calculate_xy( - self._data, self._edges, shape, self._kind, self._limits - ) # fmt: skip - self._update_internal(xdata, ydata) - self._shape = shape - - @property - def kind(self) -> HistogramKind: - """The kind of the histogram.""" - return self._kind - - @kind.setter - def kind(self, kind: str | HistogramKind): - kind = HistogramKind(kind) - xdata, ydata, _ = _calculate_xy( - self._data, self._edges, self._shape, kind, self._limits - ) # fmt: skip - self._update_internal(xdata, ydata) - self._kind = kind - - @property - def limits(self) -> tuple[float, float] | None: - """The limits of the histogram.""" - return self._limits - - @limits.setter - def limits(self, limits: tuple[float, float] | None): - xdata, ydata, _ = _calculate_xy( - self._data, self._edges, self._shape, self._kind, limits - ) - 
self._update_internal(xdata, ydata) - self._limits = limits - - @property - def edges(self) -> NDArray[np.number]: - """The edges of the histogram.""" - return self._edges - - @edges.setter - def edges(self, edges: NDArray[np.number]): - edges = as_array_1d(edges) - xdata, ydata, _ = _calculate_xy( - self._data, edges, self._shape, self._kind, self._limits - ) - self._update_internal(xdata, ydata) - self._edges = edges - - @property - def color(self) -> NDArray[np.float32]: - return self.line.color - - @color.setter - def color(self, color: ColorType): - self.line.color = color - self.fill.face.update(color=color, alpha=0.2) - - @overload - def update_edges(self, bins: int, limits: tuple[float, float] | None = None): - ... - - @overload - def update_edges(self, edges: NDArray[np.number]): - ... - - def update_edges(self, bins, limits=None): - """ - Update the edges of the histogram. - - >>> hist.update_edges(20, limits=(0, 10)) # uniform bins - >>> hist.update_edges([0, 2, 3, 5]) # non-uniform bins - """ - if limits is not None and not isinstance(bins, (int, np.number)): - raise TypeError("bins must be an integer when limits are specified.") - edges = get_hist_edges([self._data], bins, limits) - self.edges = edges - - @classmethod - def from_array( - cls, - data: NDArray[np.number], - shape: HistogramShape = HistogramShape.bars, - kind: HistogramKind = HistogramKind.count, - name: str | None = None, - bins: int = 10, - limits: tuple[float, float] | None = None, - color: ColorType = "black", - style: str | LineStyle = LineStyle.SOLID, - width: float = 1.0, - orient: str | Orientation = "vertical", - backend: str | Backend | None = None, - ) -> Histogram: - """Create a histogram from an array.""" - shape = HistogramShape(shape) - kind = HistogramKind(kind) - ori = Orientation.parse(orient) - xdata, ydata, edges = _calculate_xy(data, bins, shape, kind, limits) - if ori.is_vertical: - line = Line( - xdata, ydata, color=color, style=style, width=width, backend=backend - 
) # fmt: skip - else: - line = Line( - ydata, xdata, color=color, style=style, width=width, backend=backend - ) - fill = Band( - xdata, np.zeros_like(ydata), ydata, color=color, alpha=0.2, orient=ori, - backend=backend, - ) # fmt: skip - return cls(data, edges, limits, line, fill, shape, kind, name=name) - - -def _calculate_xy( - data, - bins: int | ArrayLike1D, - shape: HistogramShape, - kind: HistogramKind, - limits: tuple[float, float] | None = None, - clip: bool = True, -) -> tuple[NDArray[np.number], NDArray[np.number], NDArray[np.number]]: - if clip and limits is not None: - data = np.clip(data, *limits) - hist = histograms([data], bins, limits) - shape = HistogramShape(shape) - kind = HistogramKind(kind) - if kind is HistogramKind.count: - heights = hist.counts[0] - elif kind is HistogramKind.density: - heights = hist.density()[0] - elif kind is HistogramKind.frequency: - heights = hist.frequency()[0] - elif kind is HistogramKind.percent: - heights = hist.percent()[0] - else: - raise ValueError(f"Unknown kind {kind!r}.") - - if shape is HistogramShape.step: - xdata = np.repeat(hist.edges, 2) - ydata = np.concatenate([[0], np.repeat(heights, 2), [0]]) - elif shape is HistogramShape.polygon: - centers = hist.centers() - xdata = np.concatenate([[centers[0]], centers, [centers[-1]]]) - ydata = np.concatenate([[0], heights, [0]]) - elif shape is HistogramShape.bars: - edges = hist.edges - xdata = np.repeat(edges, 3)[1:-1] - ydata = np.zeros_like(xdata) - ydata[1::3] = ydata[2::3] = heights - else: - raise ValueError(f"Unknown shape {shape!r}.") - return xdata, ydata, hist.edges diff --git a/whitecanvas/layers/group/line_collection.py b/whitecanvas/layers/group/line_collection.py index b37bc5dd..031d7356 100644 --- a/whitecanvas/layers/group/line_collection.py +++ b/whitecanvas/layers/group/line_collection.py @@ -53,7 +53,7 @@ def width(self, width: float | Sequence[float]): _width = [width] * len(self) else: _width = np.asarray(width, dtype=np.float32) - if 
len(width) != len(self): + if len(_width) != len(self): raise ValueError( f"width must be a float or a sequence of length {len(self)}" ) diff --git a/whitecanvas/layers/group/line_fill.py b/whitecanvas/layers/group/line_fill.py new file mode 100644 index 00000000..86ad6959 --- /dev/null +++ b/whitecanvas/layers/group/line_fill.py @@ -0,0 +1,382 @@ +from __future__ import annotations + +from enum import Enum +from typing import overload + +import numpy as np +from numpy.typing import NDArray + +from whitecanvas.backend import Backend +from whitecanvas.layers._primitive import Band, Line +from whitecanvas.layers.group._collections import LayerContainer +from whitecanvas.types import ArrayLike1D, ColorType, LineStyle, Orientation +from whitecanvas.utils.hist import get_hist_edges, histograms +from whitecanvas.utils.normalize import as_array_1d + + +class HistogramShape(Enum): + step = "step" + polygon = "polygon" + bars = "bars" + + +class HistogramKind(Enum): + count = "count" + density = "density" + frequency = "frequency" + percent = "percent" + + +class LineFillBase(LayerContainer): + def __init__(self, line: Line, fill: Band, name: str | None = None): + super().__init__([line, fill], name=name) + + @property + def line(self) -> Line: + """The line layer.""" + return self._children[0] + + @property + def fill(self) -> Band: + """The fill layer.""" + return self._children[1] + + @property + def orient(self) -> Orientation: + """Orientation of the line and fill layers.""" + return self.fill.orient + + @property + def color(self) -> NDArray[np.float32]: + """Color of the layer.""" + return self.line.color + + @color.setter + def color(self, color: ColorType): + self.line.color = color + self.fill.face.update(color=color, alpha=0.2) + self.fill.edge.width = 0.0 + + +class Histogram(LineFillBase): + def __init__( + self, + data: NDArray[np.number], + edges: NDArray[np.number], + limits: tuple[float, float] | None, + line: Line, + fill: Band, + shape: HistogramShape = 
HistogramShape.bars, + kind: HistogramKind = HistogramKind.count, + name: str | None = None, + ): + if name is None: + name = "histogram" + super().__init__(line, fill, name=name) + self._data = data + self._shape = shape + self._kind = kind + self._edges = edges + self._limits = limits + + @property + def data(self) -> NDArray[np.number]: + """The data used to plot the histogram.""" + return self._data + + @data.setter + def data(self, data: NDArray[np.number]): + data = as_array_1d(data) + xdata, ydata = self._calculate_xy( + data, self._edges, self._shape, self._kind, self._limits, clip=True + ) # fmt: skip + self._update_internal(xdata, ydata) + self._data = data + + def _update_internal(self, xdata: NDArray[np.number], ydata: NDArray[np.number]): + if self.orient.is_vertical: + self.line.data = xdata, ydata + else: + self.line.data = ydata, xdata + self.fill.data = xdata, _prep_bottom(ydata), ydata + + @property + def shape(self) -> HistogramShape: + """The shape of the histogram.""" + return self._shape + + @shape.setter + def shape(self, shape: str | HistogramShape): + shape = HistogramShape(shape) + xdata, ydata, _ = self._calculate_xy( + self._data, self._edges, shape, self._kind, self._limits + ) # fmt: skip + self._update_internal(xdata, ydata) + self._shape = shape + + @property + def kind(self) -> HistogramKind: + """The kind of the histogram.""" + return self._kind + + @kind.setter + def kind(self, kind: str | HistogramKind): + kind = HistogramKind(kind) + xdata, ydata, _ = self._calculate_xy( + self._data, self._edges, self._shape, kind, self._limits + ) # fmt: skip + self._update_internal(xdata, ydata) + self._kind = kind + + @property + def limits(self) -> tuple[float, float] | None: + """The limits of the histogram.""" + return self._limits + + @limits.setter + def limits(self, limits: tuple[float, float] | None): + xdata, ydata, _ = self._calculate_xy( + self._data, self._edges, self._shape, self._kind, limits + ) + self._update_internal(xdata, 
ydata) + self._limits = limits + + @property + def edges(self) -> NDArray[np.number]: + """The edges of the histogram.""" + return self._edges + + @edges.setter + def edges(self, edges: NDArray[np.number]): + edges = as_array_1d(edges) + xdata, ydata, _ = self._calculate_xy( + self._data, edges, self._shape, self._kind, self._limits + ) + self._update_internal(xdata, ydata) + self._edges = edges + + @overload + def update_edges(self, bins: int, limits: tuple[float, float] | None = None): + ... + + @overload + def update_edges(self, edges: NDArray[np.number]): + ... + + def update_edges(self, bins, limits=None): + """ + Update the edges of the histogram. + + >>> hist.update_edges(20, limits=(0, 10)) # uniform bins + >>> hist.update_edges([0, 2, 3, 5]) # non-uniform bins + """ + if limits is not None and not isinstance(bins, (int, np.number)): + raise TypeError("bins must be an integer when limits are specified.") + edges = get_hist_edges([self._data], bins, limits) + self.edges = edges + + @classmethod + def from_array( + cls, + data: NDArray[np.number], + shape: HistogramShape = HistogramShape.bars, + kind: HistogramKind = HistogramKind.count, + name: str | None = None, + bins: int = 10, + limits: tuple[float, float] | None = None, + color: ColorType = "black", + style: str | LineStyle = LineStyle.SOLID, + width: float = 1.0, + orient: str | Orientation = "vertical", + backend: str | Backend | None = None, + ) -> Histogram: + """Create a histogram from an array.""" + shape = HistogramShape(shape) + kind = HistogramKind(kind) + ori = Orientation.parse(orient) + xdata, ydata, edges = cls._calculate_xy(data, bins, shape, kind, limits) + if ori.is_vertical: + line = Line( + xdata, ydata, color=color, style=style, width=width, backend=backend + ) # fmt: skip + else: + line = Line( + ydata, xdata, color=color, style=style, width=width, backend=backend + ) + fill = Band( + xdata, _prep_bottom(ydata), ydata, color=color, alpha=0.2, orient=ori, + backend=backend, + ) # fmt: 
skip + return cls(data, edges, limits, line, fill, shape, kind, name=name) + + @staticmethod + def _calculate_xy( + data, + bins: int | ArrayLike1D, + shape: HistogramShape, + kind: HistogramKind, + limits: tuple[float, float] | None = None, + clip: bool = True, + ) -> tuple[NDArray[np.number], NDArray[np.number], NDArray[np.number]]: + if clip and limits is not None: + data = np.clip(data, *limits) + hist = histograms([data], bins, limits) + shape = HistogramShape(shape) + kind = HistogramKind(kind) + if kind is HistogramKind.count: + heights = hist.counts[0] + elif kind is HistogramKind.density: + heights = hist.density()[0] + elif kind is HistogramKind.frequency: + heights = hist.frequency()[0] + elif kind is HistogramKind.percent: + heights = hist.percent()[0] + else: + raise ValueError(f"Unknown kind {kind!r}.") + + if shape is HistogramShape.step: + xdata = np.repeat(hist.edges, 2) + ydata = np.concatenate([[0], np.repeat(heights, 2), [0]]) + elif shape is HistogramShape.polygon: + centers = hist.centers() + xdata = np.concatenate([[centers[0]], centers, [centers[-1]]]) + ydata = np.concatenate([[0], heights, [0]]) + elif shape is HistogramShape.bars: + edges = hist.edges + xdata = np.repeat(edges, 3)[1:-1] + ydata = np.zeros_like(xdata) + ydata[1::3] = ydata[2::3] = heights + else: + raise ValueError(f"Unknown shape {shape!r}.") + return xdata, ydata, hist.edges + + +def _prep_bottom(ydata: NDArray[np.number]) -> NDArray[np.number]: + return np.full_like(ydata, 0) + + +class Kde(LineFillBase): + def __init__( + self, + data: NDArray[np.number], + band_width: float, + line: Line, + fill: Band, + name: str | None = None, + bottom: float = 0.0, + scale: float = 1.0, + ): + if name is None: + name = "kde" + super().__init__(line, fill, name=name) + self._data = data + self._bottom = bottom + self._band_width = band_width + self._scale = scale + + @property + def data(self) -> NDArray[np.number]: + """The data used to plot the histogram.""" + return self._data + 
+ @data.setter + def data(self, data: NDArray[np.number]): + data = as_array_1d(data) + xdata, ydata = self._calculate_params( + data, self._band_width, self._bottom, self._scale + ) # fmt: skip + self._update_internal(xdata, ydata, self._bottom) + self._data = data + + def _update_internal( + self, xdata: NDArray[np.number], ydata: NDArray[np.number], bottom: float + ): + if self.orient.is_vertical: + self.line.data = xdata, ydata + else: + self.line.data = ydata, xdata + self.fill.data = xdata, np.full_like(xdata, bottom), ydata + + @property + def band_width(self) -> float: + """The band width of the kernel density estimation.""" + return self._band_width + + @band_width.setter + def band_width(self, band_width: float): + xdata, ydata, bw = self._calculate_params( + self._data, band_width, self._bottom, self._scale + ) # fmt: skip + self._update_internal(xdata, ydata, self._bottom) + self._band_width = bw + + @property + def bottom(self) -> float: + """The bottom value of the fill.""" + return self._bottom + + @bottom.setter + def bottom(self, bottom: float): + xdata, ydata, _ = self._calculate_params( + self._data, self._band_width, bottom, self._scale + ) # fmt: skip + self._update_internal(xdata, ydata, bottom) + self._bottom = bottom + + @property + def scale(self) -> float: + """The scale of the kernel density estimation.""" + return self._scale + + @scale.setter + def scale(self, scale: float): + xdata, ydata, _ = self._calculate_params( + self._data, self._band_width, self._bottom, scale + ) # fmt: skip + self._update_internal(xdata, ydata, self._bottom) + self._scale = scale + + @classmethod + def from_array( + cls, + data: ArrayLike1D, + bottom: float = 0.0, + scale: float = 1.0, + *, + name: str | None = None, + band_width: float | None = None, + color: ColorType = "blue", + style: str | LineStyle = LineStyle.SOLID, + width: float = 1.0, + orient: str | Orientation = Orientation.VERTICAL, + backend: Backend | str | None = None, + ): + data = 
as_array_1d(data) + x, y1, bw = cls._calculate_params(data, band_width, bottom, scale) + if orient.is_vertical: + line = Line(x, y1, color=color, style=style, width=width, backend=backend) + else: + line = Line(y1, x, color=color, style=style, width=width, backend=backend) + fill = Band( + x, np.full_like(x, bottom), y1, color=color, alpha=0.2, orient=orient, + backend=backend, + ) # fmt: skip + return Kde(data, bw, line, fill, name=name, bottom=bottom, scale=scale) + + @staticmethod + def _calculate_params( + data: NDArray[np.number], + band_width: float, + bottom: float = 0.0, + scale: float = 1.0, + ) -> tuple[NDArray[np.number], NDArray[np.number], float]: + from whitecanvas.utils.kde import gaussian_kde + + data = as_array_1d(data) + kde = gaussian_kde(data, bw_method=band_width) + + sigma = np.sqrt(kde.covariance[0, 0]) + pad = sigma * 2.5 + x = np.linspace(data.min() - pad, data.max() + pad, 100) + y1 = kde(x) * scale + bottom + return x, y1, kde.factor diff --git a/whitecanvas/layers/tabular/__init__.py b/whitecanvas/layers/tabular/__init__.py index 325ad2e0..ce7ea984 100644 --- a/whitecanvas/layers/tabular/__init__.py +++ b/whitecanvas/layers/tabular/__init__.py @@ -8,6 +8,7 @@ DFBars, DFHeatmap, DFHistograms, + DFKde, DFLines, DFMarkerGroups, DFMarkers, @@ -26,6 +27,7 @@ "DFBoxPlot", "DFHeatmap", "DFHistograms", + "DFKde", "DFPointPlot2D", "parse", ] diff --git a/whitecanvas/layers/tabular/_dataframe.py b/whitecanvas/layers/tabular/_dataframe.py index 47764626..b96e005c 100644 --- a/whitecanvas/layers/tabular/_dataframe.py +++ b/whitecanvas/layers/tabular/_dataframe.py @@ -682,6 +682,106 @@ def with_style(self, by: str | Iterable[str], styles=None) -> Self: else: style_by = _p.StylePlan.from_const(LineStyle(cov.value)) for i, st in enumerate(style_by.generate(self._labels, self._splitby)): - self._base_layer[i].style = st + self._base_layer[i].line.style = st + self._style_by = style_by + return self + + +class DFKde( + 
_shared.DataFrameLayerWrapper[_lg.LayerCollectionBase[_lg.Kde], _DF], + Generic[_DF], +): + def __init__( + self, + source: DataFrameWrapper[_DF], + base: _lg.LayerCollectionBase[_lg.Kde], + labels: list[tuple[Any, ...]], + color: _Cols | None = None, + width: str | None = None, + style: _Cols | None = None, + ): + splitby = _shared.join_columns(color, style, source=source) + self._color_by = _p.ColorPlan.default() + self._width_by = _p.WidthPlan.default() + self._style_by = _p.StylePlan.default() + self._labels = labels + self._splitby = splitby + super().__init__(base, source) + if color is not None: + self.with_color(color) + if isinstance(width, str): + self.with_width(width) + if style is not None: + self.with_style(style) + + @classmethod + def from_table( + cls, + df: DataFrameWrapper[_DF], + value: str, + band_width: float | None = None, + color: str | None = None, + width: float = 1.0, + style: str | None = None, + name: str | None = None, + orient: str | Orientation = Orientation.VERTICAL, + backend: str | Backend | None = None, + ) -> DFHistograms[_DF]: + splitby = _shared.join_columns(color, style, source=df) + ori = Orientation.parse(orient) + arrays: list[np.ndarray] = [] + labels: list[tuple] = [] + for sl, sub in df.group_by(splitby): + labels.append(sl) + arrays.append(sub[value]) + layers = [] + for arr in arrays: + each_layer = _lg.Kde.from_array( + arr, width=width, band_width=band_width, orient=ori, backend=backend, + ) # fmt: skip + layers.append(each_layer) + base = _lg.LayerCollectionBase(layers, name=name) + return cls(df, base, labels, color=color, width=width, style=style) + + @overload + def with_color(self, value: ColorType) -> Self: + ... + + @overload + def with_color( + self, + by: str | Iterable[str], + palette: ColormapType | None = None, + ) -> Self: + ... 
+ + def with_color(self, by, /, palette=None): + cov = _shared.ColumnOrValue(by, self._source) + if cov.is_column: + if set(cov.columns) > set(self._splitby): + raise ValueError(f"Cannot color by a column other than {self._splitby}") + color_by = _p.ColorPlan.from_palette(cov.columns, palette=palette) + else: + color_by = _p.ColorPlan.from_const(Color(cov.value)) + for i, col in enumerate(color_by.generate(self._labels, self._splitby)): + self._base_layer[i].color = col + self._color_by = color_by + return self + + def with_width(self, value: float) -> Self: + for hist in self._base_layer: + hist.line.width = value + return self + + def with_style(self, by: str | Iterable[str], styles=None) -> Self: + cov = _shared.ColumnOrValue(by, self._source) + if cov.is_column: + if set(cov.columns) > set(self._splitby): + raise ValueError(f"Cannot style by a column other than {self._splitby}") + style_by = _p.StylePlan.new(cov.columns, values=styles) + else: + style_by = _p.StylePlan.from_const(LineStyle(cov.value)) + for i, st in enumerate(style_by.generate(self._labels, self._splitby)): + self._base_layer[i].line.style = st self._style_by = style_by return self From 7b13036a78d46048e7578288b5dac28fe37a8ed5 Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Sun, 4 Feb 2024 21:30:22 +0900 Subject: [PATCH 06/11] fix plotly reorder bug --- whitecanvas/backend/plotly/canvas.py | 40 ++++++++++++---------------- whitecanvas/canvas/_grid.py | 14 ++++++++++ 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/whitecanvas/backend/plotly/canvas.py b/whitecanvas/backend/plotly/canvas.py index dcc5446b..e0f153f4 100644 --- a/whitecanvas/backend/plotly/canvas.py +++ b/whitecanvas/backend/plotly/canvas.py @@ -1,9 +1,8 @@ from __future__ import annotations import sys -import warnings import weakref -from typing import Callable +from typing import TYPE_CHECKING, Callable import numpy as np from plotly import graph_objects as go @@ -15,6 +14,9 @@ from whitecanvas.types import 
MouseEvent from whitecanvas.utils.normalize import rgba_str_color +if TYPE_CHECKING: + from plotly._subplots import SubplotXY + class Canvas: def __init__( @@ -43,7 +45,7 @@ def __init__( ) self._fig.add_trace(self._scatter) - def _subplot_layout(self): + def _subplot_layout(self) -> SubplotXY: try: layout = self._fig.get_subplot(**self._loc.asdict()) except Exception: # manually wrapped backend are not created with subplots @@ -77,26 +79,15 @@ def _plt_get_ylabel(self): def _plt_reorder_layers(self, layers: list[PlotlyLayer]): model_to_idx_map = {id(layer._props): i for i, layer in enumerate(layers)} first, *data = self._fig._data - try: - self._fig._data = [first] + [data[model_to_idx_map[id(r)]] for r in data] - except KeyError: - # sometimes fails, so just warn - not_found = [] - for r in data: - if id(r) not in model_to_idx_map: - not_found.append(r) - keys = list(model_to_idx_map.keys()) - warnings.warn( - f"Layers {not_found!r} not found in the ID keys {keys!r}.", - UserWarning, - stacklevel=2, - ) - if len(self._fig._data) != len(data) + 1: - warnings.warn( - "Number of layers changed", - UserWarning, - stacklevel=2, - ) + ordered_data = [] + data_in_other = [] + for _data in data: + data_id = id(_data) + if data_id in model_to_idx_map: + ordered_data.append(data[model_to_idx_map[data_id]]) + else: + data_in_other.append(_data) + self._fig._data = [first, *ordered_data, *data_in_other] def _plt_get_aspect_ratio(self) -> float | None: """Get aspect ratio of canvas""" @@ -201,6 +192,9 @@ def _plt_twinx(self): kwargs["secondary_y"] = True return Canvas(self._fig, **kwargs) + def _repr_mimebundle_(self, *args, **kwargs): + return self._fig._repr_mimebundle_(*args, **kwargs) + def _convert_cb(cb): return lambda _, points, state: cb(points.point_inds) # noqa: ARG005 diff --git a/whitecanvas/canvas/_grid.py b/whitecanvas/canvas/_grid.py index edcd0aa5..aecfc4d4 100644 --- a/whitecanvas/canvas/_grid.py +++ b/whitecanvas/canvas/_grid.py @@ -261,6 +261,11 @@ def 
_repr_mimebundle_(self, *args: Any, **kwargs: Any) -> dict: return self._backend_object._repr_mimebundle_(*args, **kwargs) raise NotImplementedError() + def _repr_html_(self, *args: Any, **kwargs: Any) -> str: + if hasattr(self._backend_object, "_repr_html_"): + return self._backend_object._repr_html_(*args, **kwargs) + raise NotImplementedError() + class CanvasVGrid(CanvasGrid): @override @@ -393,3 +398,12 @@ def screenshot(self) -> NDArray[np.uint8]: def _repr_png_(self): """Return PNG representation of the widget for QtConsole.""" return self._grid._repr_png_() + + def _repr_mimebundle_(self, *args: Any, **kwargs: Any) -> dict: + return self._grid._repr_mimebundle_(*args, **kwargs) + + def _ipython_display_(self, *args: Any, **kwargs: Any) -> Any: + return self._grid._ipython_display_(*args, **kwargs) + + def _repr_html_(self, *args: Any, **kwargs: Any) -> str: + return self._grid._repr_html_(*args, **kwargs) From 2e3cd2c86ff978d7cf4826f9956d6fa2b65907cb Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Sun, 4 Feb 2024 23:54:51 +0900 Subject: [PATCH 07/11] link and future --- docs/_scripts/_hooks.py | 27 +++- docs/categorical/cat_num.md | 176 +++++++++++++---------- docs/categorical/num_num.md | 12 +- tests/test_canvas.py | 10 +- whitecanvas/canvas/_grid.py | 100 ++++++++----- whitecanvas/canvas/dataframe/_one_cat.py | 4 + whitecanvas/core.py | 64 +++++---- whitecanvas/plot/_canvases.py | 4 +- 8 files changed, 244 insertions(+), 153 deletions(-) diff --git a/docs/_scripts/_hooks.py b/docs/_scripts/_hooks.py index d9193996..cb7355d4 100644 --- a/docs/_scripts/_hooks.py +++ b/docs/_scripts/_hooks.py @@ -25,17 +25,34 @@ def _add_images(matchobj: re.Match[str]) -> str: other = code return "```python\n" + other + "\n```" - line, other = code.split("\n", 1) - assert line.startswith("#!name:") - name = line.split(":", 1)[1].strip() + code, name = _get_image_name(code) dest = f"_images/{name}.png" + code, width = _get_image_width(code) + reldepth = "../" * 
page.file.src_path.count(os.sep) dest = f"{reldepth}_images/{name}.png" - link = f"\n![]({dest}){{ loading=lazy, width=360px }}\n\n" - new_md = "```python\n" + other + "\n```" + link + link = f"\n![]({dest}){{ loading=lazy, width={width}px }}\n\n" + new_md = "```python\n" + code + "\n```" + link return new_md md = re.sub("``` ?python\n([^`]*)```", _add_images, md, re.DOTALL) return md + +def _get_image_name(code: str) -> tuple[str, str]: + line, other = code.split("\n", 1) + assert line.startswith("#!name:") + name = line.split(":", 1)[1].strip() + return other, name + +def _get_image_width(code: str) -> tuple[str, int]: + """Get the width of the image from the code.""" + code = code.strip() + if code.startswith("#!width:"): + line, other = code.split("\n", 1) + width = int(line.split(":", 1)[1].strip()) + else: + other = code + width = 360 + return other, width diff --git a/docs/categorical/cat_num.md b/docs/categorical/cat_num.md index e8d85113..f7020db3 100644 --- a/docs/categorical/cat_num.md +++ b/docs/categorical/cat_num.md @@ -1,6 +1,23 @@ # Categorical × Numerical Data -There are several plots that use categorical axis. Examples are: +In this section, following data will be used as an example: + +``` python +import numpy as np +from whitecanvas import new_canvas + +rng = np.random.default_rng(12345) +df = { + "category": ["A"] * 40 + ["B"] * 50, + "observation": np.concatenate([rng.random(40), rng.random(50) + 1.3]), + "replicate": [0] * 20 + [1] * 20 + [0] * 25 + [1] * 25, + "temperature": rng.normal(scale=2.8, size=90) + 22.0, +} +``` + +How can we visualize the distributions for each category? There are several plots that +use categorical axis as either the x- or y-axis, and numerical axis as the other. +Examples are: - Strip plot - Swarm plot @@ -12,20 +29,41 @@ such as the marker symbol and the marker size. 
Things are even more complicated the markers represent numerical values, such as their size being proportional to the value, or colored by a colormap. -`whitecanvas` provides a consistent and simple interface to handle all these cases. In -this section, following data will be used as an example: +`whitecanvas` provides a consistent and simple interface to handle all these cases. +Methods used for this purpose are `cat_x` and `cat_y`, where `cat_x` will deem the +x-axis as categorical, and `cat_y` will do the same for the y-axis. ``` python -import numpy as np -from whitecanvas import new_canvas +#!skip +canvas = new_canvas("matplotlib") -rng = np.random.default_rng(12345) -df = { - "category": ["A"] * 40 + ["B"] * 50, - "observation": np.concatenate([rng.random(40), rng.random(50) + 1.3]), - "replicate": [0] * 20 + [1] * 20 + [0] * 25 + [1] * 25, - "temperature": rng.normal(scale=2.8, size=90) + 22.0, -} +# create the categorical plotter. +cat_plt_x = canvas.cat_x(df, x="category", y="observation") +cat_plt_y = canvas.cat_y(df, x="observation", y="category") +``` + +`cat_x` and `cat_y` use the arguments `x=` and `y=` to specify the columns that are used +for the plot, where `x=` is the categorical axis for `cat_x` and `y=` for `cat_y`. + +``` note +This is one of the important differences from `seaborn`. In `seaborn`, `orient` is +used to specify the orientation of the plots. This design forces the user to add the +argument `orient=` to every plot even though the orientation rarely changes during the +use of the same figure. In `whitecanvas`, you don't have to specify the orientation +once a categorical plotter is created by either `cat_x` or `cat_y`. +``` + +Multiple columns can be used for the categorical axis, but only one column can be used +for the numerical axis.
+ +``` python +#!skip +# OK +canvas.cat_x(df, x=["category", "replicate"], y="observation") +# OK +canvas.cat_y(df, x="observation", y=["category", "replicate"]) +# NG +canvas.cat_x(df, x="category", y=["observation", "temperature"]) ``` ## Non-marker-type Plots @@ -36,105 +74,91 @@ them. It includes `add_violinplot`, `add_boxplot`, `add_pointplot` and `add_barp ``` python #!name: categorical_axis_violin_0 canvas = new_canvas("matplotlib") -canvas.cat(df).add_violinplot("category", "observation") +canvas.cat_x(df, x="category", y="observation").add_violinplot() canvas.show() ``` -The first argument of `add_violinplot` is the column that defines the offset (shift -from 0 in the categorical axis). The second one is the column that is used for the -values. - -Offset can be defined by multiple columns. You can pass a sequence of column names to -do that. +Violins can also be shown in different color. Specify the `color=` argument to do that. ``` python #!name: categorical_axis_violin_1 canvas = new_canvas("matplotlib") -canvas.cat(df).add_violinplot(["category", "replicate"], "observation") +( + canvas + .cat_x(df, x="category", y="observation") + .add_violinplot(color="replicate") +) canvas.show() ``` -Violons can also be shown in different color. Specify the `color=` argument to do that. +By default, groups with different colors do not overlap. This is controlled by the +`dodge=` argument. Set `dodge=False` to make them overlap (although it is not the way +we usually do). ``` python #!name: categorical_axis_violin_2 canvas = new_canvas("matplotlib") -canvas.cat(df).add_violinplot("category", "observation", color="replicate") +( + canvas + .cat_x(df, x="category", y="observation") + .add_violinplot(color="replicate", dodge=False) +) canvas.show() ``` -You can see that the violins overlaps. It is because only "category" is used for the -offsets. Offsets, colors and other properties are calculated **independently**. 
- -To separate them, we need to add "replicate" to the offset. +`hatch=` can also be specified in a similar way. It will change the hatch pattern of the +violins. ``` python -#!name: categorical_axis_violin_3 +#!name: categorical_axis_violin_4 canvas = new_canvas("matplotlib") -canvas.cat(df).add_violinplot( - offset=["category", "replicate"], - value="observation", - color="replicate" +( + canvas + .cat_x(df, x="category", y="observation") + .add_violinplot(hatch="replicate") ) canvas.show() ``` -`hatch=` can also be specified in a similar way. Again, All the properties are -independent. +`color` and `hatch` can overlap with each other or the `x=` or `y=` argument. ``` python -#!name: categorical_axis_violin_4 +#!name: categorical_axis_violin_5 canvas = new_canvas("matplotlib") -canvas.cat(df).add_violinplot( - offset=["category", "replicate"], - value="observation", - color="replicate", - hatch="category", +( + canvas + .cat_x(df, x="category", y="observation") + .add_violinplot(color="category") ) -canvas +canvas.show() ``` -!!! note - This is different from the `seaborn` interface, where `hue=` and `dodge=` are used - to separate groups. As you can see in these examples, this is how `whitecanvas` - can easily handle more complicated cases without confusion. - `add_boxplot`, `add_pointplot` and `add_barplot` is very similar to `add_violinplot`. 
``` python -#!name: categorical_axis_boxplot_0 -canvas = new_canvas("matplotlib") -canvas.cat(df).add_boxplot( - offset=["category", "replicate"], - value="observation", - color="replicate", - hatch="category", -) -canvas -``` +#!name: categorical_axis_many_plots +#!width: 700 +from whitecanvas import hgrid -``` python -#!name: categorical_axis_pointplot_0 -canvas = new_canvas("matplotlib") -canvas.cat(df).add_pointplot( - offset=["category", "replicate"], - value="observation", - color="replicate", - hatch="category", -) -canvas -``` +canvas = hgrid(ncols=3, size=(1600, 600), backend="matplotlib") -``` python -#!name: categorical_axis_barplot_0 -canvas = new_canvas("matplotlib") -canvas.cat(df).add_barplot( - offset=["category", "replicate"], - value="observation", - color="replicate", - hatch="category", -) -canvas +c0 = canvas.add_canvas(0) +c0.cat_x(df, x="category", y="observation").add_boxplot() +c0.title = "boxplot" + +c1 = canvas.add_canvas(1) +c1.cat_x(df, x="category", y="observation").add_pointplot() +c1.title = "pointplot" + +c2 = canvas.add_canvas(2) +c2.cat_x(df, x="category", y="observation").add_barplot() +c2.title = "barplot" + +canvas.show() ``` ## Marker-type Plots + +TODO + +## Aggregation diff --git a/docs/categorical/num_num.md b/docs/categorical/num_num.md index c88bf135..57a1812a 100644 --- a/docs/categorical/num_num.md +++ b/docs/categorical/num_num.md @@ -90,8 +90,18 @@ the plotter which axis to use, call `along_x()` or `along_y()` to restrict the dimension. ``` python +#!name: cat_hist_along_x canvas = new_canvas("matplotlib") # canvas.cat(df, x="label", y="value").add_hist(bins=10) # This will raise an error -canvas.cat(df, x="label", y="value").along_x().add_hist(bins=10) +canvas.cat(df, x="label", y="value").along_y().add_hist(bins=10) +canvas.show() +``` + +KDE can be similarly added. 
+ +``` python +#!name: cat_kde_x +canvas = new_canvas("matplotlib") +canvas.cat(df, x="value").add_kde() canvas.show() ``` diff --git a/tests/test_canvas.py b/tests/test_canvas.py index 744a7b1d..dae6ca34 100644 --- a/tests/test_canvas.py +++ b/tests/test_canvas.py @@ -50,7 +50,7 @@ def test_namespace_pointing_at_different_objects(): assert_color_equal(c1.x.color, "blue") def test_grid(backend: str): - cgrid = wc.grid(2, 2, link_x=True, link_y=True, backend=backend) + cgrid = wc.grid(2, 2, backend=backend).link_x().link_y() c00 = cgrid.add_canvas(0, 0) c01 = cgrid.add_canvas(0, 1) c10 = cgrid.add_canvas(1, 0) @@ -76,8 +76,8 @@ def test_grid(backend: str): def test_grid_nonuniform(backend: str): cgrid = wc.grid_nonuniform( - [2, 1], [2, 1], link_x=True, link_y=True, backend=backend - ) + [2, 1], [2, 1], backend=backend + ).link_x().link_y() c00 = cgrid.add_canvas(0, 0) c01 = cgrid.add_canvas(0, 1) c10 = cgrid.add_canvas(1, 0) @@ -101,7 +101,7 @@ def test_grid_nonuniform(backend: str): assert len(c11.layers) == 1 def test_vgrid_hgrid(backend: str): - cgrid = wc.vgrid(2, backend=backend, link_x=True, link_y=True) + cgrid = wc.vgrid(2, backend=backend).link_x().link_y() c0 = cgrid.add_canvas(0) c1 = cgrid.add_canvas(1) @@ -114,7 +114,7 @@ def test_vgrid_hgrid(backend: str): assert len(c0.layers) == 1 assert len(c1.layers) == 1 - cgrid = wc.hgrid(2, backend=backend, link_x=True, link_y=True) + cgrid = wc.hgrid(2, backend=backend).link_x().link_y() c0 = cgrid.add_canvas(0) c1 = cgrid.add_canvas(1) diff --git a/whitecanvas/canvas/_grid.py b/whitecanvas/canvas/_grid.py index aecfc4d4..fa2fede7 100644 --- a/whitecanvas/canvas/_grid.py +++ b/whitecanvas/canvas/_grid.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Iterator +from typing import TYPE_CHECKING, Any, Iterator import numpy as np from numpy.typing import NDArray @@ -13,6 +13,9 @@ from whitecanvas.theme import get_theme from whitecanvas.utils.normalize import arr_color +if 
TYPE_CHECKING: + from typing_extensions import Self + class GridEvents(SignalGroup): drawn = Signal() @@ -55,8 +58,6 @@ def uniform( nrows: int = 1, ncols: int = 1, *, - link_x: bool = False, - link_y: bool = False, backend: Backend | str | None = None, ) -> CanvasGrid: """ @@ -71,10 +72,7 @@ def uniform( backend : backend-like, optional The backend to use for the grid. """ - return CanvasGrid( - [10] * nrows, [10] * ncols, link_x=link_x, link_y=link_y, - backend=backend, - ) # fmt: skip + return CanvasGrid([10] * nrows, [10] * ncols, backend=backend) @property def shape(self) -> tuple[int, int]: @@ -88,16 +86,7 @@ def x_linked(self) -> bool: @x_linked.setter def x_linked(self, value: bool): - value = bool(value) - if value == self._x_linked: - return - if value: - for _, canvas in self.iter_canvas(): - canvas.x.events.lim.connect(self._align_xlims, unique=True) - else: - for _, canvas in self.iter_canvas(): - canvas.x.events.lim.disconnect(self._align_xlims) - self._x_linked = value + self.link_x() if value else self.unlink_x() @property def y_linked(self) -> bool: @@ -106,16 +95,65 @@ def y_linked(self) -> bool: @y_linked.setter def y_linked(self, value: bool): - value = bool(value) - if value == self._y_linked: - return - if value: + self.link_y() if value else self.unlink_y() + + def link_x(self, future: bool = True) -> Self: + """ + Link all the x-axes of the canvases in the grid. + + >>> from whitecanvas import grid + >>> g = grid(2, 2).link_x() # link x-axes of all canvases + + Parameters + ---------- + future : bool, default True + If True, all the canvases added in the future will also be linked. Only link + the existing canvases if False. + """ + if not self._x_linked: + for _, canvas in self.iter_canvas(): + canvas.x.events.lim.connect(self._align_xlims, unique=True) + if future: + self._x_linked = True + return self + + def link_y(self, future: bool = True) -> Self: + """ + Link all the y-axes of the canvases in the grid.
+ + >>> from whitecanvas import grid + >>> g = grid(2, 2).link_y() # link y-axes of all canvases + + Parameters + ---------- + future : bool, default True + If True, all the canvases added in the future will also be linked. Only link + the existing canvases if False. + """ + if not self._y_linked: for _, canvas in self.iter_canvas(): canvas.y.events.lim.connect(self._align_ylims, unique=True) - else: + if future: + self._y_linked = True + return self + + def unlink_x(self, future: bool = True) -> Self: + """Unlink all the x-axes of the canvases in the grid.""" + if self._x_linked: + for _, canvas in self.iter_canvas(): + canvas.x.events.lim.disconnect(self._align_xlims) + if future: + self._x_linked = False + return self + + def unlink_y(self, future: bool = True) -> Self: + """Unlink all the y-axes of the canvases in the grid.""" + if self._y_linked: for _, canvas in self.iter_canvas(): canvas.y.events.lim.disconnect(self._align_ylims) - self._y_linked = value + if future: + self._y_linked = False + return self def __repr__(self) -> str: cname = type(self).__name__ @@ -273,11 +311,9 @@ def __init__( self, heights: list[int], *, - link_x: bool = False, - link_y: bool = False, backend: Backend | str | None = None, ) -> None: - super().__init__(heights, [1], link_x=link_x, link_y=link_y, backend=backend) + super().__init__(heights, [1], backend=backend) @override def __getitem__(self, key: int) -> Canvas: @@ -292,11 +328,9 @@ def uniform( cls, nrows: int = 1, *, - link_x: bool = False, - link_y: bool = False, backend: Backend | str | None = None, ) -> CanvasVGrid: - return CanvasVGrid([1] * nrows, link_x=link_x, link_y=link_y, backend=backend) + return CanvasVGrid([1] * nrows, backend=backend) @override def add_canvas(self, row: int, **kwargs) -> Canvas: @@ -309,11 +343,9 @@ def __init__( self, widths: list[int], *, - link_x: bool = False, - link_y: bool = False, backend: Backend | str | None = None, ) -> None: - super().__init__([1], widths, link_x=link_x,
link_y=link_y, backend=backend) + super().__init__([1], widths, backend=backend) @override def __getitem__(self, key: int) -> Canvas: @@ -328,11 +360,9 @@ def uniform( cls, ncols: int = 1, *, - link_x: bool = False, - link_y: bool = False, backend: Backend | str | None = None, ) -> CanvasHGrid: - return CanvasHGrid([1] * ncols, link_x=link_x, link_y=link_y, backend=backend) + return CanvasHGrid([1] * ncols, backend=backend) @override def add_canvas(self, col: int, **kwargs) -> Canvas: diff --git a/whitecanvas/canvas/dataframe/_one_cat.py b/whitecanvas/canvas/dataframe/_one_cat.py index e87dd0aa..7b212b2e 100644 --- a/whitecanvas/canvas/dataframe/_one_cat.py +++ b/whitecanvas/canvas/dataframe/_one_cat.py @@ -2,6 +2,8 @@ from typing import TYPE_CHECKING, Generic, Sequence, TypeVar +import numpy as np + from whitecanvas import theme from whitecanvas.canvas.dataframe._base import AggMethods, BaseCatPlotter, CatIterator from whitecanvas.layers import tabular as _lt @@ -106,8 +108,10 @@ def __init__( pos, label = self._cat_iter.axis_ticks() if self._orient.is_vertical: canvas.x.ticks.set_labels(pos, label) + canvas.x.lim = (np.min(pos) - 0.5, np.max(pos) + 0.5) else: canvas.y.ticks.set_labels(pos, label) + canvas.y.lim = (np.min(pos) - 0.5, np.max(pos) + 0.5) def __repr__(self) -> str: return ( diff --git a/whitecanvas/core.py b/whitecanvas/core.py index 87bcc914..1e3062b2 100644 --- a/whitecanvas/core.py +++ b/whitecanvas/core.py @@ -18,8 +18,7 @@ def grid( nrows: int = 1, ncols: int = 1, *, - link_x: bool = False, - link_y: bool = False, + size: tuple[int, int] | None = None, backend: Backend | str | None = None, ) -> CanvasGrid: """ @@ -27,14 +26,12 @@ def grid( Parameters ---------- - nrows : int, optional - Number of rows, by default 1 - ncols : int, optional - Number of columns, by default 1 - link_x : bool, optional - Whether to link x axes, by default False - link_y : bool, optional - Whether to link y axes, by default False + nrows : int, default 1 + Number of 
rows. + ncols : int, default 1 + Number of columns. + size : (int, int), optional + Size of the grid. backend : Backend or str, optional Backend name. @@ -43,60 +40,71 @@ def grid( CanvasGrid Grid of empty canvases. """ - return CanvasGrid.uniform( - nrows, ncols, link_x=link_x, link_y=link_y, backend=backend - ) + g = CanvasGrid.uniform(nrows, ncols, backend=backend) + if size is not None: + g.size = size + return g def grid_nonuniform( heights: list[int], widths: list[int], *, - link_x: bool = False, - link_y: bool = False, + size: tuple[int, int] | None = None, backend: Backend | str | None = None, ) -> CanvasGrid: - return CanvasGrid(heights, widths, link_x=link_x, link_y=link_y, backend=backend) + g = CanvasGrid(heights, widths, backend=backend) + if size is not None: + g.size = size + return g def vgrid( nrows: int = 1, *, - link_x: bool = False, - link_y: bool = False, + size: tuple[int, int] | None = None, backend: Backend | str | None = None, ) -> CanvasVGrid: - return CanvasVGrid.uniform(nrows, link_x=link_x, link_y=link_y, backend=backend) + g = CanvasVGrid.uniform(nrows, backend=backend) + if size is not None: + g.size = size + return g def vgrid_nonuniform( heights: list[int], *, - link_x: bool = False, - link_y: bool = False, + size: tuple[int, int] | None = None, backend: Backend | str | None = None, ) -> CanvasVGrid: - return CanvasVGrid(heights, link_x=link_x, link_y=link_y, backend=backend) + g = CanvasVGrid(heights, backend=backend) + if size is not None: + g.size = size + return g def hgrid( ncols: int = 1, *, - link_x: bool = False, - link_y: bool = False, + size: tuple[int, int] | None = None, backend: Backend | str | None = None, ) -> CanvasHGrid: - return CanvasHGrid.uniform(ncols, link_x=link_x, link_y=link_y, backend=backend) + g = CanvasHGrid.uniform(ncols, backend=backend) + if size is not None: + g.size = size + return g def hgrid_nonuniform( widths: list[int], *, - link_x: bool = False, - link_y: bool = False, + size: tuple[int, int] | 
None = None, backend: Backend | str | None = None, ) -> CanvasHGrid: - return CanvasHGrid(widths, link_x=link_x, link_y=link_y, backend=backend) + g = CanvasHGrid(widths, backend=backend) + if size is not None: + g.size = size + return g def new_canvas( diff --git a/whitecanvas/plot/_canvases.py b/whitecanvas/plot/_canvases.py index 62ec04c5..85a85f9b 100644 --- a/whitecanvas/plot/_canvases.py +++ b/whitecanvas/plot/_canvases.py @@ -30,12 +30,10 @@ def subplots( nrows: int = 1, ncols: int = 1, *, - link_x: bool = False, - link_y: bool = False, backend: Backend | str | None = None, ) -> CanvasGrid: """Create a new grid of subplots.""" - out = grid(nrows, ncols, link_x=link_x, link_y=link_y, backend=backend) + out = grid(nrows, ncols, backend=backend) for i in range(nrows): for j in range(ncols): out.add_canvas(i, j) From 82ffde88dce239e4d560c2e43200a2deba2972c0 Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Mon, 5 Feb 2024 18:03:53 +0900 Subject: [PATCH 08/11] fix swarm jitter, doc --- docs/categorical/cat_num.md | 92 +++++++++++++++++++++++- docs/categorical/num_num.md | 40 ++++++++--- whitecanvas/canvas/_base.py | 20 ++++++ whitecanvas/canvas/dataframe/_one_cat.py | 69 +++++++----------- whitecanvas/layers/tabular/_dataframe.py | 22 ++++++ whitecanvas/layers/tabular/_jitter.py | 30 +++++--- 6 files changed, 213 insertions(+), 60 deletions(-) diff --git a/docs/categorical/cat_num.md b/docs/categorical/cat_num.md index f7020db3..69f9ef50 100644 --- a/docs/categorical/cat_num.md +++ b/docs/categorical/cat_num.md @@ -159,6 +159,96 @@ canvas.show() ## Marker-type Plots -TODO +``` python +#!name: categorical_axis_stripplot +canvas = new_canvas("matplotlib") +( + canvas + .cat_x(df, x="category", y="observation") + .add_stripplot(color="replicate") +) +``` + +``` python +#!name: categorical_axis_stripplot_dodge +canvas = new_canvas("matplotlib") +( + canvas + .cat_x(df, x="category", y="observation") + .add_stripplot(color="replicate", dodge=True) +) +``` + +As for the 
`Markers` layer, `as_edge_only` will convert the face features to the edge features. + +``` python +#!name: categorical_axis_stripplot_dodge_edge_only +canvas = new_canvas("matplotlib") +( + canvas + .cat_x(df, x="category", y="observation") + .add_stripplot(color="replicate", dodge=True).as_edge_only() +) +``` + +Each marker size can represent a numerical value. `with_size` will map the numerical +values of a column to the size of the markers. + +``` python +#!name: categorical_axis_stripplot_by_size +canvas = new_canvas("matplotlib") +( + canvas + .cat_x(df, x="category", y="observation") + .add_stripplot() + .with_size("temperature") +) +``` + +Similarly, each marker color can represent a numerical value. `with_colormap` will map the value with an arbitrary colormap. + +``` python +#!name: categorical_axis_stripplot_by_color +canvas = new_canvas("matplotlib") +( + canvas + .cat_x(df, x="category", y="observation") + .add_stripplot() + .with_colormap("temperature", cmap="coolwarm") +) +``` ## Aggregation + +Showing both all the data points and the aggregated data is a common way to efficiently +visualize the data. This task is usually done by the module-specific group-by methods, +but `whitecanvas` provides a built-in method to simplify the process. + +``` python +#!name: categorical_axis_stripplot_and_agg +canvas = new_canvas("matplotlib") + +# create a categorical plotter +cat_plt = canvas.cat_x(df, x="category", y="observation") + +# plot all the data +cat_plt.add_stripplot(color="category") +# plot the mean +cat_plt.mean().add_markers(color="category", size=20) + +canvas.show() +``` + +Count plot.
+ +``` python +#!name: categorical_axis_countplot +canvas = new_canvas("matplotlib") +( + canvas + .cat_x(df, x="category") + .count() + .add_bars(color="replicate", dodge=True) +) +canvas.show() +``` diff --git a/docs/categorical/num_num.md b/docs/categorical/num_num.md index 57a1812a..27dbcdbf 100644 --- a/docs/categorical/num_num.md +++ b/docs/categorical/num_num.md @@ -62,26 +62,36 @@ rng = np.random.default_rng(12345) # sample data df = { "label": ["A"] * 60 + ["B"] * 30 + ["C"] * 40, - "value": rng.normal(size=130), + "X": rng.normal(loc=0.0, size=130), + "Y": rng.normal(loc=1.0, size=130), } ``` -`x="value"` means that the x-axis being "value" and the y-axis being the count. +`x="X"` means that the x-axis being "X" and the y-axis being the count. Arguments forwards to the `histogram` method of `numpy`. ``` python #!name: cat_hist_x canvas = new_canvas("matplotlib") -canvas.cat(df, x="value").add_hist(bins=10) +canvas.cat(df, x="X").add_hist(bins=10) canvas.show() ``` -To transpose the histogram, use `y="value"`. +To transpose the histogram, use `y="X"`. ``` python #!name: cat_hist_y canvas = new_canvas("matplotlib") -canvas.cat(df, y="value").add_hist(bins=10) +canvas.cat(df, y="X").add_hist(bins=10) +canvas.show() +``` + +Histograms can be grouped by color. + +``` python +#!name: cat_hist_x_colored +canvas = new_canvas("matplotlib") +canvas.cat(df, x="X").add_hist(bins=10, color="label") canvas.show() ``` @@ -92,8 +102,8 @@ dimension. ``` python #!name: cat_hist_along_x canvas = new_canvas("matplotlib") -# canvas.cat(df, x="label", y="value").add_hist(bins=10) # This will raise an error -canvas.cat(df, x="label", y="value").along_y().add_hist(bins=10) +# canvas.cat(df, x="label", y="X").add_hist(bins=10) # This will raise an error +canvas.cat(df, x="label", y="X").along_y().add_hist(bins=10) canvas.show() ``` @@ -102,6 +112,20 @@ KDE can be similarly added. 
``` python #!name: cat_kde_x canvas = new_canvas("matplotlib") -canvas.cat(df, x="value").add_kde() +canvas.cat(df, x="X").add_kde(color="label") canvas.show() ``` + +2-dimensional histogram can be added by `add_hist2d`. + +``` python +#!name: cat_hist2d +canvas = new_canvas("matplotlib") +canvas.cat(df, x="X", y="Y").add_hist2d(cmap=["white", "blue"], bins=(8, 10)) +canvas.show() +``` + +!!! note + `add_hist` and `add_hist2d` return completely different objects (histogram and + heatmap) and they are configured by different arguments. That's why `whitecanvas` + splits them into two different methods. diff --git a/whitecanvas/canvas/_base.py b/whitecanvas/canvas/_base.py index 2f7ba05e..71130748 100644 --- a/whitecanvas/canvas/_base.py +++ b/whitecanvas/canvas/_base.py @@ -309,6 +309,26 @@ def update_axes( self.y.label.color = color return self + def update_labels( + self, + title: str | None = None, + x: str | None = None, + y: str | None = None, + ) -> Self: + """ + Helper function to update the title, x, and y labels. + + >>> from whitecanvas import new_canvas + >>> canvas = new_canvas("matplotlib").update_labels("Title", "X", "Y") + """ + if title is not None: + self.title.text = title + if x is not None: + self.x.label.text = x + if y is not None: + self.y.label.text = y + return self + def cat( self, data: _DF, diff --git a/whitecanvas/canvas/dataframe/_one_cat.py b/whitecanvas/canvas/dataframe/_one_cat.py index 7b212b2e..1e479fe2 100644 --- a/whitecanvas/canvas/dataframe/_one_cat.py +++ b/whitecanvas/canvas/dataframe/_one_cat.py @@ -162,8 +162,7 @@ def add_violinplot( >>> canvas.cat_x(df, x="species", y="weight").add_violinplot() >>> ### Color by column "region" with dodging.
- >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_violinplot(offset, "weight", color="region") + >>> canvas.cat_x(df, "region", "weight").add_violinplot(dodge=True) Parameters ---------- @@ -181,7 +180,7 @@ def add_violinplot( Returns ------- - WrappedViolinPlot + DFViolinPlot Violin plot layer. """ canvas = self._canvas() @@ -207,26 +206,19 @@ def add_boxplot( Add a categorical box plot. >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_boxplot("species", "weight") + >>> canvas.cat_x(df, x="species", y="weight").add_boxplot() >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_boxplot(offset, "weight", color="region") + >>> canvas.cat_x(df, "region", "weight").add_boxplot(dodge=True) Parameters ---------- - offset : tuple of str - Column name(s) for x-axis. - value : str - Column name for y-axis. color : str or sequence of str, optional Column name(s) for coloring the lines. Must be categorical. hatch : str or sequence of str, optional Column name(s) for hatches. Must be categorical. name : str, optional Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". capsize : float, default 0.1 Length of the caps as a fraction of the width of the box. extent : float, default 0.8 @@ -234,7 +226,7 @@ def add_boxplot( Returns ------- - WrappedBoxPlot + DFBoxPlot Box plot layer. """ canvas = self._canvas() @@ -259,38 +251,31 @@ def add_pointplot( Add a categorical point plot (markers with error bars). >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_pointplot("species", "weight") + >>> canvas.cat_x(df, x="species", y="weight").add_pointplot() >>> ### Color by column "region" with dodging. 
- >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_pointplot(offset, "weight", color="region") + >>> canvas.cat_x(df, "region", "weight").add_pointplot(dodge=True) The default estimator and errors are mean and standard deviation. To change them, use `est_by_*` and `err_by_*` methods. >>> ### Use standard error x 2 (~95%) as error bars. - >>> canvas.cat(df).add_pointplot("species", "weight").err_by_se(scale=2.0) + >>> canvas.cat_x(df, "species", "weight").add_pointplot().err_by_se(scale=2.0) Parameters ---------- - offset : tuple of str - Column name(s) for x-axis. - value : str - Column name for y-axis. color : str or sequence of str, optional Column name(s) for coloring the lines. Must be categorical. hatch : str or sequence of str, optional Column name(s) for hatches. Must be categorical. name : str, optional Name of the layer. - orient : str, default "vertical" - Orientation of the violins. Can be "vertical" or "horizontal". capsize : float, default 0.1 Length of the caps as a fraction of the width of the box. Returns ------- - WrappedPointPlot + DFPointPlot Point plot layer. """ canvas = self._canvas() @@ -316,17 +301,16 @@ def add_barplot( Add a categorical bar plot (bars with error bars). >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_barplot("species", "weight") + >>> canvas.cat_x(df, x="species", y="weight").add_barplot() >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_barplot(offset, "weight", color="region") + >>> canvas.cat_x(df, "region", "weight").add_barplot(dodge=True) The default estimator and errors are mean and standard deviation. To change them, use `est_by_*` and `err_by_*` methods. >>> ### Use standard error x 2 (~95%) as error bars. 
- >>> canvas.cat(df).add_barplot("species", "weight").err_by_se(scale=2.0) + >>> canvas.cat_x(df, "species", "weight").add_barplot().err_by_se(scale=2.0) Parameters ---------- @@ -343,7 +327,7 @@ def add_barplot( Returns ------- - WrappedBarPlot + DFBarPlot Bar plot layer. """ canvas = self._canvas() @@ -378,11 +362,10 @@ def add_stripplot( Add a categorical strip plot. >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_stripplot("species", "weight") + >>> canvas.cat_x(df, x="species", y="weight").add_stripplot() >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_stripplot(offset, "weight", color="region") + >>> canvas.cat_x(df, "region", "weight").add_stripplot(dodge=True) Parameters ---------- @@ -403,7 +386,7 @@ def add_stripplot( Returns ------- - WrappedMarkerGroups + DFMarkerGroups Marker collection layer. """ canvas = self._canvas() @@ -438,6 +421,7 @@ def add_markers( size: str | None = None, dodge: NStr | bool = False, ) -> _lt.DFMarkerGroups[_DF]: + """Alias of `add_stripplot` with no jittering.""" return self.add_stripplot( color=color, hatch=hatch, symbol=symbol, size=size, dodge=dodge, extent=0, seed=0, name=name, @@ -459,11 +443,10 @@ def add_swarmplot( Add a categorical swarm plot. >>> ### Use "species" column as categories and "weight" column as values. - >>> canvas.cat(df).add_swarmplot("species", "weight") + >>> canvas.cat_x(df, x="species", y="weight").add_swarmplot() >>> ### Color by column "region" with dodging. - >>> offset = ["species", "region"] # categories that offset will be added - >>> canvas.cat(df).add_swarmplot(offset, "weight", color="region") + >>> canvas.cat_x(df, "region", "weight").add_swarmplot(dodge=True) Parameters ---------- @@ -484,7 +467,7 @@ def add_swarmplot( Returns ------- - WrappedMarkerGroups + DFMarkerGroups Marker collection layer. 
""" canvas = self._canvas() @@ -568,7 +551,7 @@ def add_line( """ Add line that connect the aggregated values. - >>> canvas.cat(df).mean().add_line("time", "value") + >>> canvas.cat_x(df).mean().add_line("time", "value") Parameters ---------- @@ -583,7 +566,7 @@ def add_line( Returns ------- - WrappedLines + DFLines Line collection layer. """ canvas = self._canvas() @@ -616,7 +599,7 @@ def add_markers( """ Add markers that represent the aggregated values. - >>> canvas.cat(df).mean().add_markers("time", "value") + >>> canvas.cat_x(df).mean().add_markers("time", "value") Parameters ---------- @@ -637,7 +620,7 @@ def add_markers( Returns ------- - WrappedMarkers + DFMarkers Marker collection layer. """ canvas = self._canvas() @@ -669,7 +652,7 @@ def add_bars( """ Add bars that represent the aggregated values. - >>> canvas.cat(df).mean().add_bars("time", "value") + >>> canvas.cat_x(df).mean().add_bars("time", "value") Parameters ---------- @@ -688,7 +671,7 @@ def add_bars( Returns ------- - WrappedBars + DFBars Bar collection layer. """ canvas = self._canvas() diff --git a/whitecanvas/layers/tabular/_dataframe.py b/whitecanvas/layers/tabular/_dataframe.py index b96e005c..50aaec3a 100644 --- a/whitecanvas/layers/tabular/_dataframe.py +++ b/whitecanvas/layers/tabular/_dataframe.py @@ -378,6 +378,28 @@ def with_shift(self, dx: float = 0.0, dy: float = 0.0) -> Self: canvas._autoscale_for_layer(self, pad_rel=0.025) return self + def as_edge_only( + self, + width: float = 3.0, + style: str | LineStyle = LineStyle.SOLID, + ) -> Self: + """ + Convert the markers to edge-only mode. + + This method will set the face color to transparent and the edge color to the + current face color. + + Parameters + ---------- + width : float, default 3.0 + Width of the edge. + style : str or LineStyle, default LineStyle.SOLID + Line style of the edge. 
+ """ + for layer in self.base.iter_children(): + layer.as_edge_only(width=width, style=style) + return self + class DFMarkerGroups(DFMarkers): def __init__(self, *args, orient: Orientation = Orientation.VERTICAL, **kwargs): diff --git a/whitecanvas/layers/tabular/_jitter.py b/whitecanvas/layers/tabular/_jitter.py index 23dcae4e..949cca3d 100644 --- a/whitecanvas/layers/tabular/_jitter.py +++ b/whitecanvas/layers/tabular/_jitter.py @@ -94,27 +94,41 @@ def __init__( self._extent = extent self._limits = limits + def _get_bins(self, src: DataFrameWrapper[_DF]) -> int: + return 25 # just for now + def map(self, src: DataFrameWrapper[_DF]) -> NDArray[np.floating]: values = src[self._value] vmin, vmax = self._limits - nbin = 25 + nbin = self._get_bins(src) dv = (vmax - vmin) / nbin + # bin index that each value belongs to v_indices = np.floor((values - vmin) / dv).astype(np.int32) v_indices[v_indices == nbin] = nbin - 1 + + args = [src[b] for b in self._by] + offset_pre = np.zeros(len(src), dtype=np.float32) + for row in self._mapping.keys(): + sl = np.all(np.column_stack([a == r for a, r in zip(args, row)]), axis=1) + offset_pre[sl] = self._map_one(v_indices[sl], nbin) + + offset_max = np.abs(offset_pre).max() + width_default = dv * offset_max + offsets = offset_pre / offset_max * min(self._extent / 2, width_default) + out = self._map(src) + offsets + return out + + def _map_one(self, indices: NDArray[np.int32], nbin: int) -> NDArray[np.floating]: offset_count = np.zeros(nbin, dtype=np.int32) - offset_pre = np.zeros_like(values, dtype=np.int32) - for i, idx in enumerate(v_indices): + offset_pre = np.zeros_like(indices, dtype=np.int32) + for i, idx in enumerate(indices): c = offset_count[idx] if c % 2 == 0: offset_pre[i] = c / 2 else: offset_pre[i] = -(c + 1) / 2 offset_count[idx] += 1 - offset_max = np.abs(offset_pre).max() - width_default = dv * offset_max - offsets = offset_pre / offset_max * min(self._extent / 2, width_default) - out = self._map(src) + offsets - 
return out + return offset_pre def _tuple(x) -> tuple[str, ...]: From 48ea3a8519f53039ed1f7f93baffc473b4f3b457 Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Mon, 5 Feb 2024 18:13:17 +0900 Subject: [PATCH 09/11] fix with_face_multi --- whitecanvas/layers/_mixin.py | 2 +- whitecanvas/layers/_primitive/bars.py | 2 +- whitecanvas/layers/_primitive/markers.py | 2 +- whitecanvas/layers/group/labeled.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/whitecanvas/layers/_mixin.py b/whitecanvas/layers/_mixin.py index 22a4771b..0ea3f595 100644 --- a/whitecanvas/layers/_mixin.py +++ b/whitecanvas/layers/_mixin.py @@ -640,7 +640,7 @@ def with_edge( def with_face_multi( self, color: ColorType | Sequence[ColorType] | _Void = _void, - hatch: str | Hatch | Sequence[str | Hatch] = Hatch.SOLID, + hatch: str | Hatch | Sequence[str | Hatch] | _Void = _void, alpha: float = 1, ) -> Self: if not isinstance(self._face_namespace, MultiFace): diff --git a/whitecanvas/layers/_primitive/bars.py b/whitecanvas/layers/_primitive/bars.py index 5ad8dc29..e19a9171 100644 --- a/whitecanvas/layers/_primitive/bars.py +++ b/whitecanvas/layers/_primitive/bars.py @@ -386,7 +386,7 @@ def with_face( def with_face_multi( self, color: ColorType | Sequence[ColorType] | _Void = _void, - hatch: str | Hatch | Sequence[str | Hatch] = Hatch.SOLID, + hatch: str | Hatch | Sequence[str | Hatch] | _Void = _void, alpha: float = 1, ) -> Bars[MultiFace, _Edge]: return super().with_face_multi(color, hatch, alpha) diff --git a/whitecanvas/layers/_primitive/markers.py b/whitecanvas/layers/_primitive/markers.py index 484d609f..17504742 100644 --- a/whitecanvas/layers/_primitive/markers.py +++ b/whitecanvas/layers/_primitive/markers.py @@ -616,7 +616,7 @@ def with_face_multi( self, *, color: ColorType | Sequence[ColorType] | _Void = _void, - hatch: str | Hatch | Sequence[str | Hatch] = Hatch.SOLID, + hatch: str | Hatch | Sequence[str | Hatch] | _Void = _void, alpha: float = 1, ) -> Markers[MultiFace, 
_Edge, _Size]: """ diff --git a/whitecanvas/layers/group/labeled.py b/whitecanvas/layers/group/labeled.py index 3ca144c7..ba420a94 100644 --- a/whitecanvas/layers/group/labeled.py +++ b/whitecanvas/layers/group/labeled.py @@ -323,7 +323,7 @@ def __init__( self._init_events() @property - def bars(self) -> Bars: + def bars(self) -> Bars[_NFace, _NEdge]: """The bars layer.""" return self._children[0] From f02e49544cfe284e02aa1121985f0e5ee2f2f102 Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Mon, 5 Feb 2024 19:26:23 +0900 Subject: [PATCH 10/11] dodging in agg methods --- whitecanvas/canvas/dataframe/_one_cat.py | 75 ++++++++++----------- whitecanvas/layers/group/line_collection.py | 2 +- whitecanvas/layers/tabular/_box_like.py | 60 ++++++----------- whitecanvas/layers/tabular/_dataframe.py | 49 +++----------- whitecanvas/layers/tabular/_shared.py | 45 +++++++++++++ whitecanvas/layers/tabular/_utils.py | 17 ----- 6 files changed, 112 insertions(+), 136 deletions(-) diff --git a/whitecanvas/canvas/dataframe/_one_cat.py b/whitecanvas/canvas/dataframe/_one_cat.py index 1e479fe2..18b4c177 100644 --- a/whitecanvas/canvas/dataframe/_one_cat.py +++ b/whitecanvas/canvas/dataframe/_one_cat.py @@ -394,7 +394,9 @@ def add_stripplot( size = theme._default("markers.size", size) df = self._df - splitby, dodge = _splitby_dodge(df, self._offset, color, hatch, dodge) + splitby, dodge = _shared.norm_dodge_markers( + df, self._offset, color, hatch, dodge + ) # fmt: skip _map = self._cat_iter.prep_position_map(splitby, dodge) _extent = self._cat_iter.zoom_factor(dodge) * extent xj = _jitter.UniformJitter(splitby, _map, extent=_extent, seed=seed) @@ -474,7 +476,9 @@ def add_swarmplot( symbol = theme._default("markers.symbol", symbol) size = theme._default("markers.size", size) df = self._df - splitby, dodge = _splitby_dodge(df, self._offset, color, hatch, dodge) + splitby, dodge = _shared.norm_dodge_markers( + df, self._offset, color, hatch, dodge + ) # fmt: skip _map = 
self._cat_iter.prep_position_map(splitby, dodge) _extent = self._cat_iter.zoom_factor(dodge) * extent @@ -545,8 +549,9 @@ def add_line( *, name: str | None = None, color: NStr | None = None, - width: str | None = None, + width: float | None = None, style: NStr | None = None, + dodge: NStr | bool = False, ) -> _lt.DFLines[_DF]: """ Add line that connect the aggregated values. @@ -559,8 +564,8 @@ def add_line( Name of the layer. color : str or sequence of str, optional Column name(s) for coloring the lines. Must be categorical. - width : str, optional - Column name for line width. Must be numerical. + width : float, optional + Line width. style : str or sequence of str, optional Column name(s) for styling the lines. Must be categorical. @@ -569,11 +574,16 @@ def add_line( DFLines Line collection layer. """ + # TODO: support width: str canvas = self._canvas() df = self._df - _joined = _shared.join_columns(self._offset, color, style, source=df) - df_agg = self._aggregate(df, _joined, self._value) - xj = _jitter.CategoricalJitter(self._offset, self._cat_iter.category_map()) + width = theme._default("line.width", width) + + _splitby, _dodge = _shared.norm_dodge(df, self._offset, color, dodge=dodge) + df_agg = self._aggregate(df, _splitby, self._value) + _pos_map = self._cat_iter.prep_position_map(_splitby, dodge=_dodge) + + xj = _jitter.CategoricalJitter(_splitby, _pos_map) yj = _jitter.IdentityJitter(self._value).check(df_agg) if not self._orient.is_vertical: xj, yj = yj, xj @@ -595,6 +605,7 @@ def add_markers( hatch: NStr | Hatch | None = None, size: str | float | None = None, symbol: NStr | Symbol | None = None, + dodge: NStr | bool = False, ) -> _lt.DFMarkers[_DF]: """ Add markers that represent the aggregated values. 
@@ -625,9 +636,13 @@ def add_markers( """ canvas = self._canvas() df = self._df - _joined = _shared.join_columns(self._offset, color, hatch, symbol, source=df) - df_agg = self._aggregate(df, _joined, self._value) - xj = _jitter.CategoricalJitter(self._offset, self._cat_iter.category_map()) + _splitby, _dodge = _shared.norm_dodge( + df, self._offset, color, hatch, symbol, dodge=dodge + ) # fmt: skip + df_agg = self._aggregate(df, _splitby, self._value) + _pos_map = self._cat_iter.prep_position_map(_splitby, dodge=_dodge) + + xj = _jitter.CategoricalJitter(_splitby, _pos_map) yj = _jitter.IdentityJitter(self._value).check(df_agg) if not self._orient.is_vertical: xj, yj = yj, xj @@ -648,6 +663,7 @@ def add_bars( color: NStr | ColorType | None = None, hatch: NStr | Hatch | None = None, extent: float = 0.8, + dodge: NStr | bool = True, ) -> _lt.DFBars[_DF]: """ Add bars that represent the aggregated values. @@ -676,14 +692,20 @@ def add_bars( """ canvas = self._canvas() df = self._df - _joined = _shared.join_columns(self._offset, color, hatch, source=df) - df_agg = self._aggregate(df, _joined, self._value) - xj = _jitter.CategoricalJitter(self._offset, self._cat_iter.category_map()) + _splitby, _dodge = _shared.norm_dodge( + df, self._offset, color, hatch, dodge=dodge + ) # fmt: skip + df_agg = self._aggregate(df, _splitby, self._value) + _pos_map = self._cat_iter.prep_position_map(_splitby, dodge=_dodge) + + xj = _jitter.CategoricalJitter(_splitby, _pos_map) yj = _jitter.IdentityJitter(self._value).check(df_agg) + + _extent = self._cat_iter.zoom_factor(_dodge) * extent if not self._orient.is_vertical: xj, yj = yj, xj layer = _lt.DFBars.from_table( - df_agg, xj, yj, name=name, color=color, hatch=hatch, extent=extent, + df_agg, xj, yj, name=name, color=color, hatch=hatch, extent=_extent, backend=canvas._get_backend(), ) # fmt: skip if color is not None and not layer._color_by.is_const(): @@ -714,26 +736,3 @@ class XCatPlotter(OneAxisCatPlotter[_C, _DF]): class 
YCatPlotter(OneAxisCatPlotter[_C, _DF]): _orient = Orientation.HORIZONTAL - - -def _splitby_dodge( - source: DataFrameWrapper[_DF], - offset: str | tuple[str, ...], - color: str | tuple[str, ...] | None = None, - hatch: str | tuple[str, ...] | None = None, - dodge: str | tuple[str, ...] | bool = False, -) -> tuple[tuple[str, ...], tuple[str, ...]]: - if isinstance(offset, str): - offset = (offset,) - if isinstance(dodge, bool): - if dodge: - _all = _shared.join_columns(color, hatch, source=source) - dodge = tuple(c for c in _all if c not in offset) - else: - dodge = () - elif isinstance(dodge, str): - dodge = (dodge,) - else: - dodge = tuple(dodge) - splitby = _shared.join_columns(offset, dodge, source=source) - return splitby, dodge diff --git a/whitecanvas/layers/group/line_collection.py b/whitecanvas/layers/group/line_collection.py index 031d7356..9b1b39ec 100644 --- a/whitecanvas/layers/group/line_collection.py +++ b/whitecanvas/layers/group/line_collection.py @@ -49,7 +49,7 @@ def width(self) -> NDArray[np.float32]: @width.setter def width(self, width: float | Sequence[float]): - if isinstance(width, float): + if isinstance(width, (int, float, np.number)): _width = [width] * len(self) else: _width = np.asarray(width, dtype=np.float32) diff --git a/whitecanvas/layers/tabular/_box_like.py b/whitecanvas/layers/tabular/_box_like.py index 12965607..6925925f 100644 --- a/whitecanvas/layers/tabular/_box_like.py +++ b/whitecanvas/layers/tabular/_box_like.py @@ -31,42 +31,19 @@ _DF = TypeVar("_DF") -def _splitby_dodge( - source: DataFrameWrapper[_DF], - offset: str | tuple[str, ...], - color: str | tuple[str, ...] | None = None, - hatch: str | tuple[str, ...] | None = None, - dodge: str | tuple[str, ...] 
| bool = False, -) -> tuple[tuple[str, ...], tuple[str, ...]]: - if isinstance(offset, str): - offset = (offset,) - if isinstance(dodge, bool): - if dodge: - _all = _shared.join_columns(color, hatch, source=source) - dodge = tuple(c for c in _all if c not in offset) - else: - dodge = () - elif isinstance(dodge, str): - dodge = (dodge,) - else: - dodge = tuple(dodge) - splitby = _shared.join_columns(offset, color, hatch, dodge, source=source) - return splitby, dodge - - def _norm_color_hatch( color, hatch, - cat: CatIterator[_DF], + df: DataFrameWrapper[_DF], ) -> tuple[_p.ColorPlan, _p.HatchPlan]: - color_cov = _shared.ColumnOrValue(color, cat.df) + color_cov = _shared.ColumnOrValue(color, df) if color_cov.is_column: color_by = _p.ColorPlan.from_palette(color_cov.columns) elif color_cov.value is not None: color_by = _p.ColorPlan.from_const(Color(color_cov.value)) else: color_by = _p.ColorPlan.default() - hatch_cov = _shared.ColumnOrValue(hatch, cat.df) + hatch_cov = _shared.ColumnOrValue(hatch, df) if hatch_cov.is_column: hatch_by = _p.HatchPlan.new(hatch_cov.columns) elif hatch_cov.value is not None: @@ -155,10 +132,12 @@ def __init__( shape: str = "both", backend: str | Backend | None = None, ): - _splitby, dodge = _splitby_dodge(cat.df, cat.offsets, color, hatch, dodge) + _splitby, dodge = _shared.norm_dodge( + cat.df, cat.offsets, color, hatch, dodge=dodge + ) # fmt: skip x, arr, categories = cat.prep_arrays(_splitby, value, dodge=dodge) _extent = cat.zoom_factor(dodge=dodge) * extent - color_by, hatch_by = _norm_color_hatch(color, hatch, cat) + color_by, hatch_by = _norm_color_hatch(color, hatch, cat.df) base = _lg.ViolinPlot.from_arrays( x, arr, name=name, orient=orient, shape=shape, extent=_extent, backend=backend, @@ -171,10 +150,7 @@ def orient(self) -> Orientation: """Orientation of the violins.""" return self._base_layer.orient - def with_shift( - self, - shift: float = 0.0, - ) -> Self: + def with_shift(self, shift: float = 0.0) -> Self: for layer in 
self._base_layer: _old = layer.data layer.set_data(edge_low=_old.y0 + shift, edge_high=_old.y1 + shift) @@ -199,11 +175,13 @@ def __init__( capsize: float = 0.1, backend: str | Backend | None = None, ): - _splitby, dodge = _splitby_dodge(cat.df, cat.offsets, color, hatch, dodge) + _splitby, dodge = _shared.norm_dodge( + cat.df, cat.offsets, color, hatch, dodge=dodge, + ) # fmt: skip x, arr, categories = cat.prep_arrays(_splitby, value, dodge=dodge) _extent = cat.zoom_factor(dodge=dodge) * extent _capsize = cat.zoom_factor(dodge=dodge) * capsize - color_by, hatch_by = _norm_color_hatch(color, hatch, cat) + color_by, hatch_by = _norm_color_hatch(color, hatch, cat.df) base = _lg.BoxPlot.from_arrays( x, arr, name=name, orient=orient, capsize=_capsize, extent=_extent, backend=backend, @@ -314,10 +292,12 @@ def __init__( capsize: float = 0.1, backend: str | Backend | None = None, ): - _splitby, dodge = _splitby_dodge(cat.df, cat.offsets, color, hatch, dodge) + _splitby, dodge = _shared.norm_dodge( + cat.df, cat.offsets, color, hatch, dodge=dodge, + ) # fmt: skip x, arr, categories = cat.prep_arrays(_splitby, value, dodge=dodge) _capsize = cat.zoom_factor(dodge=dodge) * capsize - color_by, hatch_by = _norm_color_hatch(color, hatch, cat) + color_by, hatch_by = _norm_color_hatch(color, hatch, cat.df) base = _lg.LabeledPlot.from_arrays( x, arr, name=name, orient=orient, capsize=_capsize, backend=backend, ) # fmt: skip @@ -362,7 +342,7 @@ def _set_error_values(self, err_low, err_high): class DFBarPlot( - _shared.DataFrameLayerWrapper[_lg.LabeledBars, _DF], _BoxLikeMixin, Generic[_DF] + _shared.DataFrameLayerWrapper[_lg.LabeledBars, _DF], _EstimatorMixin, Generic[_DF] ): def __init__( self, @@ -377,11 +357,13 @@ def __init__( extent: float = 0.8, backend: str | Backend | None = None, ): - _splitby, dodge = _splitby_dodge(cat.df, cat.offsets, color, hatch, dodge) + _splitby, dodge = _shared.norm_dodge( + cat.df, cat.offsets, color, hatch, dodge=dodge, + ) # fmt: skip x, arr, 
categories = cat.prep_arrays(_splitby, value, dodge=dodge) _extent = cat.zoom_factor(dodge=dodge) * extent _capsize = cat.zoom_factor(dodge=dodge) * capsize - color_by, hatch_by = _norm_color_hatch(color, hatch, cat) + color_by, hatch_by = _norm_color_hatch(color, hatch, cat.df) base = _lg.LabeledBars.from_arrays( x, arr, name=name, orient=orient, capsize=_capsize, extent=_extent, backend=backend, diff --git a/whitecanvas/layers/tabular/_dataframe.py b/whitecanvas/layers/tabular/_dataframe.py index 50aaec3a..3148428d 100644 --- a/whitecanvas/layers/tabular/_dataframe.py +++ b/whitecanvas/layers/tabular/_dataframe.py @@ -91,43 +91,6 @@ def from_table( backend=backend, ) # fmt: skip - @classmethod - def build_kde( - cls, - df: _DF, - value: str, - band_width: float | None = None, - color: str | None = None, - width: str | None = None, - style: str | None = None, - name: str | None = None, - orient: str | Orientation = Orientation.VERTICAL, - backend: str | Backend | None = None, - ) -> DFLines[_DF]: - from whitecanvas.utils.kde import gaussian_kde - - src = parse(df) - splitby = _shared.join_columns(color, style, source=src) - ori = Orientation.parse(orient) - segs = [] - labels: list[tuple[Any, ...]] = [] - for sl, df in src.group_by(splitby): - labels.append(sl) - each = df[value] - kde = gaussian_kde(each, bw_method=band_width) - sigma = np.sqrt(kde.covariance[0, 0]) - pad = sigma * 2.5 - x = np.linspace(each.min() - pad, each.max() + pad, 100) - y = kde(x) - if ori.is_vertical: - segs.append(np.column_stack([x, y])) - else: - segs.append(np.column_stack([y, x])) - return DFLines( - src, segs, labels, name=name, color=color, width=width, style=style, - backend=backend, - ) # fmt: skip - @overload def with_color(self, value: ColorType) -> Self: ... 
@@ -476,11 +439,15 @@ def from_table( yj = y else: yj = _jitter.IdentityJitter(y) - xs = [] - ys = [] + xs: list[np.ndarray] = [] + ys: list[np.ndarray] = [] for _, sub in df.group_by(splitby): - xs.append(xj.map(sub)) - ys.append(yj.map(sub)) + xcur = xj.map(sub) + ycur = yj.map(sub) + order = np.argsort(xcur) + xs.append(xcur[order]) + ys.append(ycur[order]) + # BUG: order of coloring and x/y do not match x0 = np.concatenate(xs) y0 = np.concatenate(ys) return DFBars( diff --git a/whitecanvas/layers/tabular/_shared.py b/whitecanvas/layers/tabular/_shared.py index e4cd61b0..cf5bd344 100644 --- a/whitecanvas/layers/tabular/_shared.py +++ b/whitecanvas/layers/tabular/_shared.py @@ -82,3 +82,48 @@ def join_columns( def unique_tuple(a: tuple[str, ...], b: tuple[str, ...]) -> tuple[str, ...]: b_filt = tuple(x for x in b if x not in a) return a + b_filt + + +def norm_dodge( + source: DataFrameWrapper[_DF], + offset: str | tuple[str, ...], + *args: str | tuple[str, ...] | None, + dodge: str | tuple[str, ...] | bool = False, +) -> tuple[tuple[str, ...], tuple[str, ...]]: + if isinstance(offset, str): + offset = (offset,) + if isinstance(dodge, bool): + if dodge: + _all = join_columns(*args, source=source) + dodge = tuple(c for c in _all if c not in offset) + else: + dodge = () + elif isinstance(dodge, str): + dodge = (dodge,) + else: + dodge = tuple(dodge) + splitby = join_columns(offset, *args, dodge, source=source) + return splitby, dodge + + +def norm_dodge_markers( + source: DataFrameWrapper[_DF], + offset: str | tuple[str, ...], + color: str | tuple[str, ...] | None = None, + hatch: str | tuple[str, ...] | None = None, + dodge: str | tuple[str, ...] 
| bool = False, +) -> tuple[tuple[str, ...], tuple[str, ...]]: + if isinstance(offset, str): + offset = (offset,) + if isinstance(dodge, bool): + if dodge: + _all = join_columns(color, hatch, source=source) + dodge = tuple(c for c in _all if c not in offset) + else: + dodge = () + elif isinstance(dodge, str): + dodge = (dodge,) + else: + dodge = tuple(dodge) + splitby = join_columns(offset, dodge, source=source) + return splitby, dodge diff --git a/whitecanvas/layers/tabular/_utils.py b/whitecanvas/layers/tabular/_utils.py index 145cc00d..33d70bdc 100644 --- a/whitecanvas/layers/tabular/_utils.py +++ b/whitecanvas/layers/tabular/_utils.py @@ -1,7 +1,5 @@ from __future__ import annotations -import itertools - import numpy as np @@ -11,18 +9,3 @@ def unique(arr: np.ndarray, axis=0) -> np.ndarray: raise ValueError(f"Cannot handle {arr.dtype} in unique().") _, idx = np.unique(arr, axis=axis, return_index=True) return arr[np.sort(idx)] - - -def unique_product(each_unique: list[np.ndarray]) -> np.ndarray: - """ - Return the all the unique combinations of the given arrays. 
- - >>> unique_product([np.array([0, 1, 2]), np.array([3, 4])]) - array([[0, 3], - [0, 4], - [1, 3], - [1, 4], - [2, 3], - [2, 4]]) - """ - return np.array(list(itertools.product(*each_unique))) From f61403f083bc108e4af313f182af5a6418d5ef94 Mon Sep 17 00:00:00 2001 From: Hanjin Liu Date: Mon, 5 Feb 2024 20:36:40 +0900 Subject: [PATCH 11/11] fix doc generation --- docs/_scripts/_hooks.py | 4 +--- docs/categorical/cat_num.md | 48 +++++++++++++++++++++++++++++++------ 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/docs/_scripts/_hooks.py b/docs/_scripts/_hooks.py index cb7355d4..ac97232e 100644 --- a/docs/_scripts/_hooks.py +++ b/docs/_scripts/_hooks.py @@ -26,8 +26,6 @@ def _add_images(matchobj: re.Match[str]) -> str: return "```python\n" + other + "\n```" code, name = _get_image_name(code) - dest = f"_images/{name}.png" - code, width = _get_image_width(code) reldepth = "../" * page.file.src_path.count(os.sep) @@ -36,7 +34,7 @@ def _add_images(matchobj: re.Match[str]) -> str: new_md = "```python\n" + code + "\n```" + link return new_md - md = re.sub("``` ?python\n([^`]*)```", _add_images, md, re.DOTALL) + md = re.sub("``` ?python\n([^`]*)```", _add_images, md, flags=re.DOTALL) return md diff --git a/docs/categorical/cat_num.md b/docs/categorical/cat_num.md index 69f9ef50..201c6b8b 100644 --- a/docs/categorical/cat_num.md +++ b/docs/categorical/cat_num.md @@ -10,7 +10,7 @@ rng = np.random.default_rng(12345) df = { "category": ["A"] * 40 + ["B"] * 50, "observation": np.concatenate([rng.random(40), rng.random(50) + 1.3]), - "replicate": [0] * 20 + [1] * 20 + [0] * 25 + [1] * 25, + "replicate": [0] * 23 + [1] * 17 + [0] * 22 + [1] * 28, "temperature": rng.normal(scale=2.8, size=90) + 22.0, } ``` @@ -182,12 +182,13 @@ canvas = new_canvas("matplotlib") As for the `Markers` layer, `as_edge_only` will convert the face features to the edge features. 
``` python -#!name: categorical_axis_stripplot_dodge +#!name: categorical_axis_stripplot_dodge_edge_only canvas = new_canvas("matplotlib") ( canvas .cat_x(df, x="category", y="observation") .add_stripplot(color="replicate", dodge=True) + .as_edge_only(width=2) ) ``` @@ -218,14 +219,30 @@ canvas = new_canvas("matplotlib") ) ``` +A swarm plot is another way to visualize all the data points with markers. + +``` python +#!name: categorical_axis_swarmplot +canvas = new_canvas("matplotlib") +( + canvas + .cat_x(df, x="category", y="observation") + .add_swarmplot(sort=True) + .with_colormap("temperature", cmap="coolwarm") +) +``` + ## Aggregation -Showing both all the data points and the aggregated data is a common way to efficiently -visualize the data. This task is usually done by the module specific group-by methods, -but `whitecanvas` provides a built-in method to simplify the process. +Showing the aggregated data is a common way to efficiently visualize a lot of data. This +task is usually done by the module-specific group-by methods, but `whitecanvas` provides +a built-in method to simplify the process. + +In the following example, `mean()` is used to prepare a mean-aggregated plotter, which has +an `add_markers` method to add the mean markers to the plotter. ``` python -#!name: categorical_axis_stripplot_and_agg +#!name: categorical_axis_stripplot_and_agg_mean canvas = new_canvas("matplotlib") # create a categorical plotter @@ -239,7 +256,24 @@ cat_plt.mean().add_markers(color="category", size=20) canvas.show() ``` -Count plot. +Similar `add_*` methods include `add_line()` and `add_bars()`.
+ +``` python +#!name: categorical_axis_stripplot_and_agg_line +canvas = new_canvas("matplotlib") + +# create a categorical plotter +cat_plt = canvas.cat_x(df, x="category", y="observation") + +# plot all the data +cat_plt.add_stripplot(color="category") +# plot the mean +cat_plt.mean().add_line(width=3, color="black") + +canvas.show() +``` + +A count plot is a special case of aggregation. Use `count()` to create the plotter. ``` python #!name: categorical_axis_countplot