Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable calc_missing_rate to work with pd.DataFrame #38

Merged
merged 4 commits into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,17 @@ or install from source code:

```python
import numpy as np
from pygrinder import mcar, mar_logistic, mnar_x, mnar_t

from pygrinder import (
mcar,
mar_logistic,
mnar_x,
mnar_t,
rdo,
seq_missing,
block_missing,
calc_missing_rate
)

# given a time-series dataset with 128 samples, each sample with 10 time steps and 36 data features
ts_dataset = np.random.randn(128, 10, 36)
Expand All @@ -87,11 +97,29 @@ X_with_mar_data = mar_logistic(ts_dataset[:, 0, :], obs_rate=0.1, missing_rate=0

# grind the dataset with MNAR pattern
X_with_mnar_x_data = mnar_x(ts_dataset, offset=0.1)
X_with_mnar_t_data = mnar_t(ts_dataset, cycle=20, pos = 10, scale = 3)
X_with_mnar_t_data = mnar_t(ts_dataset, cycle=20, pos=10, scale=3)

# grind the dataset with RDO pattern
X_with_rdo_data = rdo(ts_dataset, p=0.1)

# grind the dataset with Sequence-Missing pattern
X_with_seq_missing_data = seq_missing(ts_dataset, p=0.1, seq_len=5)

# grind the dataset with Block-Missing pattern
X_with_block_missing_data = block_missing(ts_dataset, factor=0.1, block_width=3, block_len=3)

# calculate the missing rate of the dataset
missing_rate = calc_missing_rate(X_with_mcar_data)
```


## ❖ Citing PyGrinder/PyPOTS
<p align="center">
<a href="https://github.com/WenjieDu/PyPOTS">
<img src="https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png" width="95%"/>
</a>
</p>

The paper introducing PyPOTS is available [on arXiv](https://arxiv.org/abs/2305.18811),
and a short version of it was accepted by the 9th SIGKDD International Workshop on Mining and Learning from Time Series ([MiLeTS'23](https://kdd-milets.github.io/milets2023/)).
**Additionally**, PyPOTS has been included as a [PyTorch Ecosystem](https://pytorch.org/ecosystem/) project.
Expand All @@ -102,12 +130,6 @@ please cite it as below and 🌟star this repository to make others notice this
There are scientific research projects using PyPOTS and referencing it in their papers.
Here is [an incomplete list of them](https://scholar.google.com/scholar?as_ylo=2022&q=%E2%80%9CPyPOTS%E2%80%9D&hl=en).

<p align="center">
<a href="https://github.com/WenjieDu/PyPOTS">
<img src="https://pypots.com/figs/pypots_logos/Ecosystem/PyPOTS_Ecosystem_Pipeline.png" width="95%"/>
</a>
</p>

``` bibtex
@article{du2023pypots,
title={{PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series}},
Expand All @@ -117,9 +139,9 @@ year={2023},
}
```
or
> Wenjie Du. (2023).
> Wenjie Du.
> PyPOTS: a Python toolbox for data mining on Partially-Observed Time Series.
> arXiv, abs/2305.18811. https://arxiv.org/abs/2305.18811
> arXiv, abs/2305.18811, 2023.


<details>
Expand Down
2 changes: 1 addition & 1 deletion pygrinder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#
# Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
# 'X.Y.dev0' is the canonical version of 'X.Y.dev'
__version__ = "0.6"
__version__ = "0.6.1"

from .missing_at_random import mar_logistic
from .missing_completely_at_random import mcar, mcar_little_test
Expand Down
32 changes: 30 additions & 2 deletions pygrinder/block_missing/block_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,40 @@ def block_missing(
feature_idx: list = None,
step_idx: list = None,
) -> Union[np.ndarray, torch.Tensor]:
"""Create block missing data.

Parameters
----------
X :
Data vector. If X has any missing values, they should be numpy.nan.

factor :
The actual missing rate of block_missing is hard to control strictly.
Hence, we use ``factor`` to help adjust the final missing rate.

block_len :
The length of the mask block.

block_width :
The width of the mask block.

feature_idx :
The indices of features for a missing block to start with.

step_idx :
The indices of steps for a missing block to start with.

Returns
-------
corrupted_X :
Original X with artificial missing values.
Both originally-missing and artificially-missing values are left as NaN.

"""
if isinstance(X, list):
X = np.asarray(X)
n_samples, n_steps, n_features = X.shape

# assert 0 < p <= 1, f"p must be in range (0, 1), but got {p}"

assert isinstance(
block_len, int
), f"`block_len` must be type of int, but got {type(block_len)}"
Expand Down
26 changes: 26 additions & 0 deletions pygrinder/sequential_missing/seq_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,32 @@ def seq_missing(
feature_idx: list = None,
step_idx: list = None,
) -> Union[np.ndarray, torch.Tensor]:
"""Create subsequence missing data.

Parameters
----------
X :
Data vector. If X has any missing values, they should be numpy.nan.

p :
The probability that values may be masked as missing completely at random.

seq_len :
The length of missing sequence.

feature_idx :
The indices of features for missing sequences to be corrupted.

step_idx :
The indices of steps for a missing sequence to start with.

Returns
-------
corrupted_X :
Original X with artificial missing values.
Both originally-missing and artificially-missing values are left as NaN.

"""
if isinstance(X, list):
X = np.asarray(X)
n_samples, n_steps, n_features = X.shape
Expand Down
21 changes: 13 additions & 8 deletions pygrinder/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,42 @@
from typing import Union, Tuple

import numpy as np
import pandas as pd
import torch


def calc_missing_rate(X: Union[np.ndarray, torch.Tensor]) -> float:
def calc_missing_rate(
X: Union[np.ndarray, torch.Tensor, pd.DataFrame],
) -> float:
"""Calculate the originally missing rate of the raw data.

Parameters
----------
X:
Data array that may contain missing values.
Data array/tensor/frame that may contain missing values.

Returns
-------
originally_missing_rate,
missing_rate :
The originally missing rate of the raw data. Its value should be in the range [0,1].

"""
if isinstance(X, list):
X = np.asarray(X)

if isinstance(X, np.ndarray):
originally_missing_rate = np.sum(np.isnan(X)) / np.prod(X.shape)
missing_rate = np.sum(np.isnan(X)) / np.prod(X.shape)
elif isinstance(X, torch.Tensor):
originally_missing_rate = torch.sum(torch.isnan(X)) / np.prod(X.shape)
originally_missing_rate = originally_missing_rate.item()
missing_rate = torch.sum(torch.isnan(X)) / np.prod(X.shape)
missing_rate = missing_rate.item()
elif isinstance(X, pd.DataFrame):
missing_rate = pd.isna(X).sum().sum() / np.prod(X.shape)
else:
raise TypeError(
f"X must be type of list/numpy.ndarray/torch.Tensor, but got {type(X)}"
f"X must be type of list/numpy.ndarray/torch.Tensor/pandas.DataFrame, but got {type(X)}"
)

return originally_missing_rate
return missing_rate


def masked_fill(
Expand Down
Loading