diff --git a/AUTHORS.rst b/AUTHORS.rst index 703e073..0379962 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -16,12 +16,12 @@ Authors * Sayan Patra * Yi Su * Rachit Arora -* Brian Vegetabile -* Qiang Fei -* Phil Gaudreau -* Yi-Wei Liu Other Contributors ------------------ +* Qiang Fei * Saad Eddin Al Orjany * Rachit Kumar +* Phil Gaudreau +* Yi-Wei Liu +* Katherine Li diff --git a/HISTORY.rst b/HISTORY.rst index 66a6876..dfb6825 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -2,10 +2,41 @@ History ======= +1.0.0 (2024-01-07) +------------------ + +* Greykite AD (Anomaly Detection) is now available. +It improves upon the out-of-box confidence intervals generated by Silverkite, by automatically tuning the confidence intervals +and other filters (e.g. based on ``Absolute Percentage Error (APE)``) using expected alert rate information and/ or anomaly labels, if available. +It allows the users to define robust objective function, constraints and parameter space to optimize the confidence intervals. +For example user can target a minimal recall level of 80% while maximizing precision. Additionally, the users can specify a +minimum error level to filter out anomalies that are not business relevant. The motivation to include criteria other than +statistical significance is to bake in material/ business impact into the detection. + + * @Reza Hosseini: Devised the core anomaly detection library structure. Added base ``Detector`` module. + * @Reza Hosseini: Added `~greykite.detection.detector.reward.Reward` that allows users to specify and optimize robust anomaly detection objectives. + * @Sayan Patra: Added ``GreykiteDetector`` module that builds anomaly detection based on Greykite forecasting. + * @Sayan Patra: Added tutorials for Greykite anomaly detection. + +* New features and methods + * @Reza Hosseini: Added `~greykite.common.features.outlier.ZScoreOutlierDetector` and `~greykite.common.features.outlier.TukeyOutlierDetector`, improved outlier detection modules. 
+ * @Sayan Patra: Added `~greykite.detection.common.pickler.GreykitePickler`. This improves the pickling function for Greykite models and allows storing the model in a single file. + * @Yi-Wei Liu: Added ``DifferenceBasedOutlierTransformer`` that can identify outliers in the ``sklearn`` pipeline. + +* Library enhancements + * @Kaixu Yang: Added ``scipy`` solver to make quantile regression more stable. + * @Qiang Fei: Updated ``auto_holiday`` functionality to use holiday groupers for improved forecast performance in holiday periods. + * @Katherine Li: Improved changepoint detection method that can identify level shifts. + +* Bug fixes + * @Reza Hosseini @Sayan Patra @Yi Su @Qiang Fei @Kaixu Yang @Phil Gaudreau: Other library enhancements and bug fixes. + + 0.5.1 (2023-06-01) ------------------ -Loosen dill requirements +Loosen dill package requirements. + 0.5.0 (2023-04-03) ------------------ @@ -32,6 +63,7 @@ Python 3.10 support. * @Yi Su, @Sayan Patra: Now ``train_end_date`` is always respected if specified by the user. Previously it got ignored if there are trailing NA’s in training data or ``anomaly_df`` imputes the anomalous points to NA. Also, now ``train_end_date`` accepts a string value. * @Yi Su: The seasonality order now takes `None` without raising an error. It will be treated the same as `False` or zero. + 0.4.0 (2022-07-15) ------------------ @@ -41,7 +73,7 @@ Python 3.10 support. * @Kaixu Yang: Auto model components. (1) seasonality inferrer (2) holiday inferrer (3) automatic growth. * @Kaixu Yang: Lag-based estimator. Supports lag-based forecasts such as week-over-week. * @Reza Hosseini: Fast simulation option. Provides a better accuracy and speed for mean prediction when simulation is used in autoregression. - * @Kaixu Yang: Quantile regression option for Silverkite `fit_algorithm`. + * @Kaixu Yang: Quantile regression option for Silverkite ``fit_algorithm``. * New model templates * @Kaixu Yang: AUTO. 
Automatically chooses templates based on the data frequency, forecast horizon and evaluation configs. @@ -55,7 +87,7 @@ Python 3.10 support. * Library enhancements and bug fixes * The SILVERKITE template has been updated to include automatic autoregression and changepoint detection. - * Renamed `SilverkiteMultistageEstimator` to `MultistageForecastEstimator`. + * Renamed ``SilverkiteMultistageEstimator`` to ``MultistageForecastEstimator``. * Renamed the normalization method "min_max" to "zero_to_one". * @Reza Hosseini: Added normalization methods: "minus_half_to_half", "zero_at_origin". * @Albert Chen: Updated tutorials. diff --git a/README.rst b/README.rst index d7699ad..14af2a1 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -Greykite: A flexible, intuitive and fast forecasting library +Greykite: A flexible, intuitive and fast forecasting and anomaly detection library .. raw:: html @@ -21,6 +21,16 @@ evaluation, benchmarking, and plotting. Other open source algorithms can be supported through Greykite’s interface to take advantage of this framework, as listed below. +Greykite AD (Anomaly Detection) is an extension of the Greykite Forecasting library. It provides users with an interpretable, +fast, robust and easy to use interface to monitor their metrics with minimal effort. + +Greykite AD improves upon the out-of-box confidence intervals generated by Silverkite, by automatically tuning the confidence intervals +and other filters (e.g. based on ``APE``) using expected alert rate information and/ or anomaly labels, if available. +It allows the users to define robust objective function, constraints and parameter space to optimize the confidence intervals. +For example user can target a minimal recall level of 80% while maximizing precision. Additionally, the users can specify a +minimum error level to filter out anomalies that are not business relevant. 
The motivation to include criteria other than +statistical significance is to bake in material/ business impact into the detection. + For a demo, please see our `quickstart `_. Distinguishing Features @@ -47,7 +57,8 @@ Distinguishing Features Algorithms currently supported within Greykite’s modeling framework: -* Silverkite (Greykite’s flagship algorithm) +* Silverkite (Greykite’s flagship forecasting algorithm) +* Greykite Anomaly Detection (Greykite's flagship anomaly detection algorithm) * `Facebook Prophet `_ * `Auto Arima `_ @@ -62,6 +73,7 @@ libraries or even outside the forecasting context. * SimpleSilverkiteForecast() - Silverkite algorithm with `forecast_simple` and `predict` methods. * SilverkiteForecast() - low-level interface to Silverkite algorithm with `forecast` and `predict` methods. * ReconcileAdditiveForecasts() - adjust a set of forecasts to satisfy inter-forecast additivity constraints. +* GreykiteDetector() - simple interface for optimizing anomaly detection performance based on Greykite forecasts. Usage Examples -------------- @@ -164,4 +176,4 @@ License ------- Copyright (c) LinkedIn Corporation. All rights reserved. Licensed under the -`BSD 2-Clause `_ License. \ No newline at end of file +`BSD 2-Clause `_ License. diff --git a/README_PYPI.rst b/README_PYPI.rst index 78cb5c0..ede2986 100644 --- a/README_PYPI.rst +++ b/README_PYPI.rst @@ -1,4 +1,4 @@ -Greykite: A flexible, intuitive and fast forecasting library +Greykite: A flexible, intuitive and fast forecasting and anomaly detection library .. image:: https://raw.githubusercontent.com/linkedin/greykite/master/LOGO-C8.png :height: 300px @@ -22,6 +22,16 @@ evaluation, benchmarking, and plotting. Other open source algorithms can be supported through Greykite’s interface to take advantage of this framework, as listed below. +Greykite AD (Anomaly Detection) is an extension of the Greykite Forecasting library. 
It provides users with an interpretable, +fast, robust and easy to use interface to monitor their metrics with minimal effort. + +Greykite AD improves upon the out-of-box confidence intervals generated by Silverkite, by automatically tuning the confidence intervals +and other filters (e.g. based on ``APE``) using expected alert rate information and/ or anomaly labels, if available. +It allows the users to define robust objective function, constraints and parameter space to optimize the confidence intervals. +For example user can target a minimal recall level of 80% while maximizing precision. Additionally, the users can specify a +minimum error level to filter out anomalies that are not business relevant. The motivation to include criteria other than +statistical significance is to bake in material/ business impact into the detection. + For a demo, please see our `quickstart `_. Distinguishing Features @@ -49,6 +59,7 @@ Distinguishing Features Algorithms currently supported within Greykite’s modeling framework: * Silverkite (Greykite’s flagship algorithm) +* Greykite Anomaly Detection (Greykite's flagship anomaly detection algorithm) * `Facebook Prophet `_ * `Auto Arima `_ @@ -63,6 +74,7 @@ libraries or even outside the forecasting context. * SimpleSilverkiteForecast() - Silverkite algorithm with `forecast_simple` and `predict` methods. * SilverkiteForecast() - low-level interface to Silverkite algorithm with `forecast` and `predict` methods. * ReconcileAdditiveForecasts() - adjust a set of forecasts to satisfy inter-forecast additivity constraints. +* GreykiteDetector() - simple interface for optimizing anomaly detection performance based on Greykite forecasts. Usage Examples -------------- @@ -165,4 +177,4 @@ License ------- Copyright (c) LinkedIn Corporation. All rights reserved. Licensed under the -`BSD 2-Clause `_ License. \ No newline at end of file +`BSD 2-Clause `_ License. 
diff --git a/docs/index.rst b/docs/index.rst index 01451b7..ba6f60b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -12,11 +12,11 @@ Welcome to Greykite! View the contents to get started. .. toctree:: :maxdepth: 1 - :caption: Greykite Info + :caption: Overview :hidden: :glob: - pages/greykite/overview + pages/overview/* .. toctree:: :maxdepth: 2 diff --git a/docs/nbpages/quickstart/01_exploration/0200_auto_configuration_tools.py b/docs/nbpages/quickstart/01_exploration/0200_auto_configuration_tools.py index a9e534e..1874519 100644 --- a/docs/nbpages/quickstart/01_exploration/0200_auto_configuration_tools.py +++ b/docs/nbpages/quickstart/01_exploration/0200_auto_configuration_tools.py @@ -367,7 +367,8 @@ holiday_df = get_holidays(countries=["US"], year_start=year_start, year_end=year_end)["US"] # Defines the number of pre / post days that a holiday has impact on. -# If not specified, (0, 0) will be used. +# If not specified, numbers specified by ``holiday_impact_pre_num_days`` and +# ``holiday_impact_post_num_days`` will be used. holiday_impact_dict = { "Christmas Day": (4, 3), # 12/25. "Independence Day": (4, 4), # 7/4. @@ -390,8 +391,10 @@ holiday_df=holiday_df, holiday_date_col="date", holiday_name_col="event_name", + holiday_impact_pre_num_days=0, + holiday_impact_post_num_days=0, holiday_impact_dict=holiday_impact_dict, - get_suffix_func="dow_grouped" + get_suffix_func="wd_we" ) # Runs holiday grouper using k-means with diagnostics. diff --git a/docs/nbpages/quickstart/0200_simple_anomaly_detection.py b/docs/nbpages/quickstart/0200_simple_anomaly_detection.py new file mode 100644 index 0000000..e42875c --- /dev/null +++ b/docs/nbpages/quickstart/0200_simple_anomaly_detection.py @@ -0,0 +1,153 @@ +""" +Simple Anomaly Detection +======================== + +You can create and evaluate an anomaly detection model with just a few lines of code. + +Provide your timeseries as a pandas dataframe with timestamp and value. 
+Optionally, you can also provide the anomaly labels as a column in the dataframe. + +For example, to detect anomalies in daily sessions data, your dataframe could look like this: + +.. code-block:: python + + import pandas as pd + df = pd.DataFrame({ + "date": ["2020-01-08-00", "2020-01-09-00", "2020-01-10-00"], + "sessions": [10231.0, 12309.0, 12104.0], + "is_anomaly": [False, True, False] + }) + +The time column can be any format recognized by `pandas.to_datetime`. + +In this example, we'll load a dataset representing ``log(daily page views)`` +on the Wikipedia page for Peyton Manning. +It contains values from 2007-12-10 to 2016-01-20. More dataset info +`here `_. +""" + +import warnings + +import plotly +from greykite.common.data_loader import DataLoader +from greykite.detection.detector.config import ADConfig +from greykite.detection.detector.data import DetectorData +from greykite.detection.detector.greykite import GreykiteDetector +from greykite.framework.templates.autogen.forecast_config import ForecastConfig +from greykite.framework.templates.autogen.forecast_config import MetadataParam +from greykite.framework.templates.model_templates import ModelTemplateEnum + +warnings.filterwarnings("ignore") + +# Loads dataset into pandas DataFrame +dl = DataLoader() +df = dl.load_peyton_manning() + +# specify dataset information +metadata = MetadataParam( + time_col="ts", # name of the time column ("date" in example above) + value_col="y", # name of the value column ("sessions" in example above) + freq="D" # "H" for hourly, "D" for daily, "W" for weekly, etc. + # Any format accepted by `pandas.date_range` +) + +# %% +# Create an Anomaly Detection Model +# ------------------------------- +# Similar to forecasting, you need to provide a forecast config and an +# anomaly detection config. You can choose any of the available forecast model +# templates (see :doc:`/pages/stepbystep/0100_choose_model`). 
+ +# In this example, we choose the "AUTO" model template for the forecast config, +# and the default anomaly detection config. +# The Silverkite "AUTO" model template chooses the parameter configuration +# given the input data frequency, forecast horizon and evaluation configs. + +anomaly_detector = GreykiteDetector() # Creates an instance of the Greykite anomaly detector + +forecast_config = ForecastConfig( + model_template=ModelTemplateEnum.AUTO.name, + forecast_horizon=7, # forecasts 7 steps ahead + coverage=None, # Confidence Interval will be tuned by the AD model + metadata_param=metadata) + +ad_config = ADConfig() # Default anomaly detection config + +detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=None) + +# %% +# Train the Anomaly Detection Model +# --------------------------------- +# You can train the anomaly detection model by calling the ``fit`` method. +# This method takes a ``DetectorData`` object as input. +# The ``DetectorData`` object contains the time series information as a pandas dataframe. +# Optionally, you can also provide the anomaly labels as a column in the dataframe. +# The anomaly labels can also be provided as a list of boolean values. +# The anomaly labels are used to evaluate the model performance. + +train_size = int(2700) +df_train = df[:train_size].reset_index(drop=True) +train_data = DetectorData(df=df_train) +detector.fit(data=train_data) + +# %% +# Predict with the Anomaly Detection Model +# ---------------------------------------- +# You can predict anomalies by calling the ``predict`` method. + +test_data = DetectorData(df=df) +test_data = detector.predict(test_data) + +# %% +# Evaluate the Anomaly Detection Model +# ------------------------------------ +# The outputs of the anomaly detection model are stored as attributes +# of the ``GreykiteDetector`` object. 
+# (The interactive plots are generated by ``plotly``: **click to zoom!**) + + +# %% +# Training +# ^^^^^^^^ +# The ``fitted_df`` attribute contains the result on the training data. +# You can plot the result by calling the ``plot`` method with ``phase="train"``. +print(detector.fitted_df) + +fig = detector.plot( + phase="train", + title="Greykite Detector Peyton Manning - fit phase") +plotly.io.show(fig) + +# %% +# Prediction +# ^^^^^^^^^^ +# The ``pred_df`` attribute contains the predicted result. +# You can plot the result by calling the ``plot`` method with ``phase="predict"``. + +print(detector.pred_df) + +fig = detector.plot( + phase="predict", + title="Greykite Detector Peyton Manning - predict phase") +plotly.io.show(fig) + +# %% +# Model Summary +# ^^^^^^^^^^^^^^^^^ +# Model summary allows inspection of individual model terms. +# Check parameter estimates and their significance for insights +# on how the model works and what can be further improved. +# You can call the ``summary`` method to see the model summary. +summary = detector.summary() +print(summary) + +# %% +# What's next? +# ------------ +# If you're satisfied with the anomaly detection performance, you're done! +# +# For a complete example of how to tune an anomaly detection model, see +# :doc:`/gallery/tutorials/0400_anomaly_detection_tutorial`. diff --git a/docs/nbpages/tutorials/0400_anomaly_detection_tutorial.py b/docs/nbpages/tutorials/0400_anomaly_detection_tutorial.py new file mode 100644 index 0000000..e2f5dd1 --- /dev/null +++ b/docs/nbpages/tutorials/0400_anomaly_detection_tutorial.py @@ -0,0 +1,393 @@ +""" +Tune your first anomaly detection model +======================================= + +This is a basic tutorial for creating and tuning a Greykite AD (Anomaly Detection) model. +It is intended for users who are new to Greykite AD and want to get started quickly. + +The Greykite AD is a forecast-based AD method, i.e. the forecast is used as the baseline. 
+A data point is predicted as anomalous if it is outside the forecasted confidence intervals. +The Greykite AD algorithm gives you the flexibility to better model and control the +confidence intervals. A forecast-based AD method is inherently dependent on an accurate +forecasting model to achieve satisfactory AD performance. + +Throughout this tutorial, we will assume that you are familiar with tuning a +Greykite forecast model. If you are not, please refer to the +:doc:`/gallery/tutorials/0100_forecast_tutorial`. + +The anomaly detection config (``ADConfig``) allows the users to divide the time series into segments and +learn a different volatility model for each segment. The user can specify the volatility features. +It also allows users to specify an objective function, constraints and parameter space to optimize the +confidence intervals. + +These features include: + + Volatility Features: + This allows users to specify the features to segment the time series and learn a + different volatility model for each segment. For example, if the time series is a daily + time series, the user can specify the volatility features as ``["dow"]`` to learn a + different volatility model for each day of the week. The user can also specify multiple + volatility features. For example, if the time series is a daily time series, the user can + specify the volatility features as ``[["dow", "is_weekend"]]`` to learn a different + volatility model for each day of the week and a different volatility model for weekends. + + Coverage Grid: + This allows users to specify a grid of the confidence intervals. The ``coverage_grid`` is + specified as a list of floats between 0 and 1. For example, if the ``coverage_grid`` is specified as + ``[0.5, 0.95]``, the algorithm optimizes over confidence intervals with coverage ``0.5`` and ``0.95``. + + Target Anomaly Percentage: + This allows users to specify the ``target_anomaly_percent``, which is specified as a float + between 0 and 1. 
For example, if ``target_anomaly_percent`` is + specified as ``0.1``, the anomaly score threshold is optimized such that 10% of the data + points are predicted as anomalous. + + Target Precision: + This allows users to specify the ``target_precision``, which is specified as a + float between 0 and 1. For example, if the ``target_precision`` is specified as ``0.9``, the + anomaly score threshold is optimized such that at least 90% of the predicted anomalies are true + anomalies. This is useful when the user has a limited budget to investigate the anomalies. + + Target Recall: + This allows users to specify the ``target_recall``, which is specified as a float + between 0 and 1. For example, if the ``target_recall`` is specified as ``0.9``, the anomaly + score threshold is optimized such that at least 90% of the true anomalies are predicted as + anomalies. This is useful when the user wants to detect most of the anomalies. + +""" + +import datetime + +import numpy as np +import pandas as pd +import plotly +import plotly.express as px +from greykite.common.constants import ANOMALY_COL +from greykite.common.constants import TIME_COL +from greykite.common.constants import VALUE_COL +from greykite.common.testing_utils import generate_df_for_tests +from greykite.common.testing_utils_anomalies import contaminate_df_with_anomalies +from greykite.common.viz.timeseries_annotate import plot_lines_markers +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.common.ad_evaluation import precision_score +from greykite.detection.common.ad_evaluation import recall_score +from greykite.detection.detector.ad_utils import partial_return +from greykite.detection.detector.config import ADConfig +from greykite.detection.detector.data import DetectorData +from greykite.detection.detector.greykite import GreykiteDetector +from greykite.detection.detector.reward import Reward +from greykite.framework.templates.autogen.forecast_config import 
EvaluationPeriodParam +from greykite.framework.templates.autogen.forecast_config import ForecastConfig +from greykite.framework.templates.autogen.forecast_config import MetadataParam +from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam + +# Evaluation metrics used in the tests. +# F1 score for the True label: +f1 = partial_return(f1_score, True) +# Precision score, for the True label: +precision = partial_return(precision_score, True) +# Recall score for the True label: +recall = partial_return(recall_score, True) + +# %% +# Generate a dataset with anomalies +# ---------------------------------- +# Let us first generate a dataset with ground truth anomaly labels. +df = generate_df_for_tests( + freq="D", + train_start_date=datetime.datetime(2020, 1, 1), + intercept=50, + train_frac=0.99, + periods=200)["df"] + +# Specifies anomaly locations. +anomaly_block_list = [ + np.arange(10, 15), + np.arange(33, 35), + np.arange(60, 65), + np.arange(82, 85), + np.arange(94, 98), + np.arange(100, 105), + np.arange(111, 113), + np.arange(125, 130), + np.arange(160, 163), + np.arange(185, 190), + np.arange(198, 200)] + +# Contaminates `df` with anomalies at the specified locations, +# via `anomaly_block_list`. +# If original value is y, the anomalous value is: (1 +/- delta)*y. +df = contaminate_df_with_anomalies( + df=df, + anomaly_block_list=anomaly_block_list, + delta_range_lower=0.25, + delta_range_upper=0.5, + value_col=VALUE_COL, + min_admissible_value=None, + max_admissible_value=None) + +fig = plot_lines_markers( + df=df, + x_col=TIME_COL, + line_cols=["contaminated_y", "y"], + line_colors=["red", "blue"], + title="Generation of daily anomalous data") +fig.update_yaxes() +plotly.io.show(fig) + +# %% +# The anomalies are generated by adding a random delta to the original value. +# The plot above shows the original data (``y``) in blue and the contaminated data +# (``contaminated_y``) in red. 
We will drop the original data (``y``) and use the +# contaminated data (``contaminated_y``) as the input to the anomaly detector. + +df = df.drop(columns=[VALUE_COL]).rename( + columns={"contaminated_y": VALUE_COL}) +df[ANOMALY_COL] = (df[ANOMALY_COL] == 1) + +train_size = int(100) +df_train = df[:train_size].reset_index(drop=True) +df_test = df[train_size:].reset_index(drop=True) + + +# %% +# Structure of a Greykite AD model +# --------------------------------- +# The Greykite AD takes a ``forecast_config`` and ``ADConfig`` +# and builds a detector which uses the forecast as baseline. +# The fit consists of following stages: +# - Fit a forecast model using the given ``forecast_config``. +# - Fit a volatility model using the given ``ADConfig``. +# This builds a `~greykite.algo.uncertainty.conditional.conf_interval.conf_interval` +# model that optimizes over the parameters specified in the ``ADConfig``. + +# %% +# Any of the available forecast model +# templates (see :doc:`/pages/stepbystep/0100_choose_model`) work in conjunction +# with the Greykite AD. In this example, we choose the "SILVERKITE_EMPTY" template. 
+ +metadata = MetadataParam( + time_col=TIME_COL, + value_col=VALUE_COL, + train_end_date=None, + anomaly_info=None) + +evaluation_period = EvaluationPeriodParam( + test_horizon=0, + cv_max_splits=0) + +model_components = ModelComponentsParam( + autoregression={ + "autoreg_dict": { + "lag_dict": {"orders": [7]}, + "agg_lag_dict": None}}, + events={ + "auto_holiday": False, + "holiday_lookup_countries": ["US"], + "holiday_pre_num_days": 2, + "holiday_post_num_days": 2, + "daily_event_df_dict": None}, + custom={ + "extra_pred_cols": ["dow"], + "min_admissible_value": 0, + "normalize_method": "zero_to_one"}) + +forecast_config = ForecastConfig( + model_template="SILVERKITE_EMPTY", + metadata_param=metadata, + coverage=None, + evaluation_period_param=evaluation_period, + forecast_horizon=1, + model_components_param=model_components) + +# %% +# The Greykite AD algorithm works with or without anomaly labels for training. +# The reward function for the AD algorithm is updated accordingly. +# When no anomaly labels are provided, the AD algorithm uses ``target_anomaly_percent`` to determine +# the anomaly score threshold. If anomaly labels are provided, the AD algorithm uses +# ``precision``, ``recall`` or ``f1`` to determine the anomaly score threshold. + +# %% +# Anomaly labels are available +# ----------------------------- +# Let us first consider the case where anomaly labels are available for training. +# You can pass the anomaly labels in a few different ways: +# - As the ``ANOMALY_COL`` column in the training dataframe (``train_data.df``). +# - As a vector of anomaly labels in the training data (``train_data.y_true``). +# - As a separate dataframe in the training data (``train_data.anomaly_df``). +# - As a separate dataframe in the ``metadata_param`` in the ``forecast_config``. +# The detector combines the anomaly labels from all these sources and stores it +# under the ``anomaly_df`` attribute in the ``detector``. 
+ +# %% +# In this example, the anomaly labels are passed as ``ANOMALY_COL`` column in the training dataframe. +# When anomalies are available for training, you can use ``precision``, ``recall``, ``f1`` or a combination +# of these metrics to determine the anomaly score threshold. In this example, we will use ``f1``. + +ad_config = ADConfig( + volatility_features_list=[["dow"], ["is_weekend"]], + coverage_grid=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + variance_scaling=True) + +def f1_reward(data): + return f1( + y_true=data.y_true, + y_pred=data.y_pred) +reward = Reward(f1_reward) +train_data = DetectorData(df=df_train) + +# Initializes the detector. +detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=reward) +# Fits the model +detector.fit(data=train_data) + +# Checks parameter grid. +param_obj_list = detector.fit_info["param_obj_list"] +param_eval_df = pd.DataFrame.from_records(param_obj_list) +param_eval_df["volatility_features"] = param_eval_df["volatility_features"].map(str) +fig = px.line( + param_eval_df, + x="coverage", + y="obj_value", + color="volatility_features", + title="'GreykiteDetector' result of parameter search: reward=f1") +plotly.io.show(fig) + +# %% +# Plots the training results. +fig = detector.plot(title="'GreykiteDetector' prediction: reward=f1", phase="train") +plotly.io.show(fig) + +# %% +# Let us run the model on the test data and plot the results. +# The plot shows the actual data in orange, the forecast in blue, and the +# confidence intervals in grey. The predicted anomalies are marked in red. +test_data = DetectorData( + df=df_test, + y_true=df_test[ANOMALY_COL]) +test_data = detector.predict(test_data) +fig = detector.plot(title="'GreykiteDetector' prediction: reward=f1") +plotly.io.show(fig) + +# %% +# We can see from the plot that our model is able to detect all the anomalies. +# Finally, let's check the evaluation metrics via the ``summary`` method. 
+# You can see that the model achieved a high precision and recall value. +summary = detector.summary() +print(summary) + +# %% +# Examples of other reward functions +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# In this section we provide examples of other reward functions that can be used. +# The `~greykite.detection.detector.reward.Reward` class allows users the +# flexibility to specify their own reward functions. This class enables two powerful mechanisms: +# - taking a simple `reward_func` and construct a penalized version of that +# - starting from existing objectives building more complex ones by adding / +# multiplying / dividing them or use same operations with numbers. +# +# These two mechanisms together support robust multi-objective problems. +# Some examples are provided below. All these reward functions can be used as before. + +# Builds precision as objective function. +def precision_func(data): + return precision( + y_true=data.y_true, + y_pred=data.y_pred) +precision_obj = Reward(precision_func) + +# Builds recall as objective function. +def recall_func(data): + return recall( + y_true=data.y_true, + y_pred=data.y_pred) +recall_obj = Reward(recall_func) + +# Builds sum of precision and recall objective function. +additive_obj = precision_obj + recall_obj + +# %% +# The class also allows for constrained optimization. For example, in the context +# of anomaly detection if recall is to be optimized +# subject to precision being at least 80 percent, the users can enable this. Let's +# see how this can be done. + +# First, let's build a penalized precision objective function that +# penalizes precision values under 0.8 by `penalty == -inf`. +penalized_precision_obj = Reward( + precision_func, + min_unpenalized=0.8, + penalty=-np.inf) + +# The constraint can also be passed via the ADConfig. +ad_config = ADConfig( + target_precision=0.8) + +# Builds a combined objective function that optimizes recall +# subject to precision being at least 80 percent. 
+combined_obj = recall_obj + penalized_precision_obj + +# %% +# Users can also combine objectives to achieve more complex objectives from existing ones. +# For example F1 can be easily expressed in terms of precision and recall objectives. +f1_obj = (2 * recall_obj * precision_obj) / (recall_obj + precision_obj) + + +# %% +# Anomaly labels are *NOT* available +# --------------------------------- +# In this example, we will use an AD config which uses ``target_anomaly_percent`` to +# determine the anomaly score threshold. If not specified, the AD algorithm uses a default +# ``target_anomaly_percent`` of 10%. +ad_config = ADConfig( + volatility_features_list=[["dow"], ["is_weekend"]], + coverage_grid=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + target_anomaly_percent=10.0, + variance_scaling=True) + +detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=None) +detector.fit(data=train_data) + +# Checks parameter grid. +param_obj_list = detector.fit_info["param_obj_list"] +param_eval_df = pd.DataFrame.from_records(param_obj_list) +param_eval_df["volatility_features"] = param_eval_df["volatility_features"].map(str) +fig = px.line( + param_eval_df, + x="coverage", + y="obj_value", + color="volatility_features", + title="'GreykiteDetector' result of param search: reward=anomaly_percent") +plotly.io.show(fig) + +# %% +# Plots the training results. +fig = detector.plot(title="'GreykiteDetector' prediction: reward=anomaly_percent", phase="train") +plotly.io.show(fig) + + +# %% +# Let us run the model on the test data and plot the results. +# The plot shows the actual data in orange, the forecast in blue, and the +# confidence intervals in grey. The predicted anomalies are marked in red. 
+ +test_data = DetectorData( + df=df_test, + y_true=df_test[ANOMALY_COL]) +test_data = detector.predict(test_data) +fig = detector.plot(title="'GreykiteDetector' prediction: reward=anomaly_percent") +plotly.io.show(fig) + +# %% +# We can see from the plot that our model is able to detect all the anomalies. +# Finally, let's check the evaluation metrics via the ``summary`` method. +# You can see that the model achieved a high precision and recall value. + +summary = detector.summary() +print(summary) diff --git a/docs/pages/changelog/changelog.rst b/docs/pages/changelog/changelog.rst index 7ada932..8ea62a9 100644 --- a/docs/pages/changelog/changelog.rst +++ b/docs/pages/changelog/changelog.rst @@ -1,6 +1,38 @@ +1.0.0 (2024-01-07) +------------------ + +* Greykite AD (Anomaly Detection) is now available. +It improves upon the out-of-box confidence intervals generated by Silverkite, by automatically tuning the confidence intervals +and other filters (e.g. based on ``Absolute Percentage Error (APE)``) using expected alert rate information and/ or anomaly labels, if available. +It allows the users to define robust objective function, constraints and parameter space to optimize the confidence intervals. +For example user can target a minimal recall level of 80% while maximizing precision. Additionally, the users can specify a +minimum error level to filter out anomalies that are not business relevant. The motivation to include criteria other than +statistical significance is to bake in material/ business impact into the detection. + + * @Reza Hosseini: Devised the core anomaly detection library structure. Added base ``Detector`` module. + * @Reza Hosseini: Added `~greykite.detection.detector.reward.Reward` that allows users to specify and optimize robust anomaly detection objectives. + * @Sayan Patra: Added ``GreykiteDetector`` module that builds anomaly detection based on Greykite forecasting. + * @Sayan Patra: Added tutorials for Greykite anomaly detection. 
+ +* New features and methods + * @Reza Hosseini: Added `~greykite.common.features.outlier.ZScoreOutlierDetector` and `~greykite.common.features.outlier.TukeyOutlierDetector`, improved outlier detection modules. + * @Sayan Patra: Added `~greykite.detection.common.pickler.GreykitePickler`. This improves the pickling function for Greykite models and allows to store the model in a single file. + * @Yi-Wei Lu: Added ``DifferenceBasedOutlierTransformer`` that can identify outliers in the ``sklearn`` pipeline. + +* Library enhancements + * @Kaixu Yang: Added ``scipy`` solver to make quantile regression more stable. + * @Qiang Fei: Updated ``auto_holiday`` functionality to use holiday groupers for improved forecast performance in holiday periods. + * @Katherine Li: Improved changepoint detection method that can identify level shifts. + +* Bug fixes + * @Reza Hosseini @Sayan Patra @Yi Su @Qiang Fei @Kaixu Yang @Phil Gaudreau: Other library enhancements and bug fixes. + + 0.5.1 (2023-06-01) ------------------ -Loosen dill package requirments + +Loosen dill package requirements. + 0.5.0 (2023-04-03) ------------------ @@ -27,6 +59,7 @@ Python 3.10 support. * @Yi Su, @Sayan Patra: Now ``train_end_date`` is always respected if specified by the user. Previously it got ignored if there are trailing NA’s in training data or ``anomaly_df`` imputes the anomalous points to NA. Also, now ``train_end_date`` accepts a string value. * @Yi Su: The seasonality order now takes `None` without raising an error. It will be treated the same as `False` or zero. + 0.4.0 (2022-07-15) ------------------ @@ -55,8 +88,8 @@ Python 3.10 support. * @Reza Hosseini: Added normalization methods: "minus_half_to_half", "zero_at_origin". * @Albert Chen: Updated tutorials. * @Yi Su: Upgraded fbprophet 0.5 to prophet 1.0. - * @Yi Su: Upgraded holidays to 0.13 - * @Albert Chen @Kaixu Yang @Yi Su: Speed optimization for Silverkite. + * @Yi Su: Upgraded holidays to 0.13.
+ * @Albert Chen @Kaixu Yang @Yi Su: Speed optimization for Silverkite algorithms. * @Albert Chen @Reza Hosseini @Kaixu Yang @Sayan Patra @Yi Su: Other library enhancements and bug fixes. 0.3.0 (2021-12-14) diff --git a/docs/pages/model_components/0400_events.rst b/docs/pages/model_components/0400_events.rst index f9f4bf7..a355000 100644 --- a/docs/pages/model_components/0400_events.rst +++ b/docs/pages/model_components/0400_events.rst @@ -295,8 +295,8 @@ Or you may use it as a conditional column in the uncertainty model ``conditional 2. Sometimes you may have a weekly time series or the response is daily rolling sum. In such cases, the whole week, or the whole rolling window is impacted by a holiday within it. -We allow for modeling such holiday neighboring effect by specifying ``daily_event_neighbor_event`` in ``events``. -For example, you may use ``daily_event_neighbor_event = 6`` to model rolling 7-day sum holiday effect +We allow for modeling such holiday neighboring effect by specifying ``daily_event_neighbor_impact`` in ``events``. +For example, you may use ``daily_event_neighbor_impact = 6`` to model rolling 7-day sum holiday effect in a daily time series. Or you may use ``daily_event_neighbor_impact = lambda x: [x - timedelta(days=x.isocalendar()[2] - 1) + timedelta(days=i) for i in range(7)]`` to model a holiday effect in weekly time series. @@ -313,36 +313,130 @@ event called "Christmas Day_7D_after" which is 7 days after the Christmas Day. Auto Holiday and Holiday Grouper ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Silverkite models support automatically inferring significant holidays and generate holiday configurations. -It utilizes the `~greykite.algo.common.holiday_inferrer.HolidayInferrer` method to infer holidays. -For more details, see Holiday Inferrer in :doc:`/gallery/quickstart/01_exploration/0200_auto_configuration_tools`. +Silverkite models support automatically grouped holiday features. 
+It utilizes the `~greykite.algo.common.holiday_grouper.HolidayGrouper` method to group +holidays based on their estimated impact inferred from the training data. This method avoids creating too many +parameters for each holiday while making sure that holidays +that are different enough will be modeled separately. For more details, see Holiday Inferrer and +Holiday Grouper in :doc:`/gallery/quickstart/01_exploration/0200_auto_configuration_tools`. It's easy to use auto holiday in model components. -In the event dictionary, specify ``auto_holiday = True``, -and the model will automatically pull significant holidays or neighboring days from ``holiday_lookup_countries`` -by checking their individual effects from the training data. -All other parameters will be ignored except ``daily_event_df_dict``, which will be -added to any inferred significant holiday and neighboring day events. +In the ``events`` dictionary, specify ``auto_holiday=True``, and the model will automatically pull +holidays from ``holiday_lookup_countries``. Additional events can be passed in through ``daily_event_df_dict``, +which get combined with holidays from ``holiday_lookup_countries`` and used as sources of holidays for grouping. Their +neighboring days will be pulled based on ``holiday_pre_num_days``, ``holiday_post_num_days`` and +``holiday_pre_post_num_dict`` if provided. Each holiday and their neighboring days (e.g. Christmas, Christmas +1 Day, +Christmas +2 Days) will have their holiday impact estimated. The holiday grouper checks their individual effects from +the training data and generates holiday groups. Each holiday and its neighboring day in ``holidays_to_model_separately`` +will be modeled separately regardless of whether ``auto_holiday`` is on. Generally, it is recommended to leave +``holidays_to_model_separately`` as ``None`` here unless there is prior knowledge that some holidays or events have +different behaviors from other ones.
The ``auto_holiday_params`` can take in additional parameters used by holiday grouper. .. code-block:: python - events=dict( - auto_holiday=True, - holiday_lookup_countries=["US"] + events = dict( + auto_holiday=True, # Turns on auto holiday config. + holidays_to_model_separately=None, # No holiday is modeled separately. + holiday_lookup_countries="auto", # A default list of countries to search. + holiday_pre_num_days=2, # Considers 2 days before each holiday. + holiday_post_num_days=2, # Considers 2 days after each holiday. + holiday_pre_post_num_dict=None, # Specifies if any holiday needs a different count of neighboring days. + daily_event_df_dict=None, # Additional events added to holidays from `holiday_lookup_countries`. + auto_holiday_params=None # Additional parameters for holiday groupers. + ) + +In the example here, by default there will be 5 feature groups generated based on holiday grouping results: +``"events_holiday_group_0"``, ``"events_holiday_group_1"``, ``"events_holiday_group_2"``, ``"events_holiday_group_3"``, +``"events_holiday_group_4"``. + +By contrast, if ``auto_holiday=False``, there will also be 5 feature groups generated: +``"events_Other"``, ``"events_Other-1"``, ``"events_Other-2"``, ``"events_Other+1"``, ``"events_Other+2"``. +All holidays will be modeled as one whole group named ``"events_Other"``. Their neighboring days will be modeled in +groups based on their relations to the holidays. + +To extend from default settings, additional parameters for holiday groupers can be passed in through +``auto_holiday_params``. The following code lists out all parameters that can be specified and tuned with their +current defaults. Please refer to Holiday Grouper in +:doc:`/gallery/quickstart/01_exploration/0200_auto_configuration_tools` for more information on how it works. + +.. code-block:: python + + # The `auto_holiday_params` to pass in for events. + auto_holiday_params = dict( + df=None, # Time series to infer holiday impact. 
+ time_col=None, # Time column in `df`. + value_col=None, # Value column in `df`. + holiday_df=None, # User specified holidays, will replace holidays from `holiday_lookup_countries` and `daily_event_df_dict`. + holiday_date_col=None, # Holiday date column in `holiday_df`. + holiday_name_col=None, # Holiday name column in `holiday_df`. + get_suffix_func="wd_we", # Extended feature group added as suffix, please see `HolidayGrouper`. + baseline_offsets=[-7, 7], # The baseline is the average of -7/+7 observations. + use_relative_score=True, # Whether to use relative or absolute score when estimating impact. + min_n_days=1, # Minimal number of occurrences of an event to be included in grouping. + min_abs_avg_score=0.03, # Minimal average score of an event to be kept in consideration. + clustering_method="kmeans", # Clustering methods. + n_clusters=5, # Number of clusters in k-means clustering. + bandwidth=None, # Only used if "kde" is selected for `clustering_method`. + bandwidth_multiplier=None, # Only used if "kde" is selected for `clustering_method` and `bandwidth` is `None`. + ) + +Example 1: User-specified Time Series for Learning Holiday Impact + +By default, the same time series used for training is used to learn holiday impact and generate holiday +groups. The library also allows users to import external time series to train holiday impact. This can be useful when +the training data is too short to learn holiday impact. The external time series +used for generating holiday groups needs at least one time column and one value column. + +For example, if we have a time series ``external_df`` in the format below (values are made up): + +.. csv-table:: external_df + :header: dates,values + + 2020-01-01,5.22 + 2020-01-02,8.88 + 2020-01-03,8.72 + ...,... + +The input for ``auto_holiday_params`` can then be specified as below: + +.. code-block:: python + + # The `auto_holiday_params` to pass in for events. 
+ auto_holiday_params = dict( + df=external_df, + time_col="dates", + value_col="values" ) -We also provide a Holiday Grouper tool to help you group holidays based on their estimated impact inferred from -the training data. The smart grouping makes sure to not create too many parameters to each holiday while making -sure that holidays that are different enough will be modeled separately. -For more details, see Holiday Grouper in :doc:`/gallery/quickstart/01_exploration/0200_auto_configuration_tools`. +Example 2: Customized Holiday List + +By default, holidays pulled from ``holiday_lookup_countries`` will be combined with events imported from +``daily_event_df_dict`` and serve as the source of holidays for grouping. +When users want to use a completely customized list of holidays, they can specify that through ``holiday_df``. In +this case, holidays fetched from ``holiday_lookup_countries`` and ``daily_event_df_dict`` will be ignored. Only +holidays included in ``holiday_df`` will be used for grouping. + +The ``holiday_df`` +should be a `pandas.DataFrame` with at least two columns: one column for dates and one for holiday names. The holiday +dates should cover both training and forecasting periods. For example, assuming that the input ``holiday_df`` looks +like below: + +.. csv-table:: holiday_df + :header: date, event_name + + 2020-12-25,Christmas + 2021-01-01,New Year + ...,... -The Holiday Grouper returns a curated ``daily_event_df_dict`` which can be directly specified in events. +The input for ``auto_holiday_params`` can then be specified as: .. code-block:: python - events=dict( - holiday_lookup_countries=[], - daily_event_df_dict=daily_event_df_dict + # The `auto_holiday_params` to pass in for events. 
+ auto_holiday_params = dict( + holiday_df=holiday_df, + holiday_date_col="date", + holiday_name_col="event_name" ) Prophet diff --git a/docs/pages/model_components/0600_custom.rst b/docs/pages/model_components/0600_custom.rst index f9f017a..f776fe4 100644 --- a/docs/pages/model_components/0600_custom.rst +++ b/docs/pages/model_components/0600_custom.rst @@ -451,7 +451,8 @@ and regularization is used. normalize_method : `str` or None, default None The normalization method for the feature matrix. - Available values are "statistical", "zero_to_one" and "minus_half_to_half". + Available values are "zero_to_one", "statistical", "minus_half_to_half", "zero_at_origin". + If None, no normalization will be performed. Examples: @@ -465,6 +466,9 @@ Examples: ) The ``statistical`` method removes the "mean" and divides by "std" for each column. -The ``zero_to_one`` method removes the "min" and divides by the "max - min" -The ````minus_half_to_half```` method removes the "(min + max)/2" and divides by the "max - min" -for each column. For details, see `~greykite.common.features.normalize.normalize_df`. +The ``zero_to_one`` method removes the "min" and divides by the "max - min" for each column. +The ``minus_half_to_half`` method removes the "(min + max)/2" and divides by the "max - min" +for each column. +The ``zero_at_origin`` method removes a constant equal to the first data point and divides +by the "max - min" for each column. +For details, see `~greykite.common.features.normalize.normalize_df`. 
diff --git a/docs/pages/greykite/overview.rst b/docs/pages/overview/100_forecast_intro.rst similarity index 94% rename from docs/pages/greykite/overview.rst rename to docs/pages/overview/100_forecast_intro.rst index e759a08..b68d026 100644 --- a/docs/pages/greykite/overview.rst +++ b/docs/pages/overview/100_forecast_intro.rst @@ -1,8 +1,5 @@ -Overview -======== - -The Greykite model ------------------- +The Greykite Forecast model +--------------------------- Greykite is a forecast library developed by LinkedIn. Its flagship algorithm, Silverkite, provides interpretable, fast, and highly flexible univariate forecasts that capture effects such as time-varying growth and seasonality, diff --git a/docs/pages/overview/200_ad_intro.rst b/docs/pages/overview/200_ad_intro.rst new file mode 100644 index 0000000..44a40a9 --- /dev/null +++ b/docs/pages/overview/200_ad_intro.rst @@ -0,0 +1,85 @@ +The Greykite Anomaly Detection model +==================================== +**Authors: Reza Hosseini, Sayan Patra** + +Greykite AD (Anomaly Detection) is an extension of the Greykite +Forecasting library. It provides users with an interpretable, fast, robust and easy to use +interface to monitor their metrics with minimal effort. + +Greykite AD improves upon the out-of-box confidence intervals generated by Silverkite, by automatically tuning the confidence intervals +and other filters (e.g. based on ``Absolute Percentage Error (APE)``) using expected alert rate information and/ or anomaly labels, if available. +It allows the users to define robust objective function, constraints and parameter space to optimize the confidence intervals. +For example user can target a minimal recall level of 80% while maximizing precision. Additionally, the users can specify a +minimum error level to filter out anomalies that are not business relevant. The motivation to include criteria other than +statistical significance is to bake in material/ business impact into the detection. 
+ +The parameters of the model can be configured via a config file, which makes the model easy to +serve in production environments. This approach has proved effective in real-world use cases at scale. + + +How does the algorithm work? +---------------------------- +The algorithm is based on the following steps: + + 1. Fit a forecast model using the given ``ForecastConfig``. + + 2. Fit a volatility model on forecast errors using the given ``ADConfig``. + This includes using expected alert rate and / or anomaly labels to optimize the confidence bands and + other filters based on ``APE`` etc. The optimization can reflect complex user preferences + e.g. target a minimum recall level while maximizing precision. + + +Advantages of using the Greykite AD +----------------------------------- + 1. It works on any data frequency (e.g. daily, hourly, 15 minutes, + 5 minutes, 1 min) even with small amount of data. + + 2. It can use user feedback (labels provided by the user) to adjust itself + over time, but works without anomaly labels too. + + 3. It takes into account seasonality, holidays, growth and other + complex patterns when issuing alerts. + + 4. It provides great flexibility in optimization metrics e.g. optimize recall + subject to precision being at least 80 percent. + + +One of the primary advantages of the Greykite AD is that it works for both supervised and unsupervised problems. +In the supervised case, the user provides the labels for the anomalies. These labels are used to train, evaluate and +update the model. The reward function is chosen to utilize these labels e.g. precision, recall, F1 score etc, or a +combination of these. In the unsupervised case, when no labels are provided, the algorithm uses +the percent of time points that are labeled as anomalies to optimize the model. + +The library is designed such that the users can provide flexible objective functions and constraints that +suit their use case. It allows for the following: + + 1. 
Combining objective functions and constraints. + + For example, the user can specify to optimize ``F1 score`` such that the ``anomaly percent`` + is less than ``5%`` and the ``recall`` is at least ``80%``. + + 2. Filter anomalies based on business requirements. + + It is possible that a statistically significant anomaly is not business relevant. For example, a statistically significant spike + in the number of users on a website may not have enough business impact to warrant an alert. + The user can filter such anomalies by specifying a minimum error level to trigger an alert. + The library provides ``Absolute Percentage Error (APE)`` and ``Symmetric Absolute Percentage Error (SAPE)`` + as two options for error level. The user can also pass a grid of values (e.g. ``ADConfig.ape_grid``) and Greykite can use + the grid in the optimization. In the supervised case, Greykite can use the labels to find the optimal combination of + confidence interval coverage and ``APE`` filter based on the user labels. In the unsupervised case, it makes sense to + optimize only over either coverage or a filter because the only input is the expected alert rate. + + 3. Soft versions of metrics to bridge the gap between business requirements and statistical requirements. + + We developed soft versions of well-known metrics that allow for a margin of error in the detection process i.e. + ``soft precision``, ``soft recall`` and ``soft F1 score``. + For example, if the user is satisfied that an anomaly is captured within 5 hours of its occurrence, + then the user can specify a window of 5 hours in ``soft recall``. The library will then consider an anomaly to + be a true positive if it is detected within 5 hours of its occurrence. This often bridges the gap between + the business requirements and the statistical requirements. + + +See :doc:`/gallery/quickstart/0200_simple_anomaly_detection` to get started.
+A more detailed tutorial for the anomaly detection process is +at :doc:`/gallery/tutorials/0400_anomaly_detection_tutorial`. +You can follow that guide for advanced configuration. \ No newline at end of file diff --git a/greykite/algo/changepoint/adalasso/changepoint_detector.py b/greykite/algo/changepoint/adalasso/changepoint_detector.py index 231ea5c..918242c 100644 --- a/greykite/algo/changepoint/adalasso/changepoint_detector.py +++ b/greykite/algo/changepoint/adalasso/changepoint_detector.py @@ -48,6 +48,7 @@ from greykite.algo.changepoint.adalasso.changepoints_utils import get_trend_changes_from_adaptive_lasso from greykite.algo.changepoint.adalasso.changepoints_utils import get_yearly_seasonality_changepoint_dates_from_freq from greykite.algo.changepoint.adalasso.changepoints_utils import plot_change +from greykite.algo.changepoint.shift_detection.shift_detector import ShiftDetection from greykite.common.constants import TimeFeaturesEnum from greykite.common.features.timeseries_features import get_evenly_spaced_changepoints_dates from greykite.common.logging import LoggingLevelEnum @@ -133,6 +134,8 @@ def __init__(self): self.seasonality_df: Optional[pd.DataFrame] = None self.seasonality_changepoints: Optional[dict] = None self.seasonality_estimation: Optional[pd.Series] = None + self.shift_detector: Optional[ShiftDetection] = None + self.level_shift_df: Optional[pd.DataFrame] = None @ignore_warnings(category=ConvergenceWarning) def find_trend_changepoints( @@ -140,6 +143,7 @@ def find_trend_changepoints( df, time_col, value_col, + shift_detector=None, yearly_seasonality_order=8, yearly_seasonality_change_freq=None, resample_freq="D", @@ -190,6 +194,9 @@ def find_trend_changepoints( Time column name in ``df`` value_col : `str` Value column name in ``df`` + shift_detector: `greykite.algo.changepoint.shift_detection.shift_detector.ShiftDetection` + An instance of ShiftDetection for identifying level shifts and computing regressors. 
Level + shift points will be considered as regressors when selecting change points by adaptive lasso. yearly_seasonality_order : `int`, default 8 Fourier series order to capture yearly seasonality. yearly_seasonality_change_freq : `DateOffset`, `Timedelta` or `str` or `None`, default `None` @@ -347,6 +354,18 @@ def find_trend_changepoints( self.time_col = time_col self.value_col = value_col self.original_df = df + self.shift_detector = shift_detector + + # If a shift detector object is passed to the constructor, we're making Changepoint cognizant of level shifts + # and it will handle the level shifts as independent regressors. + if self.shift_detector is not None: + self.level_shift_cols, self.level_shift_df = self.shift_detector.detect( + df.copy(), + time_col=time_col, + value_col=value_col, + forecast_horizon=0 + ) + # Resamples df to get a coarser granularity to get rid of shorter seasonality. # The try except below speeds up unnecessary datetime transformation. if resample_freq is not None: @@ -408,6 +427,25 @@ def find_trend_changepoints( "seas_names": ["yearly"]}) ) trend_df = pd.concat([trend_df, long_seasonality_df], axis=1) + # Augment the trend_df with the additional level shift regressors. + if self.shift_detector is not None and len(self.level_shift_cols) > 0: + # Rename each column to have level shift prefixed in name. + pad_char, pad_size = 0, 4 # Left pad the levelshift regressors with 0s for sorting. + new_col_names = {col_name: f"levelshift_{ndx:{pad_char}{pad_size}}_{col_name}" for ndx, col_name in enumerate(self.level_shift_cols)} + self.level_shift_df.rename(columns=new_col_names, inplace=True) + # Save regressors for concatenation to trend_df. 
+ time_col_and_regressor_cols = [time_col] + sorted(new_col_names.values()) + levelshift_regressors_df = self.level_shift_df[time_col_and_regressor_cols] + # Resample level shift df according to resample frequency + levelshift_regressors_df_copy = levelshift_regressors_df.copy() + if resample_freq is not None: + levelshift_regressors_df_resample = levelshift_regressors_df_copy.resample(resample_freq, on=time_col).mean().reset_index() + else: + levelshift_regressors_df_resample = levelshift_regressors_df_copy + # Set time column to be the index of the dataframe for level shift regressors. + levelshift_regressors_df_resample.set_index(time_col, inplace=True) + # Concatenate regressors column-wise. + trend_df = pd.concat([trend_df, levelshift_regressors_df_resample], axis=1) trend_df.index = df_resample[time_col] self.trend_df = trend_df # Estimates trend. @@ -1083,7 +1121,8 @@ def get_changepoints_dict(df, time_col, value_col, changepoints_dict): "no_changepoint_distance_from_begin", "no_changepoint_proportion_from_end", "no_changepoint_distance_from_end", - "no_changepoint_proportion_from_end" + "no_changepoint_proportion_from_end", + "shift_detector" ] changepoints_dict_keys = [ "continuous_time_col", diff --git a/greykite/algo/changepoint/shift_detection/__init__.py b/greykite/algo/changepoint/shift_detection/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/greykite/algo/changepoint/shift_detection/shift_detector.py b/greykite/algo/changepoint/shift_detection/shift_detector.py new file mode 100644 index 0000000..74fea3a --- /dev/null +++ b/greykite/algo/changepoint/shift_detection/shift_detector.py @@ -0,0 +1,405 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Katherine Li, Kaixu Yang +"""This module conducts level shift detection. +The level shifts are handled with regressors in the Silverkite forecasting model. +This module contains a class which can generate corresponding regressors to the input dataframe. +This dataframe can be fed into the Silverkite model. +The level shift algorithm takes the first order differencing of the data. +It will calculate the z score on the differenced values. +If the z score is larger than the predefined threshold, +the dates will be marked as level shift and corresponding regressors will be created. 
+For every level shift, it will create a regressor that has values 0 before it and 1 after it, +so the model is able to shift at those dates.""" + +import re +import warnings +from datetime import datetime +from enum import Enum +from typing import List +from typing import Optional + +import pandas as pd + +from greykite.common.constants import LEVELSHIFT_COL_PREFIX_SHORT +from greykite.common.viz.timeseries_plotting import plot_multivariate + + +# An enum of supported time series frequencies +# see details at +# https://pandas.pydata.org/docs/user_guide/timeseries.html#timeseries-offset-aliases +class TimeSeriesFrequency(Enum): + T = "minutes" + H = "hours" + D = "days" + W = "weeks" + M = "months" + Y = "years" + + +def find_min_max_of_block(indices: List[int]): + """Given a list of indices, with some of them being consecutive, + find the start and end of each block. + Indices are considered to be in the same block if they are consecutive numbers. + For example, [1, 4, 5, 6, 12, 14, 15] will give [[1, 1], [4, 6], [12, 12], [14, 15]]. + + Parameters + ---------- + indices: `list` + List of indices in ascending order. + Example: [1, 4, 5, 6, 12, 14, 15]. + + Returns + ---------- + index_blocks : `list` + List of list with start and end index of each block. + Example: [[1, 1], [4, 6], [12, 12], [14, 15]]. + """ + if not indices: + return [] + block_start_i = 0 + index_blocks = [] + for i in range(1, len(indices)): + if indices[i] - indices[i - 1] != 1: + index_blocks.append((indices[block_start_i], indices[i - 1])) + block_start_i = i + index_blocks.append((indices[block_start_i], indices[-1])) + return index_blocks + + +class ShiftDetection: + """The level shifts are handled with regressors in the Silverkite forecasting model. + This class can generate corresponding regressors to the input dataframe that can be fed into the Silverkite model. 
+ For every level shift, we create a regressor that has values 0 before it and 1 after it, + so the model is able to shift at those dates. + + The main method to run level shift detection is `detect`. + The level shift algorithm takes the first order differencing of the data. + It will calculate the z score on the differenced values. + If the z score is larger than the predefined threshold, + the dates will be marked as level shift and corresponding regressors are created. + For example, if it detects a sudden increase during + [2020-01-01, 2020-01-03], it will create a regressor column named "ctp_2020_01_02" + by taking middle of the start and end date of the level shift period. + for dates before 2020_01_02, values will be set to 0 while after this date, values are 1. + + Attributes + ---------- + df : `pandas.DataFrame` + The dataframe used for level shift detection. + It must have two columns: + ``time_col`` indicating the column of time + (the format should be able to be parsed by pd.to_datetime), + and ``value_col`` indicating the column of observed time series values. + time_col : `str` + The column name for time column. + value_col : `str` + The column name for value column. + forecast_horizon: `int` + The number of datapoints to forecast. + This is used to generate dataframe after adding regressors for level shift. + freq : `str` + Frequency of the dataframe such as "D" for daily and "W" for weekly. + The allowed values can be found in the TimeSeriesFrequency Enum object. + z_score_cutoff : `int` + The z score cutoff value to define the level shift. + By default is 3. + df_shift : `pandas.DataFrame` + Dataframe with additional columns calculated by levelshift algorithm: + column "actual_diff" represents the abs of the first order differencing, + column "zscore" of actual_diff of the two adjacent dates. + This dataframe will be used for function `plot_level_shift`. + shift_dates : `list` + The shift start and end date. 
below is an example: + [(Timestamp("2020-01-11 00:00:00"), Timestamp("2020-01-12 00:00:00")), + (Timestamp("2020-01-21 00:00:00"), Timestamp("2020-01-21 00:00:00"))]. + regressor_col : `list` + List of names of the regressor columns which will start with `LEVELSHIFT_COL_PREFIX_SHORT` + E.g. ["ctp_2020_01_11"] + final_df : `pandas.DataFrame` + Dataframe with regressor columns which is expanded to future dates based on ``forecast_horizon``. + For example, if the input has 2 rows and forecast_horizon = 1, + it will create a new row with ``value_col`` = NaN and a few regressor columns of the level shift detected. + + Methods + ------- + detect : callable + Runs shift detection algorithm and create dataframe with levelshift + regressor columns for a given time series df. + plot_level_shift : callable + Plots the results after running fucntion `find_shifts` or `detect`. + find_shifts : callable + Finds the start and end dates of the level shift for a given time series df. + create_df_with_regressor : callable + Appends level shift regressor column to a given time series df. + create_regressor_for_future_dates : callable + Creates future dates with level shift regressor columns + to a given time series df. + """ + + def __init__(self): + self.original_df: Optional[pd.DataFrame] = None + self.time_col: Optional[str] = None + self.value_col: Optional[str] = None + self.forecast_horizon: Optional[int] = None + self.freq: Optional[str] = None + self.z_score_cutoff: Optional[int] = None + self.df_shift: Optional[pd.DataFrame] = None + self.shift_dates: Optional[list] = None + self.regressor_col: Optional[list] = None + self.final_df: Optional[pd.DataFrame] = None + + def detect( + self, + original_df: pd.DataFrame, + time_col: str, + value_col: str, + forecast_horizon: int = 0, + freq: str = "D", + z_score_cutoff: float = 3): + """This is the main function to create dataframe + with level shift regressor columns. 
+ It will detect the level shifts, return a dataframe with regressor columns + representing the dates of sudden value shift, and return a list of regressor + column names that will be passed into the config of the Forecaster.run_forecast_config. + + Attributes + ---------- + df : `pandas.DataFrame` + The dataframe used for level shift detection. + It must have at least two columns: + ``time_col`` in the format should be able to be parsed by pd.to_datetime + and ``value_col`` in the `int` or `float` format for observed time series values. + time_col : `str` + The column name for time column. + value_col : `str` + The column name for value column. + forecast_horizon: `int`, default is 0 (don't need to expand to future dates) + The number of datapoints to forecast. + This is used to generate dataframe after adding regressors for level shift. + freq : `str`, default is "D" + Frequency of the dataframe such as "D" for daily and "W" for weekly. + The allowed values can be found in the TimeSeriesFrequency Enum object. + z_score_cutoff : `float` + The z score cutoff value to define the level shift. + By default is 3. + + Returns + ------ + regressor_col: `list` + List of names of the regressor_col which will start with `LEVELSHIFT_COL_PREFIX_SHORT` + E.g. ["ct0", "ctp_2020_01_11"] + final_df: `pandas.DataFrame` + Dataframe with time_col, value_col and additional regressor columns + of the level shift detected. + It is also expanded to future dates based on ``forecast_horizon``. + For example, if the input has 2 rows and forecast_horizon = 1, + it will create a new row with value_col = NaN and a few regressor columns + of the level shift detected. + """ + # Initialize the variables. + self.original_df = original_df + self.time_col = time_col + self.value_col = value_col + self.forecast_horizon = forecast_horizon + self.freq = freq + self.z_score_cutoff = z_score_cutoff + + # Check frequency type matching the enum TimeSeriesFrequency. 
+ if freq not in TimeSeriesFrequency.__members__.keys(): + raise ValueError("freq should be one of the values of " + + str(list(TimeSeriesFrequency.__members__.keys()))) + + # Filter columns and ensure the datetime type of the time_col. + df = original_df[[time_col, value_col]].copy() + df[time_col] = pd.to_datetime(df[time_col]) + + # Find shifts. + self.df_shift, self.shift_dates = self.find_shifts(df, time_col, value_col, z_score_cutoff) + + # Create regressor columns. + df_w_regressor = self.create_df_with_regressor(self.df_shift, time_col, self.shift_dates) + self.regressor_col, self.final_df = self.create_regressor_for_future_dates( + df_w_regressor, + time_col, + value_col, + forecast_horizon, + freq) + return self.regressor_col, self.final_df + + def plot_level_shift(self): + """Makes a plot to show the observations with level shifts. + + Attributes + ---------- + None + + Returns + ---------- + fig : `plotly.graph_objects.Figure` The plot object. + """ + if self.df_shift is not None and self.time_col is not None: + fig = plot_multivariate(self.df_shift, self.time_col) + for pair in self.shift_dates: + fig.add_vrect(x0=pair[0], x1=pair[1], fillcolor="red", opacity=0.2) + fig.show() + else: + warnings.warn("please run either detect() or find_shifts() first.") + + def find_shifts( + self, + df: pd.DataFrame, + time_col: str, + value_col: str, + z_score_cutoff: int = 3): + """This is the main function to detect level shifts based on the z score threshold. + + Attributes + ---------- + df : `pandas.DataFrame` + The dataframe used for level shift detection. + It must have at least two columns: + ``time_col`` in the format should be able to be parsed by pd.to_datetime + and ``value_col`` in the `int` or `float` format for observed time series values. + time_col : `str` + The column name for time column. + value_col : `str` + The column name for value column. + z_score_cutoff : `int`, default is 3. + The z score cutoff value to define the level shift. 
+ + Returns + ---------- + df_find_shifts : `pandas.DataFrame` + Dataframe with two additional columns: + actual_diff: abs of the first order differencing. + zscore: standard deviation between two datapoints. + shift_dates : `list` + The shift start and end date. below is an example: + [(Timestamp("2020-01-11 00:00:00"), Timestamp("2020-01-12 00:00:00")), + (Timestamp("2020-01-21 00:00:00"), Timestamp("2020-01-21 00:00:00"))]. + """ + df_find_shifts = df.copy() + df_find_shifts["actual_diff"] = df_find_shifts[value_col].diff(1).abs() + df_find_shifts["zscore"] = ((df_find_shifts["actual_diff"] - df_find_shifts["actual_diff"].mean()) + / df_find_shifts["actual_diff"].std()) + + shifts = df_find_shifts[df_find_shifts["zscore"] > z_score_cutoff].index.tolist() + + shift_dates = find_min_max_of_block(shifts) + shift_dates = [(df_find_shifts[time_col].loc[pair[0]], df_find_shifts[time_col].loc[pair[1]]) for pair in shift_dates] + return df_find_shifts, shift_dates + + def create_df_with_regressor( + self, + df: pd.DataFrame, + time_col: str, + shift_dates: List[datetime]): + """Create dataframe with additional regressor columns of the level shift detected. + + Parameters + ---------- + df : `pandas.DataFrame` + The dataframe used for level shift detection. + It must have at least two columns: + ``time_col`` in the format should be able to be parsed by pd.to_datetime + and ``value_col`` in the `int` or `float` format for observed time series values. + time_col : `str` + The column name for time column. + shift_dates : `list` + The shift start and end date. this is generated by find_shifts(). + below is an example: + [(Timestamp("2020-01-11 00:00:00"), Timestamp("2020-01-12 00:00:00")), + (Timestamp("2020-01-21 00:00:00"), Timestamp("2020-01-21 00:00:00"))] + + Returns + ---------- + df_regressor : `pandas.DataFrame` + Dataframe with additional regressor columns of the level shift detected. + The regressor columns will start with `LEVELSHIFT_COL_PREFIX_SHORT`. 
+ It is in minutely format. + regressor is calculated based on the middle point of each shift time block. + This middle point might not exist in the raw data. E.g, for a shift block of + [(Timestamp("2020-09-01 00:00:00"), Timestamp("2020-10-01 00:00:00")], + regressor is created as `ctp_2020_09_16_00_00`. + """ + df_regressor = df.copy() + for i, pair in enumerate(shift_dates): + # If we have a consecutive block, we use the date in the middle to create the regressor column. + ctp = pair[0] + (pair[1] - pair[0]) / 2 + df_regressor[f"{LEVELSHIFT_COL_PREFIX_SHORT}_{ctp.strftime('%Y_%m_%d_%H_%M')}"] = ( + (df_regressor[time_col] >= ctp).astype(int)) + return df_regressor + + def create_regressor_for_future_dates( + self, + df: pd.DataFrame, + time_col: str, + value_col: str, + forecast_horizon: int, + freq: TimeSeriesFrequency): + """Expand dataframe to future dates based on the forecast_horizon. + + Parameters + ---------- + df : `pandas.DataFrame` + The dataframe generated by calling create_df_with_regressor(). + It must have at least the columns listed below: + time_col in the format should be able to be parsed by pd.to_datetime, + value_col in the `int` or `float` format for observed time series values, + a few regressor columns identified by level shift algorithm. + time_col : `str` + The column name for time column. + value_col : `str` + The column name for value column. + forecast_horizon : `int` + The number of datapoints to forecast. + This is used to generate dataframe after adding regressors for level shift. + freq : `str` + Frequency of the dataframe such as "D" for daily and "W" for weekly. + The allowed values can be found in the TimeSeriesFrequency Enum object. + + Returns + ---------- + ctp_col : `list` + List of names of the regressor columns which will start with `LEVELSHIFT_COL_PREFIX_SHORT` + E.g. 
["ctp_2020_01_11_00_00"] + df_result : `pandas.DataFrame` + Dataframe with regressor columns which is expanded to future dates based on ``forecast_horizon``. + For example, if the input has 2 rows and forecast_horizon = 1, + it will create a new row with ``value_col`` = NaN and a few regressor columns of the level shift detected. + """ + # Check frequency type matching the enum TimeSeriesFrequency. + if forecast_horizon != 0 and freq not in TimeSeriesFrequency.__members__.keys(): + raise ValueError("freq should be one of the values of " + + str(list(TimeSeriesFrequency.__members__.keys()))) + + df_regressor = df.copy() + re_pattern = re.compile(fr"^{LEVELSHIFT_COL_PREFIX_SHORT}_\d{{4}}_\d{{2}}_\d{{2}}_\d{{2}}_\d{{2}}$") + ctp_col = [column for column in df_regressor.columns if re_pattern.match(column)] + df_regressor = df_regressor[[time_col, value_col] + ctp_col] + if forecast_horizon == 0: + return ctp_col, df_regressor + expand_df = pd.DataFrame({ + time_col: pd.date_range(df_regressor[time_col].max(), freq=freq, periods=forecast_horizon+1)}) + expand_df[ctp_col] = 1 + df_result = pd.concat([df_regressor, expand_df.loc[1:]]).reset_index(drop=True) + return ctp_col, df_result diff --git a/greykite/algo/common/holiday_grouper.py b/greykite/algo/common/holiday_grouper.py index 4a1ef28..96f0121 100644 --- a/greykite/algo/common/holiday_grouper.py +++ b/greykite/algo/common/holiday_grouper.py @@ -72,8 +72,15 @@ class HolidayGrouper: Name of the holiday date column in ``holiday_df``. holiday_name_col : `str` Name of the holiday name column in ``holiday_df``. + holiday_impact_pre_num_days: `int`, default 0 + Default number of days before the holiday that will be modeled for holiday effect if the given holiday + is not specified in ``holiday_impact_dict``. + holiday_impact_post_num_days: `int`, default 0 + Default number of days after the holiday that will be modeled for holiday effect if the given holiday + is not specified in ``holiday_impact_dict``. 
holiday_impact_dict : `Dict` [`str`, Any] or None, default None - A dictionary containing the neighboring impacting days of a certain holiday. + A dictionary containing the neighboring impacting days of a certain holiday. This overrides the + default ``pre_num`` and ``post_num`` for each holiday specified here. The key is the name of the holiday matching those in the provided ``holiday_df``. The value is a tuple of two values indicating the number of neighboring days before and after the holiday. For example, a valid dictionary may look like: @@ -175,6 +182,8 @@ def __init__( holiday_df: pd.DataFrame, holiday_date_col: str, holiday_name_col: str, + holiday_impact_pre_num_days: int = 0, + holiday_impact_post_num_days: int = 0, holiday_impact_dict: Optional[Dict[str, Tuple[int, int]]] = None, get_suffix_func: Optional[Union[Callable, str]] = "wd_we"): self.df = df.copy() @@ -183,6 +192,8 @@ def __init__( self.holiday_df = holiday_df.copy() self.holiday_date_col = holiday_date_col self.holiday_name_col = holiday_name_col + self.holiday_impact_pre_num_days = holiday_impact_pre_num_days + self.holiday_impact_post_num_days = holiday_impact_post_num_days if holiday_impact_dict is None: holiday_impact_dict = {} self.holiday_impact_dict = holiday_impact_dict.copy() @@ -215,6 +226,8 @@ def __init__( holiday_df=self.holiday_df, holiday_date_col=HOLIDAY_DATE_COL, holiday_name_col=HOLIDAY_NAME_COL, + holiday_impact_pre_num_days=self.holiday_impact_pre_num_days, + holiday_impact_post_num_days=self.holiday_impact_post_num_days, holiday_impact_dict=self.holiday_impact_dict, get_suffix_func=self.get_suffix_func ) @@ -225,8 +238,8 @@ def group_holidays( use_relative_score: bool = True, min_n_days: int = 1, min_same_sign_ratio: float = 0.66, - min_abs_avg_score: float = 0.05, - clustering_method: str = "kde", + min_abs_avg_score: float = 0.03, + clustering_method: str = "kmeans", bandwidth: Optional[float] = None, bandwidth_multiplier: Optional[float] = 0.2, n_clusters: Optional[int] 
= 5, @@ -250,11 +263,11 @@ def group_holidays( scores for the ratio to achieve 0.66. Similarly, if an event has 3 occurrences, at least 2 of them must have the same directional impact. This parameter is intended to rule out holidays that have indefinite effects. - min_abs_avg_score : `float`, default 0.05 + min_abs_avg_score : `float`, default 0.03 The minimal average score of an event (across all its occurrences) to be kept before grouping. - When ``use_relative_score = True``, 0.05 means the effect must be greater than 5%. - clustering_method : `str`, default "kde" + When ``use_relative_score = True``, 0.03 means the effect must be greater than 3%. + clustering_method : `str`, default "kmeans" Clustering method used to group the holidays. Since we are doing 1-D clustering, current supported methods include (1) "kde" for kernel density estimation, and (2) "kmeans" for k-means clustering. @@ -623,11 +636,11 @@ def check_scores( return if show_pruned: - score_result = result_dict["score_result"] - score_result_avg = result_dict["score_result_avg"] - else: score_result = result_dict["score_result_original"] score_result_avg = result_dict["score_result_avg_original"] + else: + score_result = result_dict["score_result"] + score_result_avg = result_dict["score_result_avg"] res_dict = {} for key, value in score_result_avg.items(): if holiday_name_pattern in key: @@ -737,12 +750,14 @@ def _prune_holiday_by_score( for key, value in score_result.items(): # `key` is the name of the event. # `value` is a list of scores, we need to check the following. + # First removes NAs before the following filtering. + value_non_na = [val for val in value if not np.isnan(val)] # (1) It has minimum length `min_n_days`. - if len(value) < min_n_days: + if len(value_non_na) < min_n_days: continue # (2) The ratio of same-sign scores is at least `min_same_sign_ratio`. 
- signs = [(score > 0) * 1 for score in value] + signs = [(score > 0) * 1 for score in value_non_na] n_pos, n_neg = sum(signs), len(signs) - sum(signs) if max(n_pos, n_neg) < min_same_sign_ratio * (n_pos + n_neg): continue @@ -771,10 +786,13 @@ def expand_holiday_df_with_suffix( holiday_df: pd.DataFrame, holiday_date_col: str, holiday_name_col: str, + holiday_impact_pre_num_days: int = 0, + holiday_impact_post_num_days: int = 0, holiday_impact_dict: Optional[Dict[str, Tuple[int, int]]] = None, get_suffix_func: Optional[Union[Callable, str]] = "wd_we") -> pd.DataFrame: """Expands an input holiday dataframe ``holiday_df`` to include the neighboring days - specified in ``holiday_impact_dict``. + specified in ``holiday_impact_dict`` or through ``holiday_impact_pre_num_days`` and + `holiday_impact_post_num_days`. Also adds suffixes generated by ``get_suffix_func`` to better model the effects of events falling on different days of week. @@ -786,8 +804,15 @@ def expand_holiday_df_with_suffix( Name of the holiday date column in ``holiday_df``. holiday_name_col : `str` Name of the holiday name column in ``holiday_df``. + holiday_impact_pre_num_days: `int`, default 0 + Default number of days before the holiday that will be modeled for holiday effect if the given holiday + is not specified in ``holiday_impact_dict``. + holiday_impact_post_num_days: `int`, default 0 + Default number of days after the holiday that will be modeled for holiday effect if the given holiday + is not specified in ``holiday_impact_dict``. holiday_impact_dict : `Dict` [`str`, Any] or None, default None - A dictionary containing the neighboring impacting days of a certain holiday. + A dictionary containing the neighboring impacting days of a certain holiday. This overrides the + default ``pre_num`` and ``post_num`` for each holiday specified here. The key is the name of the holiday matching those in the provided ``holiday_df``. 
The value is a tuple of two values indicating the number of neighboring days before and after the holiday. For example, a valid dictionary may look like: @@ -842,7 +867,7 @@ def get_suffix_func(x): return "" if row[holiday_name_col] in holiday_impact_dict.keys(): pre_search_days, post_search_days = holiday_impact_dict[row[holiday_name_col]] else: - pre_search_days, post_search_days = 0, 0 + pre_search_days, post_search_days = holiday_impact_pre_num_days, holiday_impact_post_num_days for i in range(-pre_search_days, post_search_days + 1): original_dow_flag = get_suffix_func(row[holiday_date_col]) diff --git a/greykite/algo/common/holiday_utils.py b/greykite/algo/common/holiday_utils.py index 6792ea2..54ae41d 100644 --- a/greykite/algo/common/holiday_utils.py +++ b/greykite/algo/common/holiday_utils.py @@ -18,10 +18,25 @@ # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# original author: Yi Su +# original author: Yi Su, Kaixu Yang """Constants and utility functions for `HolidayInferrer` and `HolidayGrouper`.""" import datetime +from typing import Any +from typing import Dict +from typing import List +from typing import Optional + +import pandas as pd +from pandas.tseries.frequencies import to_offset + +from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import get_event_pred_cols +from greykite.common.constants import EVENT_DF_DATE_COL +from greykite.common.constants import EVENT_DF_LABEL_COL +from greykite.common.constants import EVENT_PREFIX +from greykite.common.constants import EVENT_SHIFTED_SUFFIX_AFTER +from greykite.common.constants import EVENT_SHIFTED_SUFFIX_BEFORE +from greykite.common.python_utils import split_offset_str HOLIDAY_NAME_COL = "country_holiday" @@ -35,10 +50,21 @@ """ HOLIDAY_IMPACT_DICT = { + "All Saints Day": (1, 1), # 11/1. 
+ "Ascension Day": (1, 4), # Thursday. + "Assumption of Mary": (4, 4), # 8/15, duplicated with "India_Independence Day". + "Chinese New Year": (4, 4), # Varying. + "Christmas Day": (4, 3), # 12/25. + "Diwali": (4, 4), # Varying. + "Easter Sunday": (6, 1), # Sunday. + "Eid al-Adha": (1, 1), # Varying. + "Eid al-Fitr": (1, 1), # Varying. + "Europe DST End": (0, 0), # Sunday. + "Europe DST Start": (0, 0), # Sunday. "Halloween": (1, 1), # 10/31. "New Year's Day": (3, 3), # 1/1. } -"""An example. Number of pre/post days that a holiday has impact on. +"""Number of pre/post days that a holiday has impact on. For example, if Halloween has neighbor (1, 1), Halloween_minus_1 and Halloween_plus_1 will be generated as two additional events. If not specified, (0, 0) will be used. @@ -62,6 +88,8 @@ def get_dow_grouped_suffix(date: datetime.datetime) -> str: return "_Sat" elif date.day_name() == "Sunday": return "_Sun" + elif date.day_name() in ["Friday", "Monday"]: + return "_Mon/Fri" else: return "_WD" @@ -83,3 +111,111 @@ def get_weekday_weekend_suffix(date: datetime.datetime) -> str: return "_WE" else: return "_WD" + + +def get_autoreg_holiday_interactions( + daily_event_df_dict: Dict[str, pd.DataFrame], + lag_names: List[str]) -> List[str]: + """Gets the interaction terms between holidays and autoregression terms or other lag terms. + + Parameters + ---------- + daily_event_df_dict : `Dict` [`str`, `pandas.DataFrame`] + The input event configuration. + A dictionary with keys being the event names and values being a pandas DataFrame of dates. + See `~greykite.algo.forecast.silverkite.forecast_silverkite` for details. + lag_names : `List` [`str`] + A list of lag names, e.g. ["y_lag1"]. + Each will be interacting with the holidays. + + Returns + ------- + interactions : `List` [`str`] + A list of interaction terms between holidays and lags, to be passed to ``extra_pred_cols``. 
+ """ + holiday_names = get_event_pred_cols(daily_event_df_dict) + interactions = [f"{holiday}:{lag}" for lag in lag_names for holiday in holiday_names] + return interactions + + +def add_shifted_events( + daily_event_df_dict: Dict[str, pd.DataFrame], + shifted_effect_lags: Optional[List[str]] = None, + event_df_date_col: str = EVENT_DF_DATE_COL, + event_df_label_col: str = EVENT_DF_LABEL_COL) -> Dict[str, Any]: + """This function does two things: + + - (1) adds shifted events to ``daily_event_df_dict`` and returns the new event dictionary. + - (2) returns a list of new column names to be added in the model. + This is useful when we need to remove these main effects from the model. + + Parameters + ---------- + daily_event_df_dict : `Dict` [`str`, `pandas.DataFrame`] + The input event configuration. + A dictionary with keys being the event names and values being a pandas DataFrame of dates. + See `~greykite.algo.forecast.silverkite.forecast_silverkite` for details. + shifted_effect_lags : `List` [`str`] or None, default None + Additional neighbor events based on given events. + For example, passing ["-1D", "7D"] will add extra daily events which are 1 day before + and 7 days after the given events. + Offset format is {d}{freq} with any integer plus a frequency string. Must be parsable by pandas ``to_offset``. + The new events' names will be the current events' names with suffix "{offset}_before" or "{offset}_after". + For example, if we have an event named "US_Christmas Day", + a "7D" shift will have name "US_Christmas Day_7D_after". + This is useful when you expect an offset of the current holidays also has impact on the + time series, or you want to interact the lagged terms with autoregression. + event_df_date_col : `str`, default ``EVENT_DF_DATE_COL`` + Date column of the dataframes in ``daily_event_df_dict``. + event_df_label_col : `str`, default ``EVENT_DF_LABEL_COL`` + Label column of the dataframes in ``daily_event_df_dict``. 
+ + Returns + ------- + shifted_events_dict : `Dict` [`str`, `Any`] + A dictionary of results: + + - "new_daily_event_df_dict": the new event dictionary that expands the input + ``daily_event_df_dict`` by adding new shifted events. + Note that this is intended to be used to manually add the events. + One can also specify the ``events["daily_event_shifted_effect"]`` field to directly add them. + - "shifted_events_cols": the column names of the newly added shifted event in the model. + This is useful when we need to remove these main effects from the model. + One can specify this in ``drop_pred_cols`` to achieve this. + """ + if shifted_effect_lags is None: + shifted_effect_lags = [] + + new_daily_event_df_dict = {} + shifted_events_cols = [] + drop_pred_cols = [] + for order in shifted_effect_lags: + num, freq = split_offset_str(order) + if num == 0: + break + num = int(num) + lag_offset = to_offset(order) + for name, event_df in daily_event_df_dict.items(): + new_event_df = event_df.copy() + new_event_df[event_df_date_col] = pd.to_datetime(new_event_df[event_df_date_col]) + new_event_df[event_df_date_col] += lag_offset + # Sets suffix of the new event. + suffix = EVENT_SHIFTED_SUFFIX_BEFORE if num < 0 else EVENT_SHIFTED_SUFFIX_AFTER + # Creates a new dataframe to add to `new_daily_event_df_dict`. + new_name = f"{name}_{abs(num)}{freq}{suffix}" + new_event_df[event_df_label_col] = new_name + new_daily_event_df_dict[new_name] = new_event_df + # Records the new column name in the model. 
+ new_col = f"{EVENT_PREFIX}_{new_name}" + shifted_events_cols.append(new_col) + + if len(new_daily_event_df_dict) > 0: + drop_pred_cols = get_event_pred_cols(new_daily_event_df_dict) + + new_daily_event_df_dict.update(daily_event_df_dict) + + return { + "new_daily_event_df_dict": new_daily_event_df_dict, + "shifted_events_cols": shifted_events_cols, + "drop_pred_cols": drop_pred_cols + } diff --git a/greykite/algo/common/ml_models.py b/greykite/algo/common/ml_models.py index 0c5b3a8..208003a 100644 --- a/greykite/algo/common/ml_models.py +++ b/greykite/algo/common/ml_models.py @@ -474,7 +474,21 @@ def fit_ml_model( - "max_admissible_value" : maximum acceptable value - "normalize_df_func" : normalization function - "regression_weight_col" : regression weight column - + - "alpha" : the regularization term from the linear / ridge regression. + Note that the OLS (ridge) estimator is ``inv(X.T @ X + alpha * np.eye(p)) @ X.T @ Y =: H @ Y``. + - "p_effective" : effective number of parameters. + In linear regressions, it is also equal to ``trace(X @ H)``, where H is defined above. + ``X @ H`` is also called the hat matrix. + - "h_mat" : the H matrix (p by n) in linear regression estimator, as defined above. + Note that H is not necessarily of full-rank p even in ridge regression. + ``H = inv(X.T @ X + alpha * np.eye(p)) @ X.T``. + - "sigma_scaler" : theoretical scaler of the estimated sigma. + Volatility model estimates sigma by taking the sample standard deviation, and + we need to scale it by ``np.sqrt((n_train - 1) / (n_train - p_effective))`` to obtain + an unbiased estimator. + - "x_mean" : column mean of ``x_mat`` as a row vector. + This is stored and used in ridge regression to compute the prediction intervals. + In other methods, it is set to `None`. """ # Builds model matrices. 
@@ -668,7 +682,8 @@ def fit_ml_model( "alpha": alpha, "h_mat": h_mat, "p_effective": p_effective, - "sigma_scaler": sigma_scaler} + "sigma_scaler": sigma_scaler, + "x_mean": x_mean} if uncertainty_dict is None: fitted_df = predict_ml( diff --git a/greykite/algo/common/model_summary_utils.py b/greykite/algo/common/model_summary_utils.py index 15a5427..96199df 100644 --- a/greykite/algo/common/model_summary_utils.py +++ b/greykite/algo/common/model_summary_utils.py @@ -1639,7 +1639,7 @@ def create_title_section(): content : `str` Title section. """ - content = " Model Summary ".center(80, "=") + "\n\n" + content = " Forecast Model Summary ".center(80, "=") + "\n\n" return content diff --git a/greykite/algo/forecast/silverkite/auto_config.py b/greykite/algo/forecast/silverkite/auto_config.py index 18add6b..b6a23fa 100644 --- a/greykite/algo/forecast/silverkite/auto_config.py +++ b/greykite/algo/forecast/silverkite/auto_config.py @@ -19,6 +19,7 @@ # original author: Kaixu Yang """Automatically populates parameters based on input time series.""" +import inspect from datetime import timedelta from typing import Dict from typing import List @@ -28,12 +29,20 @@ import pandas as pd from greykite.algo.changepoint.adalasso.auto_changepoint_params import generate_trend_changepoint_detection_params -from greykite.algo.common.holiday_inferrer import HolidayInferrer +from greykite.algo.common.holiday_grouper import HolidayGrouper from greykite.algo.common.seasonality_inferrer import SeasonalityInferConfig from greykite.algo.common.seasonality_inferrer import SeasonalityInferrer from greykite.algo.common.seasonality_inferrer import TrendAdjustMethodEnum +from greykite.algo.forecast.silverkite.forecast_simple_silverkite_helper import split_events_into_dictionaries +from greykite.common import constants as cst from greykite.common.constants import GrowthColEnum from greykite.common.constants import TimeFeaturesEnum +from greykite.common.features.timeseries_features import 
add_event_window_multi +from greykite.common.features.timeseries_features import get_holidays +from greykite.common.logging import LoggingLevelEnum +from greykite.common.logging import log_message +from greykite.common.python_utils import update_dictionary +from greykite.common.time_properties import describe_timeseries def get_auto_seasonality( @@ -253,70 +262,315 @@ def get_auto_holidays( df: pd.DataFrame, time_col: str, value_col: str, - countries: List[str] = ("UnitedStates", "India", "UnitedKingdom"), - forecast_horizon: Optional[int] = None, - daily_event_dict_override: Optional[Dict[str, pd.DataFrame]] = None): - """Automatically infers significant holidays and their neighboring days. + start_year: int, + end_year: int, + pre_num: int = 2, + post_num: int = 2, + pre_post_num_dict: Optional[Dict[str, pd.DataFrame]] = None, + holiday_lookup_countries: List[str] = ("UnitedStates", "India", "UnitedKingdom"), + holidays_to_model_separately: Optional[List[str]] = None, + daily_event_df_dict: Optional[Dict] = None, + auto_holiday_params: Optional[Dict] = None): + """Automatically group holidays and their neighboring days based on estimated holiday impact. Parameters ---------- df : `pandas.DataFrame` - The input time series. + The timeseries data used to infer holiday impact if no ``df`` is passed through ``auto_holiday_params``. time_col : `str` - The column name for timestamps in ``df``. + The column name for timestamps in ``df`` that will be used for holiday impact estimation in ``HolidayGrouper``. + If ``time_col`` is passed in through ``auto_holiday_params``, this will be ignored. value_col : `str` - The column name for values in ``df``. - countries : `list` [`str`], default ("UnitedStates", "India", "UnitedKingdom") - A list of countries to look up holidays. - forecast_horizon : `int` or None, default None - The forecast horizon used to calculate the years needed to populate holidays. 
- daily_event_dict_override : `dict` [`str`, `pandas.DataFrame`] or None, default None - The daily event dict passed to the configuration. - When auto holiday is activated, - the entries in ``daily_event_dict`` will be added - to the holidays' ``daily_event_dict``. + The column name for values in ``df`` that will be used for holiday impact estimation in ``HolidayGrouper``. + If ``value_col`` is passed in through ``auto_holiday_params``, this will be ignored. + start_year : `int` + Year of first training data point, used to generate holiday events based on ``holiday_lookup_countries``. + This will not be used if `holiday_df` is passed in through ``auto_holiday_params``. + end_year : `int` + Year of last forecast data point, used to generate holiday events based on ``holiday_lookup_countries``. + This will not be used if `holiday_df` is passed in through ``auto_holiday_params``. + pre_num : `int`, default 2 + Model holiday effects for ``pre_num`` days before the holiday. This will be used as + ``holiday_impact_pre_num_days`` when constructing ``HolidayGrouper`` if ``holiday_impact_pre_num_days`` + is not passed in through ``auto_holiday_params``. + post_num : `int`, default 2 + Model holiday effects for ``post_num`` days after the holiday. This will be used as + ``holiday_impact_post_num_days`` when constructing ``HolidayGrouper`` if ``holiday_impact_post_num_days`` + is not passed in through ``auto_holiday_params``. + pre_post_num_dict : `dict` [`str`, (`int`, `int`)] or None, default None + Overrides ``pre_num`` and ``post_num`` for each holiday in + ``holidays_to_model_separately`` and in ``HolidayGrouper`` (as ``holiday_impact_dict``) + if ``holiday_impact_dict`` is not passed in through ``auto_holiday_params``.
+ For example, if ``holidays_to_model_separately`` contains "Thanksgiving" and "Labor Day", + this parameter can be set to ``{"Thanksgiving": [1, 3], "Labor Day": [1, 2]}``, + denoting that the "Thanksgiving" ``pre_num`` is 1 and ``post_num`` is 3, and "Labor Day" + ``pre_num`` is 1 and ``post_num`` is 2. + Holidays not specified use the default given by ``pre_num`` and ``post_num``. + holiday_lookup_countries : `list` [`str`], default ("UnitedStates", "India", "UnitedKingdom") + A list of countries to look up holidays. This will be used with `daily_event_df_dict` to + generate `holiday_df` that contains holidays that will be modeled when ``holiday_df`` is + not passed in through ``auto_holiday_params``. Otherwise, ``auto_holiday_params["holiday_df"]`` + will be used and this will be ignored. + holidays_to_model_separately : `list` [`str`] or None + Which holidays to include in the model by themselves. These holidays will not be passed into the + ``HolidayGrouper``. The model creates a separate key, value for each item in ``holidays_to_model_separately`` + and their neighboring days. Generally, this is recommended to be kept as `None` unless some specific + assumptions on holidays need to be applied. + daily_event_df_dict : `dict` [`str`, `pandas.DataFrame`] or None, default None + A dictionary of holidays to be used in ``HolidayGrouper`` to generate `holiday_df` which contains holidays + that will be modeled when ``holiday_df`` is not passed in through ``auto_holiday_params``. + Each key represents a holiday name, and the values are data frames, with a date column that records all dates + for the corresponding holiday. + Otherwise, ``auto_holiday_params["holiday_df"]`` will be used and this will be ignored. + auto_holiday_params: `dict` or None, default None + This dictionary takes in parameters that can be passed in and used by holiday grouper when + ``auto_holiday`` is set to `True`. It overwrites all configurations passed in or generated by + other inputs.
+ Examples of arguments that can be included here include: + + ``"df"`` : `pandas.DataFrame` + Data frame used by `HolidayGrouper` to infer holiday impact. If this exists, + the ``df`` argument will be ignored. + ``"holiday_df"`` : `pandas.DataFrame` + Input holiday dataframe that contains the dates and names of the holidays. + If this exists, the following parameters used to generate holiday list will be ignored: + + * "start_year" + * "end_year" + * "holiday_lookup_countries" + * "daily_event_df_dict" + + ``"holiday_date_col"`` : `str` + This will be used as the date column when ``holiday_df`` is passed in through + ``auto_holiday_params``. + ``"holiday_name_col"`` : `str` + This will be used as the holiday name column when ``holiday_df`` is passed in through + ``auto_holiday_params``. + + Please refer to + `~greykite.algo.common.holiday_grouper.HolidayGrouper` for more details. Returns ------- - daily_event_dict : `dict` [`str`, `pandas.DataFrame`] - A dictionary with the keys being the event names - and values being the dataframes including 2 columns: + daily_event_df_dict : `dict` [`str`, `pandas.DataFrame` [cst.EVENT_DF_DATE_COL, cst.EVENT_DF_LABEL_COL]] + A dictionary with the keys being the holiday group names and values being the dataframes including 2 columns: - EVENT_DF_DATE_COL : the events' occurrence dates. - EVENT_DF_LABEL_COL : the events' names. + Suitable for use as ``daily_event_df_dict`` parameter in `forecast_silverkite`. + """ - # Calculates the number of extra years needed to cover the forecast period. - if forecast_horizon and forecast_horizon > 0: - timestamps = pd.to_datetime(df[time_col]) - min_increment = min((timestamps - timestamps.shift(1)).dropna()) - min_increment_in_days = min_increment / timedelta(days=1) - forecast_horizon_in_days = min_increment_in_days * forecast_horizon - extra_years = int(np.ceil(forecast_horizon_in_days / 366)) + 2 # +2 in case of incomplete years + + # Initializes `group_holiday_params`, the parameters to pass in for function `group_holidays`.
+ if auto_holiday_params is None: + group_holiday_params = dict() else: - extra_years = 3 - # Automatically infers holidays. - hi = HolidayInferrer() - result = hi.infer_holidays( - df=df, - time_col=time_col, - value_col=value_col, - countries=countries, - pre_search_days=2, - post_search_days=2, - baseline_offsets=[-7, 7], - plot=False, - independent_holiday_thres=0.85, - together_holiday_thres=0.95, - extra_years=extra_years - ) - if result is None: - # This happens when data is super-daily. - holiday_dict = {} + group_holiday_params = auto_holiday_params.copy() + + # Initializes `grouper_init_params`, the parameters for `HolidayGrouper`. Anything passed through + # `auto_holiday_params`, which are stored in `group_holiday_params`, will be used for initializations. + # Uses `pop` to remove these parameters from `group_holiday_params` if existed at the same time. + + # Handles `group_holiday_params["df"]` separately to avoid an ambiguity error on using `or` to check if it is empty. + if group_holiday_params.get("df") is None: + group_holiday_params["df"] = df + + grouper_init_params = dict( + df=group_holiday_params.pop("df"), # This will be standardized and updated later. + time_col=group_holiday_params.pop("time_col", None) or time_col, + value_col=group_holiday_params.pop("value_col", None) or value_col, + holiday_df=group_holiday_params.pop("holiday_df", None), # This will be updated later. + holiday_date_col=group_holiday_params.pop("holiday_date_col", None) or cst.EVENT_DF_DATE_COL, + # This will be standardized and updated later. + holiday_name_col=group_holiday_params.pop("holiday_name_col", None) or cst.EVENT_DF_LABEL_COL, + # This will be standardized and updated later. 
+ holiday_impact_pre_num_days=group_holiday_params.pop("holiday_impact_pre_num_days", None) or pre_num, + holiday_impact_post_num_days=group_holiday_params.pop("holiday_impact_post_num_days", None) or post_num, + holiday_impact_dict=group_holiday_params.pop("holiday_impact_dict", None) or pre_post_num_dict) + + # Checks and updates if any other parameter for `HolidayGrouper` exists in `group_holiday_params` that has not + # been initialized through `grouper_init_params`. + # ``.copy()`` is used below to avoid altering the dictionary keys within iteration on same keys + group_holiday_params_key_copy = group_holiday_params.copy().keys() + for key in group_holiday_params_key_copy: + if key in inspect.signature(HolidayGrouper).parameters: + grouper_init_params[key] = group_holiday_params.pop(key) + + # constructs initial `holiday_df`. If `holiday_df` is passed through `auto_holiday_params`, use it. Otherwise, + # constructs it based on input country list and/or user input holidays through `daily_event_df_dict`. + if grouper_init_params["holiday_df"] is not None: + holiday_df = grouper_init_params["holiday_df"].copy() + holiday_date_col = grouper_init_params["holiday_date_col"] + holiday_name_col = grouper_init_params["holiday_name_col"] + if not {holiday_date_col, holiday_name_col}.issubset(holiday_df.columns): + raise ValueError(f"Columns `{holiday_date_col}` and/or " + f"`{holiday_name_col}` not found in input `holiday_df`.") + # Standardizes `holiday_name_col` and `holiday_name_col` in `holiday_df`. + holiday_df = holiday_df.rename(columns={ + holiday_date_col: cst.EVENT_DF_DATE_COL, + holiday_name_col: cst.EVENT_DF_LABEL_COL + }) + else: + # Constructs `holiday_df` based on input country list and/or user input holidays through `daily_event_df_dict`. + # When `holiday_lookup_countries` is not empty, the corresponding `holiday_df_from_countries` is constructed. 
+ holiday_df_from_countries = None + if len(holiday_lookup_countries) > 0: + holiday_df_from_countries_dict = get_holidays( + countries=holiday_lookup_countries, + year_start=start_year - 1, + year_end=end_year + 1) + holiday_df_from_countries_list = [holidays for _, holidays in holiday_df_from_countries_dict.items()] + holiday_df_from_countries = pd.concat(holiday_df_from_countries_list) + # Removes the observed holidays and only keeps the original holidays. This assumes that the original + # holidays are consistently present in the output of `get_holidays` when their observed counterparts are + # included. This assumption has been verified for all available countries within the date range of + # 2015 to 2030. + cond_observed_holiday = holiday_df_from_countries[cst.EVENT_DF_LABEL_COL].apply( + lambda x: True if any(i in x for i in ["(Observed)", "(observed)"]) else False) + holiday_df_from_countries = holiday_df_from_countries.loc[~cond_observed_holiday, [cst.EVENT_DF_DATE_COL, cst.EVENT_DF_LABEL_COL]] + + # When `daily_event_df_dict` is not empty, its format gets converted to dataframe `holiday_df_from_dict`. + holiday_df_from_dict = None + if daily_event_df_dict: + holiday_df_from_dict_list = [] + for holiday_name, holiday_dates in daily_event_df_dict.items(): + holiday_dates[cst.EVENT_DF_LABEL_COL] = holiday_name + # Checks and finds the first column where values can be recognized by `pd.to_datetime`, uses it as + # the date column and cast it as `cst.EVENT_DF_DATE_COL`. + flag = False + for col in holiday_dates.columns: + try: + holiday_dates[cst.EVENT_DF_DATE_COL] = pd.to_datetime(holiday_dates[col]) + flag = True + except ValueError: + continue + # When a valid date column is found, breaks the loop. 
+ if flag: + break + if flag is False: + raise ValueError(f"No valid date column is found in data frames in `daily_event_df_dict` to use " + f"as {cst.EVENT_DF_DATE_COL}.") + holiday_df_from_dict_list.append(holiday_dates[[cst.EVENT_DF_DATE_COL, cst.EVENT_DF_LABEL_COL]]) + holiday_df_from_dict = pd.concat(holiday_df_from_dict_list) + + if (holiday_df_from_countries is None) & (holiday_df_from_dict is None): + raise ValueError("Automatic holiday grouping is enabled. Holiday list needs to be specified through" + "`holiday_lookup_countries` or `daily_event_df_dict`. Currently, None is found.") + holiday_df = ( + pd.concat([holiday_df_from_countries, holiday_df_from_dict]) + .drop_duplicates() + .reset_index(drop=True) + ) + # Makes sure the `holiday_date_col` and `holiday_name_col` in `grouper_init_params` are standardized values. + grouper_init_params["holiday_date_col"] = cst.EVENT_DF_DATE_COL + grouper_init_params["holiday_name_col"] = cst.EVENT_DF_LABEL_COL + + # Separates holidays specified in `holidays_to_model_separately` from `holiday_df`, so that they will not be passed + # into `HolidayGrouper`. When `holidays_to_model_separately` is not empty, a dictionary is constructed with + # each key presents one holiday or a specific neighboring day for all holidays in `holidays_to_model_separately`. + if holidays_to_model_separately is None: + holidays_to_model_separately = [] + elif holidays_to_model_separately == "auto": + holidays_to_model_separately = [] + log_message("Automatic holiday grouping is enabled. The `holidays_to_model_separately` parameter should be " + "`None` or a list. Since the current input is 'auto', it is set to an empty list and no" + "holiday will be modeled separately.", + LoggingLevelEnum.WARNING) + + if not isinstance(holidays_to_model_separately, (list, tuple)): + raise ValueError( + f"Automatic holiday grouping is enabled. 
The `holidays_to_model_separately` parameter should be `None` or " + f"a list, found {holidays_to_model_separately}") + elif len(holidays_to_model_separately) == 0: + holiday_df_exclude_separate = holiday_df.copy() + holiday_df_dict_separate_with_effect = dict() else: - holiday_dict = hi.generate_daily_event_dict() - if daily_event_dict_override is None: - daily_event_dict_override = {} - # Updates the result with pre-specified daily events. - holiday_dict.update(daily_event_dict_override) - return holiday_dict + holiday_to_separate_condition = holiday_df[cst.EVENT_DF_LABEL_COL].isin(holidays_to_model_separately) + holiday_df_exclude_separate = holiday_df[~holiday_to_separate_condition] + holiday_df_separate = holiday_df[holiday_to_separate_condition] + + # Initializes the holiday dictionary for holidays modeled separately. Each key corresponds to + # a holiday. + holiday_df_dict_separate_with_effect = split_events_into_dictionaries( + events_df=holiday_df_separate, + events=holidays_to_model_separately) + + # Removes "'" from keys in `pre_post_num_dict_processed` because they are + # removed from holiday names by `split_events_into_dictionaries`. + if grouper_init_params["holiday_impact_dict"]: + pre_post_num_dict_processed = grouper_init_params["holiday_impact_dict"].copy() + for key in pre_post_num_dict.keys(): + new_key = key.replace("'", "") + pre_post_num_dict_processed[new_key] = pre_post_num_dict_processed.pop(key) + else: + pre_post_num_dict_processed = dict() + + # Adds shifted events. 
+ shifted_event_dict = add_event_window_multi( + event_df_dict=holiday_df_dict_separate_with_effect, + time_col=cst.EVENT_DF_DATE_COL, + label_col=cst.EVENT_DF_LABEL_COL, + time_delta="1D", + pre_num=grouper_init_params["holiday_impact_pre_num_days"], + post_num=grouper_init_params["holiday_impact_post_num_days"], + pre_post_num_dict=pre_post_num_dict_processed) + holiday_df_dict_separate_with_effect.update(shifted_event_dict) + + # Raises warning if `holiday_df_exclude_separate` becomes empty, as there will be no holidays for grouping. + # Returns `holiday_df_dict_separate_with_effect` in this case. + if holiday_df_exclude_separate.empty: + log_message(f"All input holidays are modeled separately and no remaining holidays can be used by the " + f"holiday grouper. A dictionary of all holidays with effects modeled separately is returned.", + LoggingLevelEnum.WARNING) + return holiday_df_dict_separate_with_effect + + # Reassigns `grouper_init_params["holiday_df"]` with `holiday_df_exclude_separate` that excludes holidays that are + # modeled separately. + grouper_init_params["holiday_df"] = holiday_df_exclude_separate.copy() + + # Checks if `df` in `grouper_init_params` is None or empty. + if (grouper_init_params["df"] is None) or grouper_init_params["df"].empty: + raise ValueError("Automatic holiday grouping is enabled. Dataframe cannot be `None` or empty.") + # Checks if `df`, `time_col`, `value_col` in `grouper_init_params` are valid. + # Reassigns the values as they can potentially be overriden by `auto_holiday_params`. + df = grouper_init_params["df"].copy() + time_col = grouper_init_params["time_col"] + value_col = grouper_init_params["value_col"] + if not {time_col, value_col}.issubset(df.columns): + raise ValueError("Input `df` for holiday grouper does not contain `time_col` or `value_col`.") + + # First pre-processes `df` to determine if it is appropriate for holiday effects inference. + # If the data is sub-daily, aggregates it to daily before grouping. 
+ # If the data is less granular than daily, raise ValueError. + + # Converts time column. + df[time_col] = pd.to_datetime(df[time_col]) + df = df.sort_values(time_col).reset_index(drop=True) + # First infers frequency, if more granular than daily, aggregates it to daily. + # If less granular than daily, raise ValueError. + time_stats = describe_timeseries(df=df, time_col=time_col) + if time_stats["freq_in_days"] > 1.0: + raise ValueError("Input holiday df for holiday grouper has frequency less than daily. " + "Holiday effect cannot be inferred.") + elif time_stats["freq_in_days"] < 1.0: + df_tmp = df.resample("D", on=time_col).agg({value_col: np.nanmean}) + df = (df_tmp.drop(columns=time_col).reset_index() if time_col in df_tmp.columns + else df_tmp.reset_index()) + # Reassigns back processed `df`. + grouper_init_params["df"] = df + + # Calls holiday grouper. + hg = HolidayGrouper( + **grouper_init_params) + hg.group_holidays( + **group_holiday_params) + + daily_event_df_dict_exclude_separate = hg.result_dict["daily_event_df_dict"] + + # Adds back holidays that are modeled separately with their neighboring days. 
+ daily_event_df_dict_final = update_dictionary( + daily_event_df_dict_exclude_separate, + overwrite_dict=holiday_df_dict_separate_with_effect) + + return daily_event_df_dict_final diff --git a/greykite/algo/forecast/silverkite/forecast_simple_silverkite.py b/greykite/algo/forecast/silverkite/forecast_simple_silverkite.py index e1be02b..3a0d7c1 100644 --- a/greykite/algo/forecast/silverkite/forecast_simple_silverkite.py +++ b/greykite/algo/forecast/silverkite/forecast_simple_silverkite.py @@ -90,6 +90,7 @@ def convert_params( holiday_post_num_days: int = 2, holiday_pre_post_num_dict: Optional[Dict] = None, daily_event_df_dict: Optional[Dict] = None, + auto_holiday_params: Optional[Dict] = None, daily_event_neighbor_impact: Optional[Union[int, List[int], callable]] = None, daily_event_shifted_effect: Optional[List[str]] = None, auto_growth: bool = False, @@ -232,19 +233,19 @@ def convert_params( If None, uses the defaults in `~greykite.algo.common.ml_models.fit_model_via_design_matrix`. auto_holiday : `bool`, default False Whether to automatically infer holiday configuration based on the input timeseries. - The candidate lookup countries are specified by ``holiday_lookup_countries``. - If True, the following parameters will be ignored: + If True, holiday groups will be constructed through estimated holiday effects. If False, + the other specified holiday configuration will be used to generate holiday features. + Notice that the following parameters serve different uses in two case: - * "holidays_to_model_separately" - * "holiday_pre_num_days" - * "holiday_post_num_days" - * "holiday_pre_post_num_dict" + * "Daily_event_df_dict": When ``auto_holiday = False``, this specifies additional events that are + ready for use in modeling; when ``auto_holiday = True``, this provides a list of holidays that + can be processed and used as inputs for ``HolidayGrouper``. + * "auto_holiday_params": It contains customized parameter values that can be used for ``HolidayGrouper``. 
+ When ``auto_holiday = False``, this will not be used. - For details, see `~greykite.algo.common.holiday_inferrer.HolidayInferrer`. - Extra events specified in ``daily_event_df_dict`` will be added to the inferred holidays. + For details, see `~greykite.algo.common.holiday_grouper.HolidayGrouper`. holiday_lookup_countries : `list` [`str`] or "auto" or None, optional, default "auto" - The countries that contain the holidays you intend to model - (``holidays_to_model_separately``). + The countries that contain the holidays you intend to model. * If "auto", uses a default list of countries that contain the default ``holidays_to_model_separately``. @@ -254,23 +255,28 @@ def convert_params( holidays_to_model_separately : `list` [`str`] or "auto" or `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES` or None, optional, default "auto" # noqa: E501 Which holidays to include in the model. - The model creates a separate key, value for each item in ``holidays_to_model_separately``. - The other holidays in the countries are grouped together as a single effect. + The model creates a separate key, value for each item in ``holidays_to_model_separately`` and their neighboring + days. If ``auto_holiday = False``, the other holidays in the countries are grouped together as a single effect. + If ``auto_holiday = True``, the other holidays will be processed by ``HolidayGrouper`` and assigned to + holiday groups based on their estimated impact. * If "auto", uses a default list of important holidays. See `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.HOLIDAYS_TO_MODEL_SEPARATELY_AUTO`. * If `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES`, - uses all available holidays in ``holiday_lookup_countries``. This can often - create a model that has too many parameters, and should typically be avoided. 
+ uses all available holidays in ``holiday_lookup_countries``. This can often create a model that has too many parameters, + and should typically be avoided. This argument is only valid when ``auto_holiday = False``. When ``auto_holiday`` is + set to `True`, a `ValueError` would be raised. * If a list, must be a list of holiday names. * If None or an empty list, all holidays in ``holiday_lookup_countries`` are grouped together as a single effect. Use ``holiday_lookup_countries`` to provide a list of countries where these holiday occur. holiday_pre_num_days : `int`, default 2 - Model holiday effects for ``holiday_pre_num_days`` days before the holiday. + Model holiday effects for ``holiday_pre_num_days`` days before the holiday. When ``auto_holiday = False``, + this also applies to the holidays passed in through `daily_event_df_dict`. holiday_post_num_days : `int`, default 2 - Model holiday effects for ``holiday_post_num_days`` days after the holiday. + Model holiday effects for ``holiday_post_num_days`` days after the holiday. When ``auto_holiday = False``, + this also applies to the holidays passed in through `daily_event_df_dict`. holiday_pre_post_num_dict : `dict` [`str`, (`int`, `int`)] or None, default None Overrides ``pre_num`` and ``post_num`` for each holiday in ``holidays_to_model_separately``. @@ -281,8 +287,16 @@ def convert_params( Holidays not specified use the default given by ``pre_num`` and ``post_num``. daily_event_df_dict : `dict` [`str`, `pandas.DataFrame`] or None, default None A dictionary of data frames, each representing events data for the corresponding key. - Specifies additional events to include besides the holidays specified above. The format - is the same as in `~greykite.algo.forecast.silverkite.SilverkiteForecast.forecast`. + Specifies additional events to include besides the holidays specified above. + The usage and format requirement of `daily_event_df_dict` is different based on whether + ``auto_holiday`` is enabled. 
When ``auto_holiday = False``, these events will be + directly appended in the return and used in modeling. When ``auto_holiday = True``, this + will be used as sources of holidays, together with holidays constructed through + `holiday_lookup_countries`. Holiday grouper will be run on all these holidays and their + neighboring days unless they are specified in `holidays_to_model_separately`. + + When ``auto_holiday = False``, the format is the same as in + `~greykite.algo.forecast.silverkite.SilverkiteForecast.forecast`. The DataFrame has two columns: - The first column contains event dates. Must be in a format @@ -290,7 +304,7 @@ def convert_params( frequency for proper join. It is joined against the time in ``df``, converted to a day: ``pd.to_datetime(pd.DatetimeIndex(df[time_col]).date)``. - - the second column contains the event label for each date + - The second column contains the event label for each date. The column order is important; column names are ignored. The event dates must span their occurrences in both the training @@ -355,6 +369,25 @@ def convert_params( Note: Do not use `~greykite.common.constants.EVENT_DEFAULT` in the second column. This is reserved to indicate dates that do not correspond to an event. + + When ``auto_holiday = True``, this dictionary is used to pass in holidays that will + be used by `HolidayGrouper`. Each key corresponds to a holiday name, and the + Dataframe has at least one column that contains event dates: + + - The event date column used will be the first column in each data frame + that can be recognized by `pandas.to_datetime`. Must be at daily + frequency for proper join. It is joined against the time in ``df``, + converted to a day: + ``pd.to_datetime(pd.DatetimeIndex(df[time_col]).date)``. + - The other columns in the dataframe will be ignored. + + The event dates must span their occurrences in both the training + and future prediction period. 
+ auto_holiday_params : `dict` or None, default None + This dictionary takes in parameters that can be passed in and used by holiday grouper when + ``auto_holiday`` is set to `True`. When ``auto_holiday = False``, this will not be used. Please see + `~greykite.algo.forecast.silverkite.forecast_simple_silverkite.SimpleSilverkiteForecast.__get_silverkite_holidays` + for more details. daily_event_neighbor_impact : `int`, `list` [`int`], callable or None, default None The impact of neighboring timestamps of the events in ``event_df_dict``. This is for daily events so the units below are all in days. @@ -738,7 +771,7 @@ def convert_params( # Specifies events (via ``daily_event_df_dict``, ``extra_pred_cols``). # Constant daily effect. - holiday_df_dict = self.__get_silverkite_holidays( + daily_event_df_dict = self.__get_silverkite_holidays( auto_holiday=auto_holiday, holiday_lookup_countries=holiday_lookup_countries, holidays_to_model_separately=holidays_to_model_separately, @@ -747,21 +780,11 @@ def convert_params( pre_num=holiday_pre_num_days, post_num=holiday_post_num_days, pre_post_num_dict=holiday_pre_post_num_dict, + auto_holiday_params=auto_holiday_params, df=df, time_col=time_col, value_col=value_col, - forecast_horizon=forecast_horizon) - if holiday_df_dict is not None: - # Adds holidays to the user-specified events, - # giving preference to user events - # if there are conflicts - daily_event_df_dict = update_dictionary( - holiday_df_dict, - overwrite_dict=daily_event_df_dict) - - if not daily_event_df_dict: - # Sets empty dictionary to None - daily_event_df_dict = None + daily_event_df_dict=daily_event_df_dict) extra_pred_cols += get_event_pred_cols( daily_event_df_dict, @@ -1353,23 +1376,23 @@ def __get_silverkite_holidays( pre_num=2, post_num=2, pre_post_num_dict=None, + auto_holiday_params=None, df=None, time_col=cst.TIME_COL, value_col=cst.VALUE_COL, - forecast_horizon=None): + daily_event_df_dict=None): """Generates holidays dictionary for input to 
daily_event_df_dict parameter of silverkite model. The main purpose is to provide reasonable defaults for the holiday names and countries Parameters ---------- auto_holiday : `bool`, default False - If True, the other holiday configurations will be ignored. - An algorithm is used to automatically infer the holiday configuration. - For details, see `~greykite.algo.common.holiday_inferrer.HolidayInferrer`. - If False, the specified holiday configuration will be used to generate holiday features. + If `True`, an algorithm is used to automatically infer the holiday configuration. Holiday groups will be constructed through + estimated holiday effects for holidays and their neighboring days. + For details, see `~greykite.algo.common.holiday_grouper.HolidayGrouper`. + If `False`, the specified holiday configuration will be used to generate holiday features. holiday_lookup_countries : `list` [`str`] or "auto" or None, optional, default "auto" - The countries that contain the holidays you intend to model - (``holidays_to_model_separately``). + The countries that contain the holidays you intend to model. * If "auto", uses a default list of countries that contain the default ``holidays_to_model_separately``. @@ -1379,27 +1402,32 @@ def __get_silverkite_holidays( holidays_to_model_separately : `list` [`str`] or "auto" or `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES` or None, optional, default "auto" # noqa: E501 Which holidays to include in the model. - The model creates a separate key, value for each item in ``holidays_to_model_separately``. - The other holidays in the countries are grouped together as a single effect. - - * If "auto", uses a default list of important holidays. - See `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.HOLIDAYS_TO_MODEL_SEPARATELY_AUTO`. 
- * If `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES`, - uses all available holidays in ``holiday_lookup_countries``. This can often - create a model that has too many parameters, and should typically be avoided. - * If a list, must be a list of holiday names. - * If None or an empty list, all holidays in ``holiday_lookup_countries`` are grouped together - as a single effect. + The model creates a separate key, value for each item in ``holidays_to_model_separately`` and their neighboring + days. If ``auto_holiday = False``, the other holidays in the countries are grouped together as a single effect. + If ``auto_holiday = True``, the other holidays will be processed by ``HolidayGrouper`` and assigned to + holiday groups based on their estimated impact. + + * If "auto", uses a default list of important holidays. + See `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.HOLIDAYS_TO_MODEL_SEPARATELY_AUTO`. + * If `~greykite.algo.forecast.silverkite.constants.silverkite_holiday.SilverkiteHoliday.ALL_HOLIDAYS_IN_COUNTRIES`, + uses all available holidays in ``holiday_lookup_countries``. This can often create a model that has too many parameters, + and should typically be avoided. This argument is only valid when ``auto_holiday = False``. When ``auto_holiday`` is + set to `True`, a `ValueError` would be raised. + * If a list, must be a list of holiday names. + * If None or an empty list, all holidays in ``holiday_lookup_countries`` are grouped together + as a single effect. Use ``holiday_lookup_countries`` to provide a list of countries where these holiday occur. - start_year : `int` + start_year : `int`, default 2015 Year of first training data point, used to generate holiday events. - end_year : `int` + end_year : `int`, default 2030 Year of last forecast data point, used to generate holiday events. 
- pre_num : `int` - Model holiday effects for ``pre_num`` days before the holiday. - post_num : `int` - Model holiday effects for ``post_num`` days after the holiday. + pre_num : `int`, default 2 + Model holiday effects for ``pre_num`` days before the holiday. When ``auto_holiday = False``, + this also applies to the holidays passed in through ``daily_event_df_dict``. + post_num : `int`, default 2 + Model holiday effects for ``post_num`` days after the holiday. When ``auto_holiday = False``, + this also applies to the holidays passed in through ``daily_event_df_dict``. pre_post_num_dict : `dict` [`str`, (`int`, `int`)] or None, default None Overrides ``pre_num`` and ``post_num`` for each holiday in ``holidays_to_model_separately``. @@ -1408,22 +1436,35 @@ def __get_silverkite_holidays( denoting that the "Thanksgiving" ``pre_num`` is 1 and ``post_num`` is 3, and "Labor Day" ``pre_num`` is 1 and ``post_num`` is 2. Holidays not specified use the default given by ``pre_num`` and ``post_num``. + auto_holiday_params : `dict` or None, default None + This dictionary takes in parameters that can be passed in and used by holiday grouper when + ``auto_holiday`` is set to `True`. When ``auto_holiday = False``, this will not be used. Please see + `~greykite.algo.forecast.silverkite.forecast_simple_silverkite.SimpleSilverkiteForecast.__get_silverkite_holidays` + for more details. df : `pandas.DataFrame` or None, default None. - The timeseries data needed for automatically inferring holiday configuration. - This is not used when ``auto_holiday`` is False. + The timeseries data that will be used by default for automatically inferring holiday configuration. This + will not be used if external ``df`` is specified through ``auto_holiday_params``. + This is not used when ``auto_holiday`` is `False`. time_col : `str`, default `cst.TIME_COL` - The column name for timestamps in ``df``. - This is not used when ``auto_holiday`` is False. + The column name for timestamps in ``df``. 
This will not be used if ``time_col`` is specified through + ``auto_holiday_params``. + This is not used when ``auto_holiday`` is `False`. value_col : `str`, default `cst.VALUE_COL` - The column name for values in ``df``. - This is not used when ``auto_holiday`` is False. - forecast_horizon : `int` or None, default None - The forecast horizon, used to calculate the year list for "auto" option. + The column name for values in ``df``. This will not be used if ``value_col`` is specified through + ``auto_holiday_params``. + This is not used when ``auto_holiday`` is `False`. + daily_event_df_dict : `dict` [`str`, `pandas.DataFrame`] or None, default None + A dictionary of data frames, each representing events data for the corresponding key. + Specifies additional events to include besides the holidays specified by other holiday configurations. + If ``auto_holiday`` is set to `False`, it will be directly appended to the returned result, and the + format is the same as in `~greykite.algo.forecast.silverkite.SilverkiteForecast.forecast`. + If ``auto_holiday`` is set to `True`, it provides a list of holidays that will be processed and + passed in as inputs for ``HolidayGrouper``. Returns ------- daily_event_df_dict : `dict` [`str`, `pandas.DataFrame` [EVENT_DF_DATE_COL, EVENT_DF_LABEL_COL]] - Suitable for use as `daily_event_df_dict` parameter in `forecast_silverkite`. + Suitable for use as ``daily_event_df_dict`` parameter in ``forecast_silverkite``. Each holiday is modeled as its own effect (not specific to each country). See Also @@ -1434,30 +1475,39 @@ def __get_silverkite_holidays( `~greykite.common.features.timeseries_features.get_available_holidays_across_countries` to see available holidays in those countries. """ + # Processes `holiday_lookup_countries`. if holiday_lookup_countries is None: - # `None` will not model any holidays + # `None` will not model any holidays when `auto_holiday` is `False`. 
When `auto_holiday` is `True`, holidays + # has to be passed in through `holiday_lookup_countries` and/or `daily_event_df_dict`. holiday_lookup_countries = [] elif holiday_lookup_countries == "auto": - # countries that contain the default `holidays_to_model_separately` + # Countries that contain the default `holidays_to_model_separately`. holiday_lookup_countries = self._silverkite_holiday.HOLIDAY_LOOKUP_COUNTRIES_AUTO elif not isinstance(holiday_lookup_countries, (list, tuple)): raise ValueError( f"`holiday_lookup_countries` should be a list, found {holiday_lookup_countries}") + if auto_holiday: - if df is None: - raise ValueError("Automatically inferring holidays is turned on. Dataframe must be provided.") return get_auto_holidays( df=df, time_col=time_col, value_col=value_col, - countries=holiday_lookup_countries, - forecast_horizon=forecast_horizon, - daily_event_dict_override=None) # ``daily_event_dict`` is handled in `convert_params` with other cases. + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=holiday_lookup_countries, + holidays_to_model_separately=holidays_to_model_separately, + daily_event_df_dict=daily_event_df_dict, + auto_holiday_params=auto_holiday_params + ) else: + # Processes `holidays_to_model_separately`. if holidays_to_model_separately is None: holidays_to_model_separately = [] elif holidays_to_model_separately == "auto": - # important holidays + # important holidays. 
holidays_to_model_separately = self._silverkite_holiday.HOLIDAYS_TO_MODEL_SEPARATELY_AUTO elif holidays_to_model_separately == self._silverkite_holiday.ALL_HOLIDAYS_IN_COUNTRIES: holidays_to_model_separately = get_available_holidays_across_countries( @@ -1468,11 +1518,24 @@ def __get_silverkite_holidays( raise ValueError( f"`holidays_to_model_separately` should be a list, found {holidays_to_model_separately}") - return generate_holiday_events( + holiday_df_dict = generate_holiday_events( countries=holiday_lookup_countries, holidays_to_model_separately=holidays_to_model_separately, - year_start=start_year - 1, # subtract 1 just in case, to ensure coverage of all holidays - year_end=end_year + 1, # add 1 just in case, to ensure coverage of all holidays + year_start=start_year - 1, # subtract 1 just in case, to ensure coverage of all holidays. + year_end=end_year + 1, # add 1 just in case, to ensure coverage of all holidays. pre_num=pre_num, post_num=post_num, pre_post_num_dict=pre_post_num_dict) + + if holiday_df_dict is not None: + # Adds holidays to the user-specified events, + # giving preference to user events if there are conflicts. + daily_event_df_dict = update_dictionary( + holiday_df_dict, + overwrite_dict=daily_event_df_dict) + + if not daily_event_df_dict: + # Sets empty dictionary to 'None'. + daily_event_df_dict = None + + return daily_event_df_dict diff --git a/greykite/algo/uncertainty/conditional/conf_interval.py b/greykite/algo/uncertainty/conditional/conf_interval.py index 4235c24..b6a2268 100644 --- a/greykite/algo/uncertainty/conditional/conf_interval.py +++ b/greykite/algo/uncertainty/conditional/conf_interval.py @@ -47,6 +47,7 @@ def conf_interval( conditional_cols=None, quantiles=(0.005, 0.025, 0.975, 0.995), quantile_estimation_method="normal_fit", + remove_conditional_mean=True, sample_size_thresh=5, small_sample_size_method="std_quantiles", small_sample_size_quantile=0.95, @@ -65,7 +66,7 @@ def conf_interval( distribution function. 
``offset_col`` is used in the prediction phase to shift the calculated quantiles - appropriately. + appropriately. It is not used in this function implementation directly. Parameters ---------- @@ -107,7 +108,7 @@ def conf_interval( These quantiles can be then used to construct the desired CIs. The default values [0.005, 0.025, 0.0975, 0.995] can be used to construct 99 and 95 percent CIs. - quantile_estimation_method : `str`, default "normal_fit" + quantile_estimation_method : `str`, default `"normal_fit"` There are two options implemented for the quantile estimation method (conditional on slice): @@ -116,12 +117,15 @@ def conf_interval( - "ecdf": Uses the empirical cumulative distribution function to calculate sample quantiles. + remove_conditional_mean : `bool`, default True + If True, for every slice (defined by `conditional_cols`), the conditional mean + is removed when calculating quantiles. sample_size_thresh : `int`, default 5 The minimum sample size for each slice where we allow for using the conditional - distribution (conditioned on the "conditional_cols" argument). + distribution (conditioned on the `"conditional_cols"` argument). If sample size for that slice is smaller than this, we use the fallback method. - small_sample_size_method : `str`, default "std_quantiles" + small_sample_size_method : `str`, default `"std_quantiles"` The method to use for slices with small sample size - "std_quantile" method is implemented and it looks at the response @@ -147,9 +151,9 @@ def conf_interval( Dictionary with following items (main component is the ``predict`` function). 
- "ecdf_df" : `pandas.DataFrame` - ecdf_df generated by "estimate_empirical_distribution" + ecdf_df generated by `estimate_empirical_distribution` - "ecdf_df_overall" : `pandas.DataFrame` - ecdf_df_overall generated by "estimate_empirical_distribution" + ecdf_df_overall generated by `estimate_empirical_distribution` - "ecdf_df_fallback" : `pandas.DataFrame` ecdf_df_fallback, a fall back data to get the CI quantiles when the sample size for that slice is small or that slice @@ -158,7 +162,8 @@ def conf_interval( - if small_sample_size_method = "std_quantiles", we use std quantiles to pick a slice which has a std close to that quantile and fall-back to that slice. - - otherwise we fallback to "ecdf_overall" + - otherwise we fallback to "ecdf_df_overall" + - "distribution_col" : `str` Input ``distribution_col`` - "offset_col": `str` @@ -188,11 +193,14 @@ def conf_interval( quantile_grid_size=None, quantiles=quantiles, conditional_cols=conditional_cols, - remove_conditional_mean=True) + remove_conditional_mean=remove_conditional_mean + ) ecdf_df = model_dict["ecdf_df"] ecdf_df_overall = model_dict["ecdf_df_overall"] + ecdf_df_fallback = ecdf_df_overall.copy() - # Two methods are implemented: ecdf; normal_fit. + # Two methods are implemented: `"ecdf"`; `"normal_fit"`. + # For normal fit, `ecdf_df` and `ecdf_df_fallback` are updated below. if quantile_estimation_method == "ecdf": quantile_summary_col = f"{distribution_col}_ecdf_quantile_summary" elif quantile_estimation_method == "normal_fit": @@ -222,32 +230,36 @@ def conf_interval( # similar points in the past. 
fall_back_for_all = False if small_sample_size_method == "std_quantiles": - ecdf_df_large_ss = ecdf_df.loc[ecdf_df[sample_size_col] >= sample_size_thresh].reset_index(drop=True) + ecdf_df_large_ss = ecdf_df.loc[ecdf_df[sample_size_col] >= sample_size_thresh].reset_index( + drop=True) assert set(ecdf_df_large_ss.columns).intersection(["std_quantile", "std_quantile_diff"]) == set(), ( "column names: std_quantile, std_quantile_diff should not appear in ecdf_df") if len(ecdf_df_large_ss) == 0: - warnings.warn("No slice had sufficient sample size. We fall back to the overall distribution.") - # If ``ecdf_df_large_ss`` is empty it means we do not have any sufficient + warnings.warn( + "No slice had sufficient sample size. We fall back to the overall distribution.") + # If `ecdf_df_large_ss` is empty it means we do not have any sufficient # samples for any slices. - # Therefore we have to fall back in all cases and we set ``ecdf_df`` - # to ``ecdf_df_fall_back`` + # Therefore we have to fall back in all cases and we set `ecdf_df` + # to `ecdf_df_fall_back` ecdf_df = ecdf_df_fallback fall_back_for_all = True else: - ecdf_df_large_ss["std_quantile"] = np.argsort(ecdf_df_large_ss[std_col]) / ecdf_df_large_ss.shape[0] - # Calculates the distance between "std_quantile" column values and ``small_sample_size_quantile`` - ecdf_df_large_ss["std_quantile_diff"] = abs(ecdf_df_large_ss["std_quantile"] - small_sample_size_quantile) - # Chooses the row with closes value in "std_quantile" column to ``small_sample_size_quantile`` - # Note the resulting dataframe below ``ecdf_df_fallback`` will have one row - ecdf_df_fallback = (ecdf_df_large_ss.loc[[ecdf_df_large_ss["std_quantile_diff"].idxmin()]] - .reset_index(drop=True)) + ecdf_df_large_ss["std_quantile"] = np.argsort( + ecdf_df_large_ss[std_col]) / ecdf_df_large_ss.shape[0] + # Calculates the distance between `"std_quantile"` column values and `small_sample_size_quantile` + ecdf_df_large_ss["std_quantile_diff"] = abs( + 
ecdf_df_large_ss["std_quantile"] - + small_sample_size_quantile) + # Chooses the row with closes value in `"std_quantile"` column to `small_sample_size_quantile`. + # Note the resulting dataframe below `ecdf_df_fallback` will have one row. + ecdf_df_fallback = ecdf_df_large_ss.loc[[ecdf_df_large_ss["std_quantile_diff"].idxmin()]] del ecdf_df_fallback["std_quantile"] del ecdf_df_fallback["std_quantile_diff"] del ecdf_df_large_ss["std_quantile"] del ecdf_df_large_ss["std_quantile_diff"] - # we re-assign ecdf_df by removing the combinations with small sample size - # this is done so that in predict phase those values are not populated from - # small sample sizes and use ``ecdf_fallback`` + # Re-assigns `ecdf_df` by removing the combinations with small sample size. + # This is done so that in predict phase those values are not populated from + # small sample sizes and use `ecdf_fallback` ecdf_df = ecdf_df_large_ss elif small_sample_size_method is not None: raise NotImplementedError( diff --git a/greykite/common/constants.py b/greykite/common/constants.py index 2ba94a9..ee5e033 100644 --- a/greykite/common/constants.py +++ b/greykite/common/constants.py @@ -91,6 +91,8 @@ """Prefix for naming changepoint features.""" CHANGEPOINT_COL_PREFIX_SHORT = "cp" """Short prefix for naming changepoint features.""" +LEVELSHIFT_COL_PREFIX_SHORT = "ctp" +"""Short prefix for naming levelshift features.""" # Column names used by # `~greykite.common.features.adjust_anomalous_data.adjust_anomalous_data`. diff --git a/greykite/common/data_loader.py b/greykite/common/data_loader.py index 03ba695..6028ce8 100644 --- a/greykite/common/data_loader.py +++ b/greykite/common/data_loader.py @@ -201,7 +201,7 @@ def load_peyton_manning(self): This dataset contains log daily page views for the Wikipedia page for Peyton Manning. One of the primary datasets used for demonstrations by Facebook ``Prophet`` algorithm. 
def elementwise_symmetric_absolute_percent_error(true_val, pred_val):
    """The symmetric absolute percent error between a single true and predicted value.

    Parameters
    ----------
    true_val : float
        True value.
    pred_val : float
        Predicted value.

    Returns
    -------
    symmetric_absolute_percent_error : float or None
        Symmetric absolute percent error, expressed in percent:
        ``100 * abs(true_val - pred_val) / (abs(true_val) + abs(pred_val))``.
        The value lies in ``[0, 100]``.
        Returns `None` (after issuing a warning) when both inputs are 0,
        because the ratio is undefined in that case.
    """
    denominator = abs(true_val) + abs(pred_val)
    if denominator == 0:
        warnings.warn("true_val and pred_val are 0. Symmetric absolute percent error is undefined.")
        return None
    if denominator < 1e-8:
        # The value is still returned; the warning only flags that it is numerically unstable.
        warnings.warn("denominator contains very small values. Symmetric absolute percent error is "
                      "very likely highly volatile.")
    return 100 * abs(true_val - pred_val) / denominator
@@ -881,7 +906,12 @@ class ElementwiseEvaluationMetricEnum(Enum): elementwise_absolute_percent_error, "absolute_percent_error", [ACTUAL_COL, PREDICTED_COL]) - """Percent error, abs(true-pred)/abs(true)""" + """Absolute percent error, abs(true-pred)/abs(true)""" + SymmetricAbsolutePercentError = ElementwiseEvaluationMetric( + elementwise_symmetric_absolute_percent_error, + "symmetric_absolute_percent_error", + [ACTUAL_COL, PREDICTED_COL]) + """Symmetric absolute percent error, abs(true - pred)/ (abs(true) + abs(pred))""" Quantile80 = ElementwiseEvaluationMetric( partial(elementwise_quantile, q=0.80), "quantile_loss_80", diff --git a/greykite/common/features/normalize.py b/greykite/common/features/normalize.py index e27ec22..145ef64 100644 --- a/greykite/common/features/normalize.py +++ b/greykite/common/features/normalize.py @@ -50,10 +50,10 @@ def normalize_df( - "statistical" method removes the "mean" and divides by "std" for each column. - "zero_to_one" method removes the "min" and divides by the "max - min" for each column. - - "minus_half_to_half" method will remove the "(min + max)/2" and divides by the "max - min" - for each column. - - "zero_at_origin" method will remove the first data point and divides by the "max - min" + - "minus_half_to_half" method removes the "(min + max)/2" and divides by the "max - min" for each column. + - "zero_at_origin" method removes a constant equal to the first data point and divides + by the "max - min" for each column. 
drop_degenerate_cols : `bool`, default True A boolean to determine if columns with only one possible value should be diff --git a/greykite/common/features/outlier.py b/greykite/common/features/outlier.py new file mode 100644 index 0000000..1aabdb3 --- /dev/null +++ b/greykite/common/features/outlier.py @@ -0,0 +1,822 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# original author: Reza Hosseini
"""Basic functionality to identify one-dimensional outliers."""

import warnings
from abc import abstractmethod
from dataclasses import dataclass
from dataclasses import field
from typing import Optional

import numpy as np
import pandas as pd


@dataclass
class DetectionResult:
    """This is a dataclass to denote the result of an outlier detection."""
    scores: Optional[pd.Series] = None
    """A series of floats, each denoting a score of how much of an outlier a point is.
    The score can be signed for some methods: negative when the value is unusually small
    and positive when it is unusually large."""
    is_outlier: Optional[pd.Series] = None
    """A series of booleans with `True` meaning a point is an outlier and `False` meaning
    it is not an outlier."""


@dataclass
class DiffMethod:
    """This dataclass is to denote a `diff_method` if a differencing with respect
    to a baseline is needed."""
    name: Optional[str] = None
    """Name of the method. Must be one of `IMPLEMENTED_DIFF_METHODS` to be usable."""
    param: Optional[dict] = field(default_factory=dict)
    """Parameters of the method, passed as keyword arguments to the underlying pandas call.
    When empty or `None`, the defaults of the chosen method are used."""


EXPONENTIAL_SMOOTHING_ALPHA = 0.5
"""Default for the exponential smoothing coefficient `alpha`.
See the constant below.
"""
EXPONENTIAL_SMOOTHING = DiffMethod(name="es", param={"alpha": EXPONENTIAL_SMOOTHING_ALPHA})
"""This uses exponential smoothing method to calculate a baseline.
This is utilized in `diff_from_baseline` method of `~greykite.common.features.outlier.BaseOutlierDetector`.
See: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ewm.html
See method details: https://en.wikipedia.org/wiki/Exponential_smoothing
"""

MOVING_MEDIAN_WINDOW = 5
"""The window size for moving aggregation method.
See below."""
MOVING_MEDIAN = DiffMethod(
    name="moving_med",
    param={
        "window": MOVING_MEDIAN_WINDOW,
        "min_periods": 1,
        "center": True})
"""This calculates a moving median as the baseline.
The default parameters are a centered window of size `MOVING_MEDIAN_WINDOW`
which requires only one available data point as minimum.
This is utilized in `diff_from_baseline` method of `~greykite.common.features.outlier.BaseOutlierDetector`.
For a longer list of parameters to pass:
See: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html
"""


IMPLEMENTED_DIFF_METHODS = ["es", "moving_med"]
"""List of implemented methods for baseline to be used in differencing."""

Z_SCORE_CUTOFF = 5.0
"""Z-score cutoff point."""
IQR_LOWER = 0.1
"""Lower quantile to calculate IQR (Inter-quartile Range).
Note that our default is different from standard."""
IQR_UPPER = 0.9
"""Upper quantile to calculate IQR (Inter-quartile Range).
Note that our default is different from standard."""
TUKEY_CUTOFF = 1.0
"""IQR coefficient to detect outliers.
Note that our default is different from standard.
"""
TRIM_PERCENT = 5.0
"""Default percent for trimming which is a number between 0 and 100 (typically less than 5)."""


class BaseOutlierDetector:
    """This is the base class for detecting one-dimensional outliers.
    These classes are expected to return (outlier) scores for each point and a
    boolean to express if each point is an outlier or not.

    Additionally, a lower bound (`lower_bound`) and upper bound (`upper_bound`)
    attribute will be available after `fit` to determine the bounds for
    outlier determination.

    This class already implements a few processing steps which are useful across
    various outlier detection methods:

        - `remove_na`: removing NAs
        - `trim`: trimming the data. This is useful for example when using z-score
        - `diff_from_baseline`: fitting a simple baseline and differencing the input from baseline.

    Children of this base class need to implement two methods:

        - `fit_logic`
        - `detect_logic`

    Note that this class does not use `abc.ABCMeta`, therefore the `abstractmethod`
    markers are informational only and the base class itself remains instantiable.

    Parameters
    ----------
    trim_percent : `float` or None, default `~greykite.common.features.outlier.TRIM_PERCENT`
        Trimming percent before calculating the model quantities.
        This removes `trim_percent` of data in symmetric fashion from
        both ends and then it calculates the quantities needed.
        For example in Z-score based method: `ZScoreOutlierDetector` (a child of this class),
        this will remove extreme values before calculating the variance.
    diff_method : `~greykite.common.features.outlier.DiffMethod` or None, default `~greykite.common.features.outlier.MOVING_MEDIAN`
        A simple method to fit a baseline and calculate residuals, then apply
        the approach on the residuals.
        The implemented method names are listed in:
        `~greykite.common.features.outlier.IMPLEMENTED_DIFF_METHODS`

    Attributes
    ----------
    lower_bound : `float` or None, default None
        The lower bound for not being outlier which is decided after `self.fit`
        is called.
    upper_bound : `float` or None, default None
        The upper bound for not being outlier which is decided after `self.fit`
        is called.
    fitted_param : `dict`, default empty dict
        Fitted (method specific) parameters.
        The dictionary keys depend on the model (child of this class).
        This is updated after `self.fit` is called.
        Note that `self.fit` calls `self.fit_logic`,
        which is implemented by the child class.
    y : `pandas.Series` or None, default None
        Input series which is added after `self.fit` is called with data.
    y_diffed : `pandas.Series` or None, default None
        Differenced series if a `diff_method` is passed, otherwise a copy of `y`.
    y_na_removed : `pandas.Series` or None, default None
        The vector `y_diffed` after removing NAs.
    y_trimmed : `pandas.Series` or None, default None
        The vector `y_na_removed` after trimming in symmetric fashion using `trim_percent`.
    y_ready_to_fit : `pandas.Series` or None, default None
        The final vector passed to `fit_logic`.
        This vector is constructed with the following three steps:

            - (1) differencing if a `diff_method` is passed;
            - (2) removing NAs;
            - (3) trimming.

    fitted : `~greykite.common.features.outlier.DetectionResult`
        Fitted scores and booleans.
        Both fields are None before fit is called.
    y_new : `pandas.Series` or None, default None
        This is a series used at `prediction` time.
        Note that in this case prediction means that we want to apply
        the same logic to new data.
        It is worth noting that in most applications only `fit` is needed because
        the sole purpose is to do outlier removal.
    y_new_diffed : `pandas.Series` or None, default None
        The series `y_new` after differencing, if a `diff_method` is passed.
    y_new_ready_to_predict : `pandas.Series` or None, default None
        This is the transformed input for prediction (same data as `y_new_diffed`).
        In this case, only differencing might take place, if a `diff_method` is passed.
        Note that there is no need to trim or remove NAs in this case.
    predicted : `~greykite.common.features.outlier.DetectionResult`
        Scores and booleans from the latest call to `self.detect`.
        Note that `self.fit` also calls `self.detect` on the training series.
    """
    def __init__(
            self,
            trim_percent=TRIM_PERCENT,
            diff_method=MOVING_MEDIAN):
        self.trim_percent = trim_percent
        self.diff_method = diff_method

        # Bounds for not being an outlier (see class docstring).
        self.lower_bound = None
        self.upper_bound = None

        # Fitted (method specific) parameters, updated by `self.fit_logic`.
        self.fitted_param = {}

        # Input series and its transformed versions (see class docstring).
        self.y = None
        self.y_diffed = None
        self.y_na_removed = None
        self.y_trimmed = None
        self.y_ready_to_fit = None

        # Fitted scores and booleans.
        self.fitted = DetectionResult(scores=None, is_outlier=None)

        # Prediction related attributes.
        self.y_new = None
        self.y_new_diffed = None
        self.y_new_ready_to_predict = None
        self.predicted = DetectionResult(scores=None, is_outlier=None)

    @staticmethod
    def remove_na(y):
        """Removes NAs from the input series.
        Also, importantly, raises an error if there is not enough data left (at least 2).

        Parameters
        ----------
        y : `pandas.Series`
            A series of floats.

        Returns
        -------
        y_na_removed : `pandas.Series`
            The input series after removing NAs.

        Raises
        ------
        ValueError
            If fewer than two values remain after removing NAs.
        """
        y = pd.Series(y)
        y_na_removed = y.dropna()
        if len(y_na_removed) < 2:
            # The message is built as ONE string: a stray comma here would turn it
            # into a tuple of exception args.
            raise ValueError(
                "Length of y after removing NAs is less than 2.\n"
                f"y: {y}\n"
                f"y_na_removed: {y_na_removed}.\n")
        return y_na_removed

    @staticmethod
    def trim(
            y,
            trim_percent=TRIM_PERCENT):
        """Performs symmetric trimming of the given series `y`.
        This means a percent of data from both sides of the
        distribution are cut by calculating a high and low quantile.

        Parameters
        ----------
        y : `pandas.Series`
            A series of floats.
        trim_percent : `float` or None, default `TRIM_PERCENT`
            Total percent of data to trim; half is removed from each end.
            If None or 0, no trimming is done.

        Returns
        -------
        y_trimmed : `pandas.Series`
            The values of `y` which fall between the lower and upper quantiles
            (inclusive), keeping their original index.
            If fewer than two values survive, a warning is issued and
            a copy of the un-trimmed input is returned.

        Raises
        ------
        ValueError
            If `trim_percent` is not in the interval [0.0, 100.0).
        """
        y = pd.Series(y)

        if trim_percent is None or trim_percent == 0:
            return y.copy()

        if not 0 < trim_percent < 100:
            raise ValueError(
                f"Trim percent: {trim_percent} needs to be"
                " a value in the interval [0.0, 100.0).")

        # Half of the trimming percent goes to each tail.
        alpha = 0.5 * (trim_percent / 100.0)
        lower_limit = np.quantile(a=y, q=alpha)
        upper_limit = np.quantile(a=y, q=1 - alpha)
        # Boolean masking keeps the result a Series (same type as the other return paths).
        y_trimmed = y[(y <= upper_limit) & (y >= lower_limit)]

        if len(y_trimmed) < 2:
            warnings.warn(
                "After trimming there were less than two values: "
                "Therefore trimming was disabled."
                f"\n original y: {y}"
                f"\n y_trimmed: {list(y_trimmed)}",
                UserWarning)
            return y.copy()

        return y_trimmed

    @staticmethod
    def diff_from_baseline(y, diff_method):
        """Calculates a baseline for the input series `y` and then removes
        that baseline from `y`, thus creating a diff series.

        Parameters
        ----------
        y : `pandas.Series`
            The input series (of floats).
        diff_method : `~greykite.common.features.outlier.DiffMethod` or None
            A simple method to fit a baseline and calculate residuals.
            The implemented method names are listed in:
            `~greykite.common.features.outlier.IMPLEMENTED_DIFF_METHODS`.
            If its `param` is empty or None, the default parameters of the
            method are used.

        Returns
        -------
        result : `dict`
            A dictionary with two items:

            - "residuals" : `pandas.Series`
                The input minus the baseline.
                If `diff_method` is None, this is a copy of the input.
            - "baseline_y" : `pandas.Series` or None
                The fitted baseline. None if `diff_method` is None.

        Raises
        ------
        NotImplementedError
            If `diff_method.name` is not in `IMPLEMENTED_DIFF_METHODS`.
        """
        y = pd.Series(y).copy()

        if diff_method is None:
            return {
                "residuals": y,
                "baseline_y": None}

        name = diff_method.name
        param = diff_method.param

        if name not in IMPLEMENTED_DIFF_METHODS:
            raise NotImplementedError(f"{name} is not implemented")

        if name == EXPONENTIAL_SMOOTHING.name:
            if not param:
                # `alpha` specifies the coef. in exponential smoothing.
                # See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ewm.html.
                param = dict(alpha=EXPONENTIAL_SMOOTHING_ALPHA)
            baseline_y = y.ewm(**param).mean()
        else:
            if not param:
                param = dict(
                    window=MOVING_MEDIAN_WINDOW,
                    min_periods=1,
                    center=True)
            baseline_y = y.rolling(**param).median()

        return {
            "residuals": y - baseline_y,
            "baseline_y": baseline_y}

    @abstractmethod
    def fit_logic(self, y_ready_to_fit):
        """This method is to be implemented by children of this base class.
        This logic is to be applied to `y_ready_to_fit` and it will update
        `self.fitted_param`.
        All transformations needed to map from `y` to `y_ready_to_fit` are handled
        by this class in the method `fit` below.

        Parameters
        ----------
        y_ready_to_fit : `pandas.Series`
            A float series of the (transformed) inputs.

        Returns
        -------
        None. This method updates: `self.fitted_param`.
        """
        # Implement the fit logic here!
        self.fitted_param = None

    @abstractmethod
    def detect_logic(self, y_new):
        """This method is to be implemented by children of this base class.

        Parameters
        ----------
        y_new : `pandas.Series`
            A series of floats. This is the input series to be labeled.

        Returns
        -------
        result : `~greykite.common.features.outlier.DetectionResult`
            Includes the outlier detection results (see the data class attributes).
        """
        # Implement detect logic here!
        return DetectionResult(scores=None, is_outlier=None)

    def fit(self, y):
        """This method prepares and fits the input data.
        This means that for the input series `y`, it fits the parameters
        which are then used to decide if a point should be an outlier and
        return an outlier score for each point.

        This method first prepares the input series `y` by

            1- differencing from baseline if a `diff_method` is passed.
            2- removing NAs
            3- trimming.

        and then calls the `fit_logic` which is to be implemented by
        the children of this class.

        Parameters
        ----------
        y : `pandas.Series`
            A series of floats. This is the input series to be labeled.

        Returns
        -------
        None. Updates `self.y`, `self.y_diffed`, `self.y_na_removed`,
        `self.y_trimmed`, `self.y_ready_to_fit`, `self.fitted` and
        (through `self.detect`) the prediction related attributes.

        Raises
        ------
        ValueError
            If fewer than two values remain after removing NAs.
        """
        self.y = pd.Series(y).copy()

        # (1) Applies `diff_method` and calculates diffs (residuals).
        self.y_diffed = self.diff_from_baseline(
            y=self.y,
            diff_method=self.diff_method)["residuals"]

        # (2) Removes NAs.
        self.y_na_removed = self.remove_na(self.y_diffed)

        # (3) Trims. `trim` returns an un-trimmed copy if `trim_percent` is None or 0.
        self.y_trimmed = self.trim(
            y=self.y_na_removed,
            trim_percent=self.trim_percent)

        # Assigns the final vector used for fitting.
        self.y_ready_to_fit = self.y_trimmed.copy()

        # Calls the implemented fit logic, which updates `self.fitted_param`.
        self.fit_logic(y_ready_to_fit=self.y_ready_to_fit)

        # The fitted values are obtained by a call to `detect` using the original `y`,
        # so that the result has the same size as the input
        # (no NA removal or trimming). Differencing is handled by `detect`.
        self.fitted = self.detect(self.y)

    def detect(self, y_new):
        """This method uses the fitted parameters to decide if a point should
        be labeled as outlier and also provides a score.

        This method performs two steps

            - first does a simple preparation by a call to `diff_from_baseline`;
            - then calls the method: `detect_logic`.

        Parameters
        ----------
        y_new : `pandas.Series`
            A series of floats. This is the input series to be labeled.

        Returns
        -------
        result : `~greykite.common.features.outlier.DetectionResult`
            Includes the outlier detection results (see the data class attributes).
            The same object is stored in `self.predicted`.
        """
        self.y_new = pd.Series(y_new).copy()
        self.y_new_diffed = self.diff_from_baseline(
            y=self.y_new,
            diff_method=self.diff_method)["residuals"]
        # Differencing is the only transformation needed at prediction time.
        self.y_new_ready_to_predict = self.y_new_diffed

        self.predicted = self.detect_logic(self.y_new_ready_to_predict)
        return self.predicted


class ZScoreOutlierDetector(BaseOutlierDetector):
    """This is a class for detecting one-dimensional outliers using z-score (based on the normal distribution).
    See https://en.wikipedia.org/wiki/Standard_score as a reference.

    This is a child of `~greykite.common.features.outlier.BaseOutlierDetector`,
    which already implements a few processing steps which are useful across
    various outlier detection methods:

        - `remove_na`: removing NAs
        - `trim`: trimming the data. This is useful for example when using z-score
        - `diff_from_baseline`: fitting a simple baseline and differencing the input from baseline.

    For this method:

        - `DetectionResult.scores` are defined as:
            The difference of the value with trimmed mean divided by the trimmed standard deviation
        - `DetectionResult.is_outlier` are defined as: scores which are off by more than `z_score_cutoff`.

    Parameters
    ----------
    z_score_cutoff : `float`, default `Z_SCORE_CUTOFF`
        The normal distribution cut-off used to decide outliers.

    Attributes
    ----------
    fitted_param : `dict`, default empty dict
        This is updated after `self.fit` is run on data.
        This dictionary stores:

            - "trimmed_mean": `float`
                Mean of the input fit data (after trimming).
            - "trimmed_sd": `float`
                Population standard deviation of the input fit data (after trimming).

    Other Parameters and Attributes:
        See `~greykite.common.features.outlier.BaseOutlierDetector` docstring.
    """
    def __init__(
            self,
            z_score_cutoff=Z_SCORE_CUTOFF,
            trim_percent=TRIM_PERCENT,
            diff_method=MOVING_MEDIAN):
        """See class attributes for details on parameters / attributes."""
        self.z_score_cutoff = z_score_cutoff

        super().__init__(
            trim_percent=trim_percent,
            diff_method=diff_method)

    def fit_logic(self, y_ready_to_fit):
        """The logic of the fit.
        This implements the `fit_logic` method of the base class
        `~greykite.common.features.outlier.BaseOutlierDetector`
        for z-score.

        Parameters
        ----------
        y_ready_to_fit : `pandas.Series`
            The series which is used for fitting.

        Returns
        -------
        None.

        Updates
            self.fitted_param : `dict`
                Parameters of the z-score model.

                - "trimmed_mean"
                - "trimmed_sd"

            self.lower_bound : `float`
                The lower bound to decide if a point is an outlier.
            self.upper_bound : `float`
                The upper bound to decide if a point is an outlier.
        """
        # Converts to a plain float array so that the statistics do not depend
        # on the container type passed in (list or series).
        values = np.asarray(y_ready_to_fit, dtype=float)
        trimmed_mean = np.mean(values)
        trimmed_sd = np.std(values)
        # Points with abs(z-score) > `z_score_cutoff` are outliers.
        self.lower_bound = trimmed_mean - trimmed_sd * self.z_score_cutoff
        self.upper_bound = trimmed_mean + trimmed_sd * self.z_score_cutoff

        self.fitted_param = {
            "trimmed_mean": trimmed_mean,
            "trimmed_sd": trimmed_sd}

    def detect_logic(self, y_new):
        """The logic of outlier detection, after `fit` is done.
        This method uses the fit information to decide which points are
        outliers and what should be their score.

        This implements the `detect_logic` method of the base class
        `~greykite.common.features.outlier.BaseOutlierDetector`
        for z-score.

        For this method: scores are defined as the difference from the trimmed mean
        divided by the trimmed standard deviation (as prescribed by z-score).

        Parameters
        ----------
        y_new : `pandas.Series`
            A series of floats. This is the input series to be labeled.

        Returns
        -------
        result : `~greykite.common.features.outlier.DetectionResult`
            Includes the outlier detection results (see the data class attributes).
            If the trimmed standard deviation is zero or not defined,
            all scores are 0 and no point is labeled as outlier.
        """
        trimmed_sd = self.fitted_param["trimmed_sd"]

        # If trimmed sd is zero or not defined (NaN), we do not detect any outlier.
        if not (trimmed_sd > 0):
            n = len(y_new)
            # Keeps the index of the input so the result aligns with the caller's data.
            index = y_new.index if isinstance(y_new, pd.Series) else None
            return DetectionResult(
                scores=pd.Series([0.0] * n, index=index, dtype=float),
                is_outlier=pd.Series([False] * n, index=index, dtype=bool))

        scores = (y_new - self.fitted_param["trimmed_mean"]) / trimmed_sd
        # Boolean series to denote outliers.
        is_outlier = np.abs(scores) > self.z_score_cutoff

        return DetectionResult(scores=scores, is_outlier=is_outlier)
Addison-Wesley, Reading, 1977 + + Note: In Tukey's work: + + - `iqr_lower = 0.25` which is the first quartile + - `iqr_upper = 0.75` which is the third quartile + + Here we let the user specify them and the defaults are different. + Therefore the naming IQR (inter-quartile range) + is an imperfect naming. However, we think this naming is very widely used and + it is not worth coming up with a new name. + + For this method: + + - `DetectionResult.scores` are defined as: + + - score is zero if the value is within the IQR + - score is positive if the value is above the IQR. + It is the difference from the IQR upper bound divided by IQR length. + - score is negative if the value is below the IQR. + It is the difference from the IQR lower bound divided by IQR length. + + Exception: when IQR = 0, we need to handle it. + + - If there is no diff with the corresponding quantile, we set: `score = 0` + - If there is a diff with corresponding quantile, we set: `score = 2 * tukey_cutoff` + multiplied by the sign of the difference. + + - `DetectionResult.is_outlier` are defined as: scores which are off by more than `tukey_cutoff`. + + This is a child of `~greykite.common.features.outlier.BaseOutlierDetector`, + which already implements a few processing steps which are useful across + various outlier detection methods: + + - `remove_na`: removing NAs + - `trim`: trimming the data. This is useful for example when using z-score + - `diff_from_baseline`: fitting a simple baseline and differencing the input from baseline. + + Parameters + ---------- + tukey_cutoff : `float`, default `TUKEY_CUTOFF` + The Tukey cutoff for deciding what is an anomaly. + iqr_lower : `float`, default `IQR_LOWER` + The smaller quantile used in IQR calculation. + iqr_upper : `float`, default `IQR_UPPER` + The larger quantile used in IQR calculation. + + Attributes + ---------- + fitted_param: `dict` or None, default None + This is updated after `self.fit` is run on data.
+ This dictionary stores: + + - "quantile_value_lower": `float` or None, default None + The lower bound of Tukey IQR. + - "quantile_value_upper": `float` or None, default None + The upper bound of Tukey IQR. + - "iqr": `float` or None, default None + The Tukey IQR (inter-quartile range). + + Other Parameters and Attributes: + See `~greykite.common.features.outlier.BaseOutlierDetector` docstring. + """ + def __init__( + self, + tukey_cutoff=TUKEY_CUTOFF, + iqr_lower=IQR_LOWER, + iqr_upper=IQR_UPPER, + trim_percent=None, + diff_method=MOVING_MEDIAN): + """See class docstring for details on parameters / attributes. + + Note that in this case, the default of `trim_percent` is None, rather than + `TRIM_PERCENT` used in the base class. + This is because in this method, trimming is not necessary as the logic + uses quantiles (which are robust to outliers). + """ + self.tukey_cutoff = tukey_cutoff + self.iqr_lower = iqr_lower + self.iqr_upper = iqr_upper + + super().__init__( + trim_percent=trim_percent, + diff_method=diff_method) + + def fit_logic(self, y_ready_to_fit): + """The logic of the fit. + This is an abstract method of the base class: + `~greykite.common.features.outlier.BaseOutlierDetector` + and it is implemented for Tukey method here. + + Parameters + ---------- + y_ready_to_fit : `pandas.Series` + A series of floats. This is the series which is used for fitting. + + Returns + ------- + None. + + Updates + self.fitted_param: `dict` or None, default None + This is updated after `self.fit` is run on data. + This dictionary stores: + + - "quantile_value_lower": `float` + The lower bound of Tukey IQR. + - "quantile_value_upper": `float` + The upper bound of Tukey IQR. + - "iqr": `float` + The Tukey IQR (inter-quartile range). + + self.lower_bound : `float` + The lower bound to decide if a point is an outlier. + self.upper_bound : `float` + The upper bound to decide if a point is an outlier.
+ + Indirectly updates (via `self.fit` method of the parent class): + + self.y_diffed : `pandas.Series` or None, default None + self.y_na_removed : `pandas.Series` or None, default None + self.y_trimmed : `pandas.Series` or None, default None + self.y_ready_to_fit : `pandas.Series` or None, default None + """ + # The value of the distribution at the given quantiles. + quantile_value_lower = np.quantile(a=y_ready_to_fit, q=self.iqr_lower) + quantile_value_upper = np.quantile(a=y_ready_to_fit, q=self.iqr_upper) + # IQR. + iqr = (quantile_value_upper - quantile_value_lower) + # Upper and lower bounds after considering the Tukey coef. + self.lower_bound = quantile_value_lower - (self.tukey_cutoff * iqr) + self.upper_bound = quantile_value_upper + (self.tukey_cutoff * iqr) + + self.fitted_param = { + "quantile_value_lower": quantile_value_lower, + "quantile_value_upper": quantile_value_upper, + "iqr": iqr + } + + def detect_logic(self, y_new): + """The logic of outlier detection, after `fit` is done. + This method uses the fit information to decide which points are + outliers and what should be their score. + + This is an abstract method of the base class: + `~greykite.common.features.outlier.BaseOutlierDetector` + and it is implemented for Tukey method here. + + For this method: + - `DetectionResult.scores` are defined as: + + - score is zero if the value is within the IQR + - score is positive if the value is above the IQR. + It is the difference from the IQR upper bound divided by IQR length. + - score is negative if the value is below the IQR. + It is the difference from the IQR lower bound divided by IQR length. + + - `DetectionResult.is_outlier` are defined as: scores which are off by more than `tukey_cutoff`. + + Parameters + ---------- + y_new : `pandas.Series` + A series of floats. This is the input series to be labeled.
+ + Returns + ------- + result : `~greykite.common.features.outlier.DetectionResult` + Includes the outlier detection results (see the data class attributes). + """ + scores = pd.Series([0.0] * len(y_new), dtype=float) + is_outlier = pd.Series([False] * len(y_new)) + + # if IQR is zero, we will not detect any outlier and return. + if self.fitted_param["iqr"] == 0: + return DetectionResult(scores=scores, is_outlier=is_outlier) + + # First, we find the anchor point for calculating the score. + # If the point is too large then the upper quantile will be the anchor point. + # If the point is too small then the lower quantole will be the anchor point. + for i in range(len(y_new)): + if y_new[i] < self.fitted_param["quantile_value_lower"]: + anchor = self.fitted_param["quantile_value_lower"] + score = (y_new[i] - anchor) + elif y_new[i] > self.fitted_param["quantile_value_upper"]: + anchor = self.fitted_param["quantile_value_upper"] + score = (y_new[i] - anchor) + else: + anchor = None + score = 0 + + scores[i] = float(score) / float(self.fitted_param["iqr"]) + + is_outlier = np.abs(scores) > self.tukey_cutoff + return DetectionResult(scores=scores, is_outlier=is_outlier) diff --git a/greykite/common/features/timeseries_features.py b/greykite/common/features/timeseries_features.py index 054b513..ae45777 100644 --- a/greykite/common/features/timeseries_features.py +++ b/greykite/common/features/timeseries_features.py @@ -806,6 +806,9 @@ def add_daily_events( def get_neighbor_days_func(date): return [date + timedelta(days=d) for d in range(*neighbor_impact)] elif isinstance(neighbor_impact, list): + if 0 not in neighbor_impact: + neighbor_impact = sorted(neighbor_impact + [0]) + def get_neighbor_days_func(date): return [date + timedelta(days=d) for d in neighbor_impact] else: diff --git a/greykite/common/testing_utils.py b/greykite/common/testing_utils.py index dbef687..c6815b4 100644 --- a/greykite/common/testing_utils.py +++ b/greykite/common/testing_utils.py @@ -501,3 
+501,113 @@ def assert_eval_function_equal(f1, f2): y_true = np.random.random(42) y_pred = np.random.random(42) assert_equal(f1(y_true, y_pred), f2(y_true, y_pred)) + + +def generate_df_with_arbitrary_trends_and_shifts( + start_date="2015-01-01", + length=365, + freq="D", + seed=10, + trend_slopes=None, + trend_intervals=None, + level_shifts=None, + level_shift_magnitudes=None): + """Generates a Pandas DataFrame that represents time series data with arbitrary trends and level shifts. + Example Usage: Calling `generate_df_with_arbitrary_trends_and_shifts(trend_slopes=[-1., 1.], trend_intervals=[.3, 1.], + level_shifts=[(.3, .6)], level_shift_magnitudes=[100.])` produces a time series with a slope of -1 for the first 30% + of the interval. The next 70% ((1. - .3)*100 = 70) of the interval has a slope of 1. The time series across the interval + .3 to .6, of the entire length, will have a positive level shift of magnitude 100. + + Parameters + ---------- + start_date : `str` + The start date of the time series data in "YYYY-MM-DD" format. Defaults to '2015-01-01'. + length : `int` + The number of data points in the time series. Defaults to 365. + freq : `str` + Frequency of the timestamps (based on pandas date_range frequencies). Defaults to "D" for daily. + seed : `int` + The seed for the random number generator. Defaults to 10. + trend_slopes : `Optional[List[float]]` + A list of slopes representing trends in the time series. Each slope applies to a corresponding + interval from 'trend_intervals'. Defaults to None and the series produced has a slope of 1. + trend_intervals : `Optional[List[float]]` + A list of values between 0 and 1 that represent the intervals of the corresponding trends in + 'trend_slopes'. The start of a new interval indicates the end of the previous one. The final + interval must end at 1. Defaults to None. + level_shifts : `Optional[List[Tuple[float, float]]]` + A list of tuples where each tuple contains two float values between 0 and 1. 
Each tuple + represents the start and end of a level shift in the time series. Defaults to None. + level_shift_magnitudes : `Optional[List[float]]` + A list of magnitudes for each level shift. Each magnitude corresponds to a tuple from + 'level_shifts'. Defaults to None. + + Returns + ------- + df : `pandas.DataFrame` + A DataFrame representing the generated time series with columns: + `"timestamp"` : time column. + `"y"` : value column. + + Raises + ------ + ValueError: If the first trend interval does not have a positive value. + ValueError: If the last trend interval does not end at 1. + ValueError: If the trend intervals are not strictly monotonically increasing. + ValueError: If each level shift does not have a start and end specified for its duration. + ValueError: If the start and end points of a level shift are not valid fractions of the time series duration. + ValueError: If the number of trend slopes is not equal to the number of trend intervals. + ValueError: If the number of level shifts is not equal to the number of level shift magnitudes. 
+ """ + if len(trend_slopes) != len(trend_intervals): + raise ValueError("Each trend needs an interval specified for its duration.") + + def _verify_intervals(intervals): + if intervals[0] <= 0: + raise ValueError("The beginning trend must have a positive value") + if float(intervals[-1]) != 1.: + raise ValueError("The final interval must end at 1.") + for i in range(len(intervals) - 1): + if intervals[i+1] <= intervals[i]: + raise ValueError("The intervals must be strictly monotonically increasing.") + + def _verify_level_shifts(level_shift_starts, level_shift_ends): + if len(level_shift_starts) != len(level_shift_ends): + raise ValueError("Each level shift needs a start and end specified for its duration.") + for ls_start, ls_end in zip(level_shift_starts, level_shift_ends): + if ls_end <= ls_start or ls_start < 0 or ls_end > 1: + raise ValueError("Level shift start and end points should be valid fractions of the time series duration, with end > start.") + np.random.seed(seed) + if trend_slopes and trend_intervals: + _verify_intervals(trend_intervals) + + if level_shifts and level_shift_magnitudes: + level_shift_starts, level_shift_ends = zip(*level_shifts) + _verify_level_shifts(level_shift_starts, level_shift_ends) + if len(level_shifts) != len(level_shift_magnitudes): + raise ValueError("Each level shift needs a magnitude specified.") + ts = pd.date_range(start_date, freq=freq, periods=length) + trend_slopes = trend_slopes or [1.] + trend_intervals = trend_intervals or [1.] 
+ level_shifts = level_shifts or [] + level_shift_magnitudes = level_shift_magnitudes or [] + hop_size = length / len(trend_slopes) + trend_intervals_indices = [0] + [int(hop_size * (i + 1)) for i in range(len(trend_slopes) - 1)] + [length] + base_value_with_trend_changes = np.array([]) + for i in range(len(trend_intervals_indices) - 1): + cur_trend_magnitude = trend_slopes[i] + begin, end = trend_intervals_indices[i], trend_intervals_indices[i+1] - 1 + interval_length = end - begin + 1 + prev_endpoint = base_value_with_trend_changes[-1] if len(base_value_with_trend_changes) > 0 else 0. + base_value_with_trend_changes = np.concatenate( + ( + base_value_with_trend_changes, + cur_trend_magnitude * np.arange(interval_length, dtype=np.float64) + prev_endpoint + ) + ) + for (ls_start, ls_end), ls_magnitude in zip(level_shifts, level_shift_magnitudes): + ls_start_index = int(ls_start * length) + ls_end_index = int(ls_end * length) + base_value_with_trend_changes[ls_start_index:ls_end_index] += ls_magnitude + df = pd.DataFrame({"timestamp": ts, "y": base_value_with_trend_changes}) + return df diff --git a/greykite/common/testing_utils_anomalies.py b/greykite/common/testing_utils_anomalies.py new file mode 100644 index 0000000..d40809f --- /dev/null +++ b/greykite/common/testing_utils_anomalies.py @@ -0,0 +1,285 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original authors: Apratim Dey, Reza Hosseini, Sayan Patra +"""Utility functions for simulating anomalies.""" + +import datetime + +import numpy as np +import pandas as pd + +from greykite.common.testing_utils import generate_df_for_tests + + +def generate_anomaly_blocks( + timeseries_length, + block_number, + mean_block_size=5): + """Returns blocks of indices to insert anomalies into. 
+ + :param timeseries_length: int + length of the time series into which we want to inject anomalies + :param block_number: int + initial number of blocks; may change in the output + :param mean_block_size: float + initial average number of indices per block; may change in the output + + :return: Dict[list, any] + contains a list of blocks of indices, number of blocks and a list of size of blocks + """ + anomaly_start = np.random.choice(timeseries_length-1, block_number, replace=False) + anomaly_start = np.sort(anomaly_start) + interval_length = np.random.poisson(lam=mean_block_size, size=block_number) + anomaly_blocks = [] + for i in range(len(anomaly_start)): + block_lower = anomaly_start[i] + block_upper = min(timeseries_length, anomaly_start[i]+interval_length[i]+1) + anomaly_blocks.append(list(range(block_lower, block_upper))) + anomaly_block_list = [] + anomaly_block_list.append(anomaly_blocks[0]) + for j in range(1, len(anomaly_blocks)): + if anomaly_blocks[j][0] <= anomaly_block_list[-1][-1] + 1: + anomaly_block_list[-1] = list(np.sort(np.array((list(set(anomaly_block_list[-1] + anomaly_blocks[j])))))) + else: + anomaly_block_list.append(anomaly_blocks[j]) + + return {"anomaly_block_list": anomaly_block_list, + "block_number": len(anomaly_block_list), + "block_size": [len(anomaly_block_list[x]) for x in range(len(anomaly_block_list))]} + + +def contaminate_df_with_anomalies( + df, + anomaly_block_list, + delta_range_lower, + delta_range_upper, + value_col="y", + min_admissible_value=None, + max_admissible_value=None): + """Contaminate a dataframe with anomalies. If original value is y, the anomalous value is (1 +/- delta)y, + the + or - chosen randomly. 
+ + :param df: pd.DataFrame + dataframe with values in column named "y" + :param anomaly_block_list: list + list of blocks of indices to insert anomalies in + :param delta_range_lower: float + lower boundary of the interval to choose delta from + :param delta_range_upper: float + upper boundary of the interval to choose delta from + :param value_col: str + The value columns which is to be contaminated + :param min_admissible_value: Optional[float] + minimum admissible value in df["y"] + :param max_admissible_value: Optional[float] + maximum admissible value in df["y"] + + :return: pd.DataFrame + contains the dataframe df with two columns appended: + "contaminated_y": values from column "y" changed to have outliers in the blocks given by anomaly_block_list + "is_anomaly": 0 for clean point, 1 for outlier + """ + y = np.array(df[value_col]) + is_anomaly = np.zeros(df.shape[0], dtype=float) + for i in range(len(anomaly_block_list)): + index_set = anomaly_block_list[i] + # generate a random sign: either 1 or -1 + s = 2*np.random.binomial(1, 0.5, 1) - 1 + for j in index_set: + multiplier = 1 + (s*np.random.uniform(delta_range_lower, delta_range_upper, 1)) + y[j] = y[j]*multiplier + if min_admissible_value is not None: + y[j] = max(min_admissible_value, y[j]) + if max_admissible_value is not None: + y[j] = min(max_admissible_value, y[j]) + is_anomaly[j] = 1 + df["contaminated_y"] = y + df["is_anomaly"] = is_anomaly + return df + + +def calc_quantiles_simulated_df( + sim_df_func, + quantiles=[0.25, 0.75], + simulation_num=50, + **params): + + """Calculates quantiles corresponding to probs by simulating time series with specified parameters + :param sim_df_func: callable + a function which simulates a dataframe + :param quantiles: List[float] + list of probabilities to compute quantiles at + :param simulation_num: int + number of simulations to calculate quantiles + :param: **params + parameters of ``sim_df_func`` + + :return: pd.DataFrame + contains quantiles 
corresponding to probs computed at each time point + """ + + df = [sim_df_func(**params)["y"] for x in range(simulation_num)] + df = pd.DataFrame(df) + quantiles_df = np.transpose(df.quantile(quantiles, 0)) + + return quantiles_df + + +def generate_df_with_anomalies_sim_based( + freq, + periods, + block_number, + mean_block_size, + train_start_date=datetime.datetime(2018, 7, 1), + train_end_date=None, + train_frac=0.8, + conti_year_origin=None, + noise_std=2.0, + remove_extra_cols=True, + autoreg_coefs=None, + fs_coefs=[-1, 3, 4], + growth_coef=3.0, + growth_pow=1.1, + intercept=10.0, + quantiles=[0.25, 0.75], + simulation_num=50, + filter_coef=1.5, + anomaly_coef=3): + """Generates a time series data frame by simulation and estimates quantiles + by simulation and annotates the former with outliers + :param freq: str + pd.date_range freq parameter, e.g. H or D + :param periods: int + number of periods to generate + :param block_number: int + initial number of blocks; may change in the output + :param mean_block_size: float + initial average number of indices per block; may change in the output + :param train_start_date: datetime.datetime + train start date + :param train_end_date: Optional[datetime.datetime] + train end date + :param train_frac: Optional[float] + fraction of data to use for training + only used if train_end_date isn't provided + :param noise_std: float + standard deviation of gaussian noise + :param conti_year_origin: float + the time origin for continuous time variables + :param remove_extra_cols: bool + whether to remove extra columns besides TIME_COL, VALUE_COL + :param autoreg_coefs: Optional[List[int]] + The coefficients for the autoregressive terms. + If provided the generated series denoted mathematically by Y(t) will be + converted as follows: + Y(t) -> Y(t) + c1 Y(t-1) + c2 Y(t-2) + c3 Y(t-3) + ... + where autoreg_coefs = [c1, c2, c3, ...] 
+ In this fashion, the obtained series will have autoregressive + properties not explained by seasonality and growth. + :param fs_coefs: List[float] + The fourier series coefficients used. + :param growth_coef: float + Multiplier for growth + :param growth_pow: float + Power for growth, as function of continuous time + :param intercept: float + Constant term added to Y(t) + :param quantiles: List[float] + list of probabilities to compute quantiles at + :param simulation_num: int + number of simulations to calculate quantiles + :param filter_coef: float + threshold coefficient to detect anomalies while labeling + :param anomaly_coef: float + coefficient of iqr while creating anomalies + + :return: Dict containing + anomaly block list; + df with four columns appended: + "contaminated_y": values from column "y" changed to have outliers in the blocks given by anomaly_block_list + "is_anomaly": 0 for clean point, 1 for outlier + "lower": lower bound used for outlier filtering + "upper": upper bound used for outlier filtering; + df containing quantiles calculated through simulation + """ + res = generate_anomaly_blocks( + timeseries_length=periods, + block_number=block_number, + mean_block_size=mean_block_size) + + anomaly_block_list = res["anomaly_block_list"] + + def sim_df_func(): + return generate_df_for_tests( + freq, + periods, + train_start_date, + train_end_date, + train_frac, + conti_year_origin, + noise_std, + remove_extra_cols, + autoreg_coefs, + fs_coefs, + growth_coef, + growth_pow, + intercept, + seed=None)["df"] + + df = sim_df_func() + y = np.array(df["y"]) + + quantiles_df = calc_quantiles_simulated_df( + sim_df_func=sim_df_func, + quantiles=quantiles, + simulation_num=simulation_num) + + iqr = quantiles_df[quantiles[-1]] - quantiles_df[quantiles[0]] + lower = quantiles_df[quantiles[0]] - filter_coef*iqr + upper = quantiles_df[quantiles[-1]] + filter_coef*iqr + outlier_indices_upper_crossing = (y > upper) + outlier_indices_lower_crossing = (y < lower) + 
+ y[outlier_indices_upper_crossing] = np.array(quantiles_df[quantiles[-1]])[outlier_indices_upper_crossing] + y[outlier_indices_lower_crossing] = np.array(quantiles_df[quantiles[0]])[outlier_indices_lower_crossing] + + is_anomaly = np.zeros(df.shape[0], dtype=float) + + for i in range(len(anomaly_block_list)): + index_set = anomaly_block_list[i] + s = 2*np.random.binomial(1, 0.5, 1)-1 + for j in index_set: + y[j] = y[j] + (s*anomaly_coef*iqr[j]) + + outlier_indices_upper_crossing = (y > upper) + outlier_indices_lower_crossing = (y < lower) + + df["contaminated_y"] = y + is_anomaly[outlier_indices_lower_crossing] = 1 + is_anomaly[outlier_indices_upper_crossing] = 1 + df["is_anomaly"] = is_anomaly + df["lower"] = lower + df["upper"] = upper + + return { + "anomaly_block_list": anomaly_block_list, + "df": df, + "quantiles_df": quantiles_df} diff --git a/greykite/common/time_properties.py b/greykite/common/time_properties.py index d8a68b6..ea3f78f 100644 --- a/greykite/common/time_properties.py +++ b/greykite/common/time_properties.py @@ -65,6 +65,7 @@ def describe_timeseries(df, time_col): """ df = df.copy(deep=True) + df[time_col] = pd.to_datetime(df[time_col]) if df.shape[0] < 2: raise Exception("dataframe needs to have at least two rows") df["delta"] = ( diff --git a/greykite/common/viz/timeseries_annotate.py b/greykite/common/viz/timeseries_annotate.py index 59d1214..c5208f7 100644 --- a/greykite/common/viz/timeseries_annotate.py +++ b/greykite/common/viz/timeseries_annotate.py @@ -362,12 +362,21 @@ def plot_lines_markers( x_col, line_cols=None, marker_cols=None, + band_cols=None, + band_cols_dict=None, line_colors=None, - marker_colors=None): + marker_colors=None, + band_colors=None, + title=None): """A lightweight, easy-to-use function to create a plotly figure of given - lines (curves) and markers (points) from the columns of a dataframe with a - legend which matches the column names. 
- This can be used for example to annotate multiple curves with markers + + - lines (curves) + - markers (points) + - filled bands (e.g. error bands) + + from the columns of a dataframe with a legend which matches the column names. + + This can be used for example to annotate multiple curves, markers and bands with an easy function call. Parameters @@ -380,12 +389,33 @@ def plot_lines_markers( The list of y-axis variables to be plotted as lines / curves. marker_cols : `list` [`str`] or None, default None The list of y-axis variables to be plotted as markers / points. + band_cols : `list` [`str`] or None, default None + The list of y-axis variables to be plotted as bands. + Each column is expected to have tuples, each of which denotes the upper + and lower bounds. + band_cols_dict : `dict` [`str`: [`str`]] or None, default None + This is another way to specify bands. + In this case: + + - each key will be the name for the band + - the value contains the two bound columns of `df` for the band. + + For example `{ + "forecast": ["forecast_upper", "forecast_lower"], + "w": ["w1", "w2"]}` + Specifies two bands, one is based on the forecast prediction intervals + and one is based on a variable denoted by "w" which has two corresponding + columns in df: `"w1"` and `"w2"`. + line_colors : `list` [`str`] or None, default None - The list of colors to be used for each corresponding line given in ``line_cols`` + The list of colors to be used for each corresponding line column given in ``line_cols``. marker_colors : `list` [`str`] or None, default None - The list of colors to be used for each corresponding line given in ``line_cols`` + The list of colors to be used for each corresponding marker column given in ``marker_cols``. + band_colors : `list` [`str`] or None, default None + The list of colors to be used for each corresponding band column given in ``band_cols``. + Each of these colors is used as filler for each band. title : `str` or None, default None - Plot title.
If None, default is based on axis labels. + Plot title. If None, no title will appear. Returns ------- @@ -393,22 +423,37 @@ def plot_lines_markers( Interactive plotly graph of one or more columns in ``df`` against ``x_col``. """ - if line_colors is not None: - if len(line_colors) != len(line_cols): + if line_colors is not None and line_cols is not None: + if len(line_colors) < len(line_cols): + raise ValueError( + "If `line_colors` is passed, its length must be at least `len(line_cols)`") + + if marker_colors is not None and marker_cols is not None: + if len(marker_colors) < len(marker_cols): + raise ValueError( + "If `marker_colors` is passed, its length must be at least `len(marker_cols)`") + + if band_colors is not None and band_cols is not None: + if len(band_colors) < len(band_cols): raise ValueError( - "If `line_colors` is passed, its length must be equal to `line_cols`") + "If `band_colors` is passed, its length must be at least `len(band_cols)`") - if marker_colors is not None: - if len(marker_colors) != len(marker_cols): + if band_colors is not None and band_cols_dict is not None: + if len(band_colors) < len(band_cols_dict): raise ValueError( - "If `line_colors` is passed, its length must be equal to `line_cols`") + "If `band_colors` is passed, its length must be at least `len(band_cols_dict)`") - if line_cols is None and marker_cols is None: + if ( + line_cols is None and + marker_cols is None and + band_cols is None and + band_cols_dict is None): raise ValueError( - "At least one of `line_cols` or `marker_cols` must be passed as a list of strings (not None).") + "At least one of `line_cols` or `marker_cols` or `band_cols`" + " or `band_cols_dict` must be passed as a list (not None).") fig = go.Figure() - # Below we count the number of figure components to assign proper labels to legends + # Below we count the number of figure components to assign proper labels to legends. 
count_fig_data = -1 if line_cols is not None: for i, col in enumerate(line_cols): @@ -441,6 +486,79 @@ def plot_lines_markers( count_fig_data += 1 fig["data"][count_fig_data]["name"] = col + if band_cols is not None: + if band_colors is None: + band_colors = get_distinct_colors( + num_colors=len(band_cols), + opacity=0.2) + + for i, col in enumerate(band_cols): + fig.add_traces([ + go.Scatter( + x=df[x_col], + y=df[col].map(lambda b: b[1]), + mode="lines", + line=line, + line_color="rgba(0, 0, 0, 0)", + showlegend=True), + go.Scatter( + x=df[x_col], + y=df[col].map(lambda b: b[0]), + mode="lines", + line_color="rgba(0, 0, 0, 0)", + line=line, + fill="tonexty", + fillcolor=band_colors[i], + showlegend=True) + ]) + + # The code below adds legend for each band. + # We increment the count by two this time because each band comes with the + # inner filling and lines around it. + # In this case, we have made the lines around each band to be invisible. + # However, they do appear in the figure data and we want to only include + # one legend for each band. + count_fig_data += 2 + # This adds the legend corresponding to the band filler color. + fig["data"][len(fig["data"]) - 1]["name"] = col + # The name for this added data is the empty string, + # because we do not want to add a legend for the empty lines + # around the bands. 
+ fig["data"][len(fig["data"]) - 2]["name"] = "" + + if band_cols_dict is not None: + if band_colors is None: + band_colors = get_distinct_colors( + num_colors=len(band_cols_dict), + opacity=0.2) + + for i, name in enumerate(band_cols_dict): + col1 = band_cols_dict[name][0] + col2 = band_cols_dict[name][1] + fig.add_traces([ + go.Scatter( + x=df[x_col], + y=df[col2], + mode="lines", + line=line, + line_color="rgba(0, 0, 0, 0)", + showlegend=True), + go.Scatter( + x=df[x_col], + y=df[col1], + mode="lines", + line_color="rgba(0, 0, 0, 0)", + line=line, + fill="tonexty", + fillcolor=band_colors[i], + showlegend=True) + ]) + + count_fig_data += 2 + fig["data"][len(fig["data"]) - 1]["name"] = name + fig["data"][len(fig["data"]) - 2]["name"] = "" + + fig.update_layout(title=title) return fig @@ -1084,9 +1202,9 @@ def plot_anomalies_over_forecast_vs_actual( predicted_col=PREDICTED_COL, predicted_anomaly_col=PREDICTED_ANOMALY_COL, anomaly_col=ANOMALY_COL, - marker_opacity=0.7, - predicted_anomaly_marker_color="green", - anomaly_marker_color="red", + marker_opacity=1, + predicted_anomaly_marker_color="rgba(0, 90, 181, 0.9)", + anomaly_marker_color="rgba(250, 43, 20, 0.7)", **kwargs): """Utility function which overlayes the predicted anomalies or anomalies on the forecast vs actual plot. 
The function calls the internal function `~greykite.common.viz.timeseries_plotting.plot_forecast_vs_actual` @@ -1129,22 +1247,27 @@ def plot_anomalies_over_forecast_vs_actual( actual_col=actual_col, predicted_col=predicted_col, **kwargs) - if predicted_anomaly_col is not None: - fig.add_trace(go.Scatter( - x=df.loc[df[predicted_anomaly_col].apply(lambda val: val is True), time_col], - y=df.loc[df[predicted_anomaly_col].apply(lambda val: val is True), predicted_col], - mode="markers", - marker=go.scatter.Marker(color=predicted_anomaly_marker_color), - name=predicted_anomaly_col.title(), - showlegend=True, - opacity=marker_opacity)) if anomaly_col is not None: fig.add_trace(go.Scatter( x=df.loc[df[anomaly_col].apply(lambda val: val is True), time_col], y=df.loc[df[anomaly_col].apply(lambda val: val is True), actual_col], mode="markers", + marker_size=10, + marker_symbol="square", marker=go.scatter.Marker(color=anomaly_marker_color), name=anomaly_col.title(), showlegend=True, opacity=marker_opacity)) + if predicted_anomaly_col is not None: + fig.add_trace(go.Scatter( + x=df.loc[df[predicted_anomaly_col].apply(lambda val: val is True), time_col], + y=df.loc[df[predicted_anomaly_col].apply(lambda val: val is True), actual_col], + mode="markers", + marker_size=7, + marker_symbol="diamond", + marker=go.scatter.Marker(color=predicted_anomaly_marker_color), + name=predicted_anomaly_col.title(), + showlegend=True, + opacity=marker_opacity)) + return fig diff --git a/greykite/detection/__init__.py b/greykite/detection/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/greykite/detection/common/__init__.py b/greykite/detection/common/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/greykite/detection/common/ad_evaluation.py b/greykite/detection/common/ad_evaluation.py new file mode 100644 index 0000000..ddcfcdf --- /dev/null +++ b/greykite/detection/common/ad_evaluation.py @@ -0,0 +1,661 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and 
use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Saad Eddin Al Orjany, Sayan Patra, Reza Hosseini, Kaixu Yang + +"""Evaluation functions.""" + +import functools +import warnings +from typing import Optional +from typing import Union + +import numpy as np +import pandas as pd +from sklearn import metrics + +from greykite.detection.common.ad_evaluation_utils import compute_range_based_score +from greykite.detection.common.ad_evaluation_utils import prepare_anomaly_ranges + + +INPUT_COL_NAME = "input_col" + + +def validate_categorical_input(score_func): + """Decorator function to validate categorical scoring function input, + and unifies the input type to pandas.Series. 
+ """ + + @functools.wraps(score_func) + def score_func_wrapper( + y_true: Union[list, np.array, pd.Series, pd.DataFrame], + y_pred: Union[list, np.array, pd.Series, pd.DataFrame], + *args, + **kwargs) -> np.array: + actual = pd.DataFrame(y_true).reset_index(drop=True) + pred = pd.DataFrame(y_pred).reset_index(drop=True) + if actual.shape[-1] != 1 or pred.shape[-1] != 1: + raise ValueError(f"The input for scoring must be 1-D array, found {actual.shape} and {pred.shape}") + if actual.shape != pred.shape: + raise ValueError(f"The input lengths must be the same, found {actual.shape} and {pred.shape}") + actual.columns = [INPUT_COL_NAME] + pred.columns = [INPUT_COL_NAME] + # Drop rows with NA values in either actual or pred + merged_df = pd.concat([actual, pred], axis=1).dropna() + actual = merged_df.iloc[:, [0]] + pred = merged_df.iloc[:, [1]] + category_in_actual_set = set(actual[INPUT_COL_NAME]) + category_in_pred_set = set(pred[INPUT_COL_NAME]) + pred_minus_actual = category_in_pred_set.difference(category_in_actual_set) + if pred_minus_actual: + warnings.warn(f"The following categories do not appear in y_true column, " + f"the recall may be undefined.\n{pred_minus_actual}") + actual_minus_pred = category_in_actual_set.difference(category_in_pred_set) + if actual_minus_pred: + warnings.warn(f"The following categories do not appear in y_pred column, " + f"the precision may be undefined.\n{actual_minus_pred}") + # Adds a list wrapper below since `sklearn >= 1.1` restricts the input types and shapes. + return score_func( + y_true=list(actual[INPUT_COL_NAME].reset_index(drop=True)), + y_pred=list(pred[INPUT_COL_NAME].reset_index(drop=True)), + *args, + **kwargs + ) + + return score_func_wrapper + + +@validate_categorical_input +def precision_score( + y_true, + y_pred, + sample_weight=None): + """Computes the precision scores for two arrays. + + Parameters + ---------- + y_true : array-like, 1-D + The actual categories. 
+ y_pred : array-like, 1-D + The predicted categories. + sample_weight : array-like, 1-D + The sample weight. + + Returns + ------- + precision : `dict` + The precision score for different categories. + The keys are the categories, and the values are the precisions. + """ + actual_category = pd.unique(y_true) + pred_category = pd.unique(y_pred) + labels = pd.unique(np.concatenate([actual_category, pred_category])) + precisions_array = metrics.precision_score( + y_true=y_true, + y_pred=y_pred, + average=None, + labels=labels, + sample_weight=sample_weight, + zero_division=0 + ) + precisions = {} + for label, precision in zip(labels, precisions_array): + precisions[label] = precision + return precisions + + +@validate_categorical_input +def recall_score( + y_true, + y_pred, + sample_weight=None): + """Computes the recall scores for two arrays. + + Parameters + ---------- + y_true : array-like, 1-D + The actual categories. + y_pred : array-like, 1-D + The predicted categories. + sample_weight : array-like, 1-D + The sample weight. + + Returns + ------- + recall : `dict` + The recall score for different categories. + The keys are the categories, and the values are the recalls. + """ + actual_category = pd.unique(y_true) + pred_category = pd.unique(y_pred) + labels = pd.unique(np.concatenate([actual_category, pred_category])) + recalls_array = metrics.recall_score( + y_true=y_true, + y_pred=y_pred, + average=None, + labels=labels, + sample_weight=sample_weight, + zero_division=0 + ) + recalls = {} + for label, recall in zip(labels, recalls_array): + recalls[label] = recall + return recalls + + +@validate_categorical_input +def f1_score( + y_true, + y_pred, + sample_weight=None): + """Computes the F1 scores for two arrays. + + Parameters + ---------- + y_true : array-like, 1-D + The actual categories. + y_pred : array-like, 1-D + The predicted categories. + sample_weight : array-like, 1-D + The sample weight. 
+ + Returns + ------- + f1_scores : `dict` + The F1 score for different categories. + The keys are the categories, and the values are the F1 scores. + """ + actual_category = pd.unique(y_true) + pred_category = pd.unique(y_pred) + labels = pd.unique(np.concatenate([actual_category, pred_category])) + f1s_array = metrics.f1_score( + y_true=y_true, + y_pred=y_pred, + average=None, + labels=labels, + sample_weight=sample_weight, + zero_division=0 + ) + f1_scores = {} + for label, f1 in zip(labels, f1s_array): + f1_scores[label] = f1 + return f1_scores + + +@validate_categorical_input +def matthews_corrcoef( + y_true, + y_pred, + sample_weight=None): + """Computes the Matthews correlation coefficient for two arrays. + The statistic is also known as the phi coefficient. + The Matthews correlation coefficient is used in machine learning as a measure of the quality of binary and multiclass classifications. + It takes into account true and false positives and negatives and is generally regarded as a balanced measure + which can be used even if the classes are of very different sizes. + The MCC is in essence a correlation coefficient value between -1 and +1 (inclusive). + One can interpret this coefficient as follows: + + - +1 represents a perfect prediction. + - 0 represents an average random prediction. + - -1 represents an inverse prediction. + + For more information, please consult the `wiki page <https://en.wikipedia.org/wiki/Phi_coefficient>`_. + + Parameters + ---------- + y_true : array-like, 1-D + The actual categories. + y_pred : array-like, 1-D + The predicted categories. + sample_weight : array-like, 1-D or None, default None + The sample weight. + + Returns + ------- + result : `float` + The Matthews correlation coefficient.
+ """ + return metrics.matthews_corrcoef( + y_true=y_true, + y_pred=y_pred, + sample_weight=sample_weight) + + +@validate_categorical_input +def informedness_statistic( + y_true, + y_pred, + sample_weight=None): + """Computes the Informedness also known as the Youden's J statistic for two arrays. + Youden's J statistic is defined as J = sensitivity + specificity - 1 for a binary output. + Informedness is its generalization to the multiclass case and estimates the probability of an informed decision. + Note that in binary classification, we have: + + - sensitivity: recall of the positive class. + - specificity: recall of the negative class. + + The index gives equal weight to false positive and false negative values. + In other words, all algorithms with the same value of the index give the same proportion of total misclassified results. + Its value ranges from -1 through +1 (inclusive). + One can interpret this statistic as follows: + + - +1 represents that there are no false positives or false negatives, i.e. the algorithm is perfect. + - 0 represents when an algorithm gives the same proportion of positive results with and without an anomaly, i.e. the test is useless. + - -1 represents that the classification yields only false positives and false negatives. It's an inverse prediction. + + For more information, please consult the `wiki page <https://en.wikipedia.org/wiki/Youden%27s_J_statistic>`_. + + Parameters + ---------- + y_true : array-like, 1-D + The actual categories. + y_pred : array-like, 1-D + The predicted categories. + sample_weight : array-like, 1-D or None, default None + The sample weight. + + Returns + ------- + result : `float` + The informedness statistic. + """ + return metrics.balanced_accuracy_score( + y_true=y_true, + y_pred=y_pred, + sample_weight=sample_weight, + adjusted=True) + + +@validate_categorical_input +def confusion_matrix( + y_true, + y_pred, + sample_weight=None): + """Computes the confusion matrix for two arrays.
+ + Parameters + ---------- + y_true : array-like, 1-D + The actual categories. + y_pred : array-like, 1-D + The predicted categories. + sample_weight : array-like, 1-D + The sample weight. + + Returns + ------- + confusion_matrix : `pandas.DataFrame` + The confusion matrix. + """ + actual_category = pd.unique(y_true) + pred_category = pd.unique(y_pred) + all_category = pd.unique(np.concatenate([actual_category, pred_category])) + matrix = metrics.confusion_matrix( + y_true=y_true, + y_pred=y_pred, + labels=all_category, + sample_weight=sample_weight + ) + matrix = pd.DataFrame(matrix) + matrix.index = pd.MultiIndex.from_arrays([["Actual"] * len(all_category), matrix.index]) + matrix.columns = pd.MultiIndex.from_arrays([["Pred"] * len(all_category), matrix.columns]) + return matrix + + +@validate_categorical_input +def soft_recall_score( + y_true, + y_pred, + window): + """Computes the soft recall score for two classes, usually labeled 1 and 0 to denote + an anomaly/ alert and not an anomaly/ alert, for `y_true` and `y_pred` respectively. + + soft_recall(window) is defined as the proportion of anomalies that were correctly + alerted within the window size. Mathematically, + + soft_recall(window) = TruePositive(window)/ sum_i (y_true_i == 1), + where + TruePositive(window) = sum_i(y_true_i == 1, max(y_pred_{i-window}, ..., y_pred_{i+window}) == 1). + + For example, let window = 2. + If the ith value in `y_true` is an anomaly (labeled 1), then we say the anomaly + is predicted if any of i-2, i-1, i, i+1, i+2 value in `y_pred` is an alert (labeled 1). + True Positive (window) is the sum of all such predicted anomalies. + + As far as we know these soft metrics do not appear in related work at least in this simple form. + These metrics were introduced by Reza Hosseini and Sayan Patra as a part of this work. + They are found to be a very simple yet powerful extension of Precision/Recall in our work.
+ + Parameters + ---------- + y_true : array-like, 1-D + The actual categories. + y_pred : array-like, 1-D + The predicted categories. + window : `int` + The window size to determine True Positives. + + Returns + ------- + recall : `dict` + The recall scores for various categories. + + Examples + -------- + >>> y_true = [0, 1, 1, 1, 0, 0] + >>> y_pred = [0, 0, 0, 1, 0, 1] + >>> print([soft_recall_score(y_true, y_pred, window) for window in [0, 1, 2]]) + [0.3333333333333333, 0.6666666666666666, 1.0] + """ + if not isinstance(window, int) or window < 0: + raise ValueError(f"Input value of the parameter window ({window}) is not a non-negative integer.") + + # If `window` is 0, we revert to the standard definition directly (to save computation time). + if window == 0: + return recall_score(y_true=y_true, y_pred=y_pred) + + lag_y_pred = pd.DataFrame({ + "y_pred_0": y_pred + }) + + for i in np.arange(-window, window + 1): + # Due to shifting there will be missing data at the beginning and the end of + # the series. ffill and bfill interpolates these edge cases. + lag_y_pred[f"y_pred_{i}"] = lag_y_pred["y_pred_0"].shift(i).ffill().bfill() + + y_pred_soft = lag_y_pred.any(axis="columns") + + return recall_score(y_true, y_pred_soft) + + +@validate_categorical_input +def soft_precision_score( + y_true, + y_pred, + window): + """Computes the soft precision score for two classes, usually labeled 1 and 0 to denote + an anomaly and not an anomaly. + + soft_precision(window) is defined as the proportion of alerts that correspond to an + anomaly within the window size. Mathematically, + + soft_precision(window) = TruePositive(window)/ sum_i (y_pred_i == 1), + where + TruePositive(window) = sum_i(y_pred_i == 1, max(actual_{i-window}, ..., actual_{i+window}) == 1) + + For example, let window = 2. + If the ith value in `y_pred` is an alert (labeled 1), then we say an anomaly + is captured if any of i-2, i-1, i, i+1, i+2 value in `y_true` is an anomaly (labeled 1).
+ + True Positive (window) is the sum of all such captured anomalies. + + As far as we know these soft metrics do not appear in related work at least in this simple form. + These metrics were introduced by Reza Hosseini and Sayan Patra as a part of this work. + They are found to be a very simple yet powerful extension of Precision/Recall in our work. + + Parameters + ---------- + y_true : array-like, 1-D + The actual categories. + y_pred : array-like, 1-D + The predicted categories. + window : `int` + The window size to determine True Positives. + + + Returns + ------- + precision : `dict` + The precision scores for various categories. + + Examples + -------- + >>> y_true = [0, 1, 1, 1, 0, 0] + >>> y_pred = [0, 0, 0, 1, 0, 1] + >>> print([soft_precision_score(y_true, y_pred, window) for window in [0, 1, 2]]) + [0.5, 0.5, 1.0] + """ + if not isinstance(window, int) or window < 0: + raise ValueError(f"Input value of the parameter window ({window}) is not a non-negative integer.") + + # If `window` is 0, we revert to the standard definition directly (to save computation time). + if window == 0: + return precision_score(y_true=y_true, y_pred=y_pred) + + lag_y_true = pd.DataFrame({ + "y_true_0": y_true + }) + + for i in np.arange(-window, window + 1): + # Due to shifting there will be missing data at the beginning and the end of + # the series. ffill and bfill interpolates these edge cases. + lag_y_true[f"y_true_{i}"] = lag_y_true["y_true_0"].shift(i).ffill().bfill() + + y_true_soft = lag_y_true.any(axis="columns") + + return precision_score(y_true_soft, y_pred) + + +@validate_categorical_input +def soft_f1_score( + y_true, + y_pred, + window): + """Computes the soft F1 score for two classes, usually labeled 1 and 0 to denote + an anomaly and not an anomaly.
+ Soft F1 is simply calculated from + - Soft Precision: `~greykite.detection.common.ad_evaluation.soft_precision_score` and + - Soft Recall: `~greykite.detection.common.ad_evaluation.soft_recall_score` + using the standard formula for F1: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html + + Parameters + ---------- + y_true : array-like, 1-D + The actual categories. + y_pred : array-like, 1-D + The predicted categories. + window : `int` + The window size to determine True Positives. + + Returns + ------- + soft_f1 : `dict` + The soft F1 scores for various categories. + """ + soft_precision = soft_precision_score( + y_true=y_true, + y_pred=y_pred, + window=window) + + soft_recall = soft_recall_score( + y_true=y_true, + y_pred=y_pred, + window=window) + + soft_f1 = {} + # Soft f1 can only be defined for categories (e.g. True, False) that + # appear in results of both precision and recall. + # Therefore we first find the intersection. + admissable_categories = set(soft_precision.keys()).intersection(soft_recall.keys()) + for categ in admissable_categories: + soft_f1[categ] = ( + 2 * (soft_precision[categ] * soft_recall[categ]) / + (soft_precision[categ] + soft_recall[categ])) + + return soft_f1 + + +@validate_categorical_input +def range_based_precision_score( + y_true, + y_pred, + alpha: float = 0.5, + positional_bias: str = "flat", + cardinality_bias: Optional[str] = None, + range_based: bool = True): + """Compute a precision score for two classes, usually labeled 1 and 0 to denote an anomaly and not an anomaly. + Both ``y_true`` and ``y_pred`` need to be sorted by timestamp. + + This precision implementation is from the paper: + Precision and Recall for Time Series + <https://arxiv.org/abs/1803.03639>; + + Point-wise real and predicted anomalies are first transformed into anomaly ranges.
Then, given the set of real + anomaly ranges, R = {R_1, ..., R_Nr}, and predicted anomaly ranges, P = {P_1, ..., P_Np}, a precision score Precision_T(R, P_j) is + calculated for each predicted anomaly range, P_j. Those precision scores are then added into a total precision score and divided by the + total number of predicted anomaly ranges, Np, to obtain an average precision score for the whole timeseries. + + Multiple considerations are taken into account when computing the individual precision scores for each real anomaly range, such as: + Existence: Catching the existence of an anomaly (even by predicting only a single point in R_i), by itself, might be valuable + for the application. + Size: The larger the size of the correctly predicted portion of R_i, the higher the precision score. + Position: In some cases, not only size, but also the relative position of the correctly predicted portion of R_i might matter + to the application. + Cardinality: Detecting R_i with a single prediction range P_j ∈ P may be more valuable than doing so with multiple different + ranges in P in a fragmented manner. + + All of those considerations are captured using two main reward terms: Existence Reward and Overlap Reward, weighted by a weighting + constant ``alpha``. The precision score for each predicted anomaly range will be calculated as: + Precision_T = alpha * Existence Reward + (1 - alpha) * Overlap Reward + + Parameters + ---------- + y_true : array-like, 1-D + The actual point-wise anomalies + y_pred : array-like, 1-D + The predicted point-wise anomalies + alpha : `float` + Reward weighting term for the two main reward terms for the predicted anomaly range precision score: existence and overlap rewards. + positional_bias : `str`, default "flat" + The accepted options are: + * "flat": Each index position of an anomaly range is equally important. 
Return the same value of 1.0 as the positional + reward regardless of the location of the pointwise anomaly within the anomaly range. + * "front": Positional reward is biased towards early detection, as earlier overlap locations of pointwise + anomalies with an anomaly range are assigned higher rewards. + * "middle": Positional reward is biased towards the detection of anomaly closer to its middle point, as overlap locations + closer to the middle of an anomaly range are assigned higher rewards. + * "back": Positional reward is biased towards later detection, as later overlap locations of pointwise anomalies with an + anomaly range are assigned higher rewards. + cardinality_bias: `str` or None, default None + In the overlap reward, this is a penalization factor. If None, no cardinality penalty will be applied. If "reciprocal", the + overlap reward will be penalized as it gets multiplied by the reciprocal of the number of detected anomaly ranges overlapping + with the predicted anomaly range. + range_based: `bool`, default True + This implementation of range-based precision subsumes the classic precision. If True range based precision will be calculated, otherwise + classic precision will be calculated. + + Returns + ------- + precision : `float` + The overall precision score for the time series. 
+ """ + assert len(y_true) == len(y_pred) + assert 0 <= alpha <= 1 + assert positional_bias in ["flat", "front", "middle", "back"] + if cardinality_bias is not None: + assert cardinality_bias == "reciprocal" + + real_anomaly_ranges = prepare_anomaly_ranges(np.array(y_true), range_based) + predicted_anomaly_ranges = prepare_anomaly_ranges(np.array(y_pred), range_based) + + precision = compute_range_based_score( + predicted_anomaly_ranges, + real_anomaly_ranges, + alpha, positional_bias, cardinality_bias) + return precision + + +@validate_categorical_input +def range_based_recall_score( + y_true, + y_pred, + alpha: float = 0.5, + positional_bias: str = "flat", + cardinality_bias: Optional[str] = None, + range_based: bool = True): + """Compute a recall score for two classes, usually labeled 1 and 0 to denote an anomaly and not an anomaly. + Both ``y_true`` and ``y_pred`` need to be sorted by timestamp. + + This recall implementation is from the paper: + Precision and Recall for Time Series + <https://arxiv.org/abs/1803.03639>; + + Point-wise real and predicted anomalies are first transformed into anomaly ranges. Then, given the set of real + anomaly ranges, R = {R_1, ..., R_Nr}, and predicted anomaly ranges, P = {P_1, ..., P_Np}, a recall score Recall_T(R_i, P) is + calculated for each real anomaly range, R_i. Those recall scores are then added into a total recall score and divided by the + total number of real anomaly ranges, Nr, to obtain an average recall score for the whole timeseries. + + Multiple considerations are taken into account when computing the individual recall scores for each real anomaly range, such as: + Existence: Catching the existence of an anomaly (even by predicting only a single point in R_i), by itself, might be valuable + for the application. + Size: The larger the size of the correctly predicted portion of R_i, the higher the recall score.
+ Position: In some cases, not only size, but also the relative position of the correctly predicted portion of R_i might matter + to the application. + Cardinality: Detecting R_i with a single prediction range P_j ∈ P may be more valuable than doing so with multiple different + ranges in P in a fragmented manner. + + All of those considerations are captured using two main reward terms: Existence Reward and Overlap Reward, weighted by a weighting + constant ``alpha``. The recall score for each real anomaly range will be calculated as: + Recall_T = alpha * Existence Reward + (1 - alpha) * Overlap Reward + + Parameters + ---------- + y_true : array-like, 1-D + The actual point-wise anomalies + y_pred : array-like, 1-D + The predicted point-wise anomalies + alpha : `float` + Reward weighting term for the two main reward terms for the real anomaly range recall score: existence and overlap rewards. + positional_bias : `str`, default "flat" + The accepted options are: + * "flat": Each index position of an anomaly range is equally important. Return the same value of 1.0 as the positional + reward regardless of the location of the pointwise anomaly within the anomaly range. + * "front": Positional reward is biased towards early detection, as earlier overlap locations of pointwise + anomalies with an anomaly range are assigned higher rewards. + * "middle": Positional reward is biased towards the detection of anomaly closer to its middle point, as overlap locations + closer to the middle of an anomaly range are assigned higher rewards. + * "back": Positional reward is biased towards later detection, as later overlap locations of pointwise anomalies with an + anomaly range are assigned higher rewards. + cardinality_bias: `str` or None, default None + In the overlap reward, this is a penalization factor. If None, no cardinality penalty will be applied. 
If "reciprocal", the + overlap reward will be penalized as it gets multiplied by the reciprocal of the number of detected anomaly ranges overlapping + with the real anomaly range. + range_based: `bool`, default True + This implementation of range-based recall subsumes the classic recall. If True range based recall will be calculated, otherwise + classic recall will be calculated. + + Returns + ------- + recall : `float` + The overall recall score for the time series. + """ + assert len(y_true) == len(y_pred) + assert 0 <= alpha <= 1 + assert positional_bias in ["flat", "front", "middle", "back"] + if cardinality_bias is not None: + assert cardinality_bias == "reciprocal" + + real_anomaly_ranges = prepare_anomaly_ranges(np.array(y_true), range_based) + predicted_anomaly_ranges = prepare_anomaly_ranges(np.array(y_pred), range_based) + recall = compute_range_based_score( + real_anomaly_ranges, + predicted_anomaly_ranges, + alpha, positional_bias, cardinality_bias) + return recall diff --git a/greykite/detection/common/ad_evaluation_utils.py b/greykite/detection/common/ad_evaluation_utils.py new file mode 100644 index 0000000..17f60f2 --- /dev/null +++ b/greykite/detection/common/ad_evaluation_utils.py @@ -0,0 +1,306 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Saad Eddin Al Orjany + +"""util functions""" + +from typing import List +from typing import Optional + +import numpy as np + + +def shift(arr, num: int, fill_value: int = np.nan): + """Rolls a 1d-array in either direction, and applies a fill value on a portion of the array + + Parameters + ---------- + arr : array-like, 1-D + Array to roll and apply partial mask on + num : `int` + Number of positions to roll by. If positive, the first ``num`` entries of the rolled array are overwritten with ``fill_value``; if negative, the last ``abs(num)`` entries are overwritten.
+ fill_value: `int` + After shifting elements, a value to fill in place of NAs in the resulting shifted array + + Returns + ------- + arr : array-like, 1-D + Rolled and partially-masked array + """ + arr = np.roll(arr, num) + if num < 0: + arr[num:] = fill_value + elif num > 0: + arr[:num] = fill_value + return arr + + +def prepare_anomaly_ranges(pointwise_anomalies, range_based: bool = True): + """Convert a list of pointwise anomalies into a list of anomaly ranges + + Parameters + ---------- + pointwise_anomalies : array-like, 1-D + List of pointwise anomalies + range_based : `bool` + If False, each pointwise anomaly is treated as an anomaly range + If True, adjacent pointwise anomalies will be merged into a single anomaly interval + + Returns + ------- + anomaly_ranges: array-like, 2D + 2D-array representing anomaly ranges, with each element indicating the + start and end indexes for an anomaly period. + """ + + if range_based: + pointwise_anomalies = np.argwhere(pointwise_anomalies == 1).ravel() + anomaly_ranges_shift_forward = shift( + pointwise_anomalies, + 1, + fill_value=pointwise_anomalies[0]) + anomaly_ranges_shift_backward = shift( + pointwise_anomalies, + -1, + fill_value=pointwise_anomalies[-1]) + anomaly_ranges_start = np.argwhere(( + anomaly_ranges_shift_forward - pointwise_anomalies) != -1).ravel() + anomaly_ranges_end = np.argwhere(( + pointwise_anomalies - anomaly_ranges_shift_backward) != -1).ravel() + anomaly_ranges = np.hstack([ + pointwise_anomalies[anomaly_ranges_start].reshape(-1, 1), + pointwise_anomalies[anomaly_ranges_end].reshape(-1, 1)]) + else: + anomaly_ranges = np.argwhere(pointwise_anomalies == 1).repeat(2, axis=1) + + return anomaly_ranges + + +def get_cardinality_factor( + overlap_count, + cardinality_bias: Optional[str] = None): + """Cardinality factor used to penalize the overlap size & positional reward.
+ + Parameters + ---------- + overlap_count : array-like, 1-D + Accumulator used to keep track of the number of overlaps between an anomaly range and a set of anomaly ranges. + cardinality_bias: `str` or None, default None + In the overlap reward, this is a penalization factor. If None, no cardinality penalty will be applied. If "reciprocal", the + overlap reward will be penalized as it gets multiplied by the reciprocal of the number of detected anomaly ranges overlapping + with the predicted anomaly range. + + Returns + ------- + cardinality_factor: `float` + A multiplying factor used to penalize higher cardinality: the number of overlaps of + predicted anomaly ranges with a real anomaly range, and vice versa. + """ + + if cardinality_bias is not None: + assert cardinality_bias == "reciprocal" + + overlap = overlap_count[0] + assert overlap >= 0 + + cardinality_factor = 1.0 + + if cardinality_bias == "reciprocal" and overlap > 1: + cardinality_factor /= overlap + + return cardinality_factor + + +def get_positional_reward( + loc: int, + anomaly_length: int, + positional_bias: str = "flat"): + """Positional reward for a single pointwise anomaly within an anomaly range + + Parameters + ---------- + loc : `int` + Location of the pointwise anomaly, within the anomaly range. Takes value in the range [1, anomaly_length] + anomaly_length: `int` + Length of the anomaly range used to score a pointwise anomaly within it. + positional_bias : `str`, default "flat" + The accepted options are: + * "flat": Each index position of an anomaly range is equally important. Return the same value of 1.0 as the positional + reward regardless of the location of the pointwise anomaly within the anomaly range. + * "front": Positional reward is biased towards early detection, as earlier overlap locations of pointwise + anomalies with an anomaly range are assigned higher rewards.
+ * "middle": Positional reward is biased towards the detection of anomaly closer to its middle point, as overlap locations + closer to the middle of an anomaly range are assigned higher rewards. + * "back": Positional reward is biased towards later detection, as later overlap locations of pointwise anomalies with an + anomaly range are assigned higher rewards. + + Returns + ------- + positional_reward: `float` + Positional reward for the pointwise anomaly within an anomaly range. + """ + + assert 1 <= loc <= anomaly_length + positional_reward = 1.0 + + if positional_bias == "flat": + return positional_reward + elif positional_bias == "front": + positional_reward = float(anomaly_length - loc + 1.0) + elif positional_bias == "middle": + if loc <= anomaly_length / 2.0: + positional_reward = float(loc) + else: + positional_reward = float(anomaly_length - loc + 1.0) + elif positional_bias == "back": + positional_reward = float(loc) + else: + raise Exception("Invalid positional bias value") + return positional_reward + + +def get_overlap_size_and_position_reward( + anomaly_range_1: List[int], + anomaly_range_2: List[int], + overlap_count, + positional_bias: str = "flat"): + """Calculates overlap reward for both size and position of two anomaly ranges + + Parameters + ---------- + anomaly_range_1 : list [int] + A list of two integers, representing the start and end indexes of an anomaly range + anomaly_range_2 : list [int] + A list of two integers, representing the start and end indexes of an anomaly range + overlap_count : array-like, 1-D + Accumulator used to keep track of the number of overlaps between an anomaly range and a set of anomaly ranges. + positional_bias : `str`, default "flat" + If "flat", each index position of an anomaly range is equally important. Return the same + number, 1.0, as the positional reward regardless of the location of the pointwise anomaly + within the anomaly range.
+ If "front", reward is biased towards early detection, as earlier overlap locations of pointwise + anomalies with an anomaly range are assigned higher rewards. + If "middle", reward is biased towards the detection of anomaly closer to its middle point, as + overlap locations closer to the middle of an anomaly range are assigned higher rewards. + If "back", reward is biased towards later detection, as later overlap locations of pointwise + anomalies with an anomaly range are assigned higher rewards. + + Returns + ------- + overlap_size_and_position_reward: `float` + Overlap reward for both size and position of two anomaly ranges. + """ + overlap_size_and_position_reward = 0 + + if anomaly_range_1[1] < anomaly_range_2[0] or anomaly_range_1[0] > anomaly_range_2[1]: + return overlap_size_and_position_reward + else: + overlap_count[0] += 1 + overlap = np.zeros(anomaly_range_1.shape) + overlap[0] = max(anomaly_range_1[0], anomaly_range_2[0]) + overlap[1] = min(anomaly_range_1[1], anomaly_range_2[1]) + + anomaly_length = anomaly_range_1[1] - anomaly_range_1[0] + 1 + overlap_positional_reward = 0 + max_positional_reward = 0 + for local_idx in range(1, anomaly_length + 1): + temp_reward = get_positional_reward(local_idx, anomaly_length, positional_bias) + max_positional_reward += temp_reward + + idx = anomaly_range_1[0] + local_idx - 1 + if overlap[0] <= idx <= overlap[1]: + overlap_positional_reward += temp_reward + + if max_positional_reward > 0: + overlap_size_and_position_reward = overlap_positional_reward / max_positional_reward + + return overlap_size_and_position_reward + + +def compute_range_based_score( + anomaly_ranges_1, + anomaly_ranges_2, + alpha: float = 0.5, + positional_bias: str = "flat", + cardinality_bias: Optional[str] = None): + """Given two lists of anomaly ranges, calculate a range-based score over the time series represented by the first list + of anomaly ranges. 
If ``anomaly_ranges_1`` is the predicted anomaly ranges, the result is range-based precision score. If + ``anomaly_range_1`` is the real anomaly ranges, the result is range-based recall score. + + Parameters + ---------- + anomaly_ranges_1 : array-like, 2D + 2D-array representing anomaly ranges, with each element indicating the + start and end indexes for an anomaly period. + anomaly_ranges_2 : array-like, 2D + 2D-array representing anomaly ranges, with each element indicating the + start and end indexes for an anomaly period. + alpha : `float` + Reward weighting term for the two main reward terms for the real anomaly range recall score: existence + and overlap rewards. + positional_bias : `str`, default "flat" + If "flat", each index position of an anomaly range is equally important. Return the same + number, 1.0, as the positional reward regardless of the location of the pointwise anomaly + within the anomaly range. + If "front", reward is biased towards early detection, as earlier overlap locations of pointwise + anomalies with an anomaly range are assigned higher rewards. + If "middle", reward is biased towards the detection of anomaly closer to its middle point, as + overlap locations closer to the middle of an anomaly range are assigned higher rewards. + If "back", reward is biased towards later detection, as later overlap locations of pointwise + anomalies with an anomaly range are assigned higher rewards. + cardinality_bias: `str` or None, default None + In the overlap reward, this is a penalization factor. If None, no cardinality penalty will be applied. If "reciprocal", the + overlap reward will be penalized as it gets multiplied by the reciprocal of the number of detected anomaly ranges overlapping + with the predicted anomaly range. + + Returns + ------- + overlap_size_and_position_reward: `float` + Overlap reward for both size and position of two anomaly ranges. 
+ """ + + assert 0 <= alpha <= 1 + assert positional_bias in ["flat", "front", "middle", "back"] + if cardinality_bias is not None: + assert cardinality_bias == "reciprocal" + + score = 0.0 + + for range_idx1 in range(len(anomaly_ranges_1)): + overlap_count = [0] + overlap_size_and_position_reward = 0 + real_range = anomaly_ranges_1[range_idx1, :] + for range_idx2 in range(len(anomaly_ranges_2)): + predicted_range = anomaly_ranges_2[range_idx2, :] + overlap_size_and_position_reward += get_overlap_size_and_position_reward( + real_range, + predicted_range, + overlap_count, + positional_bias) + + cardinality_factor = get_cardinality_factor(overlap_count, cardinality_bias) + overlap_reward = cardinality_factor * overlap_size_and_position_reward + + existence_reward = 1 if overlap_count[0] > 0 else 0 + score += alpha * existence_reward + (1 - alpha) * overlap_reward + + score /= len(anomaly_ranges_1) + return score diff --git a/greykite/detection/common/pickler.py b/greykite/detection/common/pickler.py new file mode 100644 index 0000000..f75a865 --- /dev/null +++ b/greykite/detection/common/pickler.py @@ -0,0 +1,650 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
# Constants used for serialization of the model.
# Be careful not to have these strings in the object to be serialized.
KEY_SUFFIX = "__key__"
"""Suffix used to denote the key of a dictionary during serialization."""
VALUE_SUFFIX = "__value__"
"""Suffix used to denote the value of a dictionary during serialization."""
OBJECT_TYPE_ROOT = "ROOT"
"""The string to denote the root of the serialized object tree."""
PICKLE_KEY_EXTENSION = ".pkl"
"""The extension used to denote a object that can be directly serialized by dill."""
DIRECTORY_KEY_EXTENSION = ".dir"
"""The extension used to denote a directory that can not be directly serialized by dill."""
TYPE_KEY_EXTENSION = ".type"
"""The extension used to denote the type of an object that can not be directly serialized by dill."""


class GreykitePickler:
    """Extends the functionality of dill to serialize arbitrary objects.

    Originally intended to serialize the anomaly detector class
    `~greykite.detection.anomaly_detector.AnomalyDetector`, but
    can be potentially used to serialize other objects.

    Outputs a dictionary that can be serialized by `json` or `yaml` libraries.
    The dictionary has a tree structure, where each node is either a
    serialized object, or a pair of type and directory of serialized objects.
    The base of the directory is called "ROOT".

    Usage
    -----
    # To serialize
    pickler = GreykitePickler()
    pickled = pickler.dumps(obj)
    pickled_json = json.dumps(pickled)

    # To deserialize
    pickled = json.loads(pickled_json)
    obj = pickler.loads(pickled)

    Attributes
    ----------
    obj: any
        The object most recently passed to `dumps`.
    """
    def __init__(self):
        """Initializes an instance of the GreykitePickler class."""
        self.obj = None

    def dumps(self, obj, obj_name=OBJECT_TYPE_ROOT):
        """Uses Depth First Search (DFS) to recursively serialize the input `obj`.

        For each object, the following happens:
            1. If `dumps_to_str` succeeds on `obj`, a single key is generated.

            ```python
            {
                "{obj_name}.pkl": "serialized object",
            }
            ```

            2. If `dumps_to_str` raises `NotImplementedError`, two keys are generated.
                - {`obj_name`}.type stores the object type.
                - {`obj_name`}.dir stores the elements/attributes of the object.
                To build the {`obj_name`}.dir dictionary, `dumps` is called recursively.

            ```python
            {
                "{obj_name}.type": "type of the object",
                "{obj_name}.dir": {
                    "key1": "serialized value1",
                    "key2": "serialized value2",
                    ...
                }
            }
            ```

        The current supported recursion types are:

            - OrderedDict: type name is "OrderedDict", each key and value is
              serialized independently. The order is preserved.
            - dict: type name is "dict", each key and value is serialized independently.
            - list/tuple: type name is "list" or "tuple", each element is
              serialized independently. The order is preserved.
            - class instance: the type is the class object, used to create a new
              instance. Each entry of ``obj.__dict__`` is serialized independently.

        Parameters
        ----------
        obj: any
            The object to be serialized.
        obj_name: `str`, default "ROOT"
            The name of the object to be serialized.

        Returns
        -------
        serialized: `dict`
            The serialized object.

        Raises
        ------
        NotImplementedError: If the object cannot be serialized.
        """
        self.obj = obj
        try:
            serialized = {f"{obj_name}{PICKLE_KEY_EXTENSION}": self.dumps_to_str(obj)}
        except NotImplementedError:
            # Falls back to element-wise serialization.
            # ``OrderedDict`` must be tested before ``dict``, since it is a subclass of it.
            if isinstance(obj, OrderedDict):
                serialized = self._serialize_ordered_dict(obj, obj_name)
            elif isinstance(obj, dict):
                serialized = self._serialize_dict(obj, obj_name)
            elif isinstance(obj, (list, tuple)):
                serialized = self._serialize_list_tuple(obj, obj_name)
            elif not isinstance(obj, type):
                serialized = self._serialize_class(obj, obj_name)
            else:
                raise NotImplementedError(f"Cannot pickle object of type {type(obj)}.")
        # The recursive calls above overwrite ``self.obj`` with nested elements.
        # Resets it, so that after the outermost call it refers to the object
        # passed in by the caller.
        self.obj = obj
        return serialized

    def loads(self, serialized_dict, obj_type=OBJECT_TYPE_ROOT):
        """Deserializes the output of the `dumps` method.

        Parameters
        ----------
        serialized_dict: `dict`
            The output of the `dumps` method.
        obj_type: `str` or `type`, default "ROOT"
            The type of the object to be deserialized: "ROOT", "OrderedDict",
            "dict", "list", "tuple" or a class object.

        Returns
        -------
        obj: any
            The deserialized object.

        Raises
        ------
        NotImplementedError: If the object cannot be deserialized.
        """
        if obj_type == OBJECT_TYPE_ROOT:
            return self._deserialize_root(serialized_dict, obj_type)
        if obj_type == OrderedDict.__name__:
            return self._deserialize_ordered_dict(serialized_dict, obj_type)
        if obj_type == dict.__name__:
            return self._deserialize_dict(serialized_dict, obj_type)
        if obj_type in ("list", "tuple"):
            return self._deserialize_list_tuple(serialized_dict, obj_type)
        if inspect.isclass(obj_type):
            return self._deserialize_class(serialized_dict, obj_type)
        raise NotImplementedError(f"Cannot unpickle object of type {obj_type}.")

    def _serialize_ordered_dict(self, obj, obj_name):
        """Serializes an ordered dictionary that can not be directly pickled by `dill`.

        Generates a dictionary with two keys.
            - {`obj_name`}.type stores the serialized type name.
            - {`obj_name`}.dir stores the keys and values, each serialized by `dumps`.
              Entry names carry the prefix "{key_order}_" to preserve the order, and the
              suffix "__key__" for keys or "__value__" for values.

        ```python
        {
            "{obj_name}.type": "serialized type name",
            "{obj_name}.dir": {
                "0_{key1}__key__": "serialized key1",
                "0_{key1}__value__": "serialized value corresponding to key1",
                "1_{key2}__key__": "serialized key2",
                "1_{key2}__value__": "serialized value corresponding to key2",
                ...
            }
        }
        ```

        Parameters
        ----------
        obj: `OrderedDict`
            The ordered dictionary to be serialized.
        obj_name: `str`
            The name of the object to be serialized.

        Returns
        -------
        serialized: `dict`
            The serialized ordered dictionary.
        """
        directory = {}
        for i, (key, value) in enumerate(obj.items()):
            name = f"{i}_{str(key)}"
            directory.update(self.dumps(obj=key, obj_name=f"{name}{KEY_SUFFIX}"))
            directory.update(self.dumps(obj=value, obj_name=f"{name}{VALUE_SUFFIX}"))

        return {
            f"{obj_name}{TYPE_KEY_EXTENSION}": self.dumps_to_str(type(obj).__name__),
            f"{obj_name}{DIRECTORY_KEY_EXTENSION}": directory
        }

    def _serialize_dict(self, obj, obj_name):
        """Serializes a dictionary that can not be directly pickled by `dill`.

        Generates a dictionary with two keys.
            - {`obj_name`}.type stores the serialized type name.
            - {`obj_name`}.dir stores the keys and values, each serialized by `dumps`.
              Keys are stored with the suffix "__key__" and values with the suffix
              "__value__". Keys are serialized too, because they can be complex
              objects that can not be directly serialized by `dill`.

        Entry names are built from ``str(key)``, so two distinct keys with the
        same string representation overwrite each other.

        ```python
        {
            "{obj_name}.type": "serialized type name",
            "{obj_name}.dir": {
                "{key1}__key__": "serialized key1",
                "{key1}__value__": "serialized value corresponding to key1",
                ...
            }
        }
        ```

        Parameters
        ----------
        obj: `dict`
            The dictionary to be serialized.
        obj_name: `str`
            The name of the object to be serialized.

        Returns
        -------
        serialized: `dict`
            The serialized dictionary.
        """
        directory = {}
        for key, value in obj.items():
            name = str(key)
            directory.update(self.dumps(obj=key, obj_name=f"{name}{KEY_SUFFIX}"))
            directory.update(self.dumps(obj=value, obj_name=f"{name}{VALUE_SUFFIX}"))

        return {
            f"{obj_name}{TYPE_KEY_EXTENSION}": self.dumps_to_str(type(obj).__name__),
            f"{obj_name}{DIRECTORY_KEY_EXTENSION}": directory
        }

    def _serialize_list_tuple(self, obj, obj_name):
        """Serializes a list or a tuple, preserving its order, when it can not be
        directly pickled by `dill`.

        Generates a dictionary with two keys.
            - {`obj_name`}.type stores the serialized type name.
            - {`obj_name`}.dir stores the elements, each serialized by `dumps`
              under the name "{position}__key__".

        ```python
        {
            "{obj_name}.type": "serialized type name",
            "{obj_name}.dir": {
                "0__key__": "serialized value1",
                "1__key__": "serialized value2",
                ...
            }
        }
        ```

        Parameters
        ----------
        obj: `list` or `tuple`
            The list or tuple to be serialized.
        obj_name: `str`
            The name of the object to be serialized.

        Returns
        -------
        serialized: `dict`
            The serialized list or tuple.
        """
        directory = {}
        for i, value in enumerate(obj):
            directory.update(self.dumps(obj=value, obj_name=f"{i}{KEY_SUFFIX}"))

        return {
            f"{obj_name}{TYPE_KEY_EXTENSION}": self.dumps_to_str(type(obj).__name__),
            f"{obj_name}{DIRECTORY_KEY_EXTENSION}": directory
        }

    def _serialize_class(self, obj, obj_name):
        """Serializes a class instance that can not be directly pickled by `dill`.

        Generates a dictionary with two keys.
            - {`obj_name`}.type stores the serialized class object.
            - {`obj_name`}.dir stores the entries of ``obj.__dict__``,
              each serialized by `dumps` under the attribute name.

        ```python
        {
            "{obj_name}.type": "serialized class",
            "{obj_name}.dir": {
                "{attribute1}": "serialized value of attribute1",
                "{attribute2}": "serialized value of attribute2",
                ...
            }
        }
        ```
        Unlike `dict` and `OrderedDict`, the attribute names do not need to be
        serialized, as these are simple strings.

        Parameters
        ----------
        obj: any
            The class instance to be serialized.
        obj_name: `str`
            The name of the object to be serialized.

        Returns
        -------
        serialized: `dict`
            The serialized class instance.
        """
        directory = {}
        for key, value in obj.__dict__.items():
            directory.update(self.dumps(obj=value, obj_name=key))

        return {
            f"{obj_name}{TYPE_KEY_EXTENSION}": self.dumps_to_str(obj.__class__),
            f"{obj_name}{DIRECTORY_KEY_EXTENSION}": directory
        }

    def _deserialize_root(self, serialized_dict, obj_type):
        """Deserializes the root object.
        This is the very top level of the nested `serialized_dict`.
        Thus, it either contains a single .pkl entry or a single .type + .dir pair.

        Parameters
        ----------
        serialized_dict: `dict`
            The serialized dictionary.
        obj_type: `str`
            The type of the object to be deserialized. Must be "ROOT".

        Returns
        -------
        obj: any
            The deserialized object.

        Raises
        ------
        ValueError: If ``obj_type`` is not "ROOT" or the top level has multiple elements.
        KeyError: If the top level has neither a "ROOT.pkl" nor a "ROOT.dir" entry.
        """
        if obj_type != OBJECT_TYPE_ROOT:
            raise ValueError(f"The obj_type must be {OBJECT_TYPE_ROOT}.")

        pickles, directories, obj_types = self._get_keys_from_serialized_dict(serialized_dict)
        if len(pickles) > 1 or len(directories) > 1:
            raise ValueError("Multiple elements found in the top level.")

        root_pickle = f"{OBJECT_TYPE_ROOT}{PICKLE_KEY_EXTENSION}"
        if root_pickle in pickles:
            # The only 1 .pkl entry case.
            return self.loads_from_str(serialized_dict[root_pickle])
        # The .type + .dir case.
        return self.loads(
            serialized_dict[f"{OBJECT_TYPE_ROOT}{DIRECTORY_KEY_EXTENSION}"],
            obj_types[OBJECT_TYPE_ROOT])

    def _deserialize_ordered_dict(self, serialized_dict, obj_type):
        """Deserializes an ordered dictionary.

        Parameters
        ----------
        serialized_dict: `dict`
            The serialized dictionary.
        obj_type: `str`
            The type of the object to be deserialized. Must be "OrderedDict".

        Returns
        -------
        obj: `OrderedDict`
            The deserialized ordered dictionary.
        """
        if obj_type != OrderedDict.__name__:
            raise ValueError("The obj_type must be OrderedDict.")
        return OrderedDict(self._load_key_value_pairs(serialized_dict, ordered=True))

    def _deserialize_dict(self, serialized_dict, obj_type):
        """Deserializes a dictionary.

        Parameters
        ----------
        serialized_dict: `dict`
            The serialized dictionary.
        obj_type: `str`
            The type of the object to be deserialized. Must be "dict".

        Returns
        -------
        obj: `dict`
            The deserialized dictionary.
        """
        if obj_type != dict.__name__:
            raise ValueError("The obj_type must be dict.")
        return dict(self._load_key_value_pairs(serialized_dict, ordered=False))

    def _deserialize_list_tuple(self, serialized_dict, obj_type):
        """Deserializes a list or a tuple.

        Parameters
        ----------
        serialized_dict: `dict`
            The serialized dictionary.
        obj_type: `str`
            The type of the object to be deserialized. A tuple is returned
            for "tuple", a list otherwise.

        Returns
        -------
        obj: `list` or `tuple`
            The deserialized list or tuple.
        """
        pickles, directories, obj_types = self._get_keys_from_serialized_dict(serialized_dict)
        # Order index is a number appended to the front.
        elements = sorted(pickles + directories, key=lambda x: int(x.split("_")[0]))
        result = [self._load_element(serialized_dict, element, obj_types) for element in elements]
        return tuple(result) if obj_type == "tuple" else result

    def _deserialize_class(self, serialized_dict, obj_type):
        """Deserializes a class instance.

        The instance is created by calling ``obj_type`` with the stored attributes
        whose names match the parameters of ``obj_type.__init__`` (with or without a
        leading "_"). All stored attributes are then set on the instance.

        Parameters
        ----------
        serialized_dict: `dict`
            The serialized dictionary.
        obj_type: `type`
            The class of the object to be deserialized.

        Returns
        -------
        obj: any
            The deserialized class instance.
        """
        pickles, directories, obj_types = self._get_keys_from_serialized_dict(serialized_dict)
        # Some classes have required args during initialization,
        # these args are pulled from attributes.
        init_params = list(inspect.signature(obj_type.__init__).parameters)
        # Gets the attribute names and their values in a dictionary.
        values = {
            self._strip_extension(element): self._load_element(serialized_dict, element, obj_types)
            for element in pickles + directories
        }
        # Gets the init args from values.
        init_dict = {key: value for key, value in values.items() if key in init_params}
        # Some attributes have a "_" at the beginning.
        init_dict.update({
            key[1:]: value for key, value in values.items()
            if key.startswith("_") and key[1:] in init_params})
        # ``design_info`` does not have column_names attribute,
        # which is required during init.
        # The column_names param is pulled from the column_name_indexes attribute.
        # This can be omitted once we allow dumping @property attributes.
        if "column_names" in init_params:
            init_dict["column_names"] = values["column_name_indexes"].keys()
        # Creates the instance and sets the attributes.
        result = obj_type(**init_dict)
        for key, value in values.items():
            setattr(result, key, value)

        return result

    @staticmethod
    def dumps_to_str(obj):
        """Returns a serialized string representation of the `obj`.
        The `obj` must be serializable by `dill`.

        The output of `dill` is a bytes object, which can not be stored in a json file.
        This method encodes the bytes object to a base64 string, which can be stored
        in a json file.

        Parameters
        ----------
        obj: any
            The object to be serialized. Must be serializable by `dill`.

        Returns
        -------
        serialized_string: `str`
            A serialized string representation of the object.
        """
        return codecs.encode(dill.dumps(obj), "base64").decode()

    @staticmethod
    def loads_from_str(serialized_string):
        """Returns a deserialized object from a `serialized_string`, usually the
        output of `dumps_to_str`.

        Parameters
        ----------
        serialized_string: `str`
            The serialized string.

        Returns
        -------
        obj: any
            The deserialized object.
        """
        # NOTE: unpickling can execute arbitrary code.
        # Only use this on input from a trusted source.
        return dill.loads(codecs.decode(serialized_string.encode(), "base64"))

    @staticmethod
    def _strip_extension(key):
        """Returns ``key`` without its trailing ".pkl", ".dir" or ".type" extension."""
        for extension in (PICKLE_KEY_EXTENSION, DIRECTORY_KEY_EXTENSION, TYPE_KEY_EXTENSION):
            if key.endswith(extension):
                return key[:-len(extension)]
        return key

    def _load_element(self, serialized_dict, element, obj_types):
        """Deserializes the entry ``element`` (a ".pkl" or ".dir" key) of ``serialized_dict``."""
        if element.endswith(PICKLE_KEY_EXTENSION):
            return self.loads_from_str(serialized_dict[element])
        return self.loads(
            serialized_dict[element],
            obj_types[self._strip_extension(element)])

    def _load_key_value_pairs(self, serialized_dict, ordered):
        """Returns the deserialized (key, value) pairs of a serialized `dict` or `OrderedDict`.

        If ``ordered`` is True, the pairs are sorted by the order index at the front
        of the entry names.
        """
        pickles, directories, obj_types = self._get_keys_from_serialized_dict(serialized_dict)
        elements = pickles + directories
        if ordered:
            # Order index is a number appended to the front.
            elements = sorted(elements, key=lambda x: int(x.split("_")[0]))

        pairs = []
        for element in elements:
            name = self._strip_extension(element)
            # Only the trailing suffix identifies a key entry; the key's own
            # string representation may contain any text.
            if not name.endswith(KEY_SUFFIX):
                continue
            key = self._load_element(serialized_dict, element, obj_types)

            # Searches for the value corresponding to the key.
            # The value could be stored either with .pkl or as a directory.
            value_name = f"{name[:-len(KEY_SUFFIX)]}{VALUE_SUFFIX}"
            if f"{value_name}{PICKLE_KEY_EXTENSION}" in serialized_dict:
                value_element = f"{value_name}{PICKLE_KEY_EXTENSION}"
            elif f"{value_name}{DIRECTORY_KEY_EXTENSION}" in serialized_dict:
                value_element = f"{value_name}{DIRECTORY_KEY_EXTENSION}"
            else:
                raise ValueError(f"Value not found for key {key}.")
            pairs.append((key, self._load_element(serialized_dict, value_element, obj_types)))

        return pairs

    def _get_keys_from_serialized_dict(self, serialized_dict):
        """Returns the keys from a `serialized_dict`, the output of `dumps`.

        Keys are classified by their trailing extension only, so that names
        containing "." or extension-like text are handled correctly.

        Parameters
        ----------
        serialized_dict: `dict`
            The serialized dictionary.

        Returns
        -------
        keys: `tuple`
            A tuple of (pickles, directories, obj_types).
                - pickles: A list of keys that end with ".pkl".
                - directories: A list of keys that end with ".dir".
                - obj_types: A dictionary of {name: obj_type} for keys that end with
                  ".type", where name is the key without the extension.

        Raises
        ------
        ValueError: If a ".dir" key has no matching ".type" key.
        """
        pickles = [key for key in serialized_dict if key.endswith(PICKLE_KEY_EXTENSION)]
        directories = [key for key in serialized_dict if key.endswith(DIRECTORY_KEY_EXTENSION)]
        obj_types = {
            self._strip_extension(key): self.loads_from_str(value)
            for key, value in serialized_dict.items()
            if key.endswith(TYPE_KEY_EXTENSION)
        }
        # There should be one type key for every directory key.
        if not all(self._strip_extension(directory) in obj_types for directory in directories):
            raise ValueError("type and directories do not match.")

        return pickles, directories, obj_types
def generate_anomaly_data(
        freq,
        periods,
        anomaly_block_list,
        noise_std=10.0,
        intercept=500,
        delta_range_lower=0,
        delta_range_upper=0.2):
    """Generates dataset for anomaly detection unit tests.

    Parameters
    ----------
    freq: `str`
        Frequency of the dataset.
    periods: `int`
        Number of periods to generate.
    anomaly_block_list: `List`[`List`[`int`]]
        List of blocks of indices to insert anomalies in.
    noise_std: `float` or None, default 10
        Standard deviation of gaussian noise.
    intercept: `float` or None, default 500
        Intercept of the data generation model.
    delta_range_lower: `float` or None, default 0
        Lower boundary of the interval to choose delta from.
    delta_range_upper: `float` or None, default 0.2
        Upper boundary of the interval to choose delta from.

    Returns
    -------
    data: `dict`
        A dictionary with two keys.
            - "df": `pd.DataFrame`
                Dataset containing anomalies. The contaminated values are
                stored in the value column.
            - "anomaly_df": `pd.DataFrame`
                Dataframe with anomaly information.
    """
    df = generate_df_for_tests(
        freq=freq,
        periods=periods,
        noise_std=noise_std,
        intercept=intercept,
        train_start_date="2020-01-01",
        train_frac=0.99,
        seed=123
    )["df"]
    # Introduces anomalies
    df = contaminate_df_with_anomalies(
        df,
        anomaly_block_list=anomaly_block_list,
        delta_range_lower=delta_range_lower,
        delta_range_upper=delta_range_upper,
        value_col=VALUE_COL,
        min_admissible_value=None,
        max_admissible_value=None
    )
    anomaly_df = get_anomaly_df(df=df, anomaly_col=ANOMALY_COL)
    # Replaces the original values with the contaminated ones.
    df = df.drop(columns=[VALUE_COL, ANOMALY_COL]).rename(
        columns={"contaminated_y": VALUE_COL}
    )

    return {
        "df": df,
        "anomaly_df": anomaly_df
    }


def generate_anomaly_data_daily():
    """Generates daily data to be used for end-to-end AD testing."""
    anomaly_block_list = [
        np.arange(100, 105),
        np.arange(200, 210),
        np.arange(310, 315)
    ]
    return generate_anomaly_data(
        freq="D",
        periods=30*14,
        anomaly_block_list=anomaly_block_list
    )


def generate_anomaly_data_hourly():
    """Generates hourly data to be used for end-to-end AD testing.

    In the returned ``"df"``, rows 150 to 159 of the value column are missing,
    the time column is formatted as "%Y-%m-%d-%H" strings and the
    columns are renamed to "time" and "value".
    """
    anomaly_block_list = [
        np.arange(1000, 1050),
        np.arange(5000, 5100),
        np.arange(8000, 8050)
    ]
    res = generate_anomaly_data(
        freq="H",
        periods=24*400,
        anomaly_block_list=anomaly_block_list
    )
    df = res["df"]
    # Introduces missing data.
    # A single positional assignment is used instead of chained indexing
    # (``df[col].iloc[...] = ...``), which is not guaranteed to modify ``df``.
    df.iloc[150:160, df.columns.get_loc(VALUE_COL)] = np.nan
    # Changes the datetype format of time column
    df[TIME_COL] = df[TIME_COL].dt.strftime("%Y-%m-%d-%H")
    # Renames `df` columns
    res["df"] = df.rename(
        columns={
            TIME_COL: "time",
            VALUE_COL: "value"
        }
    )

    return res


def generate_anomaly_data_weekly():
    """Generates weekly data to be used for end-to-end AD testing."""
    anomaly_block_list = [
        np.arange(50, 55),
        np.arange(110, 120)
    ]
    return generate_anomaly_data(
        freq="W-MON",
        periods=200,
        anomaly_block_list=anomaly_block_list
    )


def sim_anomalous_data_and_forecasts(
        sample_size,
        anomaly_num,
        anomaly_magnitude=20,
        seed=None):
    """This function constructs normal data and injects anomalies into the data
    (returned in ``"df"`` item of the result).
    It also creates two forecasts (given in ``"forecast_dfs"``) for the same
    data of varying quality in terms of accuracy.
    The first forecast is constructed to be more accurate.
    The function also returns train and test versions of these data which are
    constructed to be the first and second half of the generated data respectively.

    Note that this function seeds the global ``numpy`` random state with ``seed``.

    Parameters
    ----------
    sample_size : `int`
        The total sample size of the data generated.
    anomaly_num : `int`
        The number of anomalies injected into the data.
    anomaly_magnitude : `float`, default 20
        The magnitude of the anomalies injected, which is the mean of the normal
        distribution used to add the anomalies.
    seed : `int` or None, default None
        The seed passed to ``np.random.seed``.

    Returns
    -------
    result : `dict`
        A dictionary consisting of these items

        - ``"df"`` : `pandas.DataFrame`
            A dataframe which includes a time series with columns
                - "ts" : to denote time, ranging from 0 to ``sample_size - 1``
                - "y" : the value of the series
                - "is_anomaly" : boolean to denote if the point is an anomaly
        - ``"forecast_dfs"`` : `dict` [`int`, `pandas.DataFrame`]
            A dictionary with keys 0 and 1, whose values are dataframes (with
            columns "ts" and "y_pred") which are to be considered to come from
            a predictive model.
            These are constructed simply by adding noise to the normal data.
            The forecast with key 0 is more accurate by construction (less noisy).
        - ``"df_train"`` : `pandas.DataFrame`
            First half of ``"df"`` defined above, which is usually to be used in
            training step.
        - ``"forecast_dfs_train"`` : `dict` [`int`, `pandas.DataFrame`]
            First half of the forecast data given in ``"forecast_dfs"`` defined
            above.
        - ``"df_test"`` : `pandas.DataFrame`
            Second half of ``"df"`` defined above, which is usually used in testing
            step.
        - ``"forecast_dfs_test"`` : `dict` [`int`, `pandas.DataFrame`]
            Second half of the forecast data given in ``"forecast_dfs"`` defined
            above.
        - ``"fig"``: `plotly.graph_objects.Figure`
            A plotly interactive figure, to compare the observed data with the two
            predictions constructed in this function.

    """
    np.random.seed(seed=seed)
    y = np.arange(0, sample_size, dtype=float)
    is_anomaly = [False] * sample_size
    # Forecasts are noisy versions of the data before anomalies are injected.
    # The order of the random draws below is kept fixed for reproducibility.
    y_pred0 = y + np.random.normal(
        loc=0.0,
        scale=1.0,
        size=sample_size)

    y_pred1 = y + np.random.normal(
        loc=0.0,
        scale=10.0,
        size=sample_size)

    anomaly_idx = np.random.choice(
        np.arange(0, sample_size, dtype=int),
        size=anomaly_num,
        replace=False)

    anomaly_idx.sort()

    for idx in anomaly_idx:
        # Randomly introduces positive or negative anomalies
        p = np.random.uniform(low=0.0, high=1.0)
        shift = np.random.normal(
            loc=anomaly_magnitude,
            scale=1.0)
        y[idx] += shift if p > 0.5 else -shift
        is_anomaly[idx] = True

    ts = range(sample_size)
    df = pd.DataFrame({
        "ts": ts,
        "y": y,
        "is_anomaly": is_anomaly})

    forecast_dfs = {
        0: pd.DataFrame({
            "ts": ts,
            "y_pred": y_pred0}),
        1: pd.DataFrame({
            "ts": ts,
            "y_pred": y_pred1})
    }

    df_all = pd.DataFrame({
        "ts": ts,
        "y": y,
        "y_pred0": y_pred0,
        "y_pred1": y_pred1})

    fig = plot_lines_markers(
        df=df_all,
        x_col="ts",
        line_cols=["y", "y_pred0", "y_pred1"])

    # First half is used for training, second half for testing.
    train_size = int(sample_size / 2)
    df_train = df[:train_size]
    forecast_dfs_train = {
        k: forecast_df[:train_size] for k, forecast_df in forecast_dfs.items()}

    df_test = df[train_size:]
    forecast_dfs_test = {
        k: forecast_df[train_size:] for k, forecast_df in forecast_dfs.items()}

    return {
        "df": df,
        "forecast_dfs": forecast_dfs,
        "df_train": df_train,
        "forecast_dfs_train": forecast_dfs_train,
        "df_test": df_test,
        "forecast_dfs_test": forecast_dfs_test,
        "fig": fig}
def partial_return(func, k):
    """For a given function ``func`` which returns multiple outputs accessible by
    ``[]`` e.g. python list or dictionary, it constructs a new function which only
    returns the part of the output given in position ``k``, where ``k`` can be
    a key or other index.

    Parameters
    ----------
    func : callable
        A function which returns multiple outputs accessible by ``[]``.
    k : `int` or `str`
        A key or index which is implemented by ``[]`` on output of the
        input function ``func``.

    Returns
    -------
    result : callable
        A function which only returns part of what the input ``func`` returns given
        in position ``[k]``. In the case that the key does not exist or index is
        out of bound (in either direction), it returns None.
    """
    def func_k(*args, **kwargs):
        res = func(*args, **kwargs)
        # If result is a dictionary (or a subclass, e.g. `OrderedDict`),
        # the value for key `k` is returned if the key exists, None otherwise.
        if isinstance(res, dict):
            return res[k] if k in res else None
        # This is for the non-dict case eg `list`, `pandas.Series`, `np.array`.
        # In this case `k` must be a (non-boolean) integer within the valid
        # index range, which includes negative indices down to ``-len(res)``.
        if (isinstance(k, int)
                and not isinstance(k, bool)
                and -len(res) <= k < len(res)):
            return res[k]
        # Otherwise it returns None.
        return None

    return func_k


def vertical_concat_dfs(
        df_list,
        join_cols,
        common_value_cols=None,
        different_value_cols=None):
    """For a given list of dataframes with the same columns
    in ``df_list``, it combines them into one dataframe by merging them
    on ``join_cols``.

    For ``common_value_cols`` it only extracts the columns from the first
    dataframe.

    For ``different_value_cols`` it will extract them for each
    df and place them side by side. The new column names will
    have an added index based on their order in ``df_list``.

    The input dataframes are not modified.

    Parameters
    ----------
    df_list : `list` [`pandas.DataFrame`]
        A non-empty list of dataframes which are to be combined.
    join_cols : `list` [`str`]
        The list of columns which are to be used for joining.
    common_value_cols : `list` [`str`] or None, default None
        The list of column names for which we assume the values are the
        same across dataframes in ``df_list``. For these columns only data
        is pulled from the first dataframe appearing in ``df_list``.
        None is treated as an empty list.
    different_value_cols : `list` [`str`] or None, default None
        The list of columns which are assumed to have potentially different
        values across dataframes. Column ``col`` of the dataframe in
        position ``i`` is returned as ``f"{col}{i}"``.
        None is treated as an empty list.

    Returns
    -------
    result : `pd.DataFrame`
        The resulting merged dataframe.
    """
    # None defaults avoid sharing one mutable list across calls.
    common_value_cols = [] if common_value_cols is None else common_value_cols
    different_value_cols = [] if different_value_cols is None else different_value_cols

    new_df_list = []
    for i, df in enumerate(df_list):
        df = df.copy()
        # only keeps the `common_value_cols` from the first df
        if i != 0:
            for col in common_value_cols:
                del df[col]

        # Moves each column to the end under its indexed name.
        for col in different_value_cols:
            df[f"{col}{i}"] = df.pop(col)
        new_df_list.append(df)

    return reduce(
        lambda left, right: pd.merge(left, right, on=join_cols),
        new_df_list)


def add_new_params_to_records(
        new_params,
        records=None):
    """For a list of records (each being a `dict`) and a set of parameters
    each having a list of potential values, it expands each record in all possible
    ways based on all possible values for each param in ``new_params``.
    Then it returns all possible augmented records in a list.

    For example ``new_params={"a": [1, 2]}`` and ``records=[{"x": 0}]``
    results in ``[{"x": 0, "a": 1}, {"x": 0, "a": 2}]``.

    Parameters
    ----------
    new_params : `dict` {`str`: `list`}
        A dictionary with keys representing (new) variables and values for each
        key being the possible values for that variable.
    records : `list` [`dict`] or None, default None
        List of existing records which are to be augmented with all possible
        combinations of the new variables. If None, it is assigned to ``[{}]``
        which means we start from an empty record.
        Neither the list nor the records in it are modified.

    Returns
    -------
    expanded_records : `list` [`dict`]
        The resulting list of augmented records.
    """
    expanded_records = [{}] if records is None else list(records)
    for name, values in new_params.items():
        # A new `dict` is built for every (record, value) pair,
        # therefore the existing records are never over-written.
        expanded_records = [
            {**record, name: value}
            for record in expanded_records
            for value in values]

    return expanded_records
def get_canonical_anomaly_df(
        anomaly_df,
        freq,
        start_time_col=START_TIME_COL,
        end_time_col=END_TIME_COL):
    """Validates and merges overlapping anomaly periods in anomaly dataframe.
    Also standardizes column names.

    For example, consider the following input ``anomaly_df``:
        start_time      end_time
        "2020-01-01"    "2020-01-02"
        "2020-01-03"    "2020-01-05"

    For a daily dataset i.e. ``freq = "D"``, the end time "2020-01-02" and start time
    "2020-01-03" are consecutive. Hence, in the output the ``anomaly_df`` is converted to
        start_time      end_time
        "2020-01-01"    "2020-01-05"

    However, for an hourly dataset i.e. ``freq = "H"`` the end time "2020-01-02" and start time
    "2020-01-03" are not consecutive. Hence, the output is the same as the input ``anomaly_df``.

    Parameters
    ----------
    anomaly_df : `pandas.DataFrame`
        The dataframe that contains anomaly info.
        It should at least have
            - the anomaly start column ``start_time_col``
            - the anomaly end column ``end_time_col``
        Both are assumed to be inclusive of the start and end times.
        The input is not modified.
    freq : `str`
        Frequency of the timeseries represented in ``anomaly_df``.
        This is used to determine if the timestamps in the ``anomaly_df`` are consecutive.
    start_time_col : `str`
        The column name containing anomaly start timestamps in ``anomaly_df``.
        Defaults to `~greykite.common.constants.START_TIME_COL`.
    end_time_col : `str`
        The column name containing anomaly end timestamps in ``anomaly_df``.
        Defaults to `~greykite.common.constants.END_TIME_COL`.

    Returns
    -------
    anomaly_df : `pandas.DataFrame`
        Standardized anomaly dataframe.
        It has
            - the anomaly start column
              `~greykite.common.constants.START_TIME_COL`.
            - the anomaly end column
              `~greykite.common.constants.END_TIME_COL`.
        Both are inclusive.
        The anomaly periods are non-overlapping and sorted from earliest to latest.

    Raises
    ------
    ValueError
        If any anomaly period starts after it ends.
    """
    df = anomaly_df.copy()
    df[start_time_col] = pd.to_datetime(df[start_time_col])
    df[end_time_col] = pd.to_datetime(df[end_time_col])
    df = df.sort_values(by=[start_time_col]).reset_index(drop=True)

    # Validates every period before merging. Validating inside the merge loop
    # would skip the last row and could miss an invalid period whose values
    # were already over-written by a merge with the previous row.
    invalid_rows = df[df[start_time_col] > df[end_time_col]]
    if not invalid_rows.empty:
        start_time = invalid_rows[start_time_col].iloc[0]
        end_time = invalid_rows[end_time_col].iloc[0]
        raise ValueError(f"Anomaly 'start_time' ({start_time}) is after the anomaly 'end_time' ({end_time}).")

    # Merges anomalies by propagating the merged period forward:
    # when rows `row` and `row+1` are to be merged, row `row+1` takes the start
    # time of row `row` and the larger of the two end times.
    # `df.at` is used (rather than chained ``df[col][row] = ...``) so that the
    # assignment is guaranteed to modify `df`.
    for row in range(df.shape[0] - 1):
        start_time = df.at[row, start_time_col]
        end_time = df.at[row, end_time_col]
        next_start_time = df.at[row + 1, start_time_col]
        next_end_time = df.at[row + 1, end_time_col]
        num_periods = (next_start_time.to_period(freq=freq) - end_time.to_period(freq=freq)).n
        # Start times and end times are inclusive for anomaly df. Hence, the anomaly periods should
        # be merged if the number of periods between the anomalies is at most 1.
        # e.g. The anomaly periods ["2020-01-01", "2020-01-02"] and ["2020-01-03", "2020-01-04"]
        # should be merged into a single anomaly period ["2020-01-01", "2020-01-04"].
        if num_periods <= 1:
            df.at[row + 1, start_time_col] = start_time
            if next_end_time < end_time:
                df.at[row + 1, end_time_col] = end_time

    # After merging, all rows of a merged group share the same start time
    # and the last one holds the full period. Only that row is kept.
    df = df.drop_duplicates(
        subset=[start_time_col],
        keep="last"
    ).rename({
        start_time_col: START_TIME_COL,
        end_time_col: END_TIME_COL
    }, axis=1).reset_index(drop=True)
    return df
+ + To understand the reasoning behind this choice, it is helpful to think + about ``objective_col`` as precision and ``constraint_col`` as recall. Thus, + the optimization problem becomes: + maximize precision subject to recall >= target_recall. + + The algorithm proceeds as follows: + 1. Find rows which satisfy ``df``[``constraint_col``] >= ``constraint_value``. + 1.1. If such rows exist, find rows that have highest ``df``[``objective_col``]. + 1.1.1. Find rows that have highest ``df``[``constraint_col``]. + 1.1.2. Among these, find row that maximize ``df``[``objective_col``]. + This solves for multiple ties, if any. + 1.2. If no such rows exist, + 1.2.1. Find rows that maximizes ``df``[``constraint_col``]. + 1.2.2. Among these, find row that maximize ``df``[``objective_col``]. + This solves for multiple ties, if any. + 2. Return corresponding ``df`` row. + + Parameters + ---------- + df : `pandas.DataFrame` + A data frame which includes minimally + - the objective column (``objective_col``) + - the constraint column (``constraint_col``). + objective_col : `str` + The column name of the variable to be optimized. + constraint_col : `str` + The column name of the constraint variable. + constraint_value : `float` + The value of the constraint. + + Returns + ------- + optimal_dict : `dict` + The row of the ``df`` which is the optimal of the + corresponding optimization problem.. 
def validate_volatility_features(
        volatility_features_list,
        valid_features=None):
    """Removes duplicate values from ``volatility_features_list``
    and validates the features against ``valid_features``.

    Parameters
    ----------
    volatility_features_list: `list` [`list` [`str`]]
        Lists of volatility features used to optimize anomaly detection performance.
        Valid volatility feature column names are
        either columns of ``df`` or belong to
        `~greykite.common.constants.TimeFeaturesEnum`.

    valid_features: `list` [`str`] or None
        ``volatility_features_list`` is validated against this list.
        If None, no validation is performed.

    Returns
    -------
    validated_features_list: `list` [`list` [`str`]]
        List of validated volatility features. The first occurrence of each
        feature within a set, and of each feature set, is kept in its
        original order.

    Raises
    ------
    ValueError
        If ``valid_features`` is given and any feature is not in it.
    """
    # Removes duplicates
    validated_features_list = []
    for features in volatility_features_list:
        # Removes duplicates within a set of features (order preserving)
        features = list(dict.fromkeys(features))
        # Removes duplicates among the feature sets
        if features not in validated_features_list:
            validated_features_list.append(features)

    # Checks features against the provided features in ``valid_features``
    if valid_features is not None:
        unique_features = {
            feature
            for features in validated_features_list
            for feature in features}
        missing_features = unique_features - set(valid_features)
        if missing_features:
            raise ValueError(f"Unknown feature(s) ({missing_features}) in `volatility_features_list`. "
                             f"Valid features are: [{valid_features}].")

    return validated_features_list


def get_timestamp_ceil(ts, freq):
    """Returns the smallest timestamp that is greater than or equal to `ts`
    and is also a multiple of the `freq`.
    Assume hourly frequency i.e. `freq` = "H". Then
        If `ts` = 1:30, this function returns 2:00.
        If `ts` = 1:00, this function returns 1:00.

    For non-fixed frequencies, e.g. weekly ("W-MON"), the date of the end of
    the period containing `ts` is returned at midnight. Hence, if `ts` has a
    time-of-day component and falls on the last day of its period, the
    result can be earlier than `ts`.

    Parameters
    ----------
    ts: `str`
        Timestamp in `str` format.
    freq: `str`
        Pandas timeseries frequency string.

    Returns
    -------
    dt_ceil: `pd.Timestamp`
        The ceiling of `ts` with respect to `freq`, as described above.
    """
    dt = pd.to_datetime(ts)
    try:
        return dt.ceil(freq=freq)
    # `pd.Timestamp.ceil` raises a ValueError when `freq` is a non-fixed frequency
    # e.g. weekly ("W-MON"), business day ("B") or monthly ("M")
    except ValueError:
        # End of the period is its last nanosecond, `normalize` resets it to midnight.
        return dt.to_period(freq).to_timestamp(how="E").normalize()
def get_anomaly_df_from_outliers(
        df,
        time_col,
        value_col,
        freq,
        z_score_cutoff=Z_SCORE_CUTOFF,
        trim_percent=1.0):
    """This function identifies extreme values as outliers based on z-scores.
    A normal distribution will be fit on ``value_col`` of input df,
    and the time points with corresponding values that satisfy abs(z-scores) >
    ``z_score_cutoff`` will be considered as outliers.
    The function will then construct ``anomaly_df`` based on identified outliers.
    If trimming is specified via ``trim_percent`` to be non-zero,
    data is trimmed in symmetric fashion (removes high and low values) before
    calculating mean and variance of the standard normal.
    This is done to deal with large outliers.

    The input ``df`` is not modified.

    Parameters
    ----------
    df : `pandas.DataFrame`
        The dataframe with ``time_col`` and ``value_col``.
    time_col : `str`
        The column name of timestamps in ``df``.
    value_col : `str`
        The column name of values in ``df`` on which z-scores will be calculated to identify outliers.
    freq : `str`
        Pandas timeseries frequency string, e.g. "H", "D", etc.
        See https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases.
    z_score_cutoff : `float`, default ``Z_SCORE_CUTOFF``
        Z score cutoff for outlier detection.
    trim_percent : `float`, default 1.0
        Trimming percent for calculating the variance.
        The function first removes this amount of data in symmetric fashion from
        both ends and then it calculates the mean and the variance.

    Returns
    -------
    anomaly_df : `pandas.DataFrame` or None
        The dataframe that contains anomaly info based on identified outliers.
        It has

            - the anomaly start column "start_time"
              `~greykite.common.constants.START_TIME_COL`.
            - the anomaly end column "end_time"
              `~greykite.common.constants.END_TIME_COL`.

        Both are inclusive.
        If no outliers are found, the dataframe has these columns and no rows.
        If the input ``df`` is empty, None is returned.

    Raises
    ------
    ValueError
        If ``time_col`` or ``value_col`` is not a column of ``df``.
    """
    if df.empty:
        return None

    if time_col not in df.columns:
        raise ValueError(f"`df` does not have `time_col` with name {time_col}.")

    if value_col not in df.columns:
        raise ValueError(f"`df` does not have `value_col` with name {value_col}.")

    # Parsed timestamps are kept in a separate series so that the caller's
    # `df` is not altered.
    timestamps = pd.to_datetime(df[time_col])

    # Calculates z-scores after trimming (if trimming is not 0)
    # and identifies points with abs(z-score) > `z_score_cutoff` as outliers.
    detect_outlier = ZScoreOutlierDetector(
        diff_method=None,
        trim_percent=trim_percent,
        z_score_cutoff=z_score_cutoff)

    detect_outlier.fit(df[value_col])
    cond_outlier = detect_outlier.fitted.is_outlier

    # Extracts time points when outliers occur.
    outlier_points = timestamps.loc[cond_outlier].reset_index(drop=True)

    anomaly_df = pd.DataFrame(columns=[START_TIME_COL, END_TIME_COL])
    if len(outlier_points) > 0:
        # Each outlier is a one-point anomaly period, consecutive
        # periods are merged by `get_canonical_anomaly_df`.
        anomaly_df[START_TIME_COL] = outlier_points
        anomaly_df[END_TIME_COL] = anomaly_df[START_TIME_COL]
        anomaly_df = get_canonical_anomaly_df(
            anomaly_df=anomaly_df,
            freq=freq)
    return anomaly_df
# The parameter space to iterate over for APE threshold (absolute percent error)
APE_PARAM_ITERABLE = [{"ape_thresh": x} for x in np.arange(0, 4, 0.05)]


class APEDetector(BestForecastDetector):
    """This class implements APE (absolute percent error) based detector.
    The class finds the

        - best forecast among multiple forecasts which can be
          passed as baselines
        - as well as optimal APE threshold to use to denote an anomaly


    This class inherits its parameters and attributes from

        `~greykite.detection.detector.best_forecast.BestForecastDetector`

    and all methods apply here as well. The only difference is: ``param_iterable``
    is not passed but constructed during the ``__init__``.


    Parameters
    ----------
    Solely inherited from
    `~greykite.detection.detector.best_forecast.BestForecastDetector`
    except for the input parameter `param_iterable` which is constructed in the
    ``__init__``.

    Attributes
    ----------
    Solely inherited from
    `~greykite.detection.detector.best_forecast.BestForecastDetector`

    """

    def __init__(
            self,
            value_cols,
            pred_cols,
            is_anomaly_col=None,
            join_cols=None,
            reward=None,
            anomaly_percent_dict=None):
        super().__init__(
            value_cols=value_cols,
            pred_cols=pred_cols,
            is_anomaly_col=is_anomaly_col,
            join_cols=join_cols,
            reward=reward,
            anomaly_percent_dict=anomaly_percent_dict)

        # The threshold grid is fixed for this detector.
        self.param_iterable = APE_PARAM_ITERABLE

    def add_features_one_df(
            self,
            joined_df):
        """Adds features to one joined dataframe. This will be used to add
        features to all joined dataframes.

        The feature added is the column ``"ape"``: the Euclidean norm of the
        difference between the observed values (``self.value_cols``) and the
        predictions (``self.pred_cols``) divided by the Euclidean norm of the
        observed values, calculated for each row.

        Parameters
        ----------
        joined_df : `pandas.DataFrame`
            An input dataframe. It is altered in place.

        Returns
        -------
        joined_df : `pandas.DataFrame`
            The input dataframe, with an extra column of APE values.

        Raises
        ------
        ValueError
            If any of ``self.value_cols`` or ``self.pred_cols`` is not a
            column of ``joined_df``.
        """

        for col in (self.value_cols + self.pred_cols):
            if col not in joined_df.columns:
                raise ValueError(
                    f"{col} was not found in joined data with columns: "
                    f"{joined_df.columns}")

        # Multivariate APE using Euclidean distance
        # NOTE(review): rows whose observed values are all zero lead to a
        # division by zero (inf / nan APE) -- confirm this is acceptable.
        joined_df["ape"] = (
            np.linalg.norm(
                joined_df[self.value_cols].values - joined_df[self.pred_cols].values,
                axis=1) /
            np.linalg.norm(
                joined_df[self.value_cols].values,
                axis=1))

        return joined_df

    def calc_with_param(self, param, data):
        """It assigns anomaly label to any point which has a larger APE
        than the threshold.

        Parameters
        ----------
        param : `dict`
            The parameter to use, with keys

                - ``"forecast_id"`` : the key of the joined dataframe
                  (in ``data.joined_dfs``) to use
                - ``"ape_thresh"`` : the APE threshold above which a point
                  is labeled as an anomaly

        data : See class docstring
            It must have ``joined_dfs`` with the ``"ape"`` column available.
            Its ``y_pred`` and ``pred_df`` attributes are updated.

        Returns
        -------
        result : `~greykite.detection.detector.optimizer.CalcResult`
            The result which includes the updated ``data``. The data frame
            ``data.pred_df`` has the predictions added in a new column:
            ``PREDICTED_ANOMALY_COL``.
        """
        pred_df = data.joined_dfs[param["forecast_id"]]
        y_pred = (pred_df["ape"] > param["ape_thresh"])
        pred_df[PREDICTED_ANOMALY_COL] = y_pred
        data.y_pred = y_pred
        data.pred_df = pred_df
        return CalcResult(data=data)
class BestForecastDetector(ForecastBasedDetector):
    """This class purpose is to find the best forecast from given k forecasts
    to act as baseline for anomaly detection.

    This class inherits its parameters and attributes from
    `~greykite.detection.detector.forecast_based.ForecastBasedDetector`
    and all methods apply here as well.

    The only addition is on top of the specified parameters to optimize over, this
    class also searches for the best forecast to use out of a number of forecasts
    passed. It achieves that simply by extending the parameter space, given in the input
    `param_iterable` to have one extra parameter: ``"forecast_id"``.

    Given that assumption, this class further implements the ``fit`` method fully
    and the user does not need to implement ``fit``.

    Therefore user only needs to implement the following:

        - ``add_features_one_df``: Note that the class uses this method to fully implement
          ``add_features`` because the class can assume that the same function can
          be used for all k forecasts.
        - ``calc_with_param``: the prediction logic, assuming the optimal param
          is determined.

    As an example see how the APE based method is implemented easily by inheriting
    from the current class:

        `~greykite.detection.detector.ape_based.APEDetector`


    Parameters
    ----------
    Solely inherited from
    `~greykite.detection.detector.forecast_based.ForecastBasedDetector`


    Attributes
    ----------
    Solely inherited from
    `~greykite.detection.detector.forecast_based.ForecastBasedDetector`
    """
    def __init__(
            self,
            value_cols=None,
            pred_cols=None,
            is_anomaly_col=None,
            join_cols=None,
            reward=None,
            anomaly_percent_dict=None,
            param_iterable=None):
        super().__init__(
            value_cols=value_cols,
            pred_cols=pred_cols,
            is_anomaly_col=is_anomaly_col,
            join_cols=join_cols,
            reward=reward,
            anomaly_percent_dict=anomaly_percent_dict,
            param_iterable=param_iterable)

    def fit(
            self,
            data):
        """Finds the best combination of forecast (``"forecast_id"``) and
        the parameters given in ``self.param_iterable``, and stores the result.

        Parameters
        ----------
        data : `~greykite.detection.detector.ForecastDetectorData`
            Object including the data.
            Its ``y_true`` attribute is (re)set by this method:
            to the ``self.is_anomaly_col`` column of ``data.df`` if
            ``self.is_anomaly_col`` is specified and to None otherwise.

        Returns
        -------
        result : None
            The fit will update ``self.data``, ``self.fit_info``
            and ``self.fitted_df``.
        """
        # Adds the forecast id possibilities to `param_iterable`
        # so that we can optimize over forecasts as well as
        # existing combinations of parameters given in `param_iterable`
        if data.forecast_dfs is not None:
            # First it creates the list of needed forecast ids
            # based on the length of the input `forecast_dfs`
            # NOTE(review): this assumes the forecasts are identified by
            # 0, ..., k-1 in the joined data -- confirm against `prep_df_for_predict`.
            forecast_ids = list(range(len(data.forecast_dfs)))
            if self.param_iterable is None:
                # It creates a list of dictionaries of length one
                # each prescribing only the `forecast_id`
                param_iterable = [{"forecast_id": x} for x in forecast_ids]
            else:
                param_iterable = add_new_params_to_records(
                    new_params={"forecast_id": forecast_ids},
                    records=self.param_iterable)
        else:
            param_iterable = self.param_iterable

        # Joins the forecast dfs with df
        # each joined df is a join between df and one element of
        # `forecast_dfs`
        # Prepares data
        self.prep_df_for_predict(data)
        # True labels might be needed for some objectives
        # therefore we extract them if available.
        # If labels are not available, then `y_true` is set to be None.
        # Any `y_true` already present on `data` is replaced.
        if self.is_anomaly_col is not None:
            data.y_true = data.df[self.is_anomaly_col]
        else:
            data.y_true = None

        self.data = data

        optim_res = self.optimize_param(
            data=data,
            param_iterable=param_iterable)

        self.fit_info = {
            "param": optim_res["best_param"],
            "param_full": optim_res["best_param_full"],
            "obj_value": optim_res["best_obj_value"],
            "param_obj_list": optim_res["param_obj_list"],
            "best_calc_result": optim_res["best_calc_result"]}

        # Gets fitted values and final objective values
        self.fitted_df = self.predict(data)
Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# original author: Sayan Patra + +import json +from dataclasses import dataclass +from typing import Any +from typing import List +from typing import Optional + +from greykite.common.python_utils import assert_equal +from greykite.framework.templates.autogen.forecast_config import from_bool +from greykite.framework.templates.autogen.forecast_config import from_float +from greykite.framework.templates.autogen.forecast_config import from_int +from greykite.framework.templates.autogen.forecast_config import from_list_float +from greykite.framework.templates.autogen.forecast_config import from_list_list_str +from greykite.framework.templates.autogen.forecast_config import from_none +from greykite.framework.templates.autogen.forecast_config import from_str +from greykite.framework.templates.autogen.forecast_config import from_union + + +F1 = "F1" +"""This constant means F1 in anomaly detection evaluation. This is used in `objective` field of `ADConfig`.""" +RECALL = "RECALL" +"""This constant means Recall in anomaly detection evaluation. This is used in `objective` field of `ADConfig`.""" +PRECISION = "PRECISION" +"""This constant means Precision in anomaly detection evaluation. This is used in `objective` field of `ADConfig`.""" + + +@dataclass +class ADConfig: + """Config for providing parameters to the Anomaly Detection library.""" + volatility_features_list: Optional[List[List[str]]] = None + """Set of volatility features used to optimize anomaly detection performance.""" + coverage_grid: Optional[List[float]] = None + """A set of coverage values to optimize anomaly detection performance. 
+ Optimum coverage is chosen among this list.""" + ape_grid: Optional[List[float]] = None + """A set of absolute percentage error (APE) threshold values to optimize anomaly detection performance.""" + sape_grid: Optional[List[float]] = None + """A set of symmetric absolute percentage error (SAPE) threshold values to optimize anomaly detection performance.""" + min_admissible_value: Optional[float] = None + """Lowest admissible value for the obtained confidence intervals.""" + max_admissible_value: Optional[float] = None + """Highest admissible value for the obtained confidence intervals.""" + objective: Optional[str] = None + """The main objective for optimization. It can be either of: F1, PRECISION or RECALL.""" + target_precision: Optional[float] = None + """Minimum precision to achieve during AD optimization in a labeled data.""" + target_recall: Optional[float] = None + """Minimum recall to achieve during AD optimization in a labeled data.""" + soft_window_size: Optional[int] = None + """Window size for soft precision, recall and f1 in a labeled data.""" + target_anomaly_percent: Optional[float] = None + """Desired anomaly percent during AD optimization of an unlabeled data (0-100 scale).""" + variance_scaling: Optional[bool] = None + """The variance scaling method in ridge / linear regression takes into account + (1) the degrees of freedom of the model; (2) the standard error from the coefficients, + hence will provide more accurate variance estimate / prediction intervals.""" + + @staticmethod + def from_dict(obj: Any) -> 'ADConfig': + """Converts a dictionary to the corresponding instance of the `ADConfig` class. + Raises ValueError if the input is not a dictionary. 
+ """ + if not isinstance(obj, dict): + raise ValueError(f"The input ({obj}) is not a dictionary.") + volatility_features_list = from_union([from_list_list_str, from_none], obj.get("volatility_features_list")) + coverage_grid = from_union([from_list_float, from_none], obj.get("coverage_grid")) + ape_grid = from_union([from_list_float, from_none], obj.get("ape_grid")) + sape_grid = from_union([from_list_float, from_none], obj.get("sape_grid")) + min_admissible_value = from_union([from_float, from_none], obj.get("min_admissible_value")) + max_admissible_value = from_union([from_float, from_none], obj.get("max_admissible_value")) + objective = from_union([from_str, from_none], obj.get("objective")) + target_precision = from_union([from_float, from_none], obj.get("target_precision")) + target_recall = from_union([from_float, from_none], obj.get("target_recall")) + soft_window_size = from_union([from_int, from_none], obj.get("soft_window_size")) + target_anomaly_percent = from_union([from_float, from_none], obj.get("target_anomaly_percent")) + variance_scaling = from_union([from_bool, from_none], obj.get("variance_scaling")) + + return ADConfig( + volatility_features_list=volatility_features_list, + coverage_grid=coverage_grid, + ape_grid=ape_grid, + sape_grid=sape_grid, + min_admissible_value=min_admissible_value, + max_admissible_value=max_admissible_value, + objective=objective, + target_precision=target_precision, + target_recall=target_recall, + soft_window_size=soft_window_size, + target_anomaly_percent=target_anomaly_percent, + variance_scaling=variance_scaling + ) + + def to_dict(self) -> dict: + """Converts an instance of the `ADConfig` class to its dictionary format.""" + result = dict() + result["volatility_features_list"] = from_union( + [from_list_list_str, from_none], + self.volatility_features_list) + result["coverage_grid"] = from_union( + [from_list_float, from_none], + self.coverage_grid) + result["ape_grid"] = from_union( + [from_list_float, 
from_none], + self.ape_grid) + result["sape_grid"] = from_union( + [from_list_float, from_none], + self.sape_grid) + result["min_admissible_value"] = from_union( + [from_float, from_none], + self.min_admissible_value) + result["max_admissible_value"] = from_union( + [from_float, from_none], + self.max_admissible_value) + result["objective"] = from_union( + [from_str, from_none], + self.objective) + result["target_precision"] = from_union( + [from_float, from_none], + self.target_precision) + result["target_recall"] = from_union( + [from_float, from_none], + self.target_recall) + result["soft_window_size"] = from_union( + [from_int, from_none], + self.soft_window_size) + result["target_anomaly_percent"] = from_union( + [from_float, from_none], + self.target_anomaly_percent) + result["variance_scaling"] = from_union( + [from_bool, from_none], + self.variance_scaling) + + return result + + @staticmethod + def from_json(obj: Any) -> 'ADConfig': + """Converts a json string to the corresponding instance of the `ADConfig` class. + Raises ValueError if the input is not a json string. + """ + try: + ad_dict = json.loads(obj) + except Exception: + raise ValueError(f"The input ({obj}) is not a json string.") + + return ADConfig.from_dict(ad_dict) + + def to_json(self) -> str: + """Converts an instance of the `ADConfig` class to its json string format.""" + ad_dict = self.to_dict() + return json.dumps(ad_dict) + + +def assert_equal_ad_config( + ad_config_1: ADConfig, + ad_config_2: ADConfig): + """Asserts equality between two instances of `ADConfig`. + Raises a ValueError in case of a parameter mismatch. + + Parameters + ---------- + ad_config_1: `ADConfig` + First instance of the + :class:`~greykite.detection.detector.config.ADConfig` for comparing. + ad_config_2: `ADConfig` + Second instance of the + :class:`~greykite.detection.detector.config.ADConfig` for comparing. + + Raises + ------- + AssertionError + If `ADConfig`s do not match, else returns None. 
+ """ + if not isinstance(ad_config_1, ADConfig): + raise ValueError(f"The input ({ad_config_1}) is not a member of 'ADConfig' class.") + if not isinstance(ad_config_2, ADConfig): + raise ValueError(f"The input ({ad_config_2}) is not a member of 'ADConfig' class.") + + assert_equal(ad_config_1.to_dict(), ad_config_2.to_dict()) diff --git a/greykite/detection/detector/config_to_reward.py b/greykite/detection/detector/config_to_reward.py new file mode 100644 index 0000000..2cfbb68 --- /dev/null +++ b/greykite/detection/detector/config_to_reward.py @@ -0,0 +1,147 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# original author: Reza Hosseini + +from greykite.detection.common.ad_evaluation import soft_f1_score +from greykite.detection.common.ad_evaluation import soft_precision_score +from greykite.detection.common.ad_evaluation import soft_recall_score +from greykite.detection.detector.ad_utils import partial_return +from greykite.detection.detector.config import F1 +from greykite.detection.detector.config import PRECISION +from greykite.detection.detector.config import RECALL +from greykite.detection.detector.detector import build_anomaly_percent_reward +from greykite.detection.detector.reward import Reward + + +# Evaluation metrics needed. +# Soft F1 score for the True label: +calc_soft_f1 = partial_return(soft_f1_score, True) +# Soft Precision score, for the True label: +calc_soft_precision = partial_return(soft_precision_score, True) +# Soft Recall score for the True label: +calc_soft_recall = partial_return(soft_recall_score, True) + + +OBJECTIVE_FUNC_MAP = { + F1: calc_soft_f1, + PRECISION: calc_soft_precision, + RECALL: calc_soft_recall +} +"""This is a mapping from objective (string) to a function.""" + + +def config_to_reward(ad_config): + """Uses information in `ADConfig` to construct a reward function. + The constructed reward function will be the sum of various rewards + related to the objective and other information given in `ADConfig`. + + The relevant fields in `ADConfig` are: + + - target_anomaly_percent: + An `anomaly_percent_range` will be created here with penalty of -1 + for not hitting that range. The range will be the given `target_anomaly_percent` + plus / minus 10 percent. + - soft_window_size: + This is used as a parameter in calculating objective and target_precision / target_recall (below). + - objective: + It is either of `F1`, `RECALL` and `PRECISION`. + Soft versions will be used if `soft_window_size` is not None. + - target_precision: + This is the minimal precision we aim for. + Any precision below this will be penalized by -1. 
+ - target_recall: + This is the minimal recall we aim for. + Any recall below this will be penalized by -1. + + Parameters + ---------- + ad_config : `~greykite.detection.detector.config.ADConfig` + See the linked dataclass (`ADConfig`) for details. + + Returns + ------- + result : `~greykite.detection.detector.reward.Reward` + See the linked class (`Reward`) for details. + """ + # We initialize the reward function to return 0 regardless of input. + # This will then be added to other rewards based on `ADConfig`. + def reward_func(data): + return 0 + + reward = Reward(reward_func=reward_func) + + if ad_config.target_anomaly_percent is not None: + anomaly_percent_upper = min(1.1 * ad_config.target_anomaly_percent, 100) + anomaly_percent_lower = max(0.9 * ad_config.target_anomaly_percent, 0) + anomaly_percent_range = (anomaly_percent_lower, anomaly_percent_upper) + anomaly_percent_dict = { + "range": anomaly_percent_range, "penalty": -1} + anomaly_percent_reward = build_anomaly_percent_reward( + anomaly_percent_dict) + + reward = reward + anomaly_percent_reward + + # Handles objective. + # Determine soft evaluation window.
+ window = 0 + if ad_config.soft_window_size is not None: + window = ad_config.soft_window_size + if ad_config.objective in [F1, PRECISION, RECALL]: + def reward_func(data): + obj = OBJECTIVE_FUNC_MAP[ad_config.objective]( + y_true=data.y_true, + y_pred=data.y_pred, + window=window) + if obj is not None: + return obj + return 0 + reward = reward + Reward(reward_func=reward_func) + + if ad_config.target_precision is not None: + def reward_func(data): + precision = OBJECTIVE_FUNC_MAP[PRECISION]( + y_true=data.y_true, + y_pred=data.y_pred, + window=window) + if precision is not None: + return precision + return 0 + reward = reward + Reward( + reward_func=reward_func, + min_unpenalized=ad_config.target_precision, + penalty=-1) + + if ad_config.target_recall is not None: + def reward_func(data): + recall = OBJECTIVE_FUNC_MAP[RECALL]( + y_true=data.y_true, + y_pred=data.y_pred, + window=window) + if recall is not None: + return recall + return 0 + + reward = reward + Reward( + reward_func=reward_func, + min_unpenalized=ad_config.target_recall, + penalty=-1) + + return reward diff --git a/greykite/detection/detector/config_utils.py b/greykite/detection/detector/config_utils.py new file mode 100644 index 0000000..e6e09ae --- /dev/null +++ b/greykite/detection/detector/config_utils.py @@ -0,0 +1,61 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Sayan Patra + +"""This file contains anomaly detection configs and corresponding json strings +to be used for testing. +""" + +from greykite.detection.detector.config import ADConfig + + +AD_CONFIG_JSON_DEFAULT = dict( + ad_config=ADConfig(), + ad_json="{}" +) + +AD_CONFIG_PARTIAL = ADConfig( + volatility_features_list=[], + max_admissible_value=1000, + target_recall=None +) + +AD_CONFIG_JSON_PARTIAL = dict( + ad_config=AD_CONFIG_PARTIAL, + ad_json=AD_CONFIG_PARTIAL.to_json() +) + +AD_CONFIG_COMPLETE = ADConfig( + volatility_features_list=[["dow", "is_event"], ["dow_hr"]], + coverage_grid=[0.9, 0.95, 0.99], + min_admissible_value=0, + max_admissible_value=1000, + target_precision=0.5, + target_recall=0.8, + soft_window_size=3, + target_anomaly_percent=2.0, + variance_scaling=False +) + +AD_CONFIG_JSON_COMPLETE = dict( + ad_config=AD_CONFIG_COMPLETE, + ad_json=AD_CONFIG_COMPLETE.to_json() +) diff --git a/greykite/detection/detector/constants.py b/greykite/detection/detector/constants.py new file mode 100644 index 0000000..2a6bef7 --- /dev/null +++ b/greykite/detection/detector/constants.py @@ -0,0 +1,82 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and 
binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Sayan Patra, Reza Hosseini + +from enum import Enum + +import numpy as np + + +RESIDUAL_COL = "residual" +"""The column name representing residual values.""" + +DEFAULT_COVERAGE_GRID = ( + list(np.arange(1, 20)/20) + + [0.96, 0.97, 0.98, 0.99, 0.995, 0.999]) +"""Default grid of coverage values to optimize anomaly detection performance. """ + +DEFAULT_VOLATILITY_FEATURES_LIST = [[]] +"""Default list of volatility features to use for anomaly detection.""" + +# Default target anomaly fraction if not passed. +DEFAULT_TARGET_ANOMALY_FRACTION = 0.05 + +# Default target anomaly percent if not passed. 
+DEFAULT_TARGET_ANOMALY_PERCENT = 5.0 + +Z_SCORE_COL = "z_score" +"""The column name representing z-score values.""" + +Z_SCORE_CUTOFF = 20.0 +"""The cut-off value used to identify outliers. Values with abs(z-score) > ``Z_SCORE_CUTOFF`` are treated as outliers.""" + +FIG_SHOW = False +"""Whether to show figures in tests.""" + +PHASE_TRAIN = "train" +"""The phase name for algorithm training.""" + +PHASE_PREDICT = "predict" +"""The phase name for algorithm prediction.""" + + +class PenalizeMethod(Enum): + """Enum used in + + `~greykite.detection.detector.reward` + + to construct penalized reward functions. + Such functions will impose a penalty when the reward value is outside + a pre-specified interval. + + Attributes + ---------- + ADDITIVE : `str` + The penalty will be added to reward + MULTIPLICATIVE : `str` + The penalty will be multiplied to the reward + PENALTY_ONLY : `str` + The penalty value will be used only (original reward value is ignored.) + + """ + ADDITIVE = "additive" + MULTIPLICATIVE = "multiplicative" + PENALTY_ONLY = "penalty_only" diff --git a/greykite/detection/detector/data.py b/greykite/detection/detector/data.py new file mode 100644 index 0000000..5c1bedf --- /dev/null +++ b/greykite/detection/detector/data.py @@ -0,0 +1,84 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Reza Hosseini + +from dataclasses import dataclass +from typing import Optional + +import pandas as pd + + +@dataclass +class Data: + """This class is useful in constructing the data consumed in + `~greykite.detection.detector.optimizer.Optimizer` + class. + + Attributes + ---------- + df : `pandas.DataFrame` or None, default None + If not None, it's a dataframe. + """ + df: Optional[pd.DataFrame] = None + + +@dataclass +class DetectorData(Data): + """This class is useful in constructing the data consumed in + `~greykite.detection.detector.Detector` + class. + + Attributes + ---------- + pred_df : `pandas.DataFrame` or None, default None + If not None, it's a dataframe which typically includes predicted data. + y_true : `pandas.Series` or None, default None + If not None, a pandas series of typically boolean values denoting anomaly occurrences + in observed data. + y_pred : `pandas.Series` or None, default None + If not None, a pandas series of typically boolean values denoting anomaly occurrences + in predicted data. + anomaly_df : `pandas.DataFrame` or None, default None + A dataframe which includes the start and end times of observed anomalies. 
+ """ + pred_df: Optional[pd.DataFrame] = None + y_true: Optional[list] = None + y_pred: Optional[list] = None + anomaly_df: Optional[pd.DataFrame] = None + + +@dataclass +class ForecastDetectorData(DetectorData): + """This class is useful in constructing the data consumed in + `~greykite.detection.detector.forecast_based.ForecastBasedDetector` + + Attributes + ---------- + forecast_dfs : `list` [`pandas.DataFrame`] or None, default None + A list of dataframes, which are typically expected to include forecasts. + Each one is typically joined with ``df`` to construct a ``joined_df``, + results of which will be stored in ``joined_dfs`` (see below). + joined_dfs : `list` [`pandas.DataFrame`] or None, default None + A list of dataframes, which are typically the result of joining ``df`` + with each dataframe in ``forecast_dfs`` (see above). + """ + forecast_dfs: Optional[list] = None + joined_dfs: Optional[list] = None diff --git a/greykite/detection/detector/detector.py b/greykite/detection/detector/detector.py new file mode 100644 index 0000000..4386ef2 --- /dev/null +++ b/greykite/detection/detector/detector.py @@ -0,0 +1,442 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +import numpy as np +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Reza Hosseini +import pandas as pd + +from greykite.common.constants import ACTUAL_COL +from greykite.common.constants import ANOMALY_COL +from greykite.common.constants import END_TIME_COL +from greykite.common.constants import PREDICTED_ANOMALY_COL +from greykite.common.constants import PREDICTED_COL +from greykite.common.constants import PREDICTED_LOWER_COL +from greykite.common.constants import PREDICTED_UPPER_COL +from greykite.common.constants import START_TIME_COL +from greykite.common.constants import TIME_COL +from greykite.common.viz.timeseries_annotate import plot_anomalies_over_forecast_vs_actual +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.common.ad_evaluation import precision_score +from greykite.detection.common.ad_evaluation import recall_score +from greykite.detection.detector.ad_utils import get_anomaly_df +from greykite.detection.detector.ad_utils import partial_return +from greykite.detection.detector.constants import PHASE_PREDICT +from greykite.detection.detector.constants import PHASE_TRAIN +from greykite.detection.detector.constants import PenalizeMethod +from greykite.detection.detector.optimizer import Optimizer +from greykite.detection.detector.reward import Reward + + +# Default for `anomaly_percent_dict` which is needed in +# `Detector` class if no `reward` is passed. 
+DEFAULT_ANOMALY_PERCENT_DICT = {"range": (4, 6), "penalty": -1} + + +def build_anomaly_percent_reward(anomaly_percent_dict): + """This builds a reward function given an expected anomaly percent range. + This is the percent of anomalies the user expects in the data. + This is useful when the user does not know which points are anomalies, + but has some idea about what percent of data are anomalies. + The reward function constructed here will penalize for being far from the + center of the range specified by the user (``anomaly_percent_dict["range"]``) + and it will add an extra penalty for being outside that interval. The penalty + is specified in ``anomaly_percent_dict["penalty"]``. + + Parameters + ---------- + anomaly_percent_dict : `dict` + A dictionary with items: + + - ``"range"`` : `tuple` + We expect a tuple with two elements. + - ``"penalty"`` : `float` + A real number to specify the penalty of being outside the range + specified by user. It should be typically a negative value and it + could be set to ``float("-inf")`` to make this reward a restriction + while using it along with other rewards. + + The dictionary is used to construct a reward which will be the reward in + the optimization if no reward is passed to the ``Detector`` class below. + + If another reward is passed then this will be added to the passed reward. + + The constructed reward based on ``anomaly_percent_dict`` penalizes + by the distance between the center of the ``"range"`` and predicted anomaly + percent as long as the predicted is within range. + + For values outside the range an extra penalty given in ``"penalty"`` + will be applied. + If ``"penalty"`` is None, it will be set to -1. + + None is not accepted by this function; ``Detector`` falls back to ``DEFAULT_ANOMALY_PERCENT_DICT`` when neither a reward nor this dictionary is given. + + Returns + ------- + result : `~greykite.detection.detector.reward.Reward` + The reward which reflects the information given in the input.
+ """ + min_percent = anomaly_percent_dict["range"][0] + max_percent = anomaly_percent_dict["range"][1] + target_percent = (min_percent + max_percent) / 2.0 + + def reward_func(data): + percent_anomaly = 100 * np.mean(data.y_pred) + diff = abs(percent_anomaly - target_percent) / 100.0 + # `-diff` will be returned because we assume higher is better + return -diff + + # Below intends to calculate which diffs using the above reward + # function translate to the `percent_anomaly` being outside the + # specified range in `anomaly_percent_dict` + # Considering the above `reward_func` measures the diff from the mid-point + # of the range: we can deduce that maximum diff which still remains in + # the range is half the length of the range. + # Since the `reward_func` also divides the diff by 100, we also divide by + # 100. + max_acceptable_diff = (max_percent - min_percent) / (2 * 100.0) + + penalty = anomaly_percent_dict["penalty"] + if penalty is None: + penalty = -1 + + # Below `min_unpenalized=-max_acceptable_diff` will ensure that `percent_anomaly` + # which is outside the specified range in `anomaly_percent_dict` will be + # penalized by an extra penalty + # with default -1, if not provided in `anomaly_percent_dict` + anomaly_percent_reward = Reward( + reward_func=reward_func, + min_unpenalized=-max_acceptable_diff, + max_unpenalized=float("inf"), + penalize_method=PenalizeMethod.ADDITIVE.value, + penalty=penalty) + + return anomaly_percent_reward + + +class Detector(Optimizer): + """Base detector class for Anomaly Detection. + The class initializes by passing an arbitrary ``reward`` for optimization + and a potentially multivariate parameter (given in ``param_iterable``) to optimize. + + The ``reward`` object is required to implement the ``apply`` method which is the case for this class: + `~greykite.detection.detector.reward.Reward` + + The class behaves similarly to typical machine-learning algorithms as it includes + + - ``fit`` + - ``predict`` + + methods.
+ + The optimization method (``optimize_param``) is inherited from: + + `~greykite.detection.detector.optimizer.Optimizer` + + It works simply by iterating over ``param_iterable`` and calculating + the reward to choose the optimal parameter via ``calc_with_param`` method + which is an abstract method already appearing in the ``Optimizer`` class. + + The class assumes that larger is better for the reward function, + during optimization. + + Classes inheriting from this class need to implement the ``calc_with_param`` + method to be able to use the optimizer. + + The ``predict`` method is already implemented here and should work for most cases. + This is because in most cases, ``predict`` is simply ``calc_with_param`` + applied to the best param found during the optimization step. + + Parameters + ---------- + reward : `~greykite.detection.detector.reward.Reward` or None, default None + The reward to be used in the optimization. + If None, a reward will be built using the other input + ``anomaly_percent_dict``. + + anomaly_percent_dict : `dict` or None, default None + If not None, a dictionary with items: + + - ``"range"`` : `tuple` + We expect a tuple with two elements denoting the min and max + of the interval range. + - ``"penalty"`` : `float` + A real number to specify the penalty of being outside the range + specified by user. It should be typically a negative value and it + could be set to ``float("-inf")`` to make this reward a restriction + while using it along with other rewards. + + which is used to construct a reward which will be the reward in + the optimization if no reward is passed. + If another reward is passed then this will be added to the passed reward. + The constructed reward based on ``anomaly_percent_dict`` penalizes + by the distance between the center of the ``"range"`` and predicted anomaly + percent as long as the predicted is within range.
+ For values outside the range an extra penalty given in``"penalty"`` will be applied. + If `"penalty"` is None, it will be set to -1. + + param_iterable : iterable or None, default None + An iterable with every element being a parameter passed to the method + ``calc_with_param`` which takes ``param`` as one of its arguments. + Each `param` can be a dictionary including values for a set of variables, + but that is not a requirement. + The optimizer method (``optimize_param``) will iterate over all the + parameters to find the best parameter in terms of the specified reward. + + Attributes + ---------- + data : `dataclasses.dataclass` or None, default None + A data class object which includes the data for fitting or + prediction. Depending on the model, this data class might + include various fields. The prominent used class which can support + forecast based approaches is given in + `~greykite.detection.detector.DetectorData`. + fitted_df : `pandas.DataFrame` or None, default None + The fitted data after applying the detector. + fit_info : `dict` + A dictionary which includes information about the fitted model. + It is expected that this includes ``"full_param"`` after the fitting + so that the ``predict`` function can use that param during the prediction + and simply call ``calc_with_param``. + In that case the ``predict`` function does not need further implementation + in child classes as it's already implemented in this class. 
+ """ + def __init__( + self, + reward=None, + anomaly_percent_dict=None, + param_iterable=None): + # If both `reward` and `anomaly_percent_dict` are None, + # the detector will use a default `DEFAULT_ANOMALY_PERCENT_DICT` + # the default reward will be smaller for values away from the center of + # `anomaly_percent_dict["range"]` + # and it adds a penalty (which is -1 if not passed) + # if the anomaly percent is outside the range + if reward is None and anomaly_percent_dict is None: + anomaly_percent_dict = DEFAULT_ANOMALY_PERCENT_DICT + + anomaly_percent_reward = None + if anomaly_percent_dict is not None: + anomaly_percent_reward = build_anomaly_percent_reward( + anomaly_percent_dict) + + if reward is None: + self.reward = anomaly_percent_reward + elif anomaly_percent_dict is None: + self.reward = reward + else: + self.reward = reward + anomaly_percent_reward + + self.anomaly_percent_dict = anomaly_percent_dict + self.param_iterable = param_iterable + # Initialize attributes + self.data = None + self.fitted_df = None + self.fit_info = {"param_full": None} + + # Set by the predict method + self.pred_df = None + + def fit( + self, + data=None): + pass + + def prep_df_for_predict( + self, + data=None): + """A method to prepares the data for ``fit``, ``calc_with_param``. + + Parameters + ---------- + data : See class attributes. + + Returns + ------- + result : None + It updates ``data`` + """ + if data is not None and data.df is not None: + if TIME_COL in data.df.columns: + data.df[TIME_COL] = pd.to_datetime(data.df[TIME_COL]) + return data + + def predict( + self, + data, + **kwargs): + """``predict`` method is already implemented here and should work for most cases. + This is because in most cases, ``predict`` is simply + ``calc_with_param`` applied to the best param found during optimization. + + Parameters + ---------- + data : See class attributes. 
+ + Returns + ------- + result : return type of ``calc_with_param`` + Usually a ``pandas.DataFrame`` which includes the predicted anomalies. + """ + self.prep_df_for_predict(data) + calc_result = self.calc_with_param( + data=data, + param=self.fit_info["param_full"], + **kwargs) + if data is not None: + self.pred_df = calc_result.data.pred_df + return calc_result.data + + def plot(self, phase=PHASE_PREDICT, title=None): + """Plots the predicted anomalies over the actual anomalies. + + Parameters + ---------- + phase : str, default ``PHASE_PREDICT`` + The phase of the detector to plot. + Must be one of ``PHASE_PREDICT`` or ``PHASE_TRAIN``. + title : str, default None + The title of the plot. + If None, a default title will be used. + + Returns + ------- + fig : `plotly.graph_objects.Figure` + The plotly figure object. + """ + if phase == PHASE_PREDICT: + if self.pred_df is None: + raise ValueError("No data to plot. Please run `predict` first.") + else: + train_end_date = self.fitted_df[TIME_COL].max() + if train_end_date < self.pred_df[TIME_COL].min(): + train_end_date = None + if title is None: + title = "Detected vs actual anomalies - Prediction phase" + fig = plot_anomalies_over_forecast_vs_actual( + df=self.pred_df, + time_col=TIME_COL, + actual_col=ACTUAL_COL, + predicted_col=PREDICTED_COL, + predicted_anomaly_col=PREDICTED_ANOMALY_COL, + anomaly_col=ANOMALY_COL, + marker_opacity=1, + predicted_anomaly_marker_color="rgba(0, 90, 181, 0.9)", + anomaly_marker_color="rgba(250, 43, 20, 0.7)", + predicted_lower_col=PREDICTED_LOWER_COL, + predicted_upper_col=PREDICTED_UPPER_COL, + train_end_date=train_end_date, + title=title) + return fig + elif phase == PHASE_TRAIN: + if self.fitted_df is None: + raise ValueError("No data to plot. 
Please run `fit` first.") + else: + if title is None: + title = "Predicted vs actual anomalies - Training phase" + fig = plot_anomalies_over_forecast_vs_actual( + df=self.fitted_df, + time_col=TIME_COL, + actual_col=ACTUAL_COL, + predicted_col=PREDICTED_COL, + predicted_anomaly_col=PREDICTED_ANOMALY_COL, + anomaly_col=ANOMALY_COL, + marker_opacity=1, + predicted_anomaly_marker_color="rgba(0, 90, 181, 0.9)", + anomaly_marker_color="rgba(250, 43, 20, 0.7)", + predicted_lower_col=PREDICTED_LOWER_COL, + predicted_upper_col=PREDICTED_UPPER_COL, + train_end_date=self.fitted_df[TIME_COL].max(), + title=title) + return fig + else: + raise ValueError(f"phase {phase} is not supported. Must be one of {PHASE_PREDICT}, {PHASE_TRAIN}.") + + def summary(self): + """Returns a summary of the fitted model.""" + if self.fitted_df is None: + raise ValueError("No data to summarize. Please run `fit` first.") + else: + # `fitted_df` can be a pandas dataframe or a DetectorData object. + if isinstance(self.fitted_df, pd.DataFrame): + df = self.fitted_df.copy() + else: + df = self.fitted_df.df.copy() + + # Adds the model name and number of observations to the summary. + content = " Anomaly Detection Model Summary ".center(80, "=") + "\n\n" + content += f"Number of observations: {len(df)}\n" + content += f"Model: {self.__class__.__name__}\n" + content += f"Number of detected anomalies: {np.sum(df[PREDICTED_ANOMALY_COL])}\n\n" + + # Calculates the duration of each anomaly block. + if TIME_COL in df: + pred_anomaly_df = get_anomaly_df( + df=df, + time_col=TIME_COL, + anomaly_col=PREDICTED_ANOMALY_COL) + # To calculate the duration of each anomaly block, we add 1 + # to the difference between the end and start times. This is + # because both the start time and the end time is inclusive. 
+ freq = pd.infer_freq(df[TIME_COL]) + pred_anomaly_df["anomaly_interval"] = ( + pred_anomaly_df[END_TIME_COL] + pd.Timedelta(value=1, unit=freq) - + pred_anomaly_df[START_TIME_COL]) + + # Adds anomaly duration info to the summary. + duration_mean = np.mean(pred_anomaly_df["anomaly_interval"]) + content += f"Average Anomaly Duration: {duration_mean}\n" + duration_min = np.min(pred_anomaly_df["anomaly_interval"]) + content += f"Minimum Anomaly Duration: {duration_min}\n" + duration_max = np.max(pred_anomaly_df["anomaly_interval"]) + content += f"Maximum Anomaly Duration: {duration_max}\n\n" + + content += f"Alert Rate(%): {np.mean(df[PREDICTED_ANOMALY_COL] * 100)}" + # Calculates metrics e,g, precision and recall. + if ANOMALY_COL in df and not df[ANOMALY_COL].isnull().all(): + content += ", " + content += f"Anomaly Rate(%): {np.mean(df[ANOMALY_COL]) * 100}\n" + # Calculates Precision score for the True label. + calc_precision = partial_return(precision_score, True) + precision = calc_precision( + y_true=df[ANOMALY_COL], + y_pred=df[PREDICTED_ANOMALY_COL]) + content += f"Precision: {round(precision, 3)}," + # Calculates Recall score for the True label. + calc_recall = partial_return(recall_score, True) + recall = calc_recall( + y_true=df[ANOMALY_COL], + y_pred=df[PREDICTED_ANOMALY_COL]) + content += f" Recall: {round(recall, 3)}," + # Calculates F1 score for the True label. + calc_f1 = partial_return(f1_score, True) + f1 = calc_f1( + y_true=df[ANOMALY_COL], + y_pred=df[PREDICTED_ANOMALY_COL]) + content += f" F1 Score: {round(f1, 3)}\n" + + # Adds the optimal objective value and parameters to the summary. 
+ content += "\n" + obj_value = round(self.fit_info["obj_value"], 3) + content += f"Optimal Objective Value: {obj_value}\n" + optimal_params = self.fit_info["param"] + content += f"Optimal Parameters: {optimal_params}\n" + content += "\n" + + return content diff --git a/greykite/detection/detector/forecast_based.py b/greykite/detection/detector/forecast_based.py new file mode 100644 index 0000000..345bf57 --- /dev/null +++ b/greykite/detection/detector/forecast_based.py @@ -0,0 +1,212 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
class ForecastBasedDetector(Detector):
    """This class enables anomaly detection algorithms which use baseline
    forecasts in their logic.

    The class assumes that for a given dataset (`df`) which can for example include
    timestamps and observed values for a timeseries as well as anomaly labels (optional),
    a number of (say k) forecasts are given.
    The goal is to use those k forecasts as baselines to detect anomalies.

    To that end this class implements joining of the k forecasts with the observed
    data (`df`), thus producing k joined dataframes.

    This class facilitates the join by implementing it by using the input `join_cols`
    using its `join_with_forecasts` method within the class.
    In order to use this class the user is only required to implement these three methods:

        - ``add_features_one_df``: Note that the class uses this method to fully implement
            ``add_features`` because the class can assume that the same function can
            be used for all k forecasts.
        - ``calc_with_param``: the prediction logic, assuming the optimal param
            is determined
        - ``fit``: The fit method

    This class also already implements ``prep_df_for_predict`` which is basically
    a combination of

        - Joining with baselines: ``join_with_forecasts``
        - Adding features: ``add_features``

    In this way, with minimal implementation, one can implement a large variety of
    "forecast based" anomaly detectors.


    Parameters
    ----------
    value_cols : `list` [`str`] or None
        The columns for the response metric (which can be multivariate).
        If not None, and also ``pred_cols`` (below) also not None, we expect them
        to be ordered consistently.
    pred_cols : `list` [`str`] or None
        Presumably the columns holding the predictions for the response metric
        (which can be multivariate); this class only stores the value.
        If not None, and also ``value_cols`` (above) also not None, we expect them
        to be ordered consistently.
    is_anomaly_col : `str` or None
        Stored on the instance; not used by this class itself.
        Presumably the name of the anomaly label column.
    join_cols : `list` [`str`] or None
        The columns used as the ``on`` keys when merging the observed data with
        each forecast in ``join_with_forecasts``. If None, no join is performed.
    reward : See docstring for
        `~/greykite.detection.detector.detector.Detector`
    anomaly_percent_dict : See docstring for
        `~/greykite.detection.detector.detector.Detector`
    param_iterable : See docstring for
        `~/greykite.detection.detector.detector.Detector`

    Attributes
    ----------
    joined_dfs : `dict` [`str`: `pandas.DataFrame`] or None
        Fallback input of ``add_features`` when nothing is passed to it.
        Initialized to None.

    Other attributes are inherited from `~/greykite.detection.detector.detector.Detector`
    """
    def __init__(
            self,
            value_cols=None,
            pred_cols=None,
            is_anomaly_col=None,
            join_cols=None,
            reward=None,
            anomaly_percent_dict=None,
            param_iterable=None):
        super().__init__(
            reward=reward,
            anomaly_percent_dict=anomaly_percent_dict,
            param_iterable=param_iterable)
        self.value_cols = value_cols
        self.pred_cols = pred_cols
        self.is_anomaly_col = is_anomaly_col
        self.join_cols = join_cols
        # Read by `add_features` as a fallback; initialized so that the
        # fallback raises the intended `ValueError` rather than `AttributeError`.
        self.joined_dfs = None

    def join_with_forecasts(
            self,
            forecast_dfs,
            df=None):
        """Joins data with forecasts.
        This will be used both in training and prediction phases.

        Parameters
        ----------
        forecast_dfs : `dict` [`str`: `pandas.DataFrame`] or None, default None
            Dict of baselines (forecasts) to be joined with observed data given
            in ``df``.
            If ``df`` is None, no join is needed and ``forecast_dfs`` are
            returned.
        df : `pandas.DataFrame` or None, default None
            A dataframe which includes the observed data and potentially the
            observed labels.
            If None, it is assumed that the ``forecast_dfs``
            have already been joined or have enough information to fit.

        Returns
        -------
        result : `dict` [`str`: `pandas.DataFrame`]
            The baselines (forecasts) after being inner-joined with ``df`` on
            ``self.join_cols`` (if needed), keyed as in ``forecast_dfs``.
        """

        # If either `df` or `self.join_cols` is None, we assume data is joined
        # already or has enough info already
        if df is None or self.join_cols is None:
            return forecast_dfs

        joined_dfs = {}
        for k, forecast_df in forecast_dfs.items():
            joined_dfs[k] = df.merge(
                forecast_df,
                how="inner",
                on=self.join_cols)

        return joined_dfs

    def add_features_one_df(
            self,
            joined_df):
        """Adds features to one joined dataframe.
        This will be used to add features to all joined dataframes.
        Classes inheriting from this class can implement this to get new detectors.
        The base implementation returns the input unchanged.

        Parameters
        ----------
        joined_df : `pandas.DataFrame`
            An input dataframe.

        Returns
        -------
        joined_df : `pandas.DataFrame`
            An output dataframe, potentially with new columns or altered columns.

        """
        return joined_df

    def add_features(
            self,
            joined_dfs=None):
        """Adds features to `joined_dfs` passed.
        Note that if nothing is passed, this will update ``self.joined_dfs``

        Parameters
        ----------
        joined_dfs : `dict` [`str`: `pandas.DataFrame`] or None, default None
            A dict of dataframes. If None, ``self.joined_dfs`` will be
            used as input.

        Returns
        -------
        joined_dfs : `dict` [`str`: `pandas.DataFrame`]
            The same dict object, with each value replaced by the result of
            ``add_features_one_df``.

        Raises
        ------
        ValueError
            If no dataframes are passed and ``self.joined_dfs`` is not available.
        """
        if joined_dfs is None:
            joined_dfs = getattr(self, "joined_dfs", None)

        if joined_dfs is None:
            raise ValueError(
                "`joined_dfs` cannot be None. "
                "`join_with_forecasts` method is to be called before")

        # Iterates over a snapshot so values can be reassigned safely.
        for k, joined_df in list(joined_dfs.items()):
            featured_df = self.add_features_one_df(joined_df)
            # Keeps the original frame if an implementation only mutates
            # in place and returns nothing.
            if featured_df is not None:
                joined_dfs[k] = featured_df
        return joined_dfs

    def prep_df_for_predict(
            self,
            data):
        """Prepares the detection data (``data``) by applying
        the joins and adding features.

        Parameters
        ----------
        data : `~greykite.detection.detector.ForecastDetectorData`
            Object including the data. Its ``df`` and ``forecast_dfs``
            attributes are read and its ``joined_dfs`` attribute is set.

        Returns
        -------
        None
            The input ``data`` will be altered.
        """
        data.joined_dfs = self.join_with_forecasts(
            df=data.df,
            forecast_dfs=data.forecast_dfs)
        # Sanity check: the join must produce one dataframe per forecast.
        assert len(data.joined_dfs) == len(data.forecast_dfs)
        data.joined_dfs = self.add_features(
            data.joined_dfs)

        return None
+# original author: Reza Hosseini, Sayan Patra + +from copy import deepcopy + +import numpy as np +import pandas as pd + +from greykite.algo.uncertainty.conditional.conf_interval import conf_interval +from greykite.algo.uncertainty.conditional.conf_interval import predict_ci +from greykite.common.constants import ACTUAL_COL +from greykite.common.constants import ANOMALY_COL +from greykite.common.constants import END_TIME_COL +from greykite.common.constants import ERR_STD_COL +from greykite.common.constants import PREDICTED_ANOMALY_COL +from greykite.common.constants import PREDICTED_COL +from greykite.common.constants import PREDICTED_LOWER_COL +from greykite.common.constants import PREDICTED_UPPER_COL +from greykite.common.constants import QUANTILE_SUMMARY_COL +from greykite.common.constants import RESIDUAL_COL +from greykite.common.constants import START_TIME_COL +from greykite.common.constants import TIME_COL +from greykite.common.constants import VALUE_COL +from greykite.common.evaluation import ElementwiseEvaluationMetricEnum +from greykite.common.features.adjust_anomalous_data import adjust_anomalous_data +from greykite.common.features.timeseries_features import add_daily_events +from greykite.common.features.timeseries_features import add_time_features_df +from greykite.common.logging import LoggingLevelEnum +from greykite.common.logging import log_message +from greykite.common.time_properties import describe_timeseries +from greykite.common.time_properties import get_canonical_data +from greykite.common.time_properties import infer_freq +from greykite.detection.detector.ad_utils import add_new_params_to_records +from greykite.detection.detector.ad_utils import get_anomaly_df +from greykite.detection.detector.ad_utils import get_anomaly_df_from_outliers +from greykite.detection.detector.ad_utils import get_canonical_anomaly_df +from greykite.detection.detector.config import ADConfig +from greykite.detection.detector.config_to_reward import config_to_reward 
+from greykite.detection.detector.constants import DEFAULT_COVERAGE_GRID +from greykite.detection.detector.constants import PHASE_PREDICT +from greykite.detection.detector.constants import Z_SCORE_COL +from greykite.detection.detector.detector import Detector +from greykite.detection.detector.optimizer import CalcResult +from greykite.framework.templates.autogen.forecast_config import ForecastConfig +from greykite.framework.templates.autogen.forecast_config import MetadataParam +from greykite.framework.templates.forecaster import Forecaster + + +DETECTOR_PREDICT_COLS = [ + TIME_COL, + ACTUAL_COL, + PREDICTED_COL, + PREDICTED_LOWER_COL, + PREDICTED_UPPER_COL, + PREDICTED_ANOMALY_COL, + Z_SCORE_COL] +"""The standard columns returned by the greykite detector's `predict` method.""" + + +class GreykiteDetector(Detector): + """This class enables Greykite based anomaly detection algorithms. + It takes a ``forecast_config`` and ``ad_config`` (see Parameters) and builds a detector which + uses the forecast as baseline. + + The fit consists of following stages: + + - Fit a forecast model using the ``forecast_config`` passed + - Fit a volatility model using + `~greykite.algo.uncertainty.conditional.conf_interval.conf_interval` + to optimize over + + -- ``volatility_features_list`` + -- ``coverage_grid`` + + specified in ``ad_config`` passed. + + Parameters + ---------- + ad_config: `~greykite.detection.detector.config.ADConfig` or None, default None + Config object for anomaly detection to use. + forecast_config : `~greykite.framework.templates.model_templates.ForecastConfig` or None, default None + Config object for forecast to use. 
+ reward : See docstring for + `~greykite.detection.detector.detector.Detector` + + Attributes + ---------- + anomaly_percent_dict : `dict` or None, + See attributes of `~greykite.detection.detector.detector.Detector` + forecast_result : `~greykite.framework.pipeline.pipeline.ForecastResult` or None + See class:`~greykite.framework.pipeline.pipeline.ForecastResult` + for details. + forecast_estimator : `~greykite.sklearn.estimator.base_forecast_estimator.BaseForecastEstimator` or None + See class: `~greykite.sklearn.estimator.base_forecast_estimator.BaseForecastEstimator` + for more details. + ci_model : `dict` + This is the fitted volatility model which is the returned dictionary from + `greykite.algo.uncertainty.conditional.conf_interval`. + fit_info : `dict` or None + See attributes of `~greykite.detection.detector.detector.Detector`. + anomaly_df : `pandas.DataFrame` or None + A dataframe which includes the start and end times of observed anomalies. + fit_data : `~greykite.detection.detector.data.DetectorData` or None + The data used in the `fit` method. + """ + def __init__( + self, + forecast_config=None, + ad_config=None, + reward=None): + """Initializes the GreykiteDetector class.""" + if forecast_config is None: + forecast_config = ForecastConfig() + else: + forecast_config = deepcopy(forecast_config) + if ad_config is None: + ad_config = ADConfig() + else: + ad_config = deepcopy(ad_config) + + # Constructs an object of class `Reward` from `ad_config`. 
+ reward_from_config = config_to_reward(ad_config) + if reward is None: + reward = reward_from_config + else: + reward = reward + reward_from_config + + param_iterable = None + if ad_config.coverage_grid is None: + ad_config.coverage_grid = DEFAULT_COVERAGE_GRID # coverage grid can not be empty + param_iterable = add_new_params_to_records( + new_params={"coverage": ad_config.coverage_grid}, + records=param_iterable) + + if ad_config.volatility_features_list is None: + ad_config.volatility_features_list = [[]] # volatility features can be empty + param_iterable = add_new_params_to_records( + new_params={"volatility_features": ad_config.volatility_features_list}, + records=param_iterable) + + if ad_config.ape_grid is not None: + metric = ElementwiseEvaluationMetricEnum.AbsolutePercentError + param_iterable = add_new_params_to_records( + new_params={metric.get_metric_name(): ad_config.ape_grid}, + records=param_iterable) + + if ad_config.sape_grid is not None: + metric = ElementwiseEvaluationMetricEnum.SymmetricAbsolutePercentError + param_iterable = add_new_params_to_records( + new_params={metric.get_metric_name(): ad_config.sape_grid}, + records=param_iterable) + + super().__init__( + reward=reward, + anomaly_percent_dict=None, + # In the above, if anomaly percent appears in `ad_config`, + # it is already baked into `reward` via `config_to_reward` call above. + param_iterable=param_iterable) + + # Attributes + self.forecast_config = forecast_config + self.ad_config = ad_config + + # These will be set by the `fit` method. + self.forecast_result = None + self.forecast_estimator = None + self.anomaly_df = None + self.ci_model = None + self.fit_info = None + self.fit_data = None + + def fit( + self, + data): + """The fit method for Greykite based method. 
+ The fit consists of following stages: + + - Fit a forecast model using the ``forecast_config`` passed + - Fit a volatility model using + `~greykite.algo.uncertainty.conditional.conf_interval.conf_interval` + to optimize over + + -- ``volatility_features_list`` + -- ``coverage_grid`` + + specified in ``ad_config`` passed. + + Parameters + ---------- + data : `~greykite.detection.detector.data.DetectorData` + The input data which needs to include the input timeseries in ``df`` + attribute. We assume df includes minimally these two columns: + + - ``TIME_COL`` + - ``VALUE_COL`` + + and if labels are also available in this datasets we expect the labels + to be available in ``ANOMALY_COL`` + """ + fit_data = deepcopy(data) + df = fit_data.df.copy() + if df is None: + raise ValueError("observed dataframe (df) must be available in fit data") + if VALUE_COL not in df.columns: + raise ValueError(f"observed dataframe (df) must be include {VALUE_COL} column") + if TIME_COL not in df.columns: + raise ValueError(f"observed dataframe (df) must be include {TIME_COL} column") + + # Initializes the forecaster and extracts the forecast parameters. + forecaster = Forecaster() + forecast_params = forecaster.apply_forecast_config( + df=df, + config=self.forecast_config) + freq = forecast_params["freq"] or infer_freq(df, time_col=TIME_COL) + if freq is None: + raise ValueError("Frequency could not be inferred as timestamps were too irregular.") + + # Sets train end date to be the last date in the data. + # This way the Forecaster does not drop anomalous dates from the training data if + # the anomaly is at the end of the data. + train_end_date = forecast_params.get("train_end_date", None) or df[TIME_COL].max() + date_format = forecast_params["date_format"] + + # Logs warnings if the data is not regularly spaced. + # The missing timestamps are filled in the `get_canonical_data` function. 
+ time_stats = describe_timeseries(df=df, time_col=TIME_COL) + if not time_stats["regular_increments"]: + log_message( + "Input time series data is not regularly spaced. " + f"Minimum time increment: {time_stats['min_delta']}. " + f"Maximum time increment: {time_stats['max_delta']}. " + "We will attempt to fill in missing times.", + LoggingLevelEnum.WARNING) + + # Logs warnings if the data has repeated timestamps. + # The repeated timestamps are removed in the `get_canonical_data` function. + ts_unique, ts_count = np.unique( + df[TIME_COL], + return_counts=True) + if max(ts_count) > 1: + log_message( + "The data timestamps had repeated values. " + f"One timestamp had {max(ts_count)} repetitions. " + "We will attempt to remove repeated timestamps and only keep first instance.", + LoggingLevelEnum.WARNING) + + # Gets the canonical data. + canonical_data_dict = get_canonical_data( + df=df, + time_col=TIME_COL, + value_col=VALUE_COL, + freq=freq, + date_format=date_format, + train_end_date=train_end_date, + anomaly_info=None) + df = canonical_data_dict["df"].reset_index(drop=True) + + # The following code checks for the existence of anomaly labels in the input `data` + # and `forecast_config`, extracts them if available and merges them into a single dataframe. + # Updated data is saved in `self.fit_data`. + fit_data.df = df + self.fit_data = self.merge_anomaly_info(fit_data=fit_data, freq=freq) + + # Builds the anomaly info. + if self.anomaly_df is None: + anomaly_info = None + else: + anomaly_info = { + "value_col": VALUE_COL, + "anomaly_df": self.anomaly_df} + + # Updates `metadata_param` in the `forecast_config`. 
+ if self.forecast_config.metadata_param is None: + self.forecast_config.metadata_param = MetadataParam() + self.forecast_config.metadata_param.time_col = TIME_COL + self.forecast_config.metadata_param.value_col = VALUE_COL + self.forecast_config.metadata_param.train_end_date = train_end_date + self.forecast_config.metadata_param.freq = freq + self.forecast_config.metadata_param.anomaly_info = anomaly_info + + # Fits the forecast model. + self.forecast_result = forecaster.run_forecast_config( + df=df, + config=self.forecast_config) + forecast_estimator = self.forecast_result.model[-1] + self.forecast_estimator = forecast_estimator + + default_param = {} + optim_res = self.optimize_param( + data=self.fit_data, + param_iterable=self.param_iterable, + default_param=default_param, + phase="fit") + + self.fit_info = { + "param": optim_res["best_param"], + "param_full": optim_res["best_param_full"], + "obj_value": optim_res["best_obj_value"], + "param_obj_list": optim_res["param_obj_list"], + "best_calc_result": optim_res["best_calc_result"]} + self.fitted_df = self.fit_info["best_calc_result"].data.pred_df + + def calc_with_param( + self, + param, + data=None, + phase=PHASE_PREDICT): + """Predicts anomalies assuming the parameters: + + - ``volatility_features`` + - ``coverage`` + + are passed. This will enable optimization over these parameters + in the `fit` phase. + + Parameters + ---------- + param : `dict` + The parameter to optimize over if desired. + data : `~greykite.detection.detector.data.DetectorData` + The input data which needs to include the input timeseries in ``df`` + attribute. 
We assume df includes minimally these two columns: + + - ``TIME_COL`` + - ``VALUE_COL`` + + and if labels are also available in this datasets we expect the labels + to be available in ``ANOMALY_COL`` + phase : `str`, default ``PHASE_PREDICT`` + If ``PHASE_PREDICT`` the baseline data will be generated and otherwise we + assume we are in fitting phase and will extract the baseline from the + fitted model. + + Returns + ------- + cal_result: A calculation result from the optimizer. + See `~greykite.detection.detector.optimizer.CalcResult` + """ + df = data.df.copy() + model = self.forecast_estimator + trained_model = model.model_dict + + if phase == PHASE_PREDICT: + # `forecast_pred_df` has the forecasted values. + forecast_pred_df = model.predict(X=df) + x_mat = model.forecast_x_mat + # Extracts only the time column and forecast column. + forecast_pred_df = forecast_pred_df[[TIME_COL, PREDICTED_COL]] + else: + x_mat = trained_model["x_mat"] + forecast_pred_df = trained_model["fitted_df"].copy() + # Extracts only the time column and forecast column. + # Forecasts are in the original `value_col` in fitted data. + forecast_pred_df = forecast_pred_df[[TIME_COL, VALUE_COL]] + forecast_pred_df.columns = [TIME_COL, PREDICTED_COL] + # Asserts that the returned dataframe has the correct size. + assert len(forecast_pred_df) == len(df), ( + f"length of `forecast_pred_df`: {len(forecast_pred_df)}," + f" must be same as length of `df`: {len(df)}") + + volatility_df = pd.merge(df, forecast_pred_df, on=TIME_COL) + assert len(volatility_df) == len(df), "length of `volatility_df` must be same `df`" + + # Adds time features + volatility_df = add_time_features_df( + df=volatility_df, + time_col=TIME_COL, + conti_year_origin=trained_model["origin_for_time_vars"]) + + # Adds daily events (e.g. holidays) + # if daily event data are given, we add them to temporal features data + # `date_col` below is used to join with `daily_events` data given + # in `daily_event_df_dict`. 
+ # Note: events must be provided for both train and forecast time range. + daily_event_df_dict = trained_model["daily_event_df_dict"] + if daily_event_df_dict is not None: + daily_event_neighbor_impact = trained_model["daily_event_neighbor_impact"] + volatility_df = add_daily_events( + df=volatility_df, + event_df_dict=daily_event_df_dict, + date_col="date", + neighbor_impact=daily_event_neighbor_impact) + + coverage = param.get("coverage", None) + volatility_features = param.get("volatility_features", []) # volatility features can be empty + ape = param.get(ElementwiseEvaluationMetricEnum.AbsolutePercentError.get_metric_name(), None) + sape = param.get(ElementwiseEvaluationMetricEnum.SymmetricAbsolutePercentError.get_metric_name(), None) + sigma_scaler = trained_model["sigma_scaler"] + h_mat = trained_model["h_mat"] + x_mean = trained_model["x_mean"] + if self.ad_config.variance_scaling is False or self.ad_config.variance_scaling is None: + # Enables default variance scaling. + sigma_scaler = None + h_mat = None + + if phase == "fit": + alpha = (1 - coverage) + q_lower = alpha / 2.0 + q_upper = 1 - q_lower + quantiles = (q_lower, q_upper) + + # Residual column is calculated based on the adjusted values so that the anomalies and outliers + # do not impact the training of the volatility model. + volatility_df[RESIDUAL_COL] = volatility_df[f"adjusted_{VALUE_COL}"] - volatility_df[PREDICTED_COL] + ci_model = conf_interval( + df=volatility_df, + distribution_col=RESIDUAL_COL, + offset_col=PREDICTED_COL, + conditional_cols=volatility_features, + quantiles=quantiles, + sigma_scaler=sigma_scaler, + h_mat=h_mat, + x_mean=x_mean, + min_admissible_value=self.ad_config.min_admissible_value, + max_admissible_value=self.ad_config.max_admissible_value) + else: + ci_model = self.fit_info["best_calc_result"].model + + # Calculates the prediction intervals using the fitted `ci_model`. 
+ # Note that if there is no variance scaling (since variance scaling only applies to ridge / linear regression), + # `ci_model` has contained this information since both `sigma_scaler` and `h_mat` were set to `None`. + # In this case, the `predict_ci` function behaves the same as `x_mat` is not passed. + # Also note that the default behavior in greykite is to scale the variance. + ci_df = predict_ci(new_df=volatility_df, ci_model=ci_model, x_mat=x_mat) + # Adds the z score column. + ci_df[Z_SCORE_COL] = (ci_df[VALUE_COL] - ci_df[PREDICTED_COL]) / ci_df[ERR_STD_COL] + + cols = [PREDICTED_COL, QUANTILE_SUMMARY_COL, ERR_STD_COL, Z_SCORE_COL] + pred_df = pd.concat([df, ci_df[cols]], axis=1) + pred_df[PREDICTED_LOWER_COL] = pred_df[QUANTILE_SUMMARY_COL].map(lambda x: x[0]) + pred_df[PREDICTED_UPPER_COL] = pred_df[QUANTILE_SUMMARY_COL].map(lambda x: x[1]) + + # Since, we like to return the observed values in `ACTUAL_COL`, + # the column `ACTUAL_COL` is added and it is set to be equal to + # the `VALUE_COL` which includes the observed values. + pred_df[ACTUAL_COL] = pred_df[VALUE_COL] + + # Anomaly is declared when the actual is outside the confidence interval. + y_pred = ( + (pred_df[ACTUAL_COL] < pred_df[PREDICTED_LOWER_COL]) | + (pred_df[ACTUAL_COL] > pred_df[PREDICTED_UPPER_COL])) + + # Adds Absolute Percent Error (APE) threshold check if ape is not None. + if ape is not None: + metric = ElementwiseEvaluationMetricEnum.AbsolutePercentError + pred_df[metric.get_metric_name()] = pred_df.apply( + lambda row: metric.get_metric_func()(row[ACTUAL_COL], row[PREDICTED_COL]), axis=1) + y_pred = y_pred & (pred_df[metric.get_metric_name()] > ape) + + # Adds Symmetric Absolute Percent Error (SAPE) threshold check if sape is not None. 
+ if sape is not None: + metric = ElementwiseEvaluationMetricEnum.SymmetricAbsolutePercentError + pred_df[metric.get_metric_name()] = pred_df.apply( + lambda row: metric.get_metric_func()(row[ACTUAL_COL], row[PREDICTED_COL]), axis=1) + y_pred = y_pred & (pred_df[metric.get_metric_name()] > sape) + + pred_df[PREDICTED_ANOMALY_COL] = y_pred + + # Only subset the columns needed / prescribed. + pred_df = pred_df[DETECTOR_PREDICT_COLS] + if ANOMALY_COL in df.columns: + pred_df[ANOMALY_COL] = df[ANOMALY_COL] + else: + pred_df[ANOMALY_COL] = None + data.y_pred = y_pred + data.pred_df = pred_df + + return CalcResult(data=data, model=ci_model) + + def merge_anomaly_info(self, fit_data, freq): + """This function combines anomalies information that may exist in various places + and combine them into a consistent `anomaly_df`: a point will be an anomaly if it appears in + ANY of the given inputs. + + 1. As `ANOMALY_COL` column in the input fit_data (`fit_data.df`) + 2. As a vector in the input fit_data (`fit_data.y_true`) + 3. As a separate dataframe in the input fit_data (`fit_data.anomaly_df`) + 4. As a separate dataframe in the `metadata_param` in the `forecast_config`. + + It then updates the anomaly information to be the final one across various inputs. + + Parameters + ---------- + fit_data : `~greykite.detection.detector.fit_data.DetectorData` or None, default None + The input fit_data to the `fit` method. + freq : `str` + The frequency of the input fit_data. + + Returns + ------- + fit_data : `~greykite.detection.detector.data.DetectorData` + The updated fit_data which includes the merged anomaly information. + """ + df = fit_data.df.copy() + merged_anomaly_df = pd.DataFrame() + + # Adds anomalies from `fit_data.df`. + if ANOMALY_COL in df.columns: + anomaly_df = get_anomaly_df( + df=df, + time_col=TIME_COL, + anomaly_col=ANOMALY_COL) + merged_anomaly_df = pd.concat([merged_anomaly_df, anomaly_df]) + + # Adds anomalies from `fit_data.y_true`. 
+ if fit_data.y_true is not None: + assert len(fit_data.y_true) == len(df), ( + f"length of `y_true`: {len(fit_data.y_true)}, must be same as length of `df`: {len(df)}") + # Builds a temporary df to be used in `get_anomaly_df`. + temp_df = pd.DataFrame({ + TIME_COL: df[TIME_COL], + ANOMALY_COL: fit_data.y_true}) + anomaly_df = get_anomaly_df( + df=temp_df, + time_col=TIME_COL, + anomaly_col=ANOMALY_COL) + merged_anomaly_df = pd.concat([merged_anomaly_df, anomaly_df]) + + # Adds anomalies from `fit_data.anomaly_df`. + if fit_data.anomaly_df is not None: + fit_data.anomaly_df = fit_data.anomaly_df[[START_TIME_COL, END_TIME_COL]] + merged_anomaly_df = pd.concat([merged_anomaly_df, fit_data.anomaly_df]) + + # Adds anomalies from `metadata_param` in `forecast_config`. + metadata_param = self.forecast_config.metadata_param + if (metadata_param is not None) and (metadata_param.anomaly_info is not None): + anomaly_df = metadata_param.anomaly_info.get("anomaly_df", None) + if anomaly_df is not None: + anomaly_df = anomaly_df[[START_TIME_COL, END_TIME_COL]] + merged_anomaly_df = pd.concat([merged_anomaly_df, anomaly_df]) + + # Input fit_data might have huge outliers which have a large impact on model fit. + # At this step, the function identifies outliers based on z-scores + # and constructs `anomaly_df_outliers`to save the information, + # which is added to `merged_anomaly_df`. + # Does 1.0 percent trimming to ensure the standard deviation and mean, + # used in z-score calculation are robust. + anomaly_df_outliers = get_anomaly_df_from_outliers( + df=df, + time_col=TIME_COL, + value_col=VALUE_COL, + freq=freq, + trim_percent=1.0) + if not anomaly_df_outliers.empty: + log_message( + f"Found the following outliers: \n{anomaly_df_outliers[START_TIME_COL]}. " + " Adding these to `anomaly_df`.", + LoggingLevelEnum.WARNING) + merged_anomaly_df = pd.concat([merged_anomaly_df, anomaly_df_outliers]) + + # Gets canonical anomaly df. 
+ if merged_anomaly_df.empty: + merged_anomaly_df = None + log_message( + "No anomalies are provided and no outliers have been found. " + "Setting 'anomaly_df' to None.", + LoggingLevelEnum.WARNING) + else: + merged_anomaly_df = get_canonical_anomaly_df( + anomaly_df=merged_anomaly_df, + freq=freq) + + # Updates `anomaly_df` in respective places. + if merged_anomaly_df is not None: + # Directly adjusts anomalies in input fit_data, + # so that prediction intervals are not impacted by outliers as well + # in `predict_with_params` during the "fit" phase. + # Values of the anomaly locations will become none in "adjusted_{VALUE_COL}" column. + adj_df_info = adjust_anomalous_data( + df=df, + time_col=TIME_COL, + value_col=VALUE_COL, + anomaly_df=merged_anomaly_df) + df = adj_df_info["augmented_df"] + df[ANOMALY_COL] = df[ANOMALY_COL].astype(bool) + + if self.forecast_config.metadata_param is None: + self.forecast_config.metadata_param = MetadataParam() + if self.forecast_config.metadata_param.anomaly_info is None: + self.forecast_config.metadata_param.anomaly_info = { + "value_col": VALUE_COL, + "anomaly_df": merged_anomaly_df} + else: + self.forecast_config.metadata_param.anomaly_info.update({ + "value_col": VALUE_COL, + "anomaly_df": merged_anomaly_df}) + else: + df[f"adjusted_{VALUE_COL}"] = df[VALUE_COL] + df[ANOMALY_COL] = None + fit_data.y_true = df[ANOMALY_COL] + fit_data.df = df + fit_data.anomaly_df = merged_anomaly_df + self.anomaly_df = merged_anomaly_df + + return fit_data + + def summary(self): + """Returns a summary of the fitted model. + Fetches the summary from the forecast estimator and adds it to the summary + of the base class. 
+ """ + content = super().summary() + content += self.forecast_estimator.summary().__str__() + + return content diff --git a/greykite/detection/detector/optimizer.py b/greykite/detection/detector/optimizer.py new file mode 100644 index 0000000..d581de9 --- /dev/null +++ b/greykite/detection/detector/optimizer.py @@ -0,0 +1,246 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# original author: Reza Hosseini + +import dataclasses +from abc import abstractmethod +from dataclasses import dataclass +from typing import Optional + +from greykite.detection.detector.data import DetectorData + + +@dataclass +class CalcResult: + """This data class represents the standard return of the method: + `calc_with_param` of `~greykite.detection.detector.optimizer.Optimizer` + + Attributes + ---------- + data : `object` or None, default None + This is part of the calculation which includes data e.g. dataframes etc. + This data is the only part of calculation which is needed to calculate the + reward during optimization. + model :`object` or None, default None + This is returned by the calculation, and it might be a trained model, or + a useful object generated during the calculation which can be used later + for example during prediction phase, when the optimizer is a predictor. + + """ + data: Optional[DetectorData] = None + model: Optional[object] = None + + +class Optimizer: + """A class to enable easy implementation of optimization over + arbitrary parameter spaces and using arbitrary rewards. + The optimization problem can be stated in pseudo code: + + Maximize_{param} reward(some_function(param)) + + Note that 'reward(some_function(param))' can be considered as the objective + which is to be maximized. 
The objective is a two part calculation in this framework + + - calculate a function for param + - calculate the reward for the above + + Here is a more detailed mathematical explanation in pseudo code: + + Assume: + - ``param`` is a (potentially multivariate) parameter in a parameter space ``param_iterable`` + - ``calc_with_param`` is a function which depends on ``param`` + - pseudo code: 'calc_result = calc_with_param(param, ...)' + - pseudo code: 'obj_value = reward.apply(calc_result.data)' + + Goal: + - optimize (maximize) 'obj_value' across all possible param + + Note that `reward` does not take ``param`` as an input and is only applied to + the updated data calculated using ``param``. + + The class initializes by passing an arbitrary ``reward`` + for optimization and a potentially multivariate parameter + (given in ``param_iterable``) to optimize. + + The ``reward`` object is required to implement the ``apply`` method + which is the case for this class: + `~greykite.detection.detector.reward.Reward` + + The optimization method (``optimize_param``) is the main method in this class and + works simply by iterating over ``param_iterable`` and calculating the reward + to choose the optimal parameter. + The class assumes that larger is better for the reward function, during optimization. + + The classes inheriting from this class need to implement the ``calc_with_param`` method to be able to use the optimizer. + + Parameters + ---------- + reward : `~greykite.detection.detector.reward.Reward` or None, default None + The reward to be used in the optimization. + param_iterable : iterable or None, default None + An iterable with every element being a parameter passed to the method + ``calc_with_param`` which takes ``param`` as one of its arguments and + ``data`` as the other. + Each ``param`` can be a dictionary including values for a set of variables.
+ The optimizer method (``optimize_param``) will iterate over all the + parameters to find the best parameter in terms of the specified reward. + + Attributes + ---------- + data : `dataclasses.dataclass` or None, default None + A data class object which includes the data for fitting or + prediction. Depending on the model, this data class might + include various fields. A simple example is given in + `~greykite.detection.detector.data.Data` + fit_info : `dict` + A dictionary which includes information about the fitted model. + It is expected that this includes ``"param_full"`` after the fitting + so that the `predict` function can use that param during the prediction + and simply call `calc_with_param`. + In that case the `predict` function does not need further implementation + in child classes as it's already implemented in this class. + """ + def __init__( + self, + reward=None, + param_iterable=None): + self.reward = reward + self.param_iterable = param_iterable + # Initialize attributes + self.data = None + self.fit_info = {"param_full": None} + + def optimize_param( + self, + param_iterable=None, + data=None, + default_param=None, + **kwargs): + """The optimizer which picks the best possible parameter from the ones + specified in ``param_iterable``, using the reward specified in the + class instance. + This method assumes larger is better for the reward. + + Parameters + ---------- + param_iterable : iterable or None, default None + See class docstring. + If None in this method call, it will be set to + ``self.param_iterable`` + data : `dataclasses.dataclass` or None, default None + See class docstring. + default_param : `dict` or None, default None + A fixed parameter which is used as the default and for each + param in ``param_iterable``, it will be used to construct the full + parameter. For example it can include fixed parameters which are + calculated separately before optimization occurs.
+ + Returns + ------- + result : `dict` + A dictionary with following items: + + - ``"best_param"``: `dict` + The best parameter in terms of the specified reward, where larger + is considered better. + - ``"best_param_full"``: `dict` + The default parameter augmented with the best parameter + to construct the full parameter. + - ``"best_obj_value"``: `float` + The best reward value. + - ``"param_obj_list"``: `list` [`dict`] + A copy of every parameter tried, each augmented with its reward value + under the key ``"obj_value"``. + - ``"best_calc_result"``: `greykite.detection.detector.optimizer.CalcResult` + The calculation result at the best parameter. + """ + if param_iterable is None: + param_iterable = self.param_iterable + if default_param is None: + default_param = {} + best_obj_value = float("-inf") + best_param = None + best_param_full = None + param_obj_list = [] + best_calc_result = None + + for param in param_iterable: + full_param = default_param.copy() + full_param.update(param) + calc_result = self.calc_with_param( + param=full_param, + data=data, + **kwargs) + obj_value = self.reward.apply(calc_result.data) + + if obj_value > best_obj_value: + best_obj_value = obj_value + best_param = param.copy() + best_param_full = full_param.copy() + # In order to preserve the data from the optimal case, + # we make a copy of the data using `.replace` when it is a dataclass. + # Note that `replace` is the method to copy data for `dataclasses`. + # Note that otherwise the data could be overwritten + # during the next iterations of the optimizer's for loop. + if dataclasses.is_dataclass(calc_result.data): + calc_result.data = dataclasses.replace(calc_result.data) + best_calc_result = calc_result + + param = param.copy() + # Adds the reward value as a new key to each param + param.update({"obj_value": obj_value}) + param_obj_list.append(param) + + return { + "best_param": best_param, + "best_param_full": best_param_full, + "best_obj_value": best_obj_value, + "param_obj_list": param_obj_list, + "best_calc_result": best_calc_result} + + @abstractmethod + def calc_with_param( + self,
+ param, + data=None, + **kwargs): + """This is a calculation step which uses both ``data`` and ``param``. + This is typically expected to be implemented by user. + By default, it simply returns the data without any alteration. + However, in general the data will be altered in various ways + depending on the ``param`` passed. + + Parameters + ---------- + param : `Any` + One element of `param_iterable`. + Typically a dictionary which includes the values of a set of parameters + given in its keys. However it could also be simply a float if `param_iterable` + is a list of floats. + data : `dataclasses.dataclass` or None, default None + The `data` is typically updated in this function after + we use the given `param` in the calculation here. + The data is then returned as a part of returned `CalcResult`. + + Returns + ------- + calc_result : `greykite.detection.detector.optimizer.CalcResult` + The optimization results. + """ + return CalcResult(data=data, model=None) diff --git a/greykite/detection/detector/reward.py b/greykite/detection/detector/reward.py new file mode 100644 index 0000000..71d2e4d --- /dev/null +++ b/greykite/detection/detector/reward.py @@ -0,0 +1,230 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +import inspect +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Reza Hosseini +import numbers + +from greykite.detection.detector.constants import PenalizeMethod + + +class Reward: + """Reward class which supports a very flexible set of rewards + used in optimization where an objective is to be optimized. + + The main method for this class is `apply` which + is used when the reward is to be applied to data. + No assumption is made on the arguments of `apply` to keep this class very + generic. + + This class enables two powerful mechanisms: + + - taking a simple `reward_func` and constructing a penalized version of that + - starting from existing objectives, building more complex ones by adding / + multiplying / dividing them or using the same operations with numbers. + + Using these two mechanisms enables users to support multi-objective problems. + For example, in the context of anomaly detection if recall is to be optimized + subject to precision being at least 80 percent, then the user can enable that by + + def recall(y_true, y_pred): + # recall function implementation + ... + + def precision(y_true, y_pred): + # precision function implementation + ... + + reward = + Reward(reward_func=recall) + + Reward( + reward_func=precision, + min_unpenalized=0.8, + max_unpenalized=float("inf"), + penalty=float("-inf")) + + where the second part will cause the total sum of the objectives to be -inf + whenever precision is not in the desired range.
+ Also note that the "+" operation is defined in this class using the dunder + method `__add__`. + + One can also combine objectives to achieve more complex objectives from existing ones. For example F1 can be easily expressed in terms of + precision and recall: + + rec_obj = Reward(reward_func=recall) + prec_obj = Reward(reward_func=precision) + f1_obj = (2 * rec_obj * prec_obj) / (rec_obj + prec_obj) + + The penalize mechanism on its own is useful for example in the context of + anomaly detection without labels, where we only have an idea about the + expected anomaly percentage in the data. In such a case an objective can be + constructed for optimization. See + `~greykite.detection.detector.detector.Detector` + init to see a construction of such an objective. + + + Parameters + ---------- + reward_func : callable + The reward function which will be used as the starting point and augmented + with extra logic depending on other input. + min_unpenalized : `float`, default `float("-inf")` + The minimum value of the reward function (`reward_func`) which will + remain un-penalized. + max_unpenalized : `float`, default `float("inf")` + The maximum value of the reward function (`reward_func`) which will + remain un-penalized. + penalize_method : `str`, default `PenalizeMethod.ADDITIVE.value` + The logic of using the penalty. The possibilities are given in + `~greykite.detection.detector.constants.PenalizeMethod` + penalty : `float` or None, default None + The penalty amount. If None, it will be mapped to 0 for additive and + 1 for multiplicative.
+ + Attributes + ---------- + None + """ + + def __init__( + self, + reward_func, + min_unpenalized=float("-inf"), + max_unpenalized=float("inf"), + penalize_method=PenalizeMethod.ADDITIVE.value, + penalty=None): + self.reward_func = reward_func + self.min_unpenalized = min_unpenalized + self.max_unpenalized = max_unpenalized + self.penalize_method = penalize_method + if penalty is None: + if penalize_method == PenalizeMethod.ADDITIVE.value: + penalty = 0 + else: + penalty = 1 + self.penalty = penalty + + def apply( + self, + *args, + **kwargs): + + obj_value = self.reward_func( + *args, + **kwargs) + + if ( + obj_value > self.max_unpenalized or + obj_value < self.min_unpenalized): + + if self.penalize_method == PenalizeMethod.ADDITIVE.value: + obj_value += self.penalty + elif self.penalize_method == PenalizeMethod.MULTIPLICATIVE.value: + obj_value *= self.penalty + elif self.penalize_method == PenalizeMethod.PENALTY_ONLY.value: + obj_value = self.penalty + elif self.penalize_method is None: + obj_value = self.penalty + else: + raise ValueError( + f"penalize_method {self.penalize_method.value} does not exist") + + return obj_value + + def __add__(self, other): + """Addition of objects or an object with a number (scalar).""" + if isinstance(other, numbers.Number): + def reward_func(*args, **kwargs): + return ( + self.apply(*args, **kwargs) + + other) + else: + def reward_func(*args, **kwargs): + return ( + self.apply(*args, **kwargs) + + other.apply(*args, **kwargs)) + + return Reward(reward_func=reward_func) + + def __mul__(self, other): + """Multiplication of objects or an object with a number.""" + if isinstance(other, numbers.Number): + def reward_func(*args, **kwargs): + return ( + self.apply(*args, **kwargs) * + other) + else: + def reward_func(*args, **kwargs): + return ( + self.apply(*args, **kwargs) * + other.apply(*args, **kwargs)) + + return Reward(reward_func=reward_func) + + def __truediv__(self, other): + """Division of objects or an object with a 
number.""" + if isinstance(other, numbers.Number): + def reward_func(*args, **kwargs): + return ( + self.apply(*args, **kwargs) / + other) + else: + def reward_func(*args, **kwargs): + return ( + self.apply(*args, **kwargs) / + other.apply(*args, **kwargs)) + + return Reward(reward_func=reward_func) + + # Below defines the above operators from right + # Addition and multiplication operators are commutative + # Division is an exception + def __radd__(self, other): + """Right addition.""" + return self.__add__(other) + + def __rmul__(self, other): + """Right multiplication.""" + return self.__mul__(other) + + def __rtruediv__(self, other): + """Right division.""" + if isinstance(other, numbers.Number): + def reward_func(*args, **kwargs): + return ( + other / + self.apply(*args, **kwargs)) + else: + def reward_func(*args, **kwargs): + return ( + other.apply(*args, **kwargs) / + self.apply(*args, **kwargs)) + + return Reward(reward_func=reward_func) + + def __str__(self): + """Print method.""" + reward_func_content = inspect.getsource(self.reward_func) + return ( + f"\n reward_func:\n {reward_func_content} \n" + f"min_unpenalized: {self.min_unpenalized} \n" + f"max_unpenalized: {self.max_unpenalized} \n" + f"penalize_method: {self.penalize_method} \n" + f"penalty: {self.penalty} \n") diff --git a/greykite/framework/pipeline/utils.py b/greykite/framework/pipeline/utils.py index e365cf4..470018a 100644 --- a/greykite/framework/pipeline/utils.py +++ b/greykite/framework/pipeline/utils.py @@ -52,12 +52,12 @@ from greykite.sklearn.estimator.simple_silverkite_estimator import SimpleSilverkiteEstimator from greykite.sklearn.sklearn_scorer import make_scorer_df from greykite.sklearn.transform.column_selector import ColumnSelector +from greykite.sklearn.transform.difference_based_outlier_transformer import DifferenceBasedOutlierTransformer from greykite.sklearn.transform.drop_degenerate_transformer import DropDegenerateTransformer from 
greykite.sklearn.transform.dtype_column_selector import DtypeColumnSelector from greykite.sklearn.transform.normalize_transformer import NormalizeTransformer from greykite.sklearn.transform.null_transformer import NullTransformer from greykite.sklearn.transform.pandas_feature_union import PandasFeatureUnion -from greykite.sklearn.transform.zscore_outlier_transformer import ZscoreOutlierTransformer def get_best_index(results, metric="score", greater_is_better=False): @@ -361,13 +361,13 @@ def get_basic_pipeline( ])), ("response", Pipeline([ # applies outlier and null transformation to value column ("select_val", ColumnSelector([VALUE_COL])), - ("outlier", ZscoreOutlierTransformer(z_cutoff=None)), + ("outlier", DifferenceBasedOutlierTransformer()), ("null", NullTransformer(impute_algorithm="interpolate")) ])), ("regressors_numeric", Pipeline([ ("select_reg", ColumnSelector(all_reg_cols)), ("select_reg_numeric", DtypeColumnSelector(include="number")), - ("outlier", ZscoreOutlierTransformer(z_cutoff=None)), + ("outlier", DifferenceBasedOutlierTransformer()), ("normalize", NormalizeTransformer(normalize_algorithm=None)), # no normalization by default ("null", NullTransformer(impute_algorithm="interpolate")) ])), diff --git a/greykite/framework/templates/autogen/forecast_config_utils.py b/greykite/framework/templates/autogen/forecast_config_utils.py new file mode 100644 index 0000000..16f6ab0 --- /dev/null +++ b/greykite/framework/templates/autogen/forecast_config_utils.py @@ -0,0 +1,251 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Sayan Patra + +"""This file contains forecast configs and corresponding json strings +to be used for testing. 
+""" + +from greykite.common.evaluation import EvaluationMetricEnum +from greykite.framework.templates.autogen.forecast_config import ComputationParam +from greykite.framework.templates.autogen.forecast_config import EvaluationMetricParam +from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam +from greykite.framework.templates.autogen.forecast_config import ForecastConfig +from greykite.framework.templates.autogen.forecast_config import MetadataParam +from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam +from greykite.framework.templates.model_templates import ModelTemplateEnum + + +FORECAST_CONFIG_JSON_DEFAULT = dict( + forecast_config=ForecastConfig(), + forecast_json="{}" +) + +FORECAST_CONFIG_JSON_COMPLETE = dict( + forecast_config=ForecastConfig( + model_template=ModelTemplateEnum.SILVERKITE.name, + forecast_horizon=24, + coverage=0.7, + metadata_param=MetadataParam( + time_col="time", + value_col="value", + freq="H", + date_format="%Y-%m-%d-%H", + train_end_date="2021-07-01-10", + ), + evaluation_period_param=EvaluationPeriodParam( + test_horizon=1, + periods_between_train_test=2, + cv_horizon=3, + cv_min_train_periods=4, + cv_expanding_window=True, + cv_use_most_recent_splits=True, + cv_periods_between_splits=5, + cv_periods_between_train_test=6, + cv_max_splits=2 + ), + evaluation_metric_param=EvaluationMetricParam( + cv_selection_metric=EvaluationMetricEnum.MeanSquaredError.name, + cv_report_metrics=[EvaluationMetricEnum.MeanAbsoluteError.name, + EvaluationMetricEnum.MeanAbsolutePercentError.name], + null_model_params={ + "strategy": "quantile", + "constant": None, + "quantile": 0.8 + }, + relative_error_tolerance=0.02 + ), + model_components_param=ModelComponentsParam( + seasonality={ + "yearly_seasonality": True, + "weekly_seasonality": False, + "monthly_seasonality": "auto", + "daily_seasonality": 10 + }, + growth={ + "growth_term": "quadratic" + }, + events={ + 
"holidays_to_model_separately": [ + "New Year's Day", + "Chinese New Year", + "Christmas Day", + "Independence Day", + "Thanksgiving", + "Labor Day", + "Good Friday", + "Easter Monday", + "Memorial Day", + "Veterans Day", + "Independence Day", + ], + "holiday_lookup_countries": ["UnitedStates"], + "holiday_pre_num_days": 3, + "holiday_post_num_days": 2 + }, + changepoints={ + "changepoints_dict": { + "method": "uniform", + "n_changepoints": 20, + } + }, + autoregression={ + "autoreg_dict": { + "lag_dict": { + "orders": [1, 2, 3] + }, + "agg_lag_dict": { + "orders_list": [[7, 14, 21]] + } + } + }, + regressors={ + "regressor_cols": [] + }, + lagged_regressors={ + "lagged_regressor_dict": None + }, + uncertainty={ + "uncertainty_dict": "auto", + }, + hyperparameter_override={ + "input__response__null__max_frac": 0.1 + }, + custom={ + "fit_algorithm_dict": { + "fit_algorithm": "ridge", + "fit_algorithm_params": {"normalize": True}, + }, + "feature_sets_enabled": False + } + ), + computation_param=ComputationParam( + hyperparameter_budget=10, + n_jobs=None, + verbose=1 + ), + ), + forecast_json="""{ + "model_template": "SILVERKITE", + "forecast_horizon": 24, + "coverage": 0.7, + "metadata_param": { + "time_col": "time", + "value_col": "value", + "freq": "H", + "date_format": "%Y-%m-%d-%H", + "train_end_date": "2021-07-01-10" + }, + "evaluation_period_param": { + "test_horizon": 1, + "periods_between_train_test": 2, + "cv_horizon": 3, + "cv_min_train_periods": 4, + "cv_expanding_window": true, + "cv_use_most_recent_splits": true, + "cv_periods_between_splits": 5, + "cv_periods_between_train_test": 6, + "cv_max_splits": 2 + }, + "evaluation_metric_param": { + "cv_selection_metric": "MeanSquaredError", + "cv_report_metrics": ["MeanAbsoluteError", "MeanAbsolutePercentError"], + "null_model_params": { + "strategy": "quantile", + "constant": null, + "quantile": 0.8 + }, + "relative_error_tolerance": 0.02 + }, + "model_components_param": { + "seasonality":{ + 
"yearly_seasonality": true, + "weekly_seasonality": false, + "monthly_seasonality": "auto", + "daily_seasonality": 10 + }, + "growth": { + "growth_term": "quadratic" + }, + "events": { + "holidays_to_model_separately": [ + "New Year's Day", + "Chinese New Year", + "Christmas Day", + "Independence Day", + "Thanksgiving", + "Labor Day", + "Good Friday", + "Easter Monday", + "Memorial Day", + "Veterans Day", + "Independence Day" + ], + "holiday_lookup_countries": ["UnitedStates"], + "holiday_pre_num_days": 3, + "holiday_post_num_days": 2 + }, + "changepoints": { + "changepoints_dict": { + "method": "uniform", + "n_changepoints": 20 + } + }, + "autoregression": { + "autoreg_dict": { + "lag_dict": { + "orders": [1, 2, 3] + }, + "agg_lag_dict": { + "orders_list": [[7, 14, 21]] + } + } + }, + "regressors": { + "regressor_cols": [] + }, + "custom": { + "custom_param": 1 + }, + "lagged_regressors": { + "lagged_regressor_dict": null + }, + "uncertainty": { + "uncertainty_dict": "auto" + }, + "hyperparameter_override": { + "input__response__null__max_frac": 0.1 + }, + "custom": { + "fit_algorithm_dict": { + "fit_algorithm": "ridge", + "fit_algorithm_params": {"normalize": true} + }, + "feature_sets_enabled": false + } + }, + "computation_param": { + "hyperparameter_budget": 10, + "n_jobs": null, + "verbose": 1 + } + }""" +) diff --git a/greykite/framework/templates/multistage_forecast_template_config.py b/greykite/framework/templates/multistage_forecast_template_config.py index 8cea1ca..e9bb21b 100644 --- a/greykite/framework/templates/multistage_forecast_template_config.py +++ b/greykite/framework/templates/multistage_forecast_template_config.py @@ -275,7 +275,8 @@ class MultistageForecastTemplateConfig: "holiday_pre_post_num_dict": None, # ignored "daily_event_df_dict": None, # ignored "daily_event_neighbor_impact": None, # ignored - "daily_event_shifted_effect": None + "daily_event_shifted_effect": None, + "auto_holiday_params": None }, changepoints={ "auto_growth": True, 
diff --git a/greykite/framework/templates/simple_silverkite_template.py b/greykite/framework/templates/simple_silverkite_template.py index 756f735..6869deb 100644 --- a/greykite/framework/templates/simple_silverkite_template.py +++ b/greykite/framework/templates/simple_silverkite_template.py @@ -1282,6 +1282,7 @@ def __get_hyperparameter_grid_from_model_components(model_components): "estimator__daily_event_df_dict": model_components.events["daily_event_df_dict"], "estimator__daily_event_neighbor_impact": model_components.events["daily_event_neighbor_impact"], "estimator__daily_event_shifted_effect": model_components.events["daily_event_shifted_effect"], + "estimator__auto_holiday_params": model_components.events["auto_holiday_params"], "estimator__feature_sets_enabled": model_components.custom["feature_sets_enabled"], "estimator__fit_algorithm_dict": model_components.custom["fit_algorithm_dict"], "estimator__max_daily_seas_interaction_order": model_components.custom["max_daily_seas_interaction_order"], diff --git a/greykite/framework/templates/simple_silverkite_template_config.py b/greykite/framework/templates/simple_silverkite_template_config.py index 380a855..ef64e30 100644 --- a/greykite/framework/templates/simple_silverkite_template_config.py +++ b/greykite/framework/templates/simple_silverkite_template_config.py @@ -513,6 +513,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 1, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -524,6 +525,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -535,6 +537,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 4, "holiday_pre_post_num_dict": None, "daily_event_df_dict": 
None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -546,6 +549,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 3, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -557,6 +561,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 0, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }), @@ -655,6 +660,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -739,6 +745,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -808,6 +815,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -877,6 +885,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -947,6 +956,7 @@ class SimpleSilverkiteTemplateOptions: "holiday_post_num_days": 0, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, diff --git a/greykite/framework/utils/framework_testing_utils.py 
b/greykite/framework/utils/framework_testing_utils.py index 3fb9360..788193a 100644 --- a/greykite/framework/utils/framework_testing_utils.py +++ b/greykite/framework/utils/framework_testing_utils.py @@ -324,8 +324,7 @@ def assert_basic_pipeline_equal(actual: Pipeline, expected: Pipeline): check_keys = [ 'input__date__select_date__column_names', 'input__response__select_val__column_names', - 'input__response__outlier__use_fit_baseline', - 'input__response__outlier__z_cutoff', + 'input__response__outlier__params', 'input__response__null__impute_algorithm', 'input__response__null__impute_all', 'input__response__null__impute_params', @@ -333,8 +332,7 @@ def assert_basic_pipeline_equal(actual: Pipeline, expected: Pipeline): 'input__regressors_numeric__select_reg__column_names', 'input__regressors_numeric__select_reg_numeric__exclude', 'input__regressors_numeric__select_reg_numeric__include', - 'input__regressors_numeric__outlier__use_fit_baseline', - 'input__regressors_numeric__outlier__z_cutoff', + 'input__regressors_numeric__outlier__params', 'input__regressors_numeric__normalize__normalize_algorithm', 'input__regressors_numeric__normalize__normalize_params', 'input__regressors_numeric__null__impute_algorithm', diff --git a/greykite/sklearn/estimator/simple_silverkite_estimator.py b/greykite/sklearn/estimator/simple_silverkite_estimator.py index de6093a..b5d4647 100644 --- a/greykite/sklearn/estimator/simple_silverkite_estimator.py +++ b/greykite/sklearn/estimator/simple_silverkite_estimator.py @@ -117,6 +117,7 @@ def __init__( holiday_post_num_days: int = 2, holiday_pre_post_num_dict: Optional[Dict] = None, daily_event_df_dict: Optional[Dict] = None, + auto_holiday_params: Optional[Dict] = None, daily_event_neighbor_impact: Optional[Union[int, List[int], callable]] = None, daily_event_shifted_effect: Optional[List[str]] = None, auto_growth: bool = False, @@ -175,6 +176,7 @@ def __init__( self.holiday_post_num_days = holiday_post_num_days 
self.holiday_pre_post_num_dict = holiday_pre_post_num_dict self.daily_event_df_dict = daily_event_df_dict + self.auto_holiday_params = auto_holiday_params self.daily_event_neighbor_impact = daily_event_neighbor_impact self.daily_event_shifted_effect = daily_event_shifted_effect self.auto_growth = auto_growth @@ -290,6 +292,7 @@ def fit( holiday_post_num_days=self.holiday_post_num_days, holiday_pre_post_num_dict=self.holiday_pre_post_num_dict, daily_event_df_dict=self.daily_event_df_dict, + auto_holiday_params=self.auto_holiday_params, daily_event_neighbor_impact=self.daily_event_neighbor_impact, daily_event_shifted_effect=self.daily_event_shifted_effect, auto_growth=self.auto_growth, diff --git a/greykite/sklearn/transform/difference_based_outlier_transformer.py b/greykite/sklearn/transform/difference_based_outlier_transformer.py new file mode 100644 index 0000000..d226607 --- /dev/null +++ b/greykite/sklearn/transform/difference_based_outlier_transformer.py @@ -0,0 +1,237 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Yi-Wei Liu + +import numpy as np +import pandas as pd +from sklearn.base import BaseEstimator +from sklearn.base import TransformerMixin +from sklearn.exceptions import NotFittedError + +from greykite.common.features.outlier import IMPLEMENTED_DIFF_METHODS +from greykite.common.features.outlier import TukeyOutlierDetector +from greykite.common.features.outlier import ZScoreOutlierDetector +from greykite.common.logging import LoggingLevelEnum +from greykite.common.logging import log_message + + +class DifferenceBasedOutlierTransformer(BaseEstimator, TransformerMixin): + """Replaces outliers in data with NaN. + Outliers are determined by anomaly scores computed by the differences or ratios between + the observed values and their baseline values. Cutoffs of the scores are derived through + z-score or tukey coefficient methods. Columns are handled independently. If the baseline + is not specified, the observed values will be used as the anomaly scores; together with + the z-score method, the algorithm degenerates to the `ZscoreOutlierTransformer`. + + Parameters + ---------- + method : `str`, default "z_score" + Method used to determine the outliers. Must be either "z_score" or "tukey". + - When `method = "z_score"`, outliers are defined as those y_{t}'s with + absolute z-scores of the anomaly scores larger than ``z_cutoff``.
+ - When `method = "tukey"`, outliers are defined as those y_{t}'s with anomaly + scores larger than `Q3 + tukey_cutoff * IQR`, or smaller than `Q1 - tukey_cutoff * IQR`. + Here Q1, Q3, and IQR are the first-quartile, third-quartile, and inter-quartile + range of the anomaly scores, respectively. + score_type : `str`, default "difference" + Formula with respect to the baseline values to compute anomaly scores. + Must be either "difference" or "ratio". + Given a time series y_{t} and its baseline values b_{t}, the anomaly scores for + - "difference" is: y_{t} - b_{t} + - "ratio" is: (y_{t} / b_{t}) - 1 + params : `dict` [`str`, any] or None, default None + A dictionary with seven keys: + + - "diff_method": `DiffMethod` or None, default None + An object of the `DiffMethod` class, describing `name` and `param` of "diff_method". + See `~greykite.common.features.outlier.DiffMethod`. + - "agg_func": `numpy.functions` or None, default None + The function to compute baseline values with arguments specified in "lag_orders". + If None, the anomaly scores are the actual values of the time series y_{t}. + - "lag_orders": `list [`int`]` or None, default None + Values in the observed data used to compute the baseline. For example, if + `lag_orders = [-7, -1, 1, 7]` and `agg_func = numpy.nanmean`, the baseline + value for y_{t} is the average of (y_{t-7}, y_{t-1}, y_{t+1}, y_{t+7}). + If None, the anomaly scores are the actual values of the time series y_{t}. + - "trim_percent": `float` or None, default None + Trimming percentage on anomaly scores for calculating the thresholds. + This removes `trim_percent` of anomaly scores in symmetric fashion from both ends and + then calculates the quantities needed (e.g., mean, standard deviation, quartiles). + For example, in `method = "z_score"`, this will remove extreme values of anomaly scores + to calculate the mean and variance for computing the z-scores of anomaly scores. 
+ - "z_cutoff": `float` or None, default None + The cutoff on the z-scores of anomaly scores to determine outliers. + Effective only when `method = "z_score"`. If None, no outliers are removed. + - "tukey_cutoff": `float` or None, default None + The tukey coefficient for anomaly scores to determine outliers. + Effective only when `method = "tukey"`. If None, no outliers are removed. + - "max_outlier_percent": `float` or None, default None + Maximum percentage of outliers to be removed. Range from 0 to 100. + When specified, for example `max_outlier_percent = 5`, the maximum portion of outliers + to be removed is 5% of the total number of data. If the original outliers detected + are less than 5%, the result is unaffected; if original outliers are more than 5%, + then only the top 5% outliers with the most extreme anomaly scores will be removed. + + Attributes + ---------- + score : `pandas.DataFrame` + Anomaly scores for each value in the input data. The anomaly scores can be computed + with the function `fit` in the class. + _is_fitted : `bool` + Whether the transformer is fitted. + """ + def __init__( + self, + method: str = "z_score", + score_type: str = "difference", + params: dict | None = None,): + self.score_type = score_type + self.method = method + self.params = params + self.score = None + self._is_fitted = False + + def fit(self, X, y=None): + """Computes the column-wise anomaly scores, stored as ``score`` attribute. + + Parameters + ---------- + X : `pandas.DataFrame` + Training input data. e.g. each column is a timeseries. + Columns are expected to be numeric. + y : None + There is no need of a target in a transformer, yet the pipeline API + requires this parameter. + + Returns + ------- + self : object + Returns self. + """ + # Gets variables from the `params` dictionary because __init__ is only run in initialization.
+ if self.params is not None: + self.diff_method = self.params.get("diff_method") + self.agg_func = self.params.get("agg_func") + self.lag_orders = self.params.get("lag_orders") + self.trim_percent = self.params.get("trim_percent") + self.z_cutoff = self.params.get("z_cutoff") + self.tukey_cutoff = self.params.get("tukey_cutoff") + self.max_outlier_percent = self.params.get("max_outlier_percent") + else: + self.diff_method = None + self.agg_func = None + self.lag_orders = None + self.trim_percent = None + self.z_cutoff = None + self.tukey_cutoff = None + self.max_outlier_percent = None + # Checks if the input variables are valid. + if self.score_type not in ["difference", "ratio"]: + raise NotImplementedError( + f"{self.score_type} is an invalid 'score_type': " + "must be either 'difference' or 'ratio'.") + if self.method not in ["z_score", "tukey"]: + raise NotImplementedError( + f"{self.method} is an invalid 'method': " + "must be either 'z_score' or 'tukey'.") + self._is_fitted = True + # If no threshold specified, does nothing. + if self.method == "z_score" and self.z_cutoff is None: + return self + if self.method == "tukey" and self.tukey_cutoff is None: + return self + # If the name of `diff_method` is in the available list, uses the `diff_method` in transform. + # Otherwise, sets `self.diff_method` to None. + if self.diff_method is not None and self.diff_method.name in IMPLEMENTED_DIFF_METHODS: + self.score = X + return self + else: + self.diff_method = None + if self.agg_func is not None and self.lag_orders is not None: + # Computes the baseline values. 
+ lag_orders_list = [] + for lag_order in self.lag_orders: + lag_orders_list += [X.shift(-lag_order)] + baseline = pd.DataFrame(self.agg_func(lag_orders_list, axis=0)) + baseline.columns = X.columns + baseline.index = X.index + if self.score_type == "difference": + self.score = X - baseline + elif self.score_type == "ratio": + self.score = (X / baseline) - 1 + else: + self.score = X + return self + + def transform(self, X): + """Replaces outliers with NaN. + + Parameters + ---------- + X : `pandas.DataFrame` + Data to transform. e.g. each column is a timeseries. + Columns are expected to be numeric. + + Returns + ------- + X_outliers_removed : `pandas.DataFrame` + A copy of the data frame with original values and outliers replaced with NaN. + """ + if self._is_fitted is False: + raise NotFittedError( + "This instance is not fitted yet. Call `fit` with appropriate arguments " + "before calling `transform`.") + result = X.copy() + if self.score is None: + return result + if self.method == "z_score": + detector = ZScoreOutlierDetector( + z_score_cutoff=self.z_cutoff, + trim_percent=self.trim_percent, + diff_method=self.diff_method) + elif self.method == "tukey": + detector = TukeyOutlierDetector( + tukey_cutoff=self.tukey_cutoff, + iqr_lower=0.25, + iqr_upper=0.75, + trim_percent=self.trim_percent, + diff_method=self.diff_method) + # Creates a dataframe to store the outlier scores / indices for each column in `score`. + outlier_scores = pd.DataFrame(0, index=self.score.index, columns=self.score.columns) + outlier_indices = pd.DataFrame(0, index=self.score.index, columns=self.score.columns) + for col_name in self.score.columns: + detector.fit(self.score[col_name]) + outlier_scores[col_name] = detector.fitted.scores + outlier_indices[col_name] = np.array(detector.fitted.is_outlier) + # Checks for each column if the outliers are more than `max_outlier_percent`. 
+ if self.max_outlier_percent is not None: + for col_name in outlier_indices.columns: + if outlier_indices[col_name].mean() > (self.max_outlier_percent / 100): + upper_cutoff = outlier_scores[col_name].quantile(1 - ((self.max_outlier_percent / 100) / 2)) + lower_cutoff = outlier_scores[col_name].quantile((self.max_outlier_percent / 100) / 2) + outlier_indices[col_name] = (outlier_scores[col_name] > upper_cutoff) | (outlier_scores[col_name] < lower_cutoff) + + if np.any(outlier_indices): + total_na = outlier_indices.sum().sum() + log_message(f"Detected {total_na} outlier(s).", LoggingLevelEnum.INFO) + result = result.mask(outlier_indices) + + return result diff --git a/greykite/tests/algo/changepoint/adalasso/test_changepoint_detector.py b/greykite/tests/algo/changepoint/adalasso/test_changepoint_detector.py index 9303879..bbebe13 100644 --- a/greykite/tests/algo/changepoint/adalasso/test_changepoint_detector.py +++ b/greykite/tests/algo/changepoint/adalasso/test_changepoint_detector.py @@ -10,9 +10,11 @@ from greykite.algo.changepoint.adalasso.changepoint_detector import ChangepointDetector from greykite.algo.changepoint.adalasso.changepoint_detector import get_changepoints_dict from greykite.algo.changepoint.adalasso.changepoint_detector import get_seasonality_changepoints +from greykite.algo.changepoint.shift_detection.shift_detector import ShiftDetection from greykite.common.data_loader import DataLoader from greykite.common.logging import LOGGER_NAME from greykite.common.testing_utils import generate_df_for_tests +from greykite.common.testing_utils import generate_df_with_arbitrary_trends_and_shifts from greykite.common.testing_utils import generate_test_changepoint_df @@ -311,6 +313,131 @@ def test_find_trend_changepoints(hourly_data): assert model.trend_df.shape[1] > 100 + 1 + 8 * 2 # checks extra columns are created for varying yearly seasonality +def test_find_trend_changepoints_with_shift_detector(hourly_data): + df = 
generate_df_with_arbitrary_trends_and_shifts( + start_date="2015-01-01", + length=365*4, + freq="D", + seed=10, + trend_slopes=[-1., 1.], + trend_intervals=[.5, 1.], + level_shifts=[(.05, .15), (.25, .35), (.65, .75), (.85, .95)], + level_shift_magnitudes=[100, -100, -100, 100] + ) + + model = ChangepointDetector() + # test class variables are initialized as None + assert model.trend_model is None + assert model.trend_coef is None + assert model.trend_intercept is None + assert model.trend_changepoints is None + assert model.trend_potential_changepoint_n is None + assert model.trend_df is None + assert model.y is None + assert model.original_df is None + assert model.value_col is None + assert model.time_col is None + assert model.adaptive_lasso_coef is None + # model training with default values + model.find_trend_changepoints( + df=df, + time_col="timestamp", + value_col="y" + ) + trend_df_shape = model.trend_df.shape + assert isinstance(model.trend_model, RegressorMixin) + assert model.trend_model.coef_.shape[0] == 100 + 1 + 8 * 2 + assert model.trend_coef.shape[0] == 100 + 1 + 8 * 2 + assert model.trend_intercept is not None + assert model.trend_changepoints is not None + assert model.trend_potential_changepoint_n == 100 + assert model.trend_df.shape[1] == 100 + 1 + 8 * 2 + assert model.original_df.shape == df.shape + assert model.time_col is not None + assert model.value_col is not None + assert model.adaptive_lasso_coef[1].shape[0] == 100 + 1 + 8 * 2 + assert model.y.index[0] not in model.trend_changepoints + # model training with default values and shift detector + model.find_trend_changepoints( + df=df, + time_col="timestamp", + value_col="y", + shift_detector=ShiftDetection() + ) + trend_df_with_shifts_shape = model.trend_df.shape + assert isinstance(model.trend_model, RegressorMixin) + assert model.trend_model.coef_.shape[0] >= 100 + 1 + 8 * 2 + assert model.trend_coef.shape[0] >= 100 + 1 + 8 * 2 + assert model.trend_intercept is not None + assert 
model.trend_changepoints is not None + assert model.trend_potential_changepoint_n == 100 + assert model.trend_df.shape[1] >= 100 + 1 + 8 * 2 + assert model.original_df.shape == df.shape + assert model.time_col is not None + assert model.value_col is not None + assert model.adaptive_lasso_coef[1].shape[0] >= 100 + 1 + 8 * 2 + assert model.y.index[0] not in model.trend_changepoints + trend_df_with_shifts_shape = model.trend_df.shape + # check that the found changepoints are <= to changepoints with level shift regressors + assert trend_df_shape[1] <= trend_df_with_shifts_shape[1] + # check that trend_df has more columns, check column names for levelshift regressors + assert any("levelshift" in column for column in model.trend_df.columns) + # test a given ``regularization_strength`` + model = ChangepointDetector() + model.find_trend_changepoints( + df=df, + time_col="timestamp", + value_col="y", + regularization_strength=1.0, + shift_detector=ShiftDetection() + ) + assert isinstance(model.trend_model, RegressorMixin) + assert model.trend_model.coef_.shape[0] >= 100 + 1 + 8 * 2 + assert model.trend_coef.shape[0] >= 100 + 1 + 8 * 2 + assert model.trend_intercept is not None + assert model.trend_changepoints is not None + assert model.trend_potential_changepoint_n == 100 + assert model.trend_df.shape[1] >= 100 + 1 + 8 * 2 + assert model.original_df.shape == df.shape + assert model.time_col is not None + assert model.value_col is not None + assert model.adaptive_lasso_coef[1].shape[0] >= 100 + 1 + 8 * 2 + assert model.y.index[0] not in model.trend_changepoints + assert model.trend_changepoints == [] + model.find_trend_changepoints( + df=df, + time_col="timestamp", + value_col="y", + regularization_strength=0.5, + shift_detector=ShiftDetection() + ) + # ``regularization_strength`` between 0 and 1 indicates at least one change point + assert len(model.trend_changepoints) > 0 + model.find_trend_changepoints( + df=df, + time_col="timestamp", + value_col="y", + 
actual_changepoint_min_distance="D", + regularization_strength=0.0, + shift_detector=ShiftDetection() + ) + # ``regularization_strength`` == 0.0 indicates all potential change points are present + assert len(model.trend_changepoints) == 100 + with pytest.raises(ValueError, + match="In potential_changepoint_distance, the maximal unit is 'D', " + "i.e., you may use units no more than 'D' such as" + "'10D', '5H', '100T', '200S'. The reason is that 'W', 'M' " + "or higher has either cycles or indefinite number of days, " + "thus is not parsable by pandas as timedelta."): + model.find_trend_changepoints( + df=df, + time_col="timestamp", + value_col="y", + potential_changepoint_distance="2M", + shift_detector=ShiftDetection() + ) + + def test_find_trend_changepoints_slow(hourly_data): """Tests the trend changepoint detection when fast trend estimation is turned off.""" dl = DataLoader() diff --git a/greykite/tests/algo/changepoint/shift_detection/test_shift_detector.py b/greykite/tests/algo/changepoint/shift_detection/test_shift_detector.py new file mode 100644 index 0000000..c035a71 --- /dev/null +++ b/greykite/tests/algo/changepoint/shift_detection/test_shift_detector.py @@ -0,0 +1,211 @@ +from datetime import datetime + +import pandas as pd +import pytest + +from greykite.algo.changepoint.shift_detection.shift_detector import ShiftDetection +from greykite.common.constants import LEVELSHIFT_COL_PREFIX_SHORT + + +# Test common time and value column names for Greykite and OLOF respectively. 
+@pytest.mark.parametrize("time_col, value_col", [("ts", "actual"), ("timestamp", "value")]) +def test_detect_daily(time_col: str, value_col: str): + # create input_df + input_val_ls = [100] * 10 + [200] * 10 + [300] * 10 + input_ts_ls = pd.date_range(datetime(2020, 1, 1), freq="D", periods=30) + input_df = pd.DataFrame({time_col: input_ts_ls, value_col: input_val_ls}) + + # create expected_df + expected_df = pd.DataFrame({ + time_col: pd.date_range(datetime(2020, 1, 1), freq="D", periods=35), + value_col: [100] * 10 + [200] * 10 + [300] * 10 + [None] * 5, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_11_00_00": [0] * 10 + [1] * 25, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_21_00_00": [0] * 20 + [1] * 15 + }) + + # call the function + detector = ShiftDetection() + output_regressor_col, output_df = detector.detect( + input_df, + time_col=time_col, + value_col=value_col, + forecast_horizon=5, + freq="D", + z_score_cutoff=3 + ) + + # unit test + pd.testing.assert_frame_equal(output_df, expected_df) + assert output_regressor_col == [f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_11_00_00", + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_21_00_00"] + + +@pytest.mark.parametrize("time_col, value_col", [("ts", "actual"), ("timestamp", "value")]) +def test_detect_weekly(time_col: str, value_col: str): + # create input_df + input_val_ls = [100] * 10 + [200] * 10 + [300] * 10 + input_ts_ls = pd.date_range(datetime(2020, 1, 1), freq="W", periods=30) + input_df = pd.DataFrame({time_col: input_ts_ls, value_col: input_val_ls}) + + # create expected_df + expected_df = pd.DataFrame({ + time_col: pd.date_range(datetime(2020, 1, 1), freq="W", periods=35), + value_col: [100] * 10 + [200] * 10 + [300] * 10 + [None] * 5, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_03_15_00_00": [0] * 10 + [1] * 25, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_05_24_00_00": [0] * 20 + [1] * 15 + }) + + # call the function + detector = ShiftDetection() + output_regressor_col, output_df = detector.detect( + input_df, + 
time_col=time_col, + value_col=value_col, + forecast_horizon=5, + freq="W", + z_score_cutoff=3 + ) + + # unit test + pd.testing.assert_frame_equal(output_df, expected_df) + assert output_regressor_col == [f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_03_15_00_00", + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_05_24_00_00"] + + +def test_invalid_freq(time_col="ts", value_col="actual"): + # create input_df + input_val_ls = [100] * 10 + [200] * 10 + [300] * 10 + input_ts_ls = pd.date_range(datetime(2020, 1, 1), freq="S", periods=30) + input_df = pd.DataFrame({time_col: input_ts_ls, value_col: input_val_ls}) + + # call the function + detector = ShiftDetection() + with pytest.raises(ValueError): + output_regressor_col, output_df = detector.detect( + input_df, + time_col=time_col, + value_col=value_col, + forecast_horizon=5, + freq="S", + z_score_cutoff=3 + ) + + +def test_find_shifts(): + # create input_df + input_val_ls = [100] * 10 + [200] * 10 + [300] * 10 + input_ts_ls = pd.date_range(datetime(2020, 1, 1), freq="D", periods=30) + input_df = pd.DataFrame({"ts": input_ts_ls, "actual": input_val_ls}) + + # create expected results + expected_shift_dates = [ + (datetime(2020, 1, 11, 0, 0, 0), datetime(2020, 1, 11, 0, 0, 0)), + (datetime(2020, 1, 21, 0, 0, 0), datetime(2020, 1, 21, 0, 0, 0))] + + expected_df_find_shifts = pd.DataFrame({ + "ts": pd.date_range(datetime(2020, 1, 1), freq="D", periods=30), + "actual": [100] * 10 + [200] * 10 + [300] * 10, + "actual_diff": [None] + [0] * 9 + [100] + [0] * 9 + [100] + [0] * 9, + "zscore": [None] + [-0.267432] * 9 + [3.610330] + [-0.267432] * 9 + [3.610330] + [-0.267432] * 9 + }) + + # call the function + detector = ShiftDetection() + output_df_find_shifts, output_shift_dates = detector.find_shifts( + input_df, + time_col="ts", + value_col="actual", + z_score_cutoff=3 + ) + + # unit test + assert output_shift_dates == expected_shift_dates + pd.testing.assert_frame_equal(output_df_find_shifts, expected_df_find_shifts) + + +def 
test_find_no_shift(): + # create input_df with no shift + input_val_ls = list(range(30)) + input_ts_ls = pd.date_range(datetime(2020, 1, 1), freq="D", periods=30) + input_df = pd.DataFrame({"ts": input_ts_ls, "actual": input_val_ls}) + + # create expected results + expected_shift_dates = [] + + expected_df_find_shifts = pd.DataFrame({ + "ts": pd.date_range(datetime(2020, 1, 1), freq="D", periods=30), + "actual": list(range(30)), + "actual_diff": [None] + [1] * 29, + "zscore": [float('nan')] * 30 # actual_diff's standard deviation is 0 so zscore is 0/0=NaN + }) + + # call the function + detector = ShiftDetection() + output_df_find_shifts, output_shift_dates = detector.find_shifts( + input_df, + time_col="ts", + value_col="actual", + z_score_cutoff=3 + ) + + # unit test + assert output_shift_dates == expected_shift_dates + pd.testing.assert_frame_equal(output_df_find_shifts, expected_df_find_shifts) + + +def test_create_df_with_regressor(): + # create inputs + input_shiftsm = [ + (datetime(2020, 1, 11, 0, 0, 0), datetime(2020, 1, 11, 0, 0, 0)), + (datetime(2020, 1, 21, 0, 0, 0), datetime(2020, 1, 21, 0, 0, 0))] + input_val_ls = [100] * 10 + [200] * 10 + [300] * 10 + input_ts_ls = pd.date_range(datetime(2020, 1, 1), freq="D", periods=30) + input_df = pd.DataFrame({"ts": input_ts_ls, "actual": input_val_ls}) + + # create expected outputs + expected_df_regressor = pd.DataFrame({ + "ts": pd.date_range(datetime(2020, 1, 1), freq="D", periods=30), + "actual": [100] * 10 + [200] * 10 + [300] * 10, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_11_00_00": [0] * 10 + [1] * 20, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_21_00_00": [0] * 20 + [1] * 10 + }) + + # call the function + detector = ShiftDetection() + output_df_regressor = detector.create_df_with_regressor(input_df, "ts", input_shiftsm) + + # unit test + pd.testing.assert_frame_equal(output_df_regressor, expected_df_regressor) + + +def test_create_regressor_for_future_dates(): + # create inputs + input_df_regressor = 
pd.DataFrame({ + "ts": pd.date_range(datetime(2020, 1, 1), freq="D", periods=30), + "actual": [100] * 10 + [200] * 10 + [300] * 10, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_11_00_00": [0] * 10 + [1] * 20, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_21_00_00": [0] * 20 + [1] * 10 + }) + + # create expected outputs + expected_df = pd.DataFrame({ + "ts": pd.date_range(datetime(2020, 1, 1), freq="D", periods=35), + "actual": [100] * 10 + [200] * 10 + [300] * 10 + [None] * 5, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_11_00_00": [0] * 10 + [1] * 25, + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_21_00_00": [0] * 20 + [1] * 15 + }) + + # call the function + detector = ShiftDetection() + output_regressor_col, output_df = detector.create_regressor_for_future_dates( + input_df_regressor, + time_col="ts", + value_col="actual", + forecast_horizon=5, + freq="D", + ) + + # unit test + pd.testing.assert_frame_equal(output_df, expected_df) + assert output_regressor_col == [f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_11_00_00", + f"{LEVELSHIFT_COL_PREFIX_SHORT}_2020_01_21_00_00"] diff --git a/greykite/tests/algo/common/test_holiday_grouper.py b/greykite/tests/algo/common/test_holiday_grouper.py index 5b2743c..7ba4471 100644 --- a/greykite/tests/algo/common/test_holiday_grouper.py +++ b/greykite/tests/algo/common/test_holiday_grouper.py @@ -47,6 +47,8 @@ def test_expand_holiday_df_with_suffix(holiday_df): holiday_df=holiday_df, holiday_date_col=EVENT_DF_DATE_COL, holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=0, + holiday_impact_post_num_days=0, holiday_impact_dict=None, get_suffix_func=None ).sort_values(by=[EVENT_DF_DATE_COL, EVENT_DF_LABEL_COL]).reset_index(drop=True) @@ -58,17 +60,41 @@ def test_expand_holiday_df_with_suffix(holiday_df): holiday_df=holiday_df, holiday_date_col=EVENT_DF_DATE_COL, holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=0, + holiday_impact_post_num_days=0, holiday_impact_dict={"unknown": [1, 1]}, get_suffix_func=None 
).sort_values(by=[EVENT_DF_DATE_COL, EVENT_DF_LABEL_COL]).reset_index(drop=True) assert_equal(expanded_holiday_df, holiday_df) - # Tests the case when only neighboring days are added. + # Tests the case when only neighboring days are added and only through `holiday_impact_pre_num_days` and + # `holiday_impact_post_num_days`. expanded_holiday_df = HolidayGrouper.expand_holiday_df_with_suffix( holiday_df=holiday_df, holiday_date_col=EVENT_DF_DATE_COL, holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=1, + holiday_impact_post_num_days=2, + holiday_impact_dict=None, + get_suffix_func=None + ).sort_values(by=[EVENT_DF_DATE_COL, EVENT_DF_LABEL_COL]).reset_index(drop=True) + + # Spot checks a few events are being correctly added. + assert "Christmas Day_minus_1" in expanded_holiday_df[EVENT_DF_LABEL_COL].tolist() + assert "New Year's Day_plus_2" in expanded_holiday_df[EVENT_DF_LABEL_COL].tolist() + + # Checks the expected total number of events. + expected_diff = len(holiday_df) * (1+2) + assert len(expanded_holiday_df) - len(holiday_df) == expected_diff + + # Tests the case when only neighboring days are added and only through `holiday_impact_dict`. + expanded_holiday_df = HolidayGrouper.expand_holiday_df_with_suffix( + holiday_df=holiday_df, + holiday_date_col=EVENT_DF_DATE_COL, + holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=0, + holiday_impact_post_num_days=0, holiday_impact_dict=HOLIDAY_IMPACT_DICT, get_suffix_func=None ).sort_values(by=[EVENT_DF_DATE_COL, EVENT_DF_LABEL_COL]).reset_index(drop=True) @@ -85,11 +111,39 @@ def test_expand_holiday_df_with_suffix(holiday_df): expected_diff += additional_days assert len(expanded_holiday_df) - len(holiday_df) == expected_diff + # Tests the case when neighboring days are added through `holiday_impact_pre_num_days`, + # `holiday_impact_post_num_days` and `holiday_impact_dict`. 
+ expanded_holiday_df = HolidayGrouper.expand_holiday_df_with_suffix( + holiday_df=holiday_df, + holiday_date_col=EVENT_DF_DATE_COL, + holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=8, + holiday_impact_post_num_days=0, + holiday_impact_dict=HOLIDAY_IMPACT_DICT, + get_suffix_func=None + ).sort_values(by=[EVENT_DF_DATE_COL, EVENT_DF_LABEL_COL]).reset_index(drop=True) + + # Spot checks a few events are being correctly added or not added. + assert "Veterans Day_minus_8" in expanded_holiday_df[EVENT_DF_LABEL_COL].tolist() + assert "Veterans Day_plus_1" not in expanded_holiday_df[EVENT_DF_LABEL_COL].tolist() + assert "New Year's Day_plus_4" in expanded_holiday_df[EVENT_DF_LABEL_COL].tolist() + assert "New Year's Day_minus_8" not in expanded_holiday_df[EVENT_DF_LABEL_COL].tolist() + + # Checks the expected total number of events. + expected_diff = len(holiday_df[~holiday_df[EVENT_DF_LABEL_COL].isin(HOLIDAY_IMPACT_DICT.keys())]) * 8 + for event, (pre, post) in HOLIDAY_IMPACT_DICT.items(): + count = (holiday_df[EVENT_DF_LABEL_COL] == event).sum() + additional_days = (pre + post) * count + expected_diff += additional_days + assert len(expanded_holiday_df) - len(holiday_df) == expected_diff + # Tests the case when both neighboring days and suffixes are added. 
expanded_holiday_df = HolidayGrouper.expand_holiday_df_with_suffix( holiday_df=holiday_df, holiday_date_col=EVENT_DF_DATE_COL, holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=0, + holiday_impact_post_num_days=0, holiday_impact_dict=HOLIDAY_IMPACT_DICT, get_suffix_func=get_weekday_weekend_suffix ).sort_values(by=[EVENT_DF_DATE_COL, EVENT_DF_LABEL_COL]).reset_index(drop=True) @@ -114,6 +168,8 @@ def test_expand_holiday_df_with_suffix(holiday_df): holiday_df=holiday_df, holiday_date_col=EVENT_DF_DATE_COL, holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=0, + holiday_impact_post_num_days=0, holiday_impact_dict=None, get_suffix_func="unknown" ) @@ -129,6 +185,8 @@ def test_holiday_grouper_init(daily_df, holiday_df): holiday_df=holiday_df, holiday_date_col=EVENT_DF_DATE_COL, holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=0, + holiday_impact_post_num_days=0, holiday_impact_dict=None, get_suffix_func=None ) @@ -152,6 +210,8 @@ def test_group_holidays(daily_df, holiday_df): holiday_df=holiday_df, holiday_date_col=EVENT_DF_DATE_COL, holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=0, + holiday_impact_post_num_days=0, holiday_impact_dict=HOLIDAY_IMPACT_DICT, get_suffix_func=default_get_suffix_func ) diff --git a/greykite/tests/algo/common/test_holiday_utils.py b/greykite/tests/algo/common/test_holiday_utils.py index 670beaf..0e2291a 100644 --- a/greykite/tests/algo/common/test_holiday_utils.py +++ b/greykite/tests/algo/common/test_holiday_utils.py @@ -1,7 +1,12 @@ import pandas as pd +import pytest +from greykite.algo.common.holiday_utils import add_shifted_events +from greykite.algo.common.holiday_utils import get_autoreg_holiday_interactions from greykite.algo.common.holiday_utils import get_dow_grouped_suffix from greykite.algo.common.holiday_utils import get_weekday_weekend_suffix +from greykite.common.constants import EVENT_DF_DATE_COL +from greykite.common.constants import EVENT_DF_LABEL_COL def 
test_get_dow_grouped_suffix(): @@ -10,7 +15,7 @@ def test_get_dow_grouped_suffix(): assert get_dow_grouped_suffix(date) == "_Sun" date = pd.to_datetime("2023-01-02") - assert get_dow_grouped_suffix(date) == "_WD" + assert get_dow_grouped_suffix(date) == "_Mon/Fri" date = pd.to_datetime("2023-01-03") assert get_dow_grouped_suffix(date) == "_WD" @@ -22,7 +27,7 @@ def test_get_dow_grouped_suffix(): assert get_dow_grouped_suffix(date) == "_WD" date = pd.to_datetime("2023-01-06") - assert get_dow_grouped_suffix(date) == "_WD" + assert get_dow_grouped_suffix(date) == "_Mon/Fri" date = pd.to_datetime("2023-01-07") assert get_dow_grouped_suffix(date) == "_Sat" @@ -50,3 +55,61 @@ def test_get_weekday_weekend_suffix(): date = pd.to_datetime("2023-01-07") assert get_weekday_weekend_suffix(date) == "_WE" + + +@pytest.fixture +def daily_event_df_dict(): + """A sample holiday configuration.""" + daily_event_df_dict = { + "New Years Day": pd.DataFrame({ + EVENT_DF_DATE_COL: pd.to_datetime(["2020-01-01", "2021-01-01", "2022-01-01"]), + EVENT_DF_LABEL_COL: "event" + }), + "Christmas Day": pd.DataFrame({ + EVENT_DF_DATE_COL: pd.to_datetime(["2020-12-25", "2021-12-25", "2022-12-25"]), + EVENT_DF_LABEL_COL: "event" + }) + } + return daily_event_df_dict + + +def test_get_autoreg_holiday_interactions(daily_event_df_dict): + """Tests `get_autoreg_holiday_interactions` function.""" + interaction_terms = get_autoreg_holiday_interactions( + daily_event_df_dict=daily_event_df_dict, + lag_names=["y_lag1", "y_avglag_7_14_21"] + ) + assert interaction_terms == [ + "C(Q('events_Christmas Day'), levels=['', 'event']):y_lag1", + "C(Q('events_New Years Day'), levels=['', 'event']):y_lag1", + "C(Q('events_Christmas Day'), levels=['', 'event']):y_avglag_7_14_21", + "C(Q('events_New Years Day'), levels=['', 'event']):y_avglag_7_14_21" + ] + + +def test_add_shifted_events(daily_event_df_dict): + """Tests `add_shifted_events` function.""" + shifted_events_dict = 
add_shifted_events(daily_event_df_dict=daily_event_df_dict, shifted_effect_lags=["1D", "-1D"]) + new_daily_event_df_dict = shifted_events_dict["new_daily_event_df_dict"] + shifted_events_cols = shifted_events_dict["shifted_events_cols"] + drop_pred_cols = shifted_events_dict["drop_pred_cols"] + assert sorted(new_daily_event_df_dict.keys()) == [ + "Christmas Day", + "Christmas Day_1D_after", + "Christmas Day_1D_before", + "New Years Day", + "New Years Day_1D_after", + "New Years Day_1D_before" + ] + assert sorted(shifted_events_cols) == [ + "events_Christmas Day_1D_after", + "events_Christmas Day_1D_before", + "events_New Years Day_1D_after", + "events_New Years Day_1D_before" + ] + assert sorted(drop_pred_cols) == [ + "C(Q('events_Christmas Day_1D_after'), levels=['', 'Christmas Day_1D_after'])", + "C(Q('events_Christmas Day_1D_before'), levels=['', 'Christmas Day_1D_before'])", + "C(Q('events_New Years Day_1D_after'), levels=['', 'New Years Day_1D_after'])", + "C(Q('events_New Years Day_1D_before'), levels=['', 'New Years Day_1D_before'])" + ] diff --git a/greykite/tests/algo/common/test_ml_models.py b/greykite/tests/algo/common/test_ml_models.py index e686990..11baa46 100644 --- a/greykite/tests/algo/common/test_ml_models.py +++ b/greykite/tests/algo/common/test_ml_models.py @@ -581,6 +581,7 @@ def test_fit_ml_model(): "h_mat", "p_effective", "sigma_scaler", + "x_mean", "fitted_df"] assert (trained_model["y"] == df["y"]).all() @@ -1562,6 +1563,8 @@ def test_dummy(): p_effective = trained_model["p_effective"] assert round(p_effective, 2) == 2 assert trained_model["sigma_scaler"] == np.sqrt((n - 1) / (n - p_effective)) + assert trained_model["x_mean"] is None # When the model is linear, `"x_mean"` is `None`. + assert trained_model["h_mat"].shape == trained_model["x_mat"].shape[::-1] def test_fit_ml_model_with_evaluation_nan(): @@ -1586,6 +1589,8 @@ def test_fit_ml_model_with_evaluation_nan(): # Since the design matrix is singular, variance scaling is skipped. 
assert "Zero degrees of freedom" in record[1].message.args[0] assert trained_model["sigma_scaler"] is None + assert trained_model["x_mean"] is None # When the model is linear, `"x_mean"` is `None`. + assert trained_model["h_mat"].shape == trained_model["x_mat"].shape[::-1] assert_equal(trained_model["y"], df["y"].loc[(0, 1, 3), ]) diff --git a/greykite/tests/algo/forecast/silverkite/test_auto_config.py b/greykite/tests/algo/forecast/silverkite/test_auto_config.py index 8a5c3c8..f800b3f 100644 --- a/greykite/tests/algo/forecast/silverkite/test_auto_config.py +++ b/greykite/tests/algo/forecast/silverkite/test_auto_config.py @@ -1,6 +1,8 @@ +import numpy as np import pandas as pd import pytest +from greykite.algo.common.holiday_grouper import HolidayGrouper from greykite.algo.forecast.silverkite.auto_config import get_auto_growth from greykite.algo.forecast.silverkite.auto_config import get_auto_holidays from greykite.algo.forecast.silverkite.auto_config import get_auto_seasonality @@ -9,6 +11,8 @@ from greykite.common.constants import TIME_COL from greykite.common.constants import VALUE_COL from greykite.common.data_loader import DataLoader +from greykite.common.features.timeseries_features import get_holidays +from greykite.common.python_utils import assert_equal @pytest.fixture @@ -71,54 +75,299 @@ def test_get_auto_seasonality_override(df_daily): def test_get_auto_holiday(df_daily): - """Tests automatic holidays.""" - custom_event = pd.DataFrame({ - EVENT_DF_DATE_COL: pd.to_datetime(["2015-03-03", "2016-03-03", "2017-03-03"]), - EVENT_DF_LABEL_COL: "threethree" - }) - holidays = get_auto_holidays( + """Tests automatic holidays if the return is the same as calling `HolidayGrouper`.""" + # Initializes inputs that will be used in both cases. 
+ start_year = 2007 + end_year = 2016 + pre_num = 2 + post_num = 2 + pre_post_num_dict = {"New Year's Day": (1, 3)} + holiday_lookup_countries = ["US"] + + # Constructs `daily_event_df_dict` through directly calling `HolidayGrouper`. + # Constructs `holiday_df`. + holiday_df_dict = get_holidays( + countries=holiday_lookup_countries, + year_start=start_year - 1, + year_end=end_year + 1) + + holiday_df_list = [holidays for _, holidays in holiday_df_dict.items()] + holiday_df = pd.concat(holiday_df_list) + # Removes the observed holidays and only keep the original holidays. + holiday_df = holiday_df[~holiday_df[EVENT_DF_LABEL_COL].str.contains("Observed")] + + # Calls `HolidayGrouper`. + hg = HolidayGrouper( df=df_daily, time_col=TIME_COL, value_col=VALUE_COL, - countries=["UnitedStates"], - daily_event_dict_override=dict( - custom_event=custom_event - ) + holiday_df=holiday_df, + holiday_date_col=EVENT_DF_DATE_COL, + holiday_name_col=EVENT_DF_LABEL_COL, + holiday_impact_pre_num_days=pre_num, + holiday_impact_post_num_days=post_num, + holiday_impact_dict=pre_post_num_dict ) - assert len(holidays) == 34 # Only United States is used. - assert holidays["custom_event"].equals(custom_event) - assert "Holiday_positive_group" in holidays - assert "Holiday_negative_group" in holidays - assert "UnitedKingdom_Christmas Day_minus_1" not in holidays - assert "UnitedStates_Labor Day" in holidays - - -def test_get_auto_holiday_super_daily(df_daily): - """Tests automatic holidays for super daily data.""" - custom_event = pd.DataFrame({ - EVENT_DF_DATE_COL: pd.to_datetime(["2015-03-03", "2016-03-03", "2017-03-03"]), - EVENT_DF_LABEL_COL: "threethree" - }) - df = df_daily.resample("7D", on=TIME_COL).mean().reset_index(drop=False) - # With custom event, result only has custom event. 
- holidays = get_auto_holidays( - df=df, + + hg.group_holidays() + daily_event_df_dict_constructed = hg.result_dict["daily_event_df_dict"] + + # Constructs `daily_event_df_dict` through `get_auto_holidays` with country list and asserts the result is the same. + daily_event_df_dict_from_auto_country = get_auto_holidays( + df=df_daily, time_col=TIME_COL, value_col=VALUE_COL, - daily_event_dict_override=dict( - custom_event=custom_event + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=holiday_lookup_countries, + holidays_to_model_separately=None, + daily_event_df_dict=None, + auto_holiday_params=None + ) + assert_equal(daily_event_df_dict_constructed, daily_event_df_dict_from_auto_country) + + # Constructs `daily_event_df_dict` through `get_auto_holidays` with external `daily_event_df_dict` and asserts the + # result is the same. + + # Constructs `daily_event_df_dict_input` for input based on `holiday_df`. + daily_event_df_dict_input = {key: value for key, value in holiday_df.groupby(EVENT_DF_LABEL_COL)} + daily_event_df_dict_from_auto_input = get_auto_holidays( + df=df_daily, + time_col=TIME_COL, + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=[], + holidays_to_model_separately=None, + daily_event_df_dict=daily_event_df_dict_input, + auto_holiday_params=None + ) + assert_equal(daily_event_df_dict_constructed, daily_event_df_dict_from_auto_input) + + # Uses 'auto_holiday_params["df"]' to input time series on which holidays are referred and asserts the + # result is the same. 
+ daily_event_df_dict_from_auto_input = get_auto_holidays( + df=None, + time_col=TIME_COL, + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=[], + holidays_to_model_separately=None, + daily_event_df_dict=daily_event_df_dict_input, + auto_holiday_params=dict(df=df_daily) + ) + assert_equal(daily_event_df_dict_constructed, daily_event_df_dict_from_auto_input) + + # Inputs `holiday_df` directly through `auto_holiday_params` and asserts the result is the same. + # The `holiday_lookup_countries` should be ignored in this case. + daily_event_df_dict_from_input_holiday_df = get_auto_holidays( + df=df_daily, + time_col=TIME_COL, + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=holiday_lookup_countries, + holidays_to_model_separately=None, + daily_event_df_dict=None, + auto_holiday_params=dict( + holiday_df=holiday_df ) ) - assert holidays == dict( - custom_event=custom_event + assert_equal(daily_event_df_dict_constructed, daily_event_df_dict_from_input_holiday_df) + + # When holidays are not passed in through `holiday_lookup_countries` or `daily_event_df_dict`, a `ValueError` + # should be raised. + with pytest.raises(ValueError, match="Holiday list needs to be specified"): + _ = get_auto_holidays( + df=df_daily, + time_col=TIME_COL, + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=[], + holidays_to_model_separately=None, + daily_event_df_dict=None, + auto_holiday_params=None + ) + + # When no `df` is passed in through `df` or `auto_holiday_params`, a `ValueError` + # should be raised. 
+ with pytest.raises(ValueError, match="Dataframe cannot be `None` or empty"): + _ = get_auto_holidays( + df=None, + time_col=TIME_COL, + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=holiday_lookup_countries, + holidays_to_model_separately=None, + daily_event_df_dict=None, + auto_holiday_params=None + ) + + +def test_get_auto_holiday_with_holidays_to_model_separately(df_daily): + """Tests automatic holidays with its functionality of `holidays_to_model_separately`.""" + + # Initializes inputs that will be used in both cases. + start_year = 2007 + end_year = 2016 + pre_num = 2 + post_num = 2 + holiday_lookup_countries = ["US"] + pre_post_num_dict = {"New Year's Day": (1, 3)} + # Uses minimum thresholds to make sure holidays in holiday groupers are all preserved. + # e.g. holidays will not be dropped in holiday grouper due to lack of similar days/different + # impact across years etc. We can then check if the same holiday dates are preserved + # even when we model some of them separately. + auto_holiday_params = dict( + min_abs_avg_score=0, + min_same_sign_ratio=0, + get_suffix_func=None, + min_n_days=1 + ) + + # Constructs `daily_event_df_dict` through `get_auto_holidays` with country list. + daily_event_df_dict_no_separate = get_auto_holidays( + df=df_daily, + time_col=TIME_COL, + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=holiday_lookup_countries, + holidays_to_model_separately=None, + daily_event_df_dict=None, + auto_holiday_params=auto_holiday_params ) - # Without custom event, result is empty. - holidays = get_auto_holidays( - df=df, + + # Constructs `daily_event_df_dict` through `get_auto_holidays`, here we model "New Year's Day" + # Separately. 
Notice that we also specify its neighboring days through `pre_post_num_dict`. + # We also expect its neighboring days to be modeled separately in the final `daily_event_df_dict_with_separate`. + daily_event_df_dict_with_separate = get_auto_holidays( + df=df_daily, time_col=TIME_COL, - value_col=VALUE_COL + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=holiday_lookup_countries, + holidays_to_model_separately=["New Year's Day"], + daily_event_df_dict=None, + auto_holiday_params=auto_holiday_params ) - assert holidays == {} + + # Asserts the holiday dates are the same. + unique_dates_no_separate = pd.concat([df for df in daily_event_df_dict_no_separate.values()])["date"] + unique_dates_no_separate = set(unique_dates_no_separate) + unique_dates_with_separate = pd.concat([df for df in daily_event_df_dict_with_separate.values()])["date"] + unique_dates_with_separate = set(unique_dates_with_separate) + assert unique_dates_no_separate == unique_dates_with_separate + + # Asserts all keys for `daily_event_df_dict_no_separate` for holiday groups are also included in + # `daily_event_df_dict_with_separate`. + assert daily_event_df_dict_no_separate.keys() - daily_event_df_dict_with_separate.keys() == set() + + # Checks that "New Year's Day" and its neighboring days have their own groups in `daily_event_df_dict_with_separate`. + # The number of neighboring days depends on `pre_post_num_dict`. 
+ assert "New Years Day" in daily_event_df_dict_with_separate.keys() + assert "New Years Day_minus_1" in daily_event_df_dict_with_separate.keys() + assert "New Years Day_minus_2" not in daily_event_df_dict_with_separate.keys() + assert "New Years Day_plus_3" in daily_event_df_dict_with_separate.keys() + + +def test_get_auto_holiday_non_daily_df(df_hourly): + """Tests automatic holidays for data frequency different from daily.""" + + # Initializes inputs that will be used in all cases. + start_year = 2010 + end_year = 2019 + pre_num = 2 + post_num = 2 + holiday_lookup_countries = ["US"] + pre_post_num_dict = {} + + # Constructs `daily_event_df_dict` through `get_auto_holidays` with country list with hourly data. + daily_event_df_dict_hourly = get_auto_holidays( + df=df_hourly, + time_col=TIME_COL, + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=holiday_lookup_countries, + holidays_to_model_separately=None, + daily_event_df_dict=None, + auto_holiday_params=None + ) + + # Aggregates hourly data to daily and checks if the resulted `daily_event_df_dict` is the same + df_tmp = df_hourly.resample("D", on=TIME_COL).agg({VALUE_COL: np.nanmean}) + df_daily_reconstructed = (df_tmp.drop(columns=TIME_COL).reset_index() if TIME_COL in df_tmp.columns + else df_tmp.reset_index()) + + daily_event_df_dict_daily_reconstructed = get_auto_holidays( + df=df_daily_reconstructed, + time_col=TIME_COL, + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=holiday_lookup_countries, + holidays_to_model_separately=None, + daily_event_df_dict=None, + auto_holiday_params=None + ) + + assert_equal(daily_event_df_dict_hourly, daily_event_df_dict_daily_reconstructed) + + # Aggregates hourly data to weekly and check that the correct error will be raised. 
+ df_tmp = df_hourly.resample("W", on=TIME_COL).agg({VALUE_COL: np.nanmean}) + df_weekly_reconstructed = (df_tmp.drop(columns=TIME_COL).reset_index() if TIME_COL in df_tmp.columns + else df_tmp.reset_index()) + + with pytest.raises(ValueError, match="frequency less than daily"): + get_auto_holidays( + df=df_weekly_reconstructed, + time_col=TIME_COL, + value_col=VALUE_COL, + start_year=start_year, + end_year=end_year, + pre_num=pre_num, + post_num=post_num, + pre_post_num_dict=pre_post_num_dict, + holiday_lookup_countries=holiday_lookup_countries, + holidays_to_model_separately=None, + daily_event_df_dict=None, + auto_holiday_params=None + ) def test_get_auto_growth_daily(df_daily): diff --git a/greykite/tests/algo/forecast/silverkite/test_forecast_simple_silverkite.py b/greykite/tests/algo/forecast/silverkite/test_forecast_simple_silverkite.py index cfca1b4..51b0de9 100644 --- a/greykite/tests/algo/forecast/silverkite/test_forecast_simple_silverkite.py +++ b/greykite/tests/algo/forecast/silverkite/test_forecast_simple_silverkite.py @@ -1409,16 +1409,19 @@ def test_auto_config_params(daily_data_reg): value_col=VALUE_COL, forecast_horizon=7, auto_holiday=True, - holidays_to_model_separately="auto", + holidays_to_model_separately=["custom_event"], holiday_lookup_countries="auto", - holiday_pre_num_days=2, - holiday_post_num_days=2, + holiday_pre_num_days=0, + holiday_post_num_days=0, daily_event_df_dict=dict( custom_event=pd.DataFrame({ EVENT_DF_DATE_COL: pd.to_datetime(["2010-03-03", "2011-03-03", "2012-03-03"]), EVENT_DF_LABEL_COL: "threethree" }) ), + auto_holiday_params=dict( + n_clusters=5 + ), auto_growth=True, growth_term="quadratic", changepoints_dict=dict( @@ -1445,9 +1448,9 @@ def test_auto_config_params(daily_data_reg): assert "ct1" in params["extra_pred_cols"] assert params["changepoints_dict"]["method"] == "custom" # Holidays is overridden by auto seasonality. 
- assert len(params["daily_event_df_dict"]) == 203 + assert len(params["daily_event_df_dict"]) == 6 assert "custom_event" in params["daily_event_df_dict"] - assert "China_Chinese New Year" in params["daily_event_df_dict"] + assert "holiday_group_0" in params["daily_event_df_dict"] def test_config_run_with_dst_features(daily_data_reg): @@ -1466,6 +1469,7 @@ def test_config_run_with_dst_features(daily_data_reg): holiday_lookup_countries="auto", holiday_pre_num_days=2, holiday_post_num_days=2, + auto_holiday_params=None, extra_pred_cols=["eu_dst"], auto_seasonality=True, yearly_seasonality=0, @@ -1505,6 +1509,7 @@ def test_auto_config_run(daily_data_reg): EVENT_DF_LABEL_COL: "event" }) ), + auto_holiday_params=None, auto_growth=True, growth_term="quadratic", changepoints_dict=dict( diff --git a/greykite/tests/algo/uncertainty/conditional/test_conf_interval.py b/greykite/tests/algo/uncertainty/conditional/test_conf_interval.py index 4b5d4b6..dbc5f3a 100644 --- a/greykite/tests/algo/uncertainty/conditional/test_conf_interval.py +++ b/greykite/tests/algo/uncertainty/conditional/test_conf_interval.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from greykite.algo.uncertainty.conditional.conf_interval import conf_interval @@ -17,12 +18,11 @@ def data(): def test_conf_interval_ecdf_method(data): - """Testing "conf_interval" function with "ecdf" method - """ + """Testing `conf_interval` function with "ecdf" method""" df = data["df"] new_df = data["new_df"] - # ``quantile_estimation_method = "ecdf"`` + # `quantile_estimation_method = "ecdf"` ci_model = conf_interval( df=df, distribution_col="residual", @@ -53,6 +53,72 @@ def test_conf_interval_ecdf_method(data): assert list(pred_df[ERR_STD_COL].values) == expected_stds +def test_conf_interval_ecdf_method_no_large_segments_remove_mean_false(data): + """Testing `conf_interval` function with "ecdf" method. + Two things are tested: + (a) the case where no segment has enough samples. 
+ (b) the case where conditional mean isn't removed. + """ + df = data["df"] + + # `quantile_estimation_method = "ecdf"` + # We test for all segments being smaller than `sample_size_thresh`. + # Note that we are almost calculating min/median/max by passing `quantiles=[0, 0.5, 1]`. + ci_model = conf_interval( + df=df, + distribution_col="residual", + offset_col="y_hat", + conditional_cols=["x"], + quantiles=[0, 0.5, 1], + quantile_estimation_method="ecdf", + sample_size_thresh=len(df) + 1, # deliberately forcing no segment to have enough samples + small_sample_size_method="std_quantiles", + small_sample_size_quantile=0.95, + min_admissible_value=None, + max_admissible_value=None) + + # It is expected that we get only one row in `"ecdf_df"` item + assert len(ci_model["ecdf_df"]) == 1 + estim_quantiles = ci_model["ecdf_df"]["residual_ecdf_quantile_summary"].values[0] + estim_quantiles = np.array(estim_quantiles) + expected_quantiles = np.array([-1.180189405360272, 0.03387456098249153, 0.893547644053081]) + assert np.allclose(estim_quantiles, expected_quantiles) + # Stores the values for comparing with the next case + expected_quantiles_mean_removed = expected_quantiles + + # Here we test for the case where the conditional mean is not removed. + # Still all segments being smaller than `sample_size_thresh`. + # Note that we are almost calculating min/median/max by passing `quantiles=[0, 0.5, 1]`. + ci_model = conf_interval( + df=df, + distribution_col="residual", + offset_col="y_hat", + conditional_cols=["x"], + quantiles=[0, 0.5, 1], + quantile_estimation_method="ecdf", + remove_conditional_mean=False, + sample_size_thresh=len(df) + 1, # deliberately forcing no segment to have enough samples + small_sample_size_method="std_quantiles", + small_sample_size_quantile=0.95, + min_admissible_value=None, + max_admissible_value=None) + + # It is still expected that we get only one row in `"ecdf_df"` item. 
+ assert len(ci_model["ecdf_df"]) == 1 + estim_quantiles = ci_model["ecdf_df"]["residual_ecdf_quantile_summary"].values[0] + estim_quantiles = np.array(estim_quantiles) + # This time we expect the quantiles to not be centered around zero + expected_quantiles = np.array([-2.08197503, -0.86791106, -0.00823798]) + assert np.allclose(estim_quantiles, expected_quantiles) + expected_quantiles_mean_not_removed = expected_quantiles + + # Comparing the two cases to ensure that the quantiles are simply shifted by the mean + mean_value = df["residual"].mean() + assert np.allclose( + expected_quantiles_mean_removed, + expected_quantiles_mean_not_removed - mean_value) + + def test_conf_interval_normal_method(data): """Testing "conf_interval" function, normal method""" df = data["df"] diff --git a/greykite/tests/common/features/test_outlier.py b/greykite/tests/common/features/test_outlier.py new file mode 100644 index 0000000..e0ddf4f --- /dev/null +++ b/greykite/tests/common/features/test_outlier.py @@ -0,0 +1,647 @@ +import numpy as np +import pandas as pd +import pytest + +from greykite.common.features.outlier import EXPONENTIAL_SMOOTHING +from greykite.common.features.outlier import MOVING_MEDIAN +from greykite.common.features.outlier import BaseOutlierDetector +from greykite.common.features.outlier import DetectionResult +from greykite.common.features.outlier import DiffMethod +from greykite.common.features.outlier import TukeyOutlierDetector +from greykite.common.features.outlier import ZScoreOutlierDetector +from greykite.common.viz.timeseries_annotate import plot_lines_markers + + +# Boolean to decide if figures are to be shown or not when this test file is run. +# Turn this on when changes are made and include in code reviews. +# Compare before and after the change to confirm everything is as expected. 
+FIG_SHOW = False + + +@pytest.fixture(scope="module") +def data(): + """Generates data for testing.""" + sampler = np.random.default_rng(1317) + # Defines two clean vectors, one for `fit` and one for `detect`. + y_clean = np.arange(0, 100) + # Add small noise + y_clean = y_clean + sampler.normal(loc=0.0, scale=3, size=len(y_clean)) + y_clean_test = np.arange(30, 40) + y_clean_test = y_clean_test + sampler.normal(loc=0.0, scale=3, size=len(y_clean_test)) + + # Constructs two scenarios: + # Easy to detect (`y_easy_outlier`): there is an outlier in position two which is 10X the max + # Hard to detect (`y_hard_outlier`): there is an outlier in position 3 which is equal to max, + # however locally it is much larger than neighboring points. + y_easy_outlier = y_clean.copy() + y_hard_outlier = y_clean.copy() + y_easy_outlier_test = y_clean_test.copy() + y_hard_outlier_test = y_clean_test.copy() + + # Add outlier to `fit` data. + y_easy_outlier[2] = 1000 + y_hard_outlier[2] = 100 + + # Add outlier to test / `detect` data. + y_easy_outlier_test[5] = 2000 + y_hard_outlier_test[5] = 150 + + return { + "y_clean": y_clean, + "y_clean_test": y_clean_test, + "y_easy_outlier": y_easy_outlier, + "y_easy_outlier_test": y_easy_outlier_test, + "y_hard_outlier": y_hard_outlier, + "y_hard_outlier_test": y_hard_outlier_test} + + +def helper_plot_outliers( + y, + detection_result, + title): + """This is just a helper function to generate plots for outlier detection during tests. + This plots the raw input and marks the discovered anomalies. + + Parameters + ---------- + y: `pandas.Series` + Input data. + detection_result: `~greykite.common.features.outlier.DetectionResult` + Outlier detection results. + title: `str` + Title of plot. 
+ Returns + ------- + "fig" : `plotly.graph_objects.Figure` + + """ + df = pd.DataFrame({ + "ind": range(len(y)), + "y": y, + "scores": detection_result.scores, + "is_outlier": detection_result.is_outlier}) + + df["y_normal"] = None + df["y_outlier"] = None + + df.loc[~df["is_outlier"], "y_normal"] = df.loc[~df["is_outlier"], "y"] + df.loc[df["is_outlier"], "y_outlier"] = df.loc[df["is_outlier"], "y"] + + fig = plot_lines_markers( + df=df, + x_col="ind", + line_cols=["y", "scores"], + marker_cols=["y_outlier"], + title=title) + + return fig + + +def test_detection_result(): + """Tests the dataclass `DetectionResult`.""" + detection_result = DetectionResult() + + assert detection_result.scores is None + assert detection_result.is_outlier is None + + +def test_diff_methods_init(): + """Tests the dataclass DiffMethod.""" + baseline_method = DiffMethod() + + assert baseline_method.name is None + assert baseline_method.param is not None + + +def test_base_outlier_detector_init(): + """Tests the basics of `BaseOutlierDetector` class.""" + # Tests default `__init__`. + detect_outlier = BaseOutlierDetector() + + assert detect_outlier.trim_percent == 5.0 + assert detect_outlier.diff_method is not None + assert detect_outlier.lower_bound is None + assert detect_outlier.upper_bound is None + assert detect_outlier.fitted_param == {} + assert detect_outlier.y is None + assert detect_outlier.y_diffed is None + assert detect_outlier.y_na_removed is None + assert detect_outlier.y_trimmed is None + assert detect_outlier.y_ready_to_fit is None + assert detect_outlier.fitted == DetectionResult(scores=None, is_outlier=None) + assert detect_outlier.y_new is None + assert detect_outlier.y_new_ready_to_predict is None + assert detect_outlier.predicted == DetectionResult(scores=None, is_outlier=None) + + # Tests `__init__` with parameters. 
+ detect_outlier = BaseOutlierDetector( + trim_percent=1, + diff_method=DiffMethod(name="es")) + + assert detect_outlier.trim_percent == 1.0 + assert detect_outlier.diff_method.name == "es" + assert detect_outlier.lower_bound is None + assert detect_outlier.upper_bound is None + assert detect_outlier.fitted_param == {} + assert detect_outlier.y is None + assert detect_outlier.y_diffed is None + assert detect_outlier.y_na_removed is None + assert detect_outlier.y_trimmed is None + assert detect_outlier.y_ready_to_fit is None + assert detect_outlier.fitted == DetectionResult(scores=None, is_outlier=None) + assert detect_outlier.y_new is None + assert detect_outlier.y_new_ready_to_predict is None + assert detect_outlier.predicted == DetectionResult(scores=None, is_outlier=None) + + +def test_base_outlier_detector_trim(): + """Tests `trim` method.""" + detect_outlier = BaseOutlierDetector() + y = np.arange(100) + y_trimmed_1pcnt = detect_outlier.trim(y, 1) + # This is the default (5%) + y_trimmed_5pcnt = detect_outlier.trim(y) + + # Original range. + assert max(y) == 99 + assert min(y) == 0 + + # 1 percent case. + assert max(y_trimmed_1pcnt) == 98 + assert min(y_trimmed_1pcnt) == 1 + + # 5 percent case. + assert max(y_trimmed_5pcnt) == 96 + assert min(y_trimmed_5pcnt) == 3 + + with pytest.raises( + ValueError, + match="Trim percent:"): + detect_outlier.trim(y, -1) + + with pytest.raises( + ValueError, + match="Trim percent:"): + detect_outlier.trim(y, 150) + + +def test_base_outlier_detector_remove_na(data): + """Tests `remove_na` method.""" + detect_outlier = BaseOutlierDetector() + y = data["y_clean"].copy() + y = pd.Series(y) + y[5] = None + y[70] = None + + y_na_removed = detect_outlier.remove_na(y) + + # Original length and new length. 
+ assert len(y) == 100 + assert len(y_na_removed) == 98 + + y = pd.Series([None, None, None, 5, None]) + with pytest.raises( + ValueError, + match="Length of y after removing NAs is less than 2"): + detect_outlier.remove_na(y) + + +def test_base_detect_diff_from_baseline_es(): + """Tests `diff_from_baseline` method with exponential smoothing.""" + detect_outlier = BaseOutlierDetector() + y = np.arange(100) + + # Example with exponential smoothing. + baseline_result = detect_outlier.diff_from_baseline( + y=y, + diff_method=EXPONENTIAL_SMOOTHING) + + residuals = baseline_result["residuals"] + baseline_y = baseline_result["baseline_y"] + + # Original range. + assert min(y) == 0 + assert max(y) == 99 + + # Residuals range. + assert min(residuals) == 0 + assert abs(max(residuals) - 1.0) < 0.1 + + # Example with exponential smoothing. + y = pd.Series([0, 0, 0, 10, 10, 10, 10, 0, 0, 0]) + baseline_result = detect_outlier.diff_from_baseline( + y=y, + diff_method=EXPONENTIAL_SMOOTHING) + residuals = baseline_result["residuals"] + baseline_y = baseline_result["baseline_y"] + + assert (round(baseline_y) == [0, 0, 0, 5, 8, 9, 9, 5, 2, 1]).all() + assert (round(residuals) == [0, 0, 0, 5, 2, 1, 1, -5, -2, -1]).all() + + # Another example with custom `alpha = 1`. + # This will imply that `y` is unchanged and residuals are all zero. + y = pd.Series([0, 0, 0, 10, 10, 10, 10, 0, 0, 0]) + diff_method = DiffMethod(name="es", param={"alpha": 1}) + baseline_result = detect_outlier.diff_from_baseline( + y=y, + diff_method=diff_method) + residuals = baseline_result["residuals"] + baseline_y = baseline_result["baseline_y"] + + assert (baseline_y == y).all() + assert (residuals == [0]*10).all() + + +def test_base_detect_diff_from_baseline_moving_med(): + """Tests `diff_from_baseline` method with moving median.""" + detect_outlier = BaseOutlierDetector() + y = np.arange(100) + + # Example with moving median. 
+ baseline_result = detect_outlier.diff_from_baseline( + y=y, + diff_method=MOVING_MEDIAN) + + residuals = baseline_result["residuals"] + baseline_y = baseline_result["baseline_y"] + + # Original range. + assert min(y) == 0 + assert max(y) == 99 + + # Residuals range. + assert min(residuals) == -1 + assert abs(max(residuals) - 1.0) < 0.1 + + # Example with moving median on a short vector. + y = pd.Series([0, 0, 0, 10, 10, 10, 10, 0, 0, 0]) + baseline_result = detect_outlier.diff_from_baseline( + y=y, + diff_method=MOVING_MEDIAN) + residuals = baseline_result["residuals"] + baseline_y = baseline_result["baseline_y"] + + assert (round(baseline_y) == [0, 0, 0, 10, 10, 10, 10, 0, 0, 0]).all() + assert (round(residuals) == [0]*10).all() + + # Another example with custom `window = 2`. + # With a window of two, the baseline is smoothed where the level changes, so the residuals are non-zero only there. + y = pd.Series([0, 0, 0, 10, 10, 10, 10, 0, 0, 0]) + diff_method = DiffMethod( + name="moving_med", + param={ + "window": 2, + "min_periods": 1, + "center": True}) + baseline_result = detect_outlier.diff_from_baseline( + y=y, + diff_method=diff_method) + residuals = baseline_result["residuals"] + baseline_y = baseline_result["baseline_y"] + + assert (baseline_y == [0, 0, 0, 5, 10, 10, 10, 5, 0, 0]).all() + assert (residuals == [0, 0, 0, 5, 0, 0, 0, -5, 0, 0]).all() + + +def test_z_score_outlier_detector_init(): + """Tests `ZScoreOutlierDetector` init.""" + # Tests default `__init__`. 
+ detect_outlier = ZScoreOutlierDetector() + assert detect_outlier.trim_percent == 5.0 + assert detect_outlier.diff_method is not None + assert detect_outlier.lower_bound is None + assert detect_outlier.upper_bound is None + assert detect_outlier.fitted_param == {} + assert detect_outlier.y is None + assert detect_outlier.y_diffed is None + assert detect_outlier.y_na_removed is None + assert detect_outlier.y_trimmed is None + assert detect_outlier.y_ready_to_fit is None + assert detect_outlier.fitted == DetectionResult(scores=None, is_outlier=None) + assert detect_outlier.y_new is None + assert detect_outlier.y_new_ready_to_predict is None + assert detect_outlier.predicted == DetectionResult(scores=None, is_outlier=None) + # Specific to this class. + assert detect_outlier.z_score_cutoff == 5.0 + + # Tests `__init__` with parameters. + detect_outlier = ZScoreOutlierDetector(z_score_cutoff=10) + assert detect_outlier.z_score_cutoff == 10.0 + + +def test_tukey_outlier_detector_init(): + """Tests `TukeyOutlierDetector` init.""" + # Tests default `__init__`. + detect_outlier = TukeyOutlierDetector() + # The default for `trim_percent` is different from Z-score. + assert detect_outlier.trim_percent is None + assert detect_outlier.diff_method is not None + assert detect_outlier.lower_bound is None + assert detect_outlier.upper_bound is None + assert detect_outlier.fitted_param == {} + assert detect_outlier.y is None + assert detect_outlier.y_diffed is None + assert detect_outlier.y_na_removed is None + assert detect_outlier.y_trimmed is None + assert detect_outlier.y_ready_to_fit is None + assert detect_outlier.fitted == DetectionResult(scores=None, is_outlier=None) + assert detect_outlier.y_new is None + assert detect_outlier.y_new_ready_to_predict is None + assert detect_outlier.predicted == DetectionResult(scores=None, is_outlier=None) + # Specific to this class. 
+ assert detect_outlier.iqr_lower == 0.1 + assert detect_outlier.iqr_upper == 0.9 + assert detect_outlier.tukey_cutoff == 1.0 + + # Tests `__init__` with parameters. + detect_outlier = TukeyOutlierDetector( + iqr_lower=0.05, + iqr_upper=0.95, + tukey_cutoff=0.2) + + assert detect_outlier.iqr_lower == 0.05 + assert detect_outlier.iqr_upper == 0.95 + assert detect_outlier.tukey_cutoff == 0.2 + + +def test_z_score_outlier_detector(data): + """Tests `ZScoreOutlierDetector` usage.""" + # Default setting and easy detection. + detect_outlier = ZScoreOutlierDetector() + + # Easy detection example. + y = data["y_easy_outlier"].copy() + y_test = data["y_easy_outlier_test"].copy() + + detect_outlier.fit(y) + assert abs(detect_outlier.fitted_param["trimmed_mean"] - 0.12188) < 0.1 + assert abs(detect_outlier.fitted_param["trimmed_sd"] - 2.202187) < 0.1 + + detect_outlier.detect(y_test) + + fitted = detect_outlier.fitted + predicted = detect_outlier.predicted + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="Easy detection Z-score") + if FIG_SHOW: + fig.show() + assert fig is not None + + # We expect only an outlier in second position as per `data` definition. + assert fitted.is_outlier[2] + assert sum(fitted.is_outlier) == 1 + + # We expect only an outlier in 5th position as per `data` definition. + assert predicted.is_outlier[5] + assert sum(predicted.is_outlier) == 1 + + # Hard detection example without differencing. + # Here we expect that the outlier is not removed. + # This will showcase how without differencing anomaly is missed. + y = data["y_hard_outlier"].copy() + y_test = data["y_hard_outlier_test"].copy() + + detect_outlier = ZScoreOutlierDetector(diff_method=None) + detect_outlier.fit(y) + detect_outlier.detect(y_test) + + fitted = detect_outlier.fitted + predicted = detect_outlier.predicted + # We note that the trimmed mean and sd are quite large since no diffing is done. 
+ assert abs(detect_outlier.fitted_param["trimmed_mean"] - 50.5917) < 0.1 + assert abs(detect_outlier.fitted_param["trimmed_sd"] - 26.8908) < 0.1 + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="Hard detection with Z-score and no baseline diffing.") + if FIG_SHOW: + fig.show() + assert fig is not None + + # We expect no outlier is detected. + assert not fitted.is_outlier[2] + assert sum(fitted.is_outlier) == 0 + + # We expect no outlier is detected. + assert not predicted.is_outlier[5] + assert sum(predicted.is_outlier) == 0 + + # Hard detection example with differencing (default behavior). + # Here we expect that the outlier is removed. + # This will showcase how differencing with appropriate baseline is helpful. + y = data["y_hard_outlier"] + y_test = data["y_hard_outlier_test"] + + detect_outlier = ZScoreOutlierDetector() + detect_outlier.fit(y) + detect_outlier.detect(y_test) + + fitted = detect_outlier.fitted + predicted = detect_outlier.predicted + # Due to diffing, the mean and sd are much smaller. + assert abs(detect_outlier.fitted_param["trimmed_mean"] - 0.12188) < 0.1 + assert abs(detect_outlier.fitted_param["trimmed_sd"] - 2.202187) < 0.1 + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="Hard detection with Z-score with diffing.") + if FIG_SHOW: + fig.show() + assert fig is not None + + # We expect only an outlier in second position as per `data` definition. + assert fitted.is_outlier[2] + assert sum(fitted.is_outlier) == 1 + + # We expect only an outlier in 5th position as per `data` definition. + assert predicted.is_outlier[5] + assert sum(predicted.is_outlier) == 1 + + +def test_tukey_outlier_detector(data): + """Tests `TukeyOutlierDetector` usage.""" + detect_outlier = TukeyOutlierDetector() + + # Easy detection example. 
+ y = data["y_easy_outlier"].copy() + y_test = data["y_easy_outlier_test"].copy() + + detect_outlier.fit(y) + detect_outlier.detect(y_test) + fitted = detect_outlier.fitted + predicted = detect_outlier.predicted + + assert abs(detect_outlier.fitted_param["quantile_value_lower"] - (-2.86)) < 0.5 + assert abs(detect_outlier.fitted_param["quantile_value_upper"] - 3.80) < 0.5 + assert abs(detect_outlier.fitted_param["iqr"] - 6.66) < 0.5 + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="Easy detection Tukey") + if FIG_SHOW: + fig.show() + assert fig is not None + + # We expect only an outlier in second position as per `data` definition. + assert fitted.is_outlier[2] + assert sum(fitted.is_outlier) == 1 + + # We expect only an outlier in 5th position as per `data` definition. + assert predicted.is_outlier[5] + assert sum(predicted.is_outlier) == 1 + + # Hard detection example without differencing. + # Here we expect that the outlier is not removed. + # This will showcase how without differencing anomaly is missed. + y = data["y_hard_outlier"].copy() + y_test = data["y_hard_outlier_test"].copy() + + detect_outlier = TukeyOutlierDetector(diff_method=None) + detect_outlier.fit(y) + detect_outlier.detect(y_test) + + fitted = detect_outlier.fitted + predicted = detect_outlier.predicted + + assert abs(detect_outlier.fitted_param["quantile_value_lower"] - 11.8) < 0.5 + assert abs(detect_outlier.fitted_param["quantile_value_upper"] - 90.1) < 0.5 + assert abs(detect_outlier.fitted_param["iqr"] - 78.3) < 0.5 + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="Hard detection with Tukey and no baseline diffing.") + if FIG_SHOW: + fig.show() + assert fig is not None + + # We expect no outlier is detected. + assert not fitted.is_outlier[2] + assert sum(fitted.is_outlier) == 0 + + # We expect no outlier is detected. 
+ assert not predicted.is_outlier[5] + assert sum(predicted.is_outlier) == 0 + + # Hard detection example with differencing (default behavior). + # Here we expect that the outlier is removed. + # This will showcase how differencing with appropriate baseline is helpful. + y = data["y_hard_outlier"].copy() + y_test = data["y_hard_outlier_test"].copy() + + detect_outlier = TukeyOutlierDetector() + detect_outlier.fit(y) + detect_outlier.detect(y_test) + + fitted = detect_outlier.fitted + predicted = detect_outlier.predicted + + assert abs(detect_outlier.fitted_param["quantile_value_lower"] - (-2.86)) < 0.5 + assert abs(detect_outlier.fitted_param["quantile_value_upper"] - 3.80) < 0.5 + assert abs(detect_outlier.fitted_param["iqr"] - 6.66) < 0.5 + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="Hard detection with Tukey with diffing.") + if FIG_SHOW: + fig.show() + assert fig is not None + + # We expect only an outlier in second position as per `data` definition. + assert fitted.is_outlier[2] + assert sum(fitted.is_outlier) == 1 + + # We expect only an outlier in 5th position as per `data` definition. + assert predicted.is_outlier[5] + assert sum(predicted.is_outlier) == 1 + + +def test_tukey_outlier_detector_corner_cases(): + """Tests `TukeyOutlierDetector` usage with corner cases.""" + detect_outlier = TukeyOutlierDetector() + # The case where data is perfectly linear. + y = np.arange(100) + + detect_outlier.fit(y) + fitted = detect_outlier.fitted + + assert abs(detect_outlier.fitted_param["quantile_value_lower"] - 0) < 0.5 + assert abs(detect_outlier.fitted_param["quantile_value_upper"] - 0) < 0.5 + assert abs(detect_outlier.fitted_param["iqr"] - 0) < 0.5 + assert sum(fitted.is_outlier) == 0 + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="Tukey: perfectly linear data") + if FIG_SHOW: + fig.show() + assert fig is not None + + detect_outlier = TukeyOutlierDetector() + # The case where data is constant. 
+ y = np.zeros(100) + + detect_outlier.fit(y) + fitted = detect_outlier.fitted + + assert abs(detect_outlier.fitted_param["quantile_value_lower"] - 0) < 0.5 + assert abs(detect_outlier.fitted_param["quantile_value_upper"] - 0) < 0.5 + assert abs(detect_outlier.fitted_param["iqr"] - 0) < 0.5 + assert sum(fitted.is_outlier) == 0 + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="Tukey: constant") + if FIG_SHOW: + fig.show() + assert fig is not None + + +def test_z_score_outlier_detector_corner_cases(): + """Tests `ZScoreOutlierDetector` usage with corner cases.""" + detect_outlier = ZScoreOutlierDetector() + # The case where data is perfectly linear. + y = np.arange(100) + + detect_outlier.fit(y) + fitted = detect_outlier.fitted + + assert abs(detect_outlier.fitted_param["trimmed_mean"] - 0) < 0.5 + assert abs(detect_outlier.fitted_param["trimmed_sd"] - 0) < 0.5 + assert sum(fitted.is_outlier) == 0 + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="ZScore: perfectly linear data") + if FIG_SHOW: + fig.show() + assert fig is not None + + detect_outlier = ZScoreOutlierDetector() + # The case where data is constant. 
+ y = np.zeros(100) + + detect_outlier.fit(y) + fitted = detect_outlier.fitted + + assert abs(detect_outlier.fitted_param["trimmed_mean"] - 0) < 0.5 + assert abs(detect_outlier.fitted_param["trimmed_sd"] - 0) < 0.5 + assert sum(fitted.is_outlier) == 0 + + fig = helper_plot_outliers( + y=y, + detection_result=fitted, + title="ZScore: constant") + if FIG_SHOW: + fig.show() + assert fig is not None diff --git a/greykite/tests/common/features/test_timeseries_features.py b/greykite/tests/common/features/test_timeseries_features.py index 95e74ee..d800f48 100644 --- a/greykite/tests/common/features/test_timeseries_features.py +++ b/greykite/tests/common/features/test_timeseries_features.py @@ -614,11 +614,28 @@ def test_add_daily_events_with_neighbor_impact(): event_df_dict, neighbor_impact=7 ) - # Checks holidays are mapped to the correct weekly dates. + # Checks holidays are mapped to the correct daily dates. assert new_df.iloc[0].tolist() == [pd.Timestamp("2020-01-01"), 0, "Christmas Day", 1, 0, 1] assert new_df.iloc[1].tolist() == [pd.Timestamp("2020-01-01"), 0, "New Year's Day", 1, 0, 1] assert new_df.iloc[2].tolist() == [pd.Timestamp("2020-01-02"), 0, "New Year's Day", 1, 0, 1] + # Tests daily data, assuming a list of customized neighboring effect. + df = pd.DataFrame({ + "date": pd.date_range("2020-01-01", freq="D", periods=500), + "y": 0 + }) + countries = ["US"] + event_df_dict = get_holidays(countries, year_start=2015, year_end=2025) + new_df = add_daily_events( + df, + event_df_dict, + neighbor_impact=[1, 4] + ) + # Checks holidays are mapped to the correct daily dates. + assert new_df.iloc[0].tolist() == [pd.Timestamp("2020-01-01"), 0, "New Year's Day", 1, 0, 1] + assert new_df.iloc[1].tolist() == [pd.Timestamp("2020-01-02"), 0, "New Year's Day", 1, 0, 1] + assert new_df.iloc[4].tolist() == [pd.Timestamp("2020-01-05"), 0, "New Year's Day", 1, 0, 1] + def test_add_daily_event_shifted_effect(): """Tests adding additional neighbor events. 
diff --git a/greykite/tests/common/test_evaluation.py b/greykite/tests/common/test_evaluation.py index a441870..e98e65e 100644 --- a/greykite/tests/common/test_evaluation.py +++ b/greykite/tests/common/test_evaluation.py @@ -31,6 +31,7 @@ from greykite.common.evaluation import elementwise_quantile from greykite.common.evaluation import elementwise_residual from greykite.common.evaluation import elementwise_squared_error +from greykite.common.evaluation import elementwise_symmetric_absolute_percent_error from greykite.common.evaluation import elementwise_within_bands from greykite.common.evaluation import fraction_outside_tolerance from greykite.common.evaluation import fraction_within_bands @@ -776,6 +777,19 @@ def test_elementwise_absolute_percent_error(): assert "true_val is less than 1e-8. Percent error is very likely highly volatile." in record[0].message.args[0] +def test_elementwise_symmetric_absolute_percent_error(): + """Tests elementwise_symmetric_absolute_percent_error function.""" + assert elementwise_symmetric_absolute_percent_error(1.0, 3.0) == pytest.approx(50.0, 0.1) + assert elementwise_symmetric_absolute_percent_error(3.0, 1.0) == elementwise_symmetric_absolute_percent_error(1.0, 3.0) + + with pytest.warns(UserWarning, match="Symmetric absolute percent error is undefined"): + assert elementwise_symmetric_absolute_percent_error(0.0, 0.0) is None + + with pytest.warns(Warning) as record: + elementwise_symmetric_absolute_percent_error(1e-9, 1e-9) + assert "denominator contains very small values. Symmetric absolute percent error is very likely highly volatile." 
in record[0].message.args[0] + + def test_elementwise_quantile(): """Tests elementwise_quantile function""" assert elementwise_quantile(1.0, 3.0, q=0.8) == pytest.approx(2.0 * 0.2, rel=1e-5) diff --git a/greykite/tests/common/test_testing_utils_anomalies.py b/greykite/tests/common/test_testing_utils_anomalies.py new file mode 100644 index 0000000..0034b2a --- /dev/null +++ b/greykite/tests/common/test_testing_utils_anomalies.py @@ -0,0 +1,86 @@ +import datetime + +import numpy as np +import pandas as pd + +from greykite.common.testing_utils import generate_df_for_tests +from greykite.common.testing_utils_anomalies import calc_quantiles_simulated_df +from greykite.common.testing_utils_anomalies import contaminate_df_with_anomalies +from greykite.common.testing_utils_anomalies import generate_anomaly_blocks +from greykite.common.testing_utils_anomalies import generate_df_with_anomalies_sim_based + + +def test_generate_anomaly_blocks(): + res = generate_anomaly_blocks( + timeseries_length=100, + block_number=5, + mean_block_size=5) + assert len(res["anomaly_block_list"]) == res["block_number"] + assert res["anomaly_block_list"][-1][-1] <= 100 + + +def test_contaminate_df_with_anomalies(): + # data size + n = 2000 + res = generate_df_for_tests( + freq="1D", + periods=n, + train_start_date=datetime.datetime(2018, 7, 1)) + df = res["df"] + + res = generate_anomaly_blocks( + timeseries_length=n, + block_number=20, + mean_block_size=5) + + anomaly_block_list = res["anomaly_block_list"] + + df = contaminate_df_with_anomalies( + df=df, + anomaly_block_list=anomaly_block_list, + delta_range_lower=5, + delta_range_upper=6, + value_col="y", + min_admissible_value=None, + max_admissible_value=None + ) + assert df.shape == (2000, 4) + assert list(df.columns) == ["ts", "y", "contaminated_y", "is_anomaly"] + assert not df.isna().any().any() + + +def test_calc_quantiles_simulated_df(): + def sim_df_func(): + return pd.DataFrame({"y": np.random.uniform(3, 5, 100)}) + + quantiles_df = 
calc_quantiles_simulated_df( + sim_df_func=sim_df_func, + quantiles=[0.25, 0.75], + simulation_num=50) + + assert quantiles_df.shape == (100, 2) + + def sim_df_func(x): + return pd.DataFrame({"y": np.random.uniform(x, 5, 100)}) + + quantiles_df = calc_quantiles_simulated_df( + sim_df_func=sim_df_func, + quantiles=[0.25, 0.75], + simulation_num=50, + x=1) + + assert quantiles_df.shape == (100, 2) + + +def test_generate_df_with_anomalies_sim_based(): + res = generate_df_with_anomalies_sim_based( + freq="5min", + periods=24*12*10, + block_number=10, + mean_block_size=5) + df = res["df"] + assert df.shape == (24*12*10, 6) + assert not df.isna().any().any() + quantiles_df = res["quantiles_df"] + assert quantiles_df.shape == (24*12*10, 2) + assert not quantiles_df.isna().any().any() diff --git a/greykite/tests/common/viz/test_timeseries_annotate.py b/greykite/tests/common/viz/test_timeseries_annotate.py index 732c52b..2bb8073 100644 --- a/greykite/tests/common/viz/test_timeseries_annotate.py +++ b/greykite/tests/common/viz/test_timeseries_annotate.py @@ -229,10 +229,10 @@ def test_plot_lines_markers(): assert fig.data[2].marker.color == "rgba(31, 119, 180, 1.0)" assert fig.data[3].marker.color == "rgba(255, 127, 14, 1.0)" - # Length of ``line_cols`` must be the same as ``line_cols`` if passed. + # Length of `line_colors` must be larger than or equal to length of `line_cols` if passed. with pytest.raises( ValueError, - match="If `line_colors` is passed, its length must be equal to `line_cols`"): + match="If `line_colors` is passed"): plot_lines_markers( df=df, x_col="ts", @@ -241,7 +241,8 @@ def test_plot_lines_markers(): line_colors=line_colors[:1], marker_colors=marker_colors) - # At least one of ``line_cols`` or ``marker_cols`` must be provided (not None). + # At least one of `line_cols` or `marker_cols` or `band_cols` + # must be provided (not None). 
with pytest.raises( ValueError, match="At least one of"): @@ -250,10 +251,100 @@ def test_plot_lines_markers(): x_col="ts", line_cols=None, marker_cols=None, + band_cols=None, line_colors=None, marker_colors=None) +def test_plot_lines_markers_with_bands(): + """Tests ``plot_lines_markers`` with bands.""" + df = pd.DataFrame({ + "x": range(4), + "y": range(4), + "z1": range(1, 5), + "z2": range(-1, 3), + "w": [(0, 1), (1, 3), (1, 5), (3, 5)], + "u": [(2, 3), (3, 3), (4, 4), (6, 8)]}) + + fig = plot_lines_markers( + df=df, + x_col="x", + line_cols=["y", "z1"], + band_cols=["u", "w"]) + + assert len(fig.data) == 6 + assert fig.data[0].line.color is None + assert fig.data[1].line.color is None + assert fig.data[2].line.color == "rgba(0, 0, 0, 0)" + assert fig.data[3].line.color == "rgba(0, 0, 0, 0)" + assert fig.data[4].line.color == "rgba(0, 0, 0, 0)" + assert fig.data[5].line.color == "rgba(0, 0, 0, 0)" + + assert fig.data[3].name == "u" + assert fig.data[5].name == "w" + assert fig.data[3].fillcolor == "rgba(31, 119, 180, 0.2)" + assert fig.data[5].fillcolor == "rgba(255, 127, 14, 0.2)" + assert fig.layout.title.text is None + + # Bands with custom colors and a title for the plot. + fig = plot_lines_markers( + df=df, + x_col="x", + line_cols=["y", "z1"], + band_cols=["u", "w"], + band_colors=["rgba(0, 255, 0, 0.2)", "rgba(255, 0, 0, 0.2)"], + title="custom band colors") + + assert len(fig.data) == 6 + assert fig.data[0].line.color is None + assert fig.data[1].line.color is None + assert fig.data[2].line.color == "rgba(0, 0, 0, 0)" + assert fig.data[3].line.color == "rgba(0, 0, 0, 0)" + assert fig.data[4].line.color == "rgba(0, 0, 0, 0)" + assert fig.data[5].line.color == "rgba(0, 0, 0, 0)" + + assert fig.data[3].name == "u" + assert fig.data[5].name == "w" + + assert fig.data[3].fillcolor == "rgba(0, 255, 0, 0.2)" + assert fig.data[5].fillcolor == "rgba(255, 0, 0, 0.2)" + assert fig.layout.title.text == "custom band colors" + + # Bands specified by dictionary. 
+ df = pd.DataFrame({ + "x": range(4), + "y": [2, 3, 4, 5], + "z1": [4, 5, 6, 8], + "z2": range(-1, 3), + "w1": [5, 6, 6, 8], + "w2": [7, 8, 9, 9], + "u1": [2, 3, 5, 7], + "u3": [4, 5, 8, 8]}) + + fig = plot_lines_markers( + df=df, + x_col="x", + line_cols=["y", "z1"], + band_cols_dict={"u": ["u1", "u3"], "w": ["w1", "w2"]}, + band_colors=["rgba(0, 255, 0, 0.2)", "rgba(255, 0, 0, 0.2)"], + title="bands via dict") + + assert len(fig.data) == 6 + assert fig.data[0].line.color is None + assert fig.data[1].line.color is None + assert fig.data[2].line.color == "rgba(0, 0, 0, 0)" + assert fig.data[3].line.color == "rgba(0, 0, 0, 0)" + assert fig.data[4].line.color == "rgba(0, 0, 0, 0)" + assert fig.data[5].line.color == "rgba(0, 0, 0, 0)" + + assert fig.data[3].name == "u" + assert fig.data[5].name == "w" + + assert fig.data[3].fillcolor == "rgba(0, 255, 0, 0.2)" + assert fig.data[5].fillcolor == "rgba(255, 0, 0, 0.2)" + assert fig.layout.title.text == "bands via dict" + + def test_plot_event_periods_multi(): """Tests ``plot_event_periods_multi`` function.""" df = pd.DataFrame({ diff --git a/greykite/tests/detection/common/test_ad_evaluation.py b/greykite/tests/detection/common/test_ad_evaluation.py new file mode 100644 index 0000000..3276dae --- /dev/null +++ b/greykite/tests/detection/common/test_ad_evaluation.py @@ -0,0 +1,564 @@ +import numpy as np +import pandas as pd +import pytest + +from greykite.detection.common.ad_evaluation import confusion_matrix +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.common.ad_evaluation import informedness_statistic +from greykite.detection.common.ad_evaluation import matthews_corrcoef +from greykite.detection.common.ad_evaluation import precision_score +from greykite.detection.common.ad_evaluation import range_based_precision_score +from greykite.detection.common.ad_evaluation import range_based_recall_score +from greykite.detection.common.ad_evaluation import recall_score +from 
@pytest.fixture
def input_values():
    """Provides labels, predictions and expected values for the pointwise metric tests.

    Three string labels are used: ``"0"``, ``"1"`` and ``"a"``.
    Entries whose key ends in ``_with_weight`` are the values expected
    when ``sample_weight`` is supplied to the metric.
    """
    return {
        "y_true": ["0"] * 5 + ["1"] * 3 + ["a"] * 4,
        "y_pred": ["0"] * 3 + ["1"] * 5 + ["a"] * 4,
        "expected_precision": {"0": 1.0, "1": 0.6, "a": 1.0},
        "expected_recall": {"0": 0.6, "1": 1.0, "a": 1.0},
        "expected_f1_score": {"0": 0.75, "1": 0.75, "a": 1.0},
        "expected_confusion_matrix": np.array([[3, 2, 0], [0, 3, 0], [0, 0, 4]]),
        "sample_weight": [1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0],
        "expected_precision_with_weight": {"0": 1.0, "1": 1.0, "a": 0.0},
        "expected_matthews_corrcoef": 0.7872340425531915,
        "expected_matthews_corrcoef_with_weight": 1.0,
        "expected_informedness_statistic": 0.8,
        "expected_informedness_statistic_with_weight": 1.0,
    }


@pytest.fixture
def soft_input_values():
    """Provides binary labels with missing values for the soft metric tests.

    Each ``expected_soft_*`` list holds one ``{label: score}`` dictionary
    per window size, in the order the tests request them (0, 1, 2).
    """
    nan = np.nan
    return {
        "y_true": [0, 1, 1, 1, 0, 0, nan, nan, 0],
        "y_pred": [0, 0, 0, 1, 0, 1, nan, 1, nan],
        "expected_soft_precision": [
            {0.0: 0.5, 1.0: 0.5},
            {1.0: 0.5, 0.0: 0.0},
            {1.0: 1.0, 0.0: 0.0},
        ],
        "expected_soft_recall": [
            {0.0: 2 / 3, 1.0: 1 / 3},
            {0.0: 1 / 3, 1.0: 2 / 3},
            {0.0: 1 / 3, 1.0: 1.0},
        ],
        "expected_soft_f1": [
            {0.0: 0.5714285714285715, 1.0: 0.4},
            {0.0: 0.0, 1.0: 0.5714285714285715},
            {0.0: 0.0, 1.0: 1.0},
        ],
    }


@pytest.fixture
def range_based_input_values():
    """Provides binary labels and expected range-based scores, one per positional bias."""
    return {
        "y_true": [0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0],
        "y_pred": [0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0],
        # Expected precision values with different positional biases
        "expected_range_based_precision_flat_positional_bias": 0.938,
        "expected_range_based_precision_front_positional_bias": 0.9,
        "expected_range_based_precision_middle_positional_bias": 0.958,
        "expected_range_based_precision_back_positional_bias": 0.975,
        # Expected recall values with different positional biases
        "expected_range_based_recall_flat_positional_bias": 0.817,
        "expected_range_based_recall_front_positional_bias": 0.908,
        "expected_range_based_recall_middle_positional_bias": 0.854,
        "expected_range_based_recall_back_positional_bias": 0.725,
    }


def test_precision_score(input_values):
    """Checks ``precision_score`` for every supported input container."""
    labels = input_values["y_true"]
    predictions = input_values["y_pred"]
    expected = input_values["expected_precision"]

    # Same data as a list, a numpy array, a pandas Series and a pandas DataFrame.
    for as_container in (list, np.array, pd.Series, pd.DataFrame):
        result = precision_score(
            y_true=as_container(labels),
            y_pred=as_container(predictions))
        assert result == expected


def test_recall_score(input_values):
    """Checks ``recall_score`` for every supported input container."""
    labels = input_values["y_true"]
    predictions = input_values["y_pred"]
    expected = input_values["expected_recall"]

    # Same data as a list, a numpy array, a pandas Series and a pandas DataFrame.
    for as_container in (list, np.array, pd.Series, pd.DataFrame):
        result = recall_score(
            y_true=as_container(labels),
            y_pred=as_container(predictions))
        assert result == expected


def test_f1_score(input_values):
    """Checks ``f1_score``, rounded to two decimals, for every supported input container."""
    labels = input_values["y_true"]
    predictions = input_values["y_pred"]
    expected = input_values["expected_f1_score"]

    # Same data as a list, a numpy array, a pandas Series and a pandas DataFrame.
    for as_container in (list, np.array, pd.Series, pd.DataFrame):
        scores = f1_score(
            y_true=as_container(labels),
            y_pred=as_container(predictions))
        rounded = {label: round(score, 2) for label, score in scores.items()}
        assert rounded == expected


def test_confusion_matrix(input_values):
    """Checks the cell values of ``confusion_matrix`` against the expected array."""
    result = confusion_matrix(
        y_true=input_values["y_true"],
        y_pred=input_values["y_pred"])
    cells_match = result.values == input_values["expected_confusion_matrix"]
    assert cells_match.all().all()


def test_error_and_warnings():
    """Checks the errors and warnings raised by ``precision_score`` on bad input."""
    # Inputs that must be rejected: (y_true, y_pred, expected message).
    rejected = [
        # Not 1-D array.
        ([[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]],
         "The input for scoring must be 1"),
        # Not equal length.
        ([1, 2, 3], [1, 2],
         "The input lengths must be the same, found"),
    ]
    for y_true, y_pred, message in rejected:
        with pytest.raises(ValueError, match=message):
            precision_score(y_true=y_true, y_pred=y_pred)

    # Inputs that are accepted with a warning: (y_true, y_pred, expected message).
    warned = [
        # A predicted category never occurs in ``y_true``.
        ([1, 2, 2], [1, 2, 4],
         "The following categories do not appear in y_true column,"),
        # A true category never occurs in ``y_pred``.
        ([1, 2, 3], [1, 2, 2],
         "The following categories do not appear in y_pred column,"),
    ]
    for y_true, y_pred, message in warned:
        with pytest.warns(UserWarning, match=message):
            precision_score(y_true=y_true, y_pred=y_pred)


def test_sample_weight(input_values):
    """Checks ``precision_score`` when ``sample_weight`` is given (list input)."""
    result = precision_score(
        y_true=input_values["y_true"],
        y_pred=input_values["y_pred"],
        sample_weight=input_values["sample_weight"])
    assert result == input_values["expected_precision_with_weight"]


def test_soft_precision_score(soft_input_values):
    """Checks ``soft_precision_score`` for windows 0, 1, 2 and every supported input container."""
    labels = soft_input_values["y_true"]
    predictions = soft_input_values["y_pred"]
    expected = soft_input_values["expected_soft_precision"]

    # Same data as a list, a numpy array, a pandas Series and a pandas DataFrame.
    for as_container in (list, np.array, pd.Series, pd.DataFrame):
        scores = [
            soft_precision_score(
                y_true=as_container(labels),
                y_pred=as_container(predictions),
                window=window)
            for window in [0, 1, 2]]
        assert scores == expected
def test_soft_f1(soft_input_values):
    """Checks ``soft_f1_score`` for windows 0, 1, 2 and every supported input container."""
    y_true = soft_input_values["y_true"]
    y_pred = soft_input_values["y_pred"]
    expected_soft_f1 = soft_input_values["expected_soft_f1"]

    # Same data as a list, a numpy array, a pandas Series and a pandas DataFrame.
    for as_container in (list, np.array, pd.Series, pd.DataFrame):
        soft_f1 = [
            soft_f1_score(
                y_true=as_container(y_true),
                y_pred=as_container(y_pred),
                window=window)
            for window in [0, 1, 2]]
        assert soft_f1 == expected_soft_f1


def test_range_based_precision_score(range_based_input_values):
    """Tests for range_based_precision_score function"""
    y_true = range_based_input_values["y_true"]
    y_pred = range_based_input_values["y_pred"]

    # Tests range-based precision with every positional bias.
    # The bias name is attached to the assertion so a failure says which one broke.
    for positional_bias in ("flat", "front", "middle", "back"):
        expected_precision = range_based_input_values[
            f"expected_range_based_precision_{positional_bias}_positional_bias"]
        range_based_precision = range_based_precision_score(
            y_true=y_true,
            y_pred=y_pred,
            positional_bias=positional_bias)
        assert round(range_based_precision, 3) == expected_precision, positional_bias

    # Tests if the range_based implementation subsumes the classical precision implementation
    classical_precision = precision_score(
        y_true=pd.Series(y_true),
        y_pred=pd.Series(y_pred))
    precision = range_based_precision_score(
        y_true=y_true,
        y_pred=y_pred,
        range_based=False)
    # ``precision_score`` returns one value per label; label ``1`` is the anomaly class here.
    assert round(precision, 3) == round(classical_precision[1], 3)


def test_range_based_recall_score(range_based_input_values):
    """Tests for range_based_recall_score function"""
    y_true = range_based_input_values["y_true"]
    y_pred = range_based_input_values["y_pred"]

    # Tests range-based recall with every positional bias.
    # The bias name is attached to the assertion so a failure says which one broke.
    for positional_bias in ("flat", "front", "middle", "back"):
        expected_recall = range_based_input_values[
            f"expected_range_based_recall_{positional_bias}_positional_bias"]
        range_based_recall = range_based_recall_score(
            y_true=y_true,
            y_pred=y_pred,
            positional_bias=positional_bias)
        assert round(range_based_recall, 3) == expected_recall, positional_bias

    # Tests if the range_based implementation subsumes the classical recall implementation
    classical_recall = recall_score(
        y_true=pd.Series(y_true),
        y_pred=pd.Series(y_pred))
    recall = range_based_recall_score(
        y_true=y_true,
        y_pred=y_pred,
        range_based=False)
    # ``recall_score`` returns one value per label; label ``1`` is the anomaly class here.
    assert round(recall, 3) == round(classical_recall[1], 3)


def test_matthews_corrcoef(input_values):
    """Checks ``matthews_corrcoef`` for every supported input container and with sample weights."""
    y_true = input_values["y_true"]
    y_pred = input_values["y_pred"]
    sample_weight = input_values["sample_weight"]
    expected_mcc = input_values["expected_matthews_corrcoef"]
    expected_mcc_with_weight = input_values["expected_matthews_corrcoef_with_weight"]

    # Same data as a list, a numpy array, a pandas Series and a pandas DataFrame.
    for as_container in (list, np.array, pd.Series, pd.DataFrame):
        mcc = matthews_corrcoef(
            y_true=as_container(y_true),
            y_pred=as_container(y_pred))
        assert mcc == pytest.approx(expected_mcc)

    # Tests expected result when using sample weights.
    mcc = matthews_corrcoef(
        y_true=y_true,
        y_pred=y_pred,
        sample_weight=sample_weight)
    assert mcc == pytest.approx(expected_mcc_with_weight)


def test_informedness_statistic(input_values):
    """Checks ``informedness_statistic`` for every input container, with weights, and on binary labels."""
    y_true = input_values["y_true"]
    y_pred = input_values["y_pred"]
    sample_weight = input_values["sample_weight"]
    expected_informedness = input_values["expected_informedness_statistic"]
    expected_informedness_with_weight = input_values["expected_informedness_statistic_with_weight"]

    # Same data as a list, a numpy array, a pandas Series and a pandas DataFrame.
    for as_container in (list, np.array, pd.Series, pd.DataFrame):
        informedness = informedness_statistic(
            y_true=as_container(y_true),
            y_pred=as_container(y_pred))
        assert informedness == pytest.approx(expected_informedness)

    # Tests expected result when using sample weights.
    informedness = informedness_statistic(
        y_true=y_true,
        y_pred=y_pred,
        sample_weight=sample_weight)
    assert informedness == pytest.approx(expected_informedness_with_weight)

    # Tests that informedness_statistic returns the same results as
    # sensitivity + specificity - 1 for binary output.
    y_true = ["0", "0", "0", "0", "0", "1", "1", "1"]
    y_pred = ["0", "0", "0", "1", "1", "1", "1", "1"]
    informedness = informedness_statistic(
        y_true=y_true,
        y_pred=y_pred,
        sample_weight=None)
    recalls = recall_score(
        y_true=y_true,
        y_pred=y_pred,
        sample_weight=None)
    # Both sides are floating-point sums, so compare with a tolerance
    # instead of exact equality (consistent with the assertions above).
    assert informedness == pytest.approx(sum(recalls.values()) - 1.0)
def test_get_cardinality_factor():
    """Tests for get_cardinality_factor function"""
    # Default cardinality bias with a single overlapping range.
    assert get_cardinality_factor(overlap_count=[1]) == 1.0

    # When cardinality bias is set to "reciprocal", return the reciprocal of x where x is the number of
    # overlapping anomaly ranges with a certain anomaly range.
    # An example of this is when there is a real anomaly range, with two predicted anomaly ranges overlapping
    # with it. In this case, the cardinality factor should be 1/2
    assert get_cardinality_factor(overlap_count=[2], cardinality_bias="reciprocal") == 0.5


def test_get_positional_reward():
    """Tests for get_positional_reward function"""
    # Expected reward of a pointwise anomaly at position ``loc`` inside an
    # anomaly range of length 5, for each positional bias:
    #   "flat"   - a fixed value (1.0) wherever the point is;
    #   "front"  - higher the earlier the point is in the range;
    #   "middle" - higher the closer the point is to the middle of the range;
    #   "back"   - higher the later the point is in the range.
    expected_reward = {
        "flat": {1: 1.0, 3: 1.0, 5: 1.0},
        "front": {1: 5.0, 5: 1.0},
        "middle": {1: 1.0, 3: 3.0, 5: 1.0},
        "back": {1: 1.0, 5: 5.0},
    }
    for positional_bias, reward_by_loc in expected_reward.items():
        for loc, expected in reward_by_loc.items():
            positional_reward = get_positional_reward(
                loc=loc,
                anomaly_length=5,
                positional_bias=positional_bias)
            assert positional_reward == expected


def test_get_overlap_size_and_position_reward():
    """Tests for get_overlap_size_and_position_reward function"""
    # No overlap produces an overlap_size_and_position_reward of zero
    reward = get_overlap_size_and_position_reward(
        anomaly_range_1=np.array([3, 8]),
        anomaly_range_2=np.array([9, 10]),
        overlap_count=[0],
        positional_bias="flat")
    assert reward == 0

    # Each scenario: bounds of anomaly_range_2 and the expected reward (3 decimals) per bias.
    scenarios = [
        # Overlap happens at the beginning of anomaly_range_1:
        # "front" yields a higher reward than "middle" or "back".
        ([3, 5], {"front": 0.714, "middle": 0.50, "back": 0.286}),
        # Overlap happens at the end of anomaly_range_1:
        # "back" yields a higher reward than "middle" or "front".
        ([5, 9], {"front": 0.476, "middle": 0.75, "back": 0.857}),
    ]
    for range_2_bounds, expected_by_bias in scenarios:
        anomaly_range_1 = np.array([3, 8])
        anomaly_range_2 = np.array(range_2_bounds)
        for positional_bias, expected in expected_by_bias.items():
            reward = get_overlap_size_and_position_reward(
                anomaly_range_1=anomaly_range_1,
                anomaly_range_2=anomaly_range_2,
                # A new list on every call, so no state carries over between calls.
                overlap_count=[0],
                positional_bias=positional_bias)
            assert round(reward, 3) == expected
positional_bias="middle") + precision = compute_range_based_score( + predicted_anomaly_ranges, + real_anomaly_ranges, + alpha=0.5, + positional_bias="middle") + assert round(precision, 3) == 0.972 + assert round(recall, 3) == 0.817 + + recall = compute_range_based_score( + real_anomaly_ranges, + predicted_anomaly_ranges, + alpha=0.5, positional_bias="back") + precision = compute_range_based_score( + predicted_anomaly_ranges, + real_anomaly_ranges, + alpha=0.5, positional_bias="back") + assert round(precision, 3) == 0.983 + assert round(recall, 3) == 0.746 + + recall = compute_range_based_score( + real_anomaly_ranges, + predicted_anomaly_ranges, + alpha=0.5, + positional_bias="front", + cardinality_bias="reciprocal") + precision = compute_range_based_score( + predicted_anomaly_ranges, + real_anomaly_ranges, + alpha=0.5, + positional_bias="front", + cardinality_bias="reciprocal") + assert round(precision, 3) == 0.933 + assert round(recall, 3) == 0.783 diff --git a/greykite/tests/detection/common/test_pickler.py b/greykite/tests/detection/common/test_pickler.py new file mode 100644 index 0000000..17cb353 --- /dev/null +++ b/greykite/tests/detection/common/test_pickler.py @@ -0,0 +1,640 @@ +import datetime +import inspect +import sys +from collections import OrderedDict + +import dill +import numpy as np +import pandas as pd +import pytest +from patsy.desc import Term + +from greykite.common.constants import ACTUAL_COL +from greykite.common.constants import ANOMALY_COL +from greykite.common.constants import PREDICTED_ANOMALY_COL +from greykite.common.constants import PREDICTED_COL +from greykite.common.constants import PREDICTED_LOWER_COL +from greykite.common.constants import PREDICTED_UPPER_COL +from greykite.common.constants import TIME_COL +from greykite.common.constants import VALUE_COL +from greykite.common.python_utils import assert_equal +from greykite.common.testing_utils import generate_df_for_tests +from greykite.common.testing_utils_anomalies import 
contaminate_df_with_anomalies +from greykite.common.viz.timeseries_annotate import plot_anomalies_over_forecast_vs_actual +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.common.ad_evaluation import precision_score +from greykite.detection.common.ad_evaluation import recall_score +from greykite.detection.common.pickler import GreykitePickler +from greykite.detection.detector.ad_utils import partial_return +from greykite.detection.detector.config import ADConfig +from greykite.detection.detector.constants import FIG_SHOW +from greykite.detection.detector.data import DetectorData as Data +from greykite.detection.detector.greykite import DETECTOR_PREDICT_COLS +from greykite.detection.detector.greykite import GreykiteDetector +from greykite.detection.detector.reward import Reward +from greykite.framework.templates.autogen.forecast_config import ComputationParam +from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam +from greykite.framework.templates.autogen.forecast_config import ForecastConfig +from greykite.framework.templates.autogen.forecast_config import MetadataParam +from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam +from greykite.framework.templates.forecaster import Forecaster +from greykite.framework.templates.model_templates import ModelTemplateEnum +from greykite.framework.utils.result_summary import summarize_grid_search_results + + +# Evaluation metrics used in the tests. +# F1 score for the True label. +f1_calc = partial_return(f1_score, True) +# Precision score, for the True label. +calc_precision = partial_return(precision_score, True) +# Recall score for the True label. 
calc_recall = partial_return(recall_score, True)


@pytest.fixture(scope="module")
def daily_data():
    """Generates data for testing `GreykiteDetector`.

    Returns
    -------
    data : `dict`
        ``"df"`` is the full 200-row daily frame with anomalies injected,
        ``"df_train"`` its first 100 rows and ``"df_test"`` the remaining 100,
        both with a reset index.
    """
    df = generate_df_for_tests(
        freq="D",
        train_start_date=datetime.datetime(2020, 1, 1),
        intercept=50,
        train_frac=0.99,
        periods=200)["df"]

    # Row positions of the injected anomaly blocks (41 rows in total).
    anomaly_block_list = [
        np.arange(10, 15),
        np.arange(33, 35),
        np.arange(60, 65),
        np.arange(82, 85),
        np.arange(94, 98),
        np.arange(100, 105),
        np.arange(111, 113),
        np.arange(125, 130),
        np.arange(160, 163),
        np.arange(185, 190),
        np.arange(198, 200)]

    # Contaminates `df` with anomalies at the specified locations,
    # via `anomaly_block_list`.
    # If original value is y, the anomalous value is: (1 +/- delta)*y.
    df = contaminate_df_with_anomalies(
        df=df,
        anomaly_block_list=anomaly_block_list,
        delta_range_lower=0.25,
        delta_range_upper=0.5,
        value_col=VALUE_COL,
        min_admissible_value=None,
        max_admissible_value=None)

    # The contaminated series replaces the clean one as the value column.
    df = df.drop(columns=[VALUE_COL]).rename(
        columns={"contaminated_y": VALUE_COL})
    # Converts the anomaly indicator to boolean.
    df[ANOMALY_COL] = (df[ANOMALY_COL] == 1)

    assert len(df) == 200
    assert sum(df[ANOMALY_COL]) == 41

    train_size = 100
    df_train = df[:train_size].reset_index(drop=True)
    df_test = df[train_size:].reset_index(drop=True)

    assert len(df_train) == 100
    assert len(df_test) == 100

    return {
        "df_train": df_train,
        "df_test": df_test,
        "df": df}


@pytest.fixture(scope="module")
def forecast_config_info_daily():
    """Generates ``forecast_config`` for testing."""
    metadata = MetadataParam(
        time_col=TIME_COL,
        value_col=VALUE_COL,
        train_end_date=None,
        anomaly_info=None)

    evaluation_period = EvaluationPeriodParam(
        test_horizon=0,
        cv_max_splits=0)

    model_components = ModelComponentsParam(
        autoregression={
            "autoreg_dict": {
                "lag_dict": {"orders": [7]},
                "agg_lag_dict": None}},
        events={
            "auto_holiday": False,
            "holiday_lookup_countries": ["US"],
            "holiday_pre_num_days": 2,
            "holiday_post_num_days": 2,
            "daily_event_df_dict": None},
        custom={
            "extra_pred_cols": ["dow"],
            "min_admissible_value": 0,
            "normalize_method": "zero_to_one"})

    return ForecastConfig(
        model_template="SILVERKITE_EMPTY",
        metadata_param=metadata,
        coverage=None,
        evaluation_period_param=evaluation_period,
        forecast_horizon=1,
        model_components_param=model_components)


class X:
    """Minimal helper object holding a single attribute ``a``."""

    def __init__(self, a):
        self.a = a


class TestClass:
    """Helper object whose attributes nest ``Term`` instances as dictionary keys and values.

    This is a data holder, not a test case.
    """
    # The name matches pytest's default ``Test*`` class pattern; because the
    # class defines ``__init__`` pytest would emit a collection warning.
    # Opt out of collection explicitly.
    __test__ = False

    def __init__(self, a, b):
        self.a = a
        self.b = X(b)
        self.c = {
            Term([]): None,
            Term(["a", "b"]): [3, 4, 5]
        }
        self.d = {
            Term([]): {
                "d_1_1": 1,
                Term([]): Term(["a", "b"])
            },
            "d_2": [1, 2]
        }


def test_init():
    """Tests initialization."""
    pickler = GreykitePickler()
    assert pickler.obj is None


def test_integer():
    """Tests pickling and unpickling of integers."""
    pickler = GreykitePickler()
    obj = 1
    serialized = pickler.dumps(obj)
    assert serialized == {"ROOT.pkl": 'gARLAS4=\n'}
    assert pickler.obj == obj
    deserialized = pickler.loads(serialized)
    assert deserialized == obj


def test_list():
    """Tests pickling and unpickling of lists."""
    # Checks simple lists that can be serialized by dill.
    obj = [1, 2, 3]
    serialized_by_dill = GreykitePickler.dumps_to_str(obj)
    pickler = GreykitePickler()
    serialized = pickler.dumps(obj)
    assert serialized == {"ROOT.pkl": serialized_by_dill}
    deserialized = pickler.loads(serialized)
    assert deserialized == obj

    # Checks complex lists that cannot be serialized by dill.
    obj = [Term([]), Term(["a", "b"]), 2]
    with pytest.raises(NotImplementedError):
        dill.dumps(obj)
    pickler = GreykitePickler()
    serialized = pickler.dumps(obj)
    deserialized = pickler.loads(serialized)
    assert isinstance(deserialized, list)
    assert deserialized == obj


def test_tuple():
    """Tests pickling and unpickling of tuples."""
    # Checks simple tuples that can be serialized by dill.
    obj = (1, 2, 3)
    serialized_by_dill = GreykitePickler.dumps_to_str(obj)
    pickler = GreykitePickler()
    serialized = pickler.dumps(obj)
    assert serialized == {"ROOT.pkl": serialized_by_dill}
    deserialized = pickler.loads(serialized)
    assert deserialized == obj

    # Checks complex tuples that cannot be serialized by dill.
    obj = (Term([]), Term(["a", "b"]), 2)
    with pytest.raises(NotImplementedError):
        dill.dumps(obj)
    pickler = GreykitePickler()
    serialized = pickler.dumps(obj)
    deserialized = pickler.loads(serialized)
    assert isinstance(deserialized, tuple)
    assert deserialized == obj


def test_dict():
    """Tests pickling and unpickling of dictionaries."""
    # Checks simple dictionaries that can be serialized by dill.
    obj = {"key1": X(1)}
    serialized_by_dill = GreykitePickler.dumps_to_str(obj)
    pickler = GreykitePickler()
    serialized = pickler.dumps(obj)
    assert serialized == {"ROOT.pkl": serialized_by_dill}
    deserialized = pickler.loads(serialized)
    assert isinstance(deserialized, dict)
    assert deserialized.keys() == obj.keys()
    assert deserialized["key1"].a == obj["key1"].a

    # Checks complex dictionaries that cannot be serialized by dill.
    obj = {"key1": Term([])}
    with pytest.raises(NotImplementedError):
        dill.dumps(obj)
    pickler = GreykitePickler()
    serialized = pickler.dumps(obj)
    deserialized = pickler.loads(serialized)
    assert isinstance(deserialized, dict)
    assert deserialized.keys() == obj.keys()
    assert deserialized["key1"].factors == obj["key1"].factors


def test_ordered_dict():
    """Tests pickling and unpickling of ordered dictionaries."""
    # Checks simple ordered dictionaries that can be serialized by dill.
    obj = OrderedDict({"a": 1, X(2): 3, 5: ["b"]})
    serialized_by_dill = GreykitePickler.dumps_to_str(obj)
    pickler = GreykitePickler()
    serialized = pickler.dumps(obj)
    assert serialized == {"ROOT.pkl": serialized_by_dill}
    deserialized = pickler.loads(serialized)
    assert isinstance(deserialized, OrderedDict)
    assert deserialized["a"] == obj["a"]
    assert deserialized[5] == obj[5]

    # Checks complex ordered dictionaries that cannot be serialized by dill.
    obj = OrderedDict({"a": 1, Term([]): 3, 5: ["b"]})
    with pytest.raises(NotImplementedError):
        dill.dumps(obj)
    pickler = GreykitePickler()
    serialized = pickler.dumps(obj)
    deserialized = pickler.loads(serialized)
    # NOTE(review): only ``dict`` is asserted on this path, unlike the simple
    # case above which asserts ``OrderedDict`` - confirm whether the exact
    # type is expected to survive here.
    assert isinstance(deserialized, dict)
    assert deserialized["a"] == obj["a"]
    assert deserialized[5] == obj[5]
+ obj = Term([]) + with pytest.raises(NotImplementedError): + dill.dumps(obj) + pickler = GreykitePickler() + serialized = pickler.dumps(obj) + deserialized = pickler.loads(serialized) + assert deserialized.__class__ == obj.__class__ + assert deserialized.__dict__ == obj.__dict__ + + +def test_silverkite_forecast_result(): + """Tests pickling and unpickling of Silverkite ForecastResult.""" + df = generate_df_for_tests( + freq="D", + periods=365)["df"] + forecaster = Forecaster() + # Run the forecast + result = forecaster.run_forecast_config( + df=df, # includes the regressor + config=ForecastConfig( + model_template=ModelTemplateEnum.SILVERKITE.name, + forecast_horizon=7, + coverage=0.8, + metadata_param=MetadataParam( + time_col="ts", + value_col="y", + freq="D" + ), + evaluation_period_param=EvaluationPeriodParam( + cv_max_splits=1, + cv_horizon=7, + test_horizon=7, + cv_min_train_periods=80 + ), + model_components_param=ModelComponentsParam( + custom={"fit_algorithm_dict": {"fit_algorithm": "linear"}}, + autoregression={"autoreg_dict": "auto"} + ), + computation_param=ComputationParam(n_jobs=-1), + ) + ) + pickler = GreykitePickler() + serialized = pickler.dumps(result) + deserialized = pickler.loads(serialized) + + # Tests loaded results. + # Grid search cv results. + assert_equal( + summarize_grid_search_results(result.grid_search), + summarize_grid_search_results(deserialized.grid_search) + ) + # Grid search attributes. + for key in result.grid_search.__dict__.keys(): + if key not in ["scoring", "estimator", "refit", "cv", "error_score", "cv_results_", + "scorer_", "best_estimator_"]: + assert_equal( + result.grid_search.__dict__[key], + deserialized.grid_search.__dict__[key]) + + # Model. + assert_equal( + result.model[-1].predict(df), + deserialized.model[-1].predict(df) + ) + assert result.model[-1].model_dict["x_design_info"] is not None + # Model: estimator. 
+ for key in result.model[-1].__dict__.keys(): + if key not in ["score_func", "silverkite", "silverkite_diagnostics", "model_dict"]: + assert_equal( + result.model[-1].__dict__[key], + deserialized.model[-1].__dict__[key]) + assert_equal( + inspect.getsource(result.model[-1].__dict__["score_func"]), + inspect.getsource(deserialized.model[-1].__dict__["score_func"]) + ) + # Model: estimator/model_dict. + for key in result.model[-1].model_dict.keys(): + # Functions and classes are not testable. + if key not in ["x_design_info", "fs_func", "ml_model", "plt_pred", + "autoreg_dict", "changepoint_detector", "autoreg_func", "normalize_df_func"]: + assert_equal( + result.model[-1].model_dict[key], + deserialized.model[-1].model_dict[key]) + # Tests function source code. + elif key in ["fs_func", "plt_pred", "autoreg_func", "normalize_df_func"]: + assert_equal( + inspect.getsource(result.model[-1].model_dict[key]), + inspect.getsource(deserialized.model[-1].model_dict[key])) + # Model: estimator/model_dict/autoreg_dict. + for key in result.model[-1].model_dict["autoreg_dict"].keys(): + if key not in ["series_na_fill_func"]: + assert_equal( + result.model[-1].model_dict["autoreg_dict"][key], + deserialized.model[-1].model_dict["autoreg_dict"][key]) + assert_equal( + inspect.getsource(result.model[-1].model_dict["autoreg_dict"]["series_na_fill_func"]), + inspect.getsource(deserialized.model[-1].model_dict["autoreg_dict"]["series_na_fill_func"])) + + # Forecast. + assert_equal( + result.forecast.estimator.predict(df), + deserialized.forecast.estimator.predict(df) + ) + assert result.forecast.estimator.model_dict["x_design_info"] is not None + # Forecast: attributes. 
+ for key in result.forecast.__dict__.keys(): + if key not in ["r2_loss_function", "estimator"]: + assert_equal( + result.forecast.__dict__[key], + deserialized.forecast.__dict__[key]) + assert_equal( + inspect.getsource(result.forecast.__dict__["r2_loss_function"]), + inspect.getsource(deserialized.forecast.__dict__["r2_loss_function"])) + # Forecast: estimator. + for key in result.forecast.estimator.__dict__.keys(): + if key not in ["score_func", "silverkite", "silverkite_diagnostics", "model_dict"]: + assert_equal( + result.forecast.estimator.__dict__[key], + deserialized.forecast.estimator.__dict__[key]) + assert_equal( + inspect.getsource(result.forecast.estimator.__dict__["score_func"]), + inspect.getsource(deserialized.forecast.estimator.__dict__["score_func"]) + ) + # Model: estimator/model_dict + for key in result.forecast.estimator.model_dict.keys(): + # Functions and classes are not testable. + if key not in ["x_design_info", "fs_func", "ml_model", "plt_pred", + "autoreg_dict", "changepoint_detector", "autoreg_func", "normalize_df_func"]: + assert_equal( + result.forecast.estimator.model_dict[key], + deserialized.forecast.estimator.model_dict[key]) + # Tests function source code. + elif key in ["fs_func", "plt_pred", "autoreg_func", "normalize_df_func"]: + assert_equal( + inspect.getsource(result.forecast.estimator.model_dict[key]), + inspect.getsource(deserialized.forecast.estimator.model_dict[key])) + # Model: estimator/model_dict/autoreg_dict. + for key in result.forecast.estimator.model_dict["autoreg_dict"].keys(): + if key not in ["series_na_fill_func"]: + assert_equal( + result.forecast.estimator.model_dict["autoreg_dict"][key], + deserialized.forecast.estimator.model_dict["autoreg_dict"][key]) + assert_equal( + inspect.getsource(result.forecast.estimator.model_dict["autoreg_dict"]["series_na_fill_func"]), + inspect.getsource(deserialized.forecast.estimator.model_dict["autoreg_dict"]["series_na_fill_func"])) + + # Backtest. 
+ assert_equal( + result.backtest.estimator.predict(df), + deserialized.backtest.estimator.predict(df) + ) + assert result.backtest.estimator.model_dict["x_design_info"] is not None + # Backtest: attributes. + for key in result.backtest.__dict__.keys(): + if key not in ["r2_loss_function", "estimator"]: + assert_equal( + result.backtest.__dict__[key], + deserialized.backtest.__dict__[key]) + assert_equal( + inspect.getsource(result.backtest.__dict__["r2_loss_function"]), + inspect.getsource(deserialized.backtest.__dict__["r2_loss_function"])) + # Backtest: estimator. + for key in result.backtest.estimator.__dict__.keys(): + if key not in ["score_func", "silverkite", "silverkite_diagnostics", "model_dict"]: + assert_equal( + result.backtest.estimator.__dict__[key], + deserialized.backtest.estimator.__dict__[key]) + assert_equal( + inspect.getsource(result.backtest.estimator.__dict__["score_func"]), + inspect.getsource(deserialized.backtest.estimator.__dict__["score_func"]) + ) + # Model: estimator/model_dict. + for key in result.backtest.estimator.model_dict.keys(): + # Functions and classes are not testable. + if key not in ["x_design_info", "fs_func", "ml_model", "plt_pred", + "autoreg_dict", "changepoint_detector", "autoreg_func", "normalize_df_func"]: + assert_equal( + result.backtest.estimator.model_dict[key], + deserialized.backtest.estimator.model_dict[key]) + # Tests function source code. + elif key in ["fs_func", "plt_pred", "autoreg_func", "normalize_df_func"]: + assert_equal( + inspect.getsource(result.backtest.estimator.model_dict[key]), + inspect.getsource(deserialized.backtest.estimator.model_dict[key])) + # Model: estimator/model_dict/autoreg_dict. 
+ for key in result.backtest.estimator.model_dict["autoreg_dict"].keys(): + if key not in ["series_na_fill_func"]: + assert_equal( + result.backtest.estimator.model_dict["autoreg_dict"][key], + deserialized.backtest.estimator.model_dict["autoreg_dict"][key]) + assert_equal( + inspect.getsource(result.backtest.estimator.model_dict["autoreg_dict"]["series_na_fill_func"]), + inspect.getsource(deserialized.backtest.estimator.model_dict["autoreg_dict"]["series_na_fill_func"])) + + # Timeseries. + for key in result.timeseries.__dict__.keys(): + assert_equal( + result.timeseries.__dict__[key], + deserialized.timeseries.__dict__[key]) + + # Checks the size of the serialized object in megabytes. + memory_size_mb = sys.getsizeof(serialized)*1e-6 + assert memory_size_mb < 64.0 + + +def test_silverkite_ad_result(daily_data, forecast_config_info_daily): + """Tests pickling and unpickling of Greykite anomaly detector.""" + df_train = daily_data["df_train"] + df_test = daily_data["df_test"] + df = daily_data["df"] + + forecast_config = forecast_config_info_daily + ad_config = ADConfig( + volatility_features_list=[["dow"], ["is_weekend"]], + coverage_grid=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + target_anomaly_percent=None, + variance_scaling=True) + + train_data = Data(df=df_train) + + def reward_func(data): + return f1_calc( + y_true=data.y_true, + y_pred=data.y_pred) + reward = Reward(reward_func) + + # Trains the anomaly detector. + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=reward) + detector.fit(data=train_data) + fit_data = detector.fit_info["best_calc_result"].data + fit_df = fit_data.pred_df + + # Pickles and deserializes the anomaly detector. + pickler = GreykitePickler() + serialized = pickler.dumps(detector) + deserialized = pickler.loads(serialized) + + # Checks that the original and deserialized anomaly detector train results are the same. 
+ deserialized_fit_data = deserialized.fit_info["best_calc_result"].data + deserialized_fit_df = deserialized_fit_data.pred_df + + # Checks if we get the expected columns in the fit data. + assert list(fit_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + assert list(deserialized_fit_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + + fit_obj_value = detector.reward.apply(fit_data) + deserialized_fit_obj_value = deserialized.reward.apply(deserialized_fit_data) + assert fit_obj_value == deserialized_fit_obj_value + + fit_recall = calc_recall( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + deserialized_f1_recall = calc_recall( + y_true=deserialized_fit_data.y_true, + y_pred=deserialized_fit_data.y_pred) + assert fit_recall == deserialized_f1_recall + + fit_precision = calc_precision( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + deserialized_f1_precision = calc_precision( + y_true=deserialized_fit_data.y_true, + y_pred=deserialized_fit_data.y_pred) + assert fit_precision == deserialized_f1_precision + + # Predicts on the test data with the original and deserialized anomaly detector. + test_data = Data( + df=df_test, + y_true=df_test[ANOMALY_COL]) + test_data = detector.predict(test_data) + pred_df = test_data.pred_df + + deserialized_test_data = deserialized.predict(test_data) + deserialized_pred_df = deserialized_test_data.pred_df + + # Checks that the original and deserialized anomaly detector test results are the same. 
+ assert list(pred_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + assert list(deserialized_pred_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + + test_obj_value = detector.reward.apply(test_data) + deserialized_test_obj_value = deserialized.reward.apply(deserialized_test_data) + assert test_obj_value == deserialized_test_obj_value + + test_recall = calc_recall( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + deserialized_test_recall = calc_recall( + y_true=deserialized_test_data.y_true, + y_pred=deserialized_test_data.y_pred) + assert test_recall == deserialized_test_recall + + test_precision = calc_precision( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + deserialized_test_precision = calc_precision( + y_true=deserialized_test_data.y_true, + y_pred=deserialized_test_data.y_pred) + assert test_precision == deserialized_test_precision + + # Checks the size of the serialized object. + memory_size_mb = sys.getsizeof(serialized)*1e-6 + assert memory_size_mb < 64.0 + + # Plots the fit and test results of the original and deserialized anomaly detector. + # This provides a visual check that the results are the same. + fit_pred_df = pd.concat([fit_df, pred_df], axis=0) + fit_pred_df[ANOMALY_COL] = df[ANOMALY_COL] + "Test of train and predict of the Greykite Detector." 
+ fig = plot_anomalies_over_forecast_vs_actual( + df=fit_pred_df, + time_col=TIME_COL, + actual_col=ACTUAL_COL, + predicted_col=PREDICTED_COL, + predicted_anomaly_col=PREDICTED_ANOMALY_COL, + anomaly_col=ANOMALY_COL, + marker_opacity=0.6, + predicted_anomaly_marker_color="black", + anomaly_marker_color="green", + predicted_lower_col=PREDICTED_LOWER_COL, + predicted_upper_col=PREDICTED_UPPER_COL, + train_end_date=fit_df[TIME_COL].max(), + title="Test of train and predict of the Greykite Detector.") + assert fig is not None + if FIG_SHOW: + fig.show() + + deserialized_pred_df = pd.concat([deserialized_fit_df, deserialized_pred_df], axis=0) + deserialized_pred_df[ANOMALY_COL] = df[ANOMALY_COL] + fig = plot_anomalies_over_forecast_vs_actual( + df=deserialized_pred_df, + time_col=TIME_COL, + actual_col=ACTUAL_COL, + predicted_col=PREDICTED_COL, + predicted_anomaly_col=PREDICTED_ANOMALY_COL, + anomaly_col=ANOMALY_COL, + marker_opacity=0.6, + predicted_anomaly_marker_color="black", + anomaly_marker_color="red", + predicted_lower_col=PREDICTED_LOWER_COL, + predicted_upper_col=PREDICTED_UPPER_COL, + train_end_date=fit_df[TIME_COL].max(), + title="Test of train and predict of the deserialized Greykite Detector.") + assert fig is not None + if FIG_SHOW: + fig.show() diff --git a/greykite/tests/detection/detector/test_ad_utils.py b/greykite/tests/detection/detector/test_ad_utils.py new file mode 100644 index 0000000..b4bcc7c --- /dev/null +++ b/greykite/tests/detection/detector/test_ad_utils.py @@ -0,0 +1,550 @@ +import itertools + +import numpy as np +import pandas as pd +import pytest +from testfixtures import LogCapture + +from greykite.common.constants import ANOMALY_COL +from greykite.common.constants import END_TIME_COL +from greykite.common.constants import LOGGER_NAME +from greykite.common.constants import START_TIME_COL +from greykite.common.constants import TIME_COL +from greykite.common.constants import TimeFeaturesEnum +from greykite.common.testing_utils import 
assert_equal +from greykite.detection.detector.ad_utils import add_new_params_to_records +from greykite.detection.detector.ad_utils import get_anomaly_df +from greykite.detection.detector.ad_utils import get_anomaly_df_from_outliers +from greykite.detection.detector.ad_utils import get_canonical_anomaly_df +from greykite.detection.detector.ad_utils import get_timestamp_ceil +from greykite.detection.detector.ad_utils import get_timestamp_floor +from greykite.detection.detector.ad_utils import optimize_df_with_constraints +from greykite.detection.detector.ad_utils import partial_return +from greykite.detection.detector.ad_utils import validate_volatility_features +from greykite.detection.detector.ad_utils import vertical_concat_dfs + + +@pytest.fixture(scope="module") +def y_clean(): + """Constructs a clean vector of random numbers, used in outlier removal tests.""" + sampler = np.random.default_rng(1317) + # Defines a single clean vector: the integers 0 to 997. + y_clean = np.arange(0, 998) + # Adds small Gaussian noise (mean 0, standard deviation 1). + y_clean = y_clean + sampler.normal(loc=0.0, scale=1.0, size=len(y_clean)) + + return y_clean + + +def test_partial_return(): + """Tests `partial_return`.""" + def func(x): + return {"1": x, "2": -x, "3": x+100} + + v = partial_return(func, "1")(x=10) + assert v == 10 + + v1 = partial_return(func, "1")(10) + assert v1 == 10 + + # The case for lists. + def func(x): + return [x + 1, x + 2, x + 33] + + v = partial_return(func, 0)(x=10) + assert v == 11 + + v1 = partial_return(func, 1)(200) + assert v1 == 202 + + # The case for which index is out of bound. 
+ v2 = partial_return(func, 25)(200) + assert v2 is None + + +def test_vertical_concat_dfs(): + """Tests `vertical_concat_dfs`.""" + df0 = pd.DataFrame({ + "ts": [0, 1, 2, 3, 4], + "day": ["Mon", "Tue", "Wed", "Thu", "Fri"], + "y": [10, 20, 30, 40, 50]}) + + df1 = pd.DataFrame({ + "ts": [0, 1, 2, 3, 4], + "day": ["Mon", "Tue", "Wed", "Thu", "Fri"], + "y": [11, 21, 31, 41, 51]}) + + df = vertical_concat_dfs( + df_list=[df0, df1], + join_cols=["ts"], + common_value_cols=["day"], + different_value_cols=["y"]) + + expected_df = pd.DataFrame({ + "ts": [0, 1, 2, 3, 4], + "day": ["Mon", "Tue", "Wed", "Thu", "Fri"], + "y0": [10, 20, 30, 40, 50], + "y1": [11, 21, 31, 41, 51]}) + + assert pd.DataFrame.equals(df, expected_df) + + +def test_add_new_params_to_records(): + """Tests `add_new_params_to_records`.""" + grid_seed_dict = { + "a": [1, 2, 3], + "cat": ["boz", "asb"]} + + var_names = list(grid_seed_dict.keys()) + combinations_list = list( + itertools.product(*[grid_seed_dict[var] for var in var_names])) + + df = pd.DataFrame(combinations_list, columns=var_names) + + records = df.to_dict("records") + expanded_param_list = add_new_params_to_records( + new_params={"dog": [1, 3], "horse": [13, 17]}, + records=records) + + assert (len(expanded_param_list)) == len(records) * 2 * 2 + assert expanded_param_list[0] == {"a": 1, "cat": "boz", "dog": 1, "horse": 13} + assert expanded_param_list[-1] == {"a": 3, "cat": "asb", "dog": 3, "horse": 17} + + +def test_get_anomaly_df(): + """Tests `get_anomaly_df`.""" + # All anomalies + df = pd.DataFrame({ + TIME_COL: pd.date_range(start="2020-01-01", periods=10, freq="D"), + ANOMALY_COL: [True, True, True, True, True, True, True, True, True, True]}) + anomaly_df = get_anomaly_df( + df=df, + time_col=TIME_COL, + anomaly_col=ANOMALY_COL) + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-01"]), + END_TIME_COL: pd.to_datetime(["2020-01-10"])}) + assert_equal(anomaly_df, expected_anomaly_df) + + # No anomalies + 
df = pd.DataFrame({ + TIME_COL: pd.date_range(start="2020-01-01", periods=10, freq="D"), + ANOMALY_COL: [False, False, False, False, False, False, False, False, False, False]}) + anomaly_df = get_anomaly_df( + df=df, + time_col=TIME_COL, + anomaly_col=ANOMALY_COL) + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime([]), + END_TIME_COL: pd.to_datetime([])}) + assert_equal(anomaly_df, expected_anomaly_df) + + # Distinct anomalies (single data point and multiple data points) + df = pd.DataFrame({ + TIME_COL: pd.date_range(start="2020-01-01", periods=10, freq="D"), + ANOMALY_COL: [False, True, True, True, False, True, False, False, False, False]}) + anomaly_df = get_anomaly_df( + df=df, + time_col=TIME_COL, + anomaly_col=ANOMALY_COL) + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-02", "2020-01-06"]), + END_TIME_COL: pd.to_datetime(["2020-01-04", "2020-01-06"])}) + assert_equal(anomaly_df, expected_anomaly_df) + + +def test_get_canonical_anomaly_df(): + """Tests `get_canonical_anomaly_df`.""" + # Non-overlapping anomaly periods + anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-01", "2020-02-01"]), + END_TIME_COL: pd.to_datetime(["2020-01-02", "2020-02-05"])}) + canonical_anomaly_df = get_canonical_anomaly_df( + anomaly_df=anomaly_df, + freq="D") + assert_equal(canonical_anomaly_df, anomaly_df) + # Partially overlapping anomaly periods + anomaly_df = pd.DataFrame({ + "begin": ["2020-01-02-05", "2020-01-02-10", "2020-01-02-03", "2020-01-02-05"], + "end": ["2020-01-02-15", "2020-01-02-20", "2020-01-02-17", "2020-01-02-13"]}) + canonical_anomaly_df = get_canonical_anomaly_df( + anomaly_df=anomaly_df, + freq="H", + start_time_col="begin", + end_time_col="end") + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-02-03"]), + END_TIME_COL: pd.to_datetime(["2020-01-02-20"])}) + assert_equal(canonical_anomaly_df, expected_anomaly_df) + # One anomaly period covers others + 
anomaly_df = pd.DataFrame({ + "begin": ["2020-02-05", "2020-02-19", "2020-02-03", "2020-02-05"], + "end": ["2020-02-15", "2020-02-20", "2020-02-17", "2020-02-13"]}) + canonical_anomaly_df = get_canonical_anomaly_df( + anomaly_df=anomaly_df, + freq="D", + start_time_col="begin", + end_time_col="end") + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-02-03", "2020-02-19"]), + END_TIME_COL: pd.to_datetime(["2020-02-17", "2020-02-20"])}) + assert_equal(canonical_anomaly_df, expected_anomaly_df) + # Checks anomaly merging logic with daily frequency: all three periods are merged into one. + anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-04"]), + END_TIME_COL: pd.to_datetime(["2020-01-02", "2020-01-03", "2020-01-05"])}) + canonical_anomaly_df = get_canonical_anomaly_df( + anomaly_df=anomaly_df, + freq="D") + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-01"]), + END_TIME_COL: pd.to_datetime(["2020-01-05"])}) + assert_equal(canonical_anomaly_df, expected_anomaly_df) + # Checks anomaly merging logic with hourly frequency: only the first two periods are merged. + anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-01", "2020-01-02", "2020-01-04"]), + END_TIME_COL: pd.to_datetime(["2020-01-02", "2020-01-03", "2020-01-05"])}) + canonical_anomaly_df = get_canonical_anomaly_df( + anomaly_df=anomaly_df, + freq="H") + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-01", "2020-01-04"]), + END_TIME_COL: pd.to_datetime(["2020-01-03", "2020-01-05"])}) + assert_equal(canonical_anomaly_df, expected_anomaly_df) + + +def test_optimize_df_with_constraints(): + """Tests `optimize_df_with_constraints`.""" + df = pd.DataFrame({ + "a": [20, 20, 5, 10, 10, 0, 5], + "b": [1, 1, 2, 2, 3, 3, 4], + "c": [1, 2, 3, 4, 5, 6, 7]}) + # Constraint is satisfied, unique optimal + with LogCapture(LOGGER_NAME) as log_capture: + optimal_dict = optimize_df_with_constraints( + df=df, + objective_col="a", + constraint_col="b", + constraint_value=4) + 
expected_optimal_dict = {"a": 5, "b": 4, "c": 7} + assert_equal(optimal_dict, expected_optimal_dict) + log_capture.check( + (LOGGER_NAME, + "INFO", + f"Values satisfying the constraint are found.\n" + f"Solving the following optimization problem:\n" + f"Maximize a subject to b >= 4.")) + # Constraint is satisfied. Multiple rows with the maximum value + # in ``objective_col``, resolved by considering ``constraint_col`` + optimal_dict = optimize_df_with_constraints( + df=df, + objective_col="a", + constraint_col="b", + constraint_value=2) + expected_optimal_dict = {"a": 10, "b": 3, "c": 5} + assert_equal(optimal_dict, expected_optimal_dict) + # Constraint is satisfied. Multiple rows with the maximum value + # in both ``objective_col`` and ``constraint_col``, last entry is chosen + optimal_dict = optimize_df_with_constraints( + df=df, + objective_col="a", + constraint_col="b", + constraint_value=1) + expected_optimal_dict = {"a": 20, "b": 1, "c": 2} + assert_equal(optimal_dict, expected_optimal_dict) + + # Constraint is NOT satisfied, unique optimal + with LogCapture(LOGGER_NAME) as log_capture: + optimal_dict = optimize_df_with_constraints( + df=df, + objective_col="a", + constraint_col="b", + constraint_value=4.5) + expected_optimal_dict = {"a": 5, "b": 4, "c": 7} + assert_equal(optimal_dict, expected_optimal_dict) + log_capture.check( + (LOGGER_NAME, + "INFO", + f"No values satisfy the constraint.\n" + f"Maximizing ``constraint_col`` (b) so that it is as " + f"close as possible to the ``constraint_value`` (4.5).")) + # Constraint is NOT satisfied. Multiple rows with the maximum value + # in ``objective_col``, resolved by considering ``constraint_col`` + df = df[:6] + optimal_dict = optimize_df_with_constraints( + df=df, + objective_col="a", + constraint_col="b", + constraint_value=3.5) + expected_optimal_dict = {"a": 10, "b": 3, "c": 5} + assert_equal(optimal_dict, expected_optimal_dict) + # Constraint is NOT satisfied. 
Multiple rows with the maximum value + # in both ``objective_col`` and ``constraint_col``, last entry is chosen + df = df[:2] + optimal_dict = optimize_df_with_constraints( + df=df, + objective_col="a", + constraint_col="b", + constraint_value=1.5) + expected_optimal_dict = {"a": 20, "b": 1, "c": 2} + assert_equal(optimal_dict, expected_optimal_dict) + + +def test_validate_volatility_features(): + """Tests ``validate_volatility_features``.""" + # Default behaviour, no ``valid_features`` is passed: duplicated features and duplicated feature lists are removed. + volatility_features_list = [["dow"], ["dow"], ["dow", "hour"], ["dow", "hour", "dow"], ["is_holiday"]] + validated_features_list = validate_volatility_features( + volatility_features_list=volatility_features_list) + expected_volatility_features_list = [["dow"], ["dow", "hour"], ["is_holiday"]] + assert_equal(validated_features_list, expected_volatility_features_list) + + # Error when input feature is not in ``valid_features`` + with pytest.raises(ValueError, match="Unknown feature\\(s\\) \\({'is_holiday'}\\) in `volatility_features_list`."): + valid_features = TimeFeaturesEnum._member_names_ + validate_volatility_features( + volatility_features_list=volatility_features_list, + valid_features=valid_features) + + +def test_get_timestamp_ceil(): + """Tests `get_timestamp_ceil`.""" + assert get_timestamp_ceil("2023-01-31", "M") == pd.to_datetime("2023-01-31") + assert get_timestamp_ceil("2023-01-19", "M") == pd.to_datetime("2023-01-31") + assert get_timestamp_ceil("2023-01-18", "W-MON") == pd.to_datetime("2023-01-23") + assert get_timestamp_ceil("2023-01-23", "W-MON") == pd.to_datetime("2023-01-23") + assert get_timestamp_ceil("2023-01-23", "D") == pd.to_datetime("2023-01-23") + assert get_timestamp_ceil("2023-01-22 20:15", "D") == pd.to_datetime("2023-01-23") + assert get_timestamp_ceil("2023-01-23", "B") == pd.to_datetime("2023-01-23") + assert get_timestamp_ceil("2023-01-21 10:15", "B") == pd.to_datetime("2023-01-23") + assert get_timestamp_ceil("2023-01-23 10:00:00", "H") == 
pd.to_datetime("2023-01-23 10:00:00") + assert get_timestamp_ceil("2023-01-23 09:15:40", "H") == pd.to_datetime("2023-01-23 10:00:00") + assert get_timestamp_ceil("2023-01-23 10:15:00", "T") == pd.to_datetime("2023-01-23 10:15:00") + assert get_timestamp_ceil("2023-01-23 10:14:40", "T") == pd.to_datetime("2023-01-23 10:15:00") + + +def test_get_timestamp_floor(): + """Tests `get_timestamp_floor`.""" + assert get_timestamp_floor("2023-01-31", "M") == pd.to_datetime("2023-01-31") + assert get_timestamp_floor("2023-01-19", "M") == pd.to_datetime("2022-12-31") + assert get_timestamp_floor("2023-01-18", "W-MON") == pd.to_datetime("2023-01-16") + assert get_timestamp_floor("2023-01-23", "W-MON") == pd.to_datetime("2023-01-23") + assert get_timestamp_floor("2023-01-23", "D") == pd.to_datetime("2023-01-23") + assert get_timestamp_floor("2023-01-22 20:15", "D") == pd.to_datetime("2023-01-22") + assert get_timestamp_floor("2023-01-23", "B") == pd.to_datetime("2023-01-23") + assert get_timestamp_floor("2023-01-21 10:15", "B") == pd.to_datetime("2023-01-20") + assert get_timestamp_floor("2023-01-23 10:00:00", "H") == pd.to_datetime("2023-01-23 10:00:00") + assert get_timestamp_floor("2023-01-23 09:15:40", "H") == pd.to_datetime("2023-01-23 09:00:00") + assert get_timestamp_floor("2023-01-23 10:15:00", "T") == pd.to_datetime("2023-01-23 10:15:00") + assert get_timestamp_floor("2023-01-23 10:14:40", "T") == pd.to_datetime("2023-01-23 10:14:00") + + +def test_get_anomaly_df_from_outliers1(): + """Tests `get_anomaly_df_from_outliers` when no outlier exists.""" + value_col = "y" + freq = "H" + + # No outlier in Series - returns empty `pd.DataFrame` with columns `START_TIME_COL`, `END_TIME_COL`. 
+ df = pd.DataFrame({ + TIME_COL: pd.date_range(start="2020-01-01", end="2020-02-11 16:00:00", freq=freq), + value_col: [2]*1001}) + empty_anomaly_df = pd.DataFrame(columns=[START_TIME_COL, END_TIME_COL]) + anomaly_df = get_anomaly_df_from_outliers( + df=df, + time_col=TIME_COL, + value_col=value_col, + freq=freq) + assert_equal(anomaly_df, empty_anomaly_df) + + +def test_get_anomaly_df_from_outliers2(y_clean): + """Tests `get_anomaly_df_from_outliers` when some outliers exist.""" + value_col = "y" + freq = "H" + + # Tests when outliers exist. + # The outliers are to be identified and saved to `anomaly_df`. + y = [2, 1e10] + list(y_clean) + df = pd.DataFrame({ + TIME_COL: pd.date_range(start="2020-01-01", end="2020-02-11 15:00:00", freq=freq), + value_col: y}) + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-01 01:00:00"]), + END_TIME_COL: pd.to_datetime(["2020-01-01 01:00:00"])}) + anomaly_df = get_anomaly_df_from_outliers( + df=df, + time_col=TIME_COL, + value_col=value_col, + freq=freq) + assert_equal(anomaly_df, expected_anomaly_df) + + +def test_get_anomaly_df_from_outliers_raise_error(): + """Tests `get_anomaly_df_from_outliers` error catching.""" + value_col = "y" + freq = "H" + # Tests when outliers exist. + # The outliers are to be identified and saved to `anomaly_df`. + y = [2, 1e10] + [2] * 998 + df = pd.DataFrame({ + TIME_COL: pd.date_range(start="2020-01-01", end="2020-02-11 15:00:00", freq=freq), + value_col: y}) + # Captures Error when input `time_col` is not in `df`. + non_exist_col = "column not exist" + with pytest.raises(ValueError, match=f"`df` does not have `time_col` with name {non_exist_col}."): + get_anomaly_df_from_outliers( + df=df, + time_col=non_exist_col, + value_col=value_col, + freq=freq) + + # Captures Error when input `value_col` is not in `df`. 
+ with pytest.raises(ValueError, match=f"`df` does not have `value_col` with name {non_exist_col}."): + get_anomaly_df_from_outliers( + df=df, + time_col=TIME_COL, + value_col=non_exist_col, + freq=freq) + + +def test_get_anomaly_df_from_outliers_small_data_size(): + """Tests `get_anomaly_df_from_outliers` on a small data set.""" + value_col = "y" + freq = "D" + # Tests when outliers exist. + # The outliers are to be identified and saved to `anomaly_df`. + # This example contains less data + ts = pd.date_range(start="2020-01-01", end="2020-05-01", freq=freq) + y = list(range(len(ts))) + # Let's overwrite the third value to be an outlier + y[2] = 1000 + + df = pd.DataFrame({ + TIME_COL: ts, + value_col: y}) + + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-03"]), + END_TIME_COL: pd.to_datetime(["2020-01-03"])}) + + anomaly_df = get_anomaly_df_from_outliers( + df=df, + time_col=TIME_COL, + value_col=value_col, + freq=freq, + trim_percent=1.0) + assert_equal(anomaly_df, expected_anomaly_df) + + # Disables trimming by setting it to zero. + # In this case, we expect an empty dataframe (no anomaly is returned). + anomaly_df = get_anomaly_df_from_outliers( + df=df, + time_col=TIME_COL, + value_col=value_col, + freq=freq, + trim_percent=0.0) + expected_anomaly_df = pd.DataFrame(columns=[START_TIME_COL, END_TIME_COL]) + assert_equal(anomaly_df, expected_anomaly_df) + + +def test_get_anomaly_df_from_outliers_with_missing1(): + """Tests `get_anomaly_df_from_outliers` with outliers and missing values.""" + value_col = "y" + freq = "H" + # Tests when outliers exist and there is a missing value. + # The outliers are to be identified and saved to `anomaly_df`. 
+ y = [2, 1e10] + list(range(998)) + df = pd.DataFrame({ + TIME_COL: pd.date_range( + start="2020-01-01", + end="2020-02-11 15:00:00", + freq=freq), + value_col: y}) + + df[value_col][2] = None + + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-01 01:00:00"]), + END_TIME_COL: pd.to_datetime(["2020-01-01 01:00:00"])}) + anomaly_df = get_anomaly_df_from_outliers( + df=df, + time_col=TIME_COL, + value_col=value_col, + freq=freq, + trim_percent=0.0) + assert_equal(anomaly_df, expected_anomaly_df) + + +def test_get_anomaly_df_from_outliers_with_missing2(y_clean): + """Tests `get_anomaly_df_from_outliers` with outliers and missing values.""" + value_col = "y" + freq = "H" + + # Tests when outliers exist and there is a missing value. + # The outliers are to be identified and saved to `anomaly_df`. + y = [0.0, 1e10] + list(y_clean) + ts = pd.date_range( + start="2020-01-01", + end="2020-02-11 15:00:00", + freq=freq) + assert len(ts) == len(y) + df = pd.DataFrame({ + TIME_COL: ts, + value_col: y}) + + # Setting the third element to be None. + df[value_col][2] = None + + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-01 01:00:00"]), + END_TIME_COL: pd.to_datetime(["2020-01-01 01:00:00"])}) + + anomaly_df = get_anomaly_df_from_outliers( + df=df, + time_col=TIME_COL, + value_col=value_col, + freq=freq, + trim_percent=1.0) + + assert_equal(anomaly_df, expected_anomaly_df) + + +def test_get_anomaly_df_from_outliers_with_missing_raise_error(): + """Tests `get_anomaly_df_from_outliers` with outliers and missing values.""" + value_col = "y" + freq = "H" + # Tests when outliers exist and there is a missing value. + # The outliers are to be identified and saved to `anomaly_df`. 
+ df = pd.DataFrame({ + TIME_COL: pd.date_range( + start="2020-01-01", + end="2020-02-11 15:00:00", + freq=freq), + value_col: [2, 1e10] + list(range(998))}) + + df[value_col][2] = None + # Captures Error when y does not have at least two values after removing NAs. + df0 = df[:3] + df0[value_col] = [None, 3, None] + with pytest.raises( + ValueError, + match=f"Length of y after removing NAs is less than 2."): + get_anomaly_df_from_outliers( + df=df0, + time_col=TIME_COL, + value_col=value_col, + freq=freq) + + # Captures warning when y has at least two non-NA values but fewer than two values remain after trimming. + df0 = df[:3] + # In this example, only one value remains after trimming. + df0[value_col] = [-5*10000, 1, 5*10000] + with pytest.warns( + UserWarning, + match=f"After trimming there were less than two values:"): + get_anomaly_df_from_outliers( + df=df0, + time_col=TIME_COL, + value_col=value_col, + freq=freq, + trim_percent=5.0) diff --git a/greykite/tests/detection/detector/test_ape_based.py b/greykite/tests/detection/detector/test_ape_based.py new file mode 100644 index 0000000..9c900fe --- /dev/null +++ b/greykite/tests/detection/detector/test_ape_based.py @@ -0,0 +1,113 @@ +import pandas as pd +import plotly.express as px + +from greykite.common.viz.timeseries_annotate import plot_lines_markers +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.common.testing_utils import sim_anomalous_data_and_forecasts +from greykite.detection.detector.ad_utils import vertical_concat_dfs +from greykite.detection.detector.ape_based import APE_PARAM_ITERABLE +from greykite.detection.detector.ape_based import APEDetector +from greykite.detection.detector.data import ForecastDetectorData as Data +from greykite.detection.detector.reward import Reward + + +def test_ape_detector(): + """Tests `APEDetector`.""" + data = sim_anomalous_data_and_forecasts( + sample_size=200, + anomaly_num=20, + seed=1317) + + # train data + df_train = data["df_train"] + 
forecast_dfs_train = data["forecast_dfs_train"] + + # test data + df_test = data["df_test"] + forecast_dfs_test = data["forecast_dfs_test"] + + def reward_func(data): + f1 = f1_score( + y_true=data.y_true, + y_pred=data.y_pred) + return f1[True] + + detector = APEDetector( + reward=Reward(reward_func), + value_cols=["y"], + pred_cols=["y_pred"], + is_anomaly_col="is_anomaly", + join_cols=["ts"]) + + assert detector.param_iterable == APE_PARAM_ITERABLE + # Checks if the attributes are inherited from the `Detector` class + assert detector.data is None + assert detector.fitted_df is None + assert detector.fit_info == {"param_full": None} + + joined_dfs = detector.join_with_forecasts( + df=df_train, + forecast_dfs=forecast_dfs_train) + joined_dfs = detector.add_features(joined_dfs) + + assert len(joined_dfs) == 2 + assert list(joined_dfs.keys()) == [0, 1] + for joined_df in joined_dfs.values(): + assert "y_pred" in joined_df.columns + assert detector.data is None + + # Concats the joined data to plot + joined_df_all = vertical_concat_dfs( + df_list=list(joined_dfs.values()), + join_cols=detector.join_cols, + common_value_cols=["y", "is_anomaly"], + different_value_cols=["ape", "y_pred"]) + + fig = plot_lines_markers( + df=joined_df_all, + x_col="ts", + line_cols=["ape0", "ape1"]) + fig.layout.update(title="Comparing two forecasts using APE") + assert fig is not None + fig.update_yaxes() + # fig.show() + + # Calculates one reward value + calc_result = detector.calc_with_param( + data=Data( + joined_dfs=joined_dfs, + y_true=df_train["is_anomaly"]), + param={"forecast_id": 0, "ape_thresh": 0.5}) + + obj_value = detector.reward.apply(calc_result.data) + + assert round(obj_value, 3) == 0.25 + + # Fits + detector.fit(Data( + df=df_train, + forecast_dfs=forecast_dfs_train)) + + param_obj_list = detector.fit_info["param_obj_list"] + param_eval_df = pd.DataFrame.from_records(param_obj_list) + assert list(param_eval_df.columns) == ["ape_thresh", "forecast_id", "obj_value"] + + 
param_eval_df["forecast_id"] = param_eval_df["forecast_id"].map(str) + fig = px.line( + param_eval_df, + x="ape_thresh", + y="obj_value", + color="forecast_id", + title="'APEDetector' result of parameter search for APE threshold") + assert fig is not None + # fig.show() + + # Prediction step on test set + data = detector.predict(Data( + df=df_test, + forecast_dfs=forecast_dfs_test, + y_true=df_test[detector.is_anomaly_col])) + + test_obj_value = detector.reward.apply(data) + + assert round(test_obj_value, 3) == 0.421 diff --git a/greykite/tests/detection/detector/test_best_forecast.py b/greykite/tests/detection/detector/test_best_forecast.py new file mode 100644 index 0000000..d931c3a --- /dev/null +++ b/greykite/tests/detection/detector/test_best_forecast.py @@ -0,0 +1,121 @@ +import numpy as np + +from greykite.common.constants import PREDICTED_ANOMALY_COL +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.common.testing_utils import sim_anomalous_data_and_forecasts +from greykite.detection.detector.best_forecast import BestForecastDetector +from greykite.detection.detector.data import ForecastDetectorData as Data +from greykite.detection.detector.optimizer import CalcResult +from greykite.detection.detector.reward import Reward + + +def test_best_forecast_detector(): + data = sim_anomalous_data_and_forecasts( + sample_size=200, + anomaly_num=20, + seed=1317) + + # train data + df_train = data["df_train"] + forecast_dfs_train = data["forecast_dfs_train"] + + # test data + df_test = data["df_test"] + forecast_dfs_test = data["forecast_dfs_test"] + + def reward_func(data): + f1 = f1_score( + y_true=data.y_true, + y_pred=data.y_pred) + return f1[True] + + APE_PARAM_ITERABLE = [{"ape_thresh": x} for x in np.arange(0, 4, 0.05)] + + detector = BestForecastDetector( + value_cols=["y"], + pred_cols=["y_pred"], + is_anomaly_col="is_anomaly", + join_cols=["ts"], + reward=Reward(reward_func), + param_iterable=APE_PARAM_ITERABLE) + + 
assert detector.param_iterable == APE_PARAM_ITERABLE + + # Checks if the attributes are inherited from the `Detector` class + assert detector.data is None + assert detector.fitted_df is None + assert detector.fit_info == {"param_full": None} + + joined_dfs = detector.join_with_forecasts( + df=df_train, + forecast_dfs=forecast_dfs_train) + + assert len(joined_dfs) == 2 + assert list(joined_dfs.keys()) == [0, 1] + for joined_df in joined_dfs.values(): + assert "y_pred" in joined_df.columns + assert detector.data is None + + # In order to apply `fit`, we need to implement + # `add_features_one_df` + # `calc_with_param` + def add_features_one_df(joined_df): + joined_df["ape"] = ( + abs(joined_df["y"] - joined_df["y_pred"]) / + abs(joined_df["y"])) + return joined_df + + def calc_with_param(param, data): + pred_df = data.joined_dfs[param["forecast_id"]] + y_pred = (pred_df["ape"] > param["ape_thresh"]) + pred_df[PREDICTED_ANOMALY_COL] = y_pred + data.pred_df = pred_df + data.y_pred = y_pred + return CalcResult(data=data) + + detector.add_features_one_df = add_features_one_df + detector.calc_with_param = calc_with_param + + detector.fit(Data( + df=df_train, + forecast_dfs=forecast_dfs_train)) + + # Checking the fitted parameters + param_full = detector.fit_info["param_full"] + assert round(param_full["ape_thresh"], 2) == 0.15 + assert param_full["forecast_id"] == 0 + + assert detector.data.joined_dfs is not None + + # Checks to see if the attached `joined_dfs` to `data` + # is the same as previously calculated `joined_dfs` in the above + assert len(detector.data.joined_dfs) == 2 + assert list(detector.data.joined_dfs.keys()) == [0, 1] + + common_cols = ["ts", "y", "is_anomaly", "y_pred"] + for i in [0, 1]: + joined_df_direct = joined_dfs[i] + joined_df_from_detector = detector.data.joined_dfs[i] + assert joined_df_direct[common_cols].equals( + joined_df_from_detector[common_cols]) + + # Since forecast 0 is best forecast we expect the corresponding + # joined data to have 
the predictions within + after_fit_cols = common_cols + ["ape", "is_anomaly_predicted"] + joined_df_from_detector = detector.data.joined_dfs[0] + assert list(joined_df_from_detector.columns) == after_fit_cols + + # The other joined_df will not have the predictions + after_fit_cols = common_cols + ["ape"] + joined_df_from_detector = detector.data.joined_dfs[1] + assert list(joined_df_from_detector.columns) == after_fit_cols + + # Prediction step on test set + data = detector.predict(Data( + df=df_test, + forecast_dfs=forecast_dfs_test, + y_true=df_test[detector.is_anomaly_col])) + + test_obj_value = detector.reward.apply(data) + + assert round(test_obj_value, 3) == 0.421 diff --git a/greykite/tests/detection/detector/test_config.py b/greykite/tests/detection/detector/test_config.py new file mode 100644 index 0000000..b5f2b9b --- /dev/null +++ b/greykite/tests/detection/detector/test_config.py @@ -0,0 +1,103 @@ +# BSD 2-CLAUSE LICENSE + +# Redistribution and use in source and binary forms, with or without modification, +# are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +# #ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# original author: Sayan Patra + + +import json + +import pytest + +from greykite.detection.detector.config import ADConfig +from greykite.detection.detector.config import assert_equal_ad_config +from greykite.detection.detector.config_utils import AD_CONFIG_JSON_COMPLETE +from greykite.detection.detector.config_utils import AD_CONFIG_JSON_DEFAULT +from greykite.detection.detector.config_utils import AD_CONFIG_JSON_PARTIAL + + +def test_ad_config_init(): + """Tests init of the `ADConfig`.""" + ad_config = ADConfig() + assert ad_config.volatility_features_list is None + assert ad_config.coverage_grid is None + assert ad_config.min_admissible_value is None + assert ad_config.max_admissible_value is None + assert ad_config.objective is None + assert ad_config.target_precision is None + assert ad_config.target_recall is None + assert ad_config.soft_window_size is None + assert ad_config.target_anomaly_percent is None + assert ad_config.variance_scaling is None + + +def test_ad_config_none(): + """Tests `ADConfig` initialization when input is None.""" + ad_config_none = ADConfig(None) + ad_config = ADConfig() + assert_equal_ad_config(ad_config_none, ad_config) + + +def test_ad_config_from_dict(): + """Tests `from_dict` method of `ADConfig`.""" + for param in [AD_CONFIG_JSON_DEFAULT, AD_CONFIG_JSON_PARTIAL, AD_CONFIG_JSON_COMPLETE]: + ad_config = param.get("ad_config") + ad_json = param.get("ad_json") + ad_dict = json.loads(ad_json) + 
translated_ad_config = ADConfig.from_dict(ad_dict) + assert_equal_ad_config(ad_config, translated_ad_config) + + # Raises error when input is not a dictionary + with pytest.raises(ValueError, match="not a dictionary"): + ADConfig.from_dict(5) + + +def test_ad_config_from_json(): + """Tests `from_json` method of `ADConfig`.""" + for param in [AD_CONFIG_JSON_DEFAULT, AD_CONFIG_JSON_PARTIAL, AD_CONFIG_JSON_COMPLETE]: + ad_config = param.get("ad_config") + ad_json = param.get("ad_json") + translated_ad_config = ADConfig.from_json(ad_json) + assert_equal_ad_config(ad_config, translated_ad_config) + + # Raises error when input is not a string + with pytest.raises(ValueError, match="is not a json string."): + ADConfig.from_json(5) + + # Raises error when input is not a json string + with pytest.raises(ValueError, match="not a json string"): + json_str = "This is not a json str" + ADConfig.from_json(json_str) + + +def test_assert_equal_ad_config(): + """Tests `assert_equal_ad_config`.""" + ad_config_default = AD_CONFIG_JSON_DEFAULT.get("ad_config") + assert_equal_ad_config(ad_config_default, ad_config_default) + + ad_config_complete = AD_CONFIG_JSON_COMPLETE.get("ad_config") + assert_equal_ad_config(ad_config_complete, ad_config_complete) + + # Raises error when the `ADConfig`s do not match + with pytest.raises(AssertionError, match="Error at dictionary location"): + assert_equal_ad_config(ad_config_default, ad_config_complete) + + # Raises error when one of the input is not an `ADConfig` + with pytest.raises(ValueError, match="is not a member of 'ADConfig' class."): + assert_equal_ad_config(ad_config_default, 4) diff --git a/greykite/tests/detection/detector/test_config_to_reward.py b/greykite/tests/detection/detector/test_config_to_reward.py new file mode 100644 index 0000000..dd1f6fa --- /dev/null +++ b/greykite/tests/detection/detector/test_config_to_reward.py @@ -0,0 +1,109 @@ +from greykite.detection.common.ad_evaluation import soft_f1_score +from 
greykite.detection.common.ad_evaluation import soft_precision_score +from greykite.detection.common.ad_evaluation import soft_recall_score +from greykite.detection.detector.ad_utils import partial_return +from greykite.detection.detector.config import F1 +from greykite.detection.detector.config import PRECISION +from greykite.detection.detector.config import RECALL +from greykite.detection.detector.config import ADConfig +from greykite.detection.detector.config_to_reward import config_to_reward +from greykite.detection.detector.data import DetectorData + + +# Soft F1 score for the True label: +calc_soft_f1 = partial_return(soft_f1_score, True) +# Soft Precision score, for the True label: +calc_soft_precision = partial_return(soft_precision_score, True) +# Soft Recall score for the True label: +calc_soft_recall = partial_return(soft_recall_score, True) + + +def test_config_to_reward(): + """Tests `Reward` class.""" + # Defines Test Data. + y_true = [True, True, False, True, True, False, False, False, True, True] + y_pred = [False, True, False, True, True, False, True, False, True, False] + data = DetectorData(y_true=y_true, y_pred=y_pred) + + # This calculates the metrics in simple way. + # We will use these values during testing. + raw_f1_value = calc_soft_f1(y_true=y_true, y_pred=y_pred, window=0) + raw_recall_value = calc_soft_recall(y_true=y_true, y_pred=y_pred, window=0) + raw_precision_value = calc_soft_precision(y_true=y_true, y_pred=y_pred, window=0) + + # Soft F1 with window of 2. + soft_f1_value = calc_soft_f1(y_true=y_true, y_pred=y_pred, window=2) + + # Tests anomaly percent config. + # Case 1: + ad_config = ADConfig(target_anomaly_percent=50) + reward = config_to_reward(ad_config) + + # Since the actual percent in `y_pred` is 50%, we expect a zero reward (best case). 
+ assert reward.apply(data) == 0 + + # Case 2: + ad_config = ADConfig(target_anomaly_percent=20) + reward = config_to_reward(ad_config) + + # Due to mismatch between 20% and 30% anomaly percent (diff = -0.3) + # and the penaly being -1, we expect -1.3 + assert reward.apply(data) == -1.3 + + # Tests F1 as objective. + ad_config = ADConfig(objective=F1) + assert ad_config.objective == F1 + reward = config_to_reward(ad_config) + assert reward.apply(data) == raw_f1_value + + # Tests recall as objective. + ad_config = ADConfig(objective=RECALL) + assert ad_config.objective == RECALL + reward = config_to_reward(ad_config) + assert reward.apply(data) == raw_recall_value + + # Tests precision as objective. + ad_config = ADConfig(objective=PRECISION) + assert ad_config.objective == PRECISION + reward = config_to_reward(ad_config) + assert reward.apply(data) == raw_precision_value + + # Tests F1 with window of 2 as objective. + ad_config = ADConfig( + objective=F1, + soft_window_size=2) + assert ad_config.objective == F1 + reward = config_to_reward(ad_config) + assert reward.apply(data) == soft_f1_value + + # Tests recall target. + # Case 1: We set the recall as the actual recall. + # In this case, we should not get any penalty. + ad_config = ADConfig(target_recall=raw_recall_value) + assert ad_config.target_recall == raw_recall_value + reward = config_to_reward(ad_config) + assert reward.apply(data) == raw_recall_value + + # Case 2: We set the recall as the actual recall plus a very small value (0.01). + # In this case we should get penalized by -1. + ad_config = ADConfig(target_recall=raw_recall_value + 0.01) + reward = config_to_reward(ad_config) + assert reward.apply(data) == raw_recall_value - 1.0 + + # Test objective being RECALL and having a target precision. + # Case 1: We let the precision to be the actual precision. + # We expect no penalty in this case. 
+ ad_config = ADConfig( + objective=RECALL, + target_precision=raw_precision_value) + reward = config_to_reward(ad_config) + assert reward.apply(data) == raw_recall_value + raw_precision_value + + # Case 2: We let the precision to be the actual precision plus a small value (0.01). + # We expect to be penalized by -1 this time. + ad_config = ADConfig( + objective=RECALL, + target_precision=raw_precision_value + 0.01) + assert ad_config.target_precision == raw_precision_value + 0.01 + reward = config_to_reward(ad_config) + assert round(reward.apply(data), 2) == round(raw_recall_value + raw_precision_value - 1, 2) diff --git a/greykite/tests/detection/detector/test_data.py b/greykite/tests/detection/detector/test_data.py new file mode 100644 index 0000000..7a76df2 --- /dev/null +++ b/greykite/tests/detection/detector/test_data.py @@ -0,0 +1,19 @@ +from greykite.detection.detector.data import DetectorData +from greykite.detection.detector.data import ForecastDetectorData + + +def test_detector_data(): + """Tests ``DetectorData`` data class.""" + data = DetectorData(df=None) + + assert data.df is None + assert data.anomaly_df is None + + +def test_forecast_detector_data(): + """Tests ``ForecastDetectorData`` data class.""" + data = ForecastDetectorData(df=None) + + assert data.df is None + assert data.forecast_dfs is None + assert data.anomaly_df is None diff --git a/greykite/tests/detection/detector/test_detector.py b/greykite/tests/detection/detector/test_detector.py new file mode 100644 index 0000000..2dc4b83 --- /dev/null +++ b/greykite/tests/detection/detector/test_detector.py @@ -0,0 +1,345 @@ +import numpy as np +import pandas as pd +import pytest +from scipy import stats + +from greykite.common.constants import PREDICTED_ANOMALY_COL +from greykite.common.viz.timeseries_annotate import plot_lines_markers +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.detector.data import DetectorData as Data +from 
greykite.detection.detector.detector import Detector +from greykite.detection.detector.detector import build_anomaly_percent_reward +from greykite.detection.detector.optimizer import CalcResult +from greykite.detection.detector.reward import Reward + + +def test_build_anomaly_percent_reward(): + anomaly_percent_dict = {"range": (4, 6), "penalty": -1} + reward = build_anomaly_percent_reward(anomaly_percent_dict) + + assert reward.min_unpenalized == -0.01 + assert reward.max_unpenalized == float("inf") + assert reward.penalty == -1 + + x = reward.apply(Data(y_pred=[True]*5 + [False]*95)) + assert x == 0 + + x = reward.apply(Data(y_pred=[True]*6 + [False]*94)) + assert x == -0.01 + + x = reward.apply(Data(y_pred=[True]*4 + [False]*96)) + assert x == -0.01 + + +# This class is implemented to test the `Detector` class +class TukeyDetector(Detector): + """A detector based on Tukey's outliar definition which is used in Boxplots + (also invented by Tukey) as well to draw the whiskers. + Reference: Exploratory Data Analysis, 1977, John Tukey + Tukey defines outliars to be any points outside the interval range: + ``(q1 - iqr_coef * iqr, q3 + iqr_coef * iqr)`` + where `q1` is the first quartile, `q3` is the third quartile and + ``iqr = q3 - q1``. + ``iqr_coef`` is the coefficient used and it is typically equal to 1.5 + In this detector, we choose the parameter using data. 
+ """ + def __init__( + self, + value_col, + is_anomaly_col=None, + reward=None, + anomaly_percent_dict=None, + param_iterable=None): + + super().__init__( + reward=reward, + anomaly_percent_dict=anomaly_percent_dict, + param_iterable=param_iterable) + + self.is_anomaly_col = is_anomaly_col + self.value_col = value_col + if param_iterable is None: + self.param_iterable = [{"iqr_coef": x} for x in np.arange(0, 5, 0.1)] + + def fit( + self, + data): + df = data.df + assert self.value_col in df.columns + q1 = np.quantile(a=df[self.value_col], q=0.25) + q3 = np.quantile(a=df[self.value_col], q=0.75) + iqr = (q3 - q1) + + default_param = { + "q1": q1, + "q3": q3, + "iqr": iqr, + "iqr_coef": None, + "lower": None, + "upper": None} + + y_true = df[self.is_anomaly_col] + data = Data( + df=df, + y_true=y_true) + + param_iterable = self.param_iterable + + optim_res = self.optimize_param( + data=data, + param_iterable=param_iterable, + default_param=default_param) + + self.fit_info = { + "param": optim_res["best_param"], + "param_full": optim_res["best_param_full"], + "obj_value": optim_res["best_obj_value"], + "param_obj_list": optim_res["param_obj_list"]} + + self.fitted_df = self.predict( + Data(df=df.copy())) + + def calc_with_param( + self, + param, + data): + df = data.df + assert self.value_col in df.columns + param["upper"] = param["q3"] + (param["iqr"] * param["iqr_coef"]) + param["lower"] = param["q1"] - (param["iqr"] * param["iqr_coef"]) + + df[PREDICTED_ANOMALY_COL] = ( + (df[self.value_col] < param["lower"]) | + (df[self.value_col] > param["upper"])) + + data.y_pred = df[PREDICTED_ANOMALY_COL] + + return CalcResult(data=data) + + +# This class is implemented to test the `Detector` class +class NormalDetector(Detector): + """A detector based on normal distribution. 
+ A normal distribution if fitted to data and then any points outside + the range + ``(mu - sig * z, mu + sig * z)`` + is considered an outliar, where + + - mu : mean of the data + - sig : standard deviation of the data + - z : the coefficient used for defining the confidence interval width. + We assume ``z = stats.norm.ppf(p)`` for some ``p`` in ``(0.5, 1)`` range. + + This detector uses data to find the optimal ``p`` during fit. + The optimizer implementation is inherited from + `~greykite.detection.detector.detector.Detector` + + """ + def __init__( + self, + value_col, + is_anomaly_col=None, + reward=None, + anomaly_percent_dict=None, + param_iterable=None): + + super().__init__( + reward=reward, + anomaly_percent_dict=anomaly_percent_dict, + param_iterable=param_iterable) + + self.is_anomaly_col = is_anomaly_col + self.value_col = value_col + if param_iterable is None: + step = 0.005 + self.param_iterable = [ + {"prob_thresh": x} for x in np.arange(0.5 + step, 1 - step, step)] + + def fit( + self, + data): + df = data.df + assert self.value_col in df.columns + mu = np.mean(df[self.value_col]) + sig = np.std(df[self.value_col]) + + default_param = { + "mu": mu, + "sig": sig} + + y_true = None + data = Data( + df=df, + y_true=y_true) + + param_iterable = self.param_iterable + + optim_res = self.optimize_param( + data=data, + param_iterable=param_iterable, + default_param=default_param) + + self.fit_info = { + "param": optim_res["best_param"], + "param_full": optim_res["best_param_full"], + "obj_value": optim_res["best_obj_value"], + "param_obj_list": optim_res["param_obj_list"]} + + self.fitted_df = self.predict( + Data(df=df.copy())) + + def calc_with_param( + self, + param, + data): + df = data.df + assert self.value_col in df.columns + err = stats.norm.ppf(param["prob_thresh"]) * param["sig"] + param["upper"] = param["mu"] + err + param["lower"] = param["mu"] - err + param["err"] = err + + df[PREDICTED_ANOMALY_COL] = ( + (df[self.value_col] < param["lower"]) 
| + (df[self.value_col] > param["upper"])) + + data.y_pred = df[PREDICTED_ANOMALY_COL] + return CalcResult(data=data) + + +def test_detector(): + """Tests `Detector` class.""" + detector = Detector() + assert detector.reward is not None + assert detector.fit_info == {"param_full": None} + detector.fit = lambda x: 30 + assert detector.fit(1) == 30 + assert detector.fit_info == {"param_full": None} + assert detector.fitted_df is None + assert detector.predict(data=None) is None + + +def test_normal_detector(): + """Tests `NormalDetector` class. + This test is to demonstrate the usage of the `Detector` class.""" + size = 500 + np.random.seed(1317) + y = np.random.normal(loc=0.0, scale=1.0, size=size) + df = pd.DataFrame({"y": y}) + anomaly_percent_dict = {"range": (4, 6), "penalty": -1.0} + + detector = NormalDetector( + value_col="y", + anomaly_percent_dict=anomaly_percent_dict) + + reward = detector.reward + x = reward.apply(Data(y_pred=[True, True, False, False])) + assert x == -1.45 + + x = reward.apply(Data(y_pred=[True]*5 + [False]*95)) + assert x == 0 + + x = reward.apply(Data(y_pred=[True]*6 + [False]*94)) + assert x == -0.01 + + x = reward.apply(Data(y_pred=[True]*4 + [False]*96)) + assert x == -0.01 + + detector.fit(Data(df=df)) + assert detector.value_col == "y" + + param_full = detector.fit_info["param_full"] + assert round(param_full["err"], 2) == 1.89 + assert round(param_full["prob_thresh"], 3) == 0.965 + + param_obj_list = detector.fit_info["param_obj_list"] + param_obj_df = pd.DataFrame.from_records(param_obj_list) + fig = plot_lines_markers( + df=param_obj_df, + x_col="prob_thresh", + line_cols=["obj_value"]) + fig.layout.update(title="'NormalDetector' parameter search for prob_thresh") + assert fig is not None + # fig.show() + + +def test_iqr_detector(): + """Tests `TukeyDetector` class. 
+ This test is to demonstrate the usage of the `Detector` class.""" + + def reward_func(data): + f1 = f1_score( + y_true=data.y_true, + y_pred=data.y_pred) + return f1[True] + + reward = Reward(reward_func=reward_func) + + np.random.seed(seed=1317) + normal_size = 300 + anomaly_size = 10 + anomalies = np.random.normal(loc=0, scale=5, size=anomaly_size) + df = pd.DataFrame({ + "y": list(anomalies) + list(np.random.normal(size=normal_size)), + "is_anomaly": [True]*anomaly_size + [False]*normal_size}) + + new_df = pd.DataFrame({"y": [500, -10, -100, 0, 0.1, -0.2, 0.3, 200, 8]}) + + detector = TukeyDetector( + is_anomaly_col="is_anomaly", + value_col="y", + reward=reward) + + detector.fit(data=Data(df=df)) + + param_full = detector.fit_info["param_full"] + assert round(param_full["iqr"], 2) == 1.37 + assert round(param_full["iqr_coef"], 2) == 1.8 + assert round(param_full["q1"], 3) == -0.627 + assert round(param_full["q3"], 3) == 0.739 + assert round(param_full["lower"], 2) == -3.09 + assert round(param_full["upper"], 2) == 3.20 + + param_obj_list = detector.fit_info["param_obj_list"] + param_obj_df = pd.DataFrame.from_records(param_obj_list) + fig = plot_lines_markers( + df=param_obj_df, + x_col="iqr_coef", + line_cols=["obj_value"]) + fig.layout.update(title="'TukeyDetector' parameter search for iqr_coef") + assert fig is not None + # fig.show() + + assert detector.fit_info is not None + assert detector.fitted_df is not None + pred_data = detector.predict(data=Data(df=new_df)) + y_pred = pred_data.y_pred + + assert np.allclose( + y_pred, + np.array([True]*3 + [False]*4 + [True]*2)) + + +def test_summary(): + """Tests `Detector` class summary method.""" + # Tests summary with NormalDetector + size = 500 + np.random.seed(1317) + y = np.random.normal(loc=0.0, scale=1.0, size=size) + df = pd.DataFrame({"y": y}) + anomaly_percent_dict = {"range": (4, 6), "penalty": -1.0} + + detector = NormalDetector( + value_col="y", + anomaly_percent_dict=anomaly_percent_dict) + 
detector.fit(Data(df=df)) + + summary = detector.summary() + assert "NormalDetector" in summary + assert "Anomaly Duration" not in summary + assert "Optimal Parameters" in summary + + # Tests error when `Detector` is not fitted. + detector = Detector() + with pytest.raises(ValueError, match="No data to summarize."): + detector.summary() diff --git a/greykite/tests/detection/detector/test_forecast_based.py b/greykite/tests/detection/detector/test_forecast_based.py new file mode 100644 index 0000000..d6eba97 --- /dev/null +++ b/greykite/tests/detection/detector/test_forecast_based.py @@ -0,0 +1,65 @@ +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.common.testing_utils import sim_anomalous_data_and_forecasts +from greykite.detection.detector.data import ForecastDetectorData as Data +from greykite.detection.detector.forecast_based import ForecastBasedDetector +from greykite.detection.detector.reward import Reward + + +def test_forecast_based_detector(): + data = sim_anomalous_data_and_forecasts( + sample_size=200, + anomaly_num=20, + seed=1317) + + df = data["df"] + forecast_dfs = data["forecast_dfs"] + + def reward_func(y_true, y_pred): + f1 = f1_score( + y_true=y_true, + y_pred=y_pred) + return f1[True] + + detector = ForecastBasedDetector( + reward=Reward(reward_func), + value_cols=["y"], + pred_cols=["y_pred"], + is_anomaly_col="is_anomaly", + join_cols=["ts"]) + + # Checks if the attributes are inherited from the `Detector` class. 
+ assert detector.data is None + assert detector.fitted_df is None + assert detector.fit_info == {"param_full": None} + + joined_dfs = detector.join_with_forecasts( + df=df, + forecast_dfs=forecast_dfs) + + assert len(joined_dfs) == 2 + assert list(joined_dfs.keys()) == [0, 1] + for joined_df in joined_dfs.values(): + assert "y_pred" in joined_df.columns + + assert detector.data is None + + detector.fit() + # Since `fit` is not implemented and inherited from base class: `Detector` + # it does not do anything + assert detector.data is None + + data = Data(df=df, forecast_dfs=forecast_dfs) + assert data.joined_dfs is None + detector.prep_df_for_predict(data) + + assert data.joined_dfs is not None + + # Checks to see if the attached `joined_dfs` to `data` + # is the same as previously calculated `joined_dfs` in the above + assert len(data.joined_dfs) == 2 + assert list(data.joined_dfs.keys()) == [0, 1] + + for i in [0, 1]: + joined_df_direct = joined_dfs[i] + joined_df_from_detector = data.joined_dfs[i] + assert joined_df_direct.equals(joined_df_from_detector) diff --git a/greykite/tests/detection/detector/test_greykite.py b/greykite/tests/detection/detector/test_greykite.py new file mode 100644 index 0000000..06e9717 --- /dev/null +++ b/greykite/tests/detection/detector/test_greykite.py @@ -0,0 +1,1221 @@ +import datetime + +import numpy as np +import pandas as pd +import plotly.express as px +import pytest + +from greykite.common.constants import ACTUAL_COL +from greykite.common.constants import ANOMALY_COL +from greykite.common.constants import END_TIME_COL +from greykite.common.constants import PREDICTED_ANOMALY_COL +from greykite.common.constants import PREDICTED_COL +from greykite.common.constants import PREDICTED_LOWER_COL +from greykite.common.constants import PREDICTED_UPPER_COL +from greykite.common.constants import START_TIME_COL +from greykite.common.constants import TIME_COL +from greykite.common.constants import VALUE_COL +from 
greykite.common.testing_utils import generate_df_for_tests +from greykite.common.testing_utils_anomalies import contaminate_df_with_anomalies +from greykite.common.viz.timeseries_annotate import plot_anomalies_over_forecast_vs_actual +from greykite.common.viz.timeseries_annotate import plot_lines_markers +from greykite.common.viz.timeseries_annotate import plt_compare_series_annotations +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.common.ad_evaluation import precision_score +from greykite.detection.common.ad_evaluation import recall_score +from greykite.detection.detector.ad_utils import partial_return +from greykite.detection.detector.config import F1 +from greykite.detection.detector.config import ADConfig +from greykite.detection.detector.data import DetectorData +from greykite.detection.detector.greykite import DETECTOR_PREDICT_COLS +from greykite.detection.detector.greykite import GreykiteDetector +from greykite.detection.detector.reward import Reward +from greykite.framework.templates.autogen.forecast_config import EvaluationPeriodParam +from greykite.framework.templates.autogen.forecast_config import ForecastConfig +from greykite.framework.templates.autogen.forecast_config import MetadataParam +from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam + + +# Evaluation metrics used in the tests. +# F1 score for the True label: +calc_f1 = partial_return(f1_score, True) +# Precision score, for the True label: +calc_precision = partial_return(precision_score, True) +# Recall score for the True label: +calc_recall = partial_return(recall_score, True) + +# Boolean to decide if figures are to be shown or not when this test file is run. +# Turn this on when changes are made and include in code reviews. +# Compare before and after the change to confirm everything is as expected. 
+FIG_SHOW = False + + +@pytest.fixture(scope="module") +def hourly_data(): + """Generates data for testing `GreykiteDetector`.""" + + df = generate_df_for_tests( + freq="H", + train_start_date=datetime.datetime(2020, 1, 1), + intercept=50, + train_frac=0.99, + periods=24*28)["df"] + + anomaly_block_list = [ + np.arange(100, 105), + np.arange(200, 210), + np.arange(310, 315), + np.arange(400, 410), + np.arange(460, 480), + np.arange(601, 610), + np.arange(620, 625), + np.arange(650, 654), + np.arange(666, 667)] + + # Contaminates `df` with anomalies at the specified locations, + # via `anomaly_block_list`. + # If original value is y, the anomalous value is: (1 +/- delta)*y. + df = contaminate_df_with_anomalies( + df=df, + anomaly_block_list=anomaly_block_list, + delta_range_lower=0.1, + delta_range_upper=0.2, + value_col=VALUE_COL, + min_admissible_value=None, + max_admissible_value=None) + + fig = plot_lines_markers( + df=df, + x_col=TIME_COL, + line_cols=["y", "contaminated_y"]) + + fig.layout.update(title="Generation of hourly anomalous data") + fig.update_yaxes() + assert fig is not None + if FIG_SHOW: + fig.show() + + df = df.drop(columns=[VALUE_COL]).rename( + columns={"contaminated_y": VALUE_COL}) + + df[ANOMALY_COL] = (df[ANOMALY_COL] == 1) + + assert len(df) == (28 * 24) + assert sum(df[ANOMALY_COL]) == 69 + + train_size = int(26 * 24) + df_train = df[:train_size].reset_index(drop=True) + df_test = df[train_size:].reset_index(drop=True) + + assert len(df_train) == 26 * 24 + assert len(df_test) == 2 * 24 + + return { + "df_train": df_train, + "df_test": df_test, + "df": df} + + +@pytest.fixture(scope="module") +def daily_data(): + """Generates data for testing `GreykiteDetector`.""" + + df = generate_df_for_tests( + freq="D", + train_start_date=datetime.datetime(2020, 1, 1), + intercept=50, + train_frac=0.99, + periods=200)["df"] + + anomaly_block_list = [ + np.arange(10, 15), + np.arange(33, 35), + np.arange(60, 65), + np.arange(82, 85), + np.arange(94, 
98), + np.arange(100, 105), + np.arange(111, 113), + np.arange(125, 130), + np.arange(160, 163), + np.arange(185, 190), + np.arange(198, 200)] + + # Contaminates `df` with anomalies at the specified locations, + # via `anomaly_block_list`. + # If original value is y, the anomalous value is: (1 +/- delta)*y. + df = contaminate_df_with_anomalies( + df=df, + anomaly_block_list=anomaly_block_list, + delta_range_lower=0.25, + delta_range_upper=0.5, + value_col=VALUE_COL, + min_admissible_value=None, + max_admissible_value=None) + + fig = plot_lines_markers( + df=df, + x_col=TIME_COL, + line_cols=["y", "contaminated_y"]) + + fig.layout.update(title="Generation of daily anomalous data") + fig.update_yaxes() + assert fig is not None + if FIG_SHOW: + fig.show() + + df = df.drop(columns=[VALUE_COL]).rename( + columns={"contaminated_y": VALUE_COL}) + + df[ANOMALY_COL] = (df[ANOMALY_COL] == 1) + + assert len(df) == 200 + assert sum(df[ANOMALY_COL]) == 41 + + train_size = int(100) + df_train = df[:train_size].reset_index(drop=True) + df_test = df[train_size:].reset_index(drop=True) + + assert len(df_train) == 100 + assert len(df_test) == 100 + + return { + "df_train": df_train, + "df_test": df_test, + "df": df} + + +@pytest.fixture(scope="module") +def forecast_config_info_hourly(): + """Generates ``forecast_config`` for testing.""" + metadata = MetadataParam( + time_col=TIME_COL, + value_col=VALUE_COL, + train_end_date=None, + anomaly_info=None) + + evaluation_period = EvaluationPeriodParam( + test_horizon=0, + cv_max_splits=0) + + model_components = ModelComponentsParam( + autoregression={ + "autoreg_dict": { + "lag_dict": {"orders": [24]}, + "agg_lag_dict": None}}, + events={ + "auto_holiday": False, + "holiday_lookup_countries": ["US"], + "holiday_pre_num_days": 2, + "holiday_post_num_days": 2, + "daily_event_df_dict": None}, + custom={ + "extra_pred_cols": ["dow_hr"], + "min_admissible_value": 0, + "normalize_method": "zero_to_one"}) + + return ForecastConfig( + 
model_template="SILVERKITE_EMPTY", + metadata_param=metadata, + coverage=None, + evaluation_period_param=evaluation_period, + forecast_horizon=1, + model_components_param=model_components) + + +@pytest.fixture(scope="module") +def forecast_config_info_daily(): + """Generates ``forecast_config`` for testing.""" + metadata = MetadataParam( + time_col=TIME_COL, + value_col=VALUE_COL, + train_end_date=None, + anomaly_info=None) + + evaluation_period = EvaluationPeriodParam( + test_horizon=0, + cv_max_splits=0) + + model_components = ModelComponentsParam( + autoregression={ + "autoreg_dict": { + "lag_dict": {"orders": [7]}, + "agg_lag_dict": None}}, + events={ + "auto_holiday": False, + "holiday_lookup_countries": ["US"], + "holiday_pre_num_days": 2, + "holiday_post_num_days": 2, + "daily_event_df_dict": None}, + custom={ + "extra_pred_cols": ["dow"], + "min_admissible_value": 0, + "normalize_method": "zero_to_one"}) + + return ForecastConfig( + model_template="SILVERKITE_EMPTY", + metadata_param=metadata, + coverage=None, + evaluation_period_param=evaluation_period, + forecast_horizon=1, + model_components_param=model_components) + + +def test_greykite_init(): + """Tests ``GreykiteDetector`` initialization.""" + detector = GreykiteDetector() + assert detector.forecast_config is not None + assert detector.ad_config is not None + assert detector.reward is not None + + +def test_greykite_detector_hourly_f1(hourly_data, forecast_config_info_hourly): + """Tests ``GreykiteDetector`` with F1 score as reward on hourly data.""" + df_train = hourly_data["df_train"] + df_test = hourly_data["df_test"] + df = hourly_data["df"] + + forecast_config = forecast_config_info_hourly + ad_config = ADConfig( + volatility_features_list=[["dow"], ["hour"]], + coverage_grid=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + target_anomaly_percent=None, + variance_scaling=False) + + train_data = DetectorData(df=df_train) + + def reward_func(data): + return calc_f1( + 
y_true=data.y_true, + y_pred=data.y_pred) + + reward = Reward(reward_func) + + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=reward) + + detector.fit(data=train_data) + + # Checks optimal parameter. + assert detector.fit_info["param"] == { + "coverage": 0.99, + "volatility_features": ["dow"]} + # Checks parameter grid. + param_obj_list = detector.fit_info["param_obj_list"] + param_eval_df = pd.DataFrame.from_records(param_obj_list) + assert list(param_eval_df.columns) == ["coverage", "volatility_features", "obj_value"] + + param_eval_df["volatility_features"] = param_eval_df["volatility_features"].map(str) + fig = px.line( + param_eval_df, + x="coverage", + y="obj_value", + color="volatility_features", + title="'GreykiteDetector' result of parameter search: f1, hourly data") + assert fig is not None + if FIG_SHOW: + fig.show() + + test_data = DetectorData( + df=df_test, + y_true=df_test[ANOMALY_COL]) + + test_data = detector.predict(test_data) + test_obj_value = detector.reward.apply(test_data) + assert test_obj_value == pytest.approx(0.70, 0.01) + + test_recall = calc_recall( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + test_precision = calc_precision( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + assert test_recall == pytest.approx(1.00, 0.001) + assert test_precision == pytest.approx(0.545, 0.001) + + fit_data = detector.fit_info["best_calc_result"].data + fit_df = fit_data.pred_df + pred_df = test_data.pred_df + + # Checks if we get the expected columns in the fit / prediction data. 
+ assert list(pred_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + assert list(fit_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + fit_pred_df = pd.concat([fit_df, pred_df], axis=0).reset_index(drop=True) + fit_pred_df[ANOMALY_COL] = df[ANOMALY_COL] + + fig = plt_compare_series_annotations( + df=fit_pred_df, + x_col=TIME_COL, + actual_col=ACTUAL_COL, + actual_label_col=ANOMALY_COL, + forecast_label_col=PREDICTED_ANOMALY_COL, + keep_cols=[PREDICTED_LOWER_COL, PREDICTED_UPPER_COL], + forecast_col=PREDICTED_COL, + standardize_col=None, + title="test_greykite_detector_hourly_f1") + + fig.add_vline( + x=fit_df[TIME_COL].max(), + line_width=1, + line_dash="dash", + line_color="green") + + fig.add_annotation( + x=fit_df[TIME_COL].max(), + y=fit_pred_df[ACTUAL_COL].max(), + text="end of training") + assert fig is not None + if FIG_SHOW: + fig.show() + + +def test_greykite_detector_hourly_anomaly_percent( + hourly_data, + forecast_config_info_hourly): + """Tests ``GreykiteDetector`` with user-specified anomaly percent as reward.""" + df_train = hourly_data["df_train"] + df_test = hourly_data["df_test"] + df = hourly_data["df"] + + forecast_config = forecast_config_info_hourly + ad_config = ADConfig( + volatility_features_list=[["dow"], ["hour"]], + coverage_grid=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + target_anomaly_percent=10.0, + variance_scaling=False) + + train_data = DetectorData(df=df_train) + + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=None) + + detector.fit(data=train_data) + + # Checks optimal parameter. + assert detector.fit_info["param"] == { + "coverage": 0.99, + "volatility_features": ["dow"]} + assert {TIME_COL, ACTUAL_COL, PREDICTED_COL, + PREDICTED_LOWER_COL, PREDICTED_UPPER_COL, + PREDICTED_ANOMALY_COL}.issubset(set(detector.fitted_df.columns)) + # Checks parameter grid. 
+ param_obj_list = detector.fit_info["param_obj_list"] + param_eval_df = pd.DataFrame.from_records(param_obj_list) + assert list(param_eval_df.columns) == ["coverage", "volatility_features", "obj_value"] + + param_eval_df["volatility_features"] = param_eval_df["volatility_features"].map(str) + fig = px.line( + param_eval_df, + x="coverage", + y="obj_value", + color="volatility_features", + title="'GreykiteDetector' res. of param search: reward=anomaly_percent, hourly data") + assert fig is not None + if FIG_SHOW: + fig.show() + + test_data = DetectorData( + df=df_test, + y_true=df_test[ANOMALY_COL]) + + test_data = detector.predict(test_data) + test_obj_value = detector.reward.apply(test_data) + assert test_obj_value == pytest.approx(-1.129, 0.001) + + test_recall = calc_recall( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + test_precision = calc_precision( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + assert test_recall == pytest.approx(1.00, 0.001) + assert test_precision == pytest.approx(0.545, 0.001) + + fit_data = detector.fit_info["best_calc_result"].data + fit_df = fit_data.pred_df + pred_df = test_data.pred_df + + # Checks if we get the expected columns in the fit / prediction data. 
+ assert list(pred_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + assert list(fit_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + fit_pred_df = pd.concat([fit_df, pred_df], axis=0).reset_index(drop=True) + fit_pred_df[ANOMALY_COL] = df[ANOMALY_COL] + + fig = plt_compare_series_annotations( + df=fit_pred_df, + x_col=TIME_COL, + actual_col=ACTUAL_COL, + actual_label_col=ANOMALY_COL, + forecast_label_col=PREDICTED_ANOMALY_COL, + keep_cols=[PREDICTED_LOWER_COL, PREDICTED_UPPER_COL], + forecast_col=PREDICTED_COL, + standardize_col=None, + title="test_greykite_detector_hourly_anomaly_percent") + + fig.add_vline( + x=fit_df[TIME_COL].max(), + line_width=1, + line_dash="dash", + line_color="green") + + fig.add_annotation( + x=fit_df[TIME_COL].max(), + y=fit_pred_df[ACTUAL_COL].max(), + text="end of training") + assert fig is not None + if FIG_SHOW: + fig.show() + + # Tests plot method + fig = detector.plot(title="test_greykite_detector_hourly_anomaly_percent") + assert fig is not None + if FIG_SHOW: + fig.show() + + +def test_greykite_detector_daily_f1( + daily_data, + forecast_config_info_daily): + """Tests ``GreykiteDetector`` with F1 score as reward. + Also tests if specifying objective through `ADConfig` yields the exact same result.""" + df_train = daily_data["df_train"] + df_test = daily_data["df_test"] + df = daily_data["df"] + + forecast_config = forecast_config_info_daily + ad_config = ADConfig( + volatility_features_list=[["dow"], ["is_weekend"]], + coverage_grid=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + variance_scaling=True) + + train_data = DetectorData(df=df_train) + + def reward_func(data): + return calc_f1( + y_true=data.y_true, + y_pred=data.y_pred) + + reward = Reward(reward_func) + + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=reward) + + detector.fit(data=train_data) + # Checks optimal parameter. 
+ assert detector.fit_info["param"] == { + "coverage": 0.99, + "volatility_features": ["is_weekend"]} + # Checks parameter grid. + param_obj_list = detector.fit_info["param_obj_list"] + param_eval_df = pd.DataFrame.from_records(param_obj_list) + + assert list(param_eval_df.columns) == [ + "coverage", + "volatility_features", + "obj_value"] + + param_eval_df["volatility_features"] = param_eval_df["volatility_features"].map(str) + fig = px.line( + param_eval_df, + x="coverage", + y="obj_value", + color="volatility_features", + title="'GreykiteDetector' result of parameter search: reward=f1, daily data") + assert fig is not None + if FIG_SHOW: + fig.show() + + test_data = DetectorData( + df=df_test.copy(), + y_true=df_test[ANOMALY_COL]) + + test_data = detector.predict(test_data) + test_obj_value = detector.reward.apply(test_data) + assert test_obj_value == pytest.approx(0.95454, 0.001) + + test_recall = calc_recall( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + test_precision = calc_precision( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + assert test_recall == pytest.approx(0.95454, 0.001) + assert test_precision == pytest.approx(0.95454, 0.001) + + fit_data = detector.fit_info["best_calc_result"].data + fit_df = fit_data.pred_df + pred_df = test_data.pred_df + # Checks if we get the expected columns in the fit / prediction data. 
+ assert list(pred_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + assert list(fit_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + + fit_obj_value = detector.reward.apply(fit_data) + assert fit_obj_value == pytest.approx(1.0, 0.001) + + fit_recall = calc_recall( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + + fit_precision = calc_precision( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + + assert fit_recall == pytest.approx(1.0, 0.001) + assert fit_precision == pytest.approx(1.0, 0.001) + + fit_pred_df = pd.concat([fit_df, pred_df], axis=0).reset_index(drop=True) + fit_pred_df[ANOMALY_COL] = df[ANOMALY_COL] + + fig = plt_compare_series_annotations( + df=fit_pred_df, + x_col=TIME_COL, + actual_col=ACTUAL_COL, + actual_label_col=ANOMALY_COL, + forecast_label_col=PREDICTED_ANOMALY_COL, + keep_cols=[PREDICTED_LOWER_COL, PREDICTED_UPPER_COL], + forecast_col=PREDICTED_COL, + standardize_col=None, + title="test_greykite_detector_detector_daily_f1") + + fig.add_vline( + x=fit_df[TIME_COL].max(), + line_width=1, + line_dash="dash", + line_color="green") + + fig.add_annotation( + x=fit_df[TIME_COL].max(), + y=fit_pred_df[ACTUAL_COL].max(), + text="end of training") + assert fig is not None + if FIG_SHOW: + fig.show() + + fig = plot_anomalies_over_forecast_vs_actual( + df=fit_pred_df, + time_col=TIME_COL, + actual_col=ACTUAL_COL, + predicted_col=PREDICTED_COL, + predicted_anomaly_col=PREDICTED_ANOMALY_COL, + anomaly_col=ANOMALY_COL, + marker_opacity=0.6, + predicted_anomaly_marker_color="black", + anomaly_marker_color="yellow", + predicted_lower_col=PREDICTED_LOWER_COL, + predicted_upper_col=PREDICTED_UPPER_COL, + train_end_date=fit_df[TIME_COL].max()) + assert fig is not None + if FIG_SHOW: + fig.show() + + # Now we check if specifying objective through `ADConfig` yields the same results. + # We do that by over-writing the objective from None to `F1` + # On the other hand we set the reward to be None. 
+ # We calculate same quantities as the above and assign to variables with f"{quantity}_new". + # Then we compare those new quantities to the quantities obtained already. + assert ad_config.objective is None + ad_config.objective = F1 + reward = None + + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=reward) + + detector.fit(data=train_data) + # Checks optimal parameter. + assert detector.fit_info["param"] == { + "coverage": 0.99, + "volatility_features": ["is_weekend"]} + # Checks parameter grid. + param_obj_list = detector.fit_info["param_obj_list"] + param_eval_df = pd.DataFrame.from_records(param_obj_list) + + assert list(param_eval_df.columns) == [ + "coverage", + "volatility_features", + "obj_value"] + + test_data = DetectorData( + df=df_test.copy(), + y_true=df_test[ANOMALY_COL]) + + test_data = detector.predict(test_data) + test_obj_value_new = detector.reward.apply(test_data) + assert test_obj_value_new == pytest.approx(test_obj_value, 0.001) + + test_recall_new = calc_recall( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + test_precision_new = calc_precision( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + assert test_recall_new == pytest.approx(test_recall, 0.001) + assert test_precision_new == pytest.approx(test_precision, 0.001) + + fit_data = detector.fit_info["best_calc_result"].data + fit_df = fit_data.pred_df + pred_df = test_data.pred_df + # Checks if we get the expected columns in the fit / prediction data. 
+ assert list(pred_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + assert list(fit_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + + fit_obj_value_new = detector.reward.apply(fit_data) + assert fit_obj_value_new == pytest.approx(fit_obj_value, 0.001) + + fit_recall_new = calc_recall( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + + fit_precision_new = calc_precision( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + + assert fit_recall_new == pytest.approx(fit_recall, 0.001) + assert fit_precision_new == pytest.approx(fit_precision, 0.001) + + +def test_greykite_detector_daily_outlier( + daily_data, + forecast_config_info_daily): + """Tests ``GreykiteDetector`` with data injected with a large outlier. + It is worth noting the optimal params and test values have not changed + dramatically compared to the case without the injected outlier in this test: + `test_greykite_detector_daily_f1`. + """ + df_train = daily_data["df_train"].copy() + df_test = daily_data["df_test"] + df = daily_data["df"] + # Creates a very large outlier. + df_train.loc[1, "y"] = 10 * max(abs(df["y"])) + fig = plot_lines_markers(df=df_train, x_col=TIME_COL, line_cols=["y"]) + if FIG_SHOW: + fig.show() + + forecast_config = forecast_config_info_daily + ad_config = ADConfig( + volatility_features_list=[["dow"], ["is_weekend"]], + coverage_grid=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + target_anomaly_percent=None, + variance_scaling=True) + + train_data = DetectorData(df=df_train) + + def reward_func(data): + return calc_f1( + y_true=data.y_true, + y_pred=data.y_pred) + + reward = Reward(reward_func) + + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=reward) + + detector.fit(data=train_data) + # Checks optimal parameter. + assert detector.fit_info["param"] == { + "coverage": 0.99, + "volatility_features": ["is_weekend"]} + + # Checks parameter grid.
+ param_obj_list = detector.fit_info["param_obj_list"] + param_eval_df = pd.DataFrame.from_records(param_obj_list) + + assert list(param_eval_df.columns) == [ + "coverage", + "volatility_features", + "obj_value"] + + param_eval_df["volatility_features"] = param_eval_df["volatility_features"].map(str) + fig = px.line( + param_eval_df, + x="coverage", + y="obj_value", + color="volatility_features", + title="'GreykiteDetector' result of parameter search: reward=f1, daily data") + assert fig is not None + if FIG_SHOW: + fig.show() + + test_data = DetectorData( + df=df_test, + y_true=df_test[ANOMALY_COL]) + + test_data = detector.predict(test_data) + test_obj_value = detector.reward.apply(test_data) + + test_recall = calc_recall( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + test_precision = calc_precision( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + fit_data = detector.fit_info["best_calc_result"].data + fit_df = fit_data.pred_df + pred_df = test_data.pred_df + # Checks if we get the expected columns in the fit / prediction data. 
+ assert list(pred_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + assert list(fit_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + + fit_obj_value = detector.reward.apply(fit_data) + + fit_recall = calc_recall( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + + fit_precision = calc_precision( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + + assert test_obj_value == pytest.approx(0.95, 0.01) + assert fit_obj_value == pytest.approx(1.0, 0.01) + assert test_recall == pytest.approx(0.95, 0.01) + assert test_precision == pytest.approx(0.95, 0.01) + assert fit_recall == pytest.approx(1.0, 0.01) + assert fit_precision == pytest.approx(1.0, 0.01) + + fit_pred_df = pd.concat([fit_df, pred_df], axis=0).reset_index(drop=True) + fit_pred_df[ANOMALY_COL] = df[ANOMALY_COL] + + fig = plt_compare_series_annotations( + df=fit_pred_df, + x_col=TIME_COL, + actual_col=ACTUAL_COL, + actual_label_col=ANOMALY_COL, + forecast_label_col=PREDICTED_ANOMALY_COL, + keep_cols=[PREDICTED_LOWER_COL, PREDICTED_UPPER_COL], + forecast_col=PREDICTED_COL, + standardize_col=None, + title="test_greykite_detector_detector_daily_f1_outlier") + + fig.add_vline( + x=fit_df[TIME_COL].max(), + line_width=1, + line_dash="dash", + line_color="green") + + fig.add_annotation( + x=fit_df[TIME_COL].max(), + y=fit_pred_df[ACTUAL_COL].max(), + text="end of training") + assert fig is not None + if FIG_SHOW: + fig.show() + + # Tests plot method. 
+ fig = detector.plot( + phase="train", + title="test_greykite_detector_detector_daily_f1_outlier - fit phase") + assert fig is not None + if FIG_SHOW: + fig.show() + + fig = detector.plot(title="test_greykite_detector_detector_daily_f1_outlier - predict phase") + assert fig is not None + if FIG_SHOW: + fig.show() + + +def test_greykite_detector_daily_f1_with_ape_filter( + daily_data, + forecast_config_info_daily): + """Tests ``GreykiteDetector`` on daily data with APE filter and F1 score as reward.""" + df_train = daily_data["df_train"] + df_test = daily_data["df_test"] + df = daily_data["df"] + + forecast_config = forecast_config_info_daily + ad_config = ADConfig( + coverage_grid=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + target_anomaly_percent=None, + ape_grid=[0, 20, 50], + variance_scaling=True) + + train_data = DetectorData(df=df_train) + + def reward_func(data): + return calc_f1( + y_true=data.y_true, + y_pred=data.y_pred) + + reward = Reward(reward_func) + + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=reward) + + detector.fit(data=train_data) + # Checks optimal parameter. + assert detector.fit_info["param"] == { + "coverage": 0.5, + "volatility_features": [], + "absolute_percent_error": 20} + # Checks parameter grid. 
+ param_obj_list = detector.fit_info["param_obj_list"] + param_eval_df = pd.DataFrame.from_records(param_obj_list) + + assert list(param_eval_df.columns) == [ + "coverage", + "volatility_features", + "absolute_percent_error", + "obj_value"] + + param_eval_df["absolute_percent_error"] = param_eval_df["absolute_percent_error"].map(str) + fig = px.line( + param_eval_df, + x="coverage", + y="obj_value", + color="absolute_percent_error", + title="'GreykiteDetector' result of parameter search: reward=f1, filter=ape, daily data.") + assert fig is not None + if FIG_SHOW: + fig.show() + + test_data = DetectorData( + df=df_test, + y_true=df_test[ANOMALY_COL]) + + test_data = detector.predict(test_data) + test_obj_value = detector.reward.apply(test_data) + assert test_obj_value == pytest.approx(0.97, 0.01) + + test_recall = calc_recall( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + test_precision = calc_precision( + y_true=test_data.y_true, + y_pred=test_data.y_pred) + + assert test_recall == pytest.approx(0.95454, 0.001) + assert test_precision == pytest.approx(1.00, 0.001) + + fit_data = detector.fit_info["best_calc_result"].data + fit_df = fit_data.pred_df + pred_df = test_data.pred_df + # Checks if we get the expected columns in the fit / prediction data. 
+ assert list(pred_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + assert list(fit_df.columns) == DETECTOR_PREDICT_COLS + [ANOMALY_COL] + + fit_obj_value = detector.reward.apply(fit_data) + assert fit_obj_value == pytest.approx(1.00, 0.01) + + fit_recall = calc_recall( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + + fit_precision = calc_precision( + y_true=fit_data.y_true, + y_pred=fit_data.y_pred) + + assert fit_recall == pytest.approx(1.00, 0.01) + assert fit_precision == pytest.approx(1.00, 0.01) + + fit_pred_df = pd.concat([fit_df, pred_df], axis=0).reset_index(drop=True) + fit_pred_df[ANOMALY_COL] = df[ANOMALY_COL] + + fig = plt_compare_series_annotations( + df=fit_pred_df, + x_col=TIME_COL, + actual_col=ACTUAL_COL, + actual_label_col=ANOMALY_COL, + forecast_label_col=PREDICTED_ANOMALY_COL, + keep_cols=[PREDICTED_LOWER_COL, PREDICTED_UPPER_COL], + forecast_col=PREDICTED_COL, + standardize_col=None, + title="test_greykite_detector_with_ape_filter_daily_f1") + + fig.add_vline( + x=fit_df[TIME_COL].max(), + line_width=1, + line_dash="dash", + line_color="green") + + fig.add_annotation( + x=fit_df[TIME_COL].max(), + y=fit_pred_df[ACTUAL_COL].max(), + text="end of training") + assert fig is not None + if FIG_SHOW: + fig.show() + + fig = plot_anomalies_over_forecast_vs_actual( + df=fit_pred_df, + time_col=TIME_COL, + actual_col=ACTUAL_COL, + predicted_col=PREDICTED_COL, + predicted_anomaly_col=PREDICTED_ANOMALY_COL, + anomaly_col=ANOMALY_COL, + marker_opacity=0.6, + predicted_anomaly_marker_color="black", + anomaly_marker_color="yellow", + predicted_lower_col=PREDICTED_LOWER_COL, + predicted_upper_col=PREDICTED_UPPER_COL, + train_end_date=fit_df[TIME_COL].max()) + assert fig is not None + if FIG_SHOW: + fig.show() + + +def test_greykite_detector_daily_anomaly_at_df_end(daily_data, forecast_config_info_daily): + """Tests ``GreykiteDetector`` when anomaly is at the end of the training data.""" + df_train = daily_data["df_train"].copy() + # Adds 
anomalies at the end of the training data. + df_train["y"][-4:] = np.NaN + fig = plot_lines_markers(df=df_train, x_col=TIME_COL, line_cols=["y"]) + if FIG_SHOW: + fig.show() + + forecast_config = forecast_config_info_daily + ad_config = ADConfig( + volatility_features_list=[["dow"], ["is_weekend"]], + coverage_grid=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + target_anomaly_percent=None, + variance_scaling=True) + + train_data = DetectorData(df=df_train) + + def reward_func(data): + return calc_f1( + y_true=data.y_true, + y_pred=data.y_pred) + + reward = Reward(reward_func) + + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=reward) + + detector.fit(data=train_data) + # Checks optimal parameter. + assert detector.fit_info["param"] == { + "coverage": 0.99, + "volatility_features": ["is_weekend"]} + + # Checks parameter grid. + param_obj_list = detector.fit_info["param_obj_list"] + param_eval_df = pd.DataFrame.from_records(param_obj_list) + + assert list(param_eval_df.columns) == [ + "coverage", + "volatility_features", + "obj_value"] + + param_eval_df["volatility_features"] = param_eval_df["volatility_features"].map(str) + fig = px.line( + param_eval_df, + x="coverage", + y="obj_value", + color="volatility_features", + title="'GreykiteDetector' result of parameter search: reward=f1, daily data") + assert fig is not None + if FIG_SHOW: + fig.show() + + +def test_merge_anomaly_info(): + """Tests ``merge_anomaly_info`` method.""" + periods = 10 + df = pd.DataFrame({ + TIME_COL: pd.date_range(start="2020-01-01", periods=periods), + VALUE_COL: range(periods), + ANOMALY_COL: [0, 0, 1, 1, 0, 0, 0, 0, 0, 0] # Anomalies on 2020-01-03 and 2020-01-04 + }) + y_true = [0, 0, 1, 1, 0, 0, 0, 0, 0, 1] # Anomalies on 2020-01-03, 2020-01-04 and 2020-01-10 + anomaly_df = pd.DataFrame({ + START_TIME_COL: ["2020-01-04"], + END_TIME_COL: ["2020-01-06"]}) + data = DetectorData( + df=df, + y_true=y_true, + 
anomaly_df=anomaly_df) + + detector = GreykiteDetector() + fit_data = detector.merge_anomaly_info(data, freq="D") + # Checks `anomaly_df` in `fit_data`. + # We expect combined anomalies from 2020-01-03 to 2020-01-06, and on 2020-01-10. + expected_anomaly_df = pd.DataFrame({ + START_TIME_COL: pd.to_datetime(["2020-01-03", "2020-01-10"]), + END_TIME_COL: pd.to_datetime(["2020-01-06", "2020-01-10"])}) + assert fit_data.anomaly_df.equals(expected_anomaly_df) + # Checks `y_true` in `fit_data`. + expected_y_true = pd.Series([0, 0, 1, 1, 1, 1, 0, 0, 0, 1]).astype(bool) + assert fit_data.y_true.equals(expected_y_true) + # Checks `df` in `fit_data`. + assert fit_data.df[ANOMALY_COL].equals(expected_y_true) + assert f"adjusted_{VALUE_COL}" in fit_data.df.columns + + +def test_greykite_detector_hourly_anomaly_pickup(): + """Tests that anomaly data is picked up properly by the ``GreykiteDetector``. + + Anomalies are injected into the training data and the anomaly info is passed to + ``GreykiteDetector`` via `anomaly_df`. + + We check that the anomaly info is picked up by the detector during training + and the future forecasts are unaffected by the anomaly values. + """ + metadata = MetadataParam(freq="H") + + evaluation_period = EvaluationPeriodParam( + test_horizon=0, + cv_max_splits=0) + + # This forecast config includes the median of the past three weeks as an important predictor. + # Therefore, if that median is off due to anomalies, the prediction can be off. + # In this test, we show that by labeling those points and passing that information to the model + # via `anomaly_df`, we can prevent the model from fitting to those values.
+ model_components = ModelComponentsParam( + growth={ + "growth_term": "linear"}, + hyperparameter_override={ + "input__response__null__impute_algorithm": "ts_interpolate", + "input__response__null__impute_params": { + "orders": [168, 336, 504]}}, + seasonality={ + "monthly_seasonality": 2, + "yearly_seasonality": 7}, + events={ + "holiday_lookup_countries": ["US"], + "holiday_pre_num_days": 2, + "holiday_post_num_days": 2}, + autoregression={ + "autoreg_dict": { + "agg_lag_dict": { + "orders_list": [[168, 336, 504]], + "agg_func": "median"}}}, + custom={ + "fit_algorithm_dict": {"fit_algorithm": "ridge"}, + "min_admissible_value": 0, + "normalize_method": "zero_to_one", + "extra_pred_cols": [ + "is_event:is_weekend:C(hour)", + "dow_hr", + "y_avglag_168_336_504*dow_hr", + "y_avglag_168_336_504*sin1_ct1_yearly", + "y_avglag_168_336_504*cos1_ct1_yearly", + "y_avglag_168_336_504*sin1_tom_monthly", + "y_avglag_168_336_504*cos1_tom_monthly", + "us_dst*dow_hr"]}) + + forecast_config = ForecastConfig( + model_template="SILVERKITE_EMPTY", + metadata_param=metadata, + coverage=0.95, + evaluation_period_param=evaluation_period, + forecast_horizon=1, + model_components_param=model_components) + + ad_config = ADConfig( + target_anomaly_percent=0.35, + volatility_features_list=[["hour"]], + coverage_grid=[0.996], + sape_grid=[5.0, 10.0, 15.0, 20.0, 30.0, 40.0, 50.0, 60.0], + variance_scaling=True) + + input_data = generate_df_for_tests( + freq="H", + train_start_date=datetime.datetime(2020, 1, 1), + intercept=50, + train_frac=0.99, + periods=24*60) + df_train = input_data["train_df"] + df_test = input_data["test_df"] + + # Injects 3 anomalies at the same time of the week (0 values). + # Note that if model does not have access to these anomaly labels, + # the prediction can be off. + df_train.loc[[24*28, 24*35, 24*42], "y"] = 0 + # Passes the anomaly info to the `detector` via `anomaly_df`. 
+ anomaly_df = pd.DataFrame({ + START_TIME_COL: ["2020-01-29 00:00:00", "2020-02-05 00:00:00", "2020-02-12 00:00:00"], + END_TIME_COL: ["2020-01-29 00:00:00", "2020-02-05 00:00:00", "2020-02-12 00:00:00"]}) + + fig = plot_lines_markers( + df=df_train, + x_col=TIME_COL, + line_cols=["y"]) + fig.layout.update(title="Generation of daily anomalous data") + fig.update_yaxes() + assert fig is not None + if FIG_SHOW: + fig.show() + + # Trains `GreykiteDetector`. + train_data = DetectorData(df=df_train, anomaly_df=anomaly_df) + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=None) + detector.fit(data=train_data) + # We expect the anomalies to be picked up by the `detector` during training. + if FIG_SHOW: + detector.plot(phase="train") + + # Predicts on test data. + test_data = DetectorData( + df=df_test) + detector.predict(test_data) + fig = detector.plot() + assert fig is not None + # Adds a vertical line at the next hourly datapoint. + # The forecast at this data point should not be close to 0. + # This is because even though the median aggregated lag across the previous 3 weeks is 0, + # the anomaly labels help the model not fit those values. 
+ fig.add_vline(x="2020-02-19 00:00:00", line_dash="dash") + if FIG_SHOW: + fig.show() + + +def test_summary(daily_data, forecast_config_info_daily): + """Tests ``summary`` method.""" + df_train = daily_data["df_train"] + forecast_config = forecast_config_info_daily + ad_config = ADConfig( + coverage_grid=[0.2, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.9, 0.95, 0.99, 0.999], + target_anomaly_percent=None, + ape_grid=[0, 20, 50], + variance_scaling=True) + + train_data = DetectorData(df=df_train) + + def reward_func(data): + return calc_f1( + y_true=data.y_true, + y_pred=data.y_pred) + reward = Reward(reward_func) + + detector = GreykiteDetector( + forecast_config=forecast_config, + ad_config=ad_config, + reward=reward) + detector.fit(data=train_data) + + summary = detector.summary() + assert "Anomaly Detection Model Summary" in summary + assert "Average Anomaly Duration" in summary + assert "Precision" in summary + assert "Recall" in summary + assert "Optimal Parameters" in summary + # Checks if the summary contains the forecast model summary. + assert "Residuals" in summary + assert "Multiple R-squared" in summary diff --git a/greykite/tests/detection/detector/test_optimizer.py b/greykite/tests/detection/detector/test_optimizer.py new file mode 100644 index 0000000..8355ba4 --- /dev/null +++ b/greykite/tests/detection/detector/test_optimizer.py @@ -0,0 +1,112 @@ +import numpy as np +import pandas as pd + +from greykite.common.viz.timeseries_annotate import plot_lines_markers +from greykite.detection.detector.optimizer import CalcResult +from greykite.detection.detector.optimizer import Optimizer +from greykite.detection.detector.reward import Reward + + +def test_optimizer(): + """Tests ``Optimizer`` class.""" + optimizer = Optimizer() + assert optimizer.reward is None + assert optimizer.fit_info == {"param_full": None} + + +def test_optimizer1(): + """Tests ``Optimizer`` class. 
+ This is a simple test where the optimization is used to find the roots + of the polynomial: ``x**2 + 2*x + 1``. + In this simple example, the optimization does not depend on ``data``. + """ + + def distance_to_zero(x): + """This is the reward function which checks how close to zero + the result is.""" + return -abs(x) + + reward = Reward(reward_func=distance_to_zero) + + optimizer = Optimizer( + reward=reward, + param_iterable=[{"x": x} for x in np.arange(-5, 5, 0.1)]) + + def calc_with_param(param, data=None): + """This is the calculation step with ``param`` and ``data``. + In this simple example, this does not depend on ``data``. + """ + x = param["x"] + return CalcResult(data=x**2 + 2*x + 1, model=None) + + optimizer.calc_with_param = calc_with_param + + # ``data`` does not need to be passed below + # because ``calc_with_param`` does not use it + optim_res = optimizer.optimize_param() + + best_param = optim_res["best_param"] + best_param_full = optim_res["best_param_full"] + assert round(best_param["x"], 2) == -1.00 + assert round(best_param_full["x"], 2) == -1.00 + + param_obj_list = optim_res["param_obj_list"] + param_obj_df = pd.DataFrame.from_records(param_obj_list) + + fig = plot_lines_markers( + df=param_obj_df, + x_col="x", + line_cols=["obj_value"]) + fig.layout.update(title="`Optimizer` parameter search for roots of `x**2 + 2*x + 1`") + assert fig is not None + # fig.show() + + +def test_optimizer2(): + """Tests ``Optimizer`` class. + This is a slightly more complex test than above + where the optimization is used to find the roots + of the polynomial: ``x**p + p*x + p`` + where ``p`` is determined by ``data``.
+ """ + + def distance_to_zero(x): + """This is the reward function which checks how close to zero + the result is.""" + return -abs(x) + + reward = Reward(reward_func=distance_to_zero) + + optimizer = Optimizer( + reward=reward, + param_iterable=[{"x": x} for x in np.arange(-5, 5, 0.1)]) + + def calc_with_param(param, data): + """This is the calculation step with ``param`` and ``data``. + In this example, ``p`` which is the polynomial power is + determined / specified in ``data``. + """ + x = param["x"] + p = data["p"] + return CalcResult(data=x**p + p*x + p, model=None) + + optimizer.calc_with_param = calc_with_param + + optim_res = optimizer.optimize_param(data={"p": 3}) + + best_param = optim_res["best_param"] + best_param_full = optim_res["best_param_full"] + assert round(best_param["x"], 2) == -0.80 + assert round(best_param_full["x"], 2) == -0.80 + + param_obj_list = optim_res["param_obj_list"] + param_obj_df = pd.DataFrame.from_records(param_obj_list) + + fig = plot_lines_markers( + df=param_obj_df, + x_col="x", + line_cols=["obj_value"]) + fig.layout.update( + title="`Optimizer` parameter search for roots of `x**p + p*x + p`; p=3") + assert fig is not None + # fig.show() diff --git a/greykite/tests/detection/detector/test_reward.py b/greykite/tests/detection/detector/test_reward.py new file mode 100644 index 0000000..64c5422 --- /dev/null +++ b/greykite/tests/detection/detector/test_reward.py @@ -0,0 +1,144 @@ +from greykite.detection.common.ad_evaluation import f1_score +from greykite.detection.common.ad_evaluation import precision_score +from greykite.detection.common.ad_evaluation import recall_score +from greykite.detection.detector.ad_utils import partial_return +from greykite.detection.detector.reward import Reward + + +def test_reward(): + """Tests `Reward` class.""" + y_true = [True, True, False, True, True, False, False, False, True, True] + y_pred = [False, True, False, True, True, False, True, False, True, False] + + f1 = 
partial_return(f1_score, True) + prec = partial_return(precision_score, True) + rec = partial_return(recall_score, True) + + raw_f1_value = f1(y_true=y_true, y_pred=y_pred) + raw_recall_value = rec(y_true=y_true, y_pred=y_pred) + + assert round(raw_f1_value, 2) == 0.73 + assert round(raw_recall_value, 2) == 0.67 + + obj_value = Reward(f1).apply(y_true=y_true, y_pred=y_pred) + + assert obj_value == raw_f1_value + + # This penalizes f1 values under 0.75 by `penalty == -1` + obj_value = Reward( + f1, + min_unpenalized=0.75, + penalty=-1).apply(y_true=y_true, y_pred=y_pred) + + assert obj_value == raw_f1_value - 1.0 + + # The penalty won't take effect since `min_unpenalized < raw_f1_value` + obj_value = Reward( + f1, + min_unpenalized=0.70, + penalty=-1).apply(y_true=y_true, y_pred=y_pred) + + assert obj_value == raw_f1_value + + # Multiplicative penalty + obj_value = Reward( + f1, + min_unpenalized=0.75, + penalize_method="multiplicative", + penalty=0.1).apply(y_true=y_true, y_pred=y_pred) + + assert obj_value == raw_f1_value * 0.1 + + # Recall, penalty won't take effect since `0.66 < raw_recall_value` + obj_value = Reward( + rec, + min_unpenalized=0.66, + penalize_method="additive", + penalty=-1).apply(y_true=y_true, y_pred=y_pred) + + assert obj_value == raw_recall_value + + # Recall, penalty will take effect since `raw_recall_value > max_unpenalized` (0.67 > 0.66) + obj_value = Reward( + rec, + max_unpenalized=0.66, + penalize_method="additive", + penalty=+3.0).apply(y_true=y_true, y_pred=y_pred) + + assert obj_value == raw_recall_value + 3.0 + + # Combining rewards (adding them) + # In this scenario, we penalize all recalls less than 0.8 by -1 + # While we add it to raw F1 + combined_reward = ( + Reward( + rec, + min_unpenalized=0.8, + penalize_method="additive", + penalty=-1) + + Reward(f1)) + + obj_value = combined_reward.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == raw_f1_value + raw_recall_value - 1.0 + + # This adds a numeric value to f1 + combined_reward = Reward(f1) +
13 + obj_value = combined_reward.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == raw_f1_value + 13 + + # This multiplies a numeric value to f1 + combined_reward = (Reward(f1) * 0) + obj_value = combined_reward.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == 0 + + # This divides f1 by a numeric value + combined_reward = (Reward(f1) / 0) + obj_value = combined_reward.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == float("inf") + + # This divides 0 by f1 + combined_reward = (0 / Reward(f1)) + obj_value = combined_reward.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == 0 + + # This divides 17 by f1 + combined_reward = (17 / Reward(f1)) + obj_value = combined_reward.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == 17.0 / raw_f1_value + + # This adds from right + combined_reward = 13 + Reward(f1) + obj_value = combined_reward.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == raw_f1_value + 13 + + # Punishes recalls less than 0.8 harshly by assigning -inf + # This is useful in constrained optimization + combined_reward = ( + Reward( + rec, + min_unpenalized=0.8, + penalize_method="additive", + penalty=float("-inf")) + + Reward(f1)) + + obj_value = combined_reward.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == float("-inf") + + # Multiplication of recall and F1 with penalty on the recall part + combined_reward = ( + Reward( + rec, + min_unpenalized=0.8, + penalize_method="multiplicative", + penalty=0.1) * + Reward(f1)) + + obj_value = combined_reward.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == raw_f1_value * raw_recall_value * 0.1 + + # Apply the class operations to construct f1 + rec_obj = Reward(rec) + prec_obj = Reward(prec) + half_f1_obj = (2 * rec_obj * prec_obj) / (rec_obj + prec_obj) + obj_value = half_f1_obj.apply(y_true=y_true, y_pred=y_pred) + assert obj_value == raw_f1_value diff --git a/greykite/tests/framework/templates/test_forecast_config.py 
b/greykite/tests/framework/templates/test_forecast_config.py index 0a84288..55dc190 100644 --- a/greykite/tests/framework/templates/test_forecast_config.py +++ b/greykite/tests/framework/templates/test_forecast_config.py @@ -1,6 +1,8 @@ import json +from copy import deepcopy from typing import Optional +import pandas as pd import pytest from pytest import fail @@ -17,9 +19,12 @@ from greykite.framework.templates.autogen.forecast_config import ForecastConfig from greykite.framework.templates.autogen.forecast_config import MetadataParam from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam +from greykite.framework.templates.autogen.forecast_config import assert_equal_forecast_config from greykite.framework.templates.autogen.forecast_config import forecast_config_from_dict from greykite.framework.templates.autogen.forecast_config import from_list_dict from greykite.framework.templates.autogen.forecast_config import from_list_dict_or_none +from greykite.framework.templates.autogen.forecast_config_utils import FORECAST_CONFIG_JSON_COMPLETE +from greykite.framework.templates.autogen.forecast_config_utils import FORECAST_CONFIG_JSON_DEFAULT from greykite.framework.templates.forecast_config_defaults import ForecastConfigDefaults from greykite.framework.templates.model_templates import ModelTemplateEnum @@ -468,3 +473,61 @@ def test_forecast_one_by_one(): assert config.to_dict()["forecast_one_by_one"] == [1, 2, 3] config = ForecastConfig().from_dict({"forecast_one_by_one": [1, 2, 3]}) assert config.forecast_one_by_one == [1, 2, 3] + + +def test_forecast_config_from_json(): + """Tests `from_json` method in `ForecastConfig`.""" + for param in [FORECAST_CONFIG_JSON_DEFAULT, FORECAST_CONFIG_JSON_COMPLETE]: + forecast_config = param.get("forecast_config") + forecast_json = param.get("forecast_json") + translated_forecast_config = ForecastConfig.from_json(forecast_json) + assert_equal_forecast_config(forecast_config, translated_forecast_config) + +
# Raises error when input is not a string + with pytest.raises(ValueError, match="is not a json string."): + ForecastConfig.from_json(5) + + # Raises error when input is not a json string + with pytest.raises(ValueError, match="not a json string"): + json_str = "This is not a json str" + ForecastConfig.from_json(json_str) + + +def test_assert_equal_forecast_config(): + """Tests `assert_equal_forecast_config`.""" + forecast_config_default = FORECAST_CONFIG_JSON_DEFAULT.get("forecast_config") + assert_equal_forecast_config(forecast_config_default, forecast_config_default) + + forecast_config_complete = FORECAST_CONFIG_JSON_COMPLETE.get("forecast_config") + assert_equal_forecast_config(forecast_config_complete, forecast_config_complete) + + # Raises error when the `ForecastConfig`s do not match + with pytest.raises(AssertionError, match="Actual should be a dict, found None."): + assert_equal_forecast_config(forecast_config_default, forecast_config_complete) + + # Raises error when anomaly dataframe is different, this is not captured by dataclass equality check + with pytest.raises(AssertionError, match="\\(column name=\"start\"\\) values are different"): + anomaly_df = pd.DataFrame({ + "start": ["2020-01-01", "2020-02-02"], + "end": ["2020-01-10", "2020-02-05"] + }) + config1 = deepcopy(forecast_config_complete) + config1.metadata_param.anomaly_info = { + "value_col": "value", + "anomaly_df": anomaly_df, + } + + anomaly_df = pd.DataFrame({ + "start": ["2020-01-05", "2020-02-02"], + "end": ["2020-01-10", "2020-02-10"] + }) + config2 = deepcopy(forecast_config_complete) + config2.metadata_param.anomaly_info = { + "value_col": "value", + "anomaly_df": anomaly_df, + } + assert_equal_forecast_config(config1, config2) + + # Raises error when one of the inputs is not a `ForecastConfig` + with pytest.raises(ValueError, match="is not a member of 'ForecastConfig' class."): + assert_equal_forecast_config(forecast_config_default, 4) diff --git
a/greykite/tests/framework/templates/test_forecaster.py b/greykite/tests/framework/templates/test_forecaster.py index ac7f892..b082386 100644 --- a/greykite/tests/framework/templates/test_forecaster.py +++ b/greykite/tests/framework/templates/test_forecaster.py @@ -681,6 +681,7 @@ def test_run_forecast_config_with_single_simple_silverkite_template(): "estimator__holiday_post_num_days": [0], "estimator__holiday_pre_post_num_dict": [None], "estimator__daily_event_df_dict": [None], + 'estimator__auto_holiday_params': [None], "estimator__daily_event_neighbor_impact": [None], "estimator__daily_event_shifted_effect": [None], "estimator__auto_growth": [False], diff --git a/greykite/tests/framework/templates/test_multistage_forecast_template.py b/greykite/tests/framework/templates/test_multistage_forecast_template.py index 77efa88..e3c40ef 100644 --- a/greykite/tests/framework/templates/test_multistage_forecast_template.py +++ b/greykite/tests/framework/templates/test_multistage_forecast_template.py @@ -335,6 +335,7 @@ def test_get_hyperparameter_grid_extra_configs(df, forecast_config): 'holiday_post_num_days': 1, 'holiday_pre_post_num_dict': None, 'daily_event_df_dict': None, + "auto_holiday_params": None, 'daily_event_neighbor_impact': None, 'daily_event_shifted_effect': None, 'feature_sets_enabled': 'auto', @@ -378,6 +379,7 @@ def test_get_hyperparameter_grid_extra_configs(df, forecast_config): 'holiday_post_num_days': 0, 'holiday_pre_post_num_dict': None, 'daily_event_df_dict': None, + "auto_holiday_params": None, 'daily_event_neighbor_impact': None, 'daily_event_shifted_effect': None, 'feature_sets_enabled': 'auto', @@ -599,6 +601,7 @@ def test_get_estimators_and_params_from_template_configs(df, forecast_config): 'estimator__holiday_post_num_days': [1], 'estimator__holiday_pre_post_num_dict': [None], 'estimator__daily_event_df_dict': [None], + 'estimator__auto_holiday_params': [None], 'estimator__daily_event_neighbor_impact': [None], 
'estimator__daily_event_shifted_effect': [None], 'estimator__feature_sets_enabled': ['auto'], @@ -643,6 +646,7 @@ def test_get_estimators_and_params_from_template_configs(df, forecast_config): 'estimator__holiday_post_num_days': [0], 'estimator__holiday_pre_post_num_dict': [None], 'estimator__daily_event_df_dict': [None], + 'estimator__auto_holiday_params': [None], 'estimator__daily_event_neighbor_impact': [None], 'estimator__daily_event_shifted_effect': [None], 'estimator__feature_sets_enabled': ['auto'], diff --git a/greykite/tests/framework/templates/test_multistage_forecast_template_config.py b/greykite/tests/framework/templates/test_multistage_forecast_template_config.py index 5c00cf0..b6abe14 100644 --- a/greykite/tests/framework/templates/test_multistage_forecast_template_config.py +++ b/greykite/tests/framework/templates/test_multistage_forecast_template_config.py @@ -195,6 +195,7 @@ def test_multistage_forecast_silverkite_wow(): "holiday_post_num_days": 0, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, diff --git a/greykite/tests/framework/templates/test_simple_silverkite_template.py b/greykite/tests/framework/templates/test_simple_silverkite_template.py index c2d530c..341d23d 100644 --- a/greykite/tests/framework/templates/test_simple_silverkite_template.py +++ b/greykite/tests/framework/templates/test_simple_silverkite_template.py @@ -295,6 +295,7 @@ def test_get_single_model_components_param_from_template(): "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -365,6 +366,7 @@ def test_get_single_model_components_param_from_template(): "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": 
None, "daily_event_shifted_effect": None }, @@ -434,6 +436,7 @@ def test_get_single_model_components_param_from_template(): "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -503,6 +506,7 @@ def test_get_single_model_components_param_from_template(): "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -573,6 +577,7 @@ def test_get_single_model_components_param_from_template(): "holiday_post_num_days": 0, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -638,6 +643,7 @@ def test_get_model_components_from_model_template(silverkite): "holiday_post_num_days": 2, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None } @@ -776,6 +782,7 @@ def test_get_model_components_from_model_template(silverkite): "holiday_post_num_days": 2, "holiday_pre_post_num_dict": {"New Year's Day": (7, 3)}, "daily_event_df_dict": daily_event_df_dict, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }) @@ -1101,6 +1108,7 @@ def test_get_model_components_and_override_from_model_template_single(): "holiday_post_num_days": 0, "holiday_pre_post_num_dict": None, "daily_event_df_dict": None, + "auto_holiday_params": None, "daily_event_neighbor_impact": None, "daily_event_shifted_effect": None }, @@ -1338,6 +1346,7 @@ def test_apply_default_model_components_daily_1(): estimator__holiday_post_num_days=[2], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + 
estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1399,6 +1408,7 @@ def test_apply_default_model_components_daily_1(): estimator__holiday_post_num_days=[2], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1460,6 +1470,7 @@ def test_apply_default_model_components_daily_1(): estimator__holiday_post_num_days=[2], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1536,6 +1547,7 @@ def test_apply_default_model_components_daily_90(): estimator__holiday_post_num_days=[2], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1591,6 +1603,7 @@ def test_apply_default_model_components_daily_90(): estimator__holiday_post_num_days=[2], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1654,6 +1667,7 @@ def test_apply_default_model_components_daily_90(): estimator__holiday_post_num_days=[2], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1717,6 +1731,7 @@ def test_apply_default_model_components_daily_90(): estimator__holiday_post_num_days=[4], 
estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1781,6 +1796,7 @@ def test_apply_default_model_components_weekly(): estimator__holiday_post_num_days=[0], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1844,6 +1860,7 @@ def test_apply_default_model_components_weekly(): estimator__holiday_post_num_days=[0], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1907,6 +1924,7 @@ def test_apply_default_model_components_weekly(): estimator__holiday_post_num_days=[0], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -1970,6 +1988,7 @@ def test_apply_default_model_components_weekly(): estimator__holiday_post_num_days=[0], estimator__holiday_pre_post_num_dict=[None], estimator__daily_event_df_dict=[None], + estimator__auto_holiday_params=[None], estimator__daily_event_neighbor_impact=[None], estimator__daily_event_shifted_effect=[None], # Feature sets @@ -2033,6 +2052,7 @@ def test_get_simple_silverkite_hyperparameter_grid(silverkite): "estimator__holiday_post_num_days": [2], "estimator__holiday_pre_post_num_dict": [None], "estimator__daily_event_df_dict": [None], + "estimator__auto_holiday_params": [None], "estimator__daily_event_neighbor_impact": [None], "estimator__daily_event_shifted_effect": [None], "estimator__auto_growth": [False], 
@@ -3629,7 +3649,7 @@ def test_silverkite_auto_config(): "order": [3, 1, 1, 6], "seas_names": ["weekly", "monthly", "quarterly", "yearly"] })) - assert len(result.model[-1].model_dict["daily_event_df_dict"]) == 199 + assert len(result.model[-1].model_dict["daily_event_df_dict"]) == 5 assert "ct1" in result.model[-1].model_dict["x_mat"].columns diff --git a/greykite/tests/sklearn/estimator/test_simple_silverkite_estimator.py b/greykite/tests/sklearn/estimator/test_simple_silverkite_estimator.py index d5cb62f..6fe74b8 100644 --- a/greykite/tests/sklearn/estimator/test_simple_silverkite_estimator.py +++ b/greykite/tests/sklearn/estimator/test_simple_silverkite_estimator.py @@ -591,16 +591,19 @@ def test_auto_config(): model = SimpleSilverkiteEstimator( forecast_horizon=7, auto_holiday=True, - holidays_to_model_separately="auto", + holidays_to_model_separately=["custom_event"], holiday_lookup_countries="auto", - holiday_pre_num_days=2, - holiday_post_num_days=2, + holiday_pre_num_days=0, + holiday_post_num_days=0, daily_event_df_dict=dict( custom_event=pd.DataFrame({ EVENT_DF_DATE_COL: pd.to_datetime(["2010-03-03", "2011-03-03", "2012-03-03"]), EVENT_DF_LABEL_COL: "threethree" }) ), + auto_holiday_params=dict( + n_clusters=5 + ), auto_growth=True, growth_term="quadratic", changepoints_dict=dict( @@ -630,9 +633,9 @@ def test_auto_config(): assert "ct1" in model.model_dict["x_mat"].columns assert model.model_dict["changepoints_dict"]["method"] == "custom" # Holidays is overridden by auto seasonality. 
- assert len(model.model_dict["daily_event_df_dict"]) == 203 + assert len(model.model_dict["daily_event_df_dict"]) == 6 assert "custom_event" in model.model_dict["daily_event_df_dict"] - assert "China_Chinese New Year" in model.model_dict["daily_event_df_dict"] + assert "holiday_group_0" in model.model_dict["daily_event_df_dict"] def test_quantile_regression_uncertainty_model(): diff --git a/greykite/tests/sklearn/transform/test_difference_based_outlier_transformer.py b/greykite/tests/sklearn/transform/test_difference_based_outlier_transformer.py new file mode 100644 index 0000000..3edf451 --- /dev/null +++ b/greykite/tests/sklearn/transform/test_difference_based_outlier_transformer.py @@ -0,0 +1,120 @@ +""" +Test for difference_based_outlier_transformer.py +""" +import numpy as np +import pandas as pd +import pytest +from sklearn.exceptions import NotFittedError +from testfixtures import LogCapture + +from greykite.common.constants import LOGGER_NAME +from greykite.common.python_utils import assert_equal +from greykite.sklearn.transform.difference_based_outlier_transformer import DifferenceBasedOutlierTransformer + + +@pytest.fixture +def data(): + """Generates test dataframe for outlier detection""" + np.random.seed(100) + df = pd.DataFrame({ + "a": np.repeat(1.0, 100), + "b": np.arange(0.0, 100.0, 1.0) + np.random.normal(size=100, loc=0, scale=0.1), + "c": np.tile([1.0, 2.0, 3.0, 4.0], 25), + "d": np.repeat(1.0, 100), + }) + df.loc[2, "b"] = 100.0 + df.loc[6, "d"] = 100.0 + return df + + +def test_difference_based_outlier_transformer(data): + """Checks if outliers are properly replaced""" + transformer = DifferenceBasedOutlierTransformer( + method="z_score", + score_type="difference", + params=dict( + agg_func=np.nanmean, + lag_orders=[-1, 1], + z_cutoff=3.5, + max_outlier_percent=5.0 + ) + ) + # init does not modify parameters. 
+ assert transformer.method == "z_score" + assert transformer.score_type == "difference" + assert transformer.score is None + transformer.fit(data) + assert transformer.agg_func == np.nanmean + assert transformer.lag_orders == [-1, 1] + assert transformer.z_cutoff == 3.5 + assert transformer.max_outlier_percent == 5.0 + assert transformer.score is not None + # `transform` removes outliers based on `transformer.score`. + with LogCapture(LOGGER_NAME) as log_capture: + result = transformer.transform(data) + expected = data.copy() + expected.loc[[1, 2, 3], "b"] = np.nan + expected.loc[[5, 6, 7], "d"] = np.nan + assert_equal(result, expected) + log_capture.check( + (LOGGER_NAME, "INFO", "Detected 6 outlier(s).")) + + transformer = DifferenceBasedOutlierTransformer( + method="tukey", + score_type="ratio", + params=dict( + agg_func=np.nanmean, + lag_orders=[-1, 1], + tukey_cutoff=3.5, + max_outlier_percent=5.0 + ) + ) + # init does not modify parameters. + assert transformer.method == "tukey" + assert transformer.score_type == "ratio" + assert transformer.score is None + transformer.fit(data) + assert transformer.agg_func == np.nanmean + assert transformer.lag_orders == [-1, 1] + assert transformer.tukey_cutoff == 3.5 + assert transformer.max_outlier_percent == 5.0 + assert transformer.score is not None + # `transform` removes outliers based on `transformer.score`.
+ with LogCapture(LOGGER_NAME) as log_capture: + result = transformer.transform(data) + expected = data.copy() + expected.loc[[0, 1, 2, 3, 4], "b"] = np.nan + assert_equal(result, expected) + log_capture.check( + (LOGGER_NAME, "INFO", "Detected 5 outlier(s).")) + + transformer = DifferenceBasedOutlierTransformer( + method="neither_z_score_nor_tukey") + with pytest.raises(NotImplementedError, match="is an invalid 'method'"): + transformer.fit(data) + + transformer = DifferenceBasedOutlierTransformer( + method="z_score", + score_type="neither_difference_nor_ratio", + params=dict( + agg_func=np.nanmean, + lag_orders=[-1, 1], + z_cutoff=3.0, + max_outlier_percent=5.0 + ) + ) + with pytest.raises(NotImplementedError, match="is an invalid 'score_type'"): + transformer.fit(data) + + transformer = DifferenceBasedOutlierTransformer( + method="z_score", + score_type="difference", + params=dict( + agg_func=np.nanmean, + lag_orders=[-1, 1], + z_cutoff=3.0, + max_outlier_percent=5.0 + ) + ) + with pytest.raises(NotFittedError, match="This instance is not fitted yet."): + transformer.transform(data) diff --git a/greykite/tests/sklearn/transform/test_pandas_feature_union.py b/greykite/tests/sklearn/transform/test_pandas_feature_union.py index fcfdd04..afb399e 100644 --- a/greykite/tests/sklearn/transform/test_pandas_feature_union.py +++ b/greykite/tests/sklearn/transform/test_pandas_feature_union.py @@ -11,6 +11,7 @@ from greykite.sklearn.estimator.null_model import DummyEstimator from greykite.sklearn.estimator.simple_silverkite_estimator import SimpleSilverkiteEstimator from greykite.sklearn.transform.column_selector import ColumnSelector +from greykite.sklearn.transform.difference_based_outlier_transformer import DifferenceBasedOutlierTransformer from greykite.sklearn.transform.null_transformer import NullTransformer from greykite.sklearn.transform.pandas_feature_union import PandasFeatureUnion from greykite.sklearn.transform.zscore_outlier_transformer import 
ZscoreOutlierTransformer @@ -42,6 +43,21 @@ def fs(): ]) +@pytest.fixture +def fs_diff(): + """feature transformation pipeline for test cases""" + return PandasFeatureUnion([ + ("date", Pipeline([ + ("select_date", ColumnSelector([TIME_COL])) # leaves time column unmodified + ])), + ("response", Pipeline([ # applies outlier and null transformation to value column + ("select_val", ColumnSelector([VALUE_COL])), + ("outlier", DifferenceBasedOutlierTransformer()), + ("null", NullTransformer()) + ])) + ]) + + def test_feature_union(X): """Tests PandasFeatureUnion on simple projection Inspired by sklearn/tests/test_pipeline.py""" @@ -68,7 +84,7 @@ def test_feature_union(X): ], axis=1)) -def test_transformer_union(X, fs): +def test_transformer_union(X, fs, fs_diff): """Tests PandasFeatureUnion on a pipeline of transformers, with custom parameters""" # sets parameters and fits model z_cutoff = 2.0 @@ -88,6 +104,24 @@ def test_transformer_union(X, fs): assert_equal(X_transformed[TIME_COL], X[TIME_COL]) assert_equal(X_transformed[VALUE_COL], X_after_null[VALUE_COL]) + # adds new test for `DifferenceBasedOutlierTransformer`. 
+ params = dict(z_cutoff=2.0) + fs_diff.set_params(response__outlier__params=params) + fs_diff.fit(X) + X_transformed = fs_diff.transform(X) + + # checks shape + assert X_transformed.shape == (X.shape[0], 2) + assert list(X_transformed.columns) == [TIME_COL, VALUE_COL] + + # checks output result + X_after_column_select = ColumnSelector([VALUE_COL]).fit_transform(X) + X_after_z_score = DifferenceBasedOutlierTransformer(params=params).fit_transform(X_after_column_select) + X_after_null = NullTransformer().fit_transform(X_after_z_score) + + assert_equal(X_transformed[TIME_COL], X[TIME_COL]) + assert_equal(X_transformed[VALUE_COL], X_after_null[VALUE_COL]) + def test_pipeline_union(X, fs): """Tests PandasFeatureUnion on a pipeline of transformers and estimator, and shows diff --git a/requirements-dev.txt b/requirements-dev.txt index c190d76..9a7c3dc 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -20,7 +20,7 @@ matplotlib==3.5.2 nbformat==5.5.0 notebook==6.5.2 numpy==1.23.2 -osqp==0.6.1 +osqp==0.6.2 overrides==7.3.1 pandas==1.5.0 patsy==0.5.2 diff --git a/setup.cfg b/setup.cfg index 2b9fb4a..051da22 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.1 +current_version = 1.0.0 commit = True tag = True @@ -24,4 +24,4 @@ max-line-length = 160 test = pytest [tool:pytest] -collect_ignore = ['setup.py'] +addopts = --ignore=setup.py diff --git a/setup.py b/setup.py index 0b6ee0d..985eb49 100644 --- a/setup.py +++ b/setup.py @@ -16,8 +16,8 @@ "holidays-ext>=0.0.7", "ipython>=7.31.1", "matplotlib>=3.4.1", - "numpy>=1.22.0", # support for Python 3.10 - "osqp>=0.6.1", + "numpy>=1.22.0, <1.25.0", # support for Python 3.10 + "osqp>=0.6.2", "overrides>=2.8.0", "pandas>=1.5.0, <2.0.0", "patsy>=0.5.1", @@ -25,7 +25,7 @@ "pmdarima>=1.8.0, <=1.8.5", "pytest>=4.6.5", "pytest-runner>=5.1", - "scipy>=1.5.4", + "scipy>=1.5.4, <1.11.0", "six>=1.15.0", "scikit-learn>=0.24.1", "statsmodels>=0.12.2", @@ -64,6 +64,6 @@ test_suite="tests", 
tests_require=test_requirements, url="https://github.com/linkedin/greykite", - version="0.5.1", + version="1.0.0", zip_safe=False, )