ENH: Add incremental algorithms support #160

Open · wants to merge 12 commits into base: main
65 changes: 65 additions & 0 deletions configs/sklearnex_incremental_example.json
@@ -0,0 +1,65 @@
{
    "PARAMETERS_SETS": {
        "common": {"bench": {"n_runs": 10, "time_limit": 60}},
        "unlabeled dataset": {
            "data": [
                {
                    "source": "make_blobs",
                    "generation_kwargs": {
                        "centers": 1,
                        "n_samples": 1000,
                        "n_features": [16, 64]
                    },
                    "split_kwargs": {"ignore": true}
                }
            ]
        },
        "labeled dataset": {
            "data": {
                "source": "make_regression",
                "split_kwargs": {"train_size": 0.2, "test_size": 0.8},
                "generation_kwargs": {
                    "n_samples": 5000,
                    "n_features": [40, 100],
                    "n_informative": 5,
                    "noise": 2.0
                }
            }
        },
        "covariance": {
            "algorithm": [
                {
                    "estimator": "IncrementalEmpiricalCovariance",
                    "library": "sklearnex.covariance",
                    "estimator_methods": {"training": "partial_fit"},
                    "num_batches": {"training": 2}
                }
            ]
        },
        "linear_regression": {
            "algorithm": [
                {
                    "estimator": "IncrementalLinearRegression",
                    "library": "sklearnex",
                    "num_batches": {"training": 2}
                }
            ]
        },
        "pca": {
            "algorithm": [
                {
                    "estimator": "IncrementalPCA",
                    "library": "sklearnex.preview",
                    "num_batches": {"training": 2}
                }
            ]
        }
    },
    "TEMPLATES": {
        "covariance": {"SETS": ["common", "covariance", "unlabeled dataset"]},
        "linear_regression": {
            "SETS": ["common", "linear_regression", "labeled dataset"]
        },
        "pca": {"SETS": ["common", "pca", "unlabeled dataset"]}
    }
}
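As the review thread below notes, batching can be pinned either by count or by size: the benchmark reads `batch_size` through the same `algorithm:batch_size:{stage}` lookup it uses for `num_batches`. A sketch of a parameter set fixing the batch size instead; the set name `covariance by batch size` is illustrative, not part of the PR:

```json
{
    "covariance by batch size": {
        "algorithm": [
            {
                "estimator": "IncrementalEmpiricalCovariance",
                "library": "sklearnex.covariance",
                "estimator_methods": {"training": "partial_fit"},
                "batch_size": {"training": 500}
            }
        ]
    }
}
```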
57 changes: 41 additions & 16 deletions sklbench/benchmarks/sklearn_estimator.py
@@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str):
def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]:
# default estimator methods
estimator_methods = {
"training": ["fit"],
"training": ["partial_fit", "fit"],
"inference": ["predict", "predict_proba", "transform"],
}
for stage in estimator_methods.keys():
@@ -334,34 +334,43 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
return acceleration_lines > 0 and fallback_lines == 0


def create_online_function(method_instance, data_args, batch_size):
n_batches = data_args[0].shape[0] // batch_size
def create_online_function(
estimator_instance, method_instance, data_args, num_batches, batch_size
):

if "y" in list(inspect.signature(method_instance).parameters):

def ndarray_function(x, y):
for i in range(n_batches):
for i in range(num_batches):
Comment on lines +337 to +344

Contributor: Leave the old simple logic with batch_size only.

Author: Why?

Contributor: Why change? It overcomplicates data slicing with extra parameter checks and calculations; also, it is more common to know the batch size before a partial_fit call in real-world cases.

Author:

> Why change?

It adds a new feature which can be useful.

> It overcomplicates data slicing with extra parameter checks and calculations

It costs nothing, and doing the calculations in the code is better than doing them in a calculator before running benchmarks.

> it is more common to know batch size before partial_fit call in real world cases.

But when benchmarking it is no less common (I'd say even more so) for the user to want to specify the exact number of partial_fit calls.

method_instance(
x[i * batch_size : (i + 1) * batch_size],
y[i * batch_size : (i + 1) * batch_size],
)
if hasattr(estimator_instance, "_onedal_finalize_fit"):
estimator_instance._onedal_finalize_fit()

def dataframe_function(x, y):
for i in range(n_batches):
for i in range(num_batches):
method_instance(
x.iloc[i * batch_size : (i + 1) * batch_size],
y.iloc[i * batch_size : (i + 1) * batch_size],
)
if hasattr(estimator_instance, "_onedal_finalize_fit"):
estimator_instance._onedal_finalize_fit()

else:

def ndarray_function(x):
for i in range(n_batches):
for i in range(num_batches):
method_instance(x[i * batch_size : (i + 1) * batch_size])
if hasattr(estimator_instance, "_onedal_finalize_fit"):
estimator_instance._onedal_finalize_fit()

def dataframe_function(x):
for i in range(n_batches):
for i in range(num_batches):
method_instance(x.iloc[i * batch_size : (i + 1) * batch_size])
if hasattr(estimator_instance, "_onedal_finalize_fit"):
estimator_instance._onedal_finalize_fit()

if "ndarray" in str(type(data_args[0])):
return ndarray_function
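For intuition, a standalone sketch of what the generated closure does, using stock scikit-learn's `IncrementalPCA` (the `_onedal_finalize_fit` hook is sklearnex-specific, so it is only noted in a comment):

```python
import numpy as np
from sklearn.decomposition import IncrementalPCA

# Stand-in for the closure built by create_online_function: slice the data
# into num_batches chunks and feed each chunk to partial_fit.
x = np.random.rand(1000, 16)
num_batches = 2
batch_size = (x.shape[0] + num_batches - 1) // num_batches  # ceiling division

estimator = IncrementalPCA(n_components=8)
for i in range(num_batches):
    estimator.partial_fit(x[i * batch_size : (i + 1) * batch_size])

# sklearnex estimators may defer finalization until an attribute is read;
# the wrapper calls _onedal_finalize_fit() when present so the timed region
# covers the complete fit.
print(estimator.components_.shape)  # (8, 16)
```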
@@ -414,12 +423,32 @@ def measure_sklearn_estimator(
data_args = (x_train,)
else:
data_args = (x_test,)
batch_size = get_bench_case_value(
bench_case, f"algorithm:batch_size:{stage}"
)
if batch_size is not None:

if method == "partial_fit":
num_batches = get_bench_case_value(
bench_case, f"algorithm:num_batches:{stage}"
)
batch_size = get_bench_case_value(
bench_case, f"algorithm:batch_size:{stage}"
)

if batch_size is None:
if num_batches is None:
num_batches = 5
batch_size = (
data_args[0].shape[0] + num_batches - 1
) // num_batches
if num_batches is None:
num_batches = (
data_args[0].shape[0] + batch_size - 1
) // batch_size

method_instance = create_online_function(
method_instance, data_args, batch_size
estimator_instance,
method_instance,
data_args,
num_batches,
batch_size,
)
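The resolution logic accepts either knob and derives the other with ceiling division, defaulting to five batches when neither is set. A hypothetical helper mirroring it, to make the arithmetic concrete:

```python
def resolve_batching(n_samples, num_batches=None, batch_size=None):
    # Hypothetical mirror of the resolution above, for illustration only.
    if batch_size is None:
        if num_batches is None:
            num_batches = 5  # default used by the benchmark
        batch_size = (n_samples + num_batches - 1) // num_batches  # ceil
    if num_batches is None:
        num_batches = (n_samples + batch_size - 1) // batch_size  # ceil
    return num_batches, batch_size

print(resolve_batching(5000, num_batches=2))   # (2, 2500)
print(resolve_batching(1000, batch_size=300))  # (4, 300): 300+300+300+100
```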
# daal4py model builders enabling branch
if enable_modelbuilders and stage == "inference":
@@ -436,10 +465,6 @@
metrics[method]["time std[ms]"],
_,
) = measure_case(bench_case, method_instance, *data_args)
if batch_size is not None:
metrics[method]["throughput[samples/ms]"] = (
(data_args[0].shape[0] // batch_size) * batch_size
) / metrics[method]["time[ms]"]
if ensure_sklearnex_patching:
full_method_name = f"{estimator_class.__name__}.{method}"
sklearnex_logging_stream.seek(0)
10 changes: 7 additions & 3 deletions sklbench/report/implementation.py
@@ -16,7 +16,7 @@

import argparse
import json
from typing import Dict, List
from typing import Dict, Hashable, List

import openpyxl as xl
import pandas as pd
@@ -239,6 +239,7 @@ def get_result_tables_as_df(
bench_cases = pd.DataFrame(
[flatten_dict(bench_case) for bench_case in results["bench_cases"]]
)
bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x)
Contributor: What is the non-hashable object you are trying to convert?

Author: The basic statistics `result_options` parameter is a list.


if compatibility_mode:
bench_cases = transform_results_to_compatible(bench_cases)
@@ -248,7 +249,7 @@
bench_cases.drop(columns=[column], inplace=True)
diffby_columns.remove(column)

return split_df_by_columns(bench_cases, splitby_columns)
return split_df_by_columns(bench_cases, splitby_columns, False)
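The `map` call above exists because flattened bench cases can carry list-valued cells (the basic statistics `result_options`), and pandas operations used downstream, such as `drop_duplicates`, require hashable cell values. A minimal repro, assuming pandas >= 2.1 for elementwise `DataFrame.map` (formerly `applymap`):

```python
from typing import Hashable

import pandas as pd

df = pd.DataFrame({"result_options": [["mean", "std"], ["mean", "std"]]})
# df.drop_duplicates() here raises TypeError: unhashable type: 'list'

# Stringify only the unhashable cells, as in the diff above.
df = df.map(lambda x: str(x) if not isinstance(x, Hashable) else x)
print(df.drop_duplicates())  # one row: "['mean', 'std']"
```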


def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
@@ -258,7 +259,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
# only relative improvements are included in summary currently
if len(column) > 1 and column[1] == f"{metric_name} relative improvement":
metric_columns.append(column)
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
if metric_columns:
summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
else:
summary = pd.DataFrame()
summary.index = pd.Index([df_name])
return summary
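The guard covers reports with no relative-improvement columns at all (for example, a run against a single library), where the old unconditional aggregate had nothing to work on. The fallback relies on a pandas detail: a frame with zero columns accepts an index of a different length, so the empty summary can still be labeled with the table name. A minimal sketch; `"pca"` stands in for `df_name`:

```python
import pandas as pd

metric_columns = []  # no "<metric> relative improvement" columns matched
summary = pd.DataFrame()
# Legal despite the length change: pandas permits re-indexing a frame
# that has zero columns, so the summary keeps its table name.
summary.index = pd.Index(["pca"])
print(summary)  # Empty DataFrame, Columns: [], Index: [pca]
```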

5 changes: 5 additions & 0 deletions test-configuration-linux.yml
@@ -45,6 +45,11 @@ steps:
conda activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
displayName: Sklearn example run
- script: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json
displayName: Incremental algorithms example run
- script: |
source /usr/share/miniconda/etc/profile.d/conda.sh
conda activate bench-env
Expand Down
4 changes: 4 additions & 0 deletions test-configuration-win.yml
@@ -43,6 +43,10 @@ steps:
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
displayName: Sklearn example run
- script: |
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/sklearnex_incremental_example.json
displayName: Incremental algorithms example run
- script: |
call activate bench-env
python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json