Skip to content

Commit

Permalink
Setup benchmarks (#64)
Browse files Browse the repository at this point in the history
* Setup benchmarks using asv

* Add asv to dev-requirements.txt

* Add info about benchmarking to a contributing guide

* Add benchmark for concatenating input dims

* Add 4-d case to benchmarks
  • Loading branch information
maxrjones authored Aug 10, 2022
1 parent 3d16162 commit 5312998
Show file tree
Hide file tree
Showing 9 changed files with 409 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ nosetests.xml
coverage.xml
*,cover

# asv environments
.asv

# Translations
*.mo
*.pot
Expand Down
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ repos:
- id: end-of-file-fixer
- id: check-docstring-first
- id: check-json
exclude: "asv_bench/asv.conf.json"
- id: check-yaml
- id: double-quote-string-fixer

Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Xbatcher's contributor guidelines [can be found in the online documentation](https://xbatcher.readthedocs.io/en/latest/contributing.html).
188 changes: 188 additions & 0 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,

// The name of the project being benchmarked
"project": "xbatcher",

// The project's homepage
"project_url": "https://xbatcher.readthedocs.io/",

// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "..",

// The Python project's subdirectory in your repo. If missing or
// the empty string, the project is assumed to be located at the root
// of the repository.
// "repo_subdir": "",

// Customizable commands for building, installing, and
// uninstalling the project. See asv.conf.json documentation.
//
// "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
// "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
// "build_command": [
// "python setup.py build",
// "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
// ],

// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
"branches": ["main"], // for git

// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
// ".git" (if local).
"dvcs": "git",

// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",

// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
"install_timeout": 600,

// the base URL to show a commit for the project.
// "show_commit_url": "http://github.com/pangeo-data/xbatcher/commit/",

// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
// "pythons": ["3.8"],

// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
"conda_channels": ["conda-forge"],

// A conda environment file that is used for environment creation.
// "conda_environment_file": "environment.yml",

// The matrix of dependencies to test. Each key of the "req"
// requirements dictionary is the name of a package (in PyPI) and
// the values are version numbers. An empty list or empty string
// indicates to just test against the default (latest)
// version. null indicates that the package is to not be
// installed. If the package to be tested is only available from
// PyPI, and the 'environment_type' is conda, then you can preface
// the package name by 'pip+', and the package will be installed
// via pip (with all the conda available packages installed first,
// followed by the pip installed packages).
//
// The ``@env`` and ``@env_nobuild`` keys contain the matrix of
// environment variables to pass to build and benchmark commands.
// An environment will be created for every combination of the
// cartesian product of the "@env" variables in this matrix.
// Variables in "@env_nobuild" will be passed to every environment
// during the benchmark phase, but will not trigger creation of
// new environments. A value of ``null`` means that the variable
// will not be set for the current combination.
//
// "matrix": {
// "req": {
// "numpy": ["1.6", "1.7"],
// "six": ["", null], // test with and without six installed
// "pip+emcee": [""] // emcee is only available for install with pip.
// },
// "env": {"ENV_VAR_1": ["val1", "val2"]},
// "env_nobuild": {"ENV_VAR_2": ["val3", null]},
// },
// "matrix": {
// "xarray": [""],
// "numpy": [""],
// "dask": [""],
// },

// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
//
// An exclude entry excludes entries where all values match. The
// values are regexps that should match the whole string.
//
// An include entry adds an environment. Only the packages listed
// are installed. The 'python' key is required. The exclude rules
// do not apply to includes.
//
// In addition to package names, the following keys are available:
//
// - python
// Python version, as in the *pythons* variable above.
// - environment_type
// Environment type, as above.
// - sys_platform
// Platform, as in sys.platform. Possible values for the common
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
// - req
// Required packages
// - env
// Environment variables
// - env_nobuild
// Non-build environment variables
//
// "exclude": [
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
// {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda
// {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1
// ],
//
// "include": [
// // additional env for python2.7
// {"python": "2.7", "req": {"numpy": "1.8"}, "env_nobuild": {"FOO": "123"}},
// // additional env if run on windows+conda
// {"platform": "win32", "environment_type": "conda", "python": "2.7", "req": {"libpython": ""}},
// ],

// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
"benchmark_dir": "benchmarks",

// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",

// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",

// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html"

// The number of characters to retain in the commit hashes.
// "hash_length": 8,

// `asv` will cache results of the recent builds in each
// environment, making them faster to install next time. This is
// the number of builds to keep, per environment.
// "build_cache_size": 2,

// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
// regexps matching to benchmark names, and values corresponding to
// the commit (exclusive) after which to start looking for
// regressions. The default is to start from the first commit
// with results. If the commit is `null`, regression detection is
// skipped for the matching benchmark.
//
// "regressions_first_commits": {
// "some_benchmark": "352cdf", // Consider regressions only after this commit
// "another_benchmark": null, // Skip regression detection altogether
// },

// The thresholds for relative change in results, after which `asv
// publish` starts reporting regressions. Dictionary of the same
// form as in ``regressions_first_commits``, with values
// indicating the thresholds. If multiple entries match, the
// maximum is taken. If no entry matches, the default is 5%.
//
// "regressions_thresholds": {
// "some_benchmark": 0.01, // Threshold of 1%
// "another_benchmark": 0.5, // Threshold of 50%
// },
}
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
def parameterized(names, params):
    """Attach asv parameterization metadata to a benchmark function.

    Copied from xarray benchmarks:
    https://github.com/pydata/xarray/blob/main/asv_bench/benchmarks/__init__.py#L9-L15
    """

    def wrapper(benchmark):
        # asv reads these attributes to build the parameter grid.
        benchmark.param_names = names
        benchmark.params = params
        return benchmark

    return wrapper
147 changes: 147 additions & 0 deletions asv_bench/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import numpy as np
import torch
import xarray as xr

from xbatcher import BatchGenerator
from xbatcher.loaders.torch import IterableDataset, MapDataset

from . import parameterized


class Base:
    """Shared fixtures for the benchmark suites.

    ``setup`` is invoked by asv before each benchmark repeat; it builds three
    in-memory datasets used by the benchmark classes below.
    """

    def setup(self, *args, **kwargs):
        n_time, n_y, n_x = 10, 50, 100

        # 3-D dataset with dims (time, y, x)
        self.ds_3d = xr.Dataset(
            {
                'foo': (['time', 'y', 'x'], np.random.rand(n_time, n_y, n_x)),
            },
            coords={
                'x': (['x'], np.arange(n_x)),
                'y': (['y'], np.arange(n_y)),
            },
        )

        # 4-D dataset with an extra trailing dimension ``b``
        n_b = 3
        self.ds_4d = xr.Dataset(
            {
                'foo': (
                    ['time', 'y', 'x', 'b'],
                    np.random.rand(n_time, n_y, n_x, n_b),
                ),
            },
            coords={
                'x': (['x'], np.arange(n_x)),
                'y': (['y'], np.arange(n_y)),
                'b': (['b'], np.arange(n_b)),
            },
        )

        # Flat (sample, feature) dataset consumed by the torch-loader benchmarks
        self.ds_xy = xr.Dataset(
            {
                'x': (
                    ['sample', 'feature'],
                    np.random.random((n_x, n_time)),
                ),
                'y': (['sample'], np.random.random(n_x)),
            },
        )


class Generator(Base):
    """Benchmarks for constructing ``xbatcher.BatchGenerator`` objects."""

    @parameterized(['preload_batch'], ([True, False]))
    def time_batch_preload(self, preload_batch):
        """
        Time generator construction on a dask-chunked Dataset, with and
        without eager batch preloading.
        """
        chunked = self.ds_xy.chunk({'sample': 2})
        BatchGenerator(
            chunked, input_dims={'sample': 2}, preload_batch=preload_batch
        )

    @parameterized(
        ['input_dims', 'batch_dims', 'input_overlap'],
        (
            [{'x': 5}, {'x': 10}, {'x': 5, 'y': 5}, {'x': 10, 'y': 5}],
            [{}, {'x': 20}, {'x': 30}],
            [{}, {'x': 1}, {'x': 2}],
        ),
    )
    def time_batch_input(self, input_dims, batch_dims, input_overlap):
        """
        Time the plain batch-generation case over a grid of input/batch
        dimensions and overlaps.
        """
        options = {
            'input_dims': input_dims,
            'batch_dims': batch_dims,
            'input_overlap': input_overlap,
        }
        BatchGenerator(self.ds_3d, **options)

    @parameterized(
        ['input_dims', 'concat_input_dims'],
        ([{'x': 5}, {'x': 10}, {'x': 5, 'y': 5}], [True, False]),
    )
    def time_batch_concat(self, input_dims, concat_input_dims):
        """
        Time generator construction with and without concatenating the
        ``input_dims`` chunks into the batch dimension.
        """
        options = {
            'input_dims': input_dims,
            'concat_input_dims': concat_input_dims,
        }
        BatchGenerator(self.ds_3d, **options)

    @parameterized(
        ['input_dims', 'batch_dims', 'concat_input_dims'],
        (
            [{'x': 5}, {'x': 5, 'y': 5}],
            [{}, {'x': 10}, {'x': 10, 'y': 10}],
            [True, False],
        ),
    )
    def time_batch_concat_4d(self, input_dims, batch_dims, concat_input_dims):
        """
        Same as ``time_batch_concat`` but on the 4-D dataset and with
        explicit ``batch_dims``.
        """
        options = {
            'input_dims': input_dims,
            'batch_dims': batch_dims,
            'concat_input_dims': concat_input_dims,
        }
        BatchGenerator(self.ds_4d, **options)


class Accessor(Base):
    """Benchmarks for batch generation through the xarray ``.batch`` accessor."""

    @parameterized(
        ['input_dims'],
        ([{'x': 2}, {'x': 4}, {'x': 2, 'y': 2}, {'x': 4, 'y': 2}]),
    )
    def time_accessor_input_dim(self, input_dims):
        """
        Time batch generation via the ``.batch`` accessor; mirrors a subset
        of ``time_batch_input()``.
        """
        self.ds_3d.batch.generator(input_dims=input_dims)


class TorchLoader(Base):
    """Benchmarks for the torch loader adapters around ``BatchGenerator``."""

    def setup(self, *args, **kwargs):
        # Forward positional args too (the original dropped *args), so this
        # stays correct if Base.setup ever uses them.
        super().setup(*args, **kwargs)
        self.x_gen = BatchGenerator(self.ds_xy['x'], {'sample': 10})
        self.y_gen = BatchGenerator(self.ds_xy['y'], {'sample': 10})

    def time_map_dataset(self):
        """
        Benchmark MapDataset integration with torch DataLoader.
        """
        dataset = MapDataset(self.x_gen, self.y_gen)
        loader = torch.utils.data.DataLoader(dataset)
        # ``iter(loader).next()`` is Python-2 style and raises AttributeError
        # on Python 3 iterators; use the builtin next() instead.
        next(iter(loader))

    def time_iterable_dataset(self):
        """
        Benchmark IterableDataset integration with torch DataLoader.
        """
        dataset = IterableDataset(self.x_gen, self.y_gen)
        loader = torch.utils.data.DataLoader(dataset)
        next(iter(loader))
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ torch
coverage
pytest-cov
adlfs
asv
-r requirements.txt
Loading

0 comments on commit 5312998

Please sign in to comment.