Skip to content

Commit

Permalink
Setup benchmarks (#64)
Browse files Browse the repository at this point in the history
* Setup benchmarks using asv

* Add asv to dev-requirements.txt

* Add info about benchmarking to a contributing guide

* Add benchmark for concatenating input dims

* Add 4-d case to benchmarks
  • Loading branch information
maxrjones authored Aug 10, 2022
1 parent 3d16162 commit 5312998
Show file tree
Hide file tree
Showing 9 changed files with 409 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ nosetests.xml
coverage.xml
*,cover

# asv environments
.asv

# Translations
*.mo
*.pot
Expand Down
1 change: 1 addition & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ repos:
- id: end-of-file-fixer
- id: check-docstring-first
- id: check-json
exclude: "asv_bench/asv.conf.json"
- id: check-yaml
- id: double-quote-string-fixer

Expand Down
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Xbatcher's contributor guidelines [can be found in the online documentation](https://xbatcher.readthedocs.io/en/latest/contributing.html).
188 changes: 188 additions & 0 deletions asv_bench/asv.conf.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,

// The name of the project being benchmarked
"project": "xbatcher",

// The project's homepage
"project_url": "https://xbatcher.readthedocs.io/",

// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "..",

// The Python project's subdirectory in your repo. If missing or
// the empty string, the project is assumed to be located at the root
// of the repository.
// "repo_subdir": "",

// Customizable commands for building, installing, and
// uninstalling the project. See asv.conf.json documentation.
//
// "install_command": ["in-dir={env_dir} python -mpip install {wheel_file}"],
// "uninstall_command": ["return-code=any python -mpip uninstall -y {project}"],
// "build_command": [
// "python setup.py build",
// "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"
// ],

// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
"branches": ["main"], // for git

// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
// ".git" (if local).
"dvcs": "git",

// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",

// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
"install_timeout": 600,

// the base URL to show a commit for the project.
// "show_commit_url": "http://github.com/pangeo-data/xbatcher/commit/",

// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
// "pythons": ["3.8"],

// The list of conda channel names to be searched for benchmark
// dependency packages in the specified order
"conda_channels": ["conda-forge"],

// A conda environment file that is used for environment creation.
// "conda_environment_file": "environment.yml",

// The matrix of dependencies to test. Each key of the "req"
// requirements dictionary is the name of a package (in PyPI) and
// the values are version numbers. An empty list or empty string
// indicates to just test against the default (latest)
// version. null indicates that the package is to not be
// installed. If the package to be tested is only available from
// PyPI, and the 'environment_type' is conda, then you can preface
// the package name by 'pip+', and the package will be installed
// via pip (with all the conda available packages installed first,
// followed by the pip installed packages).
//
// The ``@env`` and ``@env_nobuild`` keys contain the matrix of
// environment variables to pass to build and benchmark commands.
// An environment will be created for every combination of the
// cartesian product of the "@env" variables in this matrix.
// Variables in "@env_nobuild" will be passed to every environment
// during the benchmark phase, but will not trigger creation of
// new environments. A value of ``null`` means that the variable
// will not be set for the current combination.
//
// "matrix": {
// "req": {
// "numpy": ["1.6", "1.7"],
// "six": ["", null], // test with and without six installed
// "pip+emcee": [""] // emcee is only available for install with pip.
// },
// "env": {"ENV_VAR_1": ["val1", "val2"]},
// "env_nobuild": {"ENV_VAR_2": ["val3", null]},
// },
// "matrix": {
// "xarray": [""],
// "numpy": [""],
// "dask": [""],
// },

// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
//
// An exclude entry excludes entries where all values match. The
// values are regexps that should match the whole string.
//
// An include entry adds an environment. Only the packages listed
// are installed. The 'python' key is required. The exclude rules
// do not apply to includes.
//
// In addition to package names, the following keys are available:
//
// - python
// Python version, as in the *pythons* variable above.
// - environment_type
// Environment type, as above.
// - sys_platform
// Platform, as in sys.platform. Possible values for the common
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
// - req
// Required packages
// - env
// Environment variables
// - env_nobuild
// Non-build environment variables
//
// "exclude": [
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
// {"environment_type": "conda", "req": {"six": null}}, // don't run without six on conda
// {"env": {"ENV_VAR_1": "val2"}}, // skip val2 for ENV_VAR_1
// ],
//
// "include": [
// // additional env for python2.7
// {"python": "2.7", "req": {"numpy": "1.8"}, "env_nobuild": {"FOO": "123"}},
// // additional env if run on windows+conda
// {"platform": "win32", "environment_type": "conda", "python": "2.7", "req": {"libpython": ""}},
// ],

// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
"benchmark_dir": "benchmarks",

// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
"env_dir": ".asv/env",

// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": ".asv/results",

// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
"html_dir": ".asv/html"

// The number of characters to retain in the commit hashes.
// "hash_length": 8,

// `asv` will cache results of the recent builds in each
// environment, making them faster to install next time. This is
// the number of builds to keep, per environment.
// "build_cache_size": 2,

// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
// regexps matching to benchmark names, and values corresponding to
// the commit (exclusive) after which to start looking for
// regressions. The default is to start from the first commit
// with results. If the commit is `null`, regression detection is
// skipped for the matching benchmark.
//
// "regressions_first_commits": {
// "some_benchmark": "352cdf", // Consider regressions only after this commit
// "another_benchmark": null, // Skip regression detection altogether
// },

// The thresholds for relative change in results, after which `asv
// publish` starts reporting regressions. Dictionary of the same
// form as in ``regressions_first_commits``, with values
// indicating the thresholds. If multiple entries match, the
// maximum is taken. If no entry matches, the default is 5%.
//
// "regressions_thresholds": {
// "some_benchmark": 0.01, // Threshold of 1%
// "another_benchmark": 0.5, // Threshold of 50%
// },
}
12 changes: 12 additions & 0 deletions asv_bench/benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
def parameterized(names, params):
    """Attach asv parameterization metadata to a benchmark function.

    Copied from xarray benchmarks:
    https://github.com/pydata/xarray/blob/main/asv_bench/benchmarks/__init__.py#L9-L15
    """

    def wrapper(benchmark):
        # asv reads these attributes to build the parameter grid.
        benchmark.param_names = names
        benchmark.params = params
        return benchmark

    return wrapper
147 changes: 147 additions & 0 deletions asv_bench/benchmarks/benchmarks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import numpy as np
import torch
import xarray as xr

from xbatcher import BatchGenerator
from xbatcher.loaders.torch import IterableDataset, MapDataset

from . import parameterized


class Base:
    """Shared fixtures for the benchmark suites.

    ``setup`` is invoked by asv before each benchmark repeat; it builds three
    in-memory datasets used by the benchmark classes below.
    """

    def setup(self, *args, **kwargs):
        n_time, n_y, n_x = 10, 50, 100

        # 3-D dataset with dims (time, y, x)
        self.ds_3d = xr.Dataset(
            {
                'foo': (['time', 'y', 'x'], np.random.rand(n_time, n_y, n_x)),
            },
            coords={
                'x': (['x'], np.arange(n_x)),
                'y': (['y'], np.arange(n_y)),
            },
        )

        # 4-D dataset with an extra trailing dimension ``b``
        n_b = 3
        self.ds_4d = xr.Dataset(
            {
                'foo': (
                    ['time', 'y', 'x', 'b'],
                    np.random.rand(n_time, n_y, n_x, n_b),
                ),
            },
            coords={
                'x': (['x'], np.arange(n_x)),
                'y': (['y'], np.arange(n_y)),
                'b': (['b'], np.arange(n_b)),
            },
        )

        # Flat (sample, feature) dataset consumed by the torch-loader benchmarks
        self.ds_xy = xr.Dataset(
            {
                'x': (
                    ['sample', 'feature'],
                    np.random.random((n_x, n_time)),
                ),
                'y': (['sample'], np.random.random(n_x)),
            },
        )


class Generator(Base):
    """Benchmarks for constructing ``xbatcher.BatchGenerator`` objects."""

    @parameterized(['preload_batch'], ([True, False]))
    def time_batch_preload(self, preload_batch):
        """
        Time generator construction on a dask-chunked Dataset, with and
        without eager batch preloading.
        """
        chunked = self.ds_xy.chunk({'sample': 2})
        BatchGenerator(
            chunked, input_dims={'sample': 2}, preload_batch=preload_batch
        )

    @parameterized(
        ['input_dims', 'batch_dims', 'input_overlap'],
        (
            [{'x': 5}, {'x': 10}, {'x': 5, 'y': 5}, {'x': 10, 'y': 5}],
            [{}, {'x': 20}, {'x': 30}],
            [{}, {'x': 1}, {'x': 2}],
        ),
    )
    def time_batch_input(self, input_dims, batch_dims, input_overlap):
        """
        Time the plain batch-generation case over a grid of input/batch
        dimensions and overlaps.
        """
        options = {
            'input_dims': input_dims,
            'batch_dims': batch_dims,
            'input_overlap': input_overlap,
        }
        BatchGenerator(self.ds_3d, **options)

    @parameterized(
        ['input_dims', 'concat_input_dims'],
        ([{'x': 5}, {'x': 10}, {'x': 5, 'y': 5}], [True, False]),
    )
    def time_batch_concat(self, input_dims, concat_input_dims):
        """
        Time generator construction with and without concatenating the
        ``input_dims`` chunks into the batch dimension.
        """
        options = {
            'input_dims': input_dims,
            'concat_input_dims': concat_input_dims,
        }
        BatchGenerator(self.ds_3d, **options)

    @parameterized(
        ['input_dims', 'batch_dims', 'concat_input_dims'],
        (
            [{'x': 5}, {'x': 5, 'y': 5}],
            [{}, {'x': 10}, {'x': 10, 'y': 10}],
            [True, False],
        ),
    )
    def time_batch_concat_4d(self, input_dims, batch_dims, concat_input_dims):
        """
        Same as ``time_batch_concat`` but on the 4-D dataset and with
        explicit ``batch_dims``.
        """
        options = {
            'input_dims': input_dims,
            'batch_dims': batch_dims,
            'concat_input_dims': concat_input_dims,
        }
        BatchGenerator(self.ds_4d, **options)


class Accessor(Base):
    """Benchmarks for batch generation through the xarray ``.batch`` accessor."""

    @parameterized(
        ['input_dims'],
        ([{'x': 2}, {'x': 4}, {'x': 2, 'y': 2}, {'x': 4, 'y': 2}]),
    )
    def time_accessor_input_dim(self, input_dims):
        """
        Time batch generation via the ``.batch`` accessor; mirrors a subset
        of ``time_batch_input()``.
        """
        self.ds_3d.batch.generator(input_dims=input_dims)


class TorchLoader(Base):
    """Benchmarks for the torch loader adapters around ``BatchGenerator``."""

    def setup(self, *args, **kwargs):
        # Forward positional args too (the original dropped *args), so this
        # stays correct if Base.setup ever uses them.
        super().setup(*args, **kwargs)
        self.x_gen = BatchGenerator(self.ds_xy['x'], {'sample': 10})
        self.y_gen = BatchGenerator(self.ds_xy['y'], {'sample': 10})

    def time_map_dataset(self):
        """
        Benchmark MapDataset integration with torch DataLoader.
        """
        dataset = MapDataset(self.x_gen, self.y_gen)
        loader = torch.utils.data.DataLoader(dataset)
        # ``iter(loader).next()`` is Python-2 style and raises AttributeError
        # on Python 3 iterators; use the builtin next() instead.
        next(iter(loader))

    def time_iterable_dataset(self):
        """
        Benchmark IterableDataset integration with torch DataLoader.
        """
        dataset = IterableDataset(self.x_gen, self.y_gen)
        loader = torch.utils.data.DataLoader(dataset)
        next(iter(loader))
1 change: 1 addition & 0 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,5 @@ torch
coverage
pytest-cov
adlfs
asv
-r requirements.txt
Loading

0 comments on commit 5312998

Please sign in to comment.