diff --git a/.gitignore b/.gitignore
index 063a0eaa..0cf77b56 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,4 @@ venv/
 __init__.pyc
 coverage.xml
 docs/api/
+benchmarks/cache/
diff --git a/benchmarks/README.md b/benchmarks/README.md
index bebf562a..2b38d192 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -13,6 +13,14 @@ on the following machine:
 * Linux: Ubuntu 22.04
 * Python: 3.10
 
+Before running any benchmark,
+make sure to install the missing requirements with:
+
+```bash
+$ cd benchmarks/
+$ pip install -r requirements.txt
+```
+
 
 ## audb.Dependencies methods
 
@@ -23,39 +31,52 @@
 This benchmark provides insights
 into how to best represent the dependency table internally.
 
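+The benchmark compares three dtype representations
+of the dependency table.
+Roughly, the conversions look like this
+(an illustrative sketch, not the full benchmark code):
+
+```python
+df["archive"].astype("string")           # pandas string dtype
+df["archive"].astype("object")           # plain Python objects
+df["archive"].astype("string[pyarrow]")  # pyarrow-backed dtype
+```
+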
-Results for a dependency table containing 1,000,000 files
-represented by `pandas.DataFrame`
-as of commit 4bbcc07.
-
-| Method                                         | Execution time |
-| ---------------------------------------------- | -------------- |
-| `Dependency.__call__()`                        | 0.000 s        |
-| `Dependency.__contains__()`                    | 0.000 s        |
-| `Dependency.__get_item__()`                    | 0.000 s        |
-| `Dependency.__len__()`                         | 0.000 s        |
-| `Dependency.__str__()`                         | 0.006 s        |
-| `Dependency.archives`                          | 0.147 s        |
-| `Dependency.attachments`                       | 0.045 s        |
-| `Dependency.attachment_ids`                    | 0.045 s        |
-| `Dependency.files`                             | 0.185 s        |
-| `Dependency.media`                             | 0.264 s        |
-| `Dependency.removed_media`                     | 0.250 s        |
-| `Dependency.table_ids`                         | 0.053 s        |
-| `Dependency.tables`                            | 0.046 s        |
-| `Dependency.archive(1000 files)`               | 0.005 s        |
-| `Dependency.bit_depth(1000 files)`             | 0.004 s        |
-| `Dependency.channels(1000 files)`              | 0.004 s        |
-| `Dependency.checksum(1000 files)`              | 0.004 s        |
-| `Dependency.duration(1000 files)`              | 0.004 s        |
-| `Dependency.format(1000 files)`                | 0.004 s        |
-| `Dependency.removed(1000 files)`               | 0.004 s        |
-| `Dependency.sampling_rate(1000 files)`         | 0.004 s        |
-| `Dependency.type(1000 files)`                  | 0.005 s        |
-| `Dependency.version(1000 files)`               | 0.004 s        |
-| `Dependency._add_attachment()`                 | 0.061 s        |
-| `Dependency._add_media(1000 files)`            | 0.050 s        |
-| `Dependency._add_meta()`                       | 0.124 s        |
-| `Dependency._drop()`                           | 0.078 s        |
-| `Dependency._remove()`                         | 0.068 s        |
-| `Dependency._update_media()`                   | 0.073 s        |
-| `Dependency._update_media_version(1000 files)` | 0.008 s        |
+To run the benchmark, execute:
+
+```bash
+$ python benchmark-dependencies-methods.py
+```
+
+Execution times in seconds
+of `audb.Dependencies` methods
+for a dependency table
+containing 1,000,000 files
+stored as a `pandas.DataFrame`
+using different dtype representations
+(storing strings as `string`,
+storing strings as `object`,
+or using `pyarrow` dtypes)
+as of commit 91528e4.
+
+| method                                         |   string |   object |   pyarrow |
+|------------------------------------------------|----------|----------|-----------|
+| Dependencies.\_\_call__()                      |    0.000 |    0.000 |     0.000 |
+| Dependencies.\_\_contains__()                  |    0.000 |    0.000 |     0.000 |
+| Dependencies.\_\_get_item__()                  |    0.000 |    0.000 |     0.000 |
+| Dependencies.\_\_len__()                       |    0.000 |    0.000 |     0.000 |
+| Dependencies.\_\_str__()                       |    0.006 |    0.005 |     0.007 |
+| Dependencies.archives                          |    0.141 |    0.116 |     0.144 |
+| Dependencies.attachments                       |    0.029 |    0.018 |     0.017 |
+| Dependencies.attachment_ids                    |    0.029 |    0.018 |     0.017 |
+| Dependencies.files                             |    0.030 |    0.012 |     0.043 |
+| Dependencies.media                             |    0.127 |    0.072 |     0.086 |
+| Dependencies.removed_media                     |    0.117 |    0.069 |     0.081 |
+| Dependencies.table_ids                         |    0.037 |    0.026 |     0.023 |
+| Dependencies.tables                            |    0.028 |    0.017 |     0.017 |
+| Dependencies.archive(1000 files)               |    0.005 |    0.005 |     0.007 |
+| Dependencies.bit_depth(1000 files)             |    0.004 |    0.004 |     0.006 |
+| Dependencies.channels(1000 files)              |    0.004 |    0.004 |     0.006 |
+| Dependencies.checksum(1000 files)              |    0.004 |    0.004 |     0.006 |
+| Dependencies.duration(1000 files)              |    0.004 |    0.004 |     0.006 |
+| Dependencies.format(1000 files)                |    0.004 |    0.004 |     0.006 |
+| Dependencies.removed(1000 files)               |    0.004 |    0.004 |     0.006 |
+| Dependencies.sampling_rate(1000 files)         |    0.004 |    0.004 |     0.006 |
+| Dependencies.type(1000 files)                  |    0.004 |    0.004 |     0.006 |
+| Dependencies.version(1000 files)               |    0.004 |    0.004 |     0.006 |
+| Dependencies._add_attachment()                 |    0.055 |    0.056 |     0.207 |
+| Dependencies._add_media(1000 files)            |    0.049 |    0.050 |     0.060 |
+| Dependencies._add_meta()                       |    0.120 |    0.128 |     0.138 |
+| Dependencies._drop()                           |    0.075 |    0.075 |     0.117 |
+| Dependencies._remove()                         |    0.068 |    0.068 |     0.064 |
+| Dependencies._update_media()                   |    0.071 |    0.072 |     0.125 |
+| Dependencies._update_media_version(1000 files) |    0.008 |    0.008 |     0.017 |
diff --git a/benchmarks/benchmark-dependencies-methods.py b/benchmarks/benchmark-dependencies-methods.py
new file mode 100644
index 00000000..12c23da4
--- /dev/null
+++ b/benchmarks/benchmark-dependencies-methods.py
@@ -0,0 +1,371 @@
+import hashlib
+import os
+import pickle
+import random
+import string
+import time
+
+import pandas as pd
+import tabulate
+
+import audeer
+
+import audb
+
+
+random.seed(1)
+
+cache = audeer.mkdir("./cache")
+
+
+def astype(df, dtype):
+    """Convert to desired dataframe dtypes."""
+    if dtype == "object":
+        # Use `object` to represent strings
+        df["archive"] = df["archive"].astype("object")
+        df["bit_depth"] = df["bit_depth"].astype("int32")
+        df["channels"] = df["channels"].astype("int32")
+        df["checksum"] = df["checksum"].astype("object")
+        df["duration"] = df["duration"].astype("float64")
+        df["format"] = df["format"].astype("object")
+        df["removed"] = df["removed"].astype("int32")
+        df["sampling_rate"] = df["sampling_rate"].astype("int32")
+        df["type"] = df["type"].astype("int32")
+        df["version"] = df["version"].astype("object")
+        df.index = df.index.astype("object")
+        # Set dtypes in library
+        audb.core.define.DEPEND_FIELD_DTYPES = {
+            "archive": "object",
+            "bit_depth": "int32",
+            "channels": "int32",
+            "checksum": "object",
+            "duration": "float64",
+            "format": "object",
+            "removed": "int32",
+            "sampling_rate": "int32",
+            "type": "int32",
+            "version": "object",
+        }
+    elif dtype == "string":
+        # Use `string` to represent strings
+        df["archive"] = df["archive"].astype("string")
+        df["bit_depth"] = df["bit_depth"].astype("int32")
+        df["channels"] = df["channels"].astype("int32")
+        df["checksum"] = df["checksum"].astype("string")
+        df["duration"] = df["duration"].astype("float64")
+        df["format"] = df["format"].astype("string")
+        df["removed"] = df["removed"].astype("int32")
+        df["sampling_rate"] = df["sampling_rate"].astype("int32")
+        df["type"] = df["type"].astype("int32")
+        df["version"] = df["version"].astype("string")
+        df.index = df.index.astype("string")
+        # Set dtypes in library
+        audb.core.define.DEPEND_FIELD_DTYPES = {
+            "archive": "string",
+            "bit_depth": "int32",
+            "channels": "int32",
+            "checksum": "string",
+            "duration": "float64",
+            "format": "string",
+            "removed": "int32",
+            "sampling_rate": "int32",
+            "type": "int32",
+            "version": "string",
+        }
+    elif dtype == "pyarrow":
+        # Use `pyarrow` to represent all dtypes
+        df["archive"] = df["archive"].astype("string[pyarrow]")
+        df["bit_depth"] = df["bit_depth"].astype("int32[pyarrow]")
+        df["channels"] = df["channels"].astype("int32[pyarrow]")
+        df["checksum"] = df["checksum"].astype("string[pyarrow]")
+        df["duration"] = df["duration"].astype("float64[pyarrow]")
+        df["format"] = df["format"].astype("string[pyarrow]")
+        df["removed"] = df["removed"].astype("int32[pyarrow]")
+        df["sampling_rate"] = df["sampling_rate"].astype("int32[pyarrow]")
+        df["type"] = df["type"].astype("int32[pyarrow]")
+        df["version"] = df["version"].astype("string[pyarrow]")
+        df.index = df.index.astype("string[pyarrow]")
+        # Set dtypes in library
+        audb.core.define.DEPEND_FIELD_DTYPES = {
+            "archive": "string[pyarrow]",
+            "bit_depth": "int32[pyarrow]",
+            "channels": "int32[pyarrow]",
+            "checksum": "string[pyarrow]",
+            "duration": "float64[pyarrow]",
+            "format": "string[pyarrow]",
+            "removed": "int32[pyarrow]",
+            "sampling_rate": "int32[pyarrow]",
+            "type": "int32[pyarrow]",
+            "version": "string[pyarrow]",
+        }
+    return df
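+
+
+# Note: besides converting the dataframe columns,
+# astype() also patches audb.core.define.DEPEND_FIELD_DTYPES,
+# so the audb library itself operates with the matching dtypes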
df["checksum"].astype("string") + df["duration"] = df["duration"].astype("float64") + df["format"] = df["format"].astype("string") + df["removed"] = df["removed"].astype("int32") + df["sampling_rate"] = df["sampling_rate"].astype("int32") + df["type"] = df["type"].astype("int32") + df["version"] = df["version"].astype("string") + df.index = df.index.astype("string") + # Set dtypes in library + audb.core.define.DEPEND_FIELD_DTYPES = { + "archive": "string", + "bit_depth": "int32", + "channels": "int32", + "checksum": "string", + "duration": "float64", + "format": "string", + "removed": "int32", + "sampling_rate": "int32", + "type": "int32", + "version": "string", + } + elif dtype == "pyarrow": + # Use `pyarrow` to represent all dtypes + df["archive"] = df["archive"].astype("string[pyarrow]") + df["bit_depth"] = df["bit_depth"].astype("int32[pyarrow]") + df["channels"] = df["channels"].astype("int32[pyarrow]") + df["checksum"] = df["checksum"].astype("string[pyarrow]") + df["duration"] = df["duration"].astype("float64[pyarrow]") + df["format"] = df["format"].astype("string[pyarrow]") + df["removed"] = df["removed"].astype("int32[pyarrow]") + df["sampling_rate"] = df["sampling_rate"].astype("int32[pyarrow]") + df["type"] = df["type"].astype("int32[pyarrow]") + df["version"] = df["version"].astype("string[pyarrow]") + df.index = df.index.astype("string[pyarrow]") + # Set dtypes in library + audb.core.define.DEPEND_FIELD_DTYPES = { + "archive": "string[pyarrow]", + "bit_depth": "int32[pyarrow]", + "channels": "int32[pyarrow]", + "checksum": "string[pyarrow]", + "duration": "float64[pyarrow]", + "format": "string[pyarrow]", + "removed": "int32[pyarrow]", + "sampling_rate": "int32[pyarrow]", + "type": "int32[pyarrow]", + "version": "string[pyarrow]", + } + return df + + +# === Dependencies pandas.DataFrame === +data_cache = audeer.path(cache, "df.pkl") +num_rows = 1000000 +if not os.path.exists(data_cache): + bit_depths = [0, 16, 24] + channels = [0, 1, 2] + formats = ["csv", "wav", "txt"] + sampling_rates = [0, 16000, 44100] + types = [0, 1, 2] + versions = ["1.0.0", "1.1.0"] + records = [ + { + "file": f"file-{n}.wav", + "archive": f"archive-{n}", + "bit_depth": random.choices(bit_depths, weights=[0.1, 0.8, 0.1])[0], + "channels": random.choices(channels, weights=[0.1, 0.8, 0.1])[0], + "checksum": hashlib.md5( + pickle.dumps(random.choice(string.ascii_letters)) + ).hexdigest(), + "duration": 10 * random.random(), + "format": random.choices(formats, weights=[0.1, 0.8, 0.1])[0], + "removed": random.choices([0, 1], weights=[0.1, 0.9])[0], + "sampling_rate": random.choices(sampling_rates, weights=[0.1, 0.8, 0.1])[0], + "type": random.choices(types, weights=[0.1, 0.8, 0.1])[0], + "version": random.choices(versions, weights=[0.2, 0.8])[0], + } + for n in range(num_rows) + ] + df = pd.DataFrame.from_records(records) + for column, dtype in zip( + audb.core.define.DEPEND_FIELD_NAMES.values(), + audb.core.define.DEPEND_FIELD_DTYPES.values(), + ): + df[column] = df[column].astype(dtype) + df.set_index("file", inplace=True) + df.index.name = None + df.index = df.index.astype("string") + df.to_pickle(data_cache) + + +# ===== Benchmark audb.Dependencies ===== +deps = audb.Dependencies() +deps.load(data_cache) +file = "file-10.wav" +n_files = 1000 +_files = deps._df.index[:n_files].tolist() +dtypes = ["string", "object", "pyarrow"] +results = pd.DataFrame(columns=dtypes) +results.index.name = "method" + +for dtype in dtypes: + deps.load(data_cache) + deps._df = astype(deps._df, dtype) + + # Check we have the 
+
+
+# ===== Benchmark audb.Dependencies =====
+deps = audb.Dependencies()
+deps.load(data_cache)
+file = "file-10.wav"
+n_files = 1000
+_files = deps._df.index[:n_files].tolist()
+dtypes = ["string", "object", "pyarrow"]
+results = pd.DataFrame(columns=dtypes)
+results.index.name = "method"
+
+for dtype in dtypes:
+    deps.load(data_cache)
+    deps._df = astype(deps._df, dtype)
+
+    # Check we have the expected dtypes
+    # in dependency table
+    # and library
+    if dtype == "pyarrow":
+        expected_dtype = "string[pyarrow]"
+    else:
+        expected_dtype = dtype
+    assert deps._df.archive.dtype == expected_dtype
+    assert audb.core.define.DEPEND_FIELD_DTYPES["archive"] == expected_dtype
+
+    method = "Dependencies.__call__()"
+    t0 = time.time()
+    deps()
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    # Access the index one time.
+    # Further calls will be faster
+    file in deps
+
+    method = "Dependencies.__contains__()"
+    t0 = time.time()
+    file in deps
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.__get_item__()"
+    t0 = time.time()
+    deps[file]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.__len__()"
+    t0 = time.time()
+    len(deps)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.__str__()"
+    t0 = time.time()
+    str(deps)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.archives"
+    t0 = time.time()
+    deps.archives
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.attachments"
+    t0 = time.time()
+    deps.attachments
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.attachment_ids"
+    t0 = time.time()
+    deps.attachment_ids
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.files"
+    t0 = time.time()
+    deps.files
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.media"
+    t0 = time.time()
+    deps.media
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.removed_media"
+    t0 = time.time()
+    deps.removed_media
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.table_ids"
+    t0 = time.time()
+    deps.table_ids
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies.tables"
+    t0 = time.time()
+    deps.tables
+    t = time.time() - t0
+    results.at[method, dtype] = t
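+
+    # The per-file methods below are each timed
+    # over the first n_files (= 1000) files of the table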
+
+    method = f"Dependencies.archive({n_files} files)"
+    t0 = time.time()
+    [deps.archive(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.bit_depth({n_files} files)"
+    t0 = time.time()
+    [deps.bit_depth(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.channels({n_files} files)"
+    t0 = time.time()
+    [deps.channels(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.checksum({n_files} files)"
+    t0 = time.time()
+    [deps.checksum(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.duration({n_files} files)"
+    t0 = time.time()
+    [deps.duration(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.format({n_files} files)"
+    t0 = time.time()
+    [deps.format(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.removed({n_files} files)"
+    t0 = time.time()
+    [deps.removed(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.sampling_rate({n_files} files)"
+    t0 = time.time()
+    [deps.sampling_rate(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.type({n_files} files)"
+    t0 = time.time()
+    [deps.type(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies.version({n_files} files)"
+    t0 = time.time()
+    [deps.version(file) for file in _files]
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    # -------------------------------------------------------------------------
+    method = "Dependencies._add_attachment()"
+    t0 = time.time()
+    deps._add_attachment("attachment.txt", "1.0.0", "archive", "checksum")
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies._add_media({n_files} files)"
+    values = [
+        (
+            f"file-new-{n}.wav",  # file
+            f"archive-new-{n}",  # archive
+            16,  # bit_depth
+            1,  # channels
+            f"checksum-{n}",  # checksum
+            0.4,  # duration
+            "wav",  # format
+            0,  # removed
+            16000,  # sampling_rate
+            1,  # type
+            "1.0.0",  # version
+        )
+        for n in range(n_files)
+    ]
+    t0 = time.time()
+    deps._add_media(values)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies._add_meta()"
+    t0 = time.time()
+    deps._add_meta("db.new-table.csv", "1.0.0", "archive", "checksum")
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies._drop()"
+    t0 = time.time()
+    deps._drop(["file-9000.wav"])
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies._remove()"
+    t0 = time.time()
+    deps._remove(file)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = "Dependencies._update_media()"
+    t0 = time.time()
+    deps._update_media(values)
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+    method = f"Dependencies._update_media_version({n_files} files)"
+    t0 = time.time()
+    deps._update_media_version([f"file-{n}.wav" for n in range(n_files)], "version")
+    t = time.time() - t0
+    results.at[method, dtype] = t
+
+# ===== Print results =====
+table = tabulate.tabulate(results, headers="keys", tablefmt="github", floatfmt=".3f")
+print(table)
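+
+# The printed table is GitHub-flavored markdown
+# and can be pasted directly into benchmarks/README.md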
diff --git a/benchmarks/benchmark-dependency-methods.py b/benchmarks/benchmark-dependency-methods.py
deleted file mode 100644
index e593c978..00000000
--- a/benchmarks/benchmark-dependency-methods.py
+++ /dev/null
@@ -1,209 +0,0 @@
-import hashlib
-import os
-import pickle
-import random
-import string
-import time
-
-import pandas as pd
-
-import audeer
-
-import audb
-
-
-random.seed(1)
-
-cache = audeer.mkdir("./cache")
-
-
-# === Dependency pandas.DataFrame ===
-data_cache = audeer.path(cache, "df.pkl")
-num_rows = 1000000
-if not os.path.exists(data_cache):
-    bit_depths = [0, 16, 24]
-    channels = [0, 1, 2]
-    formats = ["csv", "wav", "txt"]
-    sampling_rates = [0, 16000, 44100]
-    types = [0, 1, 2]
-    versions = ["1.0.0", "1.1.0"]
-    records = [
-        {
-            "file": f"file-{n}.wav",
-            "archive": f"archive-{n}",
-            "bit_depth": random.choices(bit_depths, weights=[0.1, 0.8, 0.1])[0],
-            "channels": random.choices(channels, weights=[0.1, 0.8, 0.1])[0],
-            "checksum": hashlib.md5(
-                pickle.dumps(random.choice(string.ascii_letters))
-            ).hexdigest(),
-            "duration": 10 * random.random(),
-            "format": random.choices(formats, weights=[0.1, 0.8, 0.1])[0],
-            "removed": random.choices([0, 1], weights=[0.1, 0.9])[0],
-            "sampling_rate": random.choices(sampling_rates, weights=[0.1, 0.8, 0.1])[0],
-            "type": random.choices(types, weights=[0.1, 0.8, 0.1])[0],
-            "version": random.choices(versions, weights=[0.2, 0.8])[0],
-        }
-        for n in range(num_rows)
-    ]
-    df = pd.DataFrame.from_records(records)
-    for column, dtype in zip(
-        audb.core.define.DEPEND_FIELD_NAMES.values(),
-        audb.core.define.DEPEND_FIELD_DTYPES.values(),
-    ):
-        df[column] = df[column].astype(dtype)
-    df.set_index("file", inplace=True)
-    df.index.name = None
-    df.index = df.index.astype("string")
-    df.to_pickle(data_cache)
-
-# === Create dependency object ===
-deps = audb.Dependencies()
-deps.load(data_cache)
-
-# ===== Benchmark audb.Dependency =====
-file = "file-10.wav"
-
-t = time.time()
-deps()
-print(f"Dependency.__call__(): {time.time() -t:.3f} s")
-
-# Access the index one time.
-# Further calls will be faster
-file in deps
-
-t = time.time()
-file in deps
-print(f"Dependency.__contains__(): {time.time() -t:.3f} s")
-
-t = time.time()
-deps[file]
-print(f"Dependency.__get_item__(): {time.time() -t:.3f} s")
-
-t = time.time()
-len(deps)
-print(f"Dependency.__len__(): {time.time() -t:.3f} s")
-
-t = time.time()
-str(deps)
-print(f"Dependency.__str__(): {time.time() -t:.3f} s")
-
-t = time.time()
-deps.archives
-print(f"Dependency.archives: {time.time() -t:.3f} s")
-
-t = time.time()
-deps.attachments
-print(f"Dependency.attachments: {time.time() -t:.3f} s")
-
-t = time.time()
-deps.attachment_ids
-print(f"Dependency.attachment_ids: {time.time() -t:.3f} s")
-
-t = time.time()
-files = deps.files
-print(f"Dependency.files: {time.time() -t:.3f} s")
-
-t = time.time()
-deps.media
-print(f"Dependency.media: {time.time() -t:.3f} s")
-
-t = time.time()
-deps.removed_media
-print(f"Dependency.removed_media: {time.time() -t:.3f} s")
-
-t = time.time()
-deps.table_ids
-print(f"Dependency.table_ids: {time.time() -t:.3f} s")
-
-t = time.time()
-deps.tables
-print(f"Dependency.tables: {time.time() -t:.3f} s")
-
-n_files = 1000
-_files = files[:n_files]
-
-t = time.time()
-[deps.archive(file) for file in _files]
-print(f"Dependency.archive({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-[deps.bit_depth(file) for file in _files]
-print(f"Dependency.bit_depth({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-[deps.channels(file) for file in _files]
-print(f"Dependency.channels({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-[deps.checksum(file) for file in _files]
-print(f"Dependency.checksum({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-[deps.duration(file) for file in _files]
-print(f"Dependency.duration({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-[deps.format(file) for file in _files]
-print(f"Dependency.format({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-[deps.removed(file) for file in _files]
-print(f"Dependency.removed({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-[deps.sampling_rate(file) for file in _files]
-print(f"Dependency.sampling_rate({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-[deps.type(file) for file in _files]
-print(f"Dependency.type({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-[deps.version(file) for file in _files]
-print(f"Dependency.version({n_files} files): {time.time() -t:.3f} s")
-
-# -------------------------------------------------------------------------
-t = time.time()
-deps._add_attachment("attachment.txt", "1.0.0", "archive", "checksum")
-print(f"Dependency._add_attachment(): {time.time() -t:.3f} s")
-
-values = [
-    (
-        f"file-new-{n}.wav",  # file
-        f"archive-new-{n}",  # archive
-        16,  # bit_depth
-        1,  # channels
-        f"checksum-{n}",  # checksum
-        0.4,  # duration
-        "wav",  # format
-        0,  # removed
-        16000,  # sampling_rate
-        1,  # type
-        "1.0.0",  # version
-    )
-    for n in range(n_files)
-]
-
-t = time.time()
-deps._add_media(values)
-print(f"Dependency._add_media({n_files} files): {time.time() -t:.3f} s")
-
-t = time.time()
-deps._add_meta("db.new-table.csv", "1.0.0", "archive", "checksum") -print(f"Dependency._add_meta(): {time.time() -t:.3f} s") - -t = time.time() -deps._drop(["file-9000.wav"]) -print(f"Dependency._drop(): {time.time() -t:.3f} s") - -t = time.time() -deps._remove(file) -print(f"Dependency._remove(): {time.time() -t:.3f} s") - -t = time.time() -deps._update_media(values) -print(f"Dependency._update_media(): {time.time() -t:.3f} s") - -t = time.time() -deps._update_media_version([f"file-{n}.wav" for n in range(n_files)], "version") -print(f"Dependency._update_media_version({n_files} files): {time.time() -t:.3f} s") diff --git a/benchmarks/requirements.txt b/benchmarks/requirements.txt new file mode 100644 index 00000000..044a3b3a --- /dev/null +++ b/benchmarks/requirements.txt @@ -0,0 +1,2 @@ +pyarrow +tabulate