Skip to content

Commit

Permalink
Add benchmark for audb.Dependency methods
Browse files Browse the repository at this point in the history
  • Loading branch information
hagenw committed Feb 6, 2024
1 parent 4bbcc07 commit 7e84dbb
Show file tree
Hide file tree
Showing 2 changed files with 301 additions and 0 deletions.
61 changes: 61 additions & 0 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Benchmarks

This folder contains benchmarks
used to evaluate design decisions
in `audb`.

The reported results were calculated
on the following machine:

* CPU: 13th Gen Intel Core i7-1355U, 10-core (2-mt/8-st)
* RAM: 15.29 GiB
* Hard drive: KIOXIA KXG8AZNV1T02
* Linux: Ubuntu 22.04
* Python: 3.10


## audb.Dependencies methods

Benchmarks all methods of `audb.Dependencies`
besides `audb.Dependencies.load()`
and `audb.Dependencies.save()`.
This benchmark provides insights
into how best to represent
the dependency table internally.

Results based on commit 4bbcc07,
using `pandas.DataFrame`
to represent the dependency table.

| Method | Execution time |
| ---------------------------------------------- | -------------- |
| `Dependency.__call__()` | 0.000 s |
| `Dependency.__contains__()` | 0.000 s |
| `Dependency.__getitem__()` | 0.000 s |
| `Dependency.__len__()` | 0.000 s |
| `Dependency.__str__()` | 0.015 s |
| `Dependency.archives` | 0.121 s |
| `Dependency.attachments` | 0.021 s |
| `Dependency.attachment_ids` | 0.022 s |
| `Dependency.files` | 0.028 s |
| `Dependency.media` | 0.098 s |
| `Dependency.removed_media` | 0.094 s |
| `Dependency.table_ids` | 0.031 s |
| `Dependency.tables` | 0.021 s |
| `Dependency.archive(1000 files)` | 0.005 s |
| `Dependency.bit_depth(1000 files)` | 0.005 s |
| `Dependency.channels(1000 files)` | 0.004 s |
| `Dependency.checksum(1000 files)` | 0.004 s |
| `Dependency.duration(1000 files)` | 0.004 s |
| `Dependency.format(1000 files)` | 0.004 s |
| `Dependency.removed(1000 files)` | 0.004 s |
| `Dependency.sampling_rate(1000 files)` | 0.004 s |
| `Dependency.type(1000 files)` | 0.005 s |
| `Dependency.version(1000 files)` | 0.004 s |
| `Dependency._add_attachment()` | 0.075 s |
| `Dependency._add_media(1000 files)` | 0.066 s |
| `Dependency._add_meta()` | 0.118 s |
| `Dependency._drop()` | 0.094 s |
| `Dependency._remove()` | 0.072 s |
| `Dependency._update_media()` | 0.085 s |
| `Dependency._update_media_version(1000 files)` | 0.008 s |
240 changes: 240 additions & 0 deletions benchmarks/benchmark-dependency-methods.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
import hashlib
import os
import pickle
import random
import string
import time

import pandas as pd

import audeer

import audb


# Fix the seed of the random generator,
# so the randomly generated dependency table entries are reproducible
random.seed(1)

# Folder in which the generated dependency table is stored (see `data_cache`)
cache = audeer.mkdir("./cache")


def active_branch():
    """Return the name of the branch referenced in the git ``HEAD`` file.

    The file is located with ``audeer.path("..", ".git", "HEAD")``.
    The first line starting with ``ref:`` is inspected
    and everything after ``refs/heads/`` is returned.

    Returns:
        branch name,
        or ``None`` if no line starts with ``ref:``

    """
    head_file = audeer.path("..", ".git", "HEAD")
    with open(head_file, "r") as fp:
        entries = fp.read().splitlines()
    branches = (
        entry.partition("refs/heads/")[2]
        for entry in entries
        if entry.startswith("ref:")
    )
    return next(branches, None)


# === Dependency pandas.DataFrame ===
# The table is only generated
# if the cached pickle file does not exist yet
data_cache = audeer.path(cache, "df.pkl")
num_rows = 1_000_000
dtypes = [str, str, int, int, str, float, str, int, int, int, str]
columns = [
    "file",
    "archive",
    "bit_depth",
    "channels",
    "checksum",
    "duration",
    "format",
    "removed",
    "sampling_rate",
    "type",
    "version",
]


def _weighted_choice(options, weights):
    """Draw a single entry from ``options`` according to ``weights``."""
    return random.choices(options, weights=weights)[0]


def _random_record(index):
    """Create one random row of the dependency table.

    The entries are evaluated in the order of the dictionary keys,
    which fixes the order
    in which random numbers are consumed.

    """
    return {
        "file": f"file-{index}.wav",
        "archive": f"archive-{index}",
        "bit_depth": _weighted_choice([0, 16, 24], [0.1, 0.8, 0.1]),
        "channels": _weighted_choice([0, 1, 2], [0.1, 0.8, 0.1]),
        "checksum": hashlib.md5(
            pickle.dumps(random.choice(string.ascii_letters))
        ).hexdigest(),
        "duration": 10 * random.random(),
        "format": _weighted_choice(["csv", "wav", "txt"], [0.1, 0.8, 0.1]),
        "removed": _weighted_choice([0, 1], [0.1, 0.9]),
        "sampling_rate": _weighted_choice([0, 16000, 44100], [0.1, 0.8, 0.1]),
        "type": _weighted_choice([0, 1, 2], [0.1, 0.8, 0.1]),
        "version": _weighted_choice(["1.0.0", "1.1.0"], [0.2, 0.8]),
    }


if not os.path.exists(data_cache):
    records = [_random_record(n) for n in range(num_rows)]
    df = pd.DataFrame.from_records(records)
    # Enforce the expected data type for every column
    for name, dtype in zip(df.columns, dtypes):
        df[name] = df[name].astype(dtype)
    # Use the file names as index, stored under an empty index name
    df = df.set_index("file")
    df.index.name = ""
    df.to_pickle(data_cache)

# === Create dependency object ===
deps = audb.Dependencies()
deps.load(data_cache)

# ===== Benchmark audb.Dependency =====
# Durations are measured with time.perf_counter(),
# which is monotonic
# and uses the clock with the highest available resolution.
# time.time() returns wall-clock time instead,
# which can jump when the system clock is adjusted.
file = "file-10.wav"

t = time.perf_counter()
deps()
print(f"Dependency.__call__(): {time.perf_counter() - t:.3f} s")

# Access the index one time.
# Further calls will be faster
file in deps

t = time.perf_counter()
file in deps
print(f"Dependency.__contains__(): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps[file]
print(f"Dependency.__getitem__(): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
len(deps)
print(f"Dependency.__len__(): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
str(deps)
print(f"Dependency.__str__(): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps.archives
print(f"Dependency.archives: {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps.attachments
print(f"Dependency.attachments: {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps.attachment_ids
print(f"Dependency.attachment_ids: {time.perf_counter() - t:.3f} s")

# The returned files are stored,
# as they are needed to benchmark the methods
# that expect a file as argument
t = time.perf_counter()
files = deps.files
print(f"Dependency.files: {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps.media
print(f"Dependency.media: {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps.removed_media
print(f"Dependency.removed_media: {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps.table_ids
print(f"Dependency.table_ids: {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps.tables
print(f"Dependency.tables: {time.perf_counter() - t:.3f} s")

# Methods expecting a single file as argument
# are called once for each of the first `n_files` entries of `files`.
# Durations are measured with time.perf_counter(),
# which is monotonic
# and uses the clock with the highest available resolution.
n_files = 1000
_files = files[:n_files]

t = time.perf_counter()
[deps.archive(file) for file in _files]
print(f"Dependency.archive({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
[deps.bit_depth(file) for file in _files]
print(f"Dependency.bit_depth({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
[deps.channels(file) for file in _files]
print(f"Dependency.channels({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
[deps.checksum(file) for file in _files]
print(f"Dependency.checksum({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
[deps.duration(file) for file in _files]
print(f"Dependency.duration({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
[deps.format(file) for file in _files]
print(f"Dependency.format({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
[deps.removed(file) for file in _files]
print(f"Dependency.removed({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
[deps.sampling_rate(file) for file in _files]
print(f"Dependency.sampling_rate({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
[deps.type(file) for file in _files]
print(f"Dependency.type({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
[deps.version(file) for file in _files]
print(f"Dependency.version({n_files} files): {time.perf_counter() - t:.3f} s")

# -------------------------------------------------------------------------
# Benchmark the private methods of the dependency object.
# Durations are measured with time.perf_counter(),
# which is monotonic
# and uses the clock with the highest available resolution.
t = time.perf_counter()
deps._add_attachment("attachment.txt", "1.0.0", "archive", "checksum")
print(f"Dependency._add_attachment(): {time.perf_counter() - t:.3f} s")

# The media entries are built with a different tuple layout
# on the `deps-parquet` branch:
# there, `format`, `removed`, and `type` are left out
if active_branch() == "deps-parquet":
    values = [
        (
            f"file-new-{n}.wav",  # file
            f"archive-new-{n}",  # archive
            16,  # bit_depth
            1,  # channels
            f"checksum-{n}",  # checksum
            0.4,  # duration
            16000,  # sampling_rate
            "1.0.0",  # version
        )
        for n in range(n_files)
    ]
else:
    values = [
        (
            f"file-new-{n}.wav",  # file
            f"archive-new-{n}",  # archive
            16,  # bit_depth
            1,  # channels
            f"checksum-{n}",  # checksum
            0.4,  # duration
            "wav",  # format
            0,  # removed
            16000,  # sampling_rate
            1,  # type
            "1.0.0",  # version
        )
        for n in range(n_files)
    ]

t = time.perf_counter()
deps._add_media(values)
print(f"Dependency._add_media({n_files} files): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps._add_meta("db.new-table.csv", "1.0.0", "archive", "checksum")
print(f"Dependency._add_meta(): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps._drop(["file-9000.wav"])
print(f"Dependency._drop(): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps._remove(file)
print(f"Dependency._remove(): {time.perf_counter() - t:.3f} s")

# Uses the same `values` that were added with `_add_media()` above
t = time.perf_counter()
deps._update_media(values)
print(f"Dependency._update_media(): {time.perf_counter() - t:.3f} s")

t = time.perf_counter()
deps._update_media_version([f"file-{n}.wav" for n in range(n_files)], "version")
print(
    f"Dependency._update_media_version({n_files} files): "
    f"{time.perf_counter() - t:.3f} s"
)

0 comments on commit 7e84dbb

Please sign in to comment.