Skip to content

Commit

Permalink
Speed up accessing single Dependencies entries (#368)
Browse files Browse the repository at this point in the history
* Speed up accessing single Dependencies entries

* Update benchmark results

* Don't mention the exact commit
  • Loading branch information
hagenw authored Feb 12, 2024
1 parent 38822b5 commit 52c631b
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 41 deletions.
30 changes: 21 additions & 9 deletions audb/core/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ def bit_depth(self, file: str) -> int:
bit depth
"""
return int(self._df.bit_depth[file])
return self._column_loc("bit_depth", file, int)

def channels(self, file: str) -> int:
r"""Number of channels of media file.
Expand All @@ -222,7 +222,7 @@ def channels(self, file: str) -> int:
number of channels
"""
return int(self._df.channels[file])
return self._column_loc("channels", file, int)

def checksum(self, file: str) -> str:
r"""Checksum of file.
Expand All @@ -234,7 +234,7 @@ def checksum(self, file: str) -> str:
checksum of file
"""
return self._df.checksum[file]
return self._column_loc("checksum", file)

def duration(self, file: str) -> float:
r"""Duration of file.
Expand All @@ -246,7 +246,7 @@ def duration(self, file: str) -> float:
duration in seconds
"""
return float(self._df.duration[file])
return self._column_loc("duration", file, float)

def format(self, file: str) -> str:
r"""Format of file.
Expand All @@ -258,7 +258,7 @@ def format(self, file: str) -> str:
file format (always lower case)
"""
return self._df.format[file]
return self._column_loc("format", file)

def load(self, path: str):
r"""Read dependencies from file.
Expand Down Expand Up @@ -319,7 +319,7 @@ def removed(self, file: str) -> bool:
``True`` if file was removed
"""
return bool(self._df.removed[file])
return self._column_loc("removed", file, bool)

def sampling_rate(self, file: str) -> int:
r"""Sampling rate of media file.
Expand All @@ -331,7 +331,7 @@ def sampling_rate(self, file: str) -> int:
sampling rate in Hz
"""
return int(self._df.sampling_rate[file])
return self._column_loc("sampling_rate", file, int)

def save(self, path: str):
r"""Write dependencies to file.
Expand Down Expand Up @@ -360,7 +360,7 @@ def type(self, file: str) -> int:
type
"""
return int(self._df.type[file])
return self._column_loc("type", file, int)

def version(self, file: str) -> str:
r"""Version of file.
Expand All @@ -372,7 +372,7 @@ def version(self, file: str) -> str:
version string
"""
return self._df.version[file]
return self._column_loc("version", file)

def _add_attachment(
self,
Expand Down Expand Up @@ -468,6 +468,18 @@ def _add_meta(
version, # version
]

def _column_loc(
self,
column: str,
files: typing.Union[str, typing.Sequence[str]],
dtype: typing.Callable = None,
) -> typing.Union[typing.Any, typing.List[typing.Any]]:
r"""Column content for selected files."""
value = self._df.at[files, column]
if dtype is not None:
value = dtype(value)
return value

def _drop(self, files: typing.Sequence[str]):
r"""Drop files from table.
Expand Down
63 changes: 31 additions & 32 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,41 +45,40 @@ stored as a `pandas.DataFrame`
using different dtype representations
(storing string as `string`,
storing string as `object`,
using `pyarrow` dtypes)
as of commit 91528e4.
using `pyarrow` dtypes).

| method | string | object | pyarrow |
|-------------------------------------------------|----------|----------|-----------|
| Dependencies.__call__() | 0.000 | 0.000 | 0.000 |
| Dependencies.__contains__(10000 files) | 0.005 | 0.005 | 0.004 |
| Dependencies.__get_item__(10000 files) | 0.311 | 0.223 | 0.907 |
| Dependencies.__len__() | 0.000 | 0.000 | 0.000 |
| Dependencies.__str__() | 0.006 | 0.005 | 0.006 |
| Dependencies.archives | 0.145 | 0.112 | 0.144 |
| Dependencies.attachments | 0.029 | 0.018 | 0.017 |
| Dependencies.attachment_ids | 0.028 | 0.018 | 0.016 |
| Dependencies.files | 0.031 | 0.011 | 0.042 |
| Dependencies.media | 0.132 | 0.072 | 0.088 |
| Dependencies.removed_media | 0.118 | 0.063 | 0.081 |
| Dependencies.table_ids | 0.035 | 0.025 | 0.022 |
| Dependencies.tables | 0.028 | 0.017 | 0.016 |
| Dependencies.archive(10000 files) | 0.046 | 0.043 | 0.064 |
| Dependencies.bit_depth(10000 files) | 0.042 | 0.042 | 0.060 |
| Dependencies.channels(10000 files) | 0.041 | 0.042 | 0.060 |
| Dependencies.checksum(10000 files) | 0.043 | 0.041 | 0.064 |
| Dependencies.duration(10000 files) | 0.042 | 0.042 | 0.059 |
| Dependencies.format(10000 files) | 0.044 | 0.042 | 0.064 |
| Dependencies.removed(10000 files) | 0.041 | 0.042 | 0.059 |
| Dependencies.sampling_rate(10000 files) | 0.043 | 0.043 | 0.061 |
| Dependencies.type(10000 files) | 0.043 | 0.042 | 0.060 |
| Dependencies.version(10000 files) | 0.044 | 0.041 | 0.066 |
| Dependencies._add_attachment() | 0.068 | 0.057 | 0.222 |
| Dependencies._add_media(10000 files) | 0.057 | 0.057 | 0.068 |
| Dependencies._add_meta() | 0.121 | 0.138 | 0.148 |
| Dependencies._drop() | 0.077 | 0.076 | 0.117 |
| Dependencies._remove() | 0.061 | 0.065 | 0.066 |
| Dependencies._update_media() | 0.087 | 0.087 | 0.149 |
| Dependencies._update_media_version(10000 files) | 0.011 | 0.011 | 0.026 |
| Dependencies.\_\_call__() | 0.000 | 0.000 | 0.000 |
| Dependencies.\_\_contains__(10000 files) | 0.005 | 0.004 | 0.004 |
| Dependencies.\_\_getitem__(10000 files)         | 0.322    | 0.224    | 0.900     |
| Dependencies.\_\_len__() | 0.000 | 0.000 | 0.000 |
| Dependencies.\_\_str__() | 0.006 | 0.005 | 0.006 |
| Dependencies.archives | 0.144 | 0.116 | 0.152 |
| Dependencies.attachments | 0.030 | 0.018 | 0.018 |
| Dependencies.attachment_ids | 0.029 | 0.018 | 0.018 |
| Dependencies.files | 0.030 | 0.011 | 0.046 |
| Dependencies.media | 0.129 | 0.073 | 0.095 |
| Dependencies.removed_media | 0.117 | 0.070 | 0.087 |
| Dependencies.table_ids | 0.037 | 0.026 | 0.023 |
| Dependencies.tables | 0.029 | 0.017 | 0.017 |
| Dependencies.archive(10000 files) | 0.045 | 0.042 | 0.065 |
| Dependencies.bit_depth(10000 files) | 0.024 | 0.024 | 0.045 |
| Dependencies.channels(10000 files) | 0.023 | 0.023 | 0.045 |
| Dependencies.checksum(10000 files) | 0.026 | 0.023 | 0.047 |
| Dependencies.duration(10000 files) | 0.023 | 0.023 | 0.043 |
| Dependencies.format(10000 files) | 0.026 | 0.023 | 0.047 |
| Dependencies.removed(10000 files) | 0.023 | 0.023 | 0.043 |
| Dependencies.sampling_rate(10000 files) | 0.023 | 0.023 | 0.043 |
| Dependencies.type(10000 files) | 0.023 | 0.023 | 0.043 |
| Dependencies.version(10000 files) | 0.026 | 0.023 | 0.047 |
| Dependencies._add_attachment() | 0.055 | 0.062 | 0.220 |
| Dependencies._add_media(10000 files) | 0.057 | 0.057 | 0.066 |
| Dependencies._add_meta() | 0.117 | 0.129 | 0.145 |
| Dependencies._drop() | 0.075 | 0.078 | 0.121 |
| Dependencies._remove() | 0.061 | 0.069 | 0.064 |
| Dependencies._update_media() | 0.087 | 0.086 | 0.145 |
| Dependencies._update_media_version(10000 files) | 0.011 | 0.011 | 0.020 |


## audb.Dependencies loading/writing to file
Expand Down

0 comments on commit 52c631b

Please sign in to comment.