Skip to content

Commit

Permalink
Store index of Dependencies._df as object dtype (#371)
Browse files Browse the repository at this point in the history
* Store index of Dependencies._df as object dtype

* Add memray to requirements for benchmark
  • Loading branch information
hagenw committed May 3, 2024
1 parent 799eb2d commit 7c21733
Show file tree
Hide file tree
Showing 8 changed files with 235 additions and 197 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ __init__.pyc
coverage.xml
docs/api/
benchmarks/cache/
benchmarks/results/
2 changes: 2 additions & 0 deletions audb/core/define.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ class DependField:
DependField.VERSION: "string",
}

DEPEND_INDEX_DTYPE = "object"


class DependType:
r"""Dependency file types."""
Expand Down
9 changes: 7 additions & 2 deletions audb/core/dependencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,11 @@ def load(self, path: str):
na_filter=False,
dtype=dtype_mapping,
)
self._df.index = self._df.index.astype("string")
# Set dtype of index for both CSV and PKL
# to stay backward compatible
# with old pickle files in cache
# that might use `string` as dtype
self._df.index = self._df.index.astype(define.DEPEND_INDEX_DTYPE)

def removed(
self,
Expand Down Expand Up @@ -476,6 +480,7 @@ def _add_media(
values,
columns=["file"] + list(define.DEPEND_FIELD_NAMES.values()),
).set_index("file")
df.index = df.index.astype(define.DEPEND_INDEX_DTYPE)

self._df = pd.concat([self._df, df])

Expand Down Expand Up @@ -586,7 +591,7 @@ def _update_media(
values,
columns=["file"] + list(define.DEPEND_FIELD_NAMES.values()),
).set_index("file")
df.index = df.index.astype("string")
df.index = df.index.astype(define.DEPEND_INDEX_DTYPE)
for name, dtype in zip(
define.DEPEND_FIELD_NAMES.values(),
define.DEPEND_FIELD_DTYPES.values(),
Expand Down
172 changes: 88 additions & 84 deletions benchmarks/README.md

Large diffs are not rendered by default.

8 changes: 4 additions & 4 deletions benchmarks/benchmark-dependencies-methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def astype(df, dtype):
df["sampling_rate"] = df["sampling_rate"].astype("int32")
df["type"] = df["type"].astype("int32")
df["version"] = df["version"].astype("object")
df.index = df.index.astype("object")
df.index = df.index.astype(audb.core.define.DEPEND_INDEX_DTYPE)
# Set dtypes in library
audb.core.define.DEPEND_FIELD_DTYPES = {
"archive": "object",
Expand All @@ -58,7 +58,7 @@ def astype(df, dtype):
df["sampling_rate"] = df["sampling_rate"].astype("int32")
df["type"] = df["type"].astype("int32")
df["version"] = df["version"].astype("string")
df.index = df.index.astype("string")
df.index = df.index.astype(audb.core.define.DEPEND_INDEX_DTYPE)
# Set dtypes in library
audb.core.define.DEPEND_FIELD_DTYPES = {
"archive": "string",
Expand All @@ -84,7 +84,7 @@ def astype(df, dtype):
df["sampling_rate"] = df["sampling_rate"].astype("int32[pyarrow]")
df["type"] = df["type"].astype("int32[pyarrow]")
df["version"] = df["version"].astype("string[pyarrow]")
df.index = df.index.astype("string[pyarrow]")
df.index = df.index.astype(audb.core.define.DEPEND_INDEX_DTYPE)
# Set dtypes in library
audb.core.define.DEPEND_FIELD_DTYPES = {
"archive": "string[pyarrow]",
Expand Down Expand Up @@ -137,7 +137,7 @@ def astype(df, dtype):
df[column] = df[column].astype(dtype)
df.set_index("file", inplace=True)
df.index.name = None
df.index = df.index.astype("string")
df.index = df.index.astype(audb.core.define.DEPEND_INDEX_DTYPE)
df.to_pickle(data_cache)


Expand Down
Loading

0 comments on commit 7c21733

Please sign in to comment.