Store index of Dependencies._df as object dtype (#371)

* Store index of Dependencies._df as object dtype
* Add memray to requirements for benchmark
audeering · May 3, 2024 · 7c21733 · 7c21733
1 parent 799eb2d
commit 7c21733
Show file tree

Hide file tree

Showing 8 changed files with 235 additions and 197 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@ __init__.pyc
 coverage.xml
 docs/api/
 benchmarks/cache/
+benchmarks/results/
diff --git a/audb/core/define.py b/audb/core/define.py
@@ -60,6 +60,8 @@ class DependField:
     DependField.VERSION: "string",
 }
 
+DEPEND_INDEX_DTYPE = "object"
+
 
 class DependType:
     r"""Dependency file types."""

diff --git a/audb/core/dependencies.py b/audb/core/dependencies.py
@@ -337,7 +337,11 @@ def load(self, path: str):
                 na_filter=False,
                 dtype=dtype_mapping,
             )
-            self._df.index = self._df.index.astype("string")
+        # Set dtype of index for both CSV and PKL
+        # to stay backward compatible
+        # with old pickle files in cache
+        # that might use `string` as dtype
+        self._df.index = self._df.index.astype(define.DEPEND_INDEX_DTYPE)
 
     def removed(
         self,
@@ -476,6 +480,7 @@ def _add_media(
             values,
             columns=["file"] + list(define.DEPEND_FIELD_NAMES.values()),
         ).set_index("file")
+        df.index = df.index.astype(define.DEPEND_INDEX_DTYPE)
 
         self._df = pd.concat([self._df, df])
 
@@ -586,7 +591,7 @@ def _update_media(
             values,
             columns=["file"] + list(define.DEPEND_FIELD_NAMES.values()),
         ).set_index("file")
-        df.index = df.index.astype("string")
+        df.index = df.index.astype(define.DEPEND_INDEX_DTYPE)
         for name, dtype in zip(
             define.DEPEND_FIELD_NAMES.values(),
             define.DEPEND_FIELD_DTYPES.values(),

diff --git a/benchmarks/README.md b/benchmarks/README.md
diff --git a/benchmarks/benchmark-dependencies-methods.py b/benchmarks/benchmark-dependencies-methods.py
@@ -32,7 +32,7 @@ def astype(df, dtype):
         df["sampling_rate"] = df["sampling_rate"].astype("int32")
         df["type"] = df["type"].astype("int32")
         df["version"] = df["version"].astype("object")
-        df.index = df.index.astype("object")
+        df.index = df.index.astype(audb.core.define.DEPEND_INDEX_DTYPE)
         # Set dtypes in library
         audb.core.define.DEPEND_FIELD_DTYPES = {
             "archive": "object",
@@ -58,7 +58,7 @@ def astype(df, dtype):
         df["sampling_rate"] = df["sampling_rate"].astype("int32")
         df["type"] = df["type"].astype("int32")
         df["version"] = df["version"].astype("string")
-        df.index = df.index.astype("string")
+        df.index = df.index.astype(audb.core.define.DEPEND_INDEX_DTYPE)
         # Set dtypes in library
         audb.core.define.DEPEND_FIELD_DTYPES = {
             "archive": "string",
@@ -84,7 +84,7 @@ def astype(df, dtype):
         df["sampling_rate"] = df["sampling_rate"].astype("int32[pyarrow]")
         df["type"] = df["type"].astype("int32[pyarrow]")
         df["version"] = df["version"].astype("string[pyarrow]")
-        df.index = df.index.astype("string[pyarrow]")
+        df.index = df.index.astype(audb.core.define.DEPEND_INDEX_DTYPE)
         # Set dtypes in library
         audb.core.define.DEPEND_FIELD_DTYPES = {
             "archive": "string[pyarrow]",
@@ -137,7 +137,7 @@ def astype(df, dtype):
         df[column] = df[column].astype(dtype)
     df.set_index("file", inplace=True)
     df.index.name = None
-    df.index = df.index.astype("string")
+    df.index = df.index.astype(audb.core.define.DEPEND_INDEX_DTYPE)
     df.to_pickle(data_cache)