From d6e9c7fba98d0e36090b4c803d3dfd821b5859f4 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Mon, 29 Apr 2024 09:05:04 +0200 Subject: [PATCH 1/6] Add support for TXT files as media files --- audb/core/publish.py | 79 ++++++++++++++++++++++++++++++++----------- tests/test_publish.py | 36 ++++++++++++++++++++ 2 files changed, 96 insertions(+), 19 deletions(-) diff --git a/audb/core/publish.py b/audb/core/publish.py index 76d70d25..48aca693 100644 --- a/audb/core/publish.py +++ b/audb/core/publish.py @@ -300,24 +300,65 @@ def _media_values( archive: str, checksum: str, ) -> typing.Tuple[str, str, int, int, str, float, str, int, float, int, str]: - r"""Return values of a media entry in dependencies.""" - format = audeer.file_extension(file).lower() + r"""Return values of a media entry in dependencies. + + The dependency table expects the following columns: + + * file + * archive + * bit depth + * channels + * checksum + * duration + * format + * removed + * sampling rate + * dependency type + * version - try: - path = os.path.join(root, file) - bit_depth = audiofile.bit_depth(path) - if bit_depth is None: # pragma: nocover (non SND files) - bit_depth = 0 - channels = audiofile.channels(path) - duration = audiofile.duration(path, sloppy=True) - sampling_rate = audiofile.sampling_rate(path) - except FileNotFoundError: # pragma: nocover - # If sox or mediafile are not installed - # we get a FileNotFoundError error - raise RuntimeError( - f"sox and mediainfo have to be installed " - f"to publish '{format}' media files." - ) + Args: + root: root of database + file: relative media file path + version: database version + archive: archive the media file is stored in + checksum: checksum of the media file + + Returns: + Tuple with a row to be added to the dependency table, + containing entries for ` + + """ + # Allow media file extensions, + # that do not support audio/video metadata + # (e.g. 
channels, sampling rate) + special_formats = ["txt"] + + dependency_type = define.DependType.MEDIA + format = audeer.file_extension(file).lower() + removed = 0 + + if format in special_formats: + bit_depth = 0 + channels = 0 + duration = 0.0 + sampling_rate = 0 + else: + # Inspect media file to get audio/video metadata + try: + path = os.path.join(root, file) + bit_depth = audiofile.bit_depth(path) + if bit_depth is None: # pragma: nocover (non SND files) + bit_depth = 0 + channels = audiofile.channels(path) + duration = audiofile.duration(path, sloppy=True) + sampling_rate = audiofile.sampling_rate(path) + except FileNotFoundError: # pragma: nocover + # If sox or mediafile are not installed + # we get a FileNotFoundError error + raise RuntimeError( + f"sox and mediainfo have to be installed " + f"to publish '{format}' media files." + ) return ( file, @@ -327,9 +368,9 @@ def _media_values( checksum, duration, format, - 0, # removed + removed, sampling_rate, - define.DependType.MEDIA, + dependency_type, version, ) diff --git a/tests/test_publish.py b/tests/test_publish.py index ea0fc2b4..50c6c166 100644 --- a/tests/test_publish.py +++ b/tests/test_publish.py @@ -1058,6 +1058,42 @@ def test_publish_error_version(tmpdir, repository): audb.publish(db_path, "2.0.0", repository, previous_version="1.0.0?") +def test_publish_text_media_files(tmpdir, dbs, repository): + r"""Test publishing databases containing text files as media files.""" + # Create a database, containing text media file + build_dir = audeer.path(tmpdir, "./build") + audeer.mkdir(build_dir) + data_dir = audeer.mkdir(build_dir, "data") + with open(audeer.path(data_dir, "file1.txt"), "w") as file: + file.write("Text written by a person.\n") + name = "text-db" + db = audformat.Database(name) + db.schemes["speaker"] = audformat.Scheme("str") + index = audformat.filewise_index(["data/file1.txt"]) + db["files"] = audformat.Table(index) + db["files"]["speaker"] = audformat.Column(scheme_id="speaker") + 
db["files"]["speaker"].set(["speaker-a"]) + db.save(build_dir) + + # Publish database, containing text media file + version = "1.0.0" + deps = audb.publish(build_dir, version, repository) + + assert deps.tables == ["db.files.csv"] + file = "data/file1.txt" + assert deps.media == [file] + assert deps.bit_depth(file) == 0 + assert deps.channels(file) == 0 + assert deps.duration(file) == 0.0 + assert deps.format(file) == "txt" + assert deps.sampling_rate(file) == 0 + + db = audb.load(name, version=version, verbose=False, full_path=False) + assert db.files == [file] + assert list(db) == ["files"] + assert os.path.exists(audeer.path(db.root, file)) + + def test_update_database(dbs, persistent_repository): version = "2.1.0" start_version = "2.0.0" From 9c635b31656fd5ad1d9446a4e1a0dfe0072dcc02 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Mon, 29 Apr 2024 09:13:49 +0200 Subject: [PATCH 2/6] Catch RuntimeError instead --- audb/core/publish.py | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/audb/core/publish.py b/audb/core/publish.py index 48aca693..0711e839 100644 --- a/audb/core/publish.py +++ b/audb/core/publish.py @@ -328,37 +328,34 @@ def _media_values( containing entries for ` """ - # Allow media file extensions, - # that do not support audio/video metadata - # (e.g. 
channels, sampling rate) - special_formats = ["txt"] - dependency_type = define.DependType.MEDIA format = audeer.file_extension(file).lower() removed = 0 - if format in special_formats: + # Inspect media file to get audio/video metadata + try: + path = os.path.join(root, file) + bit_depth = audiofile.bit_depth(path) + if bit_depth is None: # pragma: nocover (non SND files) + bit_depth = 0 + channels = audiofile.channels(path) + duration = audiofile.duration(path, sloppy=True) + sampling_rate = audiofile.sampling_rate(path) + except FileNotFoundError: # pragma: nocover + # If sox or mediafile are not installed + # we get a FileNotFoundError error + raise RuntimeError( + f"sox and mediainfo have to be installed " + f"to publish '{format}' media files." + ) + except RuntimeError: + # Skip audio/video metadata for media files, + # that don't support them + # (e.g. text files) bit_depth = 0 channels = 0 duration = 0.0 sampling_rate = 0 - else: - # Inspect media file to get audio/video metadata - try: - path = os.path.join(root, file) - bit_depth = audiofile.bit_depth(path) - if bit_depth is None: # pragma: nocover (non SND files) - bit_depth = 0 - channels = audiofile.channels(path) - duration = audiofile.duration(path, sloppy=True) - sampling_rate = audiofile.sampling_rate(path) - except FileNotFoundError: # pragma: nocover - # If sox or mediafile are not installed - # we get a FileNotFoundError error - raise RuntimeError( - f"sox and mediainfo have to be installed " - f"to publish '{format}' media files." 
- ) return ( file, From b1383e34f4b4f3c9e4d6ddbe4d184e80f977f94c Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Mon, 29 Apr 2024 09:17:48 +0200 Subject: [PATCH 3/6] Fix docstring --- audb/core/publish.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/audb/core/publish.py b/audb/core/publish.py index 0711e839..687f3a2e 100644 --- a/audb/core/publish.py +++ b/audb/core/publish.py @@ -324,8 +324,7 @@ def _media_values( checksum: checksum of the media file Returns: - Tuple with a row to be added to the dependency table, - containing entries for ` + row to be added to the dependency table as tuple """ dependency_type = define.DependType.MEDIA From 97e409e3bc4915bbe0cb0541cc6ee4ee6a9e2698 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Mon, 29 Apr 2024 09:32:23 +0200 Subject: [PATCH 4/6] Raise RuntimeError when a flavor is requested --- audb/core/load.py | 23 ++++++++++++++++------- tests/test_publish.py | 4 ++++ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/audb/core/load.py b/audb/core/load.py index 66b0e672..99e5dcfa 100644 --- a/audb/core/load.py +++ b/audb/core/load.py @@ -459,13 +459,18 @@ def job(archive: str, version: str): src_path = os.path.join(db_root_tmp, file) file = flavor.destination(file) dst_path = os.path.join(db_root_tmp, file) - flavor( - src_path, - dst_path, - src_bit_depth=bit_depth, - src_channels=channels, - src_sampling_rate=sampling_rate, - ) + try: + flavor( + src_path, + dst_path, + src_bit_depth=bit_depth, + src_channels=channels, + src_sampling_rate=sampling_rate, + ) + except RuntimeError: + raise RuntimeError( + f"Media file '{file}' does not support requesting a flavor." + ) if src_path != dst_path: os.remove(src_path) @@ -1001,6 +1006,10 @@ def load( ``format``, or ``sampling_rate`` is requested + RuntimeError: if a flavor is requested, + but the dataset contains media files, + that don't contain audio, + e.g. 
text files Examples: >>> db = audb.load( diff --git a/tests/test_publish.py b/tests/test_publish.py index 50c6c166..557b9f32 100644 --- a/tests/test_publish.py +++ b/tests/test_publish.py @@ -1093,6 +1093,10 @@ def test_publish_text_media_files(tmpdir, dbs, repository): assert list(db) == ["files"] assert os.path.exists(audeer.path(db.root, file)) + error_msg = f"Media file '{file}' does not support requesting a flavor." + with pytest.raises(RuntimeError, match=error_msg): + db = audb.load(name, version=version, channels=[0], verbose=False) + def test_update_database(dbs, persistent_repository): version = "2.1.0" From 250240b02d6321eb468f42b90e477a66d2ed3d22 Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Mon, 29 Apr 2024 11:47:50 +0200 Subject: [PATCH 5/6] Use database instead of dataset in docstring --- audb/core/load.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/audb/core/load.py b/audb/core/load.py index 99e5dcfa..72c0bf19 100644 --- a/audb/core/load.py +++ b/audb/core/load.py @@ -1007,7 +1007,7 @@ def load( or ``sampling_rate`` is requested RuntimeError: if a flavor is requested, - but the dataset contains media files, + but the database contains media files, that don't contain audio, e.g. 
text files From 509fdb03b84ee6fb518d25058ec6610ce1533ada Mon Sep 17 00:00:00 2001 From: Hagen Wierstorf Date: Mon, 29 Apr 2024 12:19:05 +0200 Subject: [PATCH 6/6] Add test for mixed text/audio database --- tests/test_publish.py | 47 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/tests/test_publish.py b/tests/test_publish.py index 557b9f32..2d6491d6 100644 --- a/tests/test_publish.py +++ b/tests/test_publish.py @@ -45,6 +45,7 @@ def dbs(tmpdir_factory): # # tables: # - emotion + # - files # misc tables: # - misc-in-scheme # - misc-not-in-scheme @@ -133,6 +134,7 @@ def dbs(tmpdir_factory): # # tables: # - emotion + # - files # misc tables: # - misc-in-scheme # - misc-not-in-scheme @@ -179,6 +181,7 @@ def dbs(tmpdir_factory): # # tables: # - emotion + # - files # misc tables: # - misc-in-scheme # - misc-not-in-scheme @@ -208,6 +211,7 @@ def dbs(tmpdir_factory): # # tables: # - emotion + # - files # misc tables: # - misc-in-scheme # - misc-not-in-scheme @@ -232,6 +236,7 @@ def dbs(tmpdir_factory): # # tables: # - emotion + # - files # misc tables: # - misc-in-scheme # - misc-not-in-scheme @@ -267,6 +272,7 @@ def dbs(tmpdir_factory): # # tables: # - emotion + # - files # misc tables: # - misc-in-scheme # - misc-not-in-scheme @@ -1072,7 +1078,7 @@ def test_publish_text_media_files(tmpdir, dbs, repository): index = audformat.filewise_index(["data/file1.txt"]) db["files"] = audformat.Table(index) db["files"]["speaker"] = audformat.Column(scheme_id="speaker") - db["files"]["speaker"].set(["speaker-a"]) + db["files"]["speaker"].set(["adam"]) db.save(build_dir) # Publish database, containing text media file @@ -1097,6 +1103,45 @@ def test_publish_text_media_files(tmpdir, dbs, repository): with pytest.raises(RuntimeError, match=error_msg): db = audb.load(name, version=version, channels=[0], verbose=False) + # Publish database, containing text and media files + audeer.rmdir(build_dir) + shutil.copytree(dbs["1.0.0"], 
build_dir) # start with db containing audio files + db = audformat.Database.load(build_dir) + speaker = db["files"]["speaker"].get() + files = list(db.files) + tables = list(db) + data_dir = audeer.mkdir(build_dir, "data") + with open(audeer.path(data_dir, "file1.txt"), "w") as file: + file.write("Text written by a person.\n") + index = audformat.filewise_index(["data/file1.txt"]) + db["files"].extend_index(index, inplace=True) + db["files"]["speaker"] = audformat.Column(scheme_id="speaker") + db["files"]["speaker"].set(list(speaker.values) + ["adam"]) + db.name = name + db.save(build_dir) + + # Publish database, containing text and audio media files + version = "2.0.0" + deps = audb.publish(build_dir, version, repository, previous_version=None) + + assert deps.table_ids == tables + file = "data/file1.txt" + assert deps.media == files + [file] + assert deps.bit_depth(file) == 0 + assert deps.channels(file) == 0 + assert deps.duration(file) == 0.0 + assert deps.format(file) == "txt" + assert deps.sampling_rate(file) == 0 + + db = audb.load(name, version=version, verbose=False, full_path=False) + assert db.files == files + [file] + assert list(db) == tables + assert os.path.exists(audeer.path(db.root, file)) + + error_msg = f"Media file '{file}' does not support requesting a flavor." + with pytest.raises(RuntimeError, match=error_msg): + db = audb.load(name, version=version, channels=[0], verbose=False) + def test_update_database(dbs, persistent_repository): version = "2.1.0"