[merge] Update repo (#4)

* [docs] Update citation * [feat] Add multi-processing downloading --------- Co-authored-by: Anton Okhotnikov <[email protected]>
IDRnD · Aug 22, 2023 · e0f066c · e0f066c
1 parent c7c7073
commit e0f066c
Show file tree

Hide file tree

Showing 5 changed files with 74 additions and 4 deletions.
diff --git a/README.md b/README.md
@@ -44,5 +44,12 @@ This is a [live repository](https://github.com/IDRnD/VoxTube), so any changes to
 Please cite the paper below if you make use of the dataset:
 
 ```
-[1] Yakovlev, I., Okhotnikov, A., Torgashov, N., Makarov, R., Voevodin, Y., Simonchik, K. (2023) VoxTube: a multilingual speaker recognition dataset. Proc. INTERSPEECH 2023, 2238-2242, doi: 10.21437/Interspeech.2023-1083
+@inproceedings{yakovlev23_interspeech,
+  author={Ivan Yakovlev and Anton Okhotnikov and Nikita Torgashov and Rostislav Makarov and Yuri Voevodin and Konstantin Simonchik},
+  title={{VoxTube: a multilingual speaker recognition dataset}},
+  year=2023,
+  booktitle={Proc. INTERSPEECH 2023},
+  pages={2238--2242},
+  doi={10.21437/Interspeech.2023-1083}
+}
 ```
diff --git a/examples/README.md b/examples/README.md
@@ -95,9 +95,13 @@ python3 -m pip install -r requirements.txt
 > Note that in the default example script each audio file is converted to a 16 kHz **.wav** file and split into 4-second segments.
 
 ```bash
-# example of one speaker downloading using meta .json file
 cd VoxTube/examples
-python3 load_data.py ../resources/meta/UCFcL4NsBzfWh1bLr6brouWg.json <DATASET_ROOT>
+
+# example of downloading one speaker using its meta .json file
+python3 load_example.py ../resources/meta/UC-9GWCoQoMr_ey6AMhClStQ.json <DATASET_ROOT>
+
+# example of downloading the whole dataset in N parallel jobs
+python3 load_all_examples.py -r <DATASET_ROOT> -j N
 ```
 
 

diff --git a/examples/load_all_examples.py b/examples/load_all_examples.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+import argparse
+import multiprocessing as mp
+import os
+import subprocess as sp
+from functools import partial
+from pathlib import Path
+
+from tqdm import tqdm
+
+
+def load_json(json_path, dataset_root, load_script_path):
+    """Run the per-channel download script on one meta file.
+
+    Args:
+        json_path: Path to the channel's meta ``.json`` file; passed to the
+            script as its first argument.
+        dataset_root: Passed to the script as its second argument.
+        load_script_path: Path to the Python script to execute.
+
+    Returns:
+        The ``subprocess.CompletedProcess`` of the run, or ``None`` if
+        running the script raised an exception.
+    """
+    # Local import keeps this function self-contained without touching the
+    # module-level import list.
+    import sys
+
+    # Prefer the interpreter running this script so the child process uses
+    # the same environment (e.g. an active virtualenv); fall back to a PATH
+    # lookup when the interpreter path is unavailable.
+    python = sys.executable or "python3"
+    try:
+        status = sp.run([python, load_script_path, str(json_path), dataset_root])
+    except Exception as e:
+        # Worker boundary: one failing channel must not abort the whole pool.
+        print(f'Error while loading channel {json_path}')
+        print(f'Exception: {str(e)}')
+        return None
+
+    # sp.run() does not raise on a non-zero exit code, so report it here.
+    if status.returncode != 0:
+        print(f'Error while loading channel {json_path}')
+        print(f'Exit code: {status.returncode}')
+    return status
+
+
+def main(dataset_root, nj=1):
+    """Run the download script for every meta file, ``nj`` jobs at a time.
+
+    Collects all ``*.json`` files from ``../resources/meta`` (relative to
+    this script) and runs ``load_example.py`` on each of them in a pool of
+    worker processes, showing a progress bar.
+
+    Args:
+        dataset_root: Forwarded to the download script for each meta file.
+        nj: Number of worker processes in the pool.
+    """
+    # Resolve everything relative to this file so the script can be started
+    # from any working directory.
+    script_dir = Path(os.path.dirname(os.path.realpath(__file__)))
+    meta_path = script_dir / '..' / 'resources' / 'meta'
+    json_paths = sorted(meta_path.glob('*.json'))
+    path_to_download_script = str(script_dir / 'load_example.py')
+
+    load_job = partial(
+        load_json,
+        dataset_root=dataset_root,
+        load_script_path=path_to_download_script
+    )
+
+    failed = 0
+    with tqdm(total=len(json_paths)) as pb:
+        with mp.Pool(nj) as pool:
+            # Results are only counted, so consume them in completion order;
+            # ordered imap() would hold the progress bar back behind the
+            # slowest job at the head of the queue.
+            for status in pool.imap_unordered(load_job, json_paths):
+                # A job failed if it returned None or a non-zero exit code.
+                if status is None or status.returncode != 0:
+                    failed += 1
+                pb.update()
+
+    if failed:
+        print(f'{failed} of {len(json_paths)} channels failed to load')
+    print(f'Finished loading! Please check {dataset_root}')
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the target directory and the number
+    # of parallel jobs, then hand both to main().
+    cli = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    cli.add_argument(
+        '-r', '--dataset_root',
+        type=str,
+        required=True,
+        help='Path to store loaded data',
+    )
+    cli.add_argument(
+        '-j', '--njobs',
+        type=int,
+        default=1,
+        required=False,
+        help='Number of parallel jobs to run',
+    )
+    options = cli.parse_args()
+    main(dataset_root=options.dataset_root, nj=options.njobs)
diff --git a/examples/load_data.py → examples/load_example.py b/examples/load_data.py → examples/load_example.py
diff --git a/index.md b/index.md
@@ -45,5 +45,7 @@ This is a [live repository](https://github.com/IDRnD/VoxTube), so any changes to
 Please cite the paper below if you make use of the dataset:
 
 ```
-[1] Yakovlev, I., Okhotnikov, A., Torgashov, N., Makarov, R., Voevodin, Y., Simonchik, K. (2023) VoxTube: a multilingual speaker recognition dataset. Proc. INTERSPEECH 2023, 2238-2242, doi: 10.21437/Interspeech.2023-1083
+[1] Yakovlev, I., Okhotnikov, A., Torgashov, N., Makarov, R., Voevodin, Y., Simonchik, K. (2023) 
+VoxTube: a multilingual speaker recognition dataset. 
+Proc. INTERSPEECH 2023, 2238-2242, doi: 10.21437/Interspeech.2023-1083
 ```