# cache_all_datasets.py
# flake8: noqa
import os

import openml
from openml import datasets
from openml.datasets.functions import DATASETS_CACHE_DIR_NAME
from openml.utils import _create_cache_directory_for_id

from server.src.dashboard.caching import CACHE_DIR_ROOT

# Download and cache every dataset before running the app, so that datasets load faster.
openml.config.cache_directory = CACHE_DIR_ROOT
df = datasets.list_datasets(output_format="dataframe")

for idx, row in df.iterrows():
    data_id = row["did"]
    instances = row["NumberOfInstances"]
    cols = row["NumberOfFeatures"]
    print(data_id)

    # Delete the existing cache directory for this dataset, if any.
    did_cache_dir = _create_cache_directory_for_id(
        DATASETS_CACHE_DIR_NAME,
        data_id,
    )
    if os.path.exists(did_cache_dir):
        openml.utils._remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)

    # Download the dataset and cache it again.
    try:
        # The feather cache format is faster for tall, narrow datasets
        # (many rows, few columns); fall back to the default format otherwise.
        if instances > 100000 and cols < 10000:
            datasets.get_dataset(data_id, cache_format="feather")
        else:
            datasets.get_dataset(data_id)
    except (FileNotFoundError, openml.exceptions.OpenMLServerException):
        # Skip datasets that are missing on disk or that the server cannot serve.
        pass
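
# Optional spot check (an addition, not part of the original script): once a
# dataset is cached, get_dataset() is served from the local cache directory set
# above, so re-fetching a known ID should be fast and need no server download.
# Dataset 61 (iris) is used here purely as an illustrative ID.
check = datasets.get_dataset(61)
print("spot check:", check.name, "cached under", openml.config.cache_directory)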