Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Unable to get file size when dataset has no protocol attribute #2174

Merged
1 change: 1 addition & 0 deletions RELEASE.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Please follow the established format:
- Display full dataset type with library prefix in metadata panel (#2136)
- Enable SQLite WAL mode for Azure ML to fix database locking issues (#2131)
- Replace `flake8`, `isort`, `pylint` and `black` by `ruff` (#2149)
- Refactor `DatasetStatsHook` to avoid showing error when dataset doesn't have file size info (#2174)


# Release 10.0.0
Expand Down
28 changes: 18 additions & 10 deletions package/kedro_viz/integrations/kedro/hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pathlib import Path, PurePosixPath
from typing import Any, Union

import fsspec
from kedro.framework.hooks import hook_impl
from kedro.io import DataCatalog
from kedro.io.core import get_filepath_str
Expand Down Expand Up @@ -141,19 +142,26 @@ def get_file_size(self, dataset: Any) -> Union[int, None]:
Args:
dataset: A dataset instance for which we need the file size

Returns: file size for the dataset if file_path is valid, if not returns None
Returns:
File size for the dataset if available, otherwise None.
"""

if not (hasattr(dataset, "_filepath") and dataset._filepath):
return None

try:
file_path = get_filepath_str(
PurePosixPath(dataset._filepath), dataset._protocol
)
return dataset._fs.size(file_path)
if hasattr(dataset, "filepath") and dataset.filepath:
filepath = dataset.filepath
# Fallback to private '_filepath' for known datasets
elif hasattr(dataset, "_filepath") and dataset._filepath:
filepath = dataset._filepath
else:
return None

fs, path_in_fs = fsspec.core.url_to_fs(filepath)
if fs.exists(path_in_fs):
file_size = fs.size(path_in_fs)
return file_size
else:
return None

except Exception as exc:
except Exception as exc: # pragma: no cover
logger.warning(
"Unable to get file size for the dataset %s: %s", dataset, exc
)
Expand Down
41 changes: 41 additions & 0 deletions package/tests/test_integrations/test_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,3 +137,44 @@ def test_get_file_size(dataset, example_dataset_stats_hook_obj, example_csv_data
assert example_dataset_stats_hook_obj.get_file_size(
example_csv_dataset
) == example_csv_dataset._fs.size(file_path)


def test_get_file_size_file_does_not_exist(example_dataset_stats_hook_obj, mocker):
class MockDataset:
def __init__(self):
self._filepath = "/non/existent/path.csv"

mock_dataset = MockDataset()
mock_fs = mocker.Mock()
mock_fs.exists.return_value = False

mocker.patch(
"fsspec.core.url_to_fs",
return_value=(mock_fs, "/non/existent/path.csv"),
)

# Call get_file_size and expect it to return None
file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset)
assert file_size is None


def test_get_file_size_public_filepath(example_dataset_stats_hook_obj, mocker):
class MockDataset:
def __init__(self):
self.filepath = "/path/to/existing/file.csv"

mock_dataset = MockDataset()

# Mock fs.exists to return True
mock_fs = mocker.Mock()
mock_fs.exists.return_value = True
mock_fs.size.return_value = 456

mocker.patch(
"fsspec.core.url_to_fs",
return_value=(mock_fs, "/path/to/existing/file.csv"),
)

# Call get_file_size and expect it to return the mocked file size
file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset)
assert file_size == 456
Loading