From b90ce3ec8b77e07875c369dbd35afa3e08ad3324 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Mon, 4 Nov 2024 13:48:06 +0000 Subject: [PATCH 01/10] add protocol check Signed-off-by: Sajid Alam --- package/kedro_viz/integrations/kedro/hooks.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/package/kedro_viz/integrations/kedro/hooks.py b/package/kedro_viz/integrations/kedro/hooks.py index 3089e61f5..c94d9f166 100644 --- a/package/kedro_viz/integrations/kedro/hooks.py +++ b/package/kedro_viz/integrations/kedro/hooks.py @@ -148,10 +148,15 @@ def get_file_size(self, dataset: Any) -> Union[int, None]: return None try: - file_path = get_filepath_str( - PurePosixPath(dataset._filepath), dataset._protocol - ) - return dataset._fs.size(file_path) + if hasattr(dataset, "_protocol") and hasattr(dataset, "_fs"): + file_path = get_filepath_str( + PurePosixPath(dataset._filepath), dataset._protocol + ) + file_size = dataset._fs.size(file_path) + return file_size + else: + # Unable to determine file size + return None except Exception as exc: logger.warning( From e47ed310356bad6d8cf0333b10a80e011537f3f5 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Mon, 4 Nov 2024 14:05:10 +0000 Subject: [PATCH 02/10] add test Signed-off-by: Sajid Alam --- package/tests/test_integrations/test_hooks.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/package/tests/test_integrations/test_hooks.py b/package/tests/test_integrations/test_hooks.py index 2f6d7dd13..145794961 100644 --- a/package/tests/test_integrations/test_hooks.py +++ b/package/tests/test_integrations/test_hooks.py @@ -137,3 +137,20 @@ def test_get_file_size(dataset, example_dataset_stats_hook_obj, example_csv_data assert example_dataset_stats_hook_obj.get_file_size( example_csv_dataset ) == example_csv_dataset._fs.size(file_path) + + +def test_get_file_size_no_protocol(example_dataset_stats_hook_obj, mocker): + class MockDataset: + def __init__(self): + self._filepath = "/path/to/dataset.csv" + + mock_dataset = MockDataset() + + mocker.patch( + 'kedro_viz.integrations.kedro.hooks.get_filepath_str', + return_value=mock_dataset._filepath + ) + + # Call get_file_size and expect it to return None + file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset) + assert file_size is None From b73cb969774e7d782e6d83924f81c856c0fa1767 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Mon, 4 Nov 2024 14:05:35 +0000 Subject: [PATCH 03/10] lint Signed-off-by: Sajid Alam --- package/tests/test_integrations/test_hooks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package/tests/test_integrations/test_hooks.py b/package/tests/test_integrations/test_hooks.py index 145794961..35a99767b 100644 --- a/package/tests/test_integrations/test_hooks.py +++ b/package/tests/test_integrations/test_hooks.py @@ -147,8 +147,8 @@ def __init__(self): mock_dataset = MockDataset() mocker.patch( - 'kedro_viz.integrations.kedro.hooks.get_filepath_str', - return_value=mock_dataset._filepath + "kedro_viz.integrations.kedro.hooks.get_filepath_str", + return_value=mock_dataset._filepath, ) # Call get_file_size and expect it to return None From 7b31509812f1d0e84f5749a2971af6582eb35bc1 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Wed, 6 Nov 2024 14:58:23 +0000 Subject: [PATCH 04/10] Update hooks.py Signed-off-by: Sajid Alam --- package/kedro_viz/integrations/kedro/hooks.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/package/kedro_viz/integrations/kedro/hooks.py b/package/kedro_viz/integrations/kedro/hooks.py index c94d9f166..0df846923 100644 --- a/package/kedro_viz/integrations/kedro/hooks.py +++ b/package/kedro_viz/integrations/kedro/hooks.py @@ -7,6 +7,7 @@ from pathlib import Path, PurePosixPath from typing import Any, Union +import fsspec from kedro.framework.hooks import hook_impl from kedro.io import DataCatalog from kedro.io.core import get_filepath_str @@ -141,21 +142,20 @@ def get_file_size(self, dataset: Any) -> Union[int, None]: Args: dataset: A dataset instance for which we need the file size - Returns: file size for the dataset if file_path is valid, if not returns None + Returns: + File size for the dataset if available, otherwise None. """ - - if not (hasattr(dataset, "_filepath") and dataset._filepath): - return None - try: - if hasattr(dataset, "_protocol") and hasattr(dataset, "_fs"): - file_path = get_filepath_str( - PurePosixPath(dataset._filepath), dataset._protocol - ) - file_size = dataset._fs.size(file_path) + if hasattr(dataset, "filepath") and dataset.filepath: + filepath = dataset.filepath + else: + return None + + fs, path_in_fs = fsspec.core.url_to_fs(filepath) + if fs.exists(path_in_fs): + file_size = fs.size(path_in_fs) return file_size else: - # Unable to determine file size return None except Exception as exc: From fef1a236ddc733917473d1efbf1550f7e07726cf Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Wed, 6 Nov 2024 15:21:52 +0000 Subject: [PATCH 05/10] add fallback to private attritbute for known datasets Signed-off-by: Sajid Alam --- package/kedro_viz/integrations/kedro/hooks.py | 3 +++ package/tests/test_integrations/test_hooks.py | 17 ----------------- 2 files changed, 3 insertions(+), 17 deletions(-) diff --git a/package/kedro_viz/integrations/kedro/hooks.py b/package/kedro_viz/integrations/kedro/hooks.py index 0df846923..1aa732c76 100644 --- a/package/kedro_viz/integrations/kedro/hooks.py +++ b/package/kedro_viz/integrations/kedro/hooks.py @@ -148,6 +148,9 @@ def get_file_size(self, dataset: Any) -> Union[int, None]: try: if hasattr(dataset, "filepath") and dataset.filepath: filepath = dataset.filepath + # Fallback to private '_filepath' for known datasets + elif hasattr(dataset, "_filepath") and dataset._filepath: + filepath = dataset._filepath else: return None diff --git a/package/tests/test_integrations/test_hooks.py b/package/tests/test_integrations/test_hooks.py index 35a99767b..2f6d7dd13 100644 --- a/package/tests/test_integrations/test_hooks.py +++ b/package/tests/test_integrations/test_hooks.py @@ -137,20 +137,3 @@ def test_get_file_size(dataset, example_dataset_stats_hook_obj, example_csv_data assert example_dataset_stats_hook_obj.get_file_size( example_csv_dataset ) == example_csv_dataset._fs.size(file_path) - - -def test_get_file_size_no_protocol(example_dataset_stats_hook_obj, mocker): - class MockDataset: - def __init__(self): - self._filepath = "/path/to/dataset.csv" - - mock_dataset = MockDataset() - - mocker.patch( - "kedro_viz.integrations.kedro.hooks.get_filepath_str", - return_value=mock_dataset._filepath, - ) - - # Call get_file_size and expect it to return None - file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset) - assert file_size is None From 1061f9ae919e63453c20b2289eb1f256fdc059b2 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Thu, 7 Nov 2024 09:49:49 +0000 Subject: [PATCH 06/10] coverage Signed-off-by: Sajid Alam --- package/tests/test_integrations/test_hooks.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/package/tests/test_integrations/test_hooks.py b/package/tests/test_integrations/test_hooks.py index 2f6d7dd13..ecb93fcb2 100644 --- a/package/tests/test_integrations/test_hooks.py +++ b/package/tests/test_integrations/test_hooks.py @@ -137,3 +137,22 @@ def test_get_file_size(dataset, example_dataset_stats_hook_obj, example_csv_data assert example_dataset_stats_hook_obj.get_file_size( example_csv_dataset ) == example_csv_dataset._fs.size(file_path) + + +def test_get_file_size_file_does_not_exist(example_dataset_stats_hook_obj, mocker): + class MockDataset: + def __init__(self): + self._filepath = "/non/existent/path.csv" + + mock_dataset = MockDataset() + mock_fs = mocker.Mock() + mock_fs.exists.return_value = False + + mocker.patch( + "fsspec.core.url_to_fs", + return_value=(mock_fs, "/non/existent/path.csv"), + ) + + # Call get_file_size and expect it to return None + file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset) + assert file_size is None From c7ab4c84545240ef32fc41b3ca098ac619c58ffa Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Thu, 7 Nov 2024 09:56:31 +0000 Subject: [PATCH 07/10] Update RELEASE.md Signed-off-by: Sajid Alam --- RELEASE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE.md b/RELEASE.md index 3261588f5..b9a228e8a 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -21,6 +21,7 @@ Please follow the established format: - Display full dataset type with library prefix in metadata panel (#2136) - Enable SQLite WAL mode for Azure ML to fix database locking issues (#2131) - Replace `flake8`, `isort`, `pylint` and `black` by `ruff` (#2149) +- Refactor `DatasetStatsHook` to avoid accessing private dataset attributes (#2174) # Release 10.0.0 From b9deceb18714413c9f93430056553ca0c3fa4175 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Thu, 7 Nov 2024 10:11:45 +0000 Subject: [PATCH 08/10] Update hooks.py Signed-off-by: Sajid Alam --- package/kedro_viz/integrations/kedro/hooks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package/kedro_viz/integrations/kedro/hooks.py b/package/kedro_viz/integrations/kedro/hooks.py index 1aa732c76..88bc5be17 100644 --- a/package/kedro_viz/integrations/kedro/hooks.py +++ b/package/kedro_viz/integrations/kedro/hooks.py @@ -161,7 +161,7 @@ def get_file_size(self, dataset: Any) -> Union[int, None]: else: return None - except Exception as exc: + except Exception as exc: # pragma: no cover logger.warning( "Unable to get file size for the dataset %s: %s", dataset, exc ) From ec0bf98d963e8e7a57b8d3f6c694cb3d55722336 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Thu, 7 Nov 2024 10:32:52 +0000 Subject: [PATCH 09/10] coverage Signed-off-by: Sajid Alam --- package/tests/test_integrations/test_hooks.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/package/tests/test_integrations/test_hooks.py b/package/tests/test_integrations/test_hooks.py index ecb93fcb2..600c594d1 100644 --- a/package/tests/test_integrations/test_hooks.py +++ b/package/tests/test_integrations/test_hooks.py @@ -156,3 +156,25 @@ def __init__(self): # Call get_file_size and expect it to return None file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset) assert file_size is None + + +def test_get_file_size_public_filepath(example_dataset_stats_hook_obj, mocker): + class MockDataset: + def __init__(self): + self.filepath = "/path/to/existing/file.csv" + + mock_dataset = MockDataset() + + # Mock fs.exists to return True + mock_fs = mocker.Mock() + mock_fs.exists.return_value = True + mock_fs.size.return_value = 456 + + mocker.patch( + "fsspec.core.url_to_fs", + return_value=(mock_fs, "/path/to/existing/file.csv"), + ) + + # Call get_file_size and expect it to return the mocked file size + file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset) + assert file_size == 456 From 5c4cb53fccd5be5f4d6b0ecf7d4437e2eae37ed3 Mon Sep 17 00:00:00 2001 From: Sajid Alam Date: Thu, 7 Nov 2024 12:02:42 +0000 Subject: [PATCH 10/10] Update RELEASE.md Signed-off-by: Sajid Alam --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index b9a228e8a..5be798e33 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -21,7 +21,7 @@ Please follow the established format: - Display full dataset type with library prefix in metadata panel (#2136) - Enable SQLite WAL mode for Azure ML to fix database locking issues (#2131) - Replace `flake8`, `isort`, `pylint` and `black` by `ruff` (#2149) -- Refactor `DatasetStatsHook` to avoid accessing private dataset attributes (#2174) +- Refactor `DatasetStatsHook` to avoid showing error when dataset doesn't have file size info (#2174) # Release 10.0.0