kedro-org · SajidAlamQB · Nov 8, 2024 · Nov 4, 2024 · Nov 4, 2024 · Nov 4, 2024
@@ -21,6 +21,7 @@ Please follow the established format:
 - Display full dataset type with library prefix in metadata panel (#2136)
 - Enable SQLite WAL mode for Azure ML to fix database locking issues (#2131)
 - Replace `flake8`, `isort`, `pylint` and `black` by `ruff` (#2149)
+- Refactor `DatasetStatsHook` to avoid showing error when dataset doesn't have file size info (#2174)
 
 
 # Release 10.0.0

@@ -7,6 +7,7 @@
 from pathlib import Path, PurePosixPath
 from typing import Any, Union
 
+import fsspec
 from kedro.framework.hooks import hook_impl
 from kedro.io import DataCatalog
 from kedro.io.core import get_filepath_str
@@ -141,19 +142,26 @@ def get_file_size(self, dataset: Any) -> Union[int, None]:
         Args:
             dataset: A dataset instance for which we need the file size
 
-        Returns: file size for the dataset if file_path is valid, if not returns None
+        Returns:
+            File size for the dataset if available, otherwise None.
         """
-
-        if not (hasattr(dataset, "_filepath") and dataset._filepath):
-            return None
-
         try:
-            file_path = get_filepath_str(
-                PurePosixPath(dataset._filepath), dataset._protocol
-            )
-            return dataset._fs.size(file_path)
+            if hasattr(dataset, "filepath") and dataset.filepath:
+                filepath = dataset.filepath
+            # Fallback to private '_filepath' for known datasets
+            elif hasattr(dataset, "_filepath") and dataset._filepath:
+                filepath = dataset._filepath
+            else:
+                return None
+
+            fs, path_in_fs = fsspec.core.url_to_fs(filepath)
+            if fs.exists(path_in_fs):
+                file_size = fs.size(path_in_fs)
+                return file_size
+            else:
+                return None
 
-        except Exception as exc:
+        except Exception as exc:  # pragma: no cover
             logger.warning(
                 "Unable to get file size for the dataset %s: %s", dataset, exc
             )

@@ -137,3 +137,44 @@ def test_get_file_size(dataset, example_dataset_stats_hook_obj, example_csv_data
     assert example_dataset_stats_hook_obj.get_file_size(
         example_csv_dataset
     ) == example_csv_dataset._fs.size(file_path)
+
+
+def test_get_file_size_file_does_not_exist(example_dataset_stats_hook_obj, mocker):
+    class MockDataset:
+        def __init__(self):
+            self._filepath = "/non/existent/path.csv"
+
+    mock_dataset = MockDataset()
+    mock_fs = mocker.Mock()
+    mock_fs.exists.return_value = False
+
+    mocker.patch(
+        "fsspec.core.url_to_fs",
+        return_value=(mock_fs, "/non/existent/path.csv"),
+    )
+
+    # Call get_file_size and expect it to return None
+    file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset)
+    assert file_size is None
+
+
+def test_get_file_size_public_filepath(example_dataset_stats_hook_obj, mocker):
+    class MockDataset:
+        def __init__(self):
+            self.filepath = "/path/to/existing/file.csv"
+
+    mock_dataset = MockDataset()
+
+    # Mock fs.exists to return True
+    mock_fs = mocker.Mock()
+    mock_fs.exists.return_value = True
+    mock_fs.size.return_value = 456
+
+    mocker.patch(
+        "fsspec.core.url_to_fs",
+        return_value=(mock_fs, "/path/to/existing/file.csv"),
+    )
+
+    # Call get_file_size and expect it to return the mocked file size
+    file_size = example_dataset_stats_hook_obj.get_file_size(mock_dataset)
+    assert file_size == 456