feat(datasets): Replace geopandas.GeoJSONDataset with geopandas.Gener…

…icDataset (#812) * feat(datasets): Add geopandas ParquetDataset Signed-off-by: Harm Matthias Harms <[email protected]> * Add release notes Signed-off-by: Harm Matthias Harms <[email protected]> * Add parquet dataset to docs Signed-off-by: Harm Matthias Harms <[email protected]> * Fix typo in tests Signed-off-by: Harm Matthias Harms <[email protected]> * Fix pylint type Signed-off-by: Harm Matthias Harms <[email protected]> * Discard changes to kedro-datasets/docs/source/api/kedro_datasets.rst Signed-off-by: Harm Matthias Harms <[email protected]> * Discard changes to kedro-datasets/kedro_datasets/geopandas/__init__.py Signed-off-by: Harm Matthias Harms <[email protected]> * Extend geojson dataset to support more file types Signed-off-by: Harm Matthias Harms <[email protected]> * Update RELEASE.md Signed-off-by: Harm Matthias Harms <[email protected]> * Add test for unsupported file format Signed-off-by: Harm Matthias Harms <[email protected]> * Cleanup GeoJSONDataset Signed-off-by: Harm Matthias Harms <[email protected]> * Fix lint Signed-off-by: Harm Matthias Harms <[email protected]> * Replace GeoJSONDataset by GenericDataset Signed-off-by: Harm Matthias Harms <[email protected]> * Update pyproject.toml Signed-off-by: Harm Matthias Harms <[email protected]> * Update RELEASE.md Signed-off-by: Harm Matthias Harms <[email protected]> * Use new default fs args Signed-off-by: Harm Matthias Harms <[email protected]> * Fix pattern in test Signed-off-by: Harm Matthias Harms <[email protected]> * Use fiona for python < 3.11 Signed-off-by: Harm Matthias Harms <[email protected]> * Install fiona dependency for python < 3.11 Signed-off-by: Harm Matthias Harms <[email protected]> * Revert fiona test Signed-off-by: Harm Matthias Harms <[email protected]> * Use fiona because pyogrio doesnt support fsspec Signed-off-by: Harm Matthias Harms <[email protected]> * Format file Signed-off-by: Harm Matthias Harms <[email protected]> * Update 
kedro-datasets/kedro_datasets/geopandas/__init__.py Co-authored-by: ElenaKhaustova <[email protected]> Signed-off-by: Harm Matthias Harms <[email protected]> Signed-off-by: Harm Matthias Harms <[email protected]> * Improve none file system target error message Signed-off-by: Harm Matthias Harms <[email protected]> * Update RELEASE.md Signed-off-by: Harm Matthias Harms <[email protected]> --------- Signed-off-by: Harm Matthias Harms <[email protected]> Signed-off-by: Harm Matthias Harms <[email protected]> Signed-off-by: Ankita Katiyar <[email protected]> Co-authored-by: ElenaKhaustova <[email protected]> Co-authored-by: L. R. Couto <[email protected]> Co-authored-by: Ankita Katiyar <[email protected]> Co-authored-by: Ankita Katiyar <[email protected]>
kedro-org · Oct 10, 2024 · f13dd7a · f13dd7a
1 parent 2b1228e
commit f13dd7a
Show file tree

Hide file tree

Showing 7 changed files with 214 additions and 88 deletions.
diff --git a/kedro-datasets/RELEASE.md b/kedro-datasets/RELEASE.md
@@ -23,6 +23,7 @@
 
 ## Breaking Changes
 * Exposed `load` and `save` publicly for each dataset. This requires Kedro version 0.19.7 or higher.
+* Replaced the `geopandas.GeoJSONDataset` with `geopandas.GenericDataset` to support parquet and feather file formats.
 
 ## Community contributions
 Many thanks to the following Kedroids for contributing PRs to this release:
@@ -32,6 +33,7 @@ Many thanks to the following Kedroids for contributing PRs to this release:
 * [janickspirig](https://github.com/janickspirig)
 * [Galen Seilis](https://github.com/galenseilis)
 * [Mariusz Wojakowski](https://github.com/mariusz89016)
+* [harm-matthias-harms](https://github.com/harm-matthias-harms)
 * [Felix Scherz](https://github.com/felixscherz)
 
 

diff --git a/kedro-datasets/docs/source/api/kedro_datasets.rst b/kedro-datasets/docs/source/api/kedro_datasets.rst
@@ -17,7 +17,7 @@ kedro_datasets
    dask.ParquetDataset
    databricks.ManagedTableDataset
    email.EmailMessageDataset
-   geopandas.GeoJSONDataset
+   geopandas.GenericDataset
    holoviews.HoloviewsWriter
    huggingface.HFDataset
    huggingface.HFTransformerPipelineDataset

diff --git a/kedro-datasets/kedro_datasets/geopandas/README.md b/kedro-datasets/kedro_datasets/geopandas/README.md
diff --git a/kedro-datasets/kedro_datasets/geopandas/__init__.py b/kedro-datasets/kedro_datasets/geopandas/__init__.py
@@ -1,12 +1,12 @@
-"""``GeoJSONDataset`` is an ``AbstractVersionedDataset`` to save and load GeoJSON files."""
+"""``GenericDataset`` is an ``AbstractVersionedDataset`` to save and load GeoDataFrames."""
 
 from typing import Any
 
 import lazy_loader as lazy
 
 # https://github.com/pylint-dev/pylint/issues/4300#issuecomment-1043601901
-GeoJSONDataset: Any
+GenericDataset: Any
 
 __getattr__, __dir__, __all__ = lazy.attach(
-    __name__, submod_attrs={"geojson_dataset": ["GeoJSONDataset"]}
+    __name__, submod_attrs={"generic_dataset": ["GenericDataset"]}
 )
diff --git a/...dro_datasets/geopandas/geojson_dataset.py → ...dro_datasets/geopandas/generic_dataset.py b/...dro_datasets/geopandas/geojson_dataset.py → ...dro_datasets/geopandas/generic_dataset.py
@@ -1,7 +1,8 @@
-"""GeoJSONDataset loads and saves data to a local geojson file. The
+"""GenericDataset loads and saves data to a local file. The
 underlying functionality is supported by geopandas, so it supports all
 allowed geopandas (pandas) options for loading and saving geojson files.
 """
+
 from __future__ import annotations
 
 import copy
@@ -18,30 +19,35 @@
     get_protocol_and_path,
 )
 
+# pyogrio does not yet support alternate (fsspec) file handlers, so use fiona: https://github.com/geopandas/pyogrio/issues/430
+gpd.options.io_engine = "fiona"
+
+NON_FILE_SYSTEM_TARGETS = ["postgis"]
+
 
-class GeoJSONDataset(
+class GenericDataset(
     AbstractVersionedDataset[
         gpd.GeoDataFrame, gpd.GeoDataFrame | dict[str, gpd.GeoDataFrame]
     ]
 ):
-    """``GeoJSONDataset`` loads/saves data to a GeoJSON file using an underlying filesystem
+    """``GenericDataset`` loads/saves data to a file using an underlying filesystem
     (e.g. local, S3, GCS).
     The underlying functionality is supported by geopandas, so it supports all
-    allowed geopandas (pandas) options for loading and saving GeoJSON files.
+    allowed geopandas (pandas) options for loading and saving files.
 
     Example:
 
     .. code-block:: pycon
 
         >>> import geopandas as gpd
-        >>> from kedro_datasets.geopandas import GeoJSONDataset
+        >>> from kedro_datasets.geopandas import GenericDataset
         >>> from shapely.geometry import Point
         >>>
         >>> data = gpd.GeoDataFrame(
         ...     {"col1": [1, 2], "col2": [4, 5], "col3": [5, 6]},
         ...     geometry=[Point(1, 1), Point(2, 4)],
         ... )
-        >>> dataset = GeoJSONDataset(filepath=tmp_path / "test.geojson", save_args=None)
+        >>> dataset = GenericDataset(filepath=tmp_path / "test.geojson")
         >>> dataset.save(data)
         >>> reloaded = dataset.load()
         >>>
@@ -50,35 +56,41 @@ class GeoJSONDataset(
     """
 
     DEFAULT_LOAD_ARGS: dict[str, Any] = {}
-    DEFAULT_SAVE_ARGS = {"driver": "GeoJSON"}
+    DEFAULT_SAVE_ARGS: dict[str, Any] = {}
+    DEFAULT_FS_ARGS: dict[str, Any] = {"open_args_save": {"mode": "wb"}}
 
     def __init__(  # noqa: PLR0913
         self,
         *,
         filepath: str,
+        file_format: str = "file",
         load_args: dict[str, Any] | None = None,
         save_args: dict[str, Any] | None = None,
         version: Version | None = None,
         credentials: dict[str, Any] | None = None,
         fs_args: dict[str, Any] | None = None,
         metadata: dict[str, Any] | None = None,
     ) -> None:
-        """Creates a new instance of ``GeoJSONDataset`` pointing to a concrete GeoJSON file
+        """Creates a new instance of ``GenericDataset`` pointing to a concrete file
         on a specific fsspec filesystem.
 
         Args:
 
-            filepath: Filepath in POSIX format to a GeoJSON file prefixed with a protocol like
+            filepath: Filepath in POSIX format to a file prefixed with a protocol like
                 `s3://`. If prefix is not provided `file` protocol (local filesystem) will be used.
                 The prefix should be any protocol supported by ``fsspec``.
                 Note: `http(s)` doesn't support versioning.
-            load_args: GeoPandas options for loading GeoJSON files.
+            file_format: String which is used to match the appropriate load/save method on a best
+                effort basis. For example, if 'parquet' is passed in, the `geopandas.read_parquet` and
+                `geopandas.GeoDataFrame.to_parquet` methods will be identified. An error will be raised
+                unless at least one matching `read_{file_format}` or `to_{file_format}` method is
+                identified. Defaults to 'file'.
+            load_args: GeoPandas options for loading files.
                 Here you can find all available arguments:
                 https://geopandas.org/en/stable/docs/reference/api/geopandas.read_file.html
-            save_args: GeoPandas options for saving geojson files.
+            save_args: GeoPandas options for saving files.
                 Here you can find all available arguments:
                 https://geopandas.org/en/stable/docs/reference/api/geopandas.GeoDataFrame.to_file.html
-                The default_save_arg driver is 'GeoJSON', all others preserved.
             version: If specified, should be an instance of
                 ``kedro.io.core.Version``. If its ``load`` attribute is
                 None, the latest version will be loaded. If its ``save``
@@ -94,6 +106,9 @@ def __init__(  # noqa: PLR0913
             metadata: Any arbitrary metadata.
                 This is ignored by Kedro, but may be consumed by users or external plugins.
         """
+
+        self._file_format = file_format.lower()
+
         _fs_args = copy.deepcopy(fs_args) or {}
         _fs_open_args_load = _fs_args.pop("open_args_load", {})
         _fs_open_args_save = _fs_args.pop("open_args_save", {})
@@ -114,28 +129,57 @@ def __init__(  # noqa: PLR0913
             glob_function=self._fs.glob,
         )
 
-        self._load_args = copy.deepcopy(self.DEFAULT_LOAD_ARGS)
-        if load_args is not None:
-            self._load_args.update(load_args)
-
-        self._save_args = copy.deepcopy(self.DEFAULT_SAVE_ARGS)
-        if save_args is not None:
-            self._save_args.update(save_args)
+        # Handle default load and save and fs arguments
+        self._load_args = {**self.DEFAULT_LOAD_ARGS, **(load_args or {})}
+        self._save_args = {**self.DEFAULT_SAVE_ARGS, **(save_args or {})}
+        self._fs_open_args_load = {
+            **self.DEFAULT_FS_ARGS.get("open_args_load", {}),
+            **(_fs_open_args_load or {}),
+        }
+        self._fs_open_args_save = {
+            **self.DEFAULT_FS_ARGS.get("open_args_save", {}),
+            **(_fs_open_args_save or {}),
+        }
 
-        _fs_open_args_save.setdefault("mode", "wb")
-        self._fs_open_args_load = _fs_open_args_load
-        self._fs_open_args_save = _fs_open_args_save
+    def _ensure_file_system_target(self) -> None:
+        # Fail fast if provided a known non-filesystem target
+        if self._file_format in NON_FILE_SYSTEM_TARGETS:
+            raise DatasetError(
+                f"Cannot load or save a dataset of file_format '{self._file_format}' as it "
+                f"does not support a filepath target/source."
+            )
 
     def load(self) -> gpd.GeoDataFrame | dict[str, gpd.GeoDataFrame]:
+        self._ensure_file_system_target()
+
         load_path = get_filepath_str(self._get_load_path(), self._protocol)
-        with self._fs.open(load_path, **self._fs_open_args_load) as fs_file:
-            return gpd.read_file(fs_file, **self._load_args)
+        load_method = getattr(gpd, f"read_{self._file_format}", None)
+        if load_method:
+            with self._fs.open(load_path, **self._fs_open_args_load) as fs_file:
+                return load_method(fs_file, **self._load_args)
+        raise DatasetError(
+            f"Unable to retrieve 'geopandas.read_{self._file_format}' method, please ensure that your "
+            "'file_format' parameter has been defined correctly as per the GeoPandas API "
+            "https://geopandas.org/en/stable/docs/reference/io.html"
+        )
 
     def save(self, data: gpd.GeoDataFrame) -> None:
+        self._ensure_file_system_target()
+
         save_path = get_filepath_str(self._get_save_path(), self._protocol)
-        with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
-            data.to_file(fs_file, **self._save_args)
-        self.invalidate_cache()
+        save_method = getattr(data, f"to_{self._file_format}", None)
+        if save_method:
+            with self._fs.open(save_path, **self._fs_open_args_save) as fs_file:
+                # KEY ASSUMPTION - first argument is path/buffer/io
+                save_method(fs_file, **self._save_args)
+                self.invalidate_cache()
+        else:
+            raise DatasetError(
+                f"Unable to retrieve 'geopandas.DataFrame.to_{self._file_format}' method, please "
+                "ensure that your 'file_format' parameter has been defined correctly as "
+                "per the GeoPandas API "
+                "https://geopandas.org/en/stable/docs/reference/io.html"
+            )
 
     def _exists(self) -> bool:
         try:
@@ -147,6 +191,7 @@ def _exists(self) -> bool:
     def _describe(self) -> dict[str, Any]:
         return {
             "filepath": self._filepath,
+            "file_format": self._file_format,
             "protocol": self._protocol,
             "load_args": self._load_args,
             "save_args": self._save_args,

diff --git a/kedro-datasets/pyproject.toml b/kedro-datasets/pyproject.toml
@@ -40,8 +40,8 @@ dask = ["kedro-datasets[dask-parquetdataset, dask-csvdataset]"]
 databricks-managedtabledataset = ["kedro-datasets[spark-base,pandas-base,delta-base,hdfs-base,s3fs-base]"]
 databricks = ["kedro-datasets[databricks-managedtabledataset]"]
 
-geopandas-geojsondataset = ["geopandas>=0.6.0, <1.0", "pyproj~=3.0"]
-geopandas = ["kedro-datasets[geopandas-geojsondataset]"]
+geopandas-genericdataset = ["geopandas>=0.8.0, <2.0", "fiona >=1.8, <2.0"]
+geopandas = ["kedro-datasets[geopandas-genericdataset]"]
 
 holoviews-holoviewswriter = ["holoviews>=1.13.0"]
 holoviews = ["kedro-datasets[holoviews-holoviewswriter]"]
@@ -215,8 +215,9 @@ test = [
     "deltalake>=0.10.0",
     "dill~=0.3.1",
     "filelock>=3.4.0, <4.0",
+    "fiona >=1.8, <2.0",
     "gcsfs>=2023.1, <2023.3",
-    "geopandas>=0.6.0, <1.0",
+    "geopandas>=0.8.0, <2.0",
     "hdfs>=2.5.8, <3.0",
     "holoviews>=1.13.0",
     "ibis-framework[duckdb,examples]",
@@ -243,7 +244,6 @@ test = [
     "pyarrow>=1.0; python_version < '3.11'",
     "pyarrow>=7.0; python_version >= '3.11'",  # Adding to avoid numpy build errors
     "pyodbc~=5.0",
-    "pyproj~=3.0",
     "pyspark>=3.0; python_version < '3.11'",
     "pyspark>=3.4; python_version >= '3.11'",
     "pytest-cov~=3.0",