Project import generated by Copybara. (#102)

GitOrigin-RevId: 4224a34cc1f2c4a947efd6c5fcc6cea040c37bc6 Co-authored-by: Snowflake Authors <[email protected]>
snowflakedb · May 22, 2024 · 2932445 · 2932445
1 parent c530f5c
commit 2932445
Show file tree

Hide file tree

Showing 90 changed files with 4,477 additions and 1,158 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,30 @@
 # Release History
 
+## 1.5.1
+
+### Bug Fixes
+
+- Dataset: Fix `snowflake.connector.errors.DataError: Query Result did not match expected number of rows` when accessing
+  DatasetVersion properties when case insensitive `SHOW VERSIONS IN DATASET` check matches multiple version names.
+- Dataset: Fix bug in SnowFS bulk file read when used with DuckDB
+- Registry: Fixed a bug when loading old models.
+- Lineage: Fix Dataset source lineage propagation through `snowpark.DataFrame` transformations
+
+### Behavior Changes
+
+- Feature Store: Convert clear() into a private function. Also make it delete feature views and entities only.
+- Feature Store: Use NULL as default value for timestamp tag value.
+
+### New Features
+
+- Feature Store: Added new `snowflake.ml.feature_store.setup_feature_store()` API to assist Feature Store RBAC setup.
+- Feature Store: Add `output_type` argument to `FeatureStore.generate_dataset()` to allow generating data snapshots
+  as Datasets or Tables.
+- Registry: `log_model`, `get_model`, `delete_model` now support fully qualified names.
+- Modeling: Support anonymous stored procedures during fit calls so that modeling does not require
+  permissions to operate on the schema. To enable this, add
+  `import snowflake.ml.modeling.parameters.enable_anonymous_sproc  # noqa: F401`
+
 ## 1.5.0
 
 ### Bug Fixes
@@ -40,12 +65,19 @@
 
 #### Feature Store (PrPr)
 
-`FeatureStore.generate_dataset` argument list has been changed to match the new
+- `FeatureStore.generate_dataset` argument list has been changed to match the new
 `snowflake.ml.dataset.Dataset` definition
 
-- `materialized_table` has been removed and replaced with `name` and `version`.
-- `name` moved to first positional argument
-- `save_mode` has been removed as `merge` behavior is no longer supported. The new behavior is always `errorifexists`.
+  - `materialized_table` has been removed and replaced with `name` and `version`.
+  - `name` moved to first positional argument
+  - `save_mode` has been removed as `merge` behavior is no longer supported. The new behavior is always `errorifexists`.
+
+- Change feature view version type from str to `FeatureViewVersion`. It is a restricted string literal.
+
+- Remove as_dataframe arg from FeatureStore.list_feature_views(), now always returns result as DataFrame.
+
+- Combine a few metadata tags into a new tag: SNOWML_FEATURE_VIEW_METADATA. This will make previously created feature
+  views unreadable by the new SDK.
 
 ### New Features
 
@@ -61,6 +93,10 @@
     and `Dataset.read.to_tf_dataset()` respectively.
 - Added `fsspec` style file integration using `Dataset.read.files()` and `Dataset.read.filesystem()`
 
+#### Feature Store
+
+- Use new tag_reference_internal to speed up metadata lookup.
+
 ## 1.4.1 (2024-04-18)
 
 ### New Features
@@ -72,6 +108,10 @@
 
 - Registry: Fix a bug that caused the relax_version option to not work.
 
+### Behavior changes
+
+- Feature Store: update_feature_view takes refresh_freq and warehouse as arguments.
+
 ## 1.4.0 (2024-04-08)
 
 ### Bug Fixes
@@ -93,6 +133,8 @@
 
 - Registry: `apply` method is no longer by default logged when logging a xgboost model. If that is required, it could
   be specified manually when logging the model by `log_model(..., options={"target_methods": ["apply", ...]})`.
+- Feature Store: register_entity returns an entity object.
+- Feature Store: register_feature_view `block=true` becomes default.
 
 ### New Features
 

diff --git a/ci/conda_recipe/meta.yaml b/ci/conda_recipe/meta.yaml
@@ -17,7 +17,7 @@ build:
   noarch: python
 package:
   name: snowflake-ml-python
-  version: 1.5.0
+  version: 1.5.1
 requirements:
   build:
     - python
@@ -27,7 +27,6 @@ requirements:
     - aiohttp!=4.0.0a0, !=4.0.0a1
     - anyio>=3.5.0,<4
     - cachetools>=3.1.1,<6
-    - catboost>=1.2.0, <1.3
     - cloudpickle>=2.0.0
     - fsspec>=2022.11,<2024
     - importlib_resources>=6.1.1, <7
@@ -49,11 +48,12 @@ requirements:
     - xgboost>=1.7.3,<2
     - python>=3.8,<3.12
   run_constrained:
-    - lightgbm>=3.3.5,<4.2
+    - catboost>=1.2.0, <2
+    - lightgbm>=3.3.5,<5
     - mlflow>=2.1.0,<2.4
     - pytorch>=2.0.1,<3
     - sentence-transformers>=2.2.2,<3
-    - sentencepiece>=0.1.95,<0.2
+    - sentencepiece>=0.1.95,<1
     - shap==0.42.1
     - tensorflow>=2.10,<3
     - tokenizers>=0.10,<1

diff --git a/requirements.yml b/requirements.yml
@@ -102,7 +102,9 @@
     - build_essential
 - name: catboost
   dev_version: 1.2.0
-  version_requirements: '>=1.2.0, <1.3'
+  version_requirements: '>=1.2.0, <2'
+  requirements_extra_tags:
+    - catboost
 - name: cloudpickle
   dev_version: 2.2.1
   version_requirements: '>=2.0.0'
@@ -141,7 +143,7 @@
   dev_version: 1.1.1
 - name: lightgbm
   dev_version: 3.3.5
-  version_requirements: '>=3.3.5,<4.2'
+  version_requirements: '>=3.3.5,<5'
   requirements_extra_tags:
     - lightgbm
   tags:
@@ -240,7 +242,7 @@
   version_requirements: '>=1.9,<2'
 - name: sentencepiece
   dev_version: 0.1.99
-  version_requirements: '>=0.1.95,<0.2'
+  version_requirements: '>=0.1.95,<1'
   requirements_extra_tags:
     - transformers
 - name: sentence-transformers

diff --git a/snowflake/ml/_internal/env_utils.py b/snowflake/ml/_internal/env_utils.py
@@ -553,6 +553,9 @@ def load_conda_env_file(
         A tuple of Dict of conda dependencies after validated, optional pip requirements if exist
         and a string 'major.minor.patchlevel' of python version.
     """
+    if not path.exists():
+        return collections.defaultdict(list), None, None
+
     with open(path, encoding="utf-8") as f:
         env = yaml.safe_load(stream=f)
 
@@ -603,6 +606,9 @@ def load_requirements_file(path: pathlib.Path) -> List[requirements.Requirement]
     Returns:
         List of dependencies string after validated.
     """
+    if not path.exists():
+        return []
+
     with open(path, encoding="utf-8") as f:
         reqs = f.readlines()
 

diff --git a/snowflake/ml/_internal/lineage/BUILD.bazel b/snowflake/ml/_internal/lineage/BUILD.bazel
@@ -1,11 +1,20 @@
-load("//bazel:py_rules.bzl", "py_library")
+load("//bazel:py_rules.bzl", "py_library", "py_test")
 
 package(default_visibility = ["//visibility:public"])
 
 py_library(
-    name = "dataset_dataframe",
+    name = "lineage_utils",
     srcs = [
         "data_source.py",
-        "dataset_dataframe.py",
+        "lineage_utils.py",
+    ],
+)
+
+py_test(
+    name = "lineage_utils_test",
+    srcs = ["lineage_utils_test.py"],
+    deps = [
+        ":lineage_utils",
+        "//snowflake/ml/utils:connection_params",
     ],
 )
diff --git a/snowflake/ml/_internal/lineage/dataset_dataframe.py b/snowflake/ml/_internal/lineage/dataset_dataframe.py
diff --git a/snowflake/ml/_internal/lineage/lineage_utils.py b/snowflake/ml/_internal/lineage/lineage_utils.py
@@ -0,0 +1,95 @@
+import copy
+import functools
+from typing import Any, Callable, List
+
+from snowflake import snowpark
+from snowflake.ml._internal.lineage import data_source
+
+DATA_SOURCES_ATTR = "_data_sources"
+
+
+def _get_datasources(*args: Any) -> List[data_source.DataSource]:
+    """Helper method for extracting data sources attribute from DataFrames in an argument list"""
+    result = []
+    for arg in args:
+        srcs = getattr(arg, DATA_SOURCES_ATTR, None)
+        if isinstance(srcs, list) and all(isinstance(s, data_source.DataSource) for s in srcs):
+            result += srcs
+    return result
+
+
+def _wrap_func(
+    fn: Callable[..., snowpark.DataFrame], data_sources: List[data_source.DataSource]
+) -> Callable[..., snowpark.DataFrame]:
+    """Wrap a DataFrame transform function to propagate data_sources to derived DataFrames."""
+
+    @functools.wraps(fn)
+    def wrapped(*args: Any, **kwargs: Any) -> snowpark.DataFrame:
+        df = fn(*args, **kwargs)
+        patch_dataframe(df, data_sources=data_sources, inplace=True)
+        return df
+
+    return wrapped
+
+
+def patch_dataframe(
+    df: snowpark.DataFrame, data_sources: List[data_source.DataSource], inplace: bool = False
+) -> snowpark.DataFrame:
+    """
+    Monkey patch a DataFrame to attach the provided data_sources as an attribute of the DataFrame.
+    Also patches the DataFrame's transformation functions to propagate the new data sources attribute to
+    derived DataFrames.
+
+    Args:
+        df: DataFrame to be patched
+        data_sources: List of data sources for the DataFrame
+        inplace: If True, patches the DataFrame in-place. If False, creates a shallow copy of the DataFrame.
+
+    Returns:
+        Patched DataFrame
+    """
+    # Instance-level monkey-patches
+    funcs = [
+        "_with_plan",
+        "_lateral",
+        "group_by",
+        "group_by_grouping_sets",
+        "cube",
+        "pivot",
+        "rollup",
+        "cache_result",
+        "_to_df",  # RelationalGroupedDataFrame
+    ]
+    if not inplace:
+        df = copy.copy(df)
+    setattr(df, DATA_SOURCES_ATTR, data_sources)
+    for func in funcs:
+        fn = getattr(df, func, None)
+        if fn is not None:
+            setattr(df, func, _wrap_func(fn, data_sources=data_sources))
+    return df
+
+
+def _wrap_class_func(fn: Callable[..., snowpark.DataFrame]) -> Callable[..., snowpark.DataFrame]:
+    @functools.wraps(fn)
+    def wrapped(*args: Any, **kwargs: Any) -> snowpark.DataFrame:
+        df = fn(*args, **kwargs)
+        data_sources = _get_datasources(*args) + _get_datasources(*kwargs.values())
+        if data_sources:
+            patch_dataframe(df, data_sources, inplace=True)
+        return df
+
+    return wrapped
+
+
+# Class-level monkey-patches
+for klass, func_list in {
+    snowpark.DataFrame: [
+        "__copy__",
+    ],
+    snowpark.RelationalGroupedDataFrame: [],
+}.items():
+    assert isinstance(func_list, list)  # mypy
+    for func in func_list:
+        fn = getattr(klass, func)
+        setattr(klass, func, _wrap_class_func(fn))