Project import generated by Copybara. (#114)
GitOrigin-RevId: 516a6129d65f30b2dbfc2160bc41cc35c6f468a8

Co-authored-by: Snowflake Authors <[email protected]>
sfc-gh-anavalos and Snowflake Authors authored Aug 12, 2024
1 parent 123693a commit 2b044fc
Showing 144 changed files with 5,584 additions and 4,794 deletions.
35 changes: 34 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,31 @@
# Release History

## 1.6.0
## 1.6.1 (TBD)

### Bug Fixes

- Feature Store: Support large metadata blobs when generating a dataset.
- Feature Store: Added a hidden knob to `FeatureView`, passed via kwargs, for setting a customized
  `refresh_mode` (see the Feature Store sketch after this list).
- Registry: Fixed the error message in Model Version `run` when `function_name` is not specified and the model has
  multiple target methods.
- Cortex inference: `snowflake.cortex.Complete` now uses the REST API only for streaming, so the
  `use_rest_api_experimental` flag is no longer needed (see the streaming sketch after this list).
- Feature Store: Added a new API, `FeatureView.list_columns()`, which lists all column information (also covered in
  the Feature Store sketch below).
- Data: Fixed `DataFrame` ingestion with `ArrowIngestor`.
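
A minimal Feature Store sketch of the `refresh_mode` knob and `list_columns()` items above. The `FeatureStore` setup, the entity, and the Snowpark DataFrame `feature_df` are assumed to already exist, and since `refresh_mode` is a hidden knob, its exact spelling and accepted values are assumptions:

```python
from snowflake.ml.feature_store import FeatureView

# Assumes `my_entity` and the Snowpark DataFrame `feature_df` already exist.
fv = FeatureView(
    name="MY_FEATURE_VIEW",
    entities=[my_entity],
    feature_df=feature_df,
    refresh_freq="1 hour",
    refresh_mode="FULL",  # hidden knob passed through kwargs; value is an assumption
)

# New API per this release: lists all column information for the view.
print(fv.list_columns())
```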
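
And the streaming sketch: with this release, `stream=True` alone routes `snowflake.cortex.Complete` through the REST API, with no experimental flag. The model name is an illustrative choice, and an active Snowpark session is assumed:

```python
from snowflake.cortex import Complete

# stream=True now implies the REST backend; use_rest_api_experimental is gone.
for update in Complete(
    "snowflake-arctic",  # illustrative model choice
    "Summarize the benefits of columnar storage.",
    stream=True,
):
    print(update, end="")
```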

### New Features

- Enable `set_params` to set the parameters of the underlying sklearn estimator once the snowflake-ml model has been
  fit (see the first sketch after this list).
- Data: Add top-level exports for `DataConnector` and `DataSource` to `snowflake.ml.data`.
- Data: Add `snowflake.ml.data.ingestor_utils` module with utility functions helpful for `DataIngestor` implementations.
- Data: Add new `to_torch_dataset()` connector to `DataConnector` to replace the deprecated `DataPipe`
  (see the second sketch after this list).
- Registry: The `enable_explainability` option is set to True by default for XGBoost, LightGBM, and CatBoost models as
  a PuPr feature.
- Registry: Option to `enable_explainability` when registering SHAP-supported sklearn models.
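
A first sketch, for the `set_params` item; `XGBClassifier` and the column names are illustrative choices, and `training_df` is assumed to be an existing Snowpark DataFrame:

```python
from snowflake.ml.modeling.xgboost import XGBClassifier

clf = XGBClassifier(input_cols=["F1", "F2"], label_cols=["LABEL"])
clf.fit(training_df)  # `training_df` assumed to exist

# With this release, set_params also propagates to the already-fitted
# underlying sklearn-compatible estimator.
clf.set_params(n_estimators=200)
```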
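
And a second sketch, for `to_torch_dataset()`; it assumes an existing Snowpark DataFrame `df`, the new top-level `snowflake.ml.data` exports, and PyTorch installed:

```python
from torch.utils.data import DataLoader

from snowflake.ml.data import DataConnector  # new top-level export

connector = DataConnector.from_dataframe(df)  # `df` assumed to exist
torch_ds = connector.to_torch_dataset()  # replaces the deprecated DataPipe path
# batch_size=None because the dataset is assumed to yield ready-made batches.
loader = DataLoader(torch_ds, batch_size=None)
print(next(iter(loader)))
```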

### Behavior Changes

## 1.6.0 (2024-07-29)

### Bug Fixes

@@ -29,6 +54,14 @@
distributed_hpo_trainer.ENABLE_EFFICIENT_MEMORY_USAGE = False
```
- Registry: Option to `enable_explainability` when registering LightGBM models as a pre-PuPr feature.
- Data: Add new `snowflake.ml.data` preview module which contains data reading utilities like `DataConnector`.
  - `DataConnector` provides efficient connectors from Snowpark `DataFrame` and Snowpark ML `Dataset` to external
    frameworks like PyTorch, TensorFlow, and Pandas. Create `DataConnector` instances using the classmethod
    constructors `DataConnector.from_dataset()` and `DataConnector.from_dataframe()` (see the sketch after this list).
- Data: Add new `DataConnector.from_sources()` classmethod constructor for constructing from `DataSource` objects.
- Data: Add new `ingestor_class` arg to `DataConnector` classmethod constructors for easier `DataIngestor` injection.
- Dataset: `DatasetReader` now subclasses the new `DataConnector` class.
  - Add optional `limit` arg to `DatasetReader.to_pandas()`.
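
A combined sketch of the `DataConnector` items above. A registered `Dataset` `ds` and a Snowpark DataFrame `df` are assumed to exist, and `ds.read` as the `DatasetReader` accessor is also an assumption:

```python
# In 1.6.0 DataConnector lives in a submodule; the top-level export arrives in 1.6.1.
from snowflake.ml.data.data_connector import DataConnector

# Classmethod constructors from a Snowpark DataFrame and a Snowflake Dataset.
conn_a = DataConnector.from_dataframe(df)
conn_b = DataConnector.from_dataset(ds)

# DatasetReader now subclasses DataConnector, and to_pandas() takes a limit.
pdf = ds.read.to_pandas(limit=100)
```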

### Behavior Changes

1 change: 1 addition & 0 deletions bazel/py_rules.bzl
@@ -256,6 +256,7 @@ def _py_wheel_impl(ctx):
ctx.file.pyproject_toml.path,
execution_root_relative_path,
"--wheel",
"--sdist",
"--outdir",
wheel_output_dir.path,
],
2 changes: 1 addition & 1 deletion ci/conda_recipe/meta.yaml
@@ -17,7 +17,7 @@ build:
noarch: python
package:
name: snowflake-ml-python
version: 1.6.0
version: 1.6.1
requirements:
build:
- python
1 change: 1 addition & 0 deletions ci/targets/quarantine/prod3.txt
@@ -2,3 +2,4 @@
//tests/integ/snowflake/ml/registry:model_registry_snowservice_integ_test
//tests/integ/snowflake/ml/model:spcs_llm_model_integ_test
//tests/integ/snowflake/ml/extra_tests:xgboost_external_memory_training_test
//tests/integ/snowflake/ml/registry:model_registry_snowservice_merge_gate_integ_test
2 changes: 1 addition & 1 deletion codegen/build_file_autogen.py
@@ -14,7 +14,7 @@
from absl import app

from codegen import sklearn_wrapper_autogen as swa
from snowflake.ml._internal.snowpark_pandas.snowpark_pandas import imports
from snowflake.ml._internal.snowpark_pandas import imports


@dataclass(frozen=True)
40 changes: 7 additions & 33 deletions snowflake/cortex/_complete.py
@@ -90,7 +90,6 @@ def _call_complete_rest(
prompt: Union[str, List[ConversationMessage]],
options: Optional[CompleteOptions] = None,
session: Optional[snowpark.Session] = None,
stream: bool = False,
) -> requests.Response:
session = session or context.get_active_session()
if session is None:
@@ -121,7 +120,7 @@

data = {
"model": model,
"stream": stream,
"stream": True,
}
if isinstance(prompt, List):
data["messages"] = prompt
@@ -137,32 +136,15 @@
if "top_p" in options:
data["top_p"] = options["top_p"]

logger.debug(f"making POST request to {url} (model={model}, stream={stream})")
logger.debug(f"making POST request to {url} (model={model})")
return requests.post(
url,
json=data,
headers=headers,
stream=stream,
stream=True,
)


def _process_rest_response(
response: requests.Response,
stream: bool = False,
deadline: Optional[float] = None,
) -> Union[str, Iterator[str]]:
if stream:
return _return_stream_response(response, deadline)

try:
content = response.json()["choices"][0]["message"]["content"]
assert isinstance(content, str)
return content
except (KeyError, IndexError, AssertionError) as e:
# Unlike the streaming case, errors are not ignored because a message must be returned.
raise ResponseParseException("Failed to parse message from response.") from e


def _return_stream_response(response: requests.Response, deadline: Optional[float]) -> Iterator[str]:
client = SSEClient(response)
for event in client.events():
@@ -243,7 +225,6 @@ def _complete_impl(
prompt: Union[str, List[ConversationMessage], snowpark.Column],
options: Optional[CompleteOptions] = None,
session: Optional[snowpark.Session] = None,
use_rest_api_experimental: bool = False,
stream: bool = False,
function: str = "snowflake.cortex.complete",
timeout: Optional[float] = None,
@@ -253,16 +234,14 @@
raise ValueError('only one of "timeout" and "deadline" must be set')
if timeout is not None:
deadline = time.time() + timeout
if use_rest_api_experimental:
if stream:
if not isinstance(model, str):
raise ValueError("in REST mode, 'model' must be a string")
if not isinstance(prompt, str) and not isinstance(prompt, List):
raise ValueError("in REST mode, 'prompt' must be a string or a list of ConversationMessage")
response = _call_complete_rest(model, prompt, options, session=session, stream=stream, deadline=deadline)
response = _call_complete_rest(model, prompt, options, session=session, deadline=deadline)
assert response.status_code >= 200 and response.status_code < 300
return _process_rest_response(response, stream=stream)
if stream is True:
raise ValueError("streaming can only be enabled in REST mode, set use_rest_api_experimental=True")
return _return_stream_response(response, deadline)
return _complete_sql_impl(function, model, prompt, options, session)


@@ -275,7 +254,6 @@ def Complete(
*,
options: Optional[CompleteOptions] = None,
session: Optional[snowpark.Session] = None,
use_rest_api_experimental: bool = False,
stream: bool = False,
timeout: Optional[float] = None,
deadline: Optional[float] = None,
@@ -287,16 +265,13 @@
prompt: A Column of prompts to send to the LLM.
options: An instance of snowflake.cortex.CompleteOptions
session: The snowpark session to use. Will be inferred by context if not specified.
use_rest_api_experimental (bool): Toggles between the use of SQL and REST implementation. This feature is
experimental and can be removed at any time.
stream (bool): Enables streaming. When enabled, a generator function is returned that provides the streaming
output as it is received. Each update is a string containing the new text content since the previous update.
The use of streaming requires the experimental use_rest_api_experimental flag to be enabled.
timeout (float): Timeout in seconds to retry failed REST requests.
deadline (float): Time in seconds since the epoch (as returned by time.time()) to retry failed REST requests.
Raises:
ValueError: If `stream` is set to True and `use_rest_api_experimental` is set to False.
ValueError: incorrect argument.
Returns:
A column of string responses.
@@ -307,7 +282,6 @@
prompt,
options=options,
session=session,
use_rest_api_experimental=use_rest_api_experimental,
stream=stream,
timeout=timeout,
deadline=deadline,
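
Per the updated implementation above, streaming accepts either a string prompt or a list of `ConversationMessage`. A hedged usage sketch; the role/content message shape is inferred from the surrounding code, and the model name is illustrative:

```python
from snowflake.cortex import Complete

messages = [
    {"role": "system", "content": "You are a terse assistant."},
    {"role": "user", "content": "Name three Snowflake features."},
]

# Only one of timeout/deadline may be set; failed REST requests are retried
# until it expires.
for chunk in Complete("snowflake-arctic", messages, stream=True, timeout=60.0):
    print(chunk, end="")
```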