MAINT add parameter validation using BaseEstimator #958

Draft: wants to merge 1 commit into base: main
26 changes: 22 additions & 4 deletions skrub/_agg_joiner.py
@@ -7,7 +7,7 @@
"""

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.base import BaseEstimator, TransformerMixin, _fit_context
from sklearn.utils.multiclass import type_of_target
from sklearn.utils.validation import check_is_fitted

@@ -67,7 +67,7 @@ class AggJoiner(TransformerMixin, BaseEstimator):
The placeholder string "X" can be provided to perform
self-aggregation on the input data.

key : str, default=None
key : str or iterable of str, default=None
Member
As a side note, we use "iterable" everywhere, but I wonder if we should say "sequence" (or "list"?) because it is more understandable for users who are less familiar with Python/programming jargon. It is also arguably a bit more accurate: we iterate over these parameters several times and sometimes index them, so some iterables would not be appropriate.

Contributor
+1 to discuss this
I had the same question on the Joiner PR

Member Author (@glemaitre, Jun 18, 2024)
For me, this should be a list (while loosely accepting a tuple as well); that is friendlier than saying "sequence", which is only meaningful to Python developers.

Member
I'm fine with "list" -- and we already use that term in a bunch of places

The column name to use for both `main_key` and `aux_key` when they
are the same. Provide either `key` or both `main_key` and `aux_key`.
If `key` is an iterable, we will perform a multi-column join.
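A quick illustration of the point raised in the review thread above (the column names below are made up): a one-shot iterable such as a generator is consumed on its first pass and cannot be indexed, so a parameter that is iterated more than once, or indexed, really needs to be a list (or tuple) rather than an arbitrary iterable.

```python
# Illustrative only, not part of this diff.
keys = (c for c in ["userId", "movieId"])  # a generator is a valid "iterable"
print(list(keys))  # first pass: ['userId', 'movieId']
print(list(keys))  # second pass: [] -- the generator is already exhausted
# keys[0] would raise TypeError: 'generator' object is not subscriptable
```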
@@ -138,6 +138,16 @@ class AggJoiner(TransformerMixin, BaseEstimator):
1 2 NY JFK DL 80.00...
"""

_parameter_constraints = {
"aux_table": "no_validation", # we should have a DataFrameLike constraint
"key": [str, "array-like", None],
"main_key": [str, "array-like", None],
"aux_key": [str, "array-like", None],
"cols": [str, "array-like", None],
"operations": [str, "array-like", None],
"suffix": [str],
}

def __init__(
self,
aux_table,
@@ -244,6 +254,7 @@ def _check_inputs(self, X):
if not isinstance(self.suffix, str):
raise ValueError(f"'suffix' must be a string. Got {self.suffix}")

@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y=None):
"""Aggregate auxiliary table based on the main keys.

@@ -318,7 +329,7 @@ class AggTarget(TransformerMixin, BaseEstimator):
aggregated using each key separately, then each aggregation of
the target will be joined on the main table.

operation : str or iterable of str, optional
operation : str or iterable of str, default=None
Aggregation operations to perform on the target.

numerical : {"sum", "mean", "std", "min", "max", "hist", "value_counts"}

If set to None (the default), ["mean", "mode"] will be used.

suffix : str, optional
suffix : str, default=None
The suffix to append to the columns of the target table if the join
results in duplicates columns.
If set to None, "_target" is used.
@@ -370,6 +381,12 @@ class AggTarget(TransformerMixin, BaseEstimator):
5 6 2 ... 1 1.000000
"""

_parameter_constraints = {
"main_key": [str, "array-like", None],
"operations": [str, "array-like", None],
"suffix": [str, None],
}

def __init__(
self,
main_key,
@@ -380,6 +397,7 @@ def __init__(
self.operation = operation
self.suffix = suffix

@_fit_context(prefer_skip_nested_validation=True)
def fit(self, X, y):
"""Aggregate the target ``y`` based on keys from ``X``.

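For readers unfamiliar with the scikit-learn machinery this PR relies on, here is a minimal, hypothetical sketch (`ToyTransformer` and its parameters are invented; only `_parameter_constraints`, `_fit_context` and `InvalidParameterError` are scikit-learn names): the class-level `_parameter_constraints` dict declares the accepted values for each constructor parameter, and the `_fit_context` decorator validates them when `fit` is called, raising `InvalidParameterError` on a mismatch.

```python
# Hypothetical sketch of scikit-learn's declarative parameter validation.
from sklearn.base import BaseEstimator, TransformerMixin, _fit_context
from sklearn.utils._param_validation import InvalidParameterError, StrOptions


class ToyTransformer(TransformerMixin, BaseEstimator):
    # Each key maps a constructor parameter to its list of accepted constraints.
    _parameter_constraints = {
        "suffix": [str],
        "resolution": [StrOptions({"year", "month", "day", "hour"}), None],
    }

    def __init__(self, suffix="_agg", resolution="hour"):
        self.suffix = suffix
        self.resolution = resolution

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        # Parameters have already been validated when we reach this point.
        return self

    def transform(self, X):
        return X


ToyTransformer(suffix="_ok").fit([[0.0]])  # passes validation
try:
    ToyTransformer(suffix=123).fit([[0.0]])  # wrong type for 'suffix'
except InvalidParameterError as exc:
    print(exc)
```

The "no_validation" marker used for `aux_table` above simply skips validation for that parameter, which is why the inline comment suggests a dedicated DataFrameLike constraint as a follow-up.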
17 changes: 9 additions & 8 deletions skrub/_datetime_encoder.py
@@ -1,7 +1,9 @@
from datetime import datetime, timezone

import pandas as pd
from sklearn.base import _fit_context
from sklearn.utils.validation import check_is_fitted
from sklearn.utils._param_validation import StrOptions

try:
import polars as pl
@@ -255,11 +257,18 @@ class DatetimeEncoder(SingleColumnTransformer):
timezone used during ``fit`` and that we get the same result for "hour".
""" # noqa: E501

_parameter_constraints = {
"resolution": [StrOptions(set(_TIME_LEVELS)), None],
"add_weekday": ["boolean"],
"add_total_seconds": ["boolean"],
}

def __init__(self, resolution="hour", add_weekday=False, add_total_seconds=True):
self.resolution = resolution
self.add_weekday = add_weekday
self.add_total_seconds = add_total_seconds

@_fit_context(prefer_skip_nested_validation=True)
def fit_transform(self, column, y=None):
"""Fit the encoder and transform a column.

@@ -277,7 +286,6 @@ def fit_transform(self, column, y=None):
The extracted features.
"""
del y
self._check_params()
if not sbd.is_any_date(column):
raise RejectColumn(
f"Column {sbd.name(column)!r} does not have Date or Datetime dtype."
@@ -316,10 +324,3 @@ def transform(self, column):
extracted = sbd.to_float32(extracted)
all_extracted.append(extracted)
return sbd.make_dataframe_like(column, all_extracted)

def _check_params(self):
allowed = _TIME_LEVELS + [None]
if self.resolution not in allowed:
raise ValueError(
f"'resolution' options are {allowed}, got {self.resolution!r}."
)
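The removed `_check_params` duplicated what `StrOptions` already provides. Here is a rough sketch of the constraint's behaviour (the option set below is an assumption standing in for `_TIME_LEVELS`); together with `None` in the constraint list, it reproduces the old `_TIME_LEVELS + [None]` check:

```python
# Sketch only: StrOptions does the membership test _check_params did by hand.
from sklearn.utils._param_validation import StrOptions

resolution_constraint = StrOptions({"year", "month", "day", "hour"})
print(resolution_constraint.is_satisfied_by("hour"))   # True
print(resolution_constraint.is_satisfied_by("hello"))  # False -> fit_transform()
                                                       # raises InvalidParameterError
```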
9 changes: 6 additions & 3 deletions skrub/tests/test_agg_joiner.py
@@ -1,6 +1,8 @@
import pandas as pd
import pytest

from sklearn.utils._param_validation import InvalidParameterError

from skrub import _dataframe as sbd
from skrub._agg_joiner import AggJoiner, AggTarget, split_num_categ_operations
from skrub._dataframe._testing_utils import assert_frame_equal
@@ -216,7 +218,7 @@ def test_too_many_suffixes(df_module, main_table):
cols="rating",
suffix=["_user", "_movie", "_tag"],
)
with pytest.raises(ValueError, match=r"(?='suffix' must be a string.*)"):
with pytest.raises(InvalidParameterError):
agg_joiner.fit(main_table)


@@ -447,7 +449,7 @@ def test_no_aggregation_exception(main_table):
main_key="userId",
operation=[],
)
with pytest.raises(ValueError, match=r"(?=.*No aggregation)"):
with pytest.raises(ValueError, match="No aggregation to perform"):
agg_target.fit(main_table, y)


@@ -456,5 +458,6 @@ def test_wrong_args_ops(main_table):
main_key="userId",
operation="mean(2)",
)
with pytest.raises(ValueError, match=r"(?=.*'mean')(?=.*argument)"):
err_msg = "Operator 'mean' doesn't take any argument, got 2"
with pytest.raises(ValueError, match=err_msg):
agg_target.fit(main_table, y)
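The distinction behind these test edits, sketched with a hypothetical estimator (`ToyAgg` is invented; nothing below comes from skrub): violations of the declared constraints are asserted as `InvalidParameterError` without matching the message, while errors raised by the estimator's own checks keep a plain `ValueError` with an explicit `match` string.

```python
# Hypothetical sketch of the testing pattern used above.
import pytest
from sklearn.base import BaseEstimator, _fit_context
from sklearn.utils._param_validation import InvalidParameterError


class ToyAgg(BaseEstimator):
    _parameter_constraints = {"suffix": [str]}

    def __init__(self, suffix="_agg", operation=("mean",)):
        self.suffix = suffix
        self.operation = operation

    @_fit_context(prefer_skip_nested_validation=True)
    def fit(self, X, y=None):
        if len(self.operation) == 0:
            raise ValueError("No aggregation to perform")
        return self


def test_bad_type():
    with pytest.raises(InvalidParameterError):  # no message matching needed
        ToyAgg(suffix=["_user", "_movie"]).fit([[0.0]])


def test_bad_value():
    with pytest.raises(ValueError, match="No aggregation to perform"):
        ToyAgg(operation=[]).fit([[0.0]])
```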
4 changes: 3 additions & 1 deletion skrub/tests/test_datetime_encoder.py
@@ -2,6 +2,8 @@

import pytest

from sklearn.utils._param_validation import InvalidParameterError

from skrub import DatetimeEncoder
from skrub import _dataframe as sbd
from skrub import _selectors as s
@@ -153,7 +155,7 @@ def test_time_not_extracted_from_date_col(datetime_cols):


def test_invalid_resolution(datetime_cols):
with pytest.raises(ValueError, match=r".*'resolution' options are"):
with pytest.raises(InvalidParameterError):
DatetimeEncoder(resolution="hello").fit(datetime_cols.datetime)

