# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import pickle
import re
import warnings

import numpy as np
import pytest
import scipy.sparse as sp
from numpy.testing import assert_allclose

import sklearn
from sklearn import config_context, datasets
from sklearn.base import (
    BaseEstimator,
    OutlierMixin,
    TransformerMixin,
    clone,
    is_classifier,
    is_clusterer,
    is_outlier_detector,
    is_regressor,
)
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn.exceptions import InconsistentVersionWarning
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.utils._mocking import MockDataFrame
from sklearn.utils._set_output import _get_output_config
from sklearn.utils._testing import (
    _convert_container,
    assert_array_equal,
)
from sklearn.utils.validation import _check_n_features, validate_data

#############################################################################
# A few test classes


class MyEstimator(BaseEstimator):
    def __init__(self, l1=0, empty=None):
        self.l1 = l1
        self.empty = empty


class K(BaseEstimator):
    def __init__(self, c=None, d=None):
        self.c = c
        self.d = d


class T(BaseEstimator):
    def __init__(self, a=None, b=None):
        self.a = a
        self.b = b


class NaNTag(BaseEstimator):
    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = True
        return tags


class NoNaNTag(BaseEstimator):
    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = False
        return tags


class OverrideTag(NaNTag):
    def __sklearn_tags__(self):
        tags = super().__sklearn_tags__()
        tags.input_tags.allow_nan = False
        return tags


class DiamondOverwriteTag(NaNTag, NoNaNTag):
    pass


class InheritDiamondOverwriteTag(DiamondOverwriteTag):
    pass


class ModifyInitParams(BaseEstimator):
    """Deprecated behavior.

    Equal parameters but with a type cast: ``__init__`` copies its argument,
    so the stored attribute is not the object that was passed in, which
    ``clone`` must reject.
    """

    def __init__(self, a=np.array([0])):
        self.a = a.copy()


class Buggy(BaseEstimator):
    "A buggy estimator that does not set its parameters right."

    def __init__(self, a=None):
        self.a = 1


class NoEstimator:
    def __init__(self):
        pass

    def fit(self, X=None, y=None):
        return self

    def predict(self, X=None):
        return None


class VargEstimator(BaseEstimator):
    """scikit-learn estimators shouldn't have vargs."""

    def __init__(self, *vargs):
        pass


#############################################################################
# The tests


def test_clone():
    # Tests that clone creates a correct deep copy.
    # We create an estimator, make a copy of its original state
    # (which, in this case, is the current state of the estimator),
    # and check that the obtained copy is a correct deep copy.
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert selector is not new_selector
    assert selector.get_params() == new_selector.get_params()

    selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    new_selector = clone(selector)
    assert selector is not new_selector

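# Illustrative sketch (not scikit-learn's actual implementation): for a
# compliant estimator, ``clone`` is conceptually a reconstruction from
# ``get_params``. The real ``clone`` additionally recurses into
# estimator-valued parameters and checks that ``__init__`` stores its
# arguments unmodified; this naive version skips both.
def _naive_clone(estimator):
    params = estimator.get_params(deep=False)
    return type(estimator)(**params)
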
def test_clone_2():
    # Tests that clone doesn't copy everything.
    # We first create an estimator, give it an own attribute, and
    # make a copy of its original state. Then we check that the copy doesn't
    # have the specific attribute we manually added to the initial estimator.
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    selector.own_attribute = "test"
    new_selector = clone(selector)
    assert not hasattr(new_selector, "own_attribute")


def test_clone_buggy():
    # Check that clone raises an error on buggy estimators.
    buggy = Buggy()
    buggy.a = 2
    with pytest.raises(RuntimeError):
        clone(buggy)

    no_estimator = NoEstimator()
    with pytest.raises(TypeError):
        clone(no_estimator)

    varg_est = VargEstimator()
    with pytest.raises(RuntimeError):
        clone(varg_est)

    est = ModifyInitParams()
    with pytest.raises(RuntimeError):
        clone(est)


def test_clone_empty_array():
    # Regression test for cloning estimators with empty arrays
    clf = MyEstimator(empty=np.array([]))
    clf2 = clone(clf)
    assert_array_equal(clf.empty, clf2.empty)

    clf = MyEstimator(empty=sp.csr_matrix(np.array([[0]])))
    clf2 = clone(clf)
    assert_array_equal(clf.empty.data, clf2.empty.data)


def test_clone_nan():
    # Regression test for cloning estimators with default parameter as np.nan
    clf = MyEstimator(empty=np.nan)
    clf2 = clone(clf)

    assert clf.empty is clf2.empty


def test_clone_dict():
    # test that clone creates a clone of a dict
    orig = {"a": MyEstimator()}
    cloned = clone(orig)
    assert orig["a"] is not cloned["a"]


def test_clone_sparse_matrices():
    sparse_matrix_classes = [
        cls
        for name in dir(sp)
        if name.endswith("_matrix") and type(cls := getattr(sp, name)) is type
    ]

    for cls in sparse_matrix_classes:
        sparse_matrix = cls(np.eye(5))
        clf = MyEstimator(empty=sparse_matrix)
        clf_cloned = clone(clf)
        assert clf.empty.__class__ is clf_cloned.empty.__class__
        assert_array_equal(clf.empty.toarray(), clf_cloned.empty.toarray())


def test_clone_estimator_types():
    # Check that clone works for parameters that are types rather than
    # instances
    clf = MyEstimator(empty=MyEstimator)
    clf2 = clone(clf)

    assert clf.empty is clf2.empty


def test_clone_class_rather_than_instance():
    # Check that clone raises expected error message when
    # cloning class rather than instance
    msg = "You should provide an instance of scikit-learn estimator"
    with pytest.raises(TypeError, match=msg):
        clone(MyEstimator)

def test_repr():
    # Smoke test the repr of the base estimator.
    my_estimator = MyEstimator()
    repr(my_estimator)

    test = T(K(), K())
    assert repr(test) == "T(a=K(), b=K())"

    some_est = T(a=["long_params"] * 1000)
    assert len(repr(some_est)) == 485


def test_str():
    # Smoke test the str of the base estimator
    my_estimator = MyEstimator()
    str(my_estimator)


def test_get_params():
    test = T(K(), K)

    assert "a__d" in test.get_params(deep=True)
    assert "a__d" not in test.get_params(deep=False)

    test.set_params(a__d=2)
    assert test.a.d == 2

    with pytest.raises(ValueError):
        test.set_params(a__a=2)


# TODO(1.8): Remove this test when the deprecation is removed
def test_is_estimator_type_class():
    with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"):
        assert is_classifier(SVC)
    with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"):
        assert is_regressor(SVR)
    with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"):
        assert is_clusterer(KMeans)
    with pytest.warns(FutureWarning, match="passing a class to.*is deprecated"):
        assert is_outlier_detector(IsolationForest)


@pytest.mark.parametrize(
    "estimator, expected_result",
    [
        (SVC(), True),
        (GridSearchCV(SVC(), {"C": [0.1, 1]}), True),
        (Pipeline([("svc", SVC())]), True),
        (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), True),
        (SVR(), False),
        (GridSearchCV(SVR(), {"C": [0.1, 1]}), False),
        (Pipeline([("svr", SVR())]), False),
        (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), False),
    ],
)
def test_is_classifier(estimator, expected_result):
    assert is_classifier(estimator) == expected_result


@pytest.mark.parametrize(
    "estimator, expected_result",
    [
        (SVR(), True),
        (GridSearchCV(SVR(), {"C": [0.1, 1]}), True),
        (Pipeline([("svr", SVR())]), True),
        (Pipeline([("svr_cv", GridSearchCV(SVR(), {"C": [0.1, 1]}))]), True),
        (SVC(), False),
        (GridSearchCV(SVC(), {"C": [0.1, 1]}), False),
        (Pipeline([("svc", SVC())]), False),
        (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False),
    ],
)
def test_is_regressor(estimator, expected_result):
    assert is_regressor(estimator) == expected_result


@pytest.mark.parametrize(
    "estimator, expected_result",
    [
        (KMeans(), True),
        (GridSearchCV(KMeans(), {"n_clusters": [3, 8]}), True),
        (Pipeline([("km", KMeans())]), True),
        (Pipeline([("km_cv", GridSearchCV(KMeans(), {"n_clusters": [3, 8]}))]), True),
        (SVC(), False),
        (GridSearchCV(SVC(), {"C": [0.1, 1]}), False),
        (Pipeline([("svc", SVC())]), False),
        (Pipeline([("svc_cv", GridSearchCV(SVC(), {"C": [0.1, 1]}))]), False),
    ],
)
def test_is_clusterer(estimator, expected_result):
    assert is_clusterer(estimator) == expected_result


def test_set_params():
    # test nested estimator parameter setting
    clf = Pipeline([("svc", SVC())])

    # non-existing parameter in svc
    with pytest.raises(ValueError):
        clf.set_params(svc__stupid_param=True)

    # non-existing parameter of pipeline
    with pytest.raises(ValueError):
        clf.set_params(svm__stupid_param=True)

    # we don't currently catch if the things in pipeline are estimators
    # bad_pipeline = Pipeline([("bad", NoEstimator())])
    # with pytest.raises(AttributeError):
    #     bad_pipeline.set_params(bad__stupid_param=True)

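# Usage sketch of the ``<component>__<parameter>`` routing exercised above
# (hedged example; the pipeline and values are arbitrary): nested estimators
# expose their parameters through double-underscore keys in both
# ``get_params`` and ``set_params``.
def _nested_params_example():
    pipe = Pipeline([("scaler", StandardScaler()), ("svc", SVC())])
    pipe.set_params(svc__C=10.0)
    assert pipe.get_params()["svc__C"] == 10.0
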
def test_set_params_passes_all_parameters():
    # Make sure all parameters are passed together to set_params
    # of nested estimator. Regression test for #9944

    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            super().set_params(**kwargs)
            # expected_kwargs is in test scope
            assert kwargs == expected_kwargs
            return self

    expected_kwargs = {"max_depth": 5, "min_samples_leaf": 2}
    for est in [
        Pipeline([("estimator", TestDecisionTree())]),
        GridSearchCV(TestDecisionTree(), {}),
    ]:
        est.set_params(estimator__max_depth=5, estimator__min_samples_leaf=2)


def test_set_params_updates_valid_params():
    # Check that set_params tries to set SVC().C, not
    # DecisionTreeClassifier().C
    gscv = GridSearchCV(DecisionTreeClassifier(), {})
    gscv.set_params(estimator=SVC(), estimator__C=42.0)
    assert gscv.estimator.C == 42.0


@pytest.mark.parametrize(
    "tree,dataset",
    [
        (
            DecisionTreeClassifier(max_depth=2, random_state=0),
            datasets.make_classification(random_state=0),
        ),
        (
            DecisionTreeRegressor(max_depth=2, random_state=0),
            datasets.make_regression(random_state=0),
        ),
    ],
)
def test_score_sample_weight(tree, dataset):
    rng = np.random.RandomState(0)
    # check that the score with and without sample weights are different
    X, y = dataset

    tree.fit(X, y)
    # generate random sample weights
    sample_weight = rng.randint(1, 10, size=len(y))
    score_unweighted = tree.score(X, y)
    score_weighted = tree.score(X, y, sample_weight=sample_weight)
    msg = "Unweighted and weighted scores are unexpectedly equal"
    assert score_unweighted != score_weighted, msg


def test_clone_pandas_dataframe():
    class DummyEstimator(TransformerMixin, BaseEstimator):
        """This is a dummy class for generating numerical features.

        This feature extractor extracts numerical features from a pandas data
        frame.

        Parameters
        ----------
        df : pandas data frame
            The pandas data frame parameter.
        """

        def __init__(self, df=None, scalar_param=1):
            self.df = df
            self.scalar_param = scalar_param

        def fit(self, X, y=None):
            pass

        def transform(self, X):
            pass

    # build and clone estimator
    d = np.arange(10)
    df = MockDataFrame(d)
    e = DummyEstimator(df, scalar_param=1)
    cloned_e = clone(e)

    # the test
    assert (e.df == cloned_e.df).values.all()
    assert e.scalar_param == cloned_e.scalar_param


def test_clone_protocol():
    """Checks that clone works with `__sklearn_clone__` protocol."""

    class FrozenEstimator(BaseEstimator):
        def __init__(self, fitted_estimator):
            self.fitted_estimator = fitted_estimator

        def __getattr__(self, name):
            return getattr(self.fitted_estimator, name)

        def __sklearn_clone__(self):
            return self

        def fit(self, *args, **kwargs):
            return self

        def fit_transform(self, *args, **kwargs):
            return self.fitted_estimator.transform(*args, **kwargs)

    X = np.array([[-1, -1], [-2, -1], [-3, -2]])
    pca = PCA().fit(X)
    components = pca.components_

    frozen_pca = FrozenEstimator(pca)
    assert_allclose(frozen_pca.components_, components)

    # Calling PCA methods such as `get_feature_names_out` still works
    assert_array_equal(frozen_pca.get_feature_names_out(), pca.get_feature_names_out())

    # Fitting on new data does not alter `components_`
    X_new = np.asarray([[-1, 2], [3, 4], [1, 2]])
    frozen_pca.fit(X_new)
    assert_allclose(frozen_pca.components_, components)

    # `fit_transform` does not alter state
    frozen_pca.fit_transform(X_new)
    assert_allclose(frozen_pca.components_, components)

    # Cloning the estimator is a no-op
    clone_frozen_pca = clone(frozen_pca)
    assert clone_frozen_pca is frozen_pca
    assert_allclose(clone_frozen_pca.components_, components)

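# Hedged companion sketch to test_clone_protocol (the names here are
# arbitrary): any object can opt out of the default reconstruct-from-
# ``get_params`` behaviour by overriding ``__sklearn_clone__``, exactly as
# ``FrozenEstimator`` does above.
def _clone_protocol_example():
    class Singleton(BaseEstimator):
        def __sklearn_clone__(self):
            # Returning ``self`` makes ``clone`` a no-op for this object.
            return self

    obj = Singleton()
    assert clone(obj) is obj
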
def test_pickle_version_warning_is_not_raised_with_matching_version():
    iris = datasets.load_iris()
    tree = DecisionTreeClassifier().fit(iris.data, iris.target)
    tree_pickle = pickle.dumps(tree)
    assert b"_sklearn_version" in tree_pickle

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        tree_restored = pickle.loads(tree_pickle)

    # test that we can predict with the restored decision tree classifier
    score_of_original = tree.score(iris.data, iris.target)
    score_of_restored = tree_restored.score(iris.data, iris.target)
    assert score_of_original == score_of_restored


class TreeBadVersion(DecisionTreeClassifier):
    def __getstate__(self):
        return dict(self.__dict__.items(), _sklearn_version="something")


pickle_error_message = (
    "Trying to unpickle estimator {estimator} from "
    "version {old_version} when using version "
    "{current_version}. This might "
    "lead to breaking code or invalid results. "
    "Use at your own risk."
)


def test_pickle_version_warning_is_issued_upon_different_version():
    iris = datasets.load_iris()
    tree = TreeBadVersion().fit(iris.data, iris.target)
    tree_pickle_other = pickle.dumps(tree)
    message = pickle_error_message.format(
        estimator="TreeBadVersion",
        old_version="something",
        current_version=sklearn.__version__,
    )
    with pytest.warns(UserWarning, match=message) as warning_record:
        pickle.loads(tree_pickle_other)

    message = warning_record.list[0].message
    assert isinstance(message, InconsistentVersionWarning)
    assert message.estimator_name == "TreeBadVersion"
    assert message.original_sklearn_version == "something"
    assert message.current_sklearn_version == sklearn.__version__


class TreeNoVersion(DecisionTreeClassifier):
    def __getstate__(self):
        return self.__dict__


def test_pickle_version_warning_is_issued_when_no_version_info_in_pickle():
    iris = datasets.load_iris()
    # TreeNoVersion has no getstate, like pre-0.18
    tree = TreeNoVersion().fit(iris.data, iris.target)

    tree_pickle_noversion = pickle.dumps(tree)
    assert b"_sklearn_version" not in tree_pickle_noversion
    message = pickle_error_message.format(
        estimator="TreeNoVersion",
        old_version="pre-0.18",
        current_version=sklearn.__version__,
    )
    # check we got the warning about using pre-0.18 pickle
    with pytest.warns(UserWarning, match=message):
        pickle.loads(tree_pickle_noversion)


def test_pickle_version_no_warning_is_issued_with_non_sklearn_estimator():
    iris = datasets.load_iris()
    tree = TreeNoVersion().fit(iris.data, iris.target)
    tree_pickle_noversion = pickle.dumps(tree)
    try:
        module_backup = TreeNoVersion.__module__
        TreeNoVersion.__module__ = "notsklearn"

        with warnings.catch_warnings():
            warnings.simplefilter("error")
            pickle.loads(tree_pickle_noversion)
    finally:
        TreeNoVersion.__module__ = module_backup

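# Hedged usage sketch: downstream code can escalate the version-mismatch
# warning exercised above into a hard error when loading old pickles.
# ``InconsistentVersionWarning`` is the warning class asserted on in
# test_pickle_version_warning_is_issued_upon_different_version.
def _strict_unpickle(payload):
    with warnings.catch_warnings():
        warnings.simplefilter("error", InconsistentVersionWarning)
        return pickle.loads(payload)
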
class DontPickleAttributeMixin:
    def __getstate__(self):
        data = self.__dict__.copy()
        data["_attribute_not_pickled"] = None
        return data

    def __setstate__(self, state):
        state["_restored"] = True
        self.__dict__.update(state)


class MultiInheritanceEstimator(DontPickleAttributeMixin, BaseEstimator):
    def __init__(self, attribute_pickled=5):
        self.attribute_pickled = attribute_pickled
        self._attribute_not_pickled = None


def test_pickling_when_getstate_is_overwritten_by_mixin():
    estimator = MultiInheritanceEstimator()
    estimator._attribute_not_pickled = "this attribute should not be pickled"

    serialized = pickle.dumps(estimator)
    estimator_restored = pickle.loads(serialized)
    assert estimator_restored.attribute_pickled == 5
    assert estimator_restored._attribute_not_pickled is None
    assert estimator_restored._restored


def test_pickling_when_getstate_is_overwritten_by_mixin_outside_of_sklearn():
    try:
        estimator = MultiInheritanceEstimator()
        text = "this attribute should not be pickled"
        estimator._attribute_not_pickled = text
        old_mod = type(estimator).__module__
        type(estimator).__module__ = "notsklearn"

        serialized = estimator.__getstate__()
        assert serialized == {"_attribute_not_pickled": None, "attribute_pickled": 5}

        serialized["attribute_pickled"] = 4
        estimator.__setstate__(serialized)
        assert estimator.attribute_pickled == 4
        assert estimator._restored
    finally:
        type(estimator).__module__ = old_mod


class SingleInheritanceEstimator(BaseEstimator):
    def __init__(self, attribute_pickled=5):
        self.attribute_pickled = attribute_pickled
        self._attribute_not_pickled = None

    def __getstate__(self):
        state = super().__getstate__()
        state["_attribute_not_pickled"] = None
        return state


def test_pickling_works_when_getstate_is_overwritten_in_the_child_class():
    estimator = SingleInheritanceEstimator()
    estimator._attribute_not_pickled = "this attribute should not be pickled"

    serialized = pickle.dumps(estimator)
    estimator_restored = pickle.loads(serialized)
    assert estimator_restored.attribute_pickled == 5
    assert estimator_restored._attribute_not_pickled is None


def test_tag_inheritance():
    # test that changing tags by inheritance is not allowed
    nan_tag_est = NaNTag()
    no_nan_tag_est = NoNaNTag()
    assert nan_tag_est.__sklearn_tags__().input_tags.allow_nan
    assert not no_nan_tag_est.__sklearn_tags__().input_tags.allow_nan

    redefine_tags_est = OverrideTag()
    assert not redefine_tags_est.__sklearn_tags__().input_tags.allow_nan

    diamond_tag_est = DiamondOverwriteTag()
    assert diamond_tag_est.__sklearn_tags__().input_tags.allow_nan

    inherit_diamond_tag_est = InheritDiamondOverwriteTag()
    assert inherit_diamond_tag_est.__sklearn_tags__().input_tags.allow_nan


def test_raises_on_get_params_non_attribute():
    class MyEstimator(BaseEstimator):
        def __init__(self, param=5):
            pass

        def fit(self, X, y=None):
            return self

    est = MyEstimator()
    msg = "'MyEstimator' object has no attribute 'param'"
    with pytest.raises(AttributeError, match=msg):
        est.get_params()


def test_repr_mimebundle_():
    # Checks that the display configuration flag controls the mime bundle
    # output
    tree = DecisionTreeClassifier()
    output = tree._repr_mimebundle_()
    assert "text/plain" in output
    assert "text/html" in output

    with config_context(display="text"):
        output = tree._repr_mimebundle_()
        assert "text/plain" in output
        assert "text/html" not in output


def test_repr_html_wraps():
    # Checks that the display configuration flag controls the html output
    tree = DecisionTreeClassifier()

    output = tree._repr_html_()
    assert "<style>" in output

    with config_context(display="text"):
        msg = "_repr_html_ is only defined when"
        with pytest.raises(AttributeError, match=msg):
            output = tree._repr_html_()


def test_n_features_in_validation():
    """Check that `_check_n_features` validates data when reset=False."""
    est = MyEstimator()
    X_train = [[1, 2, 3], [4, 5, 6]]
    _check_n_features(est, X_train, reset=True)

    assert est.n_features_in_ == 3

    msg = "X does not contain any features, but MyEstimator is expecting 3 features"
    with pytest.raises(ValueError, match=msg):
        _check_n_features(est, "invalid X", reset=False)


def test_n_features_in_no_validation():
    """Check that `_check_n_features` does not validate data when
    n_features_in_ is not defined."""
    est = MyEstimator()
    _check_n_features(est, "invalid X", reset=True)

    assert not hasattr(est, "n_features_in_")

    # does not raise
    _check_n_features(est, "invalid X", reset=False)

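# Minimal sketch (illustrative, not part of the test suite) of how a custom
# estimator typically wires up the validation helpers tested above:
# ``validate_data`` with ``reset=True`` in ``fit`` records ``n_features_in_``
# (and ``feature_names_in_`` for dataframes), while ``reset=False`` in later
# methods re-validates new input against the recorded values.
class _ValidatingEstimator(BaseEstimator):
    def fit(self, X, y=None):
        X = validate_data(self, X, reset=True)
        return self

    def predict(self, X):
        X = validate_data(self, X, reset=False)
        return np.zeros(X.shape[0])
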
def test_feature_names_in():
    """Check that `feature_names_in_` is recorded by `validate_data`."""
    pd = pytest.importorskip("pandas")
    iris = datasets.load_iris()
    X_np = iris.data
    df = pd.DataFrame(X_np, columns=iris.feature_names)

    class NoOpTransformer(TransformerMixin, BaseEstimator):
        def fit(self, X, y=None):
            validate_data(self, X)
            return self

        def transform(self, X):
            validate_data(self, X, reset=False)
            return X

    # fit on dataframe saves the feature names
    trans = NoOpTransformer().fit(df)
    assert_array_equal(trans.feature_names_in_, df.columns)

    # fit again but on ndarray does not keep the previous feature names (see #21383)
    trans.fit(X_np)
    assert not hasattr(trans, "feature_names_in_")

    trans.fit(df)
    msg = "The feature names should match those that were passed"
    df_bad = pd.DataFrame(X_np, columns=iris.feature_names[::-1])
    with pytest.raises(ValueError, match=msg):
        trans.transform(df_bad)

    # warns when fitted on dataframe and transforming a ndarray
    msg = (
        "X does not have valid feature names, but NoOpTransformer was "
        "fitted with feature names"
    )
    with pytest.warns(UserWarning, match=msg):
        trans.transform(X_np)

    # warns when fitted on a ndarray and transforming dataframe
    msg = "X has feature names, but NoOpTransformer was fitted without feature names"
    trans = NoOpTransformer().fit(X_np)
    with pytest.warns(UserWarning, match=msg):
        trans.transform(df)

    # fit on dataframe with all integer feature names works without warning
    df_int_names = pd.DataFrame(X_np)
    trans = NoOpTransformer()
    with warnings.catch_warnings():
        warnings.simplefilter("error", UserWarning)
        trans.fit(df_int_names)

    # fit on dataframe with no feature names or all integer feature names
    # -> do not warn on transform
    Xs = [X_np, df_int_names]
    for X in Xs:
        with warnings.catch_warnings():
            warnings.simplefilter("error", UserWarning)
            trans.transform(X)

    # fit on dataframe with feature names that are mixed raises an error:
    df_mixed = pd.DataFrame(X_np, columns=["a", "b", 1, 2])
    trans = NoOpTransformer()
    msg = re.escape(
        "Feature names are only supported if all input features have string names, "
        "but your input has ['int', 'str'] as feature name / column name types. "
        "If you want feature names to be stored and validated, you must convert "
        "them all to strings, by using X.columns = X.columns.astype(str) for "
        "example. Otherwise you can remove feature / column names from your input "
        "data, or convert them all to a non-string data type."
    )
    with pytest.raises(TypeError, match=msg):
        trans.fit(df_mixed)

    # transform on feature names that are mixed also raises:
    with pytest.raises(TypeError, match=msg):
        trans.transform(df_mixed)

def test_validate_data_skip_check_array():
    """Check the `skip_check_array` option of `validate_data`."""
    pd = pytest.importorskip("pandas")
    iris = datasets.load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    y = pd.Series(iris.target)

    class NoOpTransformer(TransformerMixin, BaseEstimator):
        pass

    no_op = NoOpTransformer()
    X_np_out = validate_data(no_op, df, skip_check_array=False)
    assert isinstance(X_np_out, np.ndarray)
    assert_allclose(X_np_out, df.to_numpy())

    X_df_out = validate_data(no_op, df, skip_check_array=True)
    assert X_df_out is df

    y_np_out = validate_data(no_op, y=y, skip_check_array=False)
    assert isinstance(y_np_out, np.ndarray)
    assert_allclose(y_np_out, y.to_numpy())

    y_series_out = validate_data(no_op, y=y, skip_check_array=True)
    assert y_series_out is y

    X_np_out, y_np_out = validate_data(no_op, df, y, skip_check_array=False)
    assert isinstance(X_np_out, np.ndarray)
    assert_allclose(X_np_out, df.to_numpy())
    assert isinstance(y_np_out, np.ndarray)
    assert_allclose(y_np_out, y.to_numpy())

    X_df_out, y_series_out = validate_data(no_op, df, y, skip_check_array=True)
    assert X_df_out is df
    assert y_series_out is y

    msg = "Validation should be done on X, y or both."
    with pytest.raises(ValueError, match=msg):
        validate_data(no_op)


def test_clone_keeps_output_config():
    """Check that clone keeps the set_output config."""
    ss = StandardScaler().set_output(transform="pandas")
    config = _get_output_config("transform", ss)

    ss_clone = clone(ss)
    config_clone = _get_output_config("transform", ss_clone)
    assert config == config_clone


class _Empty:
    pass


class EmptyEstimator(_Empty, BaseEstimator):
    pass

""" state = estimator.__getstate__() expected = {"_sklearn_version": sklearn.__version__} assert state == expected # this should not raise pickle.loads(pickle.dumps(BaseEstimator())) def test_estimator_getstate_using_slots_error_message(): """Using a `BaseEstimator` with `__slots__` is not supported.""" class WithSlots: __slots__ = ("x",) class Estimator(BaseEstimator, WithSlots): pass msg = ( "You cannot use `__slots__` in objects inheriting from " "`sklearn.base.BaseEstimator`" ) with pytest.raises(TypeError, match=msg): Estimator().__getstate__() with pytest.raises(TypeError, match=msg): pickle.dumps(Estimator()) @pytest.mark.parametrize( "constructor_name, minversion", [ ("dataframe", "1.5.0"), ("pyarrow", "12.0.0"), ("polars", "0.20.23"), ], ) def test_dataframe_protocol(constructor_name, minversion): """Uses the dataframe exchange protocol to get feature names.""" data = [[1, 4, 2], [3, 3, 6]] columns = ["col_0", "col_1", "col_2"] df = _convert_container( data, constructor_name, columns_name=columns, minversion=minversion ) class NoOpTransformer(TransformerMixin, BaseEstimator): def fit(self, X, y=None): validate_data(self, X) return self def transform(self, X): return validate_data(self, X, reset=False) no_op = NoOpTransformer() no_op.fit(df) assert_array_equal(no_op.feature_names_in_, columns) X_out = no_op.transform(df) if constructor_name != "pyarrow": # pyarrow does not work with `np.asarray` # https://github.com/apache/arrow/issues/34886 assert_allclose(df, X_out) bad_names = ["a", "b", "c"] df_bad = _convert_container(data, constructor_name, columns_name=bad_names) with pytest.raises(ValueError, match="The feature names should match"): no_op.transform(df_bad) @config_context(enable_metadata_routing=True) def test_transformer_fit_transform_with_metadata_in_transform(): """Test that having a transformer with metadata for transform raises a warning when calling fit_transform.""" class CustomTransformer(BaseEstimator, TransformerMixin): def fit(self, X, y=None, prop=None): return self def transform(self, X, prop=None): return X # passing the metadata to `fit_transform` should raise a warning since it # could potentially be consumed by `transform` with pytest.warns(UserWarning, match="`transform` method which consumes metadata"): CustomTransformer().set_transform_request(prop=True).fit_transform( [[1]], [1], prop=1 ) # not passing a metadata which can potentially be consumed by `transform` should # not raise a warning with warnings.catch_warnings(record=True) as record: CustomTransformer().set_transform_request(prop=True).fit_transform([[1]], [1]) assert len(record) == 0 @config_context(enable_metadata_routing=True) def test_outlier_mixin_fit_predict_with_metadata_in_predict(): """Test that having an OutlierMixin with metadata for predict raises a warning when calling fit_predict.""" class CustomOutlierDetector(BaseEstimator, OutlierMixin): def fit(self, X, y=None, prop=None): return self def predict(self, X, prop=None): return X # passing the metadata to `fit_predict` should raise a warning since it # could potentially be consumed by `predict` with pytest.warns(UserWarning, match="`predict` method which consumes metadata"): CustomOutlierDetector().set_predict_request(prop=True).fit_predict( [[1]], [1], prop=1 ) # not passing a metadata which can potentially be consumed by `predict` should # not raise a warning with warnings.catch_warnings(record=True) as record: CustomOutlierDetector().set_predict_request(prop=True).fit_predict([[1]], [1]) assert len(record) == 0