FIX/ENH CheckingClassifier support parameters and sparse matrices (scikit-learn#17259)

glemaitre · viclafargue · commit 8cb5708d2a39 · 2020-06-26T11:43:34.000Z
diff --git a/sklearn/utils/_mocking.py b/sklearn/utils/_mocking.py
@@ -51,31 +51,46 @@ def __ne__(self, other):
 class CheckingClassifier(ClassifierMixin, BaseEstimator):
     """Dummy classifier to test pipelining and meta-estimators.
 
-    Checks some property of X and y in fit / predict.
+    Checks some property of `X` and `y` in fit / predict.
     This allows testing whether pipelines / cross-validation or metaestimators
     changed the input.
 
     Parameters
     ----------
-    check_y
-    check_X
-    foo_param
-    expected_fit_params
+    check_y, check_X : callable, default=None
+        The callable used to validate `X` and `y`. These callables should return
+        a bool where `False` will trigger an `AssertionError`.
+
+    check_y_params, check_X_params : dict, default=None
+        The optional parameters to pass to `check_X` and `check_y`.
+
+    foo_param : int, default=0
+        A `foo` param. When `foo_param > 1`, the output of :meth:`score` will
+        be 1; otherwise it is 0.
+
+    expected_fit_params : list of str, default=None
+        A list of the expected parameters given when calling `fit`.
 
     Attributes
     ----------
-    classes_
+    classes_ : ndarray of shape (n_classes,)
+        The classes seen during `fit`.
+
+    n_features_in_ : int
+        The number of features seen during `fit`.
     """
-    def __init__(self, check_y=None, check_X=None, foo_param=0,
+    def __init__(self, *, check_y=None, check_y_params=None,
+                 check_X=None, check_X_params=None, foo_param=0,
                  expected_fit_params=None):
         self.check_y = check_y
+        self.check_y_params = check_y_params
         self.check_X = check_X
+        self.check_X_params = check_X_params
         self.foo_param = foo_param
         self.expected_fit_params = expected_fit_params
 
     def fit(self, X, y, **fit_params):
-        """
-        Fit classifier
+        """Fit classifier.
 
         Parameters
         ----------
@@ -89,48 +104,114 @@ def fit(self, X, y, **fit_params):
 
         **fit_params : dict of string -> object
             Parameters passed to the ``fit`` method of the estimator
+
+        Returns
+        -------
+        self
         """
-        assert len(X) == len(y)
+        assert _num_samples(X) == _num_samples(y)
         if self.check_X is not None:
-            assert self.check_X(X)
+            params = {} if self.check_X_params is None else self.check_X_params
+            assert self.check_X(X, **params)
         if self.check_y is not None:
+            params = {} if self.check_y_params is None else self.check_y_params
             assert self.check_y(y)
-        self.n_features_in_ = len(X)
-        self.classes_ = np.unique(check_array(y, ensure_2d=False,
-                                              allow_nd=True))
+        self.n_features_in_ = np.shape(X)[1]
+        self.classes_ = np.unique(
+            check_array(y, ensure_2d=False, allow_nd=True)
+        )
         if self.expected_fit_params:
             missing = set(self.expected_fit_params) - set(fit_params)
-            assert len(missing) == 0, 'Expected fit parameter(s) %s not ' \
-                                      'seen.' % list(missing)
+            if missing:
+                raise AssertionError(
+                    f'Expected fit parameter(s) {list(missing)} not seen.'
+                )
             for key, value in fit_params.items():
-                assert len(value) == len(X), (
-                        'Fit parameter %s has length %d; '
-                        'expected %d.'
-                        % (key, len(value), len(X)))
+                if _num_samples(value) != _num_samples(X):
+                    raise AssertionError(
+                        f'Fit parameter {key} has length {_num_samples(value)}'
+                        f'; expected {_num_samples(X)}.'
+                    )
 
         return self
 
-    def predict(self, T):
-        """
+    def predict(self, X):
+        """Predict the first class seen in `classes_`.
+
         Parameters
         ----------
-        T : indexable, length n_samples
+        X : array-like of shape (n_samples, n_features)
+            The input data.
+
+        Returns
+        -------
+        preds : ndarray of shape (n_samples,)
+            Predictions of the first class seen in `classes_`.
         """
         if self.check_X is not None:
-            assert self.check_X(T)
-        return self.classes_[np.zeros(_num_samples(T), dtype=np.int)]
+            params = {} if self.check_X_params is None else self.check_X_params
+            assert self.check_X(X, **params)
+        return self.classes_[np.zeros(_num_samples(X), dtype=np.int)]
 
-    def score(self, X=None, Y=None):
+    def predict_proba(self, X):
+        """Predict probabilities for each class.
+
+        Here, the dummy classifier will provide a probability of 1 for the
+        first class of `classes_` and 0 otherwise.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data.
+
+        Returns
+        -------
+        proba : ndarray of shape (n_samples, n_classes)
+            The probabilities for each sample and class.
         """
+        proba = np.zeros((_num_samples(X), len(self.classes_)))
+        proba[:, 0] = 1
+        return proba
+
+    def decision_function(self, X):
+        """Confidence score.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            The input data.
+
+        Returns
+        -------
+        decision : ndarray of shape (n_samples,) if n_classes == 2\
+                else (n_samples, n_classes)
+            Confidence score.
+        """
+        if len(self.classes_) == 2:
+            # for binary classifier, the confidence score is related to
+            # classes_[1] and therefore should be null.
+            return np.zeros(_num_samples(X))
+        else:
+            return self.predict_proba(X)
+
+    def score(self, X=None, Y=None):
+        """Fake score.
+
         Parameters
         ----------
         X : array-like of shape (n_samples, n_features)
             Input data, where n_samples is the number of samples and
             n_features is the number of features.
 
-        Y : array-like of shape (n_samples, n_output) or (n_samples,), optional
+        Y : array-like of shape (n_samples, n_output) or (n_samples,)
             Target relative to X for classification or regression;
             None for unsupervised learning.
+
+        Returns
+        -------
+        score : float
+            Either 0 or 1 depending on `foo_param` (i.e. `foo_param > 1 =>
+            score=1` otherwise `score=0`).
         """
         if self.foo_param > 1:
             score = 1.
diff --git a/sklearn/utils/tests/test_mocking.py b/sklearn/utils/tests/test_mocking.py
@@ -0,0 +1,108 @@
+import numpy as np
+import pytest
+from scipy import sparse
+
+from numpy.testing import assert_array_equal
+from numpy.testing import assert_allclose
+
+from sklearn.datasets import load_iris
+from sklearn.utils import check_array
+from sklearn.utils import _safe_indexing
+from sklearn.utils._testing import _convert_container
+
+from sklearn.utils._mocking import CheckingClassifier
+
+
+@pytest.fixture
+def iris():
+    return load_iris(return_X_y=True)
+
+
+@pytest.mark.parametrize(
+    "input_type", ["list", "array", "sparse", "dataframe"]
+)
+def test_checking_classifier(iris, input_type):
+    # Check that the CheckingClassifier outputs what we expect
+    X, y = iris
+    X = _convert_container(X, input_type)
+    clf = CheckingClassifier()
+    clf.fit(X, y)
+
+    assert_array_equal(clf.classes_, np.unique(y))
+    assert len(clf.classes_) == 3
+    assert clf.n_features_in_ == 4
+
+    y_pred = clf.predict(X)
+    assert_array_equal(y_pred, np.zeros(y_pred.size, dtype=np.int))
+
+    assert clf.score(X) == pytest.approx(0)
+    clf.set_params(foo_param=10)
+    assert clf.fit(X, y).score(X) == pytest.approx(1)
+
+    y_proba = clf.predict_proba(X)
+    assert y_proba.shape == (150, 3)
+    assert_allclose(y_proba[:, 0], 1)
+    assert_allclose(y_proba[:, 1:], 0)
+
+    y_decision = clf.decision_function(X)
+    assert y_decision.shape == (150, 3)
+    assert_allclose(y_decision[:, 0], 1)
+    assert_allclose(y_decision[:, 1:], 0)
+
+    # check the shape in case of binary classification
+    first_2_classes = np.logical_or(y == 0, y == 1)
+    X = _safe_indexing(X, first_2_classes)
+    y = _safe_indexing(y, first_2_classes)
+    clf.fit(X, y)
+
+    y_proba = clf.predict_proba(X)
+    assert y_proba.shape == (100, 2)
+    assert_allclose(y_proba[:, 0], 1)
+    assert_allclose(y_proba[:, 1], 0)
+
+    y_decision = clf.decision_function(X)
+    assert y_decision.shape == (100,)
+    assert_allclose(y_decision, 0)
+
+
+def test_checking_classifier_with_params(iris):
+    X, y = iris
+    X_sparse = sparse.csr_matrix(X)
+
+    def check_X_is_sparse(X):
+        if not sparse.issparse(X):
+            raise ValueError("X is not sparse")
+        return True
+
+    clf = CheckingClassifier(check_X=check_X_is_sparse)
+    with pytest.raises(ValueError, match="X is not sparse"):
+        clf.fit(X, y)
+    clf.fit(X_sparse, y)
+
+    def _check_array(X, **params):
+        check_array(X, **params)
+        return True
+
+    clf = CheckingClassifier(
+        check_X=_check_array, check_X_params={"accept_sparse": False}
+    )
+    clf.fit(X, y)
+    with pytest.raises(TypeError, match="A sparse matrix was passed"):
+        clf.fit(X_sparse, y)
+
+
+def test_checking_classifier_fit_params(iris):
+    # check the error raised when the number of samples is not the one expected
+    X, y = iris
+    clf = CheckingClassifier(expected_fit_params=["sample_weight"])
+    sample_weight = np.ones(len(X) // 2)
+
+    with pytest.raises(AssertionError, match="Fit parameter sample_weight"):
+        clf.fit(X, y, sample_weight=sample_weight)
+
+
+def test_checking_classifier_missing_fit_params(iris):
+    X, y = iris
+    clf = CheckingClassifier(expected_fit_params=["sample_weight"])
+    with pytest.raises(AssertionError, match="Expected fit parameter"):
+        clf.fit(X, y)