Merge pull request #89 from paulgb/categoricalimputer-improvements

dukebody · web-flow · commit b0df3ddb5e99 · 2017-04-29T18:44:20.000+02:00
Categoricalimputer improvements
diff --git a/sklearn_pandas/categorical_imputer.py b/sklearn_pandas/categorical_imputer.py
@@ -1,34 +1,50 @@
-"""
-
-Impute missing values from a categorical/string np.ndarray or pd.Series with
-the most frequent value on the training data.
-
-"""
-
 import pandas as pd
 import numpy as np
 
-from sklearn.base import TransformerMixin
 
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
 
-class CategoricalImputer(TransformerMixin):
 
+def _get_mask(X, value):
     """
+    Compute the boolean mask X == value; "NaN", None and np.nan match nulls.
+    """
+    if value == "NaN" or \
+       value is None or \
+       (isinstance(value, float) and np.isnan(value)):
+        return pd.isnull(X)
+    else:
+        return X == value
 
-    Attributes
+
+class CategoricalImputer(BaseEstimator, TransformerMixin):
+    """
+    Impute missing values from a categorical/string np.ndarray or pd.Series
+    with the most frequent value on the training data.
+
+    Parameters
     ----------
+    missing_values : string or "NaN", optional (default="NaN")
+        The placeholder for the missing values. All occurrences of
+        `missing_values` will be imputed. None and np.nan are treated
+        as being the same; use the string value "NaN" for them.
 
-    fill : str
+    copy : boolean, optional (default=True)
+        If True, a copy of X will be created.
+
+    Attributes
+    ----------
+    fill_ : str
         Most frequent value of the training data.
 
     """
 
-    def __init__(self):
-
-        self.fill = None
-
-    def fit(self, X):
+    def __init__(self, missing_values='NaN', copy=True):
+        self.missing_values = missing_values
+        self.copy = copy
 
+    def fit(self, X, y=None):
         """
 
         Get the most frequent value.
@@ -38,22 +54,29 @@ def fit(self, X):
             X : np.ndarray or pd.Series
                 Training data.
 
+            y : Passthrough for ``Pipeline`` compatibility.
+
         Returns
         -------
-        CategoricalImputer
-            Itself.
-
+            self: CategoricalImputer
         """
 
-        self.fill = pd.Series(X).mode().values[0]
+        mask = _get_mask(X, self.missing_values)
+        X = X[~mask]
+
+        modes = pd.Series(X).mode()
+        if modes.shape[0] == 0:
+            raise ValueError('No value is repeated more than '
+                             'once in the column')
+        else:
+            self.fill_ = modes[0]
 
         return self
 
     def transform(self, X):
-
         """
 
-        Replaces null values in the input data with the most frequent value
+        Replaces missing values in the input data with the most frequent value
         of the training data.
 
         Parameters
@@ -65,11 +88,14 @@ def transform(self, X):
         -------
             np.ndarray
                 Data with imputed values.
-
         """
 
-        X = X.copy()
+        check_is_fitted(self, 'fill_')
+
+        if self.copy:
+            X = X.copy()
 
-        X[pd.isnull(X)] = self.fill
+        mask = _get_mask(X, self.missing_values)
+        X[mask] = self.fill_
 
         return np.asarray(X)
diff --git a/tests/test_categorical_imputer.py b/tests/test_categorical_imputer.py
@@ -6,6 +6,13 @@
 from sklearn_pandas import CategoricalImputer
 from sklearn_pandas import DataFrameMapper
 
+# In scikit-learn 0.18, NotFittedError was moved from utils.validation
+# to the exceptions module.
+try:
+    from sklearn.exceptions import NotFittedError
+except ImportError:
+    from sklearn.utils.validation import NotFittedError
+
 
 @pytest.mark.parametrize('none_value', [None, np.nan])
 @pytest.mark.parametrize('input_type', ['np', 'pd'])
@@ -16,16 +23,79 @@ def test_unit(input_type, none_value):
     if input_type == 'pd':
         X = pd.Series(data)
     else:
-        X = np.asarray(data)
+        X = np.asarray(data, dtype=object)
 
     Xc = X.copy()
 
     Xt = CategoricalImputer().fit_transform(X)
 
     assert (np.asarray(X) == np.asarray(Xc)).all()
     assert type(Xt) == np.ndarray
-    assert len(X) == len(Xt)
-    assert len(Xt[pd.isnull(Xt)]) == 0
+    assert (Xt == ['a', 'b', 'b', 'b']).all()
+
+
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_no_mode(input_type):
+
+    data = ['a', 'b', 'c', np.nan]
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        X = np.asarray(data, dtype=object)
+
+    with pytest.raises(ValueError):
+        CategoricalImputer().fit_transform(X)
+
+
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_missing_values_param(input_type):
+
+    data = ['x', 'y', 'a_missing', 'y']
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        X = np.asarray(data, dtype=object)
+
+    imp = CategoricalImputer(missing_values='a_missing')
+    Xt = imp.fit_transform(X)
+
+    assert (Xt == np.array(['x', 'y', 'y', 'y'])).all()
+
+
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_copy_param(input_type):
+
+    data = ['a', np.nan, 'b', 'a']
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        X = np.asarray(data, dtype=object)
+
+    imp = CategoricalImputer(copy=False)
+    Xt = imp.fit_transform(X)
+
+    Xe = np.array(['a', 'a', 'b', 'a'])
+    assert (Xt == Xe).all()
+    assert (X == Xe).all()
+
+
+@pytest.mark.parametrize('input_type', ['np', 'pd'])
+def test_data_type(input_type):
+
+    data = ['a', np.nan, 'b', 3, 'a', 3, 'a', 4.5]
+
+    if input_type == 'pd':
+        X = pd.Series(data)
+    else:
+        X = np.asarray(data, dtype=object)
+
+    Xt = CategoricalImputer().fit_transform(X)
+
+    Xe = np.array(['a', 'a', 'b', 3, 'a', 3, 'a', 4.5], dtype=object)
+    assert (Xt == Xe).all()
 
 
 @pytest.mark.parametrize('none_value', [None, np.nan])
@@ -50,3 +120,12 @@ def test_integration(none_value):
 
     assert (df['cat'][val_idx] == df_t['cat'][val_idx]).all()
     assert (df_t['cat'][nan_idx] == df['cat'].mode().values[0]).all()
+
+
+def test_not_fitted():
+    """
+    If imputer is not fitted, NotFittedError is raised.
+    """
+    imp = CategoricalImputer()
+    with pytest.raises(NotFittedError):
+        imp.transform(np.array(['a', 'b', 'b', None]))