Skip to content

Commit b0df3dd

Browse files
authored
Merge pull request #89 from paulgb/categoricalimputer-improvements
Categoricalimputer improvements
2 parents 0776122 + 5ce407c commit b0df3dd

File tree

2 files changed

+133
-28
lines changed

2 files changed

+133
-28
lines changed

sklearn_pandas/categorical_imputer.py

+51-25
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,50 @@
1-
"""
2-
3-
Impute missing values from a categorical/string np.ndarray or pd.Series with
4-
the most frequent value on the training data.
5-
6-
"""
7-
81
import pandas as pd
92
import numpy as np
103

11-
from sklearn.base import TransformerMixin
124

5+
from sklearn.base import BaseEstimator, TransformerMixin
6+
from sklearn.utils.validation import check_is_fitted
137

14-
class CategoricalImputer(TransformerMixin):
158

9+
def _get_mask(X, value):
    """
    Compute the boolean mask of the entries of X that equal ``value``.

    Parameters
    ----------
    X : np.ndarray or pd.Series
        Data to test. NOTE(review): only array-likes supporting
        element-wise ``==`` are expected here -- confirm against callers.

    value : object
        The placeholder to look for. The string "NaN", ``None`` and any
        floating point NaN (Python ``float`` or NumPy floating scalar such
        as ``np.float32('nan')``) all select the null entries of X.

    Returns
    -------
    Boolean mask, True where X holds the placeholder.
    """
    # NaN never compares equal to itself, so every NaN-like sentinel must be
    # routed through pd.isnull instead of ``==``. np.floating covers NumPy
    # scalars (float16/float32) that do not subclass the builtin float.
    nan_sentinel = (isinstance(value, (float, np.floating))
                    and np.isnan(value))

    if value is None or value == "NaN" or nan_sentinel:
        return pd.isnull(X)

    return X == value
1719

18-
Attributes
20+
21+
class CategoricalImputer(BaseEstimator, TransformerMixin):
    """
    Fill in the missing entries of a categorical/string np.ndarray or
    pd.Series using the most frequent value seen while fitting.

    Parameters
    ----------
    missing_values : string or "NaN", optional (default="NaN")
        Marker identifying the missing entries; every occurrence of it is
        imputed. None and np.nan are handled alike, and both are selected
        by the string value "NaN".

    copy : boolean, optional (default=True)
        When True, ``transform`` works on a copy of X. When False, the
        imputed values are written into X itself.

    Attributes
    ----------
    fill_ : str
        Most frequent value of the training data, set by ``fit``.

    """

    def __init__(self, missing_values='NaN', copy=True):
        # Parameters are stored untouched; nothing is validated here.
        self.copy = copy
        self.missing_values = missing_values

    def fit(self, X, y=None):
        """
        Learn the most frequent non-missing value of X.

        Parameters
        ----------
        X : np.ndarray or pd.Series
            Training data.

        y : Passthrough for ``Pipeline`` compatibility.

        Returns
        -------
        self: CategoricalImputer

        Raises
        ------
        ValueError
            If ``mode()`` of the non-missing data yields no value.
        """
        # Drop the placeholder entries first so they cannot win the vote.
        is_missing = _get_mask(X, self.missing_values)
        observed = pd.Series(X[~is_missing])
        candidates = observed.mode()

        if candidates.shape[0] == 0:
            raise ValueError('No value is repeated more than '
                             'once in the column')

        # mode() may return several values; the first one is kept.
        self.fill_ = candidates[0]
        return self

    def transform(self, X):
        """
        Replace the missing entries of X with the value learned by ``fit``.

        Parameters
        ----------
        X : np.ndarray or pd.Series
            Data with values to be imputed.

        Returns
        -------
        np.ndarray
            Data with imputed values.
        """
        # Fails early when ``fit`` has not set ``fill_`` yet.
        check_is_fitted(self, 'fill_')

        target = X.copy() if self.copy else X
        target[_get_mask(target, self.missing_values)] = self.fill_

        return np.asarray(target)

tests/test_categorical_imputer.py

+82-3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@
66
from sklearn_pandas import CategoricalImputer
77
from sklearn_pandas import DataFrameMapper
88

9+
# In scikit-learn 0.18, NotFittedError was moved from utils.validation
10+
# to the exceptions module.
11+
try:
12+
from sklearn.exceptions import NotFittedError
13+
except ImportError:
14+
from sklearn.utils.validation import NotFittedError
15+
916

1017
@pytest.mark.parametrize('none_value', [None, np.nan])
1118
@pytest.mark.parametrize('input_type', ['np', 'pd'])
@@ -16,16 +23,79 @@ def test_unit(input_type, none_value):
1623
if input_type == 'pd':
1724
X = pd.Series(data)
1825
else:
19-
X = np.asarray(data)
26+
X = np.asarray(data, dtype=object)
2027

2128
Xc = X.copy()
2229

2330
Xt = CategoricalImputer().fit_transform(X)
2431

2532
assert (np.asarray(X) == np.asarray(Xc)).all()
2633
assert type(Xt) == np.ndarray
27-
assert len(X) == len(Xt)
28-
assert len(Xt[pd.isnull(Xt)]) == 0
34+
assert (Xt == ['a', 'b', 'b', 'b']).all()
35+
36+
37+
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_no_mode(input_type):
    """
    Fitting on data in which no value occurs more than once is expected
    to raise ValueError.
    """
    values = ['a', 'b', 'c', np.nan]

    # Same payload, delivered either as a Series or as an object ndarray.
    X = (pd.Series(values) if input_type == 'pd'
         else np.asarray(values, dtype=object))

    with pytest.raises(ValueError):
        CategoricalImputer().fit_transform(X)
49+
50+
51+
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_missing_values_param(input_type):
    """
    A custom ``missing_values`` marker is replaced by the most frequent
    of the remaining values.
    """
    values = ['x', 'y', 'a_missing', 'y']

    # Same payload, delivered either as a Series or as an object ndarray.
    X = (pd.Series(values) if input_type == 'pd'
         else np.asarray(values, dtype=object))

    expected = np.array(['x', 'y', 'y', 'y'])
    imputed = CategoricalImputer(missing_values='a_missing').fit_transform(X)

    assert (imputed == expected).all()
65+
66+
67+
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_copy_param(input_type):
    """
    With ``copy=False`` the imputed values must show up both in the
    returned array and in the input object itself.
    """
    values = ['a', np.nan, 'b', 'a']

    # Same payload, delivered either as a Series or as an object ndarray.
    X = (pd.Series(values) if input_type == 'pd'
         else np.asarray(values, dtype=object))

    in_place_imputer = CategoricalImputer(copy=False)
    Xt = in_place_imputer.fit_transform(X)

    expected = np.array(['a', 'a', 'b', 'a'])
    assert (Xt == expected).all()
    # The input was not copied, so it carries the imputed value too.
    assert (X == expected).all()
83+
84+
85+
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_data_type(input_type):
    """
    Mixed str/int/float data: only the missing entry is replaced, every
    other element comes back unchanged.
    """
    values = ['a', np.nan, 'b', 3, 'a', 3, 'a', 4.5]

    # Same payload, delivered either as a Series or as an object ndarray.
    X = (pd.Series(values) if input_type == 'pd'
         else np.asarray(values, dtype=object))

    expected = np.array(['a', 'a', 'b', 3, 'a', 3, 'a', 4.5], dtype=object)
    imputed = CategoricalImputer().fit_transform(X)

    assert (imputed == expected).all()
2999

30100

31101
@pytest.mark.parametrize('none_value', [None, np.nan])
@@ -50,3 +120,12 @@ def test_integration(none_value):
50120

51121
assert (df['cat'][val_idx] == df_t['cat'][val_idx]).all()
52122
assert (df_t['cat'][nan_idx] == df['cat'].mode().values[0]).all()
123+
124+
125+
def test_not_fitted():
    """
    Calling ``transform`` on an imputer that was never fitted must raise
    NotFittedError.
    """
    unfitted = CategoricalImputer()
    sample = np.array(['a', 'b', 'b', None])

    with pytest.raises(NotFittedError):
        unfitted.transform(sample)

0 commit comments

Comments
 (0)