Skip to content

Commit c2bccd1

Browse files
arnau126 authored and dukebody committed
CategoricalImputer enhancements.
1 parent 0776122 commit c2bccd1

File tree

2 files changed

+115
-28
lines changed

2 files changed

+115
-28
lines changed

sklearn_pandas/categorical_imputer.py

+47-25
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,51 @@
1-
"""
2-
3-
Impute missing values from a categorical/string np.ndarray or pd.Series with
4-
the most frequent value on the training data.
5-
6-
"""
7-
81
import pandas as pd
92
import numpy as np
103

11-
from sklearn.base import TransformerMixin
4+
from collections import Counter
125

6+
from sklearn.base import BaseEstimator, TransformerMixin
7+
from sklearn.utils.validation import check_is_fitted
138

14-
class CategoricalImputer(TransformerMixin):
159

10+
def _get_mask(X, value):
    """
    Compute the boolean mask of the entries of X that match ``value``.

    Parameters
    ----------
    X : np.ndarray or pd.Series
        Data to inspect.

    value : object
        Placeholder marking a missing entry. The string "NaN", None and any
        null scalar (np.nan of any float width, pd.NaT, ...) all select the
        null entries of X as reported by ``pd.isnull``. Any other value is
        matched with ``X == value``.

    Returns
    -------
    Boolean mask with the same shape as X, True where the entry is missing.
    """
    # NaN never compares equal to itself, so a null placeholder has to be
    # matched with pd.isnull instead of ``==``. The null check runs first so
    # that null sentinels are never compared against the "NaN" string, and
    # the scalar guard keeps array-like placeholders out of both checks.
    if pd.api.types.is_scalar(value) and (
            pd.isnull(value) or value == "NaN"):
        return pd.isnull(X)
    return X == value
1720

18-
Attributes
21+
22+
class CategoricalImputer(BaseEstimator, TransformerMixin):
23+
"""
24+
Impute missing values from a categorical/string np.ndarray or pd.Series
25+
with the most frequent value on the training data.
26+
27+
Parameters
1928
----------
29+
missing_values : string or "NaN", optional (default="NaN")
30+
The placeholder for the missing values. All occurrences of
31+
`missing_values` will be imputed. None and np.nan are treated
32+
as being the same, use the string value "NaN" for them.
2033
21-
fill : str
34+
copy : boolean, optional (default=True)
35+
If True, a copy of X will be created.
36+
37+
Attributes
38+
----------
39+
fill_ : str
2240
Most frequent value of the training data.
2341
2442
"""
2543

26-
def __init__(self):
27-
28-
self.fill = None
29-
30-
def fit(self, X):
44+
def __init__(self, missing_values='NaN', copy=True):
45+
self.missing_values = missing_values
46+
self.copy = copy
3147

48+
def fit(self, X, y=None):
3249
"""
3350
3451
Get the most frequent value.
@@ -38,22 +55,24 @@ def fit(self, X):
3855
X : np.ndarray or pd.Series
3956
Training data.
4057
58+
y : Passthrough for ``Pipeline`` compatibility.
59+
4160
Returns
4261
-------
43-
CategoricalImputer
44-
Itself.
45-
62+
self: CategoricalImputer
4663
"""
4764

48-
self.fill = pd.Series(X).mode().values[0]
65+
mask = _get_mask(X, self.missing_values)
66+
X = X[~mask]
67+
68+
self.fill_ = Counter(X).most_common(1)[0][0]
4969

5070
return self
5171

5272
def transform(self, X):
53-
5473
"""
5574
56-
Replaces null values in the input data with the most frequent value
75+
Replaces missing values in the input data with the most frequent value
5776
of the training data.
5877
5978
Parameters
@@ -65,11 +84,14 @@ def transform(self, X):
6584
-------
6685
np.ndarray
6786
Data with imputed values.
68-
6987
"""
7088

71-
X = X.copy()
89+
check_is_fitted(self, 'fill_')
90+
91+
if self.copy:
92+
X = X.copy()
7293

73-
X[pd.isnull(X)] = self.fill
94+
mask = _get_mask(X, self.missing_values)
95+
X[mask] = self.fill_
7496

7597
return np.asarray(X)

tests/test_categorical_imputer.py

+68-3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@
66
from sklearn_pandas import CategoricalImputer
77
from sklearn_pandas import DataFrameMapper
88

9+
# In scikit-learn 0.18, NotFittedError was moved from utils.validation
10+
# to exceptions module.
11+
try:
12+
from sklearn.exceptions import NotFittedError
13+
except ImportError:
14+
from sklearn.utils.validation import NotFittedError
15+
916

1017
@pytest.mark.parametrize('none_value', [None, np.nan])
1118
@pytest.mark.parametrize('input_type', ['np', 'pd'])
@@ -16,16 +23,65 @@ def test_unit(input_type, none_value):
1623
if input_type == 'pd':
1724
X = pd.Series(data)
1825
else:
19-
X = np.asarray(data)
26+
X = np.asarray(data, dtype=object)
2027

2128
Xc = X.copy()
2229

2330
Xt = CategoricalImputer().fit_transform(X)
2431

2532
assert (np.asarray(X) == np.asarray(Xc)).all()
2633
assert type(Xt) == np.ndarray
27-
assert len(X) == len(Xt)
28-
assert len(Xt[pd.isnull(Xt)]) == 0
34+
assert (Xt == ['a', 'b', 'b', 'b']).all()
35+
36+
37+
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_missing_values_param(input_type):
    """
    A custom ``missing_values`` placeholder is replaced in the output.
    """
    raw = ['x', 'y', 'a_missing', 'y']
    X = pd.Series(raw) if input_type == 'pd' else np.asarray(raw, dtype=object)

    imputed = CategoricalImputer(missing_values='a_missing').fit_transform(X)

    expected = np.array(['x', 'y', 'y', 'y'])
    assert (imputed == expected).all()
51+
52+
53+
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_copy_param(input_type):
    """
    With ``copy=False`` the input container itself ends up imputed.
    """
    raw = ['a', np.nan, 'b', 'a']
    X = pd.Series(raw) if input_type == 'pd' else np.asarray(raw, dtype=object)

    imputer = CategoricalImputer(copy=False)
    imputed = imputer.fit_transform(X)

    expected = np.array(['a', 'a', 'b', 'a'])
    # Both the returned array and the original input must hold the result.
    assert (imputed == expected).all()
    assert (X == expected).all()
69+
70+
71+
@pytest.mark.parametrize('input_type', ['np', 'pd'])
def test_data_type(input_type):
    """
    Mixed string/int/float data keeps its values; only NaN is replaced.
    """
    raw = ['a', np.nan, 'b', 3, 'a', 3, 'a', 4.5]
    X = pd.Series(raw) if input_type == 'pd' else np.asarray(raw, dtype=object)

    imputed = CategoricalImputer().fit_transform(X)

    expected = np.array(['a', 'a', 'b', 3, 'a', 3, 'a', 4.5], dtype=object)
    assert (imputed == expected).all()
2985

3086

3187
@pytest.mark.parametrize('none_value', [None, np.nan])
@@ -50,3 +106,12 @@ def test_integration(none_value):
50106

51107
assert (df['cat'][val_idx] == df_t['cat'][val_idx]).all()
52108
assert (df_t['cat'][nan_idx] == df['cat'].mode().values[0]).all()
109+
110+
111+
def test_not_fitted():
    """
    Calling transform on an imputer that was never fitted raises
    NotFittedError.
    """
    unfitted = CategoricalImputer()
    sample = np.array(['a', 'b', 'b', None])
    with pytest.raises(NotFittedError):
        unfitted.transform(sample)

0 commit comments

Comments
 (0)