Skip to content

Commit e3ce76f

Browse files
authored
Merge pull request #153 from JohnnyC08/ordinal-encoder-support-new-handle-unknown-handle-missing
Ordinal encoder support new handle unknown handle missing
2 parents 5cb4e40 + a98a8cc commit e3ce76f

26 files changed

+1587
-623
lines changed

.travis.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ env:
1515
matrix:
1616
# The versions should match the minimal requirements in requirements.txt and setup.py
1717
- DISTRIB="conda" PYTHON_VERSION="2.7" CYTHON_VERSION="0.21"
18-
NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.20.1" PATSY_VERSION="0.4.1"
18+
NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.21.1" PATSY_VERSION="0.4.1"
1919
SCIKIT_VERSION="0.17.1" SCIPY_VERSION="0.17.0" STATSMODELS_VERSION="0.6.1"
2020
- DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" CYTHON_VERSION="0.23.4"
21-
NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.20.1" PATSY_VERSION="0.4.1"
21+
NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.21.1" PATSY_VERSION="0.4.1"
2222
SCIKIT_VERSION="0.17.1" SCIPY_VERSION="0.17.0" STATSMODELS_VERSION="0.6.1"
2323

2424
install: source ci_scripts/install.sh

category_encoders/backward_difference.py

Lines changed: 60 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,13 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
2424
boolean for whether or not to drop columns with 0 variance.
2525
return_df: bool
2626
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
27-
impute_missing: bool
28-
boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
2927
handle_unknown: str
30-
options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if
31-
impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes
28+
options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used,
29+
an extra column will be added in if the transform matrix has unknown categories. This can cause
30+
unexpected changes in dimension in some cases.
31+
handle_missing: str
32+
options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'value'. Warning: if indicator is used,
33+
an extra column will be added in if the transform matrix has missing (NaN) values. This can cause
3234
unexpected changes in dimension in some cases.
3335
3436
Example
@@ -82,14 +84,15 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
8284
8385
"""
8486

85-
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'):
87+
def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True,
88+
handle_unknown='value', handle_missing='value'):
8689
self.return_df = return_df
8790
self.drop_invariant = drop_invariant
8891
self.drop_cols = []
8992
self.verbose = verbose
9093
self.mapping = mapping
91-
self.impute_missing = impute_missing
9294
self.handle_unknown = handle_unknown
95+
self.handle_missing = handle_missing
9396
self.cols = cols
9497
self.ordinal_encoder = None
9598
self._dim = None
@@ -128,22 +131,28 @@ def fit(self, X, y=None, **kwargs):
128131
else:
129132
self.cols = util.convert_cols_to_list(self.cols)
130133

134+
if self.handle_missing == 'error':
135+
if X[self.cols].isnull().any().bool():
136+
raise ValueError('Columns to be encoded can not contain null')
137+
131138
# train an ordinal pre-encoder
132139
self.ordinal_encoder = OrdinalEncoder(
133140
verbose=self.verbose,
134141
cols=self.cols,
135-
impute_missing=self.impute_missing,
136-
handle_unknown=self.handle_unknown
142+
handle_unknown='value',
143+
handle_missing='value'
137144
)
138145
self.ordinal_encoder = self.ordinal_encoder.fit(X)
139146

140147
ordinal_mapping = self.ordinal_encoder.category_mapping
141148

142149
mappings_out = []
143150
for switch in ordinal_mapping:
144-
values = switch.get('mapping').get_values()
145-
column_mapping = self.fit_backward_difference_coding(values)
146-
mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, })
151+
values = switch.get('mapping')
152+
col = switch.get('col')
153+
154+
column_mapping = self.fit_backward_difference_coding(col, values, self.handle_missing, self.handle_unknown)
155+
mappings_out.append({'col': col, 'mapping': column_mapping, })
147156

148157
self.mapping = mappings_out
149158

@@ -180,6 +189,10 @@ def transform(self, X, override_return_df=False):
180189
181190
"""
182191

192+
if self.handle_missing == 'error':
193+
if X[self.cols].isnull().any().bool():
194+
raise ValueError('Columns to be encoded can not contain null')
195+
183196
if self._dim is None:
184197
raise ValueError('Must train encoder before it can be used to transform data.')
185198

@@ -194,6 +207,11 @@ def transform(self, X, override_return_df=False):
194207
return X
195208

196209
X = self.ordinal_encoder.transform(X)
210+
211+
if self.handle_unknown == 'error':
212+
if X[self.cols].isin([-1]).any().any():
213+
raise ValueError('Columns to be encoded can not contain new values')
214+
197215
X = self.backward_difference_coding(X, mapping=self.mapping)
198216

199217
if self.drop_invariant:
@@ -206,14 +224,32 @@ def transform(self, X, override_return_df=False):
206224
return X.values
207225

208226
@staticmethod
209-
def fit_backward_difference_coding(values):
227+
def fit_backward_difference_coding(col, values, handle_missing, handle_unknown):
228+
if handle_missing == 'value':
229+
values = values[values > 0]
230+
231+
values_to_encode = values.get_values()
232+
210233
if len(values) < 2:
211-
return pd.DataFrame()
234+
return pd.DataFrame(index=values_to_encode)
235+
236+
if handle_unknown == 'indicator':
237+
values_to_encode = np.append(values_to_encode, -1)
238+
239+
backwards_difference_matrix = Diff().code_without_intercept(values_to_encode)
240+
df = pd.DataFrame(data=backwards_difference_matrix.matrix, index=values_to_encode,
241+
columns=[str(col) + '_%d' % (i, ) for i in range(len(backwards_difference_matrix.column_suffixes))])
242+
243+
if handle_unknown == 'return_nan':
244+
df.loc[-1] = np.nan
245+
elif handle_unknown == 'value':
246+
df.loc[-1] = np.zeros(len(values_to_encode) - 1)
247+
248+
if handle_missing == 'return_nan':
249+
df.loc[values.loc[np.nan]] = np.nan
250+
elif handle_missing == 'value':
251+
df.loc[-2] = np.zeros(len(values_to_encode) - 1)
212252

213-
backwards_difference_matrix = Diff().code_without_intercept(values)
214-
df = pd.DataFrame(data=backwards_difference_matrix.matrix, columns=backwards_difference_matrix.column_suffixes)
215-
df.index += 1
216-
df.loc[0] = np.zeros(len(values) - 1)
217253
return df
218254

219255
@staticmethod
@@ -230,19 +266,17 @@ def backward_difference_coding(X_in, mapping):
230266
for switch in mapping:
231267
col = switch.get('col')
232268
mod = switch.get('mapping')
233-
new_columns = []
234-
for i in range(len(mod.columns)):
235-
c = mod.columns[i]
236-
new_col = str(col) + '_%d' % (i, )
237-
X[new_col] = mod[c].loc[X[col]].values
238-
new_columns.append(new_col)
269+
270+
base_df = mod.loc[X[col]]
271+
base_df.set_index(X.index, inplace=True)
272+
X = pd.concat([base_df, X], axis=1)
273+
239274
old_column_index = cols.index(col)
240-
cols[old_column_index: old_column_index + 1] = new_columns
275+
cols[old_column_index: old_column_index + 1] = mod.columns
241276

242277
cols = ['intercept'] + cols
243-
X = X.reindex(columns=cols)
244278

245-
return X
279+
return X.reindex(columns=cols)
246280

247281
def get_feature_names(self):
248282
"""

0 commit comments

Comments
 (0)