@@ -24,11 +24,13 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
24
24
boolean for whether or not to drop columns with 0 variance.
25
25
return_df: bool
26
26
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
27
- impute_missing: bool
28
- boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
29
27
handle_unknown: str
30
- options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if
31
- impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes
28
+ options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used,
29
+ an extra column will be added in if the transform matrix has unknown categories. This can cause
30
+ unexpected changes in dimension in some cases.
31
+ handle_missing: str
32
+ options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'value'. Warning: if indicator is used,
33
+ an extra column will be added in if the transform matrix has missing values. This can cause
32
34
unexpected changes in dimension in some cases.
33
35
34
36
Example
@@ -82,14 +84,15 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
82
84
83
85
"""
84
86
85
- def __init__ (self , verbose = 0 , cols = None , mapping = None , drop_invariant = False , return_df = True , impute_missing = True , handle_unknown = 'impute' ):
87
+ def __init__ (self , verbose = 0 , cols = None , mapping = None , drop_invariant = False , return_df = True ,
88
+ handle_unknown = 'value' , handle_missing = 'value' ):
86
89
self .return_df = return_df
87
90
self .drop_invariant = drop_invariant
88
91
self .drop_cols = []
89
92
self .verbose = verbose
90
93
self .mapping = mapping
91
- self .impute_missing = impute_missing
92
94
self .handle_unknown = handle_unknown
95
+ self .handle_missing = handle_missing
93
96
self .cols = cols
94
97
self .ordinal_encoder = None
95
98
self ._dim = None
@@ -128,22 +131,28 @@ def fit(self, X, y=None, **kwargs):
128
131
else :
129
132
self .cols = util .convert_cols_to_list (self .cols )
130
133
134
+ if self .handle_missing == 'error' :
135
+ if X [self .cols ].isnull ().any ().bool ():
136
+ raise ValueError ('Columns to be encoded can not contain null' )
137
+
131
138
# train an ordinal pre-encoder
132
139
self .ordinal_encoder = OrdinalEncoder (
133
140
verbose = self .verbose ,
134
141
cols = self .cols ,
135
- impute_missing = self . impute_missing ,
136
- handle_unknown = self . handle_unknown
142
+ handle_unknown = 'value' ,
143
+ handle_missing = 'value'
137
144
)
138
145
self .ordinal_encoder = self .ordinal_encoder .fit (X )
139
146
140
147
ordinal_mapping = self .ordinal_encoder .category_mapping
141
148
142
149
mappings_out = []
143
150
for switch in ordinal_mapping :
144
- values = switch .get ('mapping' ).get_values ()
145
- column_mapping = self .fit_backward_difference_coding (values )
146
- mappings_out .append ({'col' : switch .get ('col' ), 'mapping' : column_mapping , })
151
+ values = switch .get ('mapping' )
152
+ col = switch .get ('col' )
153
+
154
+ column_mapping = self .fit_backward_difference_coding (col , values , self .handle_missing , self .handle_unknown )
155
+ mappings_out .append ({'col' : col , 'mapping' : column_mapping , })
147
156
148
157
self .mapping = mappings_out
149
158
@@ -180,6 +189,10 @@ def transform(self, X, override_return_df=False):
180
189
181
190
"""
182
191
192
+ if self .handle_missing == 'error' :
193
+ if X [self .cols ].isnull ().any ().bool ():
194
+ raise ValueError ('Columns to be encoded can not contain null' )
195
+
183
196
if self ._dim is None :
184
197
raise ValueError ('Must train encoder before it can be used to transform data.' )
185
198
@@ -194,6 +207,11 @@ def transform(self, X, override_return_df=False):
194
207
return X
195
208
196
209
X = self .ordinal_encoder .transform (X )
210
+
211
+ if self .handle_unknown == 'error' :
212
+ if X [self .cols ].isin ([- 1 ]).any ().any ():
213
+ raise ValueError ('Columns to be encoded can not contain new values' )
214
+
197
215
X = self .backward_difference_coding (X , mapping = self .mapping )
198
216
199
217
if self .drop_invariant :
@@ -206,14 +224,32 @@ def transform(self, X, override_return_df=False):
206
224
return X .values
207
225
208
226
@staticmethod
209
- def fit_backward_difference_coding (values ):
227
def fit_backward_difference_coding(col, values, handle_missing, handle_unknown):
    """Build the backward-difference contrast table for a single column.

    Parameters
    ----------
    col
        Column identifier; ``str(col)`` prefixes every generated column
        name (``<col>_0``, ``<col>_1``, ...).
    values : pandas.Series
        Ordinal codes of the column's categories. In ``fit`` this is the
        ``'mapping'`` entry taken from the ordinal pre-encoder.
    handle_missing : str
        ``'value'`` drops non-positive codes before coding and appends an
        all-zero row labelled ``-2``. ``'return_nan'`` overwrites with NaN
        the row labelled by the code stored under the NaN key of ``values``.
        Any other setting adds nothing here.
    handle_unknown : str
        ``'indicator'`` codes ``-1`` as one extra level. ``'return_nan'``
        appends a NaN row labelled ``-1``. ``'value'`` appends an all-zero
        row labelled ``-1``. Any other setting adds nothing here.

    Returns
    -------
    pandas.DataFrame
        Contrast rows indexed by ordinal code. When fewer than two codes
        remain, a frame with that index and no columns is returned.
    """
    if handle_missing == 'value':
        # Only strictly positive codes take part in the contrast coding.
        # NOTE(review): -2 is presumably the pre-encoder's code for a
        # missing value that was not seen as a category -- confirm.
        values = values[values > 0]

    # ``Series.get_values()`` was deprecated in pandas 0.25 and removed in
    # pandas 1.0; ``.values`` gives the same ndarray for a regular Series
    # and exists on every pandas version.
    values_to_encode = values.values

    if len(values) < 2:
        # A contrast needs at least two levels; return a column-less frame.
        return pd.DataFrame(index=values_to_encode)

    if handle_unknown == 'indicator':
        # Treat the -1 code as a level of its own so it gets its own row.
        values_to_encode = np.append(values_to_encode, -1)

    contrast = Diff().code_without_intercept(values_to_encode)
    column_names = [str(col) + '_%d' % (i,)
                    for i in range(len(contrast.column_suffixes))]
    df = pd.DataFrame(data=contrast.matrix, index=values_to_encode,
                      columns=column_names)

    # Row -1 is what ``transform`` looks up for values flagged as new.
    if handle_unknown == 'return_nan':
        df.loc[-1] = np.nan
    elif handle_unknown == 'value':
        df.loc[-1] = np.zeros(len(values_to_encode) - 1)

    if handle_missing == 'return_nan':
        # Expects ``values`` to carry a NaN key; ``.loc`` raises KeyError
        # otherwise.
        df.loc[values.loc[np.nan]] = np.nan
    elif handle_missing == 'value':
        df.loc[-2] = np.zeros(len(values_to_encode) - 1)

    return df
218
254
219
255
@staticmethod
@@ -230,19 +266,17 @@ def backward_difference_coding(X_in, mapping):
230
266
for switch in mapping :
231
267
col = switch .get ('col' )
232
268
mod = switch .get ('mapping' )
233
- new_columns = []
234
- for i in range (len (mod .columns )):
235
- c = mod .columns [i ]
236
- new_col = str (col ) + '_%d' % (i , )
237
- X [new_col ] = mod [c ].loc [X [col ]].values
238
- new_columns .append (new_col )
269
+
270
+ base_df = mod .loc [X [col ]]
271
+ base_df .set_index (X .index , inplace = True )
272
+ X = pd .concat ([base_df , X ], axis = 1 )
273
+
239
274
old_column_index = cols .index (col )
240
- cols [old_column_index : old_column_index + 1 ] = new_columns
275
+ cols [old_column_index : old_column_index + 1 ] = mod . columns
241
276
242
277
cols = ['intercept' ] + cols
243
- X = X .reindex (columns = cols )
244
278
245
- return X
279
+ return X . reindex ( columns = cols )
246
280
247
281
def get_feature_names (self ):
248
282
"""
0 commit comments