import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders.ordinal import OrdinalEncoder
- from category_encoders.utils import get_obj_cols, convert_input
+ from category_encoders.utils import get_obj_cols, convert_input, get_generated_cols

__author__ = 'willmcginnis'
@@ -21,11 +21,13 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
verbose: int
integer indicating verbosity of output. 0 for none.
cols: list
- a list of columns to encode, if None, all string columns will be encoded
+ a list of columns to encode, if None, all string columns will be encoded.
drop_invariant: bool
- boolean for whether or not to drop columns with 0 variance
+ boolean for whether or not to drop columns with 0 variance.
return_df: bool
- boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
+ boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
+ base: int
+ when the downstream model copes well with nonlinearities (like a decision tree), use a higher base.
impute_missing: bool
boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
handle_unknown: str
@@ -35,24 +37,25 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):

Example
-------
- >>>from category_encoders import *
- >>>import pandas as pd
- >>>from sklearn.datasets import load_boston
- >>>bunch = load_boston()
- >>>y = bunch.target
- >>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
- >>>enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y)
- >>>numeric_dataset = enc.transform(X)
- >>>print(numeric_dataset.info())
-
+ >>> from category_encoders import *
+ >>> import pandas as pd
+ >>> from sklearn.datasets import load_boston
+ >>> bunch = load_boston()
+ >>> y = bunch.target
+ >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
+ >>> enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y)
+ >>> numeric_dataset = enc.transform(X)
+ >>> print(numeric_dataset.info())
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
- Data columns (total 16 columns):
+ Data columns (total 18 columns):
CHAS_0 506 non-null int64
+ CHAS_1 506 non-null int64
RAD_0 506 non-null int64
RAD_1 506 non-null int64
RAD_2 506 non-null int64
RAD_3 506 non-null int64
+ RAD_4 506 non-null int64
CRIM 506 non-null float64
ZN 506 non-null float64
INDUS 506 non-null float64
@@ -64,8 +67,8 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
PTRATIO 506 non-null float64
B 506 non-null float64
LSTAT 506 non-null float64
- dtypes: float64(11), int64(5)
- memory usage: 63.3 KB
+ dtypes: float64(11), int64(7)
+ memory usage: 71.2 KB
None

"""
@@ -135,7 +138,8 @@ def fit(self, X, y=None, **kwargs):
if self.drop_invariant:
self.drop_cols = []
X_temp = self.transform(X)
- self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]
+ generated_cols = get_generated_cols(X, X_temp, self.cols)
+ self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]

return self

@@ -168,25 +172,23 @@ def transform(self, X, override_return_df=False):
if not self.cols:
return X

- original_cols = set(X.columns)
- X = self.ordinal_encoder.transform(X)
- X = self.basen_encode(X, cols=self.cols)
+ X_out = self.ordinal_encoder.transform(X)
+ X_out = self.basen_encode(X_out, cols=self.cols)

if self.drop_invariant:
for col in self.drop_cols:
- X.drop(col, 1, inplace=True)
+ X_out.drop(col, 1, inplace=True)

# impute missing values only in the generated columns
- current_cols = set(X.columns)
- fillna_cols = list(current_cols - (original_cols - set(self.cols)))
- X[fillna_cols] = X[fillna_cols].fillna(value=0.0)
+ generated_cols = get_generated_cols(X, X_out, self.cols)
+ X_out[generated_cols] = X_out[generated_cols].fillna(value=0.0)

if self.return_df or override_return_df:
- return X
+ return X_out
else:
- return X.values
+ return X_out.values

- def inverse_transform(self, Xt):
+ def inverse_transform(self, X_in):
"""
Perform the inverse transformation to encoded data.

@@ -201,15 +203,15 @@ def inverse_transform(self, Xt):
"""

warnings.warn('Inverse transform in basen is a currently experimental feature, please be careful')
- X = Xt.copy(deep=True)
+ X = X_in.copy(deep=True)

# first check the type
X = convert_input(X)

if self._dim is None:
raise ValueError('Must train encoder before it can be used to inverse_transform data')

- X = self.basen_to_interger(X, self.cols, self.base)
+ X = self.basen_to_integer(X, self.cols, self.base)

# then make sure that it is the right size
if X.shape[1] != self._dim:
@@ -284,7 +286,7 @@ def basen_encode(self, X_in, cols=None):

return X

- def basen_to_interger(self, X, cols, base):
+ def basen_to_integer(self, X, cols, base):
"""
Convert basen code as integers.

@@ -304,7 +306,7 @@ def basen_to_interger(self, X, cols, base):
out_cols = X.columns.values

for col in cols:
- col_list = [col0 for col0 in out_cols if str(col0).startswith(col)]
+ col_list = [col0 for col0 in out_cols if str(col0).startswith(str(col))]
for col0 in col_list:
if any(X[col0].isnull()):
raise ValueError("inverse_transform is not supported because transform impute"
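
For context on the get_generated_cols utility this commit starts using in fit and transform: judging from the inline set arithmetic it replaces (current_cols - (original_cols - set(self.cols))), it presumably returns the columns of the transformed frame that the encoder generated, i.e. anything not carried over untouched from the input plus any encoded source columns that survive into the output. A rough, hypothetical equivalent under that assumption (the real helper lives in category_encoders.utils and may differ in detail):

def get_generated_cols_sketch(X_original, X_transformed, to_transform):
    # Columns that exist only because of the encoding, plus encoded source
    # columns that are still present in the transformed frame.
    original_cols = set(X_original.columns)
    return [c for c in X_transformed.columns
            if c not in original_cols or c in to_transform]

Restricting both the drop_invariant variance check and the fillna(0.0) imputation to these generated columns keeps the encoder from touching columns it never encoded.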