Skip to content

Commit 0e8f669

Browse files
authored
Merge pull request #110 from janmotl/cumulative
Fix of #107, #104, #103, #101, #95 and addition of #59 and #46
2 parents f807642 + bbf5a96 commit 0e8f669

22 files changed

+3228
-266
lines changed

category_encoders/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from category_encoders.basen import BaseNEncoder
1818
from category_encoders.leave_one_out import LeaveOneOutEncoder
1919
from category_encoders.target_encoder import TargetEncoder
20+
from category_encoders.woe import WOEEncoder
2021

2122
__author__ = 'willmcginnis'
2223

@@ -31,5 +32,6 @@
3132
'PolynomialEncoder',
3233
'BaseNEncoder',
3334
'LeaveOneOutEncoder',
34-
'TargetEncoder'
35+
'TargetEncoder',
36+
'WOEEncoder'
3537
]

category_encoders/backward_difference.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from sklearn.base import BaseEstimator, TransformerMixin
77
from patsy.highlevel import dmatrix
88
from category_encoders.ordinal import OrdinalEncoder
9-
from category_encoders.utils import get_obj_cols, convert_input
9+
from category_encoders.utils import get_obj_cols, convert_input, get_generated_cols
1010

1111
__author__ = 'willmcginnis'
1212

@@ -20,11 +20,11 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
2020
verbose: int
2121
integer indicating verbosity of output. 0 for none.
2222
cols: list
23-
a list of columns to encode, if None, all string columns will be encoded
23+
a list of columns to encode, if None, all string columns will be encoded.
2424
drop_invariant: bool
25-
boolean for whether or not to drop columns with 0 variance
25+
boolean for whether or not to drop columns with 0 variance.
2626
return_df: bool
27-
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
27+
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
2828
impute_missing: bool
2929
boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
3030
handle_unknown: str
@@ -34,16 +34,15 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin):
3434
3535
Example
3636
-------
37-
>>>from category_encoders import *
38-
>>>import pandas as pd
39-
>>>from sklearn.datasets import load_boston
40-
>>>bunch = load_boston()
41-
>>>y = bunch.target
42-
>>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
43-
>>>enc = BackwardDifferenceEncoder(cols=['CHAS', 'RAD']).fit(X, y)
44-
>>>numeric_dataset = enc.transform(X)
45-
>>>print(numeric_dataset.info())
46-
37+
>>> from category_encoders import *
38+
>>> import pandas as pd
39+
>>> from sklearn.datasets import load_boston
40+
>>> bunch = load_boston()
41+
>>> y = bunch.target
42+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
43+
>>> enc = BackwardDifferenceEncoder(cols=['CHAS', 'RAD']).fit(X, y)
44+
>>> numeric_dataset = enc.transform(X)
45+
>>> print(numeric_dataset.info())
4746
<class 'pandas.core.frame.DataFrame'>
4847
RangeIndex: 506 entries, 0 to 505
4948
Data columns (total 22 columns):
@@ -140,7 +139,8 @@ def fit(self, X, y=None, **kwargs):
140139
if self.drop_invariant:
141140
self.drop_cols = []
142141
X_temp = self.transform(X)
143-
self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]
142+
generated_cols = get_generated_cols(X, X_temp, self.cols)
143+
self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
144144

145145
return self
146146

category_encoders/basen.py

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import warnings
66
from sklearn.base import BaseEstimator, TransformerMixin
77
from category_encoders.ordinal import OrdinalEncoder
8-
from category_encoders.utils import get_obj_cols, convert_input
8+
from category_encoders.utils import get_obj_cols, convert_input, get_generated_cols
99

1010
__author__ = 'willmcginnis'
1111

@@ -21,11 +21,13 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
2121
verbose: int
2222
integer indicating verbosity of output. 0 for none.
2323
cols: list
24-
a list of columns to encode, if None, all string columns will be encoded
24+
a list of columns to encode, if None, all string columns will be encoded.
2525
drop_invariant: bool
26-
boolean for whether or not to drop columns with 0 variance
26+
boolean for whether or not to drop columns with 0 variance.
2727
return_df: bool
28-
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
28+
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
29+
base: int
30+
when the downstream model copes well with nonlinearities (like a decision tree), use a higher base.
2931
impute_missing: bool
3032
boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
3133
handle_unknown: str
@@ -35,24 +37,25 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
3537
3638
Example
3739
-------
38-
>>>from category_encoders import *
39-
>>>import pandas as pd
40-
>>>from sklearn.datasets import load_boston
41-
>>>bunch = load_boston()
42-
>>>y = bunch.target
43-
>>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
44-
>>>enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y)
45-
>>>numeric_dataset = enc.transform(X)
46-
>>>print(numeric_dataset.info())
47-
40+
>>> from category_encoders import *
41+
>>> import pandas as pd
42+
>>> from sklearn.datasets import load_boston
43+
>>> bunch = load_boston()
44+
>>> y = bunch.target
45+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
46+
>>> enc = BaseNEncoder(cols=['CHAS', 'RAD']).fit(X, y)
47+
>>> numeric_dataset = enc.transform(X)
48+
>>> print(numeric_dataset.info())
4849
<class 'pandas.core.frame.DataFrame'>
4950
RangeIndex: 506 entries, 0 to 505
50-
Data columns (total 16 columns):
51+
Data columns (total 18 columns):
5152
CHAS_0 506 non-null int64
53+
CHAS_1 506 non-null int64
5254
RAD_0 506 non-null int64
5355
RAD_1 506 non-null int64
5456
RAD_2 506 non-null int64
5557
RAD_3 506 non-null int64
58+
RAD_4 506 non-null int64
5659
CRIM 506 non-null float64
5760
ZN 506 non-null float64
5861
INDUS 506 non-null float64
@@ -64,8 +67,8 @@ class BaseNEncoder(BaseEstimator, TransformerMixin):
6467
PTRATIO 506 non-null float64
6568
B 506 non-null float64
6669
LSTAT 506 non-null float64
67-
dtypes: float64(11), int64(5)
68-
memory usage: 63.3 KB
70+
dtypes: float64(11), int64(7)
71+
memory usage: 71.2 KB
6972
None
7073
7174
"""
@@ -135,7 +138,8 @@ def fit(self, X, y=None, **kwargs):
135138
if self.drop_invariant:
136139
self.drop_cols = []
137140
X_temp = self.transform(X)
138-
self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]
141+
generated_cols = get_generated_cols(X, X_temp, self.cols)
142+
self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
139143

140144
return self
141145

@@ -168,25 +172,23 @@ def transform(self, X, override_return_df=False):
168172
if not self.cols:
169173
return X
170174

171-
original_cols = set(X.columns)
172-
X = self.ordinal_encoder.transform(X)
173-
X = self.basen_encode(X, cols=self.cols)
175+
X_out = self.ordinal_encoder.transform(X)
176+
X_out = self.basen_encode(X_out, cols=self.cols)
174177

175178
if self.drop_invariant:
176179
for col in self.drop_cols:
177-
X.drop(col, 1, inplace=True)
180+
X_out.drop(col, 1, inplace=True)
178181

179182
# impute missing values only in the generated columns
180-
current_cols = set(X.columns)
181-
fillna_cols = list(current_cols - (original_cols - set(self.cols)))
182-
X[fillna_cols] = X[fillna_cols].fillna(value=0.0)
183+
generated_cols = get_generated_cols(X, X_out, self.cols)
184+
X_out[generated_cols] = X_out[generated_cols].fillna(value=0.0)
183185

184186
if self.return_df or override_return_df:
185-
return X
187+
return X_out
186188
else:
187-
return X.values
189+
return X_out.values
188190

189-
def inverse_transform(self, Xt):
191+
def inverse_transform(self, X_in):
190192
"""
191193
Perform the inverse transformation to encoded data.
192194
@@ -201,15 +203,15 @@ def inverse_transform(self, Xt):
201203
"""
202204

203205
warnings.warn('Inverse transform in basen is a currently experimental feature, please be careful')
204-
X = Xt.copy(deep=True)
206+
X = X_in.copy(deep=True)
205207

206208
# first check the type
207209
X = convert_input(X)
208210

209211
if self._dim is None:
210212
raise ValueError('Must train encoder before it can be used to inverse_transform data')
211213

212-
X = self.basen_to_interger(X, self.cols, self.base)
214+
X = self.basen_to_integer(X, self.cols, self.base)
213215

214216
# then make sure that it is the right size
215217
if X.shape[1] != self._dim:
@@ -284,7 +286,7 @@ def basen_encode(self, X_in, cols=None):
284286

285287
return X
286288

287-
def basen_to_interger(self, X, cols, base):
289+
def basen_to_integer(self, X, cols, base):
288290
"""
289291
Convert base-N codes to integers.
290292
@@ -304,7 +306,7 @@ def basen_to_interger(self, X, cols, base):
304306
out_cols = X.columns.values
305307

306308
for col in cols:
307-
col_list = [col0 for col0 in out_cols if str(col0).startswith(col)]
309+
col_list = [col0 for col0 in out_cols if str(col0).startswith(str(col))]
308310
for col0 in col_list:
309311
if any(X[col0].isnull()):
310312
raise ValueError("inverse_transform is not supported because transform impute"

category_encoders/binary.py

Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import numpy as np
66
from sklearn.base import BaseEstimator, TransformerMixin
77
from category_encoders.ordinal import OrdinalEncoder
8-
from category_encoders.utils import get_obj_cols, convert_input
8+
from category_encoders.utils import get_obj_cols, convert_input, get_generated_cols
99

1010
__author__ = 'willmcginnis'
1111

@@ -19,11 +19,11 @@ class BinaryEncoder(BaseEstimator, TransformerMixin):
1919
verbose: int
2020
integer indicating verbosity of output. 0 for none.
2121
cols: list
22-
a list of columns to encode, if None, all string columns will be encoded
22+
a list of columns to encode, if None, all string columns will be encoded.
2323
drop_invariant: bool
24-
boolean for whether or not to drop columns with 0 variance
24+
boolean for whether or not to drop columns with 0 variance.
2525
return_df: bool
26-
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array)
26+
boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array).
2727
impute_missing: bool
2828
boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future.
2929
handle_unknown: str
@@ -33,24 +33,25 @@ class BinaryEncoder(BaseEstimator, TransformerMixin):
3333
3434
Example
3535
-------
36-
>>>from category_encoders import *
37-
>>>import pandas as pd
38-
>>>from sklearn.datasets import load_boston
39-
>>>bunch = load_boston()
40-
>>>y = bunch.target
41-
>>>X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
42-
>>>enc = BinaryEncoder(cols=['CHAS', 'RAD']).fit(X, y)
43-
>>>numeric_dataset = enc.transform(X)
44-
>>>print(numeric_dataset.info())
45-
36+
>>> from category_encoders import *
37+
>>> import pandas as pd
38+
>>> from sklearn.datasets import load_boston
39+
>>> bunch = load_boston()
40+
>>> y = bunch.target
41+
>>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names)
42+
>>> enc = BinaryEncoder(cols=['CHAS', 'RAD']).fit(X, y)
43+
>>> numeric_dataset = enc.transform(X)
44+
>>> print(numeric_dataset.info())
4645
<class 'pandas.core.frame.DataFrame'>
4746
RangeIndex: 506 entries, 0 to 505
48-
Data columns (total 16 columns):
47+
Data columns (total 18 columns):
4948
CHAS_0 506 non-null int64
49+
CHAS_1 506 non-null int64
5050
RAD_0 506 non-null int64
5151
RAD_1 506 non-null int64
5252
RAD_2 506 non-null int64
5353
RAD_3 506 non-null int64
54+
RAD_4 506 non-null int64
5455
CRIM 506 non-null float64
5556
ZN 506 non-null float64
5657
INDUS 506 non-null float64
@@ -62,8 +63,8 @@ class BinaryEncoder(BaseEstimator, TransformerMixin):
6263
PTRATIO 506 non-null float64
6364
B 506 non-null float64
6465
LSTAT 506 non-null float64
65-
dtypes: float64(11), int64(5)
66-
memory usage: 63.3 KB
66+
dtypes: float64(11), int64(7)
67+
memory usage: 71.2 KB
6768
None
6869
6970
"""
@@ -128,7 +129,8 @@ def fit(self, X, y=None, **kwargs):
128129
if self.drop_invariant:
129130
self.drop_cols = []
130131
X_temp = self.transform(X)
131-
self.drop_cols = [x for x in X_temp.columns.values if X_temp[x].var() <= 10e-5]
132+
generated_cols = get_generated_cols(X, X_temp, self.cols)
133+
self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5]
132134

133135
return self
134136

@@ -174,7 +176,7 @@ def transform(self, X):
174176
else:
175177
return X.values
176178

177-
def inverse_transform(self, Xt):
179+
def inverse_transform(self, X_in):
178180
"""
179181
Perform the inverse transformation to encoded data.
180182
@@ -187,15 +189,15 @@ def inverse_transform(self, Xt):
187189
p: array, the same size as X_in
188190
189191
"""
190-
X = Xt.copy(deep=True)
192+
X = X_in.copy(deep=True)
191193

192194
# first check the type
193195
X = convert_input(X)
194196

195197
if self._dim is None:
196198
raise ValueError('Must train encoder before it can be used to inverse_transform data')
197199

198-
X = self.binery_to_interger(X, self.cols)
200+
X = self.binary_to_integer(X, self.cols)
199201

200202
# then make sure that it is the right size
201203
if X.shape[1] != self._dim:
@@ -266,7 +268,7 @@ def binary(self, X_in, cols=None):
266268

267269
return X
268270

269-
def binery_to_interger(self, X, cols):
271+
def binary_to_integer(self, X, cols):
270272
"""
271273
Convert binary codes to integers.
272274
@@ -284,7 +286,7 @@ def binery_to_interger(self, X, cols):
284286
out_cols = X.columns.values
285287

286288
for col in cols:
287-
col_list = [col0 for col0 in out_cols if col0.startswith(col)]
289+
col_list = [col0 for col0 in out_cols if str(col0).startswith(str(col))]
288290
for col0 in col_list:
289291
if any(X[col0].isnull()):
290292
raise ValueError("inverse_transform is not supported because transform impute "

0 commit comments

Comments
 (0)