Skip to content

Commit b1993e0

Browse files
authored
Merge pull request #119 from JohnnyC08/issue-106-retrain-leave-one-out
Issue 106 retrain leave one out
2 parents c438e89 + cee2813 commit b1993e0

File tree

2 files changed

+102
-66
lines changed

2 files changed

+102
-66
lines changed

category_encoders/leave_one_out.py

Lines changed: 68 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i
8080
self.drop_invariant = drop_invariant
8181
self.drop_cols = []
8282
self.verbose = verbose
83+
self.use_default_cols = cols is None # important when we call fit() repeatedly
8384
self.cols = cols
8485
self._dim = None
8586
self.mapping = None
@@ -122,12 +123,11 @@ def fit(self, X, y, **kwargs):
122123
self._dim = X.shape[1]
123124

124125
# if columns aren't passed, just use every string column
125-
if self.cols is None:
126+
if self.use_default_cols:
126127
self.cols = get_obj_cols(X)
127128

128-
_, categories = self.leave_one_out(
129+
categories = self.fit_leave_one_out(
129130
X, y,
130-
mapping=self.mapping,
131131
cols=self.cols,
132132
impute_missing=self.impute_missing,
133133
handle_unknown=self.handle_unknown
@@ -183,10 +183,9 @@ def transform(self, X, y=None):
183183

184184
if not self.cols:
185185
return X
186-
X, _ = self.leave_one_out(
186+
X = self.transform_leave_one_out(
187187
X, y,
188188
mapping=self.mapping,
189-
cols=self.cols,
190189
impute_missing=self.impute_missing,
191190
handle_unknown=self.handle_unknown
192191
)
@@ -209,74 +208,77 @@ def fit_transform(self, X, y=None, **fit_params):
209208
"""
210209
return self.fit(X, y, **fit_params).transform(X, y)
211210

212-
def leave_one_out(self, X_in, y, mapping=None, cols=None, impute_missing=True, handle_unknown='impute'):
213-
"""
214-
Leave one out encoding uses a single column of floats to represent the means of the target variables.
215-
"""
216-
211+
def fit_leave_one_out(self, X_in, y, cols=None, impute_missing=True, handle_unknown='impute'):
217212
X = X_in.copy(deep=True)
218213

219214
if cols is None:
220215
cols = X.columns.values
221216

222-
if mapping is not None:
223-
mapping_out = mapping
224-
random_state_ = check_random_state(self.random_state)
225-
for switch in mapping:
226-
X[str(switch.get('col')) + '_tmp'] = np.nan
227-
for val in switch.get('mapping'):
228-
if y is None:
229-
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = \
230-
switch.get('mapping')[val]['mean']
231-
elif switch.get('mapping')[val]['count'] == 1:
232-
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = self._mean
233-
else:
234-
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = (
235-
(switch.get('mapping')[val]['sum'] - y[(X[switch.get('col')] == val).values]) / (
236-
switch.get('mapping')[val]['count'] - 1)
237-
)
238-
del X[switch.get('col')]
239-
X.rename(columns={str(switch.get('col')) + '_tmp': switch.get('col')}, inplace=True)
240-
241-
if impute_missing:
242-
if handle_unknown == 'impute':
243-
X[switch.get('col')].fillna(self._mean, inplace=True)
244-
elif handle_unknown == 'error':
245-
missing = X[switch.get('col')].isnull()
246-
if any(missing):
247-
raise ValueError('Unexpected categories found in column %s' % switch.get('col'))
248-
249-
if self.randomized and y is not None:
250-
X[switch.get('col')] = (X[switch.get('col')] *
251-
random_state_.normal(1., self.sigma, X[switch.get('col')].shape[0]))
252-
253-
X[switch.get('col')] = X[switch.get('col')].astype(float).values.reshape(-1, )
254-
else:
255-
self._mean = y.mean()
256-
mapping_out = []
257-
258-
for col in cols:
259-
tmp = y.groupby(X[col]).agg(['sum', 'count'])
260-
tmp['mean'] = tmp['sum'] / tmp['count']
261-
tmp = tmp.to_dict(orient='index')
262-
263-
X[str(col) + '_tmp'] = np.nan
264-
for val in tmp:
265-
"""if the val only appear once ,encoder it as mean of y"""
266-
if tmp[val]['count'] == 1:
267-
X.loc[X[col] == val, str(col) + '_tmp'] = self._mean
268-
else:
269-
X.loc[X[col] == val, str(col) + '_tmp'] = (tmp[val]['sum'] - y.loc[X[col] == val]) / (
217+
self._mean = y.mean()
218+
mapping_out = []
219+
220+
for col in cols:
221+
tmp = y.groupby(X[col]).agg(['sum', 'count'])
222+
tmp['mean'] = tmp['sum'] / tmp['count']
223+
tmp = tmp.to_dict(orient='index')
224+
225+
X[str(col) + '_tmp'] = np.nan
226+
for val in tmp:
227+
"""if the val only appear once ,encoder it as mean of y"""
228+
if tmp[val]['count'] == 1:
229+
X.loc[X[col] == val, str(col) + '_tmp'] = self._mean
230+
else:
231+
X.loc[X[col] == val, str(col) + '_tmp'] = (tmp[val]['sum'] - y.loc[X[col] == val]) / (
270232
tmp[val]['count'] - 1)
271-
del X[col]
272-
X.rename(columns={str(col) + '_tmp': col}, inplace=True)
233+
del X[col]
234+
X.rename(columns={str(col) + '_tmp': col}, inplace=True)
273235

274-
if impute_missing:
275-
if handle_unknown == 'impute':
276-
X[col].fillna(self._mean, inplace=True)
236+
if impute_missing:
237+
if handle_unknown == 'impute':
238+
X[col].fillna(self._mean, inplace=True)
277239

278-
X[col] = X[col].astype(float).values.reshape(-1, )
240+
X[col] = X[col].astype(float).values.reshape(-1, )
279241

280-
mapping_out.append({'col': col, 'mapping': tmp}, )
242+
mapping_out.append({'col': col, 'mapping': tmp}, )
243+
244+
return mapping_out
245+
246+
def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'):
247+
"""
248+
Leave one out encoding uses a single column of floats to represent the means of the target variables.
249+
"""
250+
251+
X = X_in.copy(deep=True)
281252

282-
return X, mapping_out
253+
random_state_ = check_random_state(self.random_state)
254+
for switch in mapping:
255+
X[str(switch.get('col')) + '_tmp'] = np.nan
256+
for val in switch.get('mapping'):
257+
if y is None:
258+
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = \
259+
switch.get('mapping')[val]['mean']
260+
elif switch.get('mapping')[val]['count'] == 1:
261+
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = self._mean
262+
else:
263+
X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = (
264+
(switch.get('mapping')[val]['sum'] - y[(X[switch.get('col')] == val).values]) / (
265+
switch.get('mapping')[val]['count'] - 1)
266+
)
267+
del X[switch.get('col')]
268+
X.rename(columns={str(switch.get('col')) + '_tmp': switch.get('col')}, inplace=True)
269+
270+
if impute_missing:
271+
if handle_unknown == 'impute':
272+
X[switch.get('col')].fillna(self._mean, inplace=True)
273+
elif handle_unknown == 'error':
274+
missing = X[switch.get('col')].isnull()
275+
if any(missing):
276+
raise ValueError('Unexpected categories found in column %s' % switch.get('col'))
277+
278+
if self.randomized and y is not None:
279+
X[switch.get('col')] = (X[switch.get('col')] *
280+
random_state_.normal(1., self.sigma, X[switch.get('col')].shape[0]))
281+
282+
X[switch.get('col')] = X[switch.get('col')].astype(float).values.reshape(-1, )
283+
284+
return X

category_encoders/tests/test_encoders.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,40 @@ def test_leave_one_out(self):
575575
self.verify_numeric(enc.transform(X_t))
576576
self.verify_numeric(enc.transform(X_t, y_t))
577577

578+
def test_fit_CallTwiceOnDifferentData_ExpectRefitMapping(self):
579+
x_a = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_a'])
580+
x_b = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b']) # Different column name
581+
y_dummy = [True, False, True, False, True, False]
582+
encoder = encoders.LeaveOneOutEncoder()
583+
584+
encoder.fit(x_a, y_dummy)
585+
encoder.fit(x_b, y_dummy)
586+
mapping = encoder.mapping
587+
588+
self.assertEqual(1, len(mapping))
589+
col_b_mapping = mapping[0]
590+
self.assertEqual('col_b', col_b_mapping['col'])
591+
self.assertEqual({'sum': 2.0, 'count': 3, 'mean': 2.0/3.0}, col_b_mapping['mapping']['1'])
592+
self.assertEqual({'sum': 1.0, 'count': 3, 'mean': 01.0/3.0}, col_b_mapping['mapping']['2'])
593+
594+
def test_transform_CallTwiceOnDifferentData_ExpecCorrectTransformation(self):
595+
x_a = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_a'])
596+
x_b = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b']) # Different column name
597+
y_dummy = [True, False, True, False, True, False]
598+
encoder = encoders.LeaveOneOutEncoder()
599+
600+
encoder.fit(x_a, y_dummy)
601+
encoder.fit(x_b, y_dummy)
602+
result = encoder.transform(x_b)['col_b'].values
603+
604+
self.assertEqual(6, len(result))
605+
self.assertEqual(2.0/3.0, result[0])
606+
self.assertEqual(2.0 / 3.0, result[1])
607+
self.assertEqual(2.0 / 3.0, result[2])
608+
self.assertEqual(1.0 / 3.0, result[3])
609+
self.assertEqual(1.0 / 3.0, result[4])
610+
self.assertEqual(1.0 / 3.0, result[5])
611+
578612
def test_target_encode_np(self):
579613
"""
580614

0 commit comments

Comments
 (0)