Skip to content

Commit f807642

Browse files
authored
Merge pull request #113 from JohnnyC08/issue-109-smoothing-min-sample
Issue 109 smoothing min sample
2 parents fc3bb18 + 2dcc9b4 commit f807642

File tree

2 files changed

+44
-5
lines changed

2 files changed

+44
-5
lines changed

category_encoders/target_encoder.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -80,7 +80,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i
8080
self.verbose = verbose
8181
self.cols = cols
8282
self.min_samples_leaf = min_samples_leaf
83-
self.smoothing = smoothing
83+
# Make smoothing a float so that python 2 does not treat as integer division
84+
self.smoothing = float(smoothing)
8485
self._dim = None
8586
self.mapping = None
8687
self.impute_missing = impute_missing
@@ -119,7 +120,9 @@ def fit(self, X, y, **kwargs):
119120
mapping=self.mapping,
120121
cols=self.cols,
121122
impute_missing=self.impute_missing,
122-
handle_unknown=self.handle_unknown
123+
handle_unknown=self.handle_unknown,
124+
smoothing_in=self.smoothing,
125+
min_samples_leaf=self.min_samples_leaf
123126
)
124127
self.mapping = categories
125128

@@ -163,8 +166,6 @@ def transform(self, X, y=None):
163166
cols=self.cols,
164167
impute_missing=self.impute_missing,
165168
handle_unknown=self.handle_unknown,
166-
min_samples_leaf=self.min_samples_leaf,
167-
smoothing_in=self.smoothing
168169
)
169170

170171
if self.drop_invariant:
@@ -215,7 +216,6 @@ def target_encode(self, X_in, y, mapping=None, cols=None, impute_missing=True,
215216

216217
X[str(col) + '_tmp'] = np.nan
217218
for val in tmp:
218-
tmp[val]['mean'] = tmp[val]['sum']/tmp[val]['count']
219219
if tmp[val]['count'] == 1:
220220
X.loc[X[col] == val, str(col) + '_tmp'] = self._mean
221221
else:

category_encoders/tests/test_encoders.py

Lines changed: 39 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -627,3 +627,42 @@ def test_target_encode_out(self):
627627
enc.fit(X, y)
628628
self.verify_numeric(enc.transform(X_t))
629629
self.verify_numeric(enc.transform(X_t, y_t))
630+
631+
def test_fit_HaveConstructorSetSmoothingAndMinSamplesLeaf_ExpectUsedInFit(self):
632+
"""
633+
634+
:return:
635+
"""
636+
k = 2
637+
f = 10
638+
binary_cat_example = pd.DataFrame(
639+
{'Trend': ['UP', 'UP', 'DOWN', 'FLAT', 'DOWN', 'UP', 'DOWN', 'FLAT', 'FLAT', 'FLAT'],
640+
'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
641+
encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f)
642+
643+
encoder.fit(binary_cat_example, binary_cat_example['target'])
644+
trend_mapping = encoder.mapping[0]['mapping']
645+
646+
self.assertAlmostEquals(0.4125, trend_mapping['DOWN']['smoothing'], delta=1e-4)
647+
self.assertEqual(0.5, trend_mapping['FLAT']['smoothing'])
648+
self.assertAlmostEquals(0.5874, trend_mapping['UP']['smoothing'], delta=1e-4)
649+
650+
def test_fit_transform_HaveConstructorSetSmoothingAndMinSamplesLeaf_ExpectCorrectValueInResult(self):
651+
"""
652+
653+
:return:
654+
"""
655+
k = 2
656+
f = 10
657+
binary_cat_example = pd.DataFrame(
658+
{'Trend': ['UP', 'UP', 'DOWN', 'FLAT', 'DOWN', 'UP', 'DOWN', 'FLAT', 'FLAT', 'FLAT'],
659+
'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]})
660+
encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f)
661+
662+
result = encoder.fit_transform(binary_cat_example, binary_cat_example['target'])
663+
values = result['Trend'].values
664+
665+
self.assertAlmostEquals(0.5874, values[0], delta=1e-4)
666+
self.assertAlmostEquals(0.5874, values[1], delta=1e-4)
667+
self.assertAlmostEquals(0.4125, values[2], delta=1e-4)
668+
self.assertEqual(0.5, values[3])

0 commit comments

Comments
 (0)