Merge pull request #119 from JohnnyC08/issue-106-retrain-leave-one-out

janmotl · web-flow · commit b1993e09ce8b · 2018-09-08T14:11:43.000+02:00
Issue 106: retrain leave-one-out encoder when fit() is called repeatedly
diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py
@@ -80,6 +80,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i
         self.drop_invariant = drop_invariant
         self.drop_cols = []
         self.verbose = verbose
+        self.use_default_cols = cols is None  # remember that cols was not given, so every fit() call re-detects the columns
         self.cols = cols
         self._dim = None
         self.mapping = None
@@ -122,12 +123,11 @@ def fit(self, X, y, **kwargs):
         self._dim = X.shape[1]
 
         # if columns aren't passed, just use every string column
-        if self.cols is None:
+        if self.use_default_cols:
             self.cols = get_obj_cols(X)
 
-        _, categories = self.leave_one_out(
+        categories = self.fit_leave_one_out(
             X, y,
-            mapping=self.mapping,
             cols=self.cols,
             impute_missing=self.impute_missing,
             handle_unknown=self.handle_unknown
@@ -183,10 +183,9 @@ def transform(self, X, y=None):
 
         if not self.cols:
             return X
-        X, _ = self.leave_one_out(
+        X = self.transform_leave_one_out(
             X, y,
             mapping=self.mapping,
-            cols=self.cols,
             impute_missing=self.impute_missing,
             handle_unknown=self.handle_unknown
         )
@@ -209,74 +208,77 @@ def fit_transform(self, X, y=None, **fit_params):
         """
         return self.fit(X, y, **fit_params).transform(X, y)
 
-    def leave_one_out(self, X_in, y, mapping=None, cols=None, impute_missing=True, handle_unknown='impute'):
-        """
-        Leave one out encoding uses a single column of floats to represent the means of the target variables.
-        """
-
+    def fit_leave_one_out(self, X_in, y, cols=None, impute_missing=True, handle_unknown='impute'):
         X = X_in.copy(deep=True)
 
         if cols is None:
             cols = X.columns.values
 
-        if mapping is not None:
-            mapping_out = mapping
-            random_state_ = check_random_state(self.random_state)
-            for switch in mapping:
-                X[str(switch.get('col')) + '_tmp'] = np.nan
-                for val in switch.get('mapping'):
-                    if y is None:
-                        X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = \
-                            switch.get('mapping')[val]['mean']
-                    elif switch.get('mapping')[val]['count'] == 1:
-                        X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = self._mean
-                    else:
-                        X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = (
-                            (switch.get('mapping')[val]['sum'] - y[(X[switch.get('col')] == val).values]) / (
-                                switch.get('mapping')[val]['count'] - 1)
-                        )
-                del X[switch.get('col')]
-                X.rename(columns={str(switch.get('col')) + '_tmp': switch.get('col')}, inplace=True)
-
-                if impute_missing:
-                    if handle_unknown == 'impute':
-                        X[switch.get('col')].fillna(self._mean, inplace=True)
-                    elif handle_unknown == 'error':
-                        missing = X[switch.get('col')].isnull()
-                        if any(missing):
-                            raise ValueError('Unexpected categories found in column %s' % switch.get('col'))
-
-                if self.randomized and y is not None:
-                    X[switch.get('col')] = (X[switch.get('col')] *
-                                            random_state_.normal(1., self.sigma, X[switch.get('col')].shape[0]))
-
-                X[switch.get('col')] = X[switch.get('col')].astype(float).values.reshape(-1, )
-        else:
-            self._mean = y.mean()
-            mapping_out = []
-
-            for col in cols:
-                tmp = y.groupby(X[col]).agg(['sum', 'count'])
-                tmp['mean'] = tmp['sum'] / tmp['count']
-                tmp = tmp.to_dict(orient='index')
-
-                X[str(col) + '_tmp'] = np.nan
-                for val in tmp:
-                    """if the val only appear once ,encoder it as mean of y"""
-                    if tmp[val]['count'] == 1:
-                        X.loc[X[col] == val, str(col) + '_tmp'] = self._mean
-                    else:
-                        X.loc[X[col] == val, str(col) + '_tmp'] = (tmp[val]['sum'] - y.loc[X[col] == val]) / (
+        self._mean = y.mean()
+        mapping_out = []
+
+        for col in cols:
+            tmp = y.groupby(X[col]).agg(['sum', 'count'])
+            tmp['mean'] = tmp['sum'] / tmp['count']
+            tmp = tmp.to_dict(orient='index')
+
+            X[str(col) + '_tmp'] = np.nan
+            for val in tmp:
+                """if the value appears only once, encode it as the mean of y"""
+                if tmp[val]['count'] == 1:
+                    X.loc[X[col] == val, str(col) + '_tmp'] = self._mean
+                else:
+                    X.loc[X[col] == val, str(col) + '_tmp'] = (tmp[val]['sum'] - y.loc[X[col] == val]) / (
                             tmp[val]['count'] - 1)
-                del X[col]
-                X.rename(columns={str(col) + '_tmp': col}, inplace=True)
+            del X[col]
+            X.rename(columns={str(col) + '_tmp': col}, inplace=True)
 
-                if impute_missing:
-                    if handle_unknown == 'impute':
-                        X[col].fillna(self._mean, inplace=True)
+            if impute_missing:
+                if handle_unknown == 'impute':
+                    X[col].fillna(self._mean, inplace=True)
 
-                X[col] = X[col].astype(float).values.reshape(-1, )
+            X[col] = X[col].astype(float).values.reshape(-1, )
 
-                mapping_out.append({'col': col, 'mapping': tmp}, )
+            mapping_out.append({'col': col, 'mapping': tmp}, )
+
+        return mapping_out
+
+    def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'):
+        """
+        Leave one out encoding uses a single column of floats to represent the means of the target variables.
+        """
+
+        X = X_in.copy(deep=True)
 
-        return X, mapping_out
+        random_state_ = check_random_state(self.random_state)
+        for switch in mapping:
+            X[str(switch.get('col')) + '_tmp'] = np.nan
+            for val in switch.get('mapping'):
+                if y is None:
+                    X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = \
+                        switch.get('mapping')[val]['mean']
+                elif switch.get('mapping')[val]['count'] == 1:
+                    X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = self._mean
+                else:
+                    X.loc[X[switch.get('col')] == val, str(switch.get('col')) + '_tmp'] = (
+                        (switch.get('mapping')[val]['sum'] - y[(X[switch.get('col')] == val).values]) / (
+                            switch.get('mapping')[val]['count'] - 1)
+                    )
+            del X[switch.get('col')]
+            X.rename(columns={str(switch.get('col')) + '_tmp': switch.get('col')}, inplace=True)
+
+            if impute_missing:
+                if handle_unknown == 'impute':
+                    X[switch.get('col')].fillna(self._mean, inplace=True)
+                elif handle_unknown == 'error':
+                    missing = X[switch.get('col')].isnull()
+                    if any(missing):
+                        raise ValueError('Unexpected categories found in column %s' % switch.get('col'))
+
+            if self.randomized and y is not None:
+                X[switch.get('col')] = (X[switch.get('col')] *
+                                        random_state_.normal(1., self.sigma, X[switch.get('col')].shape[0]))
+
+            X[switch.get('col')] = X[switch.get('col')].astype(float).values.reshape(-1, )
+
+        return X
diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py
@@ -575,6 +575,40 @@ def test_leave_one_out(self):
         self.verify_numeric(enc.transform(X_t))
         self.verify_numeric(enc.transform(X_t, y_t))
 
+    def test_fit_CallTwiceOnDifferentData_ExpectRefitMapping(self):
+        x_a = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_a'])
+        x_b = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b'])  # Different column name
+        y_dummy = [True, False, True, False, True, False]
+        encoder = encoders.LeaveOneOutEncoder()
+
+        encoder.fit(x_a, y_dummy)
+        encoder.fit(x_b, y_dummy)
+        mapping = encoder.mapping
+
+        self.assertEqual(1, len(mapping))
+        col_b_mapping = mapping[0]
+        self.assertEqual('col_b', col_b_mapping['col'])
+        self.assertEqual({'sum': 2.0, 'count': 3, 'mean': 2.0/3.0}, col_b_mapping['mapping']['1'])
+        self.assertEqual({'sum': 1.0, 'count': 3, 'mean': 01.0/3.0}, col_b_mapping['mapping']['2'])
+
+    def test_transform_CallTwiceOnDifferentData_ExpecCorrectTransformation(self):
+        x_a = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_a'])
+        x_b = pd.DataFrame(data=['1', '1', '1', '2', '2', '2'], columns=['col_b'])  # Different column name
+        y_dummy = [True, False, True, False, True, False]
+        encoder = encoders.LeaveOneOutEncoder()
+
+        encoder.fit(x_a, y_dummy)
+        encoder.fit(x_b, y_dummy)
+        result = encoder.transform(x_b)['col_b'].values
+
+        self.assertEqual(6, len(result))
+        self.assertEqual(2.0/3.0, result[0])
+        self.assertEqual(2.0 / 3.0, result[1])
+        self.assertEqual(2.0 / 3.0, result[2])
+        self.assertEqual(1.0 / 3.0, result[3])
+        self.assertEqual(1.0 / 3.0, result[4])
+        self.assertEqual(1.0 / 3.0, result[5])
+
     def test_target_encode_np(self):
         """