Merge pull request #102 from janmotl/issue_96_and_97

wdm0006 · web-flow · commit fc3bb1894fba · 2018-08-10T20:52:49.000-04:00
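Issue 96 and 97: impute missing values only in the generated columns; make the reverse-encoding helpers (`basen_to_interger`, `reverse_dummies`) tolerate underscores in column names and non-string output column labels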
diff --git a/category_encoders/basen.py b/category_encoders/basen.py
@@ -168,14 +168,19 @@ def transform(self, X, override_return_df=False):
         if not self.cols:
             return X
 
+        original_cols = set(X.columns)
         X = self.ordinal_encoder.transform(X)
         X = self.basen_encode(X, cols=self.cols)
 
         if self.drop_invariant:
             for col in self.drop_cols:
                 X.drop(col, 1, inplace=True)
 
-        X.fillna(0.0, inplace=True)
+        # impute missing values only in the generated columns
+        current_cols = set(X.columns)
+        fillna_cols = list(current_cols - (original_cols - set(self.cols)))
+        X[fillna_cols] = X[fillna_cols].fillna(value=0.0)
+
         if self.return_df or override_return_df:
             return X
         else:
@@ -299,13 +304,13 @@ def basen_to_interger(self, X, cols, base):
         out_cols = X.columns.values
 
         for col in cols:
-            col_list = [col0 for col0 in out_cols if col0.startswith(col)]
+            col_list = [col0 for col0 in out_cols if str(col0).startswith(col)]
             for col0 in col_list:
                 if any(X[col0].isnull()):
                     raise ValueError("inverse_transform is not supported because transform impute"
                                      "the unknown category -1 when encode %s" % (col,))
             if base == 1:
-                value_array = np.array([int(col0.split('_')[1]) for col0 in col_list])
+                value_array = np.array([int(col0.split('_')[-1]) for col0 in col_list])
             else:
                 len0 = len(col_list)
                 value_array = np.array([base ** (len0 - 1 - i) for i in range(len0)])
diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py
@@ -297,14 +297,15 @@ def reverse_dummies(self, X, cols):
         out_cols = X.columns.values
 
         for col in cols:
-            col_list = [col0 for col0 in out_cols if col0.startswith(col)]
+            col_list = [col0 for col0 in out_cols if str(col0).startswith(col)]
+            prefix_length = len(col)+1 # original column name plus underscore
             if self.use_cat_names:
                 X[col] = 0
                 for tran_col in col_list:
-                    val = tran_col.split('_')[1]
+                    val = tran_col[prefix_length:]
                     X.loc[X[tran_col] == 1, col] = val
             else:
-                value_array = np.array([int(col0.split('_')[1]) for col0 in col_list])
+                value_array = np.array([int(col0[prefix_length:]) for col0 in col_list])
                 X[col] = np.dot(X[col_list].values, value_array.T)
             out_cols = [col0 for col0 in out_cols if col0 not in col_list]