From 9efba105ea76b8a40e7215018999f16b0e03ac9d Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sun, 28 Oct 2018 21:19:28 -0600 Subject: [PATCH 01/40] Add tests and logic to ensure ordinal encoder supports the handle unknown return nan for transform and inverse transform --- category_encoders/ordinal.py | 7 +++++- category_encoders/tests/test_encoders.py | 29 +++++++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index 8be33e6f..2fee7901 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -1,7 +1,6 @@ """Ordinal or label encoding""" import pandas as pd -import numpy as np from sklearn.base import BaseEstimator, TransformerMixin import category_encoders.utils as util @@ -229,6 +228,12 @@ def inverse_transform(self, X_in): raise ValueError("inverse_transform is not supported because transform impute " "the unknown category -1 when encode %s" % (col,)) + if self.handle_unknown == 'return_nan': + for col in self.cols: + if X[col].isnull().any(): + raise ValueError("inverse_transform is not supported because transform impute " + "the unknown category nan when encode %s" % (col,)) + for switch in self.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py index c251adf4..b3d54540 100644 --- a/category_encoders/tests/test_encoders.py +++ b/category_encoders/tests/test_encoders.py @@ -128,6 +128,34 @@ def test_handle_unknown_error(self): with self.assertRaises(ValueError): _ = enc.transform(X_t) + def test_handle_unknown_return_nan(self): + train = pd.DataFrame({'city': ['chicago', 'los angeles']}) + test = pd.DataFrame({'city': ['chicago', 'denver']}) + y = pd.Series([1, 0]) + + # TODO - implement for all encoders + for encoder_name in ['OrdinalEncoder']: + with 
self.subTest(encoder_name=encoder_name): + + enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') + enc.fit(train, y) + result = enc.transform(test) + self.assertTrue(result.iloc[1, :].isnull().all()) + + def test_inverse_transform_handle_unknown_return_nan_expect_value_error(self): + train = pd.DataFrame({'city': ['chicago', 'los angeles']}) + test = pd.DataFrame({'city': ['chicago', 'denver']}) + y = pd.Series([1, 0]) + + # TODO - implement for all encoders supporting inverse transform + for encoder_name in ['OrdinalEncoder']: + with self.subTest(encoder_name=encoder_name): + enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') + enc.fit(train, y) + result = enc.transform(test) + with self.assertRaises(ValueError): + _ = enc.inverse_transform(result) + def test_sklearn_compliance(self): for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): @@ -285,4 +313,3 @@ def test_string_index(self): enc = getattr(encoders, encoder_name)(cols=['CHAS', 'RAD']) result = enc.fit_transform(X, y) self.assertFalse(result.isnull().values.any(), 'There should not be any missing value!') - From 8cac21ac79fb1b3f4244ca91dcea1f8392882fa9 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sun, 28 Oct 2018 21:35:13 -0600 Subject: [PATCH 02/40] Make all encoders support value over impute --- category_encoders/backward_difference.py | 6 +++--- category_encoders/basen.py | 8 ++++---- category_encoders/binary.py | 8 ++++---- category_encoders/helmert.py | 8 +++++--- category_encoders/leave_one_out.py | 10 ++++------ category_encoders/one_hot.py | 10 +++++----- category_encoders/ordinal.py | 10 +++++----- category_encoders/polynomial.py | 6 +++--- category_encoders/sum_coding.py | 6 +++--- category_encoders/target_encoder.py | 10 ++++------ category_encoders/tests/test_encoders.py | 14 ++++++++++++++ category_encoders/woe.py | 6 +++--- 12 files changed, 57 insertions(+), 45 deletions(-) diff --git a/category_encoders/backward_difference.py 
b/category_encoders/backward_difference.py index 53cd11b5..68aa5c9b 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -27,8 +27,8 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes + options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can causes unexpected changes in dimension in some cases. Example @@ -82,7 +82,7 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] diff --git a/category_encoders/basen.py b/category_encoders/basen.py index 04781f55..8d6847e3 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -32,8 +32,8 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. 
This can causes + options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can causes unexpected changes in dimension in some cases. Example @@ -75,7 +75,7 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, base=2, impute_missing=True, - handle_unknown='impute'): + handle_unknown='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -227,7 +227,7 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'impute': + if self.impute_missing and self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): raise ValueError("inverse_transform is not supported because transform impute " diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 24ce7c8d..7f068597 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -27,8 +27,8 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes + options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can causes unexpected changes in dimension in some cases. 
Example @@ -70,7 +70,7 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute'): + handle_unknown='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -212,7 +212,7 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'impute': + if self.impute_missing and self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): raise ValueError("inverse_transform is not supported because transform impute " diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index 5787aeca..d90efbde 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -28,8 +28,8 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes + options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can causes unexpected changes in dimension in some cases. 
Example @@ -82,7 +82,7 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -223,6 +223,8 @@ def helmert_coding(X_in, mapping): for i in range(len(mod.columns)): c = mod.columns[i] new_col = str(col) + '_%d' % (i, ) + # TODO Use https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike to + # set new values X[new_col] = mod[c].loc[X[col]].values new_columns.append(new_col) old_column_index = cols.index(col) diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index af902db4..14510d84 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -29,9 +29,7 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can causes - unexpected changes in dimension in some cases. + options are 'error', 'ignore' and 'value', defaults to 'value', which will impute the target mean. sigma: float adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). sigma gives the standard deviation (spread or "width") of the normal distribution. 
@@ -75,7 +73,7 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute', random_state=None, sigma=None): + handle_unknown='value', random_state=None, sigma=None): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -216,7 +214,7 @@ def fit_leave_one_out(self, X_in, y, cols=None): self._mean = y.mean() return {col: y.groupby(X[col]).agg(['sum', 'count']) for col in cols} - def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='impute'): + def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='value'): """ Leave one out encoding uses a single column of floats to represent the means of the target variables. """ @@ -237,7 +235,7 @@ def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, ha X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean) if impute_missing: - if handle_unknown == 'impute': + if handle_unknown == 'value': X[col].fillna(self._mean, inplace=True) elif handle_unknown == 'error': if X[col].isnull().any(): diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index 60d0ff6e..03bc17c9 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -26,8 +26,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause + options are 'error', 'ignore' and 'value', defaults to 'value'. 
Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. use_cat_names: bool if True, category values will be included in the encoded column names. Since this can result into duplicate column names, duplicates are suffixed with '#' symbol until a unique name is generated. @@ -86,7 +86,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute', use_cat_names=False): + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value', use_cat_names=False): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -159,7 +159,7 @@ def generate_mapping(self): col = switch.get('col') column_mapping = switch.get('mapping').copy(deep=True) - if self.handle_unknown == 'impute': + if self.handle_unknown == 'value': column_mapping = column_mapping.append(pd.Series(data=[-1], index=['-1'])) col_mappings = [] @@ -252,7 +252,7 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'impute': + if self.impute_missing and self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): raise ValueError("inverse_transform is not supported because transform impute " diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index 2fee7901..6b2b2ec2 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -34,7 +34,7 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. 
+ options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the category -1. Example ------- @@ -80,7 +80,7 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): """ def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute'): + handle_unknown='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -222,7 +222,7 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'impute': + if self.impute_missing and self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): raise ValueError("inverse_transform is not supported because transform impute " @@ -242,7 +242,7 @@ def inverse_transform(self, X_in): return X if self.return_df else X.values @staticmethod - def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_unknown='impute'): + def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_unknown='value'): """ Ordinal encoding uses a single column of integers to represent the classes. An optional mapping dict can be passed in, in this case we use the knowledge that there is some true order to the classes themselves. 
Otherwise, the classes @@ -266,7 +266,7 @@ def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_ X[column] = X[column].astype(float) if impute_missing: - if handle_unknown == 'impute': + if handle_unknown == 'value': X[column].fillna(0, inplace=True) elif handle_unknown == 'error': missing = X[column].isnull() diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index f7b4611e..81280d7b 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -27,8 +27,8 @@ class PolynomialEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause + options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. 
Example @@ -81,7 +81,7 @@ class PolynomialEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index 0ba3993c..4263db7b 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -27,8 +27,8 @@ class SumEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause + options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. 
Example @@ -81,7 +81,7 @@ class SumEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='impute'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index c1b3828b..8afcbae0 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -9,7 +9,7 @@ class TargetEncoder(BaseEstimator, TransformerMixin): def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute', min_samples_leaf=1, smoothing=1.0): + handle_unknown='value', min_samples_leaf=1, smoothing=1.0): """Target encoding for categorical features. For the case of categorical target: features are replaced with a blend of posterior probability of the target given particular categorical value and prior probability of the target over all the training data. @@ -30,9 +30,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'impute', defaults to 'impute', which will impute the category -1. Warning: if - impute is used, an extra column will be added in if the transform matrix has unknown categories. This can cause - unexpected changes in the dimension in some cases. + options are 'error', 'ignore' and 'value', defaults to 'valie', which will impute the target mean. min_samples_leaf: int minimum samples to take category average into account. 
smoothing: float @@ -202,7 +200,7 @@ def fit_transform(self, X, y=None, **fit_params): """ return self.fit(X, y, **fit_params).transform(X, y) - def target_encode(self, X_in, y, mapping=None, cols=None, impute_missing=True, handle_unknown='impute', min_samples_leaf=1, smoothing_in=1.0): + def target_encode(self, X_in, y, mapping=None, cols=None, impute_missing=True, handle_unknown='value', min_samples_leaf=1, smoothing_in=1.0): X = X_in.copy(deep=True) if cols is None: cols = X.columns.values @@ -211,7 +209,7 @@ def target_encode(self, X_in, y, mapping=None, cols=None, impute_missing=True, h for col in cols: X[col] = X[col].map(mapping[col]) if impute_missing: - if handle_unknown == 'impute': + if handle_unknown == 'value': X[col].fillna(self._mean, inplace=True) elif handle_unknown == 'error': if X[col].isnull().any(): diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py index b3d54540..9b94f23c 100644 --- a/category_encoders/tests/test_encoders.py +++ b/category_encoders/tests/test_encoders.py @@ -142,6 +142,20 @@ def test_handle_unknown_return_nan(self): result = enc.transform(test) self.assertTrue(result.iloc[1, :].isnull().all()) + def test_handle_unknown_value(self): + train = pd.DataFrame({'city': ['chicago', 'los angeles']}) + test = pd.DataFrame({'city': ['chicago', 'denver']}) + y = pd.Series([1, 0]) + + # TODO - implement for all encoders + for encoder_name in ['OrdinalEncoder']: + with self.subTest(encoder_name=encoder_name): + + enc = getattr(encoders, encoder_name)(handle_unknown='value') + enc.fit(train, y) + result = enc.transform(test) + self.assertFalse(result.iloc[1, :].isnull().all()) + def test_inverse_transform_handle_unknown_return_nan_expect_value_error(self): train = pd.DataFrame({'city': ['chicago', 'los angeles']}) test = pd.DataFrame({'city': ['chicago', 'denver']}) diff --git a/category_encoders/woe.py b/category_encoders/woe.py index 57604abf..387be893 100644 --- a/category_encoders/woe.py 
+++ b/category_encoders/woe.py @@ -24,7 +24,7 @@ class WOEEncoder(BaseEstimator, TransformerMixin): impute_missing: bool boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'ignore', 'error' and 'impute', defaults to 'impute', which will assume WOE=0. + options are 'ignore', 'error' and 'value', defaults to 'value', which will assume WOE=0. randomized: bool, adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). sigma: float @@ -74,7 +74,7 @@ class WOEEncoder(BaseEstimator, TransformerMixin): """ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='impute', random_state=None, randomized=False, sigma=0.05, regularization=1.0): + handle_unknown='value', random_state=None, randomized=False, sigma=0.05, regularization=1.0): self.verbose = verbose self.return_df = return_df self.drop_invariant = drop_invariant @@ -257,7 +257,7 @@ def _score(self, X, y): # Replace missing values only in the computed columns if self.impute_missing: - if self.handle_unknown == 'impute': + if self.handle_unknown == 'value': X[col].fillna(0, inplace=True) elif self.handle_unknown == 'error': if X[col].isnull().any(): From c9ff8e063a56458ad6e1c8e157a526ebe48a4f61 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sun, 28 Oct 2018 22:16:13 -0600 Subject: [PATCH 03/40] Add handle missing to ordinal encoder --- category_encoders/ordinal.py | 13 ++++++++++++- category_encoders/tests/test_encoders.py | 17 +++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index 6b2b2ec2..4efa644e 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -35,6 +35,8 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to apply the logic for handle_unknown, will be 
deprecated in the future. handle_unknown: str options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the category -1. + handle_missing: str + options are 'error', 'return_nan', and 'value, default to 'value', which will impute the category -2. Example ------- @@ -80,7 +82,7 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): """ def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='value'): + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -89,6 +91,7 @@ def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, ret self.mapping = mapping self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self._dim = None @property @@ -126,6 +129,10 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + _, categories = self.ordinal_encoding( X, mapping=self.mapping, @@ -163,6 +170,10 @@ def transform(self, X): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py index 9b94f23c..14b2b7a0 100644 --- a/category_encoders/tests/test_encoders.py +++ b/category_encoders/tests/test_encoders.py @@ -128,6 +128,23 @@ def test_handle_unknown_error(self): with self.assertRaises(ValueError): _ = enc.transform(X_t) + def test_handle_missing_error(self): + non_null = pd.DataFrame({'city': ['chicago', 'los angeles']}) + has_null = 
pd.DataFrame({'city': ['chicago', np.nan]}) + y = pd.Series([1, 0]) + + # TODO - implement for all encoders + for encoder_name in ['OrdinalEncoder']: + with self.subTest(encoder_name=encoder_name): + + enc = getattr(encoders, encoder_name)(handle_missing='error') + with self.assertRaises(ValueError): + enc.fit(has_null) + + enc.fit(non_null) + with self.assertRaises(ValueError): + enc.transform(has_null) + def test_handle_unknown_return_nan(self): train = pd.DataFrame({'city': ['chicago', 'los angeles']}) test = pd.DataFrame({'city': ['chicago', 'denver']}) From f940bdbbb8dd2fe2237226022799c3991bd3aa59 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Thu, 1 Nov 2018 21:11:34 -0700 Subject: [PATCH 04/40] Add handle na settings return_nan and value for ordinal encoder --- category_encoders/ordinal.py | 19 +++++++++--- category_encoders/tests/test_ordinal.py | 40 +++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index 4efa644e..d086f529 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -1,5 +1,6 @@ """Ordinal or label encoding""" +import numpy as np import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin import category_encoders.utils as util @@ -36,7 +37,7 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): handle_unknown: str options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the category -1. handle_missing: str - options are 'error', 'return_nan', and 'value, default to 'value', which will impute the category -2. + options are 'error', 'return_nan', and 'value, default to 'value', which treat nan as a category. 
Example ------- @@ -138,7 +139,8 @@ def fit(self, X, y=None, **kwargs): mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + handle_missing=self.handle_missing ) self.mapping = categories @@ -192,7 +194,8 @@ def transform(self, X): mapping=self.mapping, cols=self.cols, impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + handle_missing=self.handle_missing ) if self.drop_invariant: @@ -253,13 +256,15 @@ def inverse_transform(self, X_in): return X if self.return_df else X.values @staticmethod - def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_unknown='value'): + def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_unknown='value', handle_missing='valie'): """ Ordinal encoding uses a single column of integers to represent the classes. An optional mapping dict can be passed in, in this case we use the knowledge that there is some true order to the classes themselves. Otherwise, the classes are assumed to have no true order and integers are selected at random. 
""" + return_nan_series = pd.Series(data=[np.nan], index=[-2]) + X = X_in.copy(deep=True) if cols is None: @@ -284,6 +289,9 @@ def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_ if any(missing): raise ValueError('Unexpected categories found in column %s' % column) + if handle_missing == 'return_nan': + X[column] = X[column].map(return_nan_series).where(X[column] == -2, X[column]) + else: mapping_out = [] for col in cols: @@ -302,6 +310,9 @@ def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_ mapping = pd.Series(data=values, index=index) + if handle_missing == 'return_nan': + mapping[np.nan] = -2 + mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, ) return X, mapping_out diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index e6fa0143..10a4f13c 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -73,3 +73,43 @@ def test_pandas_categorical(self): self.assertEqual(3, out['Categorical'][1]) self.assertEqual(1, out['Categorical'][2]) self.assertEqual(2, out['Categorical'][3]) + + def test_handle_missing_have_nan_fit_time_expect_as_category(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='value') + out = enc.fit_transform(train) + + self.assertListEqual([1, 2], out['city'].tolist()) + + def test_handle_missing_have_nan_transform_time_expect_zero(self): + train = pd.DataFrame({'city': ['chicago', 'st louis']}) + test = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='value') + enc.fit(train) + out = enc.transform(test) + + self.assertListEqual([1, 0], out['city'].tolist()) + + def test_handle_unknown_have_nan_fit_time_return_nan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='return_nan') + out = 
enc.fit_transform(train)['city'].tolist() + + self.assertEqual(2, len(out)) + self.assertEqual(1.0, out[0]) + self.assertTrue(np.isnan(out[1])) + + def test_handle_unknown_have_nan_transform_time_return_nan(self): + train = pd.DataFrame({'city': ['chicago', 'st louis']}) + test = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='return_nan') + enc.fit(train) + out = enc.transform(test)['city'].tolist() + + self.assertEqual(2, len(out)) + self.assertEqual(1.0, out[0]) + self.assertTrue(np.isnan(out[1])) From 4fff3932d9fc9f61980ace5eb135168fbd288d8e Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Thu, 1 Nov 2018 21:17:24 -0700 Subject: [PATCH 05/40] Remove impute_missing field from ordinal encoder --- category_encoders/backward_difference.py | 1 - category_encoders/basen.py | 1 - category_encoders/binary.py | 1 - category_encoders/helmert.py | 1 - category_encoders/one_hot.py | 1 - category_encoders/ordinal.py | 27 ++++++++++-------------- category_encoders/polynomial.py | 1 - category_encoders/sum_coding.py | 1 - category_encoders/tests/test_ordinal.py | 10 ++++----- 9 files changed, 16 insertions(+), 28 deletions(-) diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index 68aa5c9b..99d8f4fb 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -131,7 +131,6 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/basen.py b/category_encoders/basen.py index 8d6847e3..9061236a 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -125,7 +125,6 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, 
handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 7f068597..0b62a1c4 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -118,7 +118,6 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) X = X.drop_duplicates(subset=self.cols) if self.cols else X diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index d90efbde..15f70c6e 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -128,7 +128,6 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index 03bc17c9..b0b6164a 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -137,7 +137,6 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index d086f529..60634a11 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -32,12 +32,11 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): the value of 'col' should be the feature name. the value of 'mapping' should be a list of tuples of format (original_label, encoded_label). example mapping: [{'col': 'col1', 'mapping': [(None, 0), ('a', 1), ('b', 2)]}] - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. 
handle_unknown: str options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the category -1. handle_missing: str - options are 'error', 'return_nan', and 'value, default to 'value', which treat nan as a category. + options are 'error', 'return_nan', and 'value, default to 'value', which treat nan as a category at fit time, + or 0 at transform time if nan is not a category during fit. Example ------- @@ -82,7 +81,7 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, impute_missing=True, + def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant @@ -90,7 +89,6 @@ def __init__(self, verbose=0, mapping=None, cols=None, drop_invariant=False, ret self.verbose = verbose self.cols = cols self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown self.handle_missing = handle_missing self._dim = None @@ -138,7 +136,6 @@ def fit(self, X, y=None, **kwargs): X, mapping=self.mapping, cols=self.cols, - impute_missing=self.impute_missing, handle_unknown=self.handle_unknown, handle_missing=self.handle_missing ) @@ -193,7 +190,6 @@ def transform(self, X): X, mapping=self.mapping, cols=self.cols, - impute_missing=self.impute_missing, handle_unknown=self.handle_unknown, handle_missing=self.handle_missing ) @@ -236,7 +232,7 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'value': + if self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): raise ValueError("inverse_transform is not supported because transform impute " @@ -256,7 +252,7 @@ def inverse_transform(self, X_in): return X if self.return_df else X.values @staticmethod - def 
ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_unknown='value', handle_missing='valie'): + def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', handle_missing='valie'): """ Ordinal encoding uses a single column of integers to represent the classes. An optional mapping dict can be passed in, in this case we use the knowledge that there is some true order to the classes themselves. Otherwise, the classes @@ -281,13 +277,12 @@ def ordinal_encoding(X_in, mapping=None, cols=None, impute_missing=True, handle_ except ValueError as e: X[column] = X[column].astype(float) - if impute_missing: - if handle_unknown == 'value': - X[column].fillna(0, inplace=True) - elif handle_unknown == 'error': - missing = X[column].isnull() - if any(missing): - raise ValueError('Unexpected categories found in column %s' % column) + if handle_unknown == 'value': + X[column].fillna(0, inplace=True) + elif handle_unknown == 'error': + missing = X[column].isnull() + if any(missing): + raise ValueError('Unexpected categories found in column %s' % column) if handle_missing == 'return_nan': X[column] = X[column].map(return_nan_series).where(X[column] == -2, X[column]) diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index 81280d7b..6c26f21d 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -129,7 +129,6 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index 4263db7b..7de2ba4c 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -129,7 +129,6 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - 
impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index 10a4f13c..7da6a9dd 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -20,7 +20,7 @@ class TestOrdinalEncoder(TestCase): def test_ordinal(self): - enc = encoders.OrdinalEncoder(verbose=1, return_df=True, impute_missing=True) + enc = encoders.OrdinalEncoder(verbose=1, return_df=True) enc.fit(X) out = enc.transform(X_t) self.assertEqual(len(set(out['extra'].values)), 4) @@ -28,14 +28,14 @@ def test_ordinal(self): self.assertFalse(enc.mapping is None) self.assertTrue(len(enc.mapping) > 0) - enc = encoders.OrdinalEncoder(verbose=1, mapping=enc.mapping, return_df=True, impute_missing=True) + enc = encoders.OrdinalEncoder(verbose=1, mapping=enc.mapping, return_df=True) enc.fit(X) out = enc.transform(X_t) self.assertEqual(len(set(out['extra'].values)), 4) self.assertIn(0, set(out['extra'].values)) self.assertTrue(len(enc.mapping) > 0) - enc = encoders.OrdinalEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='ignore') + enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='ignore') enc.fit(X) out = enc.transform(X_t) out_cats = [x for x in set(out['extra'].values) if np.isfinite(x)] @@ -47,13 +47,13 @@ def test_ordinal_dist(self): ['apple', None], ['peach', 'lemon'] ]) - encoder = encoders.OrdinalEncoder(impute_missing=True) + encoder = encoders.OrdinalEncoder() encoder.fit(data) a = encoder.transform(data) self.assertEqual(a.values[0, 1], 0) self.assertEqual(a.values[1, 1], 1) - encoder = encoders.OrdinalEncoder(impute_missing=False) + encoder = encoders.OrdinalEncoder(handle_missing='return_nan') encoder.fit(data) a = encoder.transform(data) self.assertTrue(np.isnan(a.values[0, 1])) From 162097cf7b62d983452db425422fb1ca992615c1 Mon Sep 17 00:00:00 2001 From: 
jcastaldo08 Date: Thu, 1 Nov 2018 21:57:47 -0700 Subject: [PATCH 06/40] Make Ordinal Encoder return -2 at transform time if and only if nan is not a category during fit --- category_encoders/backward_difference.py | 4 +++- category_encoders/basen.py | 4 +++- category_encoders/binary.py | 4 +++- category_encoders/helmert.py | 4 +++- category_encoders/one_hot.py | 4 +++- category_encoders/ordinal.py | 18 +++++++++--------- category_encoders/polynomial.py | 4 +++- category_encoders/sum_coding.py | 4 +++- category_encoders/tests/test_ordinal.py | 10 +++++----- 9 files changed, 35 insertions(+), 21 deletions(-) diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index 99d8f4fb..9f971557 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -131,7 +131,9 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + # TODO Properly set handle missing when it's implemented here + handle_missing='ignore' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/basen.py b/category_encoders/basen.py index 9061236a..8d844b30 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -125,7 +125,9 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + # TODO Properly set handle missing when it's implemented here + handle_missing='ignore' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 0b62a1c4..65e50501 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -118,7 +118,9 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, 
cols=self.cols, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + # TODO Properly set handle missing when it's implemented here + handle_missing='ignore' ) X = X.drop_duplicates(subset=self.cols) if self.cols else X self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index 15f70c6e..154d32d9 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -128,7 +128,9 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + # TODO Properly set handle missing when it's implemented here + handle_missing='ignore' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index b0b6164a..9570a78a 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -137,7 +137,9 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + # TODO Properly set handle missing when it's implemented here + handle_missing='ignore' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) self.mapping = self.generate_mapping() diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index 60634a11..58695d6a 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -36,7 +36,7 @@ class OrdinalEncoder(BaseEstimator, TransformerMixin): options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the category -1. handle_missing: str options are 'error', 'return_nan', and 'value, default to 'value', which treat nan as a category at fit time, - or 0 at transform time if nan is not a category during fit. + or -2 at transform time if nan is not a category during fit. 
Example ------- @@ -294,20 +294,20 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand if util.is_category(X[col].dtype): categories = X[col].cat.categories else: - categories = [x for x in pd.unique(X[col].values) if x is not None] + categories = [x if x is not None else np.nan for x in pd.unique(X[col].values)] - index = [] - values = [] + data = {} - for i in range(len(categories)): - index.append(categories[i]) - values.append(i + 1) + if handle_missing == 'value': + data[np.nan] = -2 - mapping = pd.Series(data=values, index=index) + for i in range(len(categories)): + data[categories[i]] = i + 1 if handle_missing == 'return_nan': - mapping[np.nan] = -2 + data[np.nan] = -2 + mapping = pd.Series(data) mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, ) return X, mapping_out diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index 6c26f21d..89f87d2f 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -129,7 +129,9 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + # TODO Properly set handle missing when it's implemented here + handle_missing='ignore' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index 7de2ba4c..97d6b96f 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -129,7 +129,9 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown + handle_unknown=self.handle_unknown, + # TODO Properly set handle missing when it's implemented here + handle_missing='ignore' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) diff --git a/category_encoders/tests/test_ordinal.py 
b/category_encoders/tests/test_ordinal.py index 7da6a9dd..35f1009a 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -50,14 +50,14 @@ def test_ordinal_dist(self): encoder = encoders.OrdinalEncoder() encoder.fit(data) a = encoder.transform(data) - self.assertEqual(a.values[0, 1], 0) - self.assertEqual(a.values[1, 1], 1) + self.assertEqual(a.values[0, 1], 1) + self.assertEqual(a.values[1, 1], 2) encoder = encoders.OrdinalEncoder(handle_missing='return_nan') encoder.fit(data) a = encoder.transform(data) self.assertTrue(np.isnan(a.values[0, 1])) - self.assertEqual(a.values[1, 1], 1) + self.assertEqual(a.values[1, 1], 2.0) def test_pandas_categorical(self): X = pd.DataFrame({ @@ -82,7 +82,7 @@ def test_handle_missing_have_nan_fit_time_expect_as_category(self): self.assertListEqual([1, 2], out['city'].tolist()) - def test_handle_missing_have_nan_transform_time_expect_zero(self): + def test_handle_missing_have_nan_transform_time_expect_negative_2(self): train = pd.DataFrame({'city': ['chicago', 'st louis']}) test = pd.DataFrame({'city': ['chicago', np.nan]}) @@ -90,7 +90,7 @@ def test_handle_missing_have_nan_transform_time_expect_zero(self): enc.fit(train) out = enc.transform(test) - self.assertListEqual([1, 0], out['city'].tolist()) + self.assertListEqual([1, -2], out['city'].tolist()) def test_handle_unknown_have_nan_fit_time_return_nan(self): train = pd.DataFrame({'city': ['chicago', np.nan]}) From b1c0717234eeae5122e1cd08af8059cb90f9b86e Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 2 Nov 2018 15:46:49 -0700 Subject: [PATCH 07/40] Refactor handle missing error test --- category_encoders/tests/test_encoders.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py index 14b2b7a0..91917665 100644 --- a/category_encoders/tests/test_encoders.py +++ b/category_encoders/tests/test_encoders.py @@ -129,19 +129,19 
@@ def test_handle_unknown_error(self): _ = enc.transform(X_t) def test_handle_missing_error(self): - non_null = pd.DataFrame({'city': ['chicago', 'los angeles']}) - has_null = pd.DataFrame({'city': ['chicago', np.nan]}) + non_null = pd.DataFrame({'city': ['chicago', 'los angeles'], 'color': ['red', np.nan]}) # only 'city' column is going to be transformed + has_null = pd.DataFrame({'city': ['chicago', np.nan], 'color': ['red', np.nan]}) y = pd.Series([1, 0]) # TODO - implement for all encoders for encoder_name in ['OrdinalEncoder']: with self.subTest(encoder_name=encoder_name): - enc = getattr(encoders, encoder_name)(handle_missing='error') + enc = getattr(encoders, encoder_name)(handle_missing='error', cols='city') with self.assertRaises(ValueError): - enc.fit(has_null) + enc.fit(has_null, y) - enc.fit(non_null) + enc.fit(non_null, y) # we raise an error only if a missing value is in one of the transformed columns with self.assertRaises(ValueError): enc.transform(has_null) From ab105b2c4765bab6e1f34ed2f763d3306e42a408 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 2 Nov 2018 15:48:24 -0700 Subject: [PATCH 08/40] In test ordinal dist, check every value --- category_encoders/tests/test_ordinal.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index 35f1009a..84657a53 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -44,20 +44,19 @@ def test_ordinal(self): def test_ordinal_dist(self): data = np.array([ - ['apple', None], - ['peach', 'lemon'] + ['apple', 'lemon'], + ['peach', None] ]) encoder = encoders.OrdinalEncoder() - encoder.fit(data) - a = encoder.transform(data) - self.assertEqual(a.values[0, 1], 1) - self.assertEqual(a.values[1, 1], 2) + result = encoder.fit_transform(data) + self.assertEqual(2, len(result[0].unique()), "We expect two unique values in the column") + 
self.assertEqual(2, len(result[1].unique()), "We expect two unique values in the column") + self.assertFalse(np.isnan(result.values[1, 1])) encoder = encoders.OrdinalEncoder(handle_missing='return_nan') - encoder.fit(data) - a = encoder.transform(data) - self.assertTrue(np.isnan(a.values[0, 1])) - self.assertEqual(a.values[1, 1], 2.0) + result = encoder.fit_transform(data) + self.assertEqual(2, len(result[0].unique()), "We expect two unique values in the column") + self.assertEqual(2, len(result[1].unique()), "We expect two unique values in the column") def test_pandas_categorical(self): X = pd.DataFrame({ From 8971f18e58080a109abd54e2379a1686f4b661f6 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Wed, 7 Nov 2018 21:32:51 -0800 Subject: [PATCH 09/40] Convert encoders that use multi column outputs to support the ordinal encoder changes for the new handle missing and handle unknown --- category_encoders/backward_difference.py | 49 ++++++++++++----- category_encoders/binary.py | 23 ++++---- category_encoders/helmert.py | 53 +++++++++++++------ category_encoders/ordinal.py | 2 +- category_encoders/polynomial.py | 48 ++++++++++++----- category_encoders/sum_coding.py | 48 ++++++++++++----- .../tests/test_backward_difference.py | 12 ++--- category_encoders/tests/test_encoders.py | 19 ++++--- category_encoders/tests/test_helmert.py | 12 ++--- category_encoders/tests/test_ordinal.py | 19 +++++-- category_encoders/tests/test_polynomial.py | 12 ++--- category_encoders/tests/test_sum_coding.py | 12 ++--- 12 files changed, 206 insertions(+), 103 deletions(-) diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index 9f971557..699f6d66 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -24,12 +24,14 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. 
return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can causes unexpected changes in dimension in some cases. + handle_missing: str + options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. This can causes + unexpected changes in dimension in some cases. Example ------- @@ -82,14 +84,15 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.cols = cols self.ordinal_encoder = None self._dim = None @@ -127,13 +130,16 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown, - # TODO Properly set handle missing when it's implemented here - handle_missing='ignore' + 
handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -141,8 +147,8 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: - values = switch.get('mapping').get_values() - column_mapping = self.fit_backward_difference_coding(values) + values = switch.get('mapping') + column_mapping = self.fit_backward_difference_coding(values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out @@ -172,6 +178,10 @@ def transform(self, X): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -186,6 +196,11 @@ def transform(self, X): return X X = self.ordinal_encoder.transform(X) + + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.backward_difference_coding(X, mapping=self.mapping) if self.drop_invariant: @@ -198,14 +213,22 @@ def transform(self, X): return X.values @staticmethod - def fit_backward_difference_coding(values): + def fit_backward_difference_coding(values, handle_missing, handle_unknown): + if handle_missing == 'value': + del values[np.nan] + if len(values) < 2: return pd.DataFrame() - backwards_difference_matrix = Diff().code_without_intercept(values) + backwards_difference_matrix = Diff().code_without_intercept(values.get_values()) df = pd.DataFrame(data=backwards_difference_matrix.matrix, columns=backwards_difference_matrix.column_suffixes) df.index += 1 - df.loc[0] = np.zeros(len(values) - 1) + + if handle_unknown == 'return_nan': + df.loc[-1] = np.nan + elif handle_unknown == 'value': + df.loc[-1] = np.zeros(len(values) - 1) + return df @staticmethod @@ -226,7 +249,7 @@ def 
backward_difference_coding(X_in, mapping): for i in range(len(mod.columns)): c = mod.columns[i] new_col = str(col) + '_%d' % (i, ) - X[new_col] = mod[c].loc[X[col]].values + X.loc[:, new_col] = mod[c].loc[X[col]].values new_columns.append(new_col) old_column_index = cols.index(col) cols[old_column_index: old_column_index + 1] = new_columns diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 65e50501..97ded0c9 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -24,12 +24,14 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can causes unexpected changes in dimension in some cases. + handle_missing: str + options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. This can causes + unexpected changes in dimension in some cases. 
Example ------- @@ -69,14 +71,14 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='value'): + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.cols = cols self.ordinal_encoder = None self._dim = None @@ -118,9 +120,8 @@ def fit(self, X, y=None, **kwargs): self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown, - # TODO Properly set handle missing when it's implemented here - handle_missing='ignore' + handle_unknown='value', + handle_missing='value' ) X = X.drop_duplicates(subset=self.cols) if self.cols else X self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -168,6 +169,10 @@ def transform(self, X): X = self.ordinal_encoder.transform(X) + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.binary(X, cols=self.cols) if self.drop_invariant: @@ -213,7 +218,7 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'value': + if self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): raise ValueError("inverse_transform is not supported because transform impute " diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index 154d32d9..9a7fc9e7 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -25,10 +25,12 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 
variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. This can causes + unexpected changes in dimension in some cases. + handle_missing: str + options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, an extra column will be added in if the transform matrix has unknown categories. This can causes unexpected changes in dimension in some cases. @@ -82,14 +84,15 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, + handle_unknown='indicator', handle_missing='indicator'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.cols = cols self.ordinal_encoder = None self._dim = None @@ -125,12 +128,15 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown, - # TODO Properly set handle 
missing when it's implemented here - handle_missing='ignore' + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -139,8 +145,8 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: - values = switch.get('mapping').get_values() - column_mapping = self.fit_helmert_coding(values) + values = switch.get('mapping') + column_mapping = self.fit_helmert_coding(values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out @@ -169,6 +175,10 @@ def transform(self, X): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -184,6 +194,10 @@ def transform(self, X): X = self.ordinal_encoder.transform(X) + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.helmert_coding(X, mapping=self.mapping) if self.drop_invariant: @@ -196,14 +210,22 @@ def transform(self, X): return X.values @staticmethod - def fit_helmert_coding(values): + def fit_helmert_coding(values, handle_missing, handle_unknown): + if handle_missing == 'value': + del values[np.nan] + if len(values) < 2: return pd.DataFrame() - helmert_contrast_matrix = Helmert().code_without_intercept(values) + helmert_contrast_matrix = Helmert().code_without_intercept(values.get_values()) df = pd.DataFrame(data=helmert_contrast_matrix.matrix, columns=helmert_contrast_matrix.column_suffixes) df.index += 1 - df.loc[0] = np.zeros(len(values) - 1) + + if handle_unknown == 'return_nan': + df.loc[-1] = np.nan + elif handle_unknown == 'value': + df.loc[-1] = np.zeros(len(values) - 1) + return df @staticmethod @@ -224,9 +246,8 @@ def helmert_coding(X_in, 
mapping): for i in range(len(mod.columns)): c = mod.columns[i] new_col = str(col) + '_%d' % (i, ) - # TODO Use https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike to - # set new values - X[new_col] = mod[c].loc[X[col]].values + + X.loc[:, new_col] = mod[c].loc[X[col]].values new_columns.append(new_col) old_column_index = cols.index(col) cols[old_column_index: old_column_index + 1] = new_columns diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index 58695d6a..dd570a72 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -278,7 +278,7 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand X[column] = X[column].astype(float) if handle_unknown == 'value': - X[column].fillna(0, inplace=True) + X[column].fillna(-1, inplace=True) elif handle_unknown == 'error': missing = X[column].isnull() if any(missing): diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index 89f87d2f..ece37fa2 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -24,12 +24,14 @@ class PolynomialEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. + handle_missing: str + options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. 
This can causes + unexpected changes in dimension in some cases. Example ------- @@ -81,14 +83,15 @@ class PolynomialEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.cols = cols self.ordinal_encoder = None self._dim = None @@ -125,13 +128,16 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown, - # TODO Properly set handle missing when it's implemented here - handle_missing='ignore' + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -139,8 +145,8 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: - values = switch.get('mapping').get_values() - column_mapping = self.fit_polynomial_coding(values) + values = switch.get('mapping') + column_mapping = self.fit_polynomial_coding(values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out @@ -170,6 +176,10 @@ def transform(self, X): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if 
self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -185,6 +195,10 @@ def transform(self, X): X = self.ordinal_encoder.transform(X) + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.polynomial_coding(X, self.mapping) if self.drop_invariant: @@ -197,14 +211,22 @@ def transform(self, X): return X.values @staticmethod - def fit_polynomial_coding(values): + def fit_polynomial_coding(values, handle_missing, handle_unknown): + if handle_missing == 'value': + del values[np.nan] + if len(values) < 2: return pd.DataFrame() - polynomial_contrast_matrix = Poly().code_without_intercept(values) + polynomial_contrast_matrix = Poly().code_without_intercept(values.get_values()) df = pd.DataFrame(data=polynomial_contrast_matrix.matrix, columns=polynomial_contrast_matrix.column_suffixes) df.index += 1 - df.loc[0] = np.zeros(len(values) - 1) + + if handle_unknown == 'return_nan': + df.loc[-1] = np.nan + elif handle_unknown == 'value': + df.loc[-1] = np.zeros(len(values) - 1) + return df @staticmethod @@ -225,7 +247,7 @@ def polynomial_coding(X_in, mapping): for i in range(len(mod.columns)): c = mod.columns[i] new_col = str(col) + '_%d' % (i, ) - X[new_col] = mod[c].loc[X[col]].values + X.loc[:, new_col] = mod[c].loc[X[col]].values new_columns.append(new_col) old_column_index = cols.index(col) cols[old_column_index: old_column_index + 1] = new_columns diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index 97d6b96f..e476f496 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -24,12 +24,14 @@ class SumEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). 
- impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. + handle_missing: str + options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + an extra column will be added in if the transform matrix has unknown categories. This can causes + unexpected changes in dimension in some cases. Example ------- @@ -81,14 +83,15 @@ class SumEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose self.mapping = mapping - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing=handle_missing self.cols = cols self.ordinal_encoder = None self._dim = None @@ -125,13 +128,16 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown, - # TODO Properly set handle missing when it's implemented here - handle_missing='ignore' + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) @@ -139,8 +145,8 @@ def fit(self, X, y=None, 
**kwargs): mappings_out = [] for switch in ordinal_mapping: - values = switch.get('mapping').tolist() - column_mapping = self.fit_sum_coding(values) + values = switch.get('mapping') + column_mapping = self.fit_sum_coding(values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out @@ -170,6 +176,10 @@ def transform(self, X): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -185,6 +195,10 @@ def transform(self, X): X = self.ordinal_encoder.transform(X) + if self.handle_unknown == 'error': + if X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.sum_coding(X, mapping=self.mapping) if self.drop_invariant: @@ -197,14 +211,22 @@ def transform(self, X): return X.values @staticmethod - def fit_sum_coding(values): + def fit_sum_coding(values, handle_missing, handle_unknown): + if handle_missing == 'value': + del values[np.nan] + if len(values) < 2: return pd.DataFrame() - sum_contrast_matrix = Sum().code_without_intercept(values) + sum_contrast_matrix = Sum().code_without_intercept(values.tolist()) df = pd.DataFrame(data=sum_contrast_matrix.matrix, columns=sum_contrast_matrix.column_suffixes) df.index += 1 - df.loc[0] = np.zeros(len(values) - 1) + + if handle_unknown == 'return_nan': + df.loc[-1] = np.nan + elif handle_unknown == 'value': + df.loc[-1] = np.zeros(len(values) - 1) + return df @staticmethod @@ -225,7 +247,7 @@ def sum_coding(X_in, mapping): for i in range(len(mod.columns)): c = mod.columns[i] new_col = str(col) + '_%d' % (i, ) - X[new_col] = mod[c].loc[X[col]].values + X.loc[:, new_col] = mod[c].loc[X[col]].values new_columns.append(new_col) old_column_index = cols.index(col) cols[old_column_index: 
old_column_index + 1] = new_columns diff --git a/category_encoders/tests/test_backward_difference.py b/category_encoders/tests/test_backward_difference.py index a3fb7cef..d920f6be 100644 --- a/category_encoders/tests/test_backward_difference.py +++ b/category_encoders/tests/test_backward_difference.py @@ -10,7 +10,7 @@ def test_backwards_difference_encoder_preserve_dimension_1(self): train = ['A', 'B', 'C'] test = ['A', 'D', 'E'] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -23,7 +23,7 @@ def test_backwards_difference_encoder_preserve_dimension_2(self): train = ['A', 'B', 'C'] test = ['B', 'D', 'E'] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -36,7 +36,7 @@ def test_backwards_difference_encoder_preserve_dimension_3(self): train = ['A', 'B', 'C'] test = ['A', 'B', 'C', None] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -50,7 +50,7 @@ def test_backwards_difference_encoder_preserve_dimension_4(self): train = ['A', 'B', 'C'] test = ['D', 'B', 'C', None] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -63,7 +63,7 @@ def test_backwards_difference_encoder_preserve_dimension_4(self): def test_backwards_difference_encoder_2cols(self): train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = 
encoder.transform(train) @@ -80,7 +80,7 @@ def test_backwards_difference_encoder_2StringCols_ExpectCorrectOrder(self): }, columns=['col1', 'col2', 'col3', 'col4']) expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] - encoder = encoders.BackwardDifferenceEncoder() + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) columns = encoder.transform(train).columns.values diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py index 91917665..f0091651 100644 --- a/category_encoders/tests/test_encoders.py +++ b/category_encoders/tests/test_encoders.py @@ -134,7 +134,7 @@ def test_handle_missing_error(self): y = pd.Series([1, 0]) # TODO - implement for all encoders - for encoder_name in ['OrdinalEncoder']: + for encoder_name in ['OrdinalEncoder', 'HelmertEncoder', 'BackwardDifferenceEncoder', 'PolynomialEncoder', 'SumEncoder']: with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_missing='error', cols='city') @@ -151,13 +151,17 @@ def test_handle_unknown_return_nan(self): y = pd.Series([1, 0]) # TODO - implement for all encoders - for encoder_name in ['OrdinalEncoder']: + for encoder_name in ['OrdinalEncoder', 'HelmertEncoder', 'BackwardDifferenceEncoder', 'PolynomialEncoder', 'SumEncoder']: with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') enc.fit(train, y) - result = enc.transform(test) - self.assertTrue(result.iloc[1, :].isnull().all()) + result = enc.transform(test).iloc[1, :] + + if len(result) == 1: + self.assertTrue(result.isnull().all()) + else: + self.assertTrue(result[1:].isnull().all()) def test_handle_unknown_value(self): train = pd.DataFrame({'city': ['chicago', 'los angeles']}) @@ -165,7 +169,7 @@ def test_handle_unknown_value(self): y = pd.Series([1, 0]) # TODO - implement for all encoders - for encoder_name in 
['OrdinalEncoder']: + for encoder_name in ['OrdinalEncoder', 'HelmertEncoder', 'BackwardDifferenceEncoder', 'PolynomialEncoder', 'SumEncoder']: with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_unknown='value') @@ -216,11 +220,6 @@ def test_inverse_transform(self): enc.fit(X) tu.verify_inverse_transform(X_t, enc.inverse_transform(enc.transform(X_t))) - # when a new value is encountered, do not raise an exception - enc = getattr(encoders, encoder_name)(verbose=1, cols=cols) - enc.fit(X, y) - _ = enc.inverse_transform(enc.transform(X_t_extra)) - def test_types(self): X = pd.DataFrame({ 'Int': [1, 2, 1, 2], diff --git a/category_encoders/tests/test_helmert.py b/category_encoders/tests/test_helmert.py index cb3fb52a..fd4c344f 100644 --- a/category_encoders/tests/test_helmert.py +++ b/category_encoders/tests/test_helmert.py @@ -10,7 +10,7 @@ def test_helmert_preserve_dimension_1(self): train = ['A', 'B', 'C'] test = ['A', 'D', 'E'] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -23,7 +23,7 @@ def test_helmert_preserve_dimension_2(self): train = ['A', 'B', 'C'] test = ['B', 'D', 'E'] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -36,7 +36,7 @@ def test_helmert_preserve_dimension_3(self): train = ['A', 'B', 'C'] test = ['A', 'B', 'C', None] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -50,7 +50,7 @@ def test_helmert_preserve_dimension_4(self): train = ['A', 'B', 'C'] test = ['D', 'B', 'C', None] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = 
encoder.transform(test) @@ -63,7 +63,7 @@ def test_helmert_preserve_dimension_4(self): def test_helmert_2cols(self): train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = encoder.transform(train) @@ -80,7 +80,7 @@ def test_helmert_2StringCols_ExpectCorrectOrder(self): }, columns=['col1', 'col2', 'col3', 'col4']) expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] - encoder = encoders.HelmertEncoder() + encoder = encoders.HelmertEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) columns = encoder.transform(train).columns.values diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index 84657a53..225177b9 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -24,7 +24,7 @@ def test_ordinal(self): enc.fit(X) out = enc.transform(X_t) self.assertEqual(len(set(out['extra'].values)), 4) - self.assertIn(0, set(out['extra'].values)) + self.assertIn(-1, set(out['extra'].values)) self.assertFalse(enc.mapping is None) self.assertTrue(len(enc.mapping) > 0) @@ -32,7 +32,7 @@ def test_ordinal(self): enc.fit(X) out = enc.transform(X_t) self.assertEqual(len(set(out['extra'].values)), 4) - self.assertIn(0, set(out['extra'].values)) + self.assertIn(-1, set(out['extra'].values)) self.assertTrue(len(enc.mapping) > 0) enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='ignore') @@ -91,7 +91,7 @@ def test_handle_missing_have_nan_transform_time_expect_negative_2(self): self.assertListEqual([1, -2], out['city'].tolist()) - def test_handle_unknown_have_nan_fit_time_return_nan(self): + def test_handle_missing_have_nan_fit_time_return_nan(self): train = pd.DataFrame({'city': ['chicago', np.nan]}) enc = encoders.OrdinalEncoder(handle_missing='return_nan') @@ -101,7 +101,7 @@ 
def test_handle_unknown_have_nan_fit_time_return_nan(self): self.assertEqual(1.0, out[0]) self.assertTrue(np.isnan(out[1])) - def test_handle_unknown_have_nan_transform_time_return_nan(self): + def test_handle_missing_have_nan_transform_time_return_nan(self): train = pd.DataFrame({'city': ['chicago', 'st louis']}) test = pd.DataFrame({'city': ['chicago', np.nan]}) @@ -112,3 +112,14 @@ def test_handle_unknown_have_nan_transform_time_return_nan(self): self.assertEqual(2, len(out)) self.assertEqual(1.0, out[0]) self.assertTrue(np.isnan(out[1])) + + def test_handle_unknown_have_new_value_expect_negative_1(self): + train = pd.DataFrame({'city': ['chicago', 'st louis']}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + expected = [1.0, -1.0] + + enc = encoders.OrdinalEncoder(handle_missing='return_nan') + enc.fit(train) + result = enc.transform(test)['city'].tolist() + + self.assertEqual(expected, result) diff --git a/category_encoders/tests/test_polynomial.py b/category_encoders/tests/test_polynomial.py index 3690072c..426c9dad 100644 --- a/category_encoders/tests/test_polynomial.py +++ b/category_encoders/tests/test_polynomial.py @@ -16,7 +16,7 @@ def test_polynomial_encoder_preserve_dimension_1(self): train = ['A', 'B', 'C'] test = ['A', 'D', 'E'] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -29,7 +29,7 @@ def test_polynomial_encoder_preserve_dimension_2(self): train = ['A', 'B', 'C'] test = ['B', 'D', 'E'] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -42,7 +42,7 @@ def test_polynomial_encoder_preserve_dimension_3(self): train = ['A', 'B', 'C'] test = ['A', 'B', 'C', None] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', 
handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -56,7 +56,7 @@ def test_polynomial_encoder_preserve_dimension_4(self): train = ['A', 'B', 'C'] test = ['D', 'B', 'C', None] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -69,7 +69,7 @@ def test_polynomial_encoder_preserve_dimension_4(self): def test_polynomial_encoder_2cols(self): train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = encoder.transform(train) @@ -86,7 +86,7 @@ def test_polynomial_encoder_2StringCols_ExpectCorrectOrder(self): }, columns=['col1', 'col2', 'col3', 'col4']) expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] - encoder = encoders.PolynomialEncoder() + encoder = encoders.PolynomialEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) columns = encoder.transform(train).columns.values diff --git a/category_encoders/tests/test_sum_coding.py b/category_encoders/tests/test_sum_coding.py index caa7f206..e52387b0 100644 --- a/category_encoders/tests/test_sum_coding.py +++ b/category_encoders/tests/test_sum_coding.py @@ -14,7 +14,7 @@ def test_sum_encoder_preserve_dimension_1(self): train = ['A', 'B', 'C'] test = ['A', 'D', 'E'] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -27,7 +27,7 @@ def test_sum_encoder_preserve_dimension_2(self): train = ['A', 'B', 'C'] test = ['B', 'D', 'E'] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -40,7 +40,7 @@ def 
test_sum_encoder_preserve_dimension_3(self): train = ['A', 'B', 'C'] test = ['A', 'B', 'C', None] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -54,7 +54,7 @@ def test_sum_encoder_preserve_dimension_4(self): train = ['A', 'B', 'C'] test = ['D', 'B', 'C', None] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) test_t = encoder.transform(test) @@ -67,7 +67,7 @@ def test_sum_encoder_preserve_dimension_4(self): def test_sum_encoder_2cols(self): train = [['A', 'A'], ['B', 'B'], ['C', 'C']] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) obtained = encoder.transform(train) @@ -84,7 +84,7 @@ def test_sum_encoder_2StringCols_ExpectCorrectOrder(self): }, columns=['col1', 'col2', 'col3', 'col4']) expected_columns = ['intercept', 'col1', 'col2_0', 'col2_1', 'col2_2', 'col3', 'col4_0', 'col4_1'] - encoder = encoders.SumEncoder() + encoder = encoders.SumEncoder(handle_unknown='value', handle_missing='value') encoder.fit(train) columns = encoder.transform(train).columns.values From 3526a6d7d5005d0cb126a39c894ad7f46b441260 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sat, 17 Nov 2018 17:18:40 -0800 Subject: [PATCH 10/40] Make all encoders handle error for handle unknown and handle missing, rewrote the binary and basen encoders to generate a mapping before hand --- category_encoders/basen.py | 98 ++++++++++++------- category_encoders/binary.py | 83 ++++++++++------ category_encoders/leave_one_out.py | 30 +++--- category_encoders/one_hot.py | 74 ++++++++++---- category_encoders/target_encoder.py | 31 +++--- category_encoders/tests/test_basen.py | 18 ++++ category_encoders/tests/test_encoders.py | 13 +-- category_encoders/tests/test_leave_one_out.py | 2 +- 
category_encoders/tests/test_one_hot.py | 57 +++++++++-- category_encoders/tests/test_ordinal.py | 9 ++ category_encoders/tests/test_woe.py | 2 +- category_encoders/woe.py | 29 +++--- 12 files changed, 307 insertions(+), 139 deletions(-) create mode 100644 category_encoders/tests/test_basen.py diff --git a/category_encoders/basen.py b/category_encoders/basen.py index 8d844b30..bfa6ce9c 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -29,8 +29,6 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). base: int when the downstream model copes well with nonlinearities (like decision tree), use higher base. - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can causes @@ -74,20 +72,19 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, base=2, impute_missing=True, - handle_unknown='value'): + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, base=2, + handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] self.verbose = verbose - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.cols = cols + self.mapping = mapping self.ordinal_encoder = None self._dim = None self.base = base - self._encoded_columns = None - self.digits_per_col = {} def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. 
@@ -121,22 +118,23 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown, - # TODO Properly set handle missing when it's implemented here - handle_missing='ignore' + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) - for col in self.cols: - self.digits_per_col[col] = self.calc_required_digits(X, col) + self.mapping = self.fit_base_n_encoding(X) # do a transform on the training data to get a column list X_t = self.transform(X, override_return_df=True) - self._encoded_columns = X_t.columns.values # drop all output columns with 0 variance. if self.drop_invariant: @@ -147,6 +145,37 @@ def fit(self, X, y=None, **kwargs): return self + def fit_base_n_encoding(self, X): + mappings_out = [] + + for switch in self.ordinal_encoder.category_mapping: + col = switch.get('col') + values = switch.get('mapping') + + if self.handle_missing == 'value': + del values[np.nan] + + if len(values) < 2: + return pd.DataFrame() + + digits = self.calc_required_digits(X, col) + X_unique = pd.DataFrame(index=values) + + X_unique_to_cols = X_unique.index.map(lambda x: self.col_transform(x, digits)) + + for dig in range(digits): + X_unique[str(col) + '_%d' % (dig,)] = X_unique_to_cols.map( + lambda r: int(r[dig]) if r is not None else None) + + if self.handle_unknown == 'return_nan': + X_unique.loc[-1] = np.nan + elif self.handle_unknown == 'value': + X_unique.loc[-1] = 0 + + mappings_out.append({'col': col, 'mapping': X_unique}) + + return mappings_out + def transform(self, X, override_return_df=False): """Perform the transformation to new categorical data. 
@@ -163,6 +192,10 @@ def transform(self, X, override_return_df=False): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -177,6 +210,11 @@ def transform(self, X, override_return_df=False): return X X_out = self.ordinal_encoder.transform(X) + + if self.handle_unknown == 'error': + if X_out[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X_out = self.basen_encode(X_out, cols=self.cols) if self.drop_invariant: @@ -184,8 +222,8 @@ def transform(self, X, override_return_df=False): X_out.drop(col, 1, inplace=True) # impute missing values only in the generated columns - generated_cols = util.get_generated_cols(X, X_out, self.cols) - X_out[generated_cols] = X_out[generated_cols].fillna(value=0.0) + # generated_cols = util.get_generated_cols(X, X_out, self.cols) + # X_out[generated_cols] = X_out[generated_cols].fillna(value=0.0) if self.return_df or override_return_df: return X_out @@ -228,7 +266,7 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'value': + if self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): raise ValueError("inverse_transform is not supported because transform impute " @@ -266,28 +304,20 @@ def basen_encode(self, X_in, cols=None): X = X_in.copy(deep=True) - if cols is None: - cols = X.columns.values - pass_thru = [] - else: - pass_thru = [col for col in X.columns.values if col not in cols] + cols = X.columns.values.tolist() - bin_cols = [] - for col in cols: - # get how many digits we need to represent the classes present - digits = self.calc_required_digits(X, col) + for switch in self.mapping: + col = switch.get('col') + mod = switch.get('mapping') - # map the ordinal 
column into a list of these digits, of length digits - X[col] = X[col].map(lambda x: self.col_transform(x, digits)) + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) - for dig in range(digits): - X[str(col) + '_%d' % (dig,)] = X[col].map(lambda r: int(r[dig]) if r is not None else None) - bin_cols.append(str(col) + '_%d' % (dig,)) + old_column_index = cols.index(col) + cols[old_column_index: old_column_index + 1] = mod.columns - if self._encoded_columns is None: - X = X.reindex(columns=bin_cols + pass_thru) - else: - X = X.reindex(columns=self._encoded_columns) + X = X.reindex(columns=cols) return X diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 97ded0c9..16513ca8 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -71,7 +71,7 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, + def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value'): self.return_df = return_df self.drop_invariant = drop_invariant @@ -80,9 +80,9 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, self.handle_unknown = handle_unknown self.handle_missing = handle_missing self.cols = cols + self.mapping = mapping self.ordinal_encoder = None self._dim = None - self.digits_per_col = {} def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. 
@@ -116,6 +116,10 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + # train an ordinal pre-encoder self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, @@ -126,8 +130,7 @@ def fit(self, X, y=None, **kwargs): X = X.drop_duplicates(subset=self.cols) if self.cols else X self.ordinal_encoder = self.ordinal_encoder.fit(X) - for col in self.cols: - self.digits_per_col[col] = self.calc_required_digits(X, col) + self.mapping = self.fit_binary_encoding(X) # drop all output columns with 0 variance. if self.drop_invariant: @@ -138,6 +141,37 @@ def fit(self, X, y=None, **kwargs): return self + def fit_binary_encoding(self, X): + mappings_out = [] + + for switch in self.ordinal_encoder.category_mapping: + col = switch.get('col') + values = switch.get('mapping') + + if self.handle_missing == 'value': + del values[np.nan] + + if len(values) < 2: + return pd.DataFrame() + + digits = self.calc_required_digits(X, col) + X_unique = pd.DataFrame(index=values) + + X_unique_to_cols = X_unique.index.map(lambda x: self.col_transform(x, digits)) + + for dig in range(digits): + X_unique[str(col) + '_%d' % (dig,)] = X_unique_to_cols.map( + lambda r: int(r[dig]) if r is not None else None) + + if self.handle_unknown == 'return_nan': + X_unique.loc[-1] = np.nan + elif self. handle_unknown == 'value': + X_unique.loc[-1] = 0 + + mappings_out.append({'col': col, 'mapping': X_unique}) + + return mappings_out + def transform(self, X): """Perform the transformation to new categorical data. 
@@ -154,6 +188,10 @@ def transform(self, X): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -173,7 +211,7 @@ def transform(self, X): if X[self.cols].isin([-1]).any().any(): raise ValueError('Columns to be encoded can not contain new values') - X = self.binary(X, cols=self.cols) + X = self.binary(X) if self.drop_invariant: for col in self.drop_cols: @@ -231,7 +269,7 @@ def inverse_transform(self, X_in): return X if self.return_df else X.values - def binary(self, X_in, cols=None): + def binary(self, X_in): """ Binary encoding encodes the integers as binary code with one column per digit. @@ -247,33 +285,20 @@ def binary(self, X_in, cols=None): X = X_in.copy(deep=True) - if cols is None: - cols = X.columns.values - pass_thru = [] - else: - pass_thru = [col for col in X.columns.values if col not in cols] - - output = [] - bin_cols = [] - for col in cols: - # get how many digits we need to represent the classes present - digits = self.digits_per_col[col] + cols = X.columns.values.tolist() - X_unique = pd.DataFrame(index=X[col].unique()) - # map the ordinal column into a list of these digits, of length digits - X_unique_to_cols = X_unique.index.map(lambda x: self.col_transform(x, digits)) + for switch in self.mapping: + col = switch.get('col') + mod = switch.get('mapping') - for dig in range(digits): - X_unique[str(col) + '_%d' % (dig, )] = X_unique_to_cols.map( - lambda r: int(r[dig]) if r is not None else None) - bin_cols.append(str(col) + '_%d' % (dig,)) + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) - output.append(X[[col]].merge( - X_unique, how='left', left_on=col, right_index=True).drop(labels=col, axis=1)) + old_column_index = cols.index(col) + cols[old_column_index: old_column_index + 1] = 
mod.columns - if pass_thru: - output.append(X[pass_thru]) - X = pd.concat(output, axis=1).reindex(columns=bin_cols + pass_thru) + X = X.reindex(columns=cols) return X diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index 14510d84..af32fa31 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -26,8 +26,6 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value', which will impute the target mean. sigma: float @@ -72,8 +70,8 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): https://www.kaggle.com/c/caterpillar-tube-pricing/discussion/15748#143154. 
""" - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='value', random_state=None, sigma=None): + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value', random_state=None, sigma=None): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -82,8 +80,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i self.cols = cols self._dim = None self.mapping = None - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self._mean = None self.random_state = random_state self.sigma = sigma @@ -125,6 +123,10 @@ def fit(self, X, y, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + categories = self.fit_leave_one_out( X, y, cols=self.cols @@ -159,6 +161,10 @@ def transform(self, X, y=None): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -183,7 +189,6 @@ def transform(self, X, y=None): X = self.transform_leave_one_out( X, y, mapping=self.mapping, - impute_missing=self.impute_missing, handle_unknown=self.handle_unknown ) @@ -214,7 +219,7 @@ def fit_leave_one_out(self, X_in, y, cols=None): self._mean = y.mean() return {col: y.groupby(X[col]).agg(['sum', 'count']) for col in cols} - def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, handle_unknown='value'): + def transform_leave_one_out(self, X_in, y, mapping=None, handle_unknown='value'): """ Leave one out encoding uses a single column of floats to represent the means of the target 
variables. """ @@ -234,12 +239,11 @@ def transform_leave_one_out(self, X_in, y, mapping=None, impute_missing=True, ha # The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean) - if impute_missing: - if handle_unknown == 'value': - X[col].fillna(self._mean, inplace=True) - elif handle_unknown == 'error': - if X[col].isnull().any(): - raise ValueError('Unexpected categories found in column %s' % col) + if handle_unknown == 'value': + X[col].fillna(self._mean, inplace=True) + elif handle_unknown == 'error': + if X[col].isnull().any(): + raise ValueError('Unexpected categories found in column %s' % col) if self.sigma is not None and y is not None: X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0]) diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index 9570a78a..dc187a16 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -23,8 +23,6 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. 
This can cause @@ -41,15 +39,22 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): >>> bunch = load_boston() >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names) - >>> enc = OneHotEncoder(cols=['CHAS', 'RAD']).fit(X, y) + >>> enc = OneHotEncoder(cols=['CHAS', 'RAD'], handle_unknown='indicator').fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) RangeIndex: 506 entries, 0 to 505 Data columns (total 24 columns): + CRIM 506 non-null float64 + ZN 506 non-null float64 + INDUS 506 non-null float64 CHAS_1 506 non-null int64 CHAS_2 506 non-null int64 CHAS_-1 506 non-null int64 + NOX 506 non-null float64 + RM 506 non-null float64 + AGE 506 non-null float64 + DIS 506 non-null float64 RAD_1 506 non-null int64 RAD_2 506 non-null int64 RAD_3 506 non-null int64 @@ -60,13 +65,6 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): RAD_8 506 non-null int64 RAD_9 506 non-null int64 RAD_-1 506 non-null int64 - CRIM 506 non-null float64 - ZN 506 non-null float64 - INDUS 506 non-null float64 - NOX 506 non-null float64 - RM 506 non-null float64 - AGE 506 non-null float64 - DIS 506 non-null float64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 @@ -86,7 +84,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, handle_unknown='value', use_cat_names=False): + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, + handle_missing='value', handle_unknown='value', use_cat_names=False): self.return_df = return_df self.drop_invariant = drop_invariant self.drop_cols = [] @@ -95,8 +94,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i self.cols = cols self.ordinal_encoder = None self._dim = None - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self.use_cat_names = 
use_cat_names @property @@ -134,12 +133,15 @@ def fit(self, X, y=None, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + self.ordinal_encoder = OrdinalEncoder( verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown, - # TODO Properly set handle missing when it's implemented here - handle_missing='ignore' + handle_unknown='value', + handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) self.mapping = self.generate_mapping() @@ -160,8 +162,9 @@ def generate_mapping(self): col = switch.get('col') column_mapping = switch.get('mapping').copy(deep=True) - if self.handle_unknown == 'value': - column_mapping = column_mapping.append(pd.Series(data=[-1], index=['-1'])) + # TODO test with nan in dataset + if self.handle_missing == 'value': + del column_mapping[np.nan] col_mappings = [] for cat_name, class_ in column_mapping.iteritems(): @@ -174,6 +177,14 @@ def generate_mapping(self): n_col_name = str(col) + '_%s' % (class_,) col_mappings.append({'new_col_name': n_col_name, 'val': class_}) + if self.handle_unknown == 'indicator': + n_col_name = str(col) + '_%s' % (-1,) + if self.use_cat_names: + found_count = found_column_counts.get(n_col_name, 0) + found_column_counts[n_col_name] = found_count + 1 + n_col_name += '#' * found_count + + col_mappings.append({'new_col_name': n_col_name, 'val': -1}) mapping.append({'col': col, 'mapping': col_mappings}) return mapping @@ -193,6 +204,10 @@ def transform(self, X): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -208,6 +223,10 @@ def transform(self, X): X = self.ordinal_encoder.transform(X) + if self.handle_unknown == 'error': + if 
X[self.cols].isin([-1]).any().any(): + raise ValueError('Columns to be encoded can not contain new values') + X = self.get_dummies(X, mapping=self.mapping) if self.drop_invariant: @@ -253,7 +272,7 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.impute_missing and self.handle_unknown == 'value': + if self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): raise ValueError("inverse_transform is not supported because transform impute " @@ -288,12 +307,29 @@ def get_dummies(self, X_in, mapping): for switch in mapping: col = switch.get('col') mod = switch.get('mapping') + + if len(mod) == 0: + continue + + base_matrix = np.eye(N=len(mod), dtype=np.int) + + index = [] new_columns = [] + for column_mapping in mod: new_col_name = column_mapping['new_col_name'] val = column_mapping['val'] - X[new_col_name] = (X[col] == val).astype(int) + index.append(val) new_columns.append(new_col_name) + + base_df = pd.DataFrame(data=base_matrix, columns=new_columns, index=index) + + if self.handle_unknown == 'value': + base_df.loc[-1] = np.zeros(len(mod)) + + base_df = base_df.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) old_column_index = cols.index(col) cols[old_column_index: old_column_index + 1] = new_columns diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index 8afcbae0..55ea67d1 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -8,7 +8,7 @@ class TargetEncoder(BaseEstimator, TransformerMixin): - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, handle_missing='value', handle_unknown='value', min_samples_leaf=1, smoothing=1.0): """Target encoding for categorical features. 
For the case of categorical target: features are replaced with a blend of posterior probability of the target @@ -27,8 +27,6 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i boolean for whether or not to drop columns with 0 variance. return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'valie', which will impute the target mean. min_samples_leaf: int @@ -83,8 +81,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i self.smoothing = float(smoothing) # Make smoothing a float so that python 2 does not treat as integer division self._dim = None self.mapping = None - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing=handle_missing self._mean = None def fit(self, X, y, **kwargs): @@ -119,11 +117,14 @@ def fit(self, X, y, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + _, self.mapping = self.target_encode( X, y, mapping=None, cols=self.cols, - impute_missing=self.impute_missing, handle_unknown=self.handle_unknown, smoothing_in=self.smoothing, min_samples_leaf=self.min_samples_leaf @@ -151,6 +152,10 @@ def transform(self, X, y=None): Transformed values with encoding applied. 
""" + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -176,8 +181,7 @@ def transform(self, X, y=None): X, y, mapping=self.mapping, cols=self.cols, - impute_missing=self.impute_missing, - handle_unknown=self.handle_unknown, + handle_unknown=self.handle_unknown, min_samples_leaf=self.min_samples_leaf, smoothing_in=self.smoothing ) @@ -200,7 +204,7 @@ def fit_transform(self, X, y=None, **fit_params): """ return self.fit(X, y, **fit_params).transform(X, y) - def target_encode(self, X_in, y, mapping=None, cols=None, impute_missing=True, handle_unknown='value', min_samples_leaf=1, smoothing_in=1.0): + def target_encode(self, X_in, y, mapping=None, cols=None, handle_unknown='value', min_samples_leaf=1, smoothing_in=1.0): X = X_in.copy(deep=True) if cols is None: cols = X.columns.values @@ -208,12 +212,11 @@ def target_encode(self, X_in, y, mapping=None, cols=None, impute_missing=True, h if mapping is not None: for col in cols: X[col] = X[col].map(mapping[col]) - if impute_missing: - if handle_unknown == 'value': - X[col].fillna(self._mean, inplace=True) - elif handle_unknown == 'error': - if X[col].isnull().any(): - raise ValueError('Unexpected categories found in column %s' % col) + if handle_unknown == 'value': + X[col].fillna(self._mean, inplace=True) + elif handle_unknown == 'error': + if X[col].isnull().any(): + raise ValueError('Unexpected categories found in column %s' % col) else: mapping = {} prior = self._mean = y.mean() diff --git a/category_encoders/tests/test_basen.py b/category_encoders/tests/test_basen.py new file mode 100644 index 00000000..945bf8f3 --- /dev/null +++ b/category_encoders/tests/test_basen.py @@ -0,0 +1,18 @@ +import pandas as pd +from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ + +import category_encoders as 
encoders + + +class TestBaseNEncoder(TestCase): + + def test_fit_transform_have_base_2_expect_Correct_Encoding(self): + train = pd.Series(['a', 'b', 'c', 'd']) + + result = encoders.BaseNEncoder(base=2).fit_transform(train) + + self.assertEqual(4, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py index f0091651..eae2ebf3 100644 --- a/category_encoders/tests/test_encoders.py +++ b/category_encoders/tests/test_encoders.py @@ -84,7 +84,7 @@ def test_impact_encoders(self): tu.verify_numeric(enc.transform(X_t, y_t)) # when we run transform(X, y) and there is a new value in X, something is wrong and we raise an error - enc = getattr(encoders, encoder_name)(impute_missing=True, handle_unknown='error', cols=['extra']) + enc = getattr(encoders, encoder_name)(handle_unknown='error', cols=['extra']) enc.fit(X, y) self.assertRaises(ValueError, enc.transform, (X_t, y_t)) @@ -133,8 +133,7 @@ def test_handle_missing_error(self): has_null = pd.DataFrame({'city': ['chicago', np.nan], 'color': ['red', np.nan]}) y = pd.Series([1, 0]) - # TODO - implement for all encoders - for encoder_name in ['OrdinalEncoder', 'HelmertEncoder', 'BackwardDifferenceEncoder', 'PolynomialEncoder', 'SumEncoder']: + for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_missing='error', cols='city') @@ -150,8 +149,7 @@ def test_handle_unknown_return_nan(self): test = pd.DataFrame({'city': ['chicago', 'denver']}) y = pd.Series([1, 0]) - # TODO - implement for all encoders - for encoder_name in ['OrdinalEncoder', 'HelmertEncoder', 
'BackwardDifferenceEncoder', 'PolynomialEncoder', 'SumEncoder']: + for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') @@ -168,8 +166,7 @@ def test_handle_unknown_value(self): test = pd.DataFrame({'city': ['chicago', 'denver']}) y = pd.Series([1, 0]) - # TODO - implement for all encoders - for encoder_name in ['OrdinalEncoder', 'HelmertEncoder', 'BackwardDifferenceEncoder', 'PolynomialEncoder', 'SumEncoder']: + for encoder_name in (set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded with self.subTest(encoder_name=encoder_name): enc = getattr(encoders, encoder_name)(handle_unknown='value') @@ -251,7 +248,7 @@ def test_preserve_column_order(self): for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): - print(encoder_name) + encoder = getattr(encoders, encoder_name)() result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) columns = result.columns.values diff --git a/category_encoders/tests/test_leave_one_out.py b/category_encoders/tests/test_leave_one_out.py index 12e2f5af..41fdac5e 100644 --- a/category_encoders/tests/test_leave_one_out.py +++ b/category_encoders/tests/test_leave_one_out.py @@ -54,7 +54,7 @@ def test_leave_one_out_unique(self): X = pd.DataFrame(data=['1', '2', '2', '2', '3'], columns=['col']) y = np.array([1, 0, 1, 0, 1]) - encoder = encoders.LeaveOneOutEncoder(impute_missing=False) + encoder = encoders.LeaveOneOutEncoder(handle_unknown='value') result = encoder.fit(X, y).transform(X, y) self.assertFalse(result.isnull().any().any(), 'There should not be any missing value') diff --git a/category_encoders/tests/test_one_hot.py b/category_encoders/tests/test_one_hot.py index 6def3dc8..ee7f60b2 100644 --- a/category_encoders/tests/test_one_hot.py +++ 
b/category_encoders/tests/test_one_hot.py @@ -25,17 +25,17 @@ def test_one_hot(self): enc.transform(X_t[X_t['extra'] != 'A']).shape[1], 'We have to get the same count of columns') - enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True) + enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='indicator') enc.fit(X) out = enc.transform(X_t) self.assertIn('extra_-1', out.columns.values) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='ignore') + enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='ignore') enc.fit(X) out = enc.transform(X_t) self.assertEqual(len([x for x in out.columns.values if str(x).startswith('extra_')]), 3) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, impute_missing=True, handle_unknown='error') + enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='error') enc.fit(X) with self.assertRaises(ValueError): out = enc.transform(X_t) @@ -45,7 +45,7 @@ def test_one_hot(self): out = enc.transform(X_t) self.assertIn('extra_A', out.columns.values) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, use_cat_names=True) + enc = encoders.OneHotEncoder(verbose=1, return_df=True, use_cat_names=True, handle_unknown='indicator') enc.fit(X) out = enc.transform(X_t) self.assertIn('extra_-1', out.columns.values) @@ -53,7 +53,6 @@ def test_one_hot(self): # test inverse_transform X_i = tu.create_dataset(n_rows=100, has_none=False) X_i_t = tu.create_dataset(n_rows=50, has_none=False) - X_i_t_extra = tu.create_dataset(n_rows=50, extras=True, has_none=False) cols = ['underscore', 'none', 'extra', 321, 'categorical'] enc = encoders.OneHotEncoder(verbose=1, use_cat_names=True, cols=cols) @@ -62,7 +61,7 @@ def test_one_hot(self): tu.verify_inverse_transform(X_i_t, obtained) def test_fit_transform_HaveMissingValuesAndUseCatNames_ExpectCorrectValue(self): - encoder = encoders.OneHotEncoder(cols=[0], use_cat_names=True) + encoder = 
encoders.OneHotEncoder(cols=[0], use_cat_names=True, handle_unknown='indicator') result = encoder.fit_transform([[-1]]) @@ -87,10 +86,52 @@ def test_inverse_transform_HaveNoCatNames_ExpectCorrectInverseTransform(self): assert value.equals(inverse_transformed) def test_fit_transform_HaveColumnAppearTwice_ExpectColumnsDeduped(self): - encoder = encoders.OneHotEncoder(cols=['match', 'match_box'], use_cat_names=True) - value = pd.DataFrame({'match': pd.Series('box_-1'), 'match_box': pd.Series(-1)}) + encoder = encoders.OneHotEncoder(cols=['match', 'match_box'], use_cat_names=True, handle_unknown='indicator') + value = pd.DataFrame({'match': pd.Series('box_-1'), 'match_box': pd.Series('-1')}) result = encoder.fit_transform(value) columns = result.columns.tolist() self.assertSetEqual({'match_box_-1', 'match_-1', 'match_box_-1#', 'match_box_-1##'}, set(columns)) + + def test_fit_transform_HaveHandleUnknownValueAndUnseenValues_ExpectAllZeroes(self): + train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) + test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) + expected_result = pd.DataFrame({'city_1': [1.0, 0.0], 'city_2': [0.0, 0.0]}) + + enc = encoders.OneHotEncoder(handle_unknown='value') + result = enc.fit(train).transform(test) + + pd.testing.assert_frame_equal(expected_result, result) + + def test_fit_transform_HaveHandleUnknownValueAndSeenValues_ExpectMappingUsed(self): + train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) + expected_result = pd.DataFrame({'city_1': [1.0, 0.0], 'city_2': [0.0, 1.0]}) + + enc = encoders.OneHotEncoder(handle_unknown='value') + result = enc.fit(train).transform(train) + + pd.testing.assert_frame_equal(expected_result, result) + + def test_fit_transform_HaveHandleUnknownIndicatorAndNoMissingValue_ExpectExtraColumn(self): + train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) + expected_result = pd.DataFrame({'city_1': [1, 0], + 'city_2': [0, 1], + 'city_-1': [0, 0]}) + + enc = encoders.OneHotEncoder(handle_unknown='indicator') + 
result = enc.fit(train).transform(train) + + pd.testing.assert_frame_equal(expected_result, result) + + def test_fit_transform_HaveHandleUnknownIndicatorAndMissingValue_ExpectValueSet(self): + train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) + test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) + expected_result = pd.DataFrame({'city_1': [1, 0], + 'city_2': [0, 0], + 'city_-1': [0, 1]}) + + enc = encoders.OneHotEncoder(handle_unknown='indicator') + result = enc.fit(train).transform(test) + + pd.testing.assert_frame_equal(expected_result, result) diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index 225177b9..e54d8d2b 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -123,3 +123,12 @@ def test_handle_unknown_have_new_value_expect_negative_1(self): result = enc.transform(test)['city'].tolist() self.assertEqual(expected, result) + + def test_HaveNegativeOneInTrain_ExpectCodedAsOne(self): + train = pd.DataFrame({'city': [-1]}) + expected = [1] + + enc = encoders.OrdinalEncoder(cols=['city']) + result = enc.fit_transform(train)['city'].tolist() + + self.assertEqual(expected, result) diff --git a/category_encoders/tests/test_woe.py b/category_encoders/tests/test_woe.py index c5de98ea..b25b687e 100644 --- a/category_encoders/tests/test_woe.py +++ b/category_encoders/tests/test_woe.py @@ -95,7 +95,7 @@ def test_woe(self): enc.fit(X_balanced, y_missing) # impute missing - enc = encoders.WOEEncoder(impute_missing=False) + enc = encoders.WOEEncoder(handle_missing='return_nan') enc.fit(X, np_y) X1 = enc.transform(X_t) tu.verify_numeric(X1) diff --git a/category_encoders/woe.py b/category_encoders/woe.py index 387be893..1c58d484 100644 --- a/category_encoders/woe.py +++ b/category_encoders/woe.py @@ -21,8 +21,6 @@ class WOEEncoder(BaseEstimator, TransformerMixin): boolean for whether or not to drop columns with 0 variance. 
return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). - impute_missing: bool - boolean for whether or not to apply the logic for handle_unknown, will be deprecated in the future. handle_unknown: str options are 'ignore', 'error' and 'value', defaults to 'value', which will assume WOE=0. randomized: bool, @@ -73,8 +71,8 @@ class WOEEncoder(BaseEstimator, TransformerMixin): """ - def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, impute_missing=True, - handle_unknown='value', random_state=None, randomized=False, sigma=0.05, regularization=1.0): + def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, + handle_unknown='value', handle_missing='value', random_state=None, randomized=False, sigma=0.05, regularization=1.0): self.verbose = verbose self.return_df = return_df self.drop_invariant = drop_invariant @@ -82,8 +80,8 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, i self.cols = cols self._dim = None self.mapping = None - self.impute_missing = impute_missing self.handle_unknown = handle_unknown + self.handle_missing = handle_missing self._sum = None self._count = None self.random_state = random_state @@ -142,6 +140,10 @@ def fit(self, X, y, **kwargs): else: self.cols = util.convert_cols_to_list(self.cols) + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + # Training self.mapping = self._train(X, y, cols=self.cols) @@ -175,6 +177,10 @@ def transform(self, X, y=None): """ + if self.handle_missing == 'error': + if X[self.cols].isnull().any().bool(): + raise ValueError('Columns to be encoded can not contain null') + if self._dim is None: raise ValueError('Must train encoder before it can be used to transform data.') @@ -228,7 +234,7 @@ def _train(self, X, y, cols=None): mapping = {} # Calculate global statistics - self._sum = 
y.sum() + self._sum = y.sum() self._count = y.count() for col in cols: @@ -256,12 +262,11 @@ def _score(self, X, y): X[col] = X[col].map(self.mapping[col]) # Replace missing values only in the computed columns - if self.impute_missing: - if self.handle_unknown == 'value': - X[col].fillna(0, inplace=True) - elif self.handle_unknown == 'error': - if X[col].isnull().any(): - raise ValueError('Unexpected categories found in column %s' % col) + if self.handle_unknown == 'value': + X[col].fillna(0, inplace=True) + elif self.handle_unknown == 'error': + if X[col].isnull().any(): + raise ValueError('Unexpected categories found in column %s' % col) # Randomization is meaningful only for training data -> we do it only if y is present if self.randomized and y is not None: From 28c81a82d102dfd196c96fc308d08ad81d654089 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sat, 17 Nov 2018 17:30:00 -0800 Subject: [PATCH 11/40] When creating expected dataframes set the column order in tests --- category_encoders/tests/test_one_hot.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/category_encoders/tests/test_one_hot.py b/category_encoders/tests/test_one_hot.py index ee7f60b2..5cbc2f8f 100644 --- a/category_encoders/tests/test_one_hot.py +++ b/category_encoders/tests/test_one_hot.py @@ -97,7 +97,9 @@ def test_fit_transform_HaveColumnAppearTwice_ExpectColumnsDeduped(self): def test_fit_transform_HaveHandleUnknownValueAndUnseenValues_ExpectAllZeroes(self): train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) - expected_result = pd.DataFrame({'city_1': [1.0, 0.0], 'city_2': [0.0, 0.0]}) + expected_result = pd.DataFrame({'city_1': [1.0, 0.0], + 'city_2': [0.0, 0.0]}, + columns=['city_1', 'city_2']) enc = encoders.OneHotEncoder(handle_unknown='value') result = enc.fit(train).transform(test) @@ -106,7 +108,9 @@ def test_fit_transform_HaveHandleUnknownValueAndUnseenValues_ExpectAllZeroes(sel def 
test_fit_transform_HaveHandleUnknownValueAndSeenValues_ExpectMappingUsed(self): train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) - expected_result = pd.DataFrame({'city_1': [1.0, 0.0], 'city_2': [0.0, 1.0]}) + expected_result = pd.DataFrame({'city_1': [1.0, 0.0], + 'city_2': [0.0, 1.0]}, + columns=['city_1', 'city_2']) enc = encoders.OneHotEncoder(handle_unknown='value') result = enc.fit(train).transform(train) @@ -117,7 +121,8 @@ def test_fit_transform_HaveHandleUnknownIndicatorAndNoMissingValue_ExpectExtraCo train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) expected_result = pd.DataFrame({'city_1': [1, 0], 'city_2': [0, 1], - 'city_-1': [0, 0]}) + 'city_-1': [0, 0]}, + columns=['city_1', 'city_2', 'city_-1']) enc = encoders.OneHotEncoder(handle_unknown='indicator') result = enc.fit(train).transform(train) @@ -129,7 +134,8 @@ def test_fit_transform_HaveHandleUnknownIndicatorAndMissingValue_ExpectValueSet( test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) expected_result = pd.DataFrame({'city_1': [1, 0], 'city_2': [0, 0], - 'city_-1': [0, 1]}) + 'city_-1': [0, 1]}, + columns=['city_1', 'city_2', 'city_-1']) enc = encoders.OneHotEncoder(handle_unknown='indicator') result = enc.fit(train).transform(test) From 374ca541aaf62aba88a144acbbc7398ca3e995ef Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Mon, 26 Nov 2018 21:02:13 -0800 Subject: [PATCH 12/40] Added functionality to handle return_nan for train and test in handle_missing --- category_encoders/backward_difference.py | 7 +- category_encoders/basen.py | 10 +- category_encoders/binary.py | 7 +- category_encoders/helmert.py | 14 ++- category_encoders/leave_one_out.py | 29 ++++-- category_encoders/one_hot.py | 74 ++++++++------- category_encoders/ordinal.py | 21 ++--- category_encoders/polynomial.py | 7 +- category_encoders/sum_coding.py | 7 +- category_encoders/target_encoder.py | 91 +++++++++++-------- category_encoders/tests/test_basen.py | 9 ++ category_encoders/tests/test_encoders.py | 29 ++++++ 
category_encoders/tests/test_one_hot.py | 8 +- category_encoders/tests/test_ordinal.py | 18 ++++ .../tests/test_target_encoder.py | 22 ++++- category_encoders/woe.py | 46 +++++++--- 16 files changed, 275 insertions(+), 124 deletions(-) diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index 699f6d66..ccebeb28 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -215,7 +215,7 @@ def transform(self, X): @staticmethod def fit_backward_difference_coding(values, handle_missing, handle_unknown): if handle_missing == 'value': - del values[np.nan] + values = values[values > 0] if len(values) < 2: return pd.DataFrame() @@ -229,6 +229,11 @@ def fit_backward_difference_coding(values, handle_missing, handle_unknown): elif handle_unknown == 'value': df.loc[-1] = np.zeros(len(values) - 1) + if handle_missing == 'return_nan': + df.loc[values.loc[np.nan]] = np.nan + elif handle_missing == 'value': + df.loc[-2] = np.zeros(len(values) - 1) + return df @staticmethod diff --git a/category_encoders/basen.py b/category_encoders/basen.py index bfa6ce9c..eaa6fb41 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -153,10 +153,7 @@ def fit_base_n_encoding(self, X): values = switch.get('mapping') if self.handle_missing == 'value': - del values[np.nan] - - if len(values) < 2: - return pd.DataFrame() + values = values[values > 0] digits = self.calc_required_digits(X, col) X_unique = pd.DataFrame(index=values) @@ -172,6 +169,11 @@ def fit_base_n_encoding(self, X): elif self.handle_unknown == 'value': X_unique.loc[-1] = 0 + if self.handle_missing == 'return_nan': + X_unique.loc[values.loc[np.nan]] = np.nan + elif self.handle_missing == 'value': + X_unique.loc[-2] = 0 + mappings_out.append({'col': col, 'mapping': X_unique}) return mappings_out diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 16513ca8..0aececd4 100644 --- 
a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -149,7 +149,7 @@ def fit_binary_encoding(self, X): values = switch.get('mapping') if self.handle_missing == 'value': - del values[np.nan] + values = values[values > 0] if len(values) < 2: return pd.DataFrame() @@ -168,6 +168,11 @@ def fit_binary_encoding(self, X): elif self. handle_unknown == 'value': X_unique.loc[-1] = 0 + if self.handle_missing == 'return_nan': + X_unique.loc[values.loc[np.nan]] = np.nan + elif self.handle_missing == 'value': + X_unique.loc[-2] = 0 + mappings_out.append({'col': col, 'mapping': X_unique}) return mappings_out diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index 9a7fc9e7..35483e10 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -212,20 +212,26 @@ def transform(self, X): @staticmethod def fit_helmert_coding(values, handle_missing, handle_unknown): if handle_missing == 'value': - del values[np.nan] + values = values[values > 0] - if len(values) < 2: + if len(values) == 0: return pd.DataFrame() helmert_contrast_matrix = Helmert().code_without_intercept(values.get_values()) - df = pd.DataFrame(data=helmert_contrast_matrix.matrix, columns=helmert_contrast_matrix.column_suffixes) - df.index += 1 + df = pd.DataFrame(data=helmert_contrast_matrix.matrix, + columns=helmert_contrast_matrix.column_suffixes, + index=values.get_values()) if handle_unknown == 'return_nan': df.loc[-1] = np.nan elif handle_unknown == 'value': df.loc[-1] = np.zeros(len(values) - 1) + if handle_missing == 'return_nan': + df.loc[values.loc[np.nan]] = np.nan + elif handle_missing == 'value': + df.loc[-2] = np.zeros(len(values) - 1) + return df @staticmethod diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index af32fa31..15906d5c 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -111,7 +111,7 @@ def fit(self, X, y, **kwargs): if isinstance(y, pd.DataFrame): y = y.iloc[:, 
0].astype(float) else: - y = pd.Series(y, name='target', dtype=float) + y = pd.Series(y, name='target', index=X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") @@ -180,7 +180,7 @@ def transform(self, X, y=None): if isinstance(y, pd.DataFrame): y = y.iloc[:, 0].astype(float) else: - y = pd.Series(y, name='target', dtype=float) + y = pd.Series(y, name='target', index=X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") @@ -188,8 +188,7 @@ def transform(self, X, y=None): return X X = self.transform_leave_one_out( X, y, - mapping=self.mapping, - handle_unknown=self.handle_unknown + mapping=self.mapping ) if self.drop_invariant: @@ -219,7 +218,7 @@ def fit_leave_one_out(self, X_in, y, cols=None): self._mean = y.mean() return {col: y.groupby(X[col]).agg(['sum', 'count']) for col in cols} - def transform_leave_one_out(self, X_in, y, mapping=None, handle_unknown='value'): + def transform_leave_one_out(self, X_in, y, mapping=None): """ Leave one out encoding uses a single column of floats to represent the means of the target variables. 
""" @@ -229,6 +228,12 @@ def transform_leave_one_out(self, X_in, y, mapping=None, handle_unknown='value') for col, colmap in mapping.items(): level_notunique = colmap['count'] > 1 + is_null = X[col].isnull() + is_unknown = ~X[col].isin(colmap.index) + + if self.handle_unknown == 'error' and is_unknown.any(): + raise ValueError('Columns to be encoded can not contain new values') + if y is None: # Replace level with its mean target; if level occurs only once, use global mean level_means = (colmap['sum'] / colmap['count']).where(level_notunique, self._mean) X[col] = X[col].map(level_means) @@ -239,11 +244,15 @@ def transform_leave_one_out(self, X_in, y, mapping=None, handle_unknown='value') # The 'where' fills in singleton levels (count = 1 -> div by 0) with the global mean X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean) - if handle_unknown == 'value': - X[col].fillna(self._mean, inplace=True) - elif handle_unknown == 'error': - if X[col].isnull().any(): - raise ValueError('Unexpected categories found in column %s' % col) + if self.handle_unknown == 'value': + X[col][is_unknown] = self._mean + elif self.handle_unknown == 'return_nan': + X[col][is_unknown] = np.nan + + if self.handle_missing == 'value': + X[col][is_null] = self._mean + elif self.handle_missing == 'return_nan': + X[col][is_null] = np.nan if self.sigma is not None and y is not None: X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0]) diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index dc187a16..c4821719 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -160,14 +160,19 @@ def generate_mapping(self): for switch in self.ordinal_encoder.mapping: col = switch.get('col') - column_mapping = switch.get('mapping').copy(deep=True) + values = switch.get('mapping').copy(deep=True) # TODO test with nan in dataset if self.handle_missing == 'value': - del column_mapping[np.nan] + values = 
values[values > 0] - col_mappings = [] - for cat_name, class_ in column_mapping.iteritems(): + if len(values) == 0: + continue + + index = [] + new_columns = [] + + for cat_name, class_ in values.iteritems(): if self.use_cat_names: n_col_name = str(col) + '_%s' % (cat_name,) found_count = found_column_counts.get(n_col_name, 0) @@ -175,7 +180,9 @@ def generate_mapping(self): n_col_name += '#' * found_count else: n_col_name = str(col) + '_%s' % (class_,) - col_mappings.append({'new_col_name': n_col_name, 'val': class_}) + + index.append(class_) + new_columns.append(n_col_name) if self.handle_unknown == 'indicator': n_col_name = str(col) + '_%s' % (-1,) @@ -183,9 +190,22 @@ def generate_mapping(self): found_count = found_column_counts.get(n_col_name, 0) found_column_counts[n_col_name] = found_count + 1 n_col_name += '#' * found_count + new_columns.append(n_col_name) + index.append(-1) + + base_matrix = np.eye(N=len(index), dtype=np.int) + base_df = pd.DataFrame(data=base_matrix, columns=new_columns, index=index) + + if self.handle_unknown == 'value': + base_df.loc[-1] = 0 + + if self.handle_missing == 'return_nan': + base_df.loc[values.loc[np.nan]] = np.nan + elif self.handle_missing == 'value': + base_df.loc[-2] = 0 + + mapping.append({'col': col, 'mapping': base_df}) - col_mappings.append({'new_col_name': n_col_name, 'val': -1}) - mapping.append({'col': col, 'mapping': col_mappings}) return mapping def transform(self, X): @@ -304,36 +324,20 @@ def get_dummies(self, X_in, mapping): cols = X.columns.values.tolist() - for switch in mapping: + for switch in self.mapping: col = switch.get('col') mod = switch.get('mapping') - if len(mod) == 0: - continue - - base_matrix = np.eye(N=len(mod), dtype=np.int) - - index = [] - new_columns = [] - - for column_mapping in mod: - new_col_name = column_mapping['new_col_name'] - val = column_mapping['val'] - index.append(val) - new_columns.append(new_col_name) - - base_df = pd.DataFrame(data=base_matrix, columns=new_columns, 
index=index) - - if self.handle_unknown == 'value': - base_df.loc[-1] = np.zeros(len(mod)) - - base_df = base_df.loc[X[col]] - base_df.set_index(X.index, inplace=True) + base_df = mod.loc[X[col]] + base_df = base_df.set_index(X.index) X = pd.concat([base_df, X], axis=1) + old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns + + X = X.reindex(columns=cols) - return X.reindex(columns=cols) + return X def reverse_dummies(self, X, mapping): """ @@ -359,9 +363,11 @@ def reverse_dummies(self, X, mapping): cols.append(col) X[col] = 0 - for column_mapping in mod: - existing_col = column_mapping.get('new_col_name') - val = column_mapping.get('val') + positive_indexes = mod.index[mod.index > 0] + for i in range(positive_indexes.shape[0]): + existing_col = mod.columns[i] + val = positive_indexes[i] + X.loc[X[existing_col] == 1, col] = val mapped_columns.append(existing_col) diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index dd570a72..3dd0d568 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -291,23 +291,22 @@ def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', hand mapping_out = [] for col in cols: + nan_identity = np.nan + if util.is_category(X[col].dtype): categories = X[col].cat.categories else: - categories = [x if x is not None else np.nan for x in pd.unique(X[col].values)] - - data = {} + categories = X[col].unique() - if handle_missing == 'value': - data[np.nan] = -2 + index = pd.Series(categories).fillna(nan_identity).unique() - for i in range(len(categories)): - data[categories[i]] = i + 1 + data = pd.Series(index=index, data=range(1, len(index) + 1)) - if handle_missing == 'return_nan': - data[np.nan] = -2 + if handle_missing == 'value' and ~data.index.isnull().any(): + data.loc[nan_identity] = -2 + elif handle_missing == 'return_nan': + data.loc[nan_identity] = -2 - mapping = 
pd.Series(data) - mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, ) + mapping_out.append({'col': col, 'mapping': data, 'data_type': X[col].dtype}, ) return X, mapping_out diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index ece37fa2..24ed034c 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -213,7 +213,7 @@ def transform(self, X): @staticmethod def fit_polynomial_coding(values, handle_missing, handle_unknown): if handle_missing == 'value': - del values[np.nan] + values = values[values > 0] if len(values) < 2: return pd.DataFrame() @@ -227,6 +227,11 @@ def fit_polynomial_coding(values, handle_missing, handle_unknown): elif handle_unknown == 'value': df.loc[-1] = np.zeros(len(values) - 1) + if handle_missing == 'return_nan': + df.loc[values.loc[np.nan]] = np.nan + elif handle_missing == 'value': + df.loc[-2] = np.zeros(len(values) - 1) + return df @staticmethod diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index e476f496..bf2c9bd2 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -213,7 +213,7 @@ def transform(self, X): @staticmethod def fit_sum_coding(values, handle_missing, handle_unknown): if handle_missing == 'value': - del values[np.nan] + values = values[values > 0] if len(values) < 2: return pd.DataFrame() @@ -227,6 +227,11 @@ def fit_sum_coding(values, handle_missing, handle_unknown): elif handle_unknown == 'value': df.loc[-1] = np.zeros(len(values) - 1) + if handle_missing == 'return_nan': + df.loc[values.loc[np.nan]] = np.nan + elif handle_missing == 'value': + df.loc[-2] = np.zeros(len(values) - 1) + return df @staticmethod diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index 55ea67d1..0bad539a 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd from 
sklearn.base import BaseEstimator, TransformerMixin +from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util __author__ = 'chappers' @@ -77,6 +78,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, h self.drop_cols = [] self.verbose = verbose self.cols = cols + self.ordinal_encoder = None self.min_samples_leaf = min_samples_leaf self.smoothing = float(smoothing) # Make smoothing a float so that python 2 does not treat as integer division self._dim = None @@ -105,7 +107,7 @@ def fit(self, X, y, **kwargs): if isinstance(y, pd.DataFrame): y = y.iloc[:, 0] else: - y = pd.Series(y, name='target') + y = pd.Series(y, name='target', index=X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") @@ -121,14 +123,15 @@ def fit(self, X, y, **kwargs): if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') - _, self.mapping = self.target_encode( - X, y, - mapping=None, + self.ordinal_encoder = OrdinalEncoder( + verbose=self.verbose, cols=self.cols, - handle_unknown=self.handle_unknown, - smoothing_in=self.smoothing, - min_samples_leaf=self.min_samples_leaf + handle_unknown='value', + handle_missing='value' ) + self.ordinal_encoder = self.ordinal_encoder.fit(X) + X = self.ordinal_encoder.transform(X) + self.mapping = self.fit_target_encoding(X, y) if self.drop_invariant: self.drop_cols = [] @@ -138,6 +141,35 @@ def fit(self, X, y, **kwargs): return self + def fit_target_encoding(self, X, y): + mapping = {} + + for switch in self.ordinal_encoder.category_mapping: + col = switch.get('col') + values = switch.get('mapping') + + prior = self._mean = y.mean() + + stats = y.groupby(X[col]).agg(['count', 'mean']) + + smoove = 1 / (1 + np.exp(-(stats['count'] - self.min_samples_leaf) / self.smoothing)) + smoothing = prior * (1 - smoove) + stats['mean'] * smoove + smoothing[stats['count'] 
== 1] = prior + + if self.handle_unknown == 'return_nan': + smoothing.loc[-1] = np.nan + elif self.handle_unknown == 'value': + smoothing.loc[-1] = prior + + if self.handle_missing == 'return_nan': + smoothing.loc[values.loc[np.nan]] = np.nan + elif self.handle_missing == 'value': + smoothing.loc[-2] = prior + + mapping[col] = smoothing + + return mapping + def transform(self, X, y=None): """Perform the transformation to new categorical data. Parameters @@ -177,14 +209,14 @@ def transform(self, X, y=None): if not self.cols: return X - X, _ = self.target_encode( - X, y, - mapping=self.mapping, - cols=self.cols, - handle_unknown=self.handle_unknown, - min_samples_leaf=self.min_samples_leaf, - smoothing_in=self.smoothing - ) + + X = self.ordinal_encoder.transform(X) + + if self.handle_unknown == 'error': + if X[self.cols].isnull().any(): + raise ValueError('Unexpected categories found in dataframe') + + X = self.target_encode(X) if self.drop_invariant: for col in self.drop_cols: @@ -204,27 +236,10 @@ def fit_transform(self, X, y=None, **fit_params): """ return self.fit(X, y, **fit_params).transform(X, y) - def target_encode(self, X_in, y, mapping=None, cols=None, handle_unknown='value', min_samples_leaf=1, smoothing_in=1.0): + def target_encode(self, X_in): X = X_in.copy(deep=True) - if cols is None: - cols = X.columns.values - - if mapping is not None: - for col in cols: - X[col] = X[col].map(mapping[col]) - if handle_unknown == 'value': - X[col].fillna(self._mean, inplace=True) - elif handle_unknown == 'error': - if X[col].isnull().any(): - raise ValueError('Unexpected categories found in column %s' % col) - else: - mapping = {} - prior = self._mean = y.mean() - for col in cols: - stats = y.groupby(X[col]).agg(['count', 'mean']) - smoove = 1 / (1 + np.exp(-(stats['count'] - min_samples_leaf) / smoothing_in)) - smoothing = prior * (1 - smoove) + stats['mean'] * smoove - smoothing[stats['count'] == 1] = prior - mapping[col] = smoothing - - return X, mapping + + for 
col in self.cols: + X[col] = X[col].map(self.mapping[col]) + + return X diff --git a/category_encoders/tests/test_basen.py b/category_encoders/tests/test_basen.py index 945bf8f3..32ec6007 100644 --- a/category_encoders/tests/test_basen.py +++ b/category_encoders/tests/test_basen.py @@ -16,3 +16,12 @@ def test_fit_transform_have_base_2_expect_Correct_Encoding(self): self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) + + def test_inverse_transform_HaveData_ExpectResultReturned(self): + train = pd.Series(list('abcd')).to_frame('letter') + + enc = encoders.BaseNEncoder(base=2) + result = enc.fit_transform(train) + inversed_result = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, inversed_result) diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py index eae2ebf3..7febd391 100644 --- a/category_encoders/tests/test_encoders.py +++ b/category_encoders/tests/test_encoders.py @@ -161,6 +161,35 @@ def test_handle_unknown_return_nan(self): else: self.assertTrue(result[1:].isnull().all()) + def test_handle_missing_return_nan_train(self): + X = pd.DataFrame({'city': ['chicago', 'los angeles', None]}) + y = pd.Series([1, 0, 1]) + + for encoder_name in ( set(encoders.__all__) - {'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + with self.subTest(encoder_name=encoder_name): + enc = getattr(encoders, encoder_name)(handle_missing='return_nan') + result = enc.fit_transform(X, y).iloc[2, :] + + if len(result) == 1: + self.assertTrue(result.isnull().all()) + else: + self.assertTrue(result[1:].isnull().all()) + + def test_handle_missing_return_nan_test(self): + X = pd.DataFrame({'city': ['chicago', 'los angeles', 'chicago']}) + X_t = pd.DataFrame({'city': ['chicago', 'los angeles', None]}) + y = pd.Series([1, 0, 1]) + + for encoder_name in (set(encoders.__all__) - 
{'HashingEncoder'}): # HashingEncoder supports new values by design -> excluded + with self.subTest(encoder_name=encoder_name): + enc = getattr(encoders, encoder_name)(handle_missing='return_nan') + result = enc.fit(X, y).transform(X_t).iloc[2, :] + + if len(result) == 1: + self.assertTrue(result.isnull().all()) + else: + self.assertTrue(result[1:].isnull().all()) + def test_handle_unknown_value(self): train = pd.DataFrame({'city': ['chicago', 'los angeles']}) test = pd.DataFrame({'city': ['chicago', 'denver']}) diff --git a/category_encoders/tests/test_one_hot.py b/category_encoders/tests/test_one_hot.py index 5cbc2f8f..fec268bd 100644 --- a/category_encoders/tests/test_one_hot.py +++ b/category_encoders/tests/test_one_hot.py @@ -97,8 +97,8 @@ def test_fit_transform_HaveColumnAppearTwice_ExpectColumnsDeduped(self): def test_fit_transform_HaveHandleUnknownValueAndUnseenValues_ExpectAllZeroes(self): train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) test = pd.DataFrame({'city': ['Chicago', 'Detroit']}) - expected_result = pd.DataFrame({'city_1': [1.0, 0.0], - 'city_2': [0.0, 0.0]}, + expected_result = pd.DataFrame({'city_1': [1, 0], + 'city_2': [0, 0]}, columns=['city_1', 'city_2']) enc = encoders.OneHotEncoder(handle_unknown='value') @@ -108,8 +108,8 @@ def test_fit_transform_HaveHandleUnknownValueAndUnseenValues_ExpectAllZeroes(sel def test_fit_transform_HaveHandleUnknownValueAndSeenValues_ExpectMappingUsed(self): train = pd.DataFrame({'city': ['Chicago', 'Seattle']}) - expected_result = pd.DataFrame({'city_1': [1.0, 0.0], - 'city_2': [0.0, 1.0]}, + expected_result = pd.DataFrame({'city_1': [1, 0], + 'city_2': [0, 1]}, columns=['city_1', 'city_2']) enc = encoders.OneHotEncoder(handle_unknown='value') diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index e54d8d2b..7b328f80 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -132,3 +132,21 @@ def 
test_HaveNegativeOneInTrain_ExpectCodedAsOne(self): result = enc.fit_transform(train)['city'].tolist() self.assertEqual(expected, result) + + def test_HaveNaNInTrain_ExpectCodedAsOne(self): + train = pd.DataFrame({'city': [np.nan]}) + expected = [1] + + enc = encoders.OrdinalEncoder(cols=['city']) + result = enc.fit_transform(train)['city'].tolist() + + self.assertEqual(expected, result) + + def test_HaveNoneAndNan_ExpectCodesAsOne(self): + train = pd.DataFrame({'city': [np.nan, None]}) + expected = [1, 1] + + enc = encoders.OrdinalEncoder(cols=['city']) + result = enc.fit_transform(train)['city'].tolist() + + self.assertEqual(expected, result) diff --git a/category_encoders/tests/test_target_encoder.py b/category_encoders/tests/test_target_encoder.py index bbd206dc..b18cc4dd 100644 --- a/category_encoders/tests/test_target_encoder.py +++ b/category_encoders/tests/test_target_encoder.py @@ -34,9 +34,11 @@ def test_target_encoder_fit_HaveConstructorSetSmoothingAndMinSamplesLeaf_ExpectU encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f) encoder.fit(binary_cat_example, binary_cat_example['target']) trend_mapping = encoder.mapping['Trend'] - self.assertAlmostEqual(0.4125, trend_mapping['DOWN'], delta=1e-4) - self.assertEqual(0.5, trend_mapping['FLAT']) - self.assertAlmostEqual(0.5874, trend_mapping['UP'], delta=1e-4) + ordinal_mapping = encoder.ordinal_encoder.category_mapping[0]['mapping'] + + self.assertAlmostEqual(0.4125, trend_mapping[ordinal_mapping.loc['DOWN']], delta=1e-4) + self.assertEqual(0.5, trend_mapping[ordinal_mapping.loc['FLAT']]) + self.assertAlmostEqual(0.5874, trend_mapping[ordinal_mapping.loc['UP']], delta=1e-4) def test_target_encoder_fit_transform_HaveConstructorSetSmoothingAndMinSamplesLeaf_ExpectCorrectValueInResult(self): k = 2 @@ -67,6 +69,20 @@ def test_target_encoder_fit_transform_HaveCategoricalColumn_ExpectCorrectValueIn self.assertAlmostEqual(0.4125, values[2], delta=1e-4) self.assertEqual(0.5, values[3]) + 
def test_target_encoder_fit_transform_HaveNanValue_ExpectCorrectValueInResult(self): + k = 2 + f = 10 + binary_cat_example = pd.DataFrame( + {'Trend': pd.Series([np.nan, np.nan, 'DOWN', 'FLAT', 'DOWN', np.nan, 'DOWN', 'FLAT', 'FLAT', 'FLAT']), + 'target': [1, 1, 0, 0, 1, 0, 0, 0, 1, 1]}) + encoder = encoders.TargetEncoder(cols=['Trend'], min_samples_leaf=k, smoothing=f) + result = encoder.fit_transform(binary_cat_example, binary_cat_example['target']) + values = result['Trend'].values + self.assertAlmostEqual(0.5874, values[0], delta=1e-4) + self.assertAlmostEqual(0.5874, values[1], delta=1e-4) + self.assertAlmostEqual(0.4125, values[2], delta=1e-4) + self.assertEqual(0.5, values[3]) + def test_target_encoder_noncontiguous_index(self): data = pd.DataFrame({'x': ['a', 'b', np.nan, 'd', 'e'], 'y': range(5)}).dropna() result = encoders.TargetEncoder(cols=['x']).fit_transform(data[['x']], data['y']) diff --git a/category_encoders/woe.py b/category_encoders/woe.py index 1c58d484..4b9541fe 100644 --- a/category_encoders/woe.py +++ b/category_encoders/woe.py @@ -2,6 +2,7 @@ import numpy as np import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin +from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util from sklearn.utils.random import check_random_state @@ -78,6 +79,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, self.drop_invariant = drop_invariant self.drop_cols = [] self.cols = cols + self.ordinal_encoder = None self._dim = None self.mapping = None self.handle_unknown = handle_unknown @@ -115,7 +117,7 @@ def fit(self, X, y, **kwargs): if isinstance(y, pd.DataFrame): y = y.iloc[:,0] else: - y = pd.Series(y, name='target') + y = pd.Series(y, name='target', index=X.index) # The lengths must be equal if X.shape[0] != y.shape[0]: @@ -144,8 +146,17 @@ def fit(self, X, y, **kwargs): if X[self.cols].isnull().any().bool(): raise ValueError('Columns to be encoded can not contain null') 
+ self.ordinal_encoder = OrdinalEncoder( + verbose=self.verbose, + cols=self.cols, + handle_unknown='value', + handle_missing='value' + ) + self.ordinal_encoder = self.ordinal_encoder.fit(X) + X = self.ordinal_encoder.transform(X) + # Training - self.mapping = self._train(X, y, cols=self.cols) + self.mapping = self._train(X, y) # Store column names with approximately constant variance on the training data if self.drop_invariant: @@ -196,7 +207,7 @@ def transform(self, X, y=None): if isinstance(y, pd.DataFrame): y = y.iloc[:, 0] else: - y = pd.Series(y, name='target') + y = pd.Series(y, name='target', index=X.index) if X.shape[0] != y.shape[0]: raise ValueError("The length of X is " + str(X.shape[0]) + " but length of y is " + str(y.shape[0]) + ".") @@ -206,6 +217,12 @@ def transform(self, X, y=None): # Do not modify the input argument X = X.copy(deep=True) + X = self.ordinal_encoder.transform(X) + + if self.handle_unknown == 'error': + if X[self.cols].isnull().any(): + raise ValueError('Unexpected categories found in dataframe') + # Loop over columns and replace nominal values with WOE X = self._score(X, y) @@ -229,7 +246,7 @@ def fit_transform(self, X, y=None, **fit_params): """ return self.fit(X, y, **fit_params).transform(X, y) - def _train(self, X, y, cols=None): + def _train(self, X, y): # Initialize the output mapping = {} @@ -237,7 +254,9 @@ def _train(self, X, y, cols=None): self._sum = y.sum() self._count = y.count() - for col in cols: + for switch in self.ordinal_encoder.category_mapping: + col = switch.get('col') + values = switch.get('mapping') # Calculate sum and count of the target for each unique value in the feature col stats = y.groupby(X[col]).agg(['sum', 'count']) # Count of x_{i,+} and x_i @@ -251,6 +270,16 @@ def _train(self, X, y, cols=None): # Ignore unique values. This helps to prevent overfitting on id-like columns. 
woe[stats['count'] == 1] = 0 + if self.handle_unknown == 'return_nan': + woe.loc[-1] = np.nan + elif self.handle_unknown == 'value': + woe.loc[-1] = 0 + + if self.handle_missing == 'return_nan': + woe.loc[values.loc[np.nan]] = np.nan + elif self.handle_missing == 'value': + woe.loc[-2] = 0 + # Store WOE for transform() function mapping[col] = woe @@ -261,13 +290,6 @@ def _score(self, X, y): # Score the column X[col] = X[col].map(self.mapping[col]) - # Replace missing values only in the computed columns - if self.handle_unknown == 'value': - X[col].fillna(0, inplace=True) - elif self.handle_unknown == 'error': - if X[col].isnull().any(): - raise ValueError('Unexpected categories found in column %s' % col) - # Randomization is meaningful only for training data -> we do it only if y is present if self.randomized and y is not None: random_state_generator = check_random_state(self.random_state) From df9e0ba7b1de6862aa69a21f9cac7490f488eb40 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Tue, 11 Dec 2018 20:45:37 -0800 Subject: [PATCH 13/40] Make leave one out encoder respect value settings --- category_encoders/leave_one_out.py | 35 ++++++++--- category_encoders/tests/test_leave_one_out.py | 58 ++++++++++++++++++- category_encoders/tests/test_ordinal.py | 22 ------- 3 files changed, 84 insertions(+), 31 deletions(-) diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index 15906d5c..651f33d3 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -216,7 +216,22 @@ def fit_leave_one_out(self, X_in, y, cols=None): cols = X.columns.values self._mean = y.mean() - return {col: y.groupby(X[col]).agg(['sum', 'count']) for col in cols} + + return {col: self.fit_column_map(X[col], y) for col in cols} + + def fit_column_map(self, series, y): + category = pd.Categorical(series) + + categories = category.categories + codes = category.codes.copy() + + codes[codes == -1] = len(categories) + categories = 
np.append(categories, np.nan) + + return_map = pd.Series(dict([(code, category) for code, category in enumerate(categories)])) + + result = y.groupby(codes).agg(['sum', 'count']) + return result.rename(return_map) def transform_leave_one_out(self, X_in, y, mapping=None): """ @@ -228,10 +243,14 @@ def transform_leave_one_out(self, X_in, y, mapping=None): for col, colmap in mapping.items(): level_notunique = colmap['count'] > 1 - is_null = X[col].isnull() - is_unknown = ~X[col].isin(colmap.index) - if self.handle_unknown == 'error' and is_unknown.any(): + unique_train = colmap.index + unseen_values = pd.Series([x for x in X[col].unique() if x not in unique_train]) + + is_nan = X[col].isnull() + is_unknown_value = X[col].isin(unseen_values.dropna()) + + if self.handle_unknown == 'error' and is_unknown_value.any(): raise ValueError('Columns to be encoded can not contain new values') if y is None: # Replace level with its mean target; if level occurs only once, use global mean @@ -245,14 +264,14 @@ def transform_leave_one_out(self, X_in, y, mapping=None): X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean) if self.handle_unknown == 'value': - X[col][is_unknown] = self._mean + X[col][is_unknown_value] = self._mean elif self.handle_unknown == 'return_nan': - X[col][is_unknown] = np.nan + X[col][is_unknown_value] = np.nan if self.handle_missing == 'value': - X[col][is_null] = self._mean + X[col][is_nan & unseen_values.isnull().any()] = self._mean elif self.handle_missing == 'return_nan': - X[col][is_null] = np.nan + X[col][is_nan] = np.nan if self.sigma is not None and y is not None: X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0]) diff --git a/category_encoders/tests/test_leave_one_out.py b/category_encoders/tests/test_leave_one_out.py index 41fdac5e..04d8aa2f 100644 --- a/category_encoders/tests/test_leave_one_out.py +++ b/category_encoders/tests/test_leave_one_out.py @@ -48,7 +48,7 @@ def 
test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self): self.assertEqual(1, len(mapping)) self.assertIn('col_b', mapping) # the model should have the updated mapping expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2']) - pd.testing.assert_frame_equal(expected, mapping['col_b'], check_like=True) + np.testing.assert_equal(expected.values, mapping['col_b'].values) def test_leave_one_out_unique(self): X = pd.DataFrame(data=['1', '2', '2', '2', '3'], columns=['col']) @@ -60,3 +60,59 @@ def test_leave_one_out_unique(self): self.assertFalse(result.isnull().any().any(), 'There should not be any missing value') expected = pd.DataFrame(data=[y.mean(), 0.5, 0, 0.5, y.mean()], columns=['col']) pd.testing.assert_frame_equal(expected, result) + + def test_HandleMissingIsValueAndNanInTrain_ExpectAtValueSet(self): + df = pd.DataFrame({ + 'color': [np.nan, np.nan, np.nan, "b", "b", "b"], + 'outcome': [2, 2, 0, 1, 0, 1]}) + + X = df.drop('outcome', axis=1) + y = df.drop('color', axis=1) + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value') + obtained = ce_leave.fit_transform(X, y['outcome']) + + self.assertEqual([1, 1, 2, 0.5, 1.0, 0.5], list(obtained['color'])) + + def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): + df = pd.DataFrame({ + 'color': ["a", "a", "a", "b", "b", "b"], + 'outcome': [1.6, 0, 0, 1, 0, 1]}) + + train = df.drop('outcome', axis=1) + target = df.drop('color', axis=1) + test = pd.Series([np.nan, 'b'], name='color') + test_target = pd.Series([0, 0]) + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value') + ce_leave.fit(train, target['outcome']) + obtained = ce_leave.transform(test, test_target) + + self.assertEqual([.6, 1.0], list(obtained['color'])) + + def test_HandleMissingIsValueAndNanInTestAndNoTestTarget_ExpectMean(self): + df = pd.DataFrame({ + 'color': ["a", "a", "a", "b", "b", "b"], + 'outcome': [1, 0, 0, 1, 0, 1]}) + + train = df.drop('outcome', 
axis=1) + target = df.drop('color', axis=1) + test = pd.Series([np.nan, 'b'], name='color') + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value') + ce_leave.fit(train, target['outcome']) + obtained = ce_leave.transform(test) + + self.assertEqual([.5, 2/3], list(obtained['color'])) + + def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): + train = pd.Series(["a", "a", "a", "b", "b", "b"], name='color') + target = pd.Series([1.6, 0, 0, 1, 0, 1], name='target') + test = pd.Series(['b', 'c'], name='color') + test_target = pd.Series([0, 0]) + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_unknown='value') + ce_leave.fit(train, target) + obtained = ce_leave.transform(test, test_target) + + self.assertEqual([1.0, .6], list(obtained['color'])) diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index 7b328f80..8c7b0ba0 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -91,28 +91,6 @@ def test_handle_missing_have_nan_transform_time_expect_negative_2(self): self.assertListEqual([1, -2], out['city'].tolist()) - def test_handle_missing_have_nan_fit_time_return_nan(self): - train = pd.DataFrame({'city': ['chicago', np.nan]}) - - enc = encoders.OrdinalEncoder(handle_missing='return_nan') - out = enc.fit_transform(train)['city'].tolist() - - self.assertEqual(2, len(out)) - self.assertEqual(1.0, out[0]) - self.assertTrue(np.isnan(out[1])) - - def test_handle_missing_have_nan_transform_time_return_nan(self): - train = pd.DataFrame({'city': ['chicago', 'st louis']}) - test = pd.DataFrame({'city': ['chicago', np.nan]}) - - enc = encoders.OrdinalEncoder(handle_missing='return_nan') - enc.fit(train) - out = enc.transform(test)['city'].tolist() - - self.assertEqual(2, len(out)) - self.assertEqual(1.0, out[0]) - self.assertTrue(np.isnan(out[1])) - def test_handle_unknown_have_new_value_expect_negative_1(self): train = 
pd.DataFrame({'city': ['chicago', 'st louis']}) test = pd.DataFrame({'city': ['chicago', 'los angeles']}) From fc4917ae8a7320fc9a258b50d82a177ed2124a91 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 15:06:02 -0700 Subject: [PATCH 14/40] Convert backward difference and basen to use value and indicator correctly --- category_encoders/backward_difference.py | 41 +++++++------ category_encoders/basen.py | 22 +++---- .../tests/test_backward_difference.py | 59 +++++++++++++++++- category_encoders/tests/test_basen.py | 60 ++++++++++++++++++- .../tests/test_target_encoder.py | 27 +++++++++ category_encoders/utils.py | 2 +- 6 files changed, 176 insertions(+), 35 deletions(-) diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index ccebeb28..7d343eea 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -148,8 +148,10 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: values = switch.get('mapping') - column_mapping = self.fit_backward_difference_coding(values, self.handle_missing, self.handle_unknown) - mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) + col = switch.get('col') + + column_mapping = self.fit_backward_difference_coding(col, values, self.handle_missing, self.handle_unknown) + mappings_out.append({'col': col, 'mapping': column_mapping, }) self.mapping = mappings_out @@ -213,26 +215,31 @@ def transform(self, X): return X.values @staticmethod - def fit_backward_difference_coding(values, handle_missing, handle_unknown): + def fit_backward_difference_coding(col, values, handle_missing, handle_unknown): if handle_missing == 'value': values = values[values > 0] + values_to_encode = values.get_values() + if len(values) < 2: - return pd.DataFrame() + return pd.DataFrame(index=values_to_encode) - backwards_difference_matrix = Diff().code_without_intercept(values.get_values()) - df = 
pd.DataFrame(data=backwards_difference_matrix.matrix, columns=backwards_difference_matrix.column_suffixes) - df.index += 1 + if handle_unknown == 'indicator': + values_to_encode = np.append(values_to_encode, -1) + + backwards_difference_matrix = Diff().code_without_intercept(values_to_encode) + df = pd.DataFrame(data=backwards_difference_matrix.matrix, index=values_to_encode, + columns=[str(col) + '_%d' % (i, ) for i in range(len(backwards_difference_matrix.column_suffixes))]) if handle_unknown == 'return_nan': df.loc[-1] = np.nan elif handle_unknown == 'value': - df.loc[-1] = np.zeros(len(values) - 1) + df.loc[-1] = np.zeros(len(values_to_encode) - 1) if handle_missing == 'return_nan': df.loc[values.loc[np.nan]] = np.nan elif handle_missing == 'value': - df.loc[-2] = np.zeros(len(values) - 1) + df.loc[-2] = np.zeros(len(values_to_encode) - 1) return df @@ -250,16 +257,14 @@ def backward_difference_coding(X_in, mapping): for switch in mapping: col = switch.get('col') mod = switch.get('mapping') - new_columns = [] - for i in range(len(mod.columns)): - c = mod.columns[i] - new_col = str(col) + '_%d' % (i, ) - X.loc[:, new_col] = mod[c].loc[X[col]].values - new_columns.append(new_col) + + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) + old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns cols = ['intercept'] + cols - X = X.reindex(columns=cols) - return X \ No newline at end of file + return X.reindex(columns=cols) diff --git a/category_encoders/basen.py b/category_encoders/basen.py index eaa6fb41..18b534f3 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -155,14 +155,10 @@ def fit_base_n_encoding(self, X): if self.handle_missing == 'value': values = values[values > 0] - digits = self.calc_required_digits(X, col) - X_unique = pd.DataFrame(index=values) - - X_unique_to_cols = 
X_unique.index.map(lambda x: self.col_transform(x, digits)) - - for dig in range(digits): - X_unique[str(col) + '_%d' % (dig,)] = X_unique_to_cols.map( - lambda r: int(r[dig]) if r is not None else None) + digits = self.calc_required_digits(values) + X_unique = pd.DataFrame(index=values, + columns=[str(col) + '_%d' % x for x in range(digits)], + data=np.array([self.col_transform(x, digits) for x in range(1, len(values) + 1)])) if self.handle_unknown == 'return_nan': X_unique.loc[-1] = np.nan @@ -281,12 +277,12 @@ def inverse_transform(self, X_in): return X if self.return_df else X.values - def calc_required_digits(self, X, col): + def calc_required_digits(self, values): # figure out how many digits we need to represent the classes present if self.base == 1: - digits = len(X[col].unique()) + 1 + digits = len(values) + 1 else: - digits = int(np.ceil(math.log(len(X[col].unique()), self.base))) + 1 + digits = int(np.ceil(math.log(len(values), self.base))) + 1 return digits @@ -319,9 +315,7 @@ def basen_encode(self, X_in, cols=None): old_column_index = cols.index(col) cols[old_column_index: old_column_index + 1] = mod.columns - X = X.reindex(columns=cols) - - return X + return X.reindex(columns=cols) def basen_to_integer(self, X, cols, base): """ diff --git a/category_encoders/tests/test_backward_difference.py b/category_encoders/tests/test_backward_difference.py index d920f6be..23f21961 100644 --- a/category_encoders/tests/test_backward_difference.py +++ b/category_encoders/tests/test_backward_difference.py @@ -1,6 +1,6 @@ import pandas as pd from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ - +import numpy as np import category_encoders as encoders @@ -86,3 +86,60 @@ def test_backwards_difference_encoder_2StringCols_ExpectCorrectOrder(self): columns = encoder.transform(train).columns.values self.assertItemsEqual(expected_columns, columns) + + def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + 
+ encoder = encoders.BackwardDifferenceEncoder(handle_missing='indicator') + result = encoder.fit_transform(train) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0], + [1, 1 / 3.0, 2 / 3.0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.BackwardDifferenceEncoder(handle_missing='indicator') + result = encoder.fit_transform(train) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.BackwardDifferenceEncoder(handle_missing='indicator') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0], + [1, 1 / 3.0, 2 / 3.0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='indicator') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0], + [1, 1 / 3.0, 2 / 3.0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.BackwardDifferenceEncoder(handle_unknown='indicator') + result = encoder.fit_transform(train) + + expected = [[1, -2 / 3.0, -1 / 3.0], + [1, 1 / 3.0, -1 / 3.0]] + self.assertEqual(result.values.tolist(), expected) diff --git a/category_encoders/tests/test_basen.py b/category_encoders/tests/test_basen.py index 32ec6007..219526af 100644 --- a/category_encoders/tests/test_basen.py +++ b/category_encoders/tests/test_basen.py @@ -1,6 +1,6 @@ import pandas as pd from unittest2 import TestCase # 
or `from unittest import ...` if on Python 3.4+ - +import numpy as np import category_encoders as encoders @@ -25,3 +25,61 @@ def test_inverse_transform_HaveData_ExpectResultReturned(self): inversed_result = enc.inverse_transform(result) pd.testing.assert_frame_equal(train, inversed_result) + + def test_HaveIndicatorAndNanValue_ExpectNewColumn(self): + train = pd.Series(['a', 'b', 'c', np.nan]) + + result = encoders.BaseNEncoder(handle_missing='indicator', base=2).fit_transform(train) + + self.assertEqual(4, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) + + def test_HandleMissingIndicator_HaveNoNan_ExpectThirdColumn(self): + train = pd.Series(['a', 'b', 'c']) + + result = encoders.BaseNEncoder(handle_missing='indicator', base=2).fit_transform(train) + + self.assertEqual(3, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = pd.Series(['a', 'b', 'c']) + test = pd.Series(['a', 'b', 'c', np.nan]) + + encoder = encoders.BaseNEncoder(handle_missing='indicator') + encoder.fit(train) + result = encoder.transform(test) + + self.assertEqual(4, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) + + # def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + # train = ['A', 'B'] + # test = ['A', 'B', 'C'] + # + # encoder = encoders.BaseNEncoder(handle_unknown='indicator') + # encoder.fit(train) + # 
result = encoder.transform(test) + # + # expected = [[1, -2 / 3.0, -1 / 3.0], + # [1, 1 / 3.0, -1 / 3.0], + # [1, 1 / 3.0, 2 / 3.0]] + # self.assertEqual(result.values.tolist(), expected) + # + # def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + # train = ['A', 'B'] + # + # encoder = encoders.BaseNEncoder(handle_unknown='indicator') + # result = encoder.fit_transform(train) + # + # expected = [[1, -2 / 3.0, -1 / 3.0], + # [1, 1 / 3.0, -1 / 3.0]] + # self.assertEqual(result.values.tolist(), expected) diff --git a/category_encoders/tests/test_target_encoder.py b/category_encoders/tests/test_target_encoder.py index b18cc4dd..057a0381 100644 --- a/category_encoders/tests/test_target_encoder.py +++ b/category_encoders/tests/test_target_encoder.py @@ -88,3 +88,30 @@ def test_target_encoder_noncontiguous_index(self): result = encoders.TargetEncoder(cols=['x']).fit_transform(data[['x']], data['y']) self.assertTrue(np.allclose(result, 2.0)) + def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): + df = pd.DataFrame({ + 'color': ["a", "a", "a", "b", "b", "b"], + 'outcome': [1.6, 0, 0, 1, 0, 1]}) + + train = df.drop('outcome', axis=1) + target = df.drop('color', axis=1) + test = pd.Series([np.nan, 'b'], name='color') + test_target = pd.Series([0, 0]) + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value') + ce_leave.fit(train, target['outcome']) + obtained = ce_leave.transform(test, test_target) + + self.assertEqual(.6, list(obtained['color'])[0]) + + def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): + train = pd.Series(["a", "a", "a", "b", "b", "b"], name='color') + target = pd.Series([1.6, 0, 0, 1, 0, 1], name='target') + test = pd.Series(['c', 'b'], name='color') + test_target = pd.Series([0, 0]) + + ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_unknown='value') + ce_leave.fit(train, target) + obtained = ce_leave.transform(test, test_target) + + self.assertEqual(.6, list(obtained['color'])[0]) 
diff --git a/category_encoders/utils.py b/category_encoders/utils.py index 7343f3d5..3e9bef88 100644 --- a/category_encoders/utils.py +++ b/category_encoders/utils.py @@ -46,7 +46,7 @@ def convert_input(X): """ if not isinstance(X, pd.DataFrame): if isinstance(X, list): - X = pd.DataFrame(np.array(X)) + X = pd.DataFrame(X) elif isinstance(X, (np.generic, np.ndarray)): X = pd.DataFrame(X) elif isinstance(X, csr_matrix): From f00e77a2fbb1468e51f4d4566626f6d1288bdbb9 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 15:12:33 -0700 Subject: [PATCH 15/40] Convert binary encoder to internally use the base n encoder so that way the handle missing and value params get inherited --- category_encoders/binary.py | 246 ++---------------------------------- 1 file changed, 8 insertions(+), 238 deletions(-) diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 0aececd4..7c5985f4 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -1,11 +1,9 @@ """Binary encoding""" -import copy import pandas as pd -import numpy as np from sklearn.base import BaseEstimator, TransformerMixin -from category_encoders.ordinal import OrdinalEncoder -import category_encoders.utils as util + +import category_encoders as ce __author__ = 'willmcginnis' @@ -73,16 +71,9 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): def __init__(self, verbose=0, cols=None, mapping=None, drop_invariant=False, return_df=True, handle_unknown='value', handle_missing='value'): - self.return_df = return_df - self.drop_invariant = drop_invariant - self.drop_cols = [] - self.verbose = verbose - self.handle_unknown = handle_unknown - self.handle_missing = handle_missing - self.cols = cols - self.mapping = mapping - self.ordinal_encoder = None - self._dim = None + self.base_n_encoder = ce.BaseNEncoder(base=2, verbose=verbose, cols=cols, mapping=mapping, + drop_invariant=drop_invariant, return_df=return_df, + handle_unknown=handle_unknown, 
handle_missing=handle_missing) def fit(self, X, y=None, **kwargs): """Fit encoder according to X and y. @@ -104,79 +95,10 @@ def fit(self, X, y=None, **kwargs): """ - # if the input dataset isn't already a dataframe, convert it to one (using default column names) - # first check the type - X = util.convert_input(X) - - self._dim = X.shape[1] - - # if columns aren't passed, just use every string column - if self.cols is None: - self.cols = util.get_obj_cols(X) - else: - self.cols = util.convert_cols_to_list(self.cols) - - if self.handle_missing == 'error': - if X[self.cols].isnull().any().bool(): - raise ValueError('Columns to be encoded can not contain null') - - # train an ordinal pre-encoder - self.ordinal_encoder = OrdinalEncoder( - verbose=self.verbose, - cols=self.cols, - handle_unknown='value', - handle_missing='value' - ) - X = X.drop_duplicates(subset=self.cols) if self.cols else X - self.ordinal_encoder = self.ordinal_encoder.fit(X) - - self.mapping = self.fit_binary_encoding(X) - - # drop all output columns with 0 variance. 
- if self.drop_invariant: - self.drop_cols = [] - X_temp = self.transform(X) - generated_cols = util.get_generated_cols(X, X_temp, self.cols) - self.drop_cols = [x for x in generated_cols if X_temp[x].var() <= 10e-5] + self.base_n_encoder.fit(X, y, **kwargs) return self - def fit_binary_encoding(self, X): - mappings_out = [] - - for switch in self.ordinal_encoder.category_mapping: - col = switch.get('col') - values = switch.get('mapping') - - if self.handle_missing == 'value': - values = values[values > 0] - - if len(values) < 2: - return pd.DataFrame() - - digits = self.calc_required_digits(X, col) - X_unique = pd.DataFrame(index=values) - - X_unique_to_cols = X_unique.index.map(lambda x: self.col_transform(x, digits)) - - for dig in range(digits): - X_unique[str(col) + '_%d' % (dig,)] = X_unique_to_cols.map( - lambda r: int(r[dig]) if r is not None else None) - - if self.handle_unknown == 'return_nan': - X_unique.loc[-1] = np.nan - elif self. handle_unknown == 'value': - X_unique.loc[-1] = 0 - - if self.handle_missing == 'return_nan': - X_unique.loc[values.loc[np.nan]] = np.nan - elif self.handle_missing == 'value': - X_unique.loc[-2] = 0 - - mappings_out.append({'col': col, 'mapping': X_unique}) - - return mappings_out - def transform(self, X): """Perform the transformation to new categorical data. 
@@ -193,39 +115,7 @@ def transform(self, X): """ - if self.handle_missing == 'error': - if X[self.cols].isnull().any().bool(): - raise ValueError('Columns to be encoded can not contain null') - - if self._dim is None: - raise ValueError('Must train encoder before it can be used to transform data.') - - # first check the type - X = util.convert_input(X) - - # then make sure that it is the right size - if X.shape[1] != self._dim: - raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) - - if not self.cols: - return X if self.return_df else X.values - - X = self.ordinal_encoder.transform(X) - - if self.handle_unknown == 'error': - if X[self.cols].isin([-1]).any().any(): - raise ValueError('Columns to be encoded can not contain new values') - - X = self.binary(X) - - if self.drop_invariant: - for col in self.drop_cols: - X.drop(col, 1, inplace=True) - - if self.return_df: - return X - else: - return X.values + return self.base_n_encoder.transform(X) def inverse_transform(self, X_in): """ @@ -239,126 +129,6 @@ def inverse_transform(self, X_in): ------- p: array, the same size of X_in - """ - X = X_in.copy(deep=True) - - # first check the type - X = util.convert_input(X) - - if self._dim is None: - raise ValueError('Must train encoder before it can be used to inverse_transform data') - - X = self.binary_to_integer(X, self.cols) - - # then make sure that it is the right size - if X.shape[1] != self._dim: - if self.drop_invariant: - raise ValueError("Unexpected input dimension %d, the attribute drop_invariant should " - "set as False when transform data" % (X.shape[1],)) - else: - raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,)) - - if not self.cols: - return X if self.return_df else X.values - - if self.handle_unknown == 'value': - for col in self.cols: - if any(X[col] == -1): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode 
%s" % (col,)) - - for switch in self.ordinal_encoder.mapping: - column_mapping = switch.get('mapping') - inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) - X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type')) - - return X if self.return_df else X.values - - def binary(self, X_in): - """ - Binary encoding encodes the integers as binary code with one column per digit. - - Parameters - ---------- - X_in: DataFrame - cols: list-like, default None - Column names in the DataFrame to be encoded - Returns - ------- - dummies : DataFrame - """ - - X = X_in.copy(deep=True) - - cols = X.columns.values.tolist() - - for switch in self.mapping: - col = switch.get('col') - mod = switch.get('mapping') - - base_df = mod.loc[X[col]] - base_df.set_index(X.index, inplace=True) - X = pd.concat([base_df, X], axis=1) - - old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = mod.columns - - X = X.reindex(columns=cols) - - return X - - def binary_to_integer(self, X, cols): - """ - Convert binary code as integers. 
- - Parameters - ---------- - X : DataFrame - encoded data - cols : list-like - Column names in the DataFrame that be encoded - - Returns - ------- - numerical: DataFrame - """ - out_cols = X.columns.values - - for col in cols: - col_list = [col0 for col0 in out_cols if str(col0).startswith(str(col))] - for col0 in col_list: - if any(X[col0].isnull()): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode %s" % (col,)) - - len0 = len(col_list) - value_array = np.array([2 ** (len0 - 1 - i) for i in range(len0)]) - - X[col] = np.dot(X[col_list].values, value_array.T) - out_cols = [col0 for col0 in out_cols if col0 not in col_list] - - X = X.reindex(columns=out_cols + cols) - - return X - - @staticmethod - def calc_required_digits(X, col): - """ - figure out how many digits we need to represent the classes present - """ - return int(np.ceil(np.log2(len(X[col].unique())))) + 1 - - @staticmethod - def col_transform(col, digits): - """ - The lambda body to transform the column values """ - if col is None or np.isnan(col) or float(col) < 0.0: - return None - else: - col = list("{0:b}".format(int(col))) - if len(col) == digits: - return col - else: - return [str(0) for _ in range(digits - len(col))] + col + return self.base_n_encoder.inverse_transform(X_in) From 111b9f858f6713b8194a48dc7a3460eeece9edf3 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 15:32:41 -0700 Subject: [PATCH 16/40] Have base n with handle unknown indicator --- category_encoders/basen.py | 3 ++ category_encoders/tests/test_basen.py | 45 ++++++++++++++------------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/category_encoders/basen.py b/category_encoders/basen.py index 18b534f3..d6e3656e 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -155,6 +155,9 @@ def fit_base_n_encoding(self, X): if self.handle_missing == 'value': values = values[values > 0] + if self.handle_unknown 
== 'indicator': + values = np.append(values, -1) + digits = self.calc_required_digits(values) X_unique = pd.DataFrame(index=values, columns=[str(col) + '_%d' % x for x in range(digits)], diff --git a/category_encoders/tests/test_basen.py b/category_encoders/tests/test_basen.py index 219526af..5447897c 100644 --- a/category_encoders/tests/test_basen.py +++ b/category_encoders/tests/test_basen.py @@ -61,25 +61,26 @@ def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) - # def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): - # train = ['A', 'B'] - # test = ['A', 'B', 'C'] - # - # encoder = encoders.BaseNEncoder(handle_unknown='indicator') - # encoder.fit(train) - # result = encoder.transform(test) - # - # expected = [[1, -2 / 3.0, -1 / 3.0], - # [1, 1 / 3.0, -1 / 3.0], - # [1, 1 / 3.0, 2 / 3.0]] - # self.assertEqual(result.values.tolist(), expected) - # - # def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): - # train = ['A', 'B'] - # - # encoder = encoders.BaseNEncoder(handle_unknown='indicator') - # result = encoder.fit_transform(train) - # - # expected = [[1, -2 / 3.0, -1 / 3.0], - # [1, 1 / 3.0, -1 / 3.0]] - # self.assertEqual(result.values.tolist(), expected) + def test_HandleUnknown_HaveUnknown_ExpectIndicatorInTest(self): + train = ['A', 'B', 'C'] + test = ['A', 'B', 'C', 'D'] + + encoder = encoders.BaseNEncoder(handle_unknown='indicator') + encoder.fit(train) + result = encoder.transform(test) + + self.assertEqual(4, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + self.assertListEqual([0, 1, 1], result.iloc[2, :].tolist()) + self.assertListEqual([1, 0, 0], result.iloc[3, :].tolist()) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = 
encoders.BaseNEncoder(handle_unknown='indicator') + result = encoder.fit_transform(train) + + self.assertEqual(2, result.shape[0]) + self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) + self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) From bdc3fe70d33b85cfdaa5ed02ccfd60807ae4f1da Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 15:53:40 -0700 Subject: [PATCH 17/40] Have helmert check value and indicator and make faster --- category_encoders/helmert.py | 45 ++++++++++--------- category_encoders/tests/test_helmert.py | 59 ++++++++++++++++++++++++- 2 files changed, 82 insertions(+), 22 deletions(-) diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index 35483e10..a2d22aa8 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -144,10 +144,11 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: - values = switch.get('mapping') - column_mapping = self.fit_helmert_coding(values, self.handle_missing, self.handle_unknown) - mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) + col = switch.get('col') + + column_mapping = self.fit_helmert_coding(col, values, self.handle_missing, self.handle_unknown) + mappings_out.append({'col': col, 'mapping': column_mapping, }) self.mapping = mappings_out @@ -210,27 +211,32 @@ def transform(self, X): return X.values @staticmethod - def fit_helmert_coding(values, handle_missing, handle_unknown): + def fit_helmert_coding(col, values, handle_missing, handle_unknown): if handle_missing == 'value': values = values[values > 0] - if len(values) == 0: - return pd.DataFrame() + values_to_encode = values.get_values() - helmert_contrast_matrix = Helmert().code_without_intercept(values.get_values()) - df = pd.DataFrame(data=helmert_contrast_matrix.matrix, - columns=helmert_contrast_matrix.column_suffixes, - index=values.get_values()) + if len(values) < 2: + return pd.DataFrame(index=values_to_encode) + + if 
handle_unknown == 'indicator': + values_to_encode = np.append(values_to_encode, -1) + + helmert_contrast_matrix = Helmert().code_without_intercept(values_to_encode) + df = pd.DataFrame(data=helmert_contrast_matrix.matrix, index=values_to_encode, + columns=[str(col) + '_%d' % (i,) for i in + range(len(helmert_contrast_matrix.column_suffixes))]) if handle_unknown == 'return_nan': df.loc[-1] = np.nan elif handle_unknown == 'value': - df.loc[-1] = np.zeros(len(values) - 1) + df.loc[-1] = np.zeros(len(values_to_encode) - 1) if handle_missing == 'return_nan': df.loc[values.loc[np.nan]] = np.nan elif handle_missing == 'value': - df.loc[-2] = np.zeros(len(values) - 1) + df.loc[-2] = np.zeros(len(values_to_encode) - 1) return df @@ -248,17 +254,14 @@ def helmert_coding(X_in, mapping): for switch in mapping: col = switch.get('col') mod = switch.get('mapping') - new_columns = [] - for i in range(len(mod.columns)): - c = mod.columns[i] - new_col = str(col) + '_%d' % (i, ) - X.loc[:, new_col] = mod[c].loc[X[col]].values - new_columns.append(new_col) + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) + old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns cols = ['intercept'] + cols - X = X.reindex(columns=cols) - return X + return X.reindex(columns=cols) diff --git a/category_encoders/tests/test_helmert.py b/category_encoders/tests/test_helmert.py index fd4c344f..c62447fd 100644 --- a/category_encoders/tests/test_helmert.py +++ b/category_encoders/tests/test_helmert.py @@ -1,6 +1,6 @@ import pandas as pd from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ - +import numpy as np import category_encoders as encoders @@ -86,3 +86,60 @@ def test_helmert_2StringCols_ExpectCorrectOrder(self): columns = encoder.transform(train).columns.values self.assertItemsEqual(expected_columns, columns) + + def 
test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + + encoder = encoders.HelmertEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [[1, -1, -1], + [1, 1, -1], + [1, 0, 2]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.HelmertEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [[1, -1, -1], + [1, 1, -1]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.HelmertEncoder(handle_missing='indicator', handle_unknown='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, -1, -1], + [1, 1, -1], + [1, 0, 2]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.HelmertEncoder(handle_unknown='indicator', handle_missing='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, -1, -1], + [1, 1, -1], + [1, 0, 2]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveOnlyKnown_ExpectExtraColumn(self): + train = ['A', 'B'] + + encoder = encoders.HelmertEncoder(handle_unknown='indicator', handle_missing='value') + result = encoder.fit_transform(train) + + expected = [[1, -1, -1], + [1, 1, -1]] + self.assertEqual(result.values.tolist(), expected) From 1d3aa4f80b332ed0cd0c908273893323782d1e92 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 20:17:15 -0700 Subject: [PATCH 18/40] Add tests for one hot encoder to ensure correct handle missing and handle unknown --- category_encoders/one_hot.py | 1 - 
category_encoders/tests/test_one_hot.py | 57 +++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index c4821719..f7fb9172 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -162,7 +162,6 @@ def generate_mapping(self): col = switch.get('col') values = switch.get('mapping').copy(deep=True) - # TODO test with nan in dataset if self.handle_missing == 'value': values = values[values > 0] diff --git a/category_encoders/tests/test_one_hot.py b/category_encoders/tests/test_one_hot.py index fec268bd..a6dba24a 100644 --- a/category_encoders/tests/test_one_hot.py +++ b/category_encoders/tests/test_one_hot.py @@ -141,3 +141,60 @@ def test_fit_transform_HaveHandleUnknownIndicatorAndMissingValue_ExpectValueSet( result = enc.fit(train).transform(test) pd.testing.assert_frame_equal(expected_result, result) + + def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + + encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [[1, 0, 0], + [0, 1, 0], + [0, 0, 1]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [[1, 0, 0], + [0, 1, 0]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.OneHotEncoder(handle_missing='indicator', handle_unknown='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, 0, 0], + [0, 1, 0], + [0, 0, 1]] + self.assertEqual(result.values.tolist(), expected) + + def 
test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.OneHotEncoder(handle_unknown='indicator', handle_missing='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [[1, 0, 0], + [0, 1, 0], + [0, 0, 1]] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.OneHotEncoder(handle_unknown='indicator', handle_missing='value') + result = encoder.fit_transform(train) + + expected = [[1, 0, 0], + [0, 1, 0]] + self.assertEqual(result.values.tolist(), expected) From 4e1bb8d51e8b01c22cf52a0a5a8d7466083dd2da Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 20:25:17 -0700 Subject: [PATCH 19/40] Make polynomial encoder handle unknown and missing --- category_encoders/polynomial.py | 38 +++++++------- category_encoders/tests/test_polynomial.py | 59 +++++++++++++++++++++- 2 files changed, 79 insertions(+), 18 deletions(-) diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index 24ed034c..661bdaf6 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -146,7 +146,8 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: values = switch.get('mapping') - column_mapping = self.fit_polynomial_coding(values, self.handle_missing, self.handle_unknown) + col = switch.get('col') + column_mapping = self.fit_polynomial_coding(col, values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out @@ -211,26 +212,31 @@ def transform(self, X): return X.values @staticmethod - def fit_polynomial_coding(values, handle_missing, handle_unknown): + def fit_polynomial_coding(col, values, handle_missing, handle_unknown): if handle_missing == 'value': values = values[values > 0] + 
values_to_encode = values.get_values() + if len(values) < 2: - return pd.DataFrame() + return pd.DataFrame(index=values_to_encode) + + if handle_unknown == 'indicator': + values_to_encode = np.append(values_to_encode, -1) - polynomial_contrast_matrix = Poly().code_without_intercept(values.get_values()) - df = pd.DataFrame(data=polynomial_contrast_matrix.matrix, columns=polynomial_contrast_matrix.column_suffixes) - df.index += 1 + polynomial_contrast_matrix = Poly().code_without_intercept(values_to_encode) + df = pd.DataFrame(data=polynomial_contrast_matrix.matrix, index=values_to_encode, + columns=[str(col) + '_%d' % (i, ) for i in range(len(polynomial_contrast_matrix.column_suffixes))]) if handle_unknown == 'return_nan': df.loc[-1] = np.nan elif handle_unknown == 'value': - df.loc[-1] = np.zeros(len(values) - 1) + df.loc[-1] = np.zeros(len(values_to_encode) - 1) if handle_missing == 'return_nan': df.loc[values.loc[np.nan]] = np.nan elif handle_missing == 'value': - df.loc[-2] = np.zeros(len(values) - 1) + df.loc[-2] = np.zeros(len(values_to_encode) - 1) return df @@ -248,16 +254,14 @@ def polynomial_coding(X_in, mapping): for switch in mapping: col = switch.get('col') mod = switch.get('mapping') - new_columns = [] - for i in range(len(mod.columns)): - c = mod.columns[i] - new_col = str(col) + '_%d' % (i, ) - X.loc[:, new_col] = mod[c].loc[X[col]].values - new_columns.append(new_col) + + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) + old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns cols = ['intercept'] + cols - X = X.reindex(columns=cols) - return X + return X.reindex(columns=cols) diff --git a/category_encoders/tests/test_polynomial.py b/category_encoders/tests/test_polynomial.py index 426c9dad..2dac0d44 100644 --- a/category_encoders/tests/test_polynomial.py +++ 
b/category_encoders/tests/test_polynomial.py @@ -1,6 +1,6 @@ import pandas as pd from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ - +import numpy as np import category_encoders as encoders from category_encoders.tests.test_utils import deep_round @@ -92,3 +92,60 @@ def test_polynomial_encoder_2StringCols_ExpectCorrectOrder(self): columns = encoder.transform(train).columns.values self.assertItemsEqual(expected_columns, columns) + + def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + + encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='valie') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.PolynomialEncoder(handle_unknown='indicator') + encoder.fit(train) + result = encoder.transform(test) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = 
encoders.PolynomialEncoder(handle_unknown='indicator') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding] + self.assertEqual(result.values.tolist(), expected) From ea2d40dce8f0295e9c66d4f5ff34539b36b92ba8 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 20:55:47 -0700 Subject: [PATCH 20/40] Make sum encoding handle missing and unknown --- category_encoders/sum_coding.py | 38 ++++++++------ category_encoders/tests/test_sum_coding.py | 60 +++++++++++++++++++++- 2 files changed, 80 insertions(+), 18 deletions(-) diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index bf2c9bd2..e9ccfe5d 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -146,7 +146,8 @@ def fit(self, X, y=None, **kwargs): mappings_out = [] for switch in ordinal_mapping: values = switch.get('mapping') - column_mapping = self.fit_sum_coding(values, self.handle_missing, self.handle_unknown) + col = switch.get('col') + column_mapping = self.fit_sum_coding(col, values, self.handle_missing, self.handle_unknown) mappings_out.append({'col': switch.get('col'), 'mapping': column_mapping, }) self.mapping = mappings_out @@ -211,26 +212,31 @@ def transform(self, X): return X.values @staticmethod - def fit_sum_coding(values, handle_missing, handle_unknown): + def fit_sum_coding(col, values, handle_missing, handle_unknown): if handle_missing == 'value': values = values[values > 0] + values_to_encode = values.get_values() + if len(values) < 2: - return pd.DataFrame() + return pd.DataFrame(index=values_to_encode) + + if handle_unknown == 'indicator': + values_to_encode = np.append(values_to_encode, -1) - sum_contrast_matrix = Sum().code_without_intercept(values.tolist()) - df = pd.DataFrame(data=sum_contrast_matrix.matrix, columns=sum_contrast_matrix.column_suffixes) - df.index += 1 + sum_contrast_matrix = Sum().code_without_intercept(values_to_encode.tolist()) + df = 
pd.DataFrame(data=sum_contrast_matrix.matrix, index=values_to_encode, + columns=[str(col) + '_%d' % (i, ) for i in range(len(sum_contrast_matrix.column_suffixes))]) if handle_unknown == 'return_nan': df.loc[-1] = np.nan elif handle_unknown == 'value': - df.loc[-1] = np.zeros(len(values) - 1) + df.loc[-1] = np.zeros(len(values_to_encode) - 1) if handle_missing == 'return_nan': df.loc[values.loc[np.nan]] = np.nan elif handle_missing == 'value': - df.loc[-2] = np.zeros(len(values) - 1) + df.loc[-2] = np.zeros(len(values_to_encode) - 1) return df @@ -248,16 +254,14 @@ def sum_coding(X_in, mapping): for switch in mapping: col = switch.get('col') mod = switch.get('mapping') - new_columns = [] - for i in range(len(mod.columns)): - c = mod.columns[i] - new_col = str(col) + '_%d' % (i, ) - X.loc[:, new_col] = mod[c].loc[X[col]].values - new_columns.append(new_col) + + base_df = mod.loc[X[col]] + base_df.set_index(X.index, inplace=True) + X = pd.concat([base_df, X], axis=1) + old_column_index = cols.index(col) - cols[old_column_index: old_column_index + 1] = new_columns + cols[old_column_index: old_column_index + 1] = mod.columns cols = ['intercept'] + cols - X = X.reindex(columns=cols) - return X + return X.reindex(columns=cols) diff --git a/category_encoders/tests/test_sum_coding.py b/category_encoders/tests/test_sum_coding.py index e52387b0..900e5956 100644 --- a/category_encoders/tests/test_sum_coding.py +++ b/category_encoders/tests/test_sum_coding.py @@ -1,6 +1,6 @@ import pandas as pd from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ - +import numpy as np import category_encoders as encoders a_encoding = [1, 1, 0] @@ -90,3 +90,61 @@ def test_sum_encoder_2StringCols_ExpectCorrectOrder(self): columns = encoder.transform(train).columns.values self.assertItemsEqual(expected_columns, columns) + + def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): + train = ['A', 'B', np.nan] + + encoder = 
encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): + train = ['A', 'B'] + test = ['A', 'B', np.nan] + + encoder = encoders.SumEncoder(handle_missing='indicator', handle_unknown='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): + train = ['A', 'B'] + test = ['A', 'B', 'C'] + + encoder = encoders.SumEncoder(handle_unknown='indicator', handle_missing='value') + encoder.fit(train) + result = encoder.transform(test) + + expected = [a_encoding, + b_encoding, + c_encoding] + self.assertEqual(result.values.tolist(), expected) + + def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): + train = ['A', 'B'] + + encoder = encoders.SumEncoder(handle_unknown='indicator', handle_missing='value') + result = encoder.fit_transform(train) + + expected = [a_encoding, + b_encoding] + self.assertEqual(result.values.tolist(), expected) + From c00664b03d1195ab5804cf2c52533d102d04204f Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 21:27:29 -0700 Subject: [PATCH 21/40] Add tests to check handle missing and handle unknown for woe --- category_encoders/tests/test_woe.py | 44 +++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/category_encoders/tests/test_woe.py 
b/category_encoders/tests/test_woe.py index b25b687e..5873c0ad 100644 --- a/category_encoders/tests/test_woe.py +++ b/category_encoders/tests/test_woe.py @@ -108,3 +108,47 @@ def test_woe(self): self.assertTrue(X1.isnull().values.any()) self.assertEqual(len(list(X_t)), len(list(X2)), 'The count of attributes must not change') self.assertEqual(len(X_t), len(X2), 'The count of rows must not change') + + def test_HaveArrays_ExpectCalculatedProperly(self): + X = ['a', 'a', 'b', 'b'] + y = [1, 0, 0, 0] + enc = encoders.WOEEncoder() + + result = enc.fit_transform(X, y) + + expected = pd.Series([0.5108256237659906, .5108256237659906, -0.587786664902119, -0.587786664902119], name=0) + pd.testing.assert_series_equal(expected, result[0]) + + def test_HandleMissingValue_HaveMissingInTrain_ExpectEncoded(self): + X = ['a', 'a', np.nan, np.nan] + y = [1, 0, 0, 0] + enc = encoders.WOEEncoder(handle_missing='value') + + result = enc.fit_transform(X, y) + + expected = pd.Series([0.5108256237659906, .5108256237659906, -0.587786664902119, -0.587786664902119], name=0) + pd.testing.assert_series_equal(expected, result[0]) + + def test_HandleMissingValue_HaveMissingInTest_ExpectEncodedWithZero(self): + X = ['a', 'a', 'b', 'b'] + y = [1, 0, 0, 0] + test = ['a', np.nan] + enc = encoders.WOEEncoder(handle_missing='value') + + enc.fit(X, y) + result = enc.transform(test) + + expected = pd.Series([0.5108256237659906, 0], name=0) + pd.testing.assert_series_equal(expected, result[0]) + + def test_HandleUnknownValue_HaveUnknown_ExpectEncodedWithZero(self): + X = ['a', 'a', 'b', 'b'] + y = [1, 0, 0, 0] + test = ['a', 'c'] + enc = encoders.WOEEncoder(handle_unknown='value') + + enc.fit(X, y) + result = enc.transform(test) + + expected = pd.Series([0.5108256237659906, 0], name=0) + pd.testing.assert_series_equal(expected, result[0]) From e5ab10dc5a9dff31294909314cf224bef161e7c0 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 21:51:37 -0700 Subject: [PATCH 22/40] Add override 
return df --- category_encoders/target_encoder.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index 5feea609..cfffc541 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -180,7 +180,7 @@ def fit_target_encoding(self, X, y): return mapping - def transform(self, X, y=None): + def transform(self, X, y=None, override_return_df=False): """Perform the transformation to new categorical data. Parameters ---------- @@ -232,7 +232,7 @@ def transform(self, X, y=None): for col in self.drop_cols: X.drop(col, 1, inplace=True) - if self.return_df: + if self.return_df or override_return_df: return X else: return X.values From 3390a6222eea1181b98c8b251326492260b17710 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 22:34:03 -0700 Subject: [PATCH 23/40] Fix problems from detached head --- category_encoders/target_encoder.py | 6 +++--- category_encoders/tests/test_one_hot.py | 3 ++- category_encoders/woe.py | 6 +++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index cfffc541..9c8e3083 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -131,8 +131,8 @@ def fit(self, X, y, **kwargs): handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) - X = self.ordinal_encoder.transform(X) - self.mapping = self.fit_target_encoding(X, y) + X_ordinal = self.ordinal_encoder.transform(X) + self.mapping = self.fit_target_encoding(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = list(X_temp.columns) @@ -223,7 +223,7 @@ def transform(self, X, y=None, override_return_df=False): X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': - if X[self.cols].isnull().any(): + if X[self.cols].isin([-1]).any().any(): raise ValueError('Unexpected 
categories found in dataframe') X = self.target_encode(X) diff --git a/category_encoders/tests/test_one_hot.py b/category_encoders/tests/test_one_hot.py index 43f83805..3b031c1b 100644 --- a/category_encoders/tests/test_one_hot.py +++ b/category_encoders/tests/test_one_hot.py @@ -39,8 +39,9 @@ def test_one_hot(self): enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='error') # The exception is already raised in fit() because transform() is called there to get # feature_names right. + enc.fit(X) with self.assertRaises(ValueError): - enc.fit(X_t) + enc.transform(X_t) enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='ignore', use_cat_names=True) enc.fit(X) diff --git a/category_encoders/woe.py b/category_encoders/woe.py index a65f158c..1224cd5e 100644 --- a/category_encoders/woe.py +++ b/category_encoders/woe.py @@ -154,10 +154,10 @@ def fit(self, X, y, **kwargs): handle_missing='value' ) self.ordinal_encoder = self.ordinal_encoder.fit(X) - X = self.ordinal_encoder.transform(X) + X_ordinal = self.ordinal_encoder.transform(X) # Training - self.mapping = self._train(X, y) + self.mapping = self._train(X_ordinal, y) X_temp = self.transform(X, override_return_df=True) self.feature_names = X_temp.columns.tolist() @@ -228,7 +228,7 @@ def transform(self, X, y=None, override_return_df=False): X = self.ordinal_encoder.transform(X) if self.handle_unknown == 'error': - if X[self.cols].isnull().any(): + if X[self.cols].isin([-1]).any().any(): raise ValueError('Unexpected categories found in dataframe') # Loop over columns and replace nominal values with WOE From 3b1517e09c60dd0268c534328b9b3a6e00ca14b0 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 23:03:19 -0700 Subject: [PATCH 24/40] See if leave one out tests are fixed --- category_encoders/leave_one_out.py | 2 +- category_encoders/tests/test_target_encoder.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index 89cfc736..09d7ff91 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -257,7 +257,7 @@ def transform_leave_one_out(self, X_in, y, mapping=None): unseen_values = pd.Series([x for x in X[col].unique() if x not in unique_train]) is_nan = X[col].isnull() - is_unknown_value = X[col].isin(unseen_values.dropna()) + is_unknown_value = X[col].isin(unseen_values.dropna().values) if self.handle_unknown == 'error' and is_unknown_value.any(): raise ValueError('Columns to be encoded can not contain new values') diff --git a/category_encoders/tests/test_target_encoder.py b/category_encoders/tests/test_target_encoder.py index 057a0381..c6fc5b2e 100644 --- a/category_encoders/tests/test_target_encoder.py +++ b/category_encoders/tests/test_target_encoder.py @@ -98,9 +98,9 @@ def test_HandleMissingIsValueAndNanInTest_ExpectMean(self): test = pd.Series([np.nan, 'b'], name='color') test_target = pd.Series([0, 0]) - ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_missing='value') - ce_leave.fit(train, target['outcome']) - obtained = ce_leave.transform(test, test_target) + enc = encoders.TargetEncoder(cols=['color'], handle_missing='value') + enc.fit(train, target['outcome']) + obtained = enc.transform(test, test_target) self.assertEqual(.6, list(obtained['color'])[0]) @@ -110,8 +110,8 @@ def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): test = pd.Series(['c', 'b'], name='color') test_target = pd.Series([0, 0]) - ce_leave = encoders.LeaveOneOutEncoder(cols=['color'], handle_unknown='value') - ce_leave.fit(train, target) - obtained = ce_leave.transform(test, test_target) + enc = encoders.TargetEncoder(cols=['color'], handle_unknown='value') + enc.fit(train, target) + obtained = enc.transform(test, test_target) self.assertEqual(.6, list(obtained['color'])[0]) From 90fadd7f6269bbde2f7b712137f97820a49c6b00 Mon Sep 17 00:00:00 2001 From: 
jcastaldo08 Date: Fri, 21 Dec 2018 23:16:47 -0700 Subject: [PATCH 25/40] Update pandas version for issue in regards to isin for empty series --- category_encoders/leave_one_out.py | 2 +- requirements.txt | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index 09d7ff91..89cfc736 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -257,7 +257,7 @@ def transform_leave_one_out(self, X_in, y, mapping=None): unseen_values = pd.Series([x for x in X[col].unique() if x not in unique_train]) is_nan = X[col].isnull() - is_unknown_value = X[col].isin(unseen_values.dropna().values) + is_unknown_value = X[col].isin(unseen_values.dropna()) if self.handle_unknown == 'error' and is_unknown_value.any(): raise ValueError('Columns to be encoded can not contain new values') diff --git a/requirements.txt b/requirements.txt index 28f4c0c1..89e38a18 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,5 @@ numpy>=1.11.1 scikit-learn>=0.17.1 scipy>=0.17.0 statsmodels>=0.6.1 -pandas>=0.20.1 +pandas>=0.21.1 patsy>=0.4.1 \ No newline at end of file diff --git a/setup.py b/setup.py index 960b707b..b32cbd29 100644 --- a/setup.py +++ b/setup.py @@ -33,7 +33,7 @@ 'scikit-learn>=0.17.1', 'scipy>=0.17.0', 'statsmodels>=0.6.1', - 'pandas>=0.20.1', + 'pandas>=0.21.1', 'patsy>=0.4.1', ], author_email='will@pedalwrencher.com' From eb1bfe7586d6b317ff2252f562f6fdfdc094e59b Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 23:25:39 -0700 Subject: [PATCH 26/40] In travis yaml use pandas version the package uses --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 86a71a65..3694c279 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,10 +15,10 @@ env: matrix: # The versions should match the minimal requirements in requirements.txt and setup.py - DISTRIB="conda" 
PYTHON_VERSION="2.7" CYTHON_VERSION="0.21" - NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.20.1" PATSY_VERSION="0.4.1" + NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.21.1" PATSY_VERSION="0.4.1" SCIKIT_VERSION="0.17.1" SCIPY_VERSION="0.17.0" STATSMODELS_VERSION="0.6.1" - DISTRIB="conda" PYTHON_VERSION="3.5" COVERAGE="true" CYTHON_VERSION="0.23.4" - NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.20.1" PATSY_VERSION="0.4.1" + NUMPY_VERSION="1.11.1" PANDAS_VERSION="0.21.1" PATSY_VERSION="0.4.1" SCIKIT_VERSION="0.17.1" SCIPY_VERSION="0.17.0" STATSMODELS_VERSION="0.6.1" install: source ci_scripts/install.sh From 30a6f622555f9defb100f24b3dc4a3fa219ea2d0 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 23:34:09 -0700 Subject: [PATCH 27/40] Use loc to update mapping dataframe to eliminate on copy warning --- category_encoders/leave_one_out.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index 89cfc736..da7e0ed6 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -273,14 +273,14 @@ def transform_leave_one_out(self, X_in, y, mapping=None): X[col] = level_means.where(X[col].map(colmap['count'][level_notunique]).notnull(), self._mean) if self.handle_unknown == 'value': - X[col][is_unknown_value] = self._mean + X.loc[is_unknown_value, col] = self._mean elif self.handle_unknown == 'return_nan': - X[col][is_unknown_value] = np.nan + X.loc[is_unknown_value, col] = np.nan if self.handle_missing == 'value': - X[col][is_nan & unseen_values.isnull().any()] = self._mean + X.loc[is_nan & unseen_values.isnull().any(), col] = self._mean elif self.handle_missing == 'return_nan': - X[col][is_nan] = np.nan + X.loc[is_nan, col] = np.nan if self.sigma is not None and y is not None: X[col] = X[col] * random_state_.normal(1., self.sigma, X[col].shape[0]) From de1af1b5069a14405693f69bb0e520b08917803b Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 
Dec 2018 23:34:21 -0700 Subject: [PATCH 28/40] Fixed doctests --- category_encoders/binary.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 7e949cea..049c0f66 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -45,20 +45,20 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): RangeIndex: 506 entries, 0 to 505 Data columns (total 18 columns): - CHAS_0 506 non-null int64 - CHAS_1 506 non-null int64 - RAD_0 506 non-null int64 - RAD_1 506 non-null int64 - RAD_2 506 non-null int64 - RAD_3 506 non-null int64 - RAD_4 506 non-null int64 CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 + CHAS_0 506 non-null int64 + CHAS_1 506 non-null int64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 + RAD_0 506 non-null int64 + RAD_1 506 non-null int64 + RAD_2 506 non-null int64 + RAD_3 506 non-null int64 + RAD_4 506 non-null int64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 From a237af454e03571119b04bbb67c723530b9e8e19 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 23:36:23 -0700 Subject: [PATCH 29/40] Fix helmert doc tests --- category_encoders/helmert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index 2a91d21b..0ed8891c 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -42,7 +42,7 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): >>> bunch = load_boston() >>> y = bunch.target >>> X = pd.DataFrame(bunch.data, columns=bunch.feature_names) - >>> enc = HelmertEncoder(cols=['CHAS', 'RAD']).fit(X, y) + >>> enc = HelmertEncoder(cols=['CHAS', 'RAD'], handle_unknown='value', handle_missing='value').fit(X, y) >>> numeric_dataset = enc.transform(X) >>> print(numeric_dataset.info()) From 
e2af58779ef79454fcb5d93bfe074663398e1b3c Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 23:38:43 -0700 Subject: [PATCH 30/40] Fix base n doc test --- category_encoders/basen.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/category_encoders/basen.py b/category_encoders/basen.py index 5c381ba9..4433dadd 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -48,20 +48,20 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): RangeIndex: 506 entries, 0 to 505 Data columns (total 18 columns): - CHAS_0 506 non-null int64 - CHAS_1 506 non-null int64 - RAD_0 506 non-null int64 - RAD_1 506 non-null int64 - RAD_2 506 non-null int64 - RAD_3 506 non-null int64 - RAD_4 506 non-null int64 CRIM 506 non-null float64 ZN 506 non-null float64 INDUS 506 non-null float64 + CHAS_0 506 non-null int64 + CHAS_1 506 non-null int64 NOX 506 non-null float64 RM 506 non-null float64 AGE 506 non-null float64 DIS 506 non-null float64 + RAD_0 506 non-null int64 + RAD_1 506 non-null int64 + RAD_2 506 non-null int64 + RAD_3 506 non-null int64 + RAD_4 506 non-null int64 TAX 506 non-null float64 PTRATIO 506 non-null float64 B 506 non-null float64 From f7f415592efe2c2043f2b44bb4035815e2b36773 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 23:43:26 -0700 Subject: [PATCH 31/40] Specify column ordering for refit test so we have consistent behavior between python and pandas versions --- category_encoders/tests/test_leave_one_out.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/category_encoders/tests/test_leave_one_out.py b/category_encoders/tests/test_leave_one_out.py index 04d8aa2f..5c43824c 100644 --- a/category_encoders/tests/test_leave_one_out.py +++ b/category_encoders/tests/test_leave_one_out.py @@ -47,7 +47,7 @@ def test_leave_one_out_fit_callTwiceOnDifferentData_ExpectRefit(self): mapping = encoder.mapping self.assertEqual(1, len(mapping)) self.assertIn('col_b', mapping) # the model 
should have the updated mapping - expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2']) + expected = pd.DataFrame({'sum': [2.0, 1.0], 'count': [3, 3]}, index=['1', '2'], columns=['sum', 'count']) np.testing.assert_equal(expected.values, mapping['col_b'].values) def test_leave_one_out_unique(self): From 0ace39b69bb255b09cc9587ddce1bc1fc744a3ba Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Fri, 21 Dec 2018 23:51:19 -0700 Subject: [PATCH 32/40] Use deep round on polynomial tests to satisfy python2 --- category_encoders/tests/test_polynomial.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/category_encoders/tests/test_polynomial.py b/category_encoders/tests/test_polynomial.py index 2dac0d44..51dd3058 100644 --- a/category_encoders/tests/test_polynomial.py +++ b/category_encoders/tests/test_polynomial.py @@ -102,7 +102,7 @@ def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): expected = [a_encoding, b_encoding, c_encoding] - self.assertEqual(result.values.tolist(), expected) + self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): train = ['A', 'B'] @@ -112,7 +112,7 @@ def test_HandleMissingIndicator_HaveNoNan_ExpectSecondColumn(self): expected = [a_encoding, b_encoding] - self.assertEqual(result.values.tolist(), expected) + self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): train = ['A', 'B'] @@ -125,7 +125,7 @@ def test_HandleMissingIndicator_NanNoNanInTrain_ExpectAsNanColumn(self): expected = [a_encoding, b_encoding, c_encoding] - self.assertEqual(result.values.tolist(), expected) + self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) def test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): train = ['A', 'B'] @@ -138,7 +138,7 @@ def 
test_HandleUnknown_HaveNoUnknownInTrain_ExpectIndicatorInTest(self): expected = [a_encoding, b_encoding, c_encoding] - self.assertEqual(result.values.tolist(), expected) + self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): train = ['A', 'B'] @@ -148,4 +148,4 @@ def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): expected = [a_encoding, b_encoding] - self.assertEqual(result.values.tolist(), expected) + self.assertEqual(deep_round(result.values.tolist()), deep_round(expected)) From a3214d008418839a42f9e06d2241472907ec80fe Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sat, 22 Dec 2018 00:07:19 -0700 Subject: [PATCH 33/40] Use reindex on one hot to remove warning message --- category_encoders/one_hot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index 0a04c9d7..5d26a838 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -340,7 +340,7 @@ def get_dummies(self, X_in, mapping): col = switch.get('col') mod = switch.get('mapping') - base_df = mod.loc[X[col]] + base_df = mod.reindex(X[col]) base_df = base_df.set_index(X.index) X = pd.concat([base_df, X], axis=1) From aea80fac1d880aa4116ffaf6123572f868a5f4ae Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sat, 22 Dec 2018 00:07:57 -0700 Subject: [PATCH 34/40] Make leave one out test a decimal for python2 --- category_encoders/tests/test_leave_one_out.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/category_encoders/tests/test_leave_one_out.py b/category_encoders/tests/test_leave_one_out.py index 5c43824c..4eb254fb 100644 --- a/category_encoders/tests/test_leave_one_out.py +++ b/category_encoders/tests/test_leave_one_out.py @@ -103,7 +103,7 @@ def test_HandleMissingIsValueAndNanInTestAndNoTestTarget_ExpectMean(self): ce_leave.fit(train, target['outcome']) obtained = ce_leave.transform(test) - 
self.assertEqual([.5, 2/3], list(obtained['color'])) + self.assertEqual([.5, 2/3.0], list(obtained['color'])) def test_HandleUnknownValue_HaveUnknownInTest_ExpectMean(self): train = pd.Series(["a", "a", "a", "b", "b", "b"], name='color') From 08d96d3ff7349f43dac2baa8b6ea278f324ce05e Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sat, 22 Dec 2018 13:26:33 -0700 Subject: [PATCH 35/40] Fix typos --- category_encoders/backward_difference.py | 4 ++-- category_encoders/basen.py | 2 +- category_encoders/binary.py | 4 ++-- category_encoders/helmert.py | 4 ++-- category_encoders/ordinal.py | 2 +- category_encoders/polynomial.py | 2 +- category_encoders/sum_coding.py | 2 +- category_encoders/target_encoder.py | 2 +- category_encoders/tests/test_polynomial.py | 2 +- 9 files changed, 12 insertions(+), 12 deletions(-) diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index f04378ba..63090b46 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -26,11 +26,11 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin): boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, - an extra column will be added in if the transform matrix has unknown categories. This can causes + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can causes + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. 
Example diff --git a/category_encoders/basen.py b/category_encoders/basen.py index 4433dadd..64cc60de 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -31,7 +31,7 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): when the downstream model copes well with nonlinearities (like decision tree), use higher base. handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, - an extra column will be added in if the transform matrix has unknown categories. This can causes + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. Example diff --git a/category_encoders/binary.py b/category_encoders/binary.py index 049c0f66..dc99a54e 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -24,11 +24,11 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, - an extra column will be added in if the transform matrix has unknown categories. This can causes + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can causes + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. 
Example diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index 0ed8891c..712f1f46 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -27,11 +27,11 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can causes + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can causes + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. Example diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index 1ae1a1ef..ccefda6a 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -267,7 +267,7 @@ def inverse_transform(self, X_in): return X if self.return_df else X.values @staticmethod - def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', handle_missing='valie'): + def ordinal_encoding(X_in, mapping=None, cols=None, handle_unknown='value', handle_missing='value'): """ Ordinal encoding uses a single column of integers to represent the classes. An optional mapping dict can be passed in, in this case we use the knowledge that there is some true order to the classes themselves. 
Otherwise, the classes diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index c143e5c5..b38ba8b2 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -30,7 +30,7 @@ class PolynomialEncoder(BaseEstimator, TransformerMixin): unexpected changes in the dimension in some cases. handle_missing: str options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can causes + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. Example diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index 46361d48..db74e374 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -30,7 +30,7 @@ class SumEncoder(BaseEstimator, TransformerMixin): unexpected changes in the dimension in some cases. handle_missing: str options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, - an extra column will be added in if the transform matrix has unknown categories. This can causes + an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. Example diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index 9c8e3083..1a5bea5a 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -29,7 +29,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, h return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'valie', which will impute the target mean. 
+ options are 'error', 'ignore' and 'value', defaults to 'value', which will impute the target mean. min_samples_leaf: int minimum samples to take category average into account. smoothing: float diff --git a/category_encoders/tests/test_polynomial.py b/category_encoders/tests/test_polynomial.py index 51dd3058..8a1af859 100644 --- a/category_encoders/tests/test_polynomial.py +++ b/category_encoders/tests/test_polynomial.py @@ -96,7 +96,7 @@ def test_polynomial_encoder_2StringCols_ExpectCorrectOrder(self): def test_HandleMissingIndicator_NanInTrain_ExpectAsColumn(self): train = ['A', 'B', np.nan] - encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='valie') + encoder = encoders.PolynomialEncoder(handle_missing='indicator', handle_unknown='value') result = encoder.fit_transform(train) expected = [a_encoding, From 58524d148431b20cfd765f3bacb8aca839950ec2 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sat, 22 Dec 2018 15:08:07 -0700 Subject: [PATCH 36/40] replace 'ignore' with 'return_nan' in docs --- category_encoders/backward_difference.py | 4 ++-- category_encoders/basen.py | 8 +++++++- category_encoders/binary.py | 4 ++-- category_encoders/helmert.py | 4 ++-- category_encoders/leave_one_out.py | 2 +- category_encoders/one_hot.py | 14 +++++++++++--- category_encoders/polynomial.py | 4 ++-- category_encoders/sum_coding.py | 4 ++-- category_encoders/target_encoder.py | 2 +- category_encoders/tests/test_one_hot.py | 4 ++-- category_encoders/tests/test_ordinal.py | 2 +- category_encoders/woe.py | 2 +- 12 files changed, 34 insertions(+), 20 deletions(-) diff --git a/category_encoders/backward_difference.py b/category_encoders/backward_difference.py index 63090b46..2705ef9d 100644 --- a/category_encoders/backward_difference.py +++ b/category_encoders/backward_difference.py @@ -25,11 +25,11 @@ class BackwardDifferenceEncoder(BaseEstimator, TransformerMixin): return_df: bool boolean for whether to return a pandas DataFrame from transform 
(otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. diff --git a/category_encoders/basen.py b/category_encoders/basen.py index 64cc60de..e4840c24 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -30,7 +30,7 @@ class BaseNEncoder(BaseEstimator, TransformerMixin): base: int when the downstream model copes well with nonlinearities (like decision tree), use higher base. handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. 
@@ -282,6 +282,12 @@ def inverse_transform(self, X_in): raise ValueError("inverse_transform is not supported because transform impute " "the unknown category -1 when encode %s" % (col,)) + if self.handle_unknown == 'return_nan': + for col in self.cols: + if X[col].isnull().any(): + raise ValueError("inverse_transform is not supported because transform impute " + "the unknown category nan when encode %s" % (col,)) + for switch in self.ordinal_encoder.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) diff --git a/category_encoders/binary.py b/category_encoders/binary.py index dc99a54e..71683892 100644 --- a/category_encoders/binary.py +++ b/category_encoders/binary.py @@ -23,11 +23,11 @@ class BinaryEncoder(BaseEstimator, TransformerMixin): return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. 
diff --git a/category_encoders/helmert.py b/category_encoders/helmert.py index 712f1f46..5ec83fae 100644 --- a/category_encoders/helmert.py +++ b/category_encoders/helmert.py @@ -26,11 +26,11 @@ class HelmertEncoder(BaseEstimator, TransformerMixin): return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. handle_missing: str - options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. diff --git a/category_encoders/leave_one_out.py b/category_encoders/leave_one_out.py index da7e0ed6..202c3e96 100644 --- a/category_encoders/leave_one_out.py +++ b/category_encoders/leave_one_out.py @@ -27,7 +27,7 @@ class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'value', which will impute the target mean. + options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the target mean. sigma: float adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). 
sigma gives the standard deviation (spread or "width") of the normal distribution. diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index 5d26a838..86580860 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -24,7 +24,7 @@ class OneHotEncoder(BaseEstimator, TransformerMixin): return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. use_cat_names: bool @@ -206,6 +206,8 @@ def generate_mapping(self): if self.handle_unknown == 'value': base_df.loc[-1] = 0 + elif self.handle_unknown == 'return_nan': + base_df.loc[-1] = np.nan if self.handle_missing == 'return_nan': base_df.loc[values.loc[np.nan]] = np.nan @@ -257,7 +259,7 @@ def transform(self, X, override_return_df=False): if X[self.cols].isin([-1]).any().any(): raise ValueError('Columns to be encoded can not contain new values') - X = self.get_dummies(X, mapping=self.mapping) + X = self.get_dummies(X) if self.drop_invariant: for col in self.drop_cols: @@ -310,6 +312,12 @@ def inverse_transform(self, X_in): raise ValueError("inverse_transform is not supported because transform impute " "the unknown category -1 when encode %s"%(col,)) + if self.handle_unknown == 'return_nan': + for col in self.cols: + if X[col].isnull().any(): + raise ValueError("inverse_transform is not supported because transform impute " + "the unknown category nan when encode %s" % (col,)) + for switch in self.ordinal_encoder.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) @@ -317,7 +325,7 @@ def 
inverse_transform(self, X_in): return X if self.return_df else X.values - def get_dummies(self, X_in, mapping): + def get_dummies(self, X_in): """ Convert numerical variable into dummy variables diff --git a/category_encoders/polynomial.py b/category_encoders/polynomial.py index b38ba8b2..7b3b8316 100644 --- a/category_encoders/polynomial.py +++ b/category_encoders/polynomial.py @@ -25,11 +25,11 @@ class PolynomialEncoder(BaseEstimator, TransformerMixin): return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in the dimension in some cases. handle_missing: str - options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. diff --git a/category_encoders/sum_coding.py b/category_encoders/sum_coding.py index db74e374..d207ee67 100644 --- a/category_encoders/sum_coding.py +++ b/category_encoders/sum_coding.py @@ -25,11 +25,11 @@ class SumEncoder(BaseEstimator, TransformerMixin): return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'value'. Warning: if value is used, + options are 'error', 'return_nan' and 'value', defaults to 'value'. Warning: if value is used, an extra column will be added in if the transform matrix has unknown categories. 
This can cause unexpected changes in the dimension in some cases. handle_missing: str - options are 'error', 'ignore', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, + options are 'error', 'return_nan', 'value', and 'indicator', defaults to 'indicator'. Warning: if indicator is used, an extra column will be added in if the transform matrix has unknown categories. This can cause unexpected changes in dimension in some cases. diff --git a/category_encoders/target_encoder.py b/category_encoders/target_encoder.py index 1a5bea5a..80c812d4 100644 --- a/category_encoders/target_encoder.py +++ b/category_encoders/target_encoder.py @@ -29,7 +29,7 @@ def __init__(self, verbose=0, cols=None, drop_invariant=False, return_df=True, h return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). handle_unknown: str - options are 'error', 'ignore' and 'value', defaults to 'value', which will impute the target mean. + options are 'error', 'return_nan' and 'value', defaults to 'value', which will impute the target mean. min_samples_leaf: int minimum samples to take category average into account. 
smoothing: float diff --git a/category_encoders/tests/test_one_hot.py b/category_encoders/tests/test_one_hot.py index 3b031c1b..1a5288e2 100644 --- a/category_encoders/tests/test_one_hot.py +++ b/category_encoders/tests/test_one_hot.py @@ -31,7 +31,7 @@ def test_one_hot(self): out = enc.transform(X_t) self.assertIn('extra_-1', out.columns.values) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='ignore') + enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='return_nan') enc.fit(X) out = enc.transform(X_t) self.assertEqual(len([x for x in out.columns.values if str(x).startswith('extra_')]), 3) @@ -43,7 +43,7 @@ def test_one_hot(self): with self.assertRaises(ValueError): enc.transform(X_t) - enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='ignore', use_cat_names=True) + enc = encoders.OneHotEncoder(verbose=1, return_df=True, handle_unknown='return_nan', use_cat_names=True) enc.fit(X) out = enc.transform(X_t) self.assertIn('extra_A', out.columns.values) diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index 8c7b0ba0..a34efb3c 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -35,7 +35,7 @@ def test_ordinal(self): self.assertIn(-1, set(out['extra'].values)) self.assertTrue(len(enc.mapping) > 0) - enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='ignore') + enc = encoders.OrdinalEncoder(verbose=1, return_df=True, handle_unknown='return_nan') enc.fit(X) out = enc.transform(X_t) out_cats = [x for x in set(out['extra'].values) if np.isfinite(x)] diff --git a/category_encoders/woe.py b/category_encoders/woe.py index 1224cd5e..4614a21f 100644 --- a/category_encoders/woe.py +++ b/category_encoders/woe.py @@ -23,7 +23,7 @@ class WOEEncoder(BaseEstimator, TransformerMixin): return_df: bool boolean for whether to return a pandas DataFrame from transform (otherwise it will be a numpy array). 
handle_unknown: str - options are 'ignore', 'error' and 'value', defaults to 'value', which will assume WOE=0. + options are 'return_nan', 'error' and 'value', defaults to 'value', which will assume WOE=0. randomized: bool, adds normal (Gaussian) distribution noise into training data in order to decrease overfitting (testing data are untouched). sigma: float From ccfb4d51cf257192faf6d2ec37ef211ab107ed66 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sat, 29 Dec 2018 12:01:07 -0600 Subject: [PATCH 37/40] Convert ordinal encoder inverse transform to do best attempts at inverse transforms and issue warnings when unknowns exist or the bijection is broken --- category_encoders/ordinal.py | 11 ++-- category_encoders/tests/test_encoders.py | 14 ----- category_encoders/tests/test_ordinal.py | 73 +++++++++++++++++++++++- 3 files changed, 78 insertions(+), 20 deletions(-) diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index ccefda6a..3092eb6f 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -4,6 +4,7 @@ import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin import category_encoders.utils as util +import warnings __author__ = 'willmcginnis' @@ -250,14 +251,14 @@ def inverse_transform(self, X_in): if self.handle_unknown == 'value': for col in self.cols: if any(X[col] == -1): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode %s" % (col,)) + warnings.warn("inverse_transform is not supported because transform impute " + "the unknown category -1 when encode %s" % (col,)) - if self.handle_unknown == 'return_nan': + if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': for col in self.cols: if X[col].isnull().any(): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category nan when encode %s" % (col,)) + warnings.warn("inverse_transform is not supported because 
transform impute " + "the unknown category nan when encode %s" % (col,)) for switch in self.mapping: column_mapping = switch.get('mapping') diff --git a/category_encoders/tests/test_encoders.py b/category_encoders/tests/test_encoders.py index 746dcca6..8b61b25c 100644 --- a/category_encoders/tests/test_encoders.py +++ b/category_encoders/tests/test_encoders.py @@ -203,20 +203,6 @@ def test_handle_unknown_value(self): result = enc.transform(test) self.assertFalse(result.iloc[1, :].isnull().all()) - def test_inverse_transform_handle_unknown_return_nan_expect_value_error(self): - train = pd.DataFrame({'city': ['chicago', 'los angeles']}) - test = pd.DataFrame({'city': ['chicago', 'denver']}) - y = pd.Series([1, 0]) - - # TODO - implement for all encoders supporting inverse transform - for encoder_name in ['OrdinalEncoder']: - with self.subTest(encoder_name=encoder_name): - enc = getattr(encoders, encoder_name)(handle_unknown='return_nan') - enc.fit(train, y) - result = enc.transform(test) - with self.assertRaises(ValueError): - _ = enc.inverse_transform(result) - def test_sklearn_compliance(self): for encoder_name in encoders.__all__: with self.subTest(encoder_name=encoder_name): diff --git a/category_encoders/tests/test_ordinal.py b/category_encoders/tests/test_ordinal.py index a34efb3c..eba8d5d6 100644 --- a/category_encoders/tests/test_ordinal.py +++ b/category_encoders/tests/test_ordinal.py @@ -2,7 +2,7 @@ from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ import category_encoders.tests.test_utils as tu import numpy as np - +import warnings import category_encoders as encoders @@ -128,3 +128,74 @@ def test_HaveNoneAndNan_ExpectCodesAsOne(self): result = enc.fit_transform(train)['city'].tolist() self.assertEqual(expected, result) + + def test_inverse_transform_HaveUnknown_ExpectWarning(self): + train = pd.DataFrame({'city': ['chicago', 'st louis']}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = 
encoders.OrdinalEncoder(handle_missing='value', handle_unknown='value') + enc.fit(train) + result = enc.transform(test) + + with warnings.catch_warnings(record=True) as w: + enc.inverse_transform(result) + + self.assertEqual(1, len(w)) + self.assertEqual('inverse_transform is not supported because transform impute ' + 'the unknown category -1 when encode city', str(w[0].message)) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='return_nan', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OrdinalEncoder(handle_missing='return_nan', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + + with warnings.catch_warnings(record=True) as w: + enc.inverse_transform(result) + + self.assertEqual(1, len(w)) + self.assertEqual('inverse_transform is not supported because transform impute ' + 'the unknown category nan when encode city', str(w[0].message)) + + def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='return_nan') + 
enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_ExpectBestInverse(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) + expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) + + enc = encoders.OrdinalEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(expected, original) From dc5ac1b767f740296a046d1b0284e0bba7be6555 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Sat, 29 Dec 2018 12:10:04 -0600 Subject: [PATCH 38/40] Update ordinal encoder inverse transform documentation --- category_encoders/ordinal.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/category_encoders/ordinal.py b/category_encoders/ordinal.py index 3092eb6f..c2d2e75b 100644 --- a/category_encoders/ordinal.py +++ b/category_encoders/ordinal.py @@ -217,7 +217,9 @@ def transform(self, X, override_return_df=False): def inverse_transform(self, X_in): """ - Perform the inverse transformation to encoded data. + Perform the inverse transformation to encoded data. Will attempt best case reconstruction, which means + it will return nan for handle_missing and handle_unknown settings that break the bijection. We issue + warnings when some of those cases occur. 
Parameters ---------- From a4c916b628fdfec66253cfd725f4c983a0d54405 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Thu, 3 Jan 2019 19:41:54 -0800 Subject: [PATCH 39/40] Convert inverse transform to use warnings for cases where bijection is broken and test that we get nulls when bijection is broken --- category_encoders/basen.py | 24 ++++------ category_encoders/one_hot.py | 20 +++------ category_encoders/tests/test_basen.py | 57 ++++++++++++++++++++++++ category_encoders/tests/test_one_hot.py | 58 ++++++++++++++++++++++++- 4 files changed, 129 insertions(+), 30 deletions(-) diff --git a/category_encoders/basen.py b/category_encoders/basen.py index e4840c24..eab0403b 100644 --- a/category_encoders/basen.py +++ b/category_encoders/basen.py @@ -7,6 +7,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util +import warnings __author__ = 'willmcginnis' @@ -276,23 +277,17 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.handle_unknown == 'value': - for col in self.cols: - if any(X[col] == -1): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode %s" % (col,)) - - if self.handle_unknown == 'return_nan': - for col in self.cols: - if X[col].isnull().any(): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category nan when encode %s" % (col,)) - for switch in self.ordinal_encoder.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type')) + if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': + for col in self.cols: + if X[switch.get('col')].isnull().any(): + warnings.warn("inverse_transform is not supported because 
transform impute " + "the unknown category nan when encode %s" % (col,)) + return X if self.return_df else X.values def calc_required_digits(self, values): @@ -356,10 +351,7 @@ def basen_to_integer(self, X, cols, base): for col in cols: col_list = [col0 for col0 in out_cols if str(col0).startswith(str(col))] - for col0 in col_list: - if any(X[col0].isnull()): - raise ValueError("inverse_transform is not supported because transform impute" - "the unknown category -1 when encode %s" % (col,)) + if base == 1: value_array = np.array([int(col0.split('_')[-1]) for col0 in col_list]) else: diff --git a/category_encoders/one_hot.py b/category_encoders/one_hot.py index 86580860..bc64c5c5 100644 --- a/category_encoders/one_hot.py +++ b/category_encoders/one_hot.py @@ -1,7 +1,7 @@ """One-hot or dummy coding""" import numpy as np import pandas as pd -import copy +import warnings from sklearn.base import BaseEstimator, TransformerMixin from category_encoders.ordinal import OrdinalEncoder import category_encoders.utils as util @@ -306,23 +306,17 @@ def inverse_transform(self, X_in): if not self.cols: return X if self.return_df else X.values - if self.handle_unknown == 'value': - for col in self.cols: - if any(X[col] == -1): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category -1 when encode %s"%(col,)) - - if self.handle_unknown == 'return_nan': - for col in self.cols: - if X[col].isnull().any(): - raise ValueError("inverse_transform is not supported because transform impute " - "the unknown category nan when encode %s" % (col,)) - for switch in self.ordinal_encoder.mapping: column_mapping = switch.get('mapping') inverse = pd.Series(data=column_mapping.index, index=column_mapping.get_values()) X[switch.get('col')] = X[switch.get('col')].map(inverse).astype(switch.get('data_type')) + if self.handle_unknown == 'return_nan' and self.handle_missing == 'return_nan': + for col in self.cols: + if X[switch.get('col')].isnull().any(): 
+ warnings.warn("inverse_transform is not supported because transform impute " + "the unknown category nan when encode %s" % (col,)) + return X if self.return_df else X.values def get_dummies(self, X_in): diff --git a/category_encoders/tests/test_basen.py b/category_encoders/tests/test_basen.py index 5447897c..2d3f694d 100644 --- a/category_encoders/tests/test_basen.py +++ b/category_encoders/tests/test_basen.py @@ -2,6 +2,7 @@ from unittest2 import TestCase # or `from unittest import ...` if on Python 3.4+ import numpy as np import category_encoders as encoders +import warnings class TestBaseNEncoder(TestCase): @@ -84,3 +85,59 @@ def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): self.assertEqual(2, result.shape[0]) self.assertListEqual([0, 0, 1], result.iloc[0, :].tolist()) self.assertListEqual([0, 1, 0], result.iloc[1, :].tolist()) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.BaseNEncoder(handle_missing='value', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.BaseNEncoder(handle_missing='return_nan', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + + with warnings.catch_warnings(record=True) as w: + 
enc.inverse_transform(result) + + self.assertEqual(2, len(w)) + self.assertEqual('inverse_transform is not supported because transform impute ' + 'the unknown category nan when encode city', str(w[1].message)) + + def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.BaseNEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_ExpectBestInverse(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) + expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) + + enc = encoders.BaseNEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(expected, original) diff --git a/category_encoders/tests/test_one_hot.py b/category_encoders/tests/test_one_hot.py index 1a5288e2..e7c248b6 100644 --- a/category_encoders/tests/test_one_hot.py +++ b/category_encoders/tests/test_one_hot.py @@ -1,7 +1,7 @@ import pandas as pd from unittest import TestCase # or `from unittest import ...` if on Python 3.4+ import numpy as np - +import warnings import category_encoders.tests.test_utils as tu import category_encoders as encoders @@ -201,3 +201,59 @@ def test_HandleUnknown_HaveOnlyKnown_ExpectSecondColumn(self): expected = [[1, 0, 0], [0, 1, 0]] self.assertEqual(result.values.tolist(), expected) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingValue_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OneHotEncoder(handle_missing='value', 
handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveNanInTrainAndHandleMissingReturnNan_ExpectReturnedWithNan(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + + enc = encoders.OneHotEncoder(handle_missing='return_nan', handle_unknown='value') + result = enc.fit_transform(train) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OneHotEncoder(handle_missing='return_nan', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + + with warnings.catch_warnings(record=True) as w: + enc.inverse_transform(result) + + self.assertEqual(1, len(w)) + self.assertEqual('inverse_transform is not supported because transform impute ' + 'the unknown category nan when encode city', str(w[0].message)) + + def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', 'los angeles']}) + + enc = encoders.OneHotEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = enc.inverse_transform(result) + + pd.testing.assert_frame_equal(train, original) + + def test_inverse_transform_HaveHandleMissingValueAndHandleUnknownReturnNan_ExpectBestInverse(self): + train = pd.DataFrame({'city': ['chicago', np.nan]}) + test = pd.DataFrame({'city': ['chicago', np.nan, 'los angeles']}) + expected = pd.DataFrame({'city': ['chicago', np.nan, np.nan]}) + + enc = encoders.OneHotEncoder(handle_missing='value', handle_unknown='return_nan') + enc.fit(train) + result = enc.transform(test) + original = 
enc.inverse_transform(result) + + pd.testing.assert_frame_equal(expected, original) From 109a3d6e6d5a1352aefec515ea35ca83982af370 Mon Sep 17 00:00:00 2001 From: jcastaldo08 Date: Thu, 3 Jan 2019 19:57:56 -0800 Subject: [PATCH 40/40] Make test reflect what's in master --- category_encoders/tests/test_basen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/category_encoders/tests/test_basen.py b/category_encoders/tests/test_basen.py index 2d3f694d..abca9a10 100644 --- a/category_encoders/tests/test_basen.py +++ b/category_encoders/tests/test_basen.py @@ -115,9 +115,9 @@ def test_inverse_transform_BothFieldsAreReturnNanWithNan_ExpectValueError(self): with warnings.catch_warnings(record=True) as w: enc.inverse_transform(result) - self.assertEqual(2, len(w)) + self.assertEqual(1, len(w)) self.assertEqual('inverse_transform is not supported because transform impute ' - 'the unknown category nan when encode city', str(w[1].message)) + 'the unknown category nan when encode city', str(w[0].message)) def test_inverse_transform_HaveMissingAndNoUnknown_ExpectInversed(self): train = pd.DataFrame({'city': ['chicago', np.nan]})