From 770ad533809d757ecba2a7fab46c79dfa0ed3bd2 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 24 Jun 2019 15:01:06 -0700 Subject: [PATCH 01/12] implement replace for categorical blocks --- doc/source/whatsnew/v0.25.0.rst | 1 + pandas/core/internals/blocks.py | 19 +++++++++++++++++++ pandas/tests/series/test_replace.py | 17 +++++++++++++++++ 3 files changed, 37 insertions(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 109005364fca6..cfd234ff6aa79 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -585,6 +585,7 @@ Categorical - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) - Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in ``True`` (:issue:`26504`) +- Bug in :func:`DataFrame.replace` and :func:`Series.replace` that would give unusual results on categorical data (:issue:`26988`) - Datetimelike diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4cc6c86417b3b..5ad2ab29d8c93 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2978,6 +2978,25 @@ def where(self, other, cond, align=True, errors='raise', axis=axis, transpose=transpose) return result + def replace(self, to_replace, value, inplace=False, filter=None, + regex=False, convert=True): + if filter is None or not self.mgr_locs.isin(filter).any(): + result = self if inplace else self.copy() + categories = result.values.categories.tolist() + if to_replace in categories: + index = categories.index(to_replace) + categories[index] = value + result.values.rename_categories(categories, inplace=True) + if convert: + return result.convert(by_item=True, numeric=False, + copy=not inplace) + else: + return result + else: + self.values.add_categories(value, inplace=True) + return super().replace(to_replace, value, inplace, + filter, regex, convert) + # ----------------------------------------------------------------- # Constructor Helpers diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 92096b3c95670..6acf4827a95ba 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -292,6 +292,23 @@ def test_replace_categorical(self, categorical, numeric): expected = pd.Series(numeric) tm.assert_series_equal(expected, result, check_dtype=False) + def test_replace_categorical_single(self): + # GH 26988 + dti = pd.date_range('2016-01-01', periods=3, tz='US/Pacific') + s = pd.Series(dti) + c = s.astype('category') + + expected = c.copy() + expected.cat.add_categories('foo', inplace=True) + expected[2] = 'foo' + expected.cat.remove_unused_categories(inplace=True) + + result = c.replace(c[2], 'foo') + tm.assert_series_equal(expected, result) + + c.replace(c[2], 'foo', inplace=True) + tm.assert_series_equal(expected, c) + def test_replace_with_no_overflowerror(self): # GH 25616 # casts to object without Exception from OverflowError From a2f2a7c3fb2375e22461212ddf1d94d1bb6a5542 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Mon, 24 Jun 2019 18:30:33 -0700 Subject: [PATCH 02/12] handle when value is none --- pandas/core/internals/blocks.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5ad2ab29d8c93..ce06a52945fa6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2984,16 +2984,20 @@ def replace(self, to_replace, value, inplace=False, filter=None, result = self if inplace else self.copy() categories = result.values.categories.tolist() if to_replace in categories: - index = categories.index(to_replace) - categories[index] = value - result.values.rename_categories(categories, inplace=True) + if isna(value): + result.values.remove_categories(to_replace, inplace=True) + else: + index = categories.index(to_replace) + categories[index] = value + result.values.rename_categories(categories, inplace=True) if convert: return result.convert(by_item=True, numeric=False, copy=not inplace) else: return result else: - self.values.add_categories(value, inplace=True) + if not isna(value): + self.values.add_categories(value, inplace=True) return super().replace(to_replace, value, inplace, filter, regex, convert) From 535ab5dca777b078a3bf48d41cb65effdb5d2bd9 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 25 Jun 2019 11:06:37 -0700 Subject: [PATCH 03/12] fix inplace arg --- pandas/core/internals/blocks.py | 9 +++++---- pandas/tests/series/test_replace.py | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ce06a52945fa6..ddb30a1769559 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2980,8 +2980,8 @@ def where(self, other, cond, align=True, errors='raise', def replace(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True): + result = self if inplace else self.copy() if filter is None or not self.mgr_locs.isin(filter).any(): - result = self if inplace else self.copy() categories = result.values.categories.tolist() if to_replace in categories: if isna(value): @@ -2997,9 +2997,10 @@ def replace(self, to_replace, value, inplace=False, filter=None, return result else: if not isna(value): - self.values.add_categories(value, inplace=True) - return super().replace(to_replace, value, inplace, - filter, regex, convert) + result.values.add_categories(value, inplace=True) + return super(CategoricalBlock, result).replace(to_replace, value, + inplace, filter, + regex, convert) # ----------------------------------------------------------------- diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 6acf4827a95ba..28a6ef088f93f 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -302,9 +302,11 @@ def test_replace_categorical_single(self): expected.cat.add_categories('foo', inplace=True) expected[2] = 'foo' expected.cat.remove_unused_categories(inplace=True) + assert c[2] != 'foo' result = c.replace(c[2], 'foo') tm.assert_series_equal(expected, result) + assert c[2] != 'foo' c.replace(c[2], 'foo', inplace=True) tm.assert_series_equal(expected, c) From 9482c2bd5c880030a44c33ff0087e82b44f51a29 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 25 Jun 2019 11:11:13 -0700 Subject: [PATCH 04/12] fix whatsnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index cfd234ff6aa79..c40bc5244cb8f 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -585,7 +585,7 @@ Categorical - Bug in :func:`DataFrame.at` and :func:`Series.at` that would raise exception if the index was a :class:`CategoricalIndex` (:issue:`20629`) - Fixed bug in comparison of ordered :class:`Categorical` that contained missing values with a scalar which sometimes incorrectly resulted in ``True`` (:issue:`26504`) -- Bug in :func:`DataFrame.replace` and :func:`Series.replace` that would give unusual results on categorical data (:issue:`26988`) +- Bug in :func:`DataFrame.replace` and :func:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`) - Datetimelike From 702e71ddd6e8c560b05e17a548d5fd97bcd2ca5b Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 25 Jun 2019 11:24:17 -0700 Subject: [PATCH 05/12] simplify filter --- pandas/core/internals/blocks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index ddb30a1769559..7341fcee23511 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2981,7 +2981,7 @@ def where(self, other, cond, align=True, errors='raise', def replace(self, to_replace, value, inplace=False, filter=None, regex=False, convert=True): result = self if inplace else self.copy() - if filter is None or not self.mgr_locs.isin(filter).any(): + if filter is None: categories = result.values.categories.tolist() if to_replace in categories: if isna(value): From da778e1a34386dae59f42ff980e5ce4839e7c883 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 25 Jun 2019 12:19:30 -0700 Subject: [PATCH 06/12] add test --- pandas/tests/frame/test_replace.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index 2513508822fec..a632702c3f2cd 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1149,3 +1149,20 @@ def test_replace_method(self, to_replace, method, expected): result = df.replace(to_replace=to_replace, value=None, method=method) expected = DataFrame(expected) assert_frame_equal(result, expected) + + @pytest.mark.parametrize("replace_dict, final_data", [ + ({'a': 1, 'b': 1}, [[3, 3], [2, 2]]), + ({'a': 1, 'b': 2}, [[3, 1], [2, 3]]) + ]) + def test_categorical_replace_with_dict(self, replace_dict, final_data): + # GH 26988 + df = DataFrame([[1, 1], [2, 2]], columns=['a', 'b'], dtype='category') + expected = DataFrame(final_data, columns=['a', 'b'], dtype='category') + expected['a'].cat.set_categories([1, 2, 3], inplace=True) + expected['b'].cat.set_categories([1, 2, 3], inplace=True) + result = df.replace(replace_dict, 3) + assert_frame_equal(result, expected) + with pytest.raises(AssertionError): + assert_frame_equal(df, expected) + df.replace(replace_dict, 3, inplace=True) + assert_frame_equal(df, expected) From bec92d093c355ca8fc7995b4fcf9629dbd5aa282 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 30 Jun 2019 00:02:56 -0700 Subject: [PATCH 07/12] improve tests --- pandas/tests/frame/test_replace.py | 5 +++-- pandas/tests/series/test_replace.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/test_replace.py b/pandas/tests/frame/test_replace.py index a632702c3f2cd..162149eda3a75 100644 --- a/pandas/tests/frame/test_replace.py +++ b/pandas/tests/frame/test_replace.py @@ -1158,11 +1158,12 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data): # GH 26988 df = DataFrame([[1, 1], [2, 2]], columns=['a', 'b'], dtype='category') expected = DataFrame(final_data, columns=['a', 'b'], dtype='category') - expected['a'].cat.set_categories([1, 2, 3], inplace=True) - expected['b'].cat.set_categories([1, 2, 3], inplace=True) + expected['a'] = expected['a'].cat.set_categories([1, 2, 3]) + expected['b'] = expected['b'].cat.set_categories([1, 2, 3]) result = df.replace(replace_dict, 3) assert_frame_equal(result, expected) with pytest.raises(AssertionError): + # ensure non-inplace call does not affect original assert_frame_equal(df, expected) df.replace(replace_dict, 3, inplace=True) assert_frame_equal(df, expected) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 28a6ef088f93f..559f4cdf9436d 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -306,7 +306,7 @@ def test_replace_categorical_single(self): result = c.replace(c[2], 'foo') tm.assert_series_equal(expected, result) - assert c[2] != 'foo' + assert c[2] != 'foo' # ensure non-inplace call does not alter original c.replace(c[2], 'foo', inplace=True) tm.assert_series_equal(expected, c) From 593e1e00ad872b1f22c8f7ee985d225aa28c7a72 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 30 Jun 2019 00:44:59 -0700 Subject: [PATCH 08/12] fix other instance of inplace in test --- pandas/tests/series/test_replace.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_replace.py b/pandas/tests/series/test_replace.py index 559f4cdf9436d..dd2e79a9744d4 100644 --- a/pandas/tests/series/test_replace.py +++ b/pandas/tests/series/test_replace.py @@ -299,9 +299,9 @@ def test_replace_categorical_single(self): c = s.astype('category') expected = c.copy() - expected.cat.add_categories('foo', inplace=True) + expected = expected.cat.add_categories('foo') expected[2] = 'foo' - expected.cat.remove_unused_categories(inplace=True) + expected = expected.cat.remove_unused_categories() assert c[2] != 'foo' result = c.replace(c[2], 'foo') From f444347df84c66f175cd743a2df388b933356623 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 8 Sep 2019 15:33:42 -0700 Subject: [PATCH 09/12] add replace helper to categorical --- pandas/core/arrays/categorical.py | 14 ++++++++++++ pandas/core/internals/blocks.py | 12 +++------- pandas/tests/arrays/categorical/test_algos.py | 22 +++++++++++++++++++ 3 files changed, 39 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5929a8d51fe43..6def4aed35e0e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2494,6 +2494,20 @@ def isin(self, values): code_values = code_values[null_mask | (code_values >= 0)] return algorithms.isin(self.codes, code_values) + def replace(self, to_replace, value, inplace=False): + inplace = validate_bool_kwarg(inplace, "inplace") + cat = self if inplace else self.copy() + categories = cat.categories.tolist() + if to_replace in categories: + if isna(value): + cat.remove_categories(to_replace, inplace=True) + else: + index = categories.index(to_replace) + categories[index] = value + cat.rename_categories(categories, inplace=True) + if not inplace: + return cat + # The Series.cat accessor diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 363ccf21a865c..2a1617e41359b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -3008,18 +3008,12 @@ def where( def replace( self, to_replace, value, inplace=False, filter=None, regex=False, convert=True ): + inplace = validate_bool_kwarg(inplace, "inplace") result = self if inplace else self.copy() if filter is None: - categories = result.values.categories.tolist() - if to_replace in categories: - if isna(value): - result.values.remove_categories(to_replace, inplace=True) - else: - index = categories.index(to_replace) - categories[index] = value - result.values.rename_categories(categories, inplace=True) + result.values.replace(to_replace, value, inplace=True) if convert: - return result.convert(by_item=True, numeric=False, copy=not inplace) + return result.convert(numeric=False, copy=not inplace) else: return result else: diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 1508fef86ae62..376656e8edc7f 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -59,6 +59,28 @@ def test_isin_cats(): tm.assert_numpy_array_equal(expected, result) +@pytest.mark.parametrize( + "to_replace, value, result", + [ + ("b", "c", ["a", "c"]), + ("c", "d", ["a", "b"]), + ("b", None, ["a", None]), + ], +) +def test_replace(to_replace, value, result): + # GH 26988 + cat = pd.Categorical(["a", "b"]) + expected = pd.Categorical(result) + result = cat.replace(to_replace, value) + tm.assert_categorical_equal(result, expected) + if to_replace == "b": # the "c" test is supposed to be unchanged + with pytest.raises(AssertionError): + # ensure non-inplace call does not affect original + tm.assert_categorical_equal(cat, expected) + cat.replace(to_replace, value, inplace=True) + tm.assert_categorical_equal(cat, expected) + + @pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])]) def test_isin_empty(empty): s = pd.Categorical(["a", "b"]) From 44631bc3ced72393de93126fe0ed68435aad580d Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 8 Sep 2019 22:35:19 -0700 Subject: [PATCH 10/12] black formatting --- pandas/tests/arrays/categorical/test_algos.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py index 376656e8edc7f..8314366eeaeac 100644 --- a/pandas/tests/arrays/categorical/test_algos.py +++ b/pandas/tests/arrays/categorical/test_algos.py @@ -61,11 +61,7 @@ def test_isin_cats(): @pytest.mark.parametrize( "to_replace, value, result", - [ - ("b", "c", ["a", "c"]), - ("c", "d", ["a", "b"]), - ("b", None, ["a", None]), - ], + [("b", "c", ["a", "c"]), ("c", "d", ["a", "b"]), ("b", None, ["a", None])], ) def test_replace(to_replace, value, result): # GH 26988 From 988ebc30e781108ebc222b81c338359c6d7c3926 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 20 Oct 2019 10:30:45 -0700 Subject: [PATCH 11/12] fix formatting --- pandas/core/internals/blocks.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fde464e010c50..b9d3e8c611528 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -12,7 +12,6 @@ from pandas._libs.tslibs import Timedelta, conversion from pandas._libs.tslibs.timezones import tz_compare from pandas.util._validators import validate_bool_kwarg -from pandas.core.dtypes.common import is_hashable from pandas.core.dtypes.cast import ( astype_nansafe, @@ -3019,7 +3018,7 @@ def replace( to_replace, value, inplace: bool = False, - filter: bool = None, + filter=None, regex: bool = False, convert: bool = True, ): From c4d3f8489e467c072e4d96a4e515b76f7243e7d8 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Tue, 12 Nov 2019 18:42:56 -0800 Subject: [PATCH 12/12] add comment --- pandas/core/internals/blocks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index b03ef06585000..38e80525b3d13 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2935,13 +2935,13 @@ def replace( ): inplace = validate_bool_kwarg(inplace, "inplace") result = self if inplace else self.copy() - if filter is None: + if filter is None: # replace was called on a series result.values.replace(to_replace, value, inplace=True) if convert: return result.convert(numeric=False, copy=not inplace) else: return result - else: + else: # replace was called on a DataFrame if not isna(value): result.values.add_categories(value, inplace=True) return super(CategoricalBlock, result).replace(