From 7d3c1a3d63754b206612ce07ed3572ea8bf17848 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Wed, 5 Feb 2020 20:25:24 -0800 Subject: [PATCH 1/5] BUG: list-like to_replace on Categorical.replace is ignored or crash --- doc/source/whatsnew/v1.0.2.rst | 1 + pandas/_testing.py | 10 +++- pandas/core/arrays/categorical.py | 29 +++++++---- .../tests/arrays/categorical/test_replace.py | 48 +++++++++++++++++++ 4 files changed, 78 insertions(+), 10 deletions(-) create mode 100644 pandas/tests/arrays/categorical/test_replace.py diff --git a/doc/source/whatsnew/v1.0.2.rst b/doc/source/whatsnew/v1.0.2.rst index 0216007ea5ba8..19358689a2186 100644 --- a/doc/source/whatsnew/v1.0.2.rst +++ b/doc/source/whatsnew/v1.0.2.rst @@ -31,6 +31,7 @@ Bug fixes **Categorical** - Fixed bug where :meth:`Categorical.from_codes` improperly raised a ``ValueError`` when passed nullable integer codes. (:issue:`31779`) +- Bug in :class:`Categorical` that would ignore or crash when calling :meth:`Series.replace` with a list-like ``to_replace`` (:issue:`31720`) **I/O** diff --git a/pandas/_testing.py b/pandas/_testing.py index 01d2bfe0458c8..0cf4ad02f05f9 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1070,6 +1070,7 @@ def assert_series_equal( check_exact=False, check_datetimelike_compat=False, check_categorical=True, + check_category_order=True, obj="Series", ): """ @@ -1104,6 +1105,8 @@ def assert_series_equal( Compare datetime-like which is comparable ignoring dtype. check_categorical : bool, default True Whether to compare internal Categorical exactly. + check_category_order : bool, default True + Whether to compare category order of internal Categoricals obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. @@ -1206,7 +1209,12 @@ def assert_series_equal( if check_categorical: if is_categorical_dtype(left) or is_categorical_dtype(right): - assert_categorical_equal(left.values, right.values, obj=f"{obj} category") + assert_categorical_equal( + left.values, + right.values, + obj=f"{obj} category", + check_category_order=check_category_order, + ) # This could be refactored to use the NDFrame.equals method diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6c7c35e9b4763..878abde86ba3e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2440,18 +2440,29 @@ def replace(self, to_replace, value, inplace: bool = False): """ inplace = validate_bool_kwarg(inplace, "inplace") cat = self if inplace else self.copy() - if to_replace in cat.categories: - if isna(value): - cat.remove_categories(to_replace, inplace=True) - else: + + # build a dict of (to replace -> value) pairs + if is_list_like(to_replace): + # if to_replace is list-like and value is scalar + replace_dict = {replace_value: value for replace_value in to_replace} + else: + # if both to_replace and value are scalar + replace_dict = {to_replace: value} + # other cases, like if both to_replace and value are list-like or if + # to_replace is a dict, are handled separately in NDFrame + + for replace_value, new_value in replace_dict.items(): + if isna(new_value): + continue + if replace_value in cat.categories: categories = cat.categories.tolist() - index = categories.index(to_replace) - if value in cat.categories: - value_index = categories.index(value) + index = categories.index(replace_value) + if new_value in cat.categories: + value_index = categories.index(new_value) cat._codes[cat._codes == index] = value_index - cat.remove_categories(to_replace, inplace=True) + cat.remove_categories(replace_value, inplace=True) else: - categories[index] = value + categories[index] = new_value cat.rename_categories(categories, inplace=True) if not inplace: return cat diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py new file mode 100644 index 0000000000000..52530123bd52f --- /dev/null +++ b/pandas/tests/arrays/categorical/test_replace.py @@ -0,0 +1,48 @@ +import pytest + +import pandas as pd +import pandas._testing as tm + + +@pytest.mark.parametrize( + "to_replace,value,expected,check_types,check_categorical", + [ + # one-to-one + (1, 2, [2, 2, 3], True, True), + (1, 4, [4, 2, 3], True, True), + (4, 1, [1, 2, 3], True, True), + (5, 6, [1, 2, 3], True, True), + # many-to-one + ([1], 2, [2, 2, 3], True, True), + ([1, 2], 3, [3, 3, 3], True, True), + ([1, 2], 4, [4, 4, 3], True, True), + ((1, 2, 4), 5, [5, 5, 3], True, True), + ((5, 6), 2, [1, 2, 3], True, True), + # many-to-many, handled outside of Categorical and results in separate dtype + ([1], [2], [2, 2, 3], False, False), + ([1, 4], [5, 2], [5, 2, 3], False, False), + # check_categorical sorts categories, which crashes on mixed dtypes + (3, "4", [1, 2, "4"], True, False), + ([1, 2, "3"], "5", ["5", "5", 3], True, False), + ], +) +def test_replace(to_replace, value, expected, check_types, check_categorical): + # GH 31720 + s = pd.Series([1, 2, 3], dtype="category") + result = s.replace(to_replace, value) + expected = pd.Series(expected, dtype="category") + s.replace(to_replace, value, inplace=True) + tm.assert_series_equal( + expected, + result, + check_dtype=check_types, + check_categorical=check_categorical, + check_category_order=False, + ) + tm.assert_series_equal( + expected, + s, + check_dtype=check_types, + check_categorical=check_categorical, + check_category_order=False, + ) From 91d3ef627f59bd68e141fe60f2e16dbe6b77bd54 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Thu, 13 Feb 2020 20:25:00 -0800 Subject: [PATCH 2/5] revert isna change --- pandas/core/arrays/categorical.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 878abde86ba3e..545bf776c1c4b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2452,18 +2452,19 @@ def replace(self, to_replace, value, inplace: bool = False): # to_replace is a dict, are handled separately in NDFrame for replace_value, new_value in replace_dict.items(): - if isna(new_value): - continue if replace_value in cat.categories: - categories = cat.categories.tolist() - index = categories.index(replace_value) - if new_value in cat.categories: - value_index = categories.index(new_value) - cat._codes[cat._codes == index] = value_index + if isna(new_value): cat.remove_categories(replace_value, inplace=True) else: - categories[index] = new_value - cat.rename_categories(categories, inplace=True) + categories = cat.categories.tolist() + index = categories.index(replace_value) + if new_value in cat.categories: + value_index = categories.index(new_value) + cat._codes[cat._codes == index] = value_index + cat.remove_categories(replace_value, inplace=True) + else: + categories[index] = new_value + cat.rename_categories(categories, inplace=True) if not inplace: return cat From ea417383488aae5b41a986a1fcddbc2f54978922 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sat, 15 Feb 2020 15:56:06 -0800 Subject: [PATCH 3/5] move isna --- pandas/core/arrays/categorical.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 545bf776c1c4b..ac05dc491f675 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2452,19 +2452,19 @@ def replace(self, to_replace, value, inplace: bool = False): # to_replace is a dict, are handled separately in NDFrame for replace_value, new_value in replace_dict.items(): + if isna(new_value): + cat.remove_categories(replace_value, inplace=True) + continue if replace_value in cat.categories: - if isna(new_value): + categories = cat.categories.tolist() + index = categories.index(replace_value) + if new_value in cat.categories: + value_index = categories.index(new_value) + cat._codes[cat._codes == index] = value_index cat.remove_categories(replace_value, inplace=True) else: - categories = cat.categories.tolist() - index = categories.index(replace_value) - if new_value in cat.categories: - value_index = categories.index(new_value) - cat._codes[cat._codes == index] = value_index - cat.remove_categories(replace_value, inplace=True) - else: - categories[index] = new_value - cat.rename_categories(categories, inplace=True) + categories[index] = new_value + cat.rename_categories(categories, inplace=True) if not inplace: return cat From e3777769264f5a4a9518ad6076fc16f440354e5e Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sat, 15 Feb 2020 23:17:56 -0800 Subject: [PATCH 4/5] more isna stuff --- pandas/core/arrays/categorical.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ac05dc491f675..e2ad1e63938b9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2452,10 +2452,10 @@ def replace(self, to_replace, value, inplace: bool = False): # to_replace is a dict, are handled separately in NDFrame for replace_value, new_value in replace_dict.items(): - if isna(new_value): - cat.remove_categories(replace_value, inplace=True) - continue if replace_value in cat.categories: + if isna(new_value): + cat.remove_categories(replace_value, inplace=True) + continue categories = cat.categories.tolist() index = categories.index(replace_value) if new_value in cat.categories: From 8a2e12d1eee0b7a5309b40f2a67ba783e85523a5 Mon Sep 17 00:00:00 2001 From: JustinZhengBC Date: Sun, 16 Feb 2020 16:21:44 -0800 Subject: [PATCH 5/5] changes --- pandas/_testing.py | 2 ++ pandas/core/arrays/categorical.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_testing.py b/pandas/_testing.py index 0cf4ad02f05f9..5d0574e2b4cfd 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1107,6 +1107,8 @@ def assert_series_equal( Whether to compare internal Categorical exactly. check_category_order : bool, default True Whether to compare category order of internal Categoricals + + .. versionadded:: 1.0.2 obj : str, default 'Series' Specify object name being compared, internally used to show appropriate assertion message. diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e2ad1e63938b9..b25202addb54f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2448,9 +2448,9 @@ def replace(self, to_replace, value, inplace: bool = False): else: # if both to_replace and value are scalar replace_dict = {to_replace: value} + # other cases, like if both to_replace and value are list-like or if # to_replace is a dict, are handled separately in NDFrame - for replace_value, new_value in replace_dict.items(): if replace_value in cat.categories: if isna(new_value):