Skip to content

Commit fb08cee

Browse files
JustinZhengBCjreback
authored and committed
BUG-26988 implement replace for categorical blocks (#27026)
1 parent e639af2 commit fb08cee

File tree

6 files changed

+129
-0
lines changed

6 files changed

+129
-0
lines changed

doc/source/whatsnew/v1.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ Categorical
298298
- Using date accessors on a categorical dtyped :class:`Series` of datetimes was not returning an object of the
299299
same type as if one used the :meth:`.str.` / :meth:`.dt.` on a :class:`Series` of that type. E.g. when accessing :meth:`Series.dt.tz_localize` on a
300300
:class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue:`27952`)
301+
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`)
301302

302303

303304
Datetimelike

pandas/core/arrays/categorical.py

+45
Original file line numberDiff line numberDiff line change
@@ -2470,6 +2470,51 @@ def isin(self, values):
24702470
code_values = code_values[null_mask | (code_values >= 0)]
24712471
return algorithms.isin(self.codes, code_values)
24722472

2473+
def replace(self, to_replace, value, inplace: bool = False):
    """
    Replace all instances of one value with another.

    Parameters
    ----------
    to_replace : object
        The value to be replaced. If it is not one of the categories,
        nothing is changed.
    value : object
        The value to replace it with. If it is missing (per ``isna``),
        the ``to_replace`` category is removed instead.
    inplace : bool, default False
        Whether the operation is done in-place.

    Returns
    -------
    Categorical or None
        None if ``inplace`` is True, otherwise the new Categorical after
        replacement.

    Examples
    --------
    >>> s = pd.Categorical([1, 2, 1, 3])
    >>> s.replace(1, 3)
    [3, 2, 3, 3]
    Categories (2, int64): [2, 3]
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()
    if to_replace in cat.categories:
        if isna(value):
            cat.remove_categories(to_replace, inplace=True)
        else:
            categories = cat.categories.tolist()
            index = categories.index(to_replace)
            if value in cat.categories:
                value_index = categories.index(value)
                # Replacing a category with itself must be a no-op: the
                # codes would not change, and removing the category
                # afterwards would turn every matching entry into a
                # missing value.
                if value_index != index:
                    # Point the old category's codes at the existing
                    # target category, then drop the now-unused one.
                    cat._codes[cat._codes == index] = value_index
                    cat.remove_categories(to_replace, inplace=True)
            else:
                # The new value is not a category yet: relabel in place.
                categories[index] = value
                cat.rename_categories(categories, inplace=True)
    if not inplace:
        return cat
2517+
24732518

24742519
# The Series.cat accessor
24752520

pandas/core/internals/blocks.py

+24
Original file line numberDiff line numberDiff line change
@@ -2924,6 +2924,30 @@ def where(
29242924
)
29252925
return result
29262926

2927+
def replace(
    self,
    to_replace,
    value,
    inplace: bool = False,
    filter=None,
    regex: bool = False,
    convert: bool = True,
):
    """
    Replace ``to_replace`` with ``value`` in this categorical block.

    Parameters
    ----------
    to_replace : object
        The value to be replaced.
    value : object
        The value to replace it with.
    inplace : bool, default False
        If True, operate on this block; otherwise on a copy of it.
    filter : object, optional
        None when replace was called on a Series; otherwise it is
        forwarded unchanged to the parent implementation.
    regex : bool, default False
        Forwarded to the parent implementation (DataFrame path only).
    convert : bool, default True
        On the Series path, whether to call ``convert`` on the result;
        on the DataFrame path it is forwarded to the parent
        implementation.

    Returns
    -------
    The block (or blocks) produced by the replacement.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    result = self if inplace else self.copy()
    if filter is None:  # replace was called on a series
        result.values.replace(to_replace, value, inplace=True)
        if convert:
            return result.convert(numeric=False, copy=not inplace)
        return result

    # replace was called on a DataFrame
    if not isna(value):
        try:
            # The parent implementation needs ``value`` to be a valid
            # category before it can be written into the values.
            result.values.add_categories(value, inplace=True)
        except ValueError:
            # add_categories rejects values that are already categories;
            # in that case there is nothing to add and the replacement
            # can proceed with the existing category.
            pass
    return super(CategoricalBlock, result).replace(
        to_replace, value, inplace, filter, regex, convert
    )
2950+
29272951

29282952
# -----------------------------------------------------------------
29292953
# Constructor Helpers

pandas/tests/arrays/categorical/test_algos.py

+18
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,24 @@ def test_isin_cats():
5959
tm.assert_numpy_array_equal(expected, result)
6060

6161

62+
@pytest.mark.parametrize(
    "to_replace, value, result",
    [("b", "c", ["a", "c"]), ("c", "d", ["a", "b"]), ("b", None, ["a", None])],
)
def test_replace(to_replace, value, result):
    # GH 26988
    # Check Categorical.replace both as a copying and as an in-place call.
    cat = pd.Categorical(["a", "b"])
    expected = pd.Categorical(result)

    result = cat.replace(to_replace, value)
    tm.assert_categorical_equal(result, expected)

    # Replacing "c" (not a category) leaves the data unchanged, so the
    # "original differs from expected" check only applies to the "b" cases.
    if to_replace == "b":
        # ensure non-inplace call does not affect original
        with pytest.raises(AssertionError):
            tm.assert_categorical_equal(cat, expected)

    cat.replace(to_replace, value, inplace=True)
    tm.assert_categorical_equal(cat, expected)
78+
79+
6280
@pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])])
6381
def test_isin_empty(empty):
6482
s = pd.Categorical(["a", "b"])

pandas/tests/frame/test_replace.py

+18
Original file line numberDiff line numberDiff line change
@@ -1296,6 +1296,24 @@ def test_replace_method(self, to_replace, method, expected):
12961296
expected = DataFrame(expected)
12971297
tm.assert_frame_equal(result, expected)
12981298

1299+
@pytest.mark.parametrize(
    "replace_dict, final_data",
    [({"a": 1, "b": 1}, [[3, 3], [2, 2]]), ({"a": 1, "b": 2}, [[3, 1], [2, 3]])],
)
def test_categorical_replace_with_dict(self, replace_dict, final_data):
    # GH 26988
    # Per-column dict replacement on an all-categorical frame.
    columns = ["a", "b"]
    df = DataFrame([[1, 1], [2, 2]], columns=columns, dtype="category")

    expected = DataFrame(final_data, columns=columns, dtype="category")
    for col in columns:
        # Both columns are expected to carry the full category set.
        expected[col] = expected[col].cat.set_categories([1, 2, 3])

    result = df.replace(replace_dict, 3)
    tm.assert_frame_equal(result, expected)

    # ensure non-inplace call does not affect original
    with pytest.raises(AssertionError):
        tm.assert_frame_equal(df, expected)

    df.replace(replace_dict, 3, inplace=True)
    tm.assert_frame_equal(df, expected)
1316+
12991317
@pytest.mark.parametrize(
13001318
"df, to_replace, exp",
13011319
[

pandas/tests/series/test_replace.py

+23
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,29 @@ def test_replace_categorical(self, categorical, numeric):
293293
expected = pd.Series(numeric)
294294
tm.assert_series_equal(expected, result, check_dtype=False)
295295

296+
def test_replace_categorical_single(self):
    # GH 26988
    # Single-value replacement on a categorical Series of tz-aware datetimes.
    dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
    c = pd.Series(dti).astype("category")

    # Build the expected result by hand: add "foo", assign it to the last
    # position, then drop the category that is no longer used.
    expected = c.copy().cat.add_categories("foo")
    expected[2] = "foo"
    expected = expected.cat.remove_unused_categories()
    assert c[2] != "foo"

    result = c.replace(c[2], "foo")
    tm.assert_series_equal(expected, result)
    # ensure non-inplace call does not alter original
    assert c[2] != "foo"

    c.replace(c[2], "foo", inplace=True)
    tm.assert_series_equal(expected, c)

    # test replacing with existing value
    first_value = c[0]
    c.replace(c[1], c[0], inplace=True)
    assert c[0] == c[1] == first_value
318+
296319
def test_replace_with_no_overflowerror(self):
297320
# GH 25616
298321
# casts to object without Exception from OverflowError

0 commit comments

Comments
 (0)