Skip to content

BUG-26988 implement replace for categorical blocks #27026

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Nov 16, 2019
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ Categorical
- Using date accessors on a categorical dtyped :class:`Series` of datetimes was not returning an object of the
same type as if one used the :meth:`.str.` / :meth:`.dt.` on a :class:`Series` of that type. E.g. when accessing :meth:`Series.dt.tz_localize` on a
:class:`Categorical` with duplicate entries, the accessor was skipping duplicates (:issue:`27952`)
- Bug in :meth:`DataFrame.replace` and :meth:`Series.replace` that would give incorrect results on categorical data (:issue:`26988`)


Datetimelike
Expand Down
45 changes: 45 additions & 0 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -2471,6 +2471,51 @@ def isin(self, values):
code_values = code_values[null_mask | (code_values >= 0)]
return algorithms.isin(self.codes, code_values)

def replace(self, to_replace, value, inplace: bool = False):
    """
    Replace all instances of one value with another.

    If ``to_replace`` is not one of the categories, nothing is changed.

    Parameters
    ----------
    to_replace : object
        The value to be replaced.
    value : object
        The value to replace it with. A missing value (as detected by
        ``isna``) removes the ``to_replace`` category instead.
    inplace : bool, default False
        Whether the operation is done in-place.

    Returns
    -------
    Categorical or None
        None if inplace is True, otherwise the new Categorical after
        replacement.

    Examples
    --------
    >>> s = pd.Categorical([1, 2, 1, 3])
    >>> s.replace(1, 3)
    [3, 2, 3, 3]
    Categories (2, int64): [2, 3]
    """
    # TODO: annotate ``to_replace`` and ``value`` more specifically
    # (requested in review; can be a follow-on PR).
    inplace = validate_bool_kwarg(inplace, "inplace")
    cat = self if inplace else self.copy()
    if to_replace in cat.categories:
        if isna(value):
            # Replacing with a missing value: drop the category.
            cat.remove_categories(to_replace, inplace=True)
        else:
            categories = cat.categories.tolist()
            index = categories.index(to_replace)
            if value in cat.categories:
                value_index = categories.index(value)
                # Replacing a category with itself must be a no-op;
                # recoding and then removing the category would discard
                # every entry that holds it.
                if value_index != index:
                    # Point the old category's codes at the existing
                    # target category, then drop the now-unused one.
                    cat._codes[cat._codes == index] = value_index
                    cat.remove_categories(to_replace, inplace=True)
            else:
                # Target is a brand-new value: rename the category in
                # place so the codes stay untouched.
                categories[index] = value
                cat.rename_categories(categories, inplace=True)
    if not inplace:
        return cat


# The Series.cat accessor

Expand Down
24 changes: 24 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2924,6 +2924,30 @@ def where(
)
return result

def replace(
    self,
    to_replace,
    value,
    inplace: bool = False,
    filter=None,
    regex: bool = False,
    convert: bool = True,
):
    """
    Replace ``to_replace`` with ``value`` in this categorical block.

    Parameters
    ----------
    to_replace : object
        The value to be replaced.
    value : object
        The value to replace it with.
    inplace : bool, default False
        Whether to modify this block rather than a copy.
    filter : optional
        Forwarded to the parent implementation; ``None`` selects the
        Series code path (see the comment in the body).
    regex : bool, default False
        Forwarded to the parent implementation.
    convert : bool, default True
        On the Series path, whether to call ``convert`` on the result.
        On the DataFrame path it is forwarded to the parent implementation.

    Returns
    -------
    The block after replacement (possibly converted), or whatever the
    parent ``replace`` returns on the DataFrame path.
    """
    inplace = validate_bool_kwarg(inplace, "inplace")
    result = self if inplace else self.copy()
    if filter is None:
        # filter is None when replace is called on a Series as opposed to
        # a DataFrame; delegate directly to the categorical values.
        result.values.replace(to_replace, value, inplace=True)
        if convert:
            return result.convert(numeric=False, copy=not inplace)
        return result

    # replace was called on a DataFrame: the parent implementation needs
    # ``value`` to be a valid category before it can be set.
    if not isna(value):
        try:
            already_category = value in result.values.categories
        except TypeError:
            # Unhashable value: keep the previous behaviour and let
            # add_categories deal with it.
            already_category = False
        # add_categories raises if the category already exists, so only
        # add the value when it is actually new.
        if not already_category:
            result.values.add_categories(value, inplace=True)
    return super(CategoricalBlock, result).replace(
        to_replace, value, inplace, filter, regex, convert
    )
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can any of the logic in this method generalize to ExtensionBlock? (presumably we'd need to add replace to the EA interface?)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is a good point, but we can certainly do as a followup; can you open an issue for discussion.



# -----------------------------------------------------------------
# Constructor Helpers
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/arrays/categorical/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,24 @@ def test_isin_cats():
tm.assert_numpy_array_equal(expected, result)


@pytest.mark.parametrize(
    "to_replace, value, result",
    [
        ("b", "c", ["a", "c"]),
        ("c", "d", ["a", "b"]),
        ("b", None, ["a", None]),
    ],
)
def test_replace(to_replace, value, result):
    """Check Categorical.replace both as a copy and in place (GH 26988)."""
    # GH 26988
    original = pd.Categorical(["a", "b"])
    expected = pd.Categorical(result)

    replaced = original.replace(to_replace, value)
    tm.assert_categorical_equal(replaced, expected)

    # The "c" case replaces a value that is absent, so the original and
    # the expected result are identical there and the check is skipped.
    if to_replace == "b":
        # ensure non-inplace call does not affect original
        with pytest.raises(AssertionError):
            tm.assert_categorical_equal(original, expected)

    original.replace(to_replace, value, inplace=True)
    tm.assert_categorical_equal(original, expected)


@pytest.mark.parametrize("empty", [[], pd.Series(), np.array([])])
def test_isin_empty(empty):
s = pd.Categorical(["a", "b"])
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/frame/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1296,6 +1296,24 @@ def test_replace_method(self, to_replace, method, expected):
expected = DataFrame(expected)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
    "replace_dict, final_data",
    [
        ({"a": 1, "b": 1}, [[3, 3], [2, 2]]),
        ({"a": 1, "b": 2}, [[3, 1], [2, 3]]),
    ],
)
def test_categorical_replace_with_dict(self, replace_dict, final_data):
    """Check dict-based replace on categorical columns (GH 26988)."""
    # GH 26988
    columns = ["a", "b"]
    df = DataFrame([[1, 1], [2, 2]], columns=columns, dtype="category")

    expected = DataFrame(final_data, columns=columns, dtype="category")
    for col in columns:
        expected[col] = expected[col].cat.set_categories([1, 2, 3])

    replaced = df.replace(replace_dict, 3)
    tm.assert_frame_equal(replaced, expected)

    # ensure non-inplace call does not affect original
    with pytest.raises(AssertionError):
        tm.assert_frame_equal(df, expected)

    df.replace(replace_dict, 3, inplace=True)
    tm.assert_frame_equal(df, expected)

@pytest.mark.parametrize(
"df, to_replace, exp",
[
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/series/test_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -293,6 +293,29 @@ def test_replace_categorical(self, categorical, numeric):
expected = pd.Series(numeric)
tm.assert_series_equal(expected, result, check_dtype=False)

def test_replace_categorical_single(self):
    """Check single-value replace on a categorical Series (GH 26988)."""
    # GH 26988
    dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
    c = pd.Series(dti).astype("category")

    # Build the expected Series by hand: add "foo", assign it, then drop
    # the category that is no longer used.
    expected = c.copy().cat.add_categories("foo")
    expected[2] = "foo"
    expected = expected.cat.remove_unused_categories()
    assert c[2] != "foo"

    replaced = c.replace(c[2], "foo")
    tm.assert_series_equal(expected, replaced)
    # ensure non-inplace call does not alter original
    assert c[2] != "foo"

    c.replace(c[2], "foo", inplace=True)
    tm.assert_series_equal(expected, c)

    # test replacing with existing value
    first_value = c[0]
    c.replace(c[1], c[0], inplace=True)
    assert c[0] == c[1] == first_value

def test_replace_with_no_overflowerror(self):
# GH 25616
# casts to object without Exception from OverflowError
Expand Down