Skip to content

BUG: Categorical.copy deep kwarg #27024

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 15 additions & 12 deletions pandas/core/arrays/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,11 +457,12 @@ def _formatter(self, boxed=False):
# Defer to CategoricalFormatter's formatter.
return None

def copy(self):
"""
Copy constructor.
"""
return self._constructor(values=self._codes.copy(),
@Appender(ExtensionArray.copy.__doc__)
def copy(self, deep: bool = False):
values = self._codes
if deep:
values = values.copy()
return self._constructor(values=values,
dtype=self.dtype,
fastpath=True)

Expand All @@ -483,7 +484,7 @@ def astype(self, dtype, copy=True):
if is_categorical_dtype(dtype):
# GH 10696/18593
dtype = self.dtype.update_dtype(dtype)
self = self.copy() if copy else self
self = self.copy(deep=True) if copy else self
if dtype == self.dtype:
return self
return self._set_dtype(dtype)
Expand Down Expand Up @@ -578,7 +579,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes,
codes = _recode_for_categories(inferred_codes, cats, categories)
elif not cats.is_monotonic_increasing:
# Sort categories and recode for unknown categories.
unsorted = cats.copy()
unsorted = cats.copy(deep=True)
categories = cats.sort_values()

codes = _recode_for_categories(inferred_codes, unsorted,
Expand Down Expand Up @@ -751,7 +752,7 @@ def set_ordered(self, value, inplace=False):
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
new_dtype = CategoricalDtype(self.categories, ordered=value)
cat = self if inplace else self.copy()
cat = self if inplace else self.copy(deep=True)
cat._dtype = new_dtype
if not inplace:
return cat
Expand Down Expand Up @@ -849,7 +850,7 @@ def set_categories(self, new_categories, ordered=None, rename=False,
ordered = self.dtype.ordered
new_dtype = CategoricalDtype(new_categories, ordered=ordered)

cat = self if inplace else self.copy()
cat = self if inplace else self.copy(deep=True)
if rename:
if (cat.dtype.categories is not None and
len(new_dtype.categories) < len(cat.dtype.categories)):
Expand Down Expand Up @@ -937,7 +938,7 @@ def rename_categories(self, new_categories, inplace=False):
Categories (2, object): [A, B]
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
cat = self if inplace else self.copy()
cat = self if inplace else self.copy(deep=True)

if isinstance(new_categories, ABCSeries):
msg = ("Treating Series 'new_categories' as a list-like and using "
Expand Down Expand Up @@ -1045,7 +1046,7 @@ def add_categories(self, new_categories, inplace=False):
new_categories = list(self.dtype.categories) + list(new_categories)
new_dtype = CategoricalDtype(new_categories, self.ordered)

cat = self if inplace else self.copy()
cat = self if inplace else self.copy(deep=True)
cat._dtype = new_dtype
cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories)
if not inplace:
Expand Down Expand Up @@ -1127,7 +1128,7 @@ def remove_unused_categories(self, inplace=False):
set_categories
"""
inplace = validate_bool_kwarg(inplace, 'inplace')
cat = self if inplace else self.copy()
cat = self if inplace else self.copy(deep=True)
idx, inv = np.unique(cat._codes, return_inverse=True)

if idx.size != 0 and idx[0] == -1: # na sentinel
Expand Down Expand Up @@ -2295,6 +2296,8 @@ def unique(self):

# unlike np.unique, unique1d does not sort
unique_codes = unique1d(self.codes)

# We don't need a deep copy since we overwrite cat._codes immediately
cat = self.copy()

# keep nan in codes
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,10 @@ def copy(self, deep=True):
""" copy constructor """
values = self.values
if deep:
values = values.copy()
if self.is_extension:
values = values.copy(deep=True)
else:
values = values.copy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is a deep-copy needed here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@gfyoung Just pushed a commit addressing most of your comments. For this one: without deep=True, it isn't a "real" copy, i.e. ExtensionArray.copy(deep=True) behaves the same as np.ndarray.copy().

return self.make_block_same_class(values, ndim=self.ndim)

def replace(self, to_replace, value, inplace=False, filter=None,
Expand Down Expand Up @@ -1855,7 +1858,7 @@ def where(self, other, cond, align=True, errors='raise',
dtype = self.dtype

try:
result = self.values.copy()
result = self.values.copy(deep=True)
icond = ~cond
if lib.is_scalar(other):
result[icond] = other
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,7 @@ def sanitize_array(data, index, dtype=None, copy=False,
subarr = data.astype(dtype)

if copy:
subarr = data.copy()
subarr = data.copy(deep=True)
return subarr

elif isinstance(data, (list, tuple)) and len(data) > 0:
Expand Down
4 changes: 3 additions & 1 deletion pandas/tests/extension/base/setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ def test_setitem_scalar_series(self, data, box_in_series):
def test_setitem_sequence(self, data, box_in_series):
if box_in_series:
data = pd.Series(data)
original = data.copy()
original = data.copy()
else:
original = data.copy(deep=True)

data[[0, 1]] = [data[1], data[0]]
assert data[0] == original[1]
Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/extension/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,29 @@ def _compare_other(self, s, data, op_name, other):

class TestParsing(base.BaseParsingTests):
pass


def test_copy_deep(data):
    """Exercise the ``deep`` keyword of ``data.copy`` (GH#27024).

    The test walks through three phases, each relying on the first two
    elements of ``data`` being different at the start of the phase:

    1. ``copy(deep=True)`` yields an independent object: writing to the copy
       leaves ``data`` untouched.
    2. ``copy(deep=False)`` yields an object sharing storage with ``data``:
       writing to the copy is visible through ``data``.
    3. ``copy()`` with no argument behaves like ``deep=False``.

    Parameters
    ----------
    data
        Array-like supporting ``copy(deep=...)``, integer ``__getitem__`` and
        ``__setitem__``; presumably supplied by the extension-test ``data``
        fixture (confirm against the test module's fixtures).
    """
    # GH#27024
    # Precondition for every phase below: the first two elements differ,
    # so "other[0] = other[1]" is an observable change.
    assert data[0] != data[1]

    # Keep a pristine snapshot so `data` can be reset after phase 2 mutates it.
    orig = data.copy(deep=True)
    other = data.copy(deep=True)

    # Phase 1: modifying `other` will _not_ modify `data`
    other[0] = other[1]
    assert other[0] == other[1]
    assert data[0] != data[1]

    # Phase 2: modifying `other2` _will_ modify `data`
    other2 = data.copy(deep=False)
    other2[0] = other2[1]
    assert other2[0] == other2[1]
    assert data[0] == data[1]

    # Phase 3: default behavior should be deep=False
    data = orig.copy(deep=True)
    # Re-check the precondition on the freshly reset `data`, so this phase
    # does not pass or fail because of state left over from phase 2.
    assert data[0] != data[1]
    other3 = data.copy()

    other3[0] = other3[1]
    assert data[0] == data[1]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lots of tests bundled into one function. I would consider breaking this up into at least four different tests, especially since you "reset" every time by doing a copy of some kind of data.