From c7b75d3f41c5deb702c69370f3cdc024e852702d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 24 Jun 2019 14:16:10 -0700 Subject: [PATCH 1/2] BUG: Categorical.copy deep kwarg --- pandas/core/arrays/categorical.py | 23 +++++++++++--------- pandas/core/internals/blocks.py | 7 ++++-- pandas/core/internals/construction.py | 2 +- pandas/tests/extension/base/setitem.py | 4 +++- pandas/tests/extension/test_categorical.py | 25 ++++++++++++++++++++++ 5 files changed, 47 insertions(+), 14 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 155638aca5560..5e05e99cdf015 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -457,11 +457,14 @@ def _formatter(self, boxed=False): # Defer to CategoricalFormatter's formatter. return None - def copy(self): + def copy(self, deep: bool = False): """ Copy constructor. """ - return self._constructor(values=self._codes.copy(), + values = self._codes + if deep: + values = values.copy() + return self._constructor(values=values, dtype=self.dtype, fastpath=True) @@ -483,7 +486,7 @@ def astype(self, dtype, copy=True): if is_categorical_dtype(dtype): # GH 10696/18593 dtype = self.dtype.update_dtype(dtype) - self = self.copy() if copy else self + self = self.copy(deep=True) if copy else self if dtype == self.dtype: return self return self._set_dtype(dtype) @@ -578,7 +581,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, codes = _recode_for_categories(inferred_codes, cats, categories) elif not cats.is_monotonic_increasing: # Sort categories and recode for unknown categories. - unsorted = cats.copy() + unsorted = cats.copy(deep=True) categories = cats.sort_values() codes = _recode_for_categories(inferred_codes, unsorted, @@ -751,7 +754,7 @@ def set_ordered(self, value, inplace=False): """ inplace = validate_bool_kwarg(inplace, 'inplace') new_dtype = CategoricalDtype(self.categories, ordered=value) - cat = self if inplace else self.copy() + cat = self if inplace else self.copy(deep=True) cat._dtype = new_dtype if not inplace: return cat @@ -849,7 +852,7 @@ def set_categories(self, new_categories, ordered=None, rename=False, ordered = self.dtype.ordered new_dtype = CategoricalDtype(new_categories, ordered=ordered) - cat = self if inplace else self.copy() + cat = self if inplace else self.copy(deep=True) if rename: if (cat.dtype.categories is not None and len(new_dtype.categories) < len(cat.dtype.categories)): @@ -937,7 +940,7 @@ def rename_categories(self, new_categories, inplace=False): Categories (2, object): [A, B] """ inplace = validate_bool_kwarg(inplace, 'inplace') - cat = self if inplace else self.copy() + cat = self if inplace else self.copy(deep=True) if isinstance(new_categories, ABCSeries): msg = ("Treating Series 'new_categories' as a list-like and using " @@ -1045,7 +1048,7 @@ def add_categories(self, new_categories, inplace=False): new_categories = list(self.dtype.categories) + list(new_categories) new_dtype = CategoricalDtype(new_categories, self.ordered) - cat = self if inplace else self.copy() + cat = self if inplace else self.copy(deep=True) cat._dtype = new_dtype cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) if not inplace: @@ -1127,7 +1130,7 @@ def remove_unused_categories(self, inplace=False): set_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - cat = self if inplace else self.copy() + cat = self if inplace else self.copy(deep=True) idx, inv = np.unique(cat._codes, return_inverse=True) if idx.size != 0 and idx[0] == -1: # na sentinel @@ -2295,7 +2298,7 @@ def unique(self): # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) - cat = self.copy() + cat = self.copy() # Don't need deep here # keep nan in codes cat._codes = unique_codes diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4cc6c86417b3b..c0b715d1236bc 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -720,7 +720,10 @@ def copy(self, deep=True): """ copy constructor """ values = self.values if deep: - values = values.copy() + if self.is_extension: + values = values.copy(deep=True) + else: + values = values.copy() return self.make_block_same_class(values, ndim=self.ndim) def replace(self, to_replace, value, inplace=False, filter=None, @@ -1855,7 +1858,7 @@ def where(self, other, cond, align=True, errors='raise', dtype = self.dtype try: - result = self.values.copy() + result = self.values.copy(deep=True) icond = ~cond if lib.is_scalar(other): result[icond] = other diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index d766d7f06d34a..8ef1771586c23 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -594,7 +594,7 @@ def sanitize_array(data, index, dtype=None, copy=False, subarr = data.astype(dtype) if copy: - subarr = data.copy() + subarr = data.copy(deep=True) return subarr elif isinstance(data, (list, tuple)) and len(data) > 0: diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py index db6328e39e6cc..a60d54ef05708 100644 --- a/pandas/tests/extension/base/setitem.py +++ b/pandas/tests/extension/base/setitem.py @@ -18,7 +18,9 @@ def test_setitem_scalar_series(self, data, box_in_series): def test_setitem_sequence(self, data, box_in_series): if box_in_series: data = pd.Series(data) - original = data.copy() + original = data.copy() + else: + original = data.copy(deep=True) data[[0, 1]] = [data[1], data[0]] assert data[0] == original[1] diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 4cf9f78e1531d..d3fc0a17b8d6e 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -243,3 +243,28 @@ def _compare_other(self, s, data, op_name, other): class TestParsing(base.BaseParsingTests): pass + + +def test_copy_deep(data): + assert data[0] != data[1] + + orig = data.copy(deep=True) + other = data.copy(deep=True) + + # Modifying other will _not_ modify `data` + other[0] = other[1] + assert other[0] == other[1] + assert data[0] != data[1] + + # Modifying other _will_ modify `data` + other2 = data.copy(deep=False) + other2[0] = other2[1] + assert other2[0] == other2[1] + assert data[0] == data[1] + + # Default behavior should be deep=False + data = orig.copy(deep=True) + other3 = data.copy() + + other3[0] = other3[1] + assert data[0] == data[1] From 82fcd54bbcbb4a810b1cb4c75e5266e03e6785ef Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 26 Jun 2019 07:11:05 -0700 Subject: [PATCH 2/2] address reviewer comments --- pandas/core/arrays/categorical.py | 8 ++++---- pandas/tests/extension/test_categorical.py | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 5e05e99cdf015..a44c70f66a8b0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -457,10 +457,8 @@ def _formatter(self, boxed=False): # Defer to CategoricalFormatter's formatter. return None + @Appender(ExtensionArray.copy.__doc__) def copy(self, deep: bool = False): - """ - Copy constructor. - """ values = self._codes if deep: values = values.copy() @@ -2298,7 +2296,9 @@ def unique(self): # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) - cat = self.copy() # Don't need deep here + + # We don't need a deep copy since we overwrite cat._codes immediately + cat = self.copy() # keep nan in codes cat._codes = unique_codes diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index d3fc0a17b8d6e..ba9a156e094b6 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -246,6 +246,7 @@ class TestParsing(base.BaseParsingTests): def test_copy_deep(data): + # GH#27024 assert data[0] != data[1] orig = data.copy(deep=True)