Skip to content

Commit 73e73db

Browse files
jschendel and jreback
authored and committed
BUG: Fix Series.astype and Categorical.astype to update existing Categorical data (#18710)
1 parent 040470a commit 73e73db

File tree

6 files changed

+108
-38
lines changed

6 files changed

+108
-38
lines changed

doc/source/whatsnew/v0.22.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,7 @@ Conversion
263263
- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`)
264264
- Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`)
265265
- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`)
266+
- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
266267

267268

268269
Indexing

pandas/core/categorical.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -436,9 +436,12 @@ def astype(self, dtype, copy=True):
436436
437437
"""
438438
if is_categorical_dtype(dtype):
439-
if copy is True:
440-
return self.copy()
441-
return self
439+
# GH 10696/18593
440+
dtype = self.dtype._update_dtype(dtype)
441+
self = self.copy() if copy else self
442+
if dtype == self.dtype:
443+
return self
444+
return self._set_dtype(dtype)
442445
return np.array(self, dtype=dtype, copy=copy)
443446

444447
@cache_readonly

pandas/core/internals.py

+10-25
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
import pandas.core.dtypes.concat as _concat
5555

5656
from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex
57-
from pandas.core.common import is_null_slice
57+
from pandas.core.common import is_null_slice, _any_not_none
5858
import pandas.core.algorithms as algos
5959

6060
from pandas.core.index import Index, MultiIndex, _ensure_index
@@ -573,7 +573,6 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
573573
raise TypeError(msg)
574574

575575
# may need to convert to categorical
576-
# this is only called for non-categoricals
577576
if self.is_categorical_astype(dtype):
578577

579578
# deprecated 17636
@@ -589,13 +588,16 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
589588
"CategoricalDtype instead",
590589
FutureWarning, stacklevel=7)
591590

592-
kwargs = kwargs.copy()
593-
categories = getattr(dtype, 'categories', None)
594-
ordered = getattr(dtype, 'ordered', False)
591+
categories = kwargs.get('categories', None)
592+
ordered = kwargs.get('ordered', None)
593+
if _any_not_none(categories, ordered):
594+
dtype = CategoricalDtype(categories, ordered)
595595

596-
kwargs.setdefault('categories', categories)
597-
kwargs.setdefault('ordered', ordered)
598-
return self.make_block(Categorical(self.values, **kwargs))
596+
if is_categorical_dtype(self.values):
597+
# GH 10696/18593: update an existing categorical efficiently
598+
return self.make_block(self.values.astype(dtype, copy=copy))
599+
600+
return self.make_block(Categorical(self.values, dtype=dtype))
599601

600602
# astype processing
601603
dtype = np.dtype(dtype)
@@ -2427,23 +2429,6 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):
24272429

24282430
return self.make_block_same_class(new_values, new_mgr_locs)
24292431

2430-
def _astype(self, dtype, copy=False, errors='raise', values=None,
2431-
klass=None, mgr=None):
2432-
"""
2433-
Coerce to the new type (if copy=True, return a new copy)
2434-
raise on an except if raise == True
2435-
"""
2436-
2437-
if self.is_categorical_astype(dtype):
2438-
values = self.values
2439-
else:
2440-
values = np.asarray(self.values).astype(dtype, copy=False)
2441-
2442-
if copy:
2443-
values = values.copy()
2444-
2445-
return self.make_block(values)
2446-
24472432
def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
24482433
""" convert to our native types format, slicing if desired """
24492434

pandas/tests/categorical/test_dtypes.py

+49-5
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,54 @@ def test_codes_dtypes(self):
9999
result = result.remove_categories(['foo%05d' % i for i in range(300)])
100100
assert result.codes.dtype == 'int8'
101101

102-
def test_astype_categorical(self):
102+
@pytest.mark.parametrize('ordered', [True, False])
103+
def test_astype(self, ordered):
104+
# string
105+
cat = Categorical(list('abbaaccc'), ordered=ordered)
106+
result = cat.astype(object)
107+
expected = np.array(cat)
108+
tm.assert_numpy_array_equal(result, expected)
109+
110+
msg = 'could not convert string to float'
111+
with tm.assert_raises_regex(ValueError, msg):
112+
cat.astype(float)
113+
114+
# numeric
115+
cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
116+
result = cat.astype(object)
117+
expected = np.array(cat, dtype=object)
118+
tm.assert_numpy_array_equal(result, expected)
119+
120+
result = cat.astype(int)
121+
expected = np.array(cat, dtype=np.int)
122+
tm.assert_numpy_array_equal(result, expected)
123+
124+
result = cat.astype(float)
125+
expected = np.array(cat, dtype=np.float)
126+
tm.assert_numpy_array_equal(result, expected)
127+
128+
@pytest.mark.parametrize('dtype_ordered', [True, False])
129+
@pytest.mark.parametrize('cat_ordered', [True, False])
130+
def test_astype_category(self, dtype_ordered, cat_ordered):
131+
# GH 10696/18593
132+
data = list('abcaacbab')
133+
cat = Categorical(data, categories=list('bac'), ordered=cat_ordered)
134+
135+
# standard categories
136+
dtype = CategoricalDtype(ordered=dtype_ordered)
137+
result = cat.astype(dtype)
138+
expected = Categorical(
139+
data, categories=cat.categories, ordered=dtype_ordered)
140+
tm.assert_categorical_equal(result, expected)
103141

104-
cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
105-
tm.assert_categorical_equal(cat, cat.astype('category'))
106-
tm.assert_almost_equal(np.array(cat), cat.astype('object'))
142+
# non-standard categories
143+
dtype = CategoricalDtype(list('adc'), dtype_ordered)
144+
result = cat.astype(dtype)
145+
expected = Categorical(data, dtype=dtype)
146+
tm.assert_categorical_equal(result, expected)
107147

108-
pytest.raises(ValueError, lambda: cat.astype(float))
148+
if dtype_ordered is False:
149+
# dtype='category' can't specify ordered, so only test once
150+
result = cat.astype('category')
151+
expected = cat
152+
tm.assert_categorical_equal(result, expected)

pandas/tests/indexes/test_category.py

+4-5
Original file line numberDiff line numberDiff line change
@@ -411,19 +411,18 @@ def test_astype(self):
411411
result = IntervalIndex.from_intervals(result.values)
412412
tm.assert_index_equal(result, expected)
413413

414-
@pytest.mark.parametrize('copy', [True, False])
415414
@pytest.mark.parametrize('name', [None, 'foo'])
416415
@pytest.mark.parametrize('dtype_ordered', [True, False])
417416
@pytest.mark.parametrize('index_ordered', [True, False])
418-
def test_astype_category(self, copy, name, dtype_ordered, index_ordered):
417+
def test_astype_category(self, name, dtype_ordered, index_ordered):
419418
# GH 18630
420419
index = self.create_index(ordered=index_ordered)
421420
if name:
422421
index = index.rename(name)
423422

424423
# standard categories
425424
dtype = CategoricalDtype(ordered=dtype_ordered)
426-
result = index.astype(dtype, copy=copy)
425+
result = index.astype(dtype)
427426
expected = CategoricalIndex(index.tolist(),
428427
name=name,
429428
categories=index.categories,
@@ -432,13 +431,13 @@ def test_astype_category(self, copy, name, dtype_ordered, index_ordered):
432431

433432
# non-standard categories
434433
dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered)
435-
result = index.astype(dtype, copy=copy)
434+
result = index.astype(dtype)
436435
expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype)
437436
tm.assert_index_equal(result, expected)
438437

439438
if dtype_ordered is False:
440439
# dtype='category' can't specify ordered, so only test once
441-
result = index.astype('category', copy=copy)
440+
result = index.astype('category')
442441
expected = index
443442
tm.assert_index_equal(result, expected)
444443

pandas/tests/series/test_dtypes.py

+38
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,44 @@ def cmp(a, b):
322322
lambda x: x.astype('object').astype(Categorical)]:
323323
pytest.raises(TypeError, lambda: invalid(s))
324324

325+
@pytest.mark.parametrize('name', [None, 'foo'])
326+
@pytest.mark.parametrize('dtype_ordered', [True, False])
327+
@pytest.mark.parametrize('series_ordered', [True, False])
328+
def test_astype_categorical_to_categorical(self, name, dtype_ordered,
329+
series_ordered):
330+
# GH 10696/18593
331+
s_data = list('abcaacbab')
332+
s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered)
333+
s = Series(s_data, dtype=s_dtype, name=name)
334+
335+
# unspecified categories
336+
dtype = CategoricalDtype(ordered=dtype_ordered)
337+
result = s.astype(dtype)
338+
exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
339+
expected = Series(s_data, name=name, dtype=exp_dtype)
340+
tm.assert_series_equal(result, expected)
341+
342+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
343+
result = s.astype('category', ordered=dtype_ordered)
344+
tm.assert_series_equal(result, expected)
345+
346+
# different categories
347+
dtype = CategoricalDtype(list('adc'), dtype_ordered)
348+
result = s.astype(dtype)
349+
expected = Series(s_data, name=name, dtype=dtype)
350+
tm.assert_series_equal(result, expected)
351+
352+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
353+
result = s.astype(
354+
'category', categories=list('adc'), ordered=dtype_ordered)
355+
tm.assert_series_equal(result, expected)
356+
357+
if dtype_ordered is False:
358+
# not specifying ordered, so only test once
359+
expected = s
360+
result = s.astype('category')
361+
tm.assert_series_equal(result, expected)
362+
325363
def test_astype_categoricaldtype(self):
326364
s = Series(['a', 'b', 'a'])
327365
result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))

0 commit comments

Comments
 (0)