Skip to content

Commit f9a1457

Browse files
committed
BUG: Fix Series.astype and Categorical.astype to update existing Categorical data
1 parent 34a8d36 commit f9a1457

File tree

5 files changed

+120
-33
lines changed

5 files changed

+120
-33
lines changed

doc/source/whatsnew/v0.22.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,7 @@ Conversion
255255
- Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`)
256256
- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`)
257257
- Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operations (:issue:`14774`)
258+
- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)
258259

259260

260261
Indexing

pandas/core/categorical.py

+19-4
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@
2727
is_categorical_dtype,
2828
is_list_like, is_sequence,
2929
is_scalar,
30-
is_dict_like)
30+
is_dict_like,
31+
pandas_dtype)
3132
from pandas.core.common import is_null_slice, _maybe_box_datetimelike
3233

3334
from pandas.core.algorithms import factorize, take_1d, unique1d
@@ -435,10 +436,24 @@ def astype(self, dtype, copy=True):
435436
.. versionadded:: 0.19.0
436437
437438
"""
439+
if isinstance(dtype, compat.string_types) and dtype == 'category':
440+
# GH 18593: astype('category') should not change anything
441+
return self.copy() if copy else self
442+
443+
dtype = pandas_dtype(dtype)
438444
if is_categorical_dtype(dtype):
439-
if copy is True:
440-
return self.copy()
441-
return self
445+
# GH 18593: keep current categories if None (ordered can't be None)
446+
if dtype.categories is None:
447+
new_categories = self.categories
448+
else:
449+
new_categories = dtype.categories
450+
dtype = CategoricalDtype(new_categories, dtype.ordered)
451+
452+
self = self.copy() if copy else self
453+
if dtype == self.dtype:
454+
# fastpath if dtypes are equal
455+
return self
456+
return self._set_dtype(dtype)
442457
return np.array(self, dtype=dtype, copy=copy)
443458

444459
@cache_readonly

pandas/core/internals.py

+10-24
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
import pandas.core.dtypes.concat as _concat
5555

5656
from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex
57-
from pandas.core.common import is_null_slice
57+
from pandas.core.common import is_null_slice, _any_not_none
5858
import pandas.core.algorithms as algos
5959

6060
from pandas.core.index import Index, MultiIndex, _ensure_index
@@ -589,13 +589,16 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
589589
"CategoricalDtype instead",
590590
FutureWarning, stacklevel=7)
591591

592-
kwargs = kwargs.copy()
593-
categories = getattr(dtype, 'categories', None)
594-
ordered = getattr(dtype, 'ordered', False)
592+
categories = kwargs.get('categories', None)
593+
ordered = kwargs.get('ordered', None)
594+
if _any_not_none(categories, ordered):
595+
dtype = CategoricalDtype(categories, ordered)
595596

596-
kwargs.setdefault('categories', categories)
597-
kwargs.setdefault('ordered', ordered)
598-
return self.make_block(Categorical(self.values, **kwargs))
597+
if is_categorical_dtype(self.values):
598+
# GH 10696/18593: update an existing categorical efficiently
599+
return self.make_block(self.values.astype(dtype, copy=copy))
600+
601+
return self.make_block(Categorical(self.values, dtype=dtype))
599602

600603
# astype processing
601604
dtype = np.dtype(dtype)
@@ -2427,23 +2430,6 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):
24272430

24282431
return self.make_block_same_class(new_values, new_mgr_locs)
24292432

2430-
def _astype(self, dtype, copy=False, errors='raise', values=None,
2431-
klass=None, mgr=None):
2432-
"""
2433-
Coerce to the new type (if copy=True, return a new copy)
2434-
raise on an except if raise == True
2435-
"""
2436-
2437-
if self.is_categorical_astype(dtype):
2438-
values = self.values
2439-
else:
2440-
values = np.asarray(self.values).astype(dtype, copy=False)
2441-
2442-
if copy:
2443-
values = values.copy()
2444-
2445-
return self.make_block(values)
2446-
24472433
def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
24482434
""" convert to our native types format, slicing if desired """
24492435

pandas/tests/categorical/test_dtypes.py

+51-5
Original file line numberDiff line numberDiff line change
@@ -99,10 +99,56 @@ def test_codes_dtypes(self):
9999
result = result.remove_categories(['foo%05d' % i for i in range(300)])
100100
assert result.codes.dtype == 'int8'
101101

102-
def test_astype_categorical(self):
102+
@pytest.mark.parametrize('ordered', [True, False])
103+
@pytest.mark.parametrize('copy', [True, False])
104+
def test_astype(self, copy, ordered):
105+
# string
106+
cat = Categorical(list('abbaaccc'), ordered=ordered)
107+
result = cat.astype(object, copy=copy)
108+
expected = np.array(cat)
109+
tm.assert_numpy_array_equal(result, expected)
110+
111+
msg = 'could not convert string to float'
112+
with tm.assert_raises_regex(ValueError, msg):
113+
cat.astype(float, copy=copy)
114+
115+
# numeric
116+
cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
117+
result = cat.astype(object, copy=copy)
118+
expected = np.array(cat, dtype=object)
119+
tm.assert_numpy_array_equal(result, expected)
120+
121+
result = cat.astype(int, copy=copy)
122+
expected = np.array(cat, dtype=np.int)
123+
tm.assert_numpy_array_equal(result, expected)
124+
125+
result = cat.astype(float, copy=copy)
126+
expected = np.array(cat, dtype=np.float)
127+
tm.assert_numpy_array_equal(result, expected)
128+
129+
@pytest.mark.parametrize('copy', [True, False])
130+
@pytest.mark.parametrize('dtype_ordered', [True, False])
131+
@pytest.mark.parametrize('cat_ordered', [True, False])
132+
def test_astype_category(self, copy, dtype_ordered, cat_ordered):
133+
# GH 10696/18593
134+
data = list('abcaacbab')
135+
cat = Categorical(data, categories=list('bac'), ordered=cat_ordered)
136+
137+
# standard categories
138+
dtype = CategoricalDtype(ordered=dtype_ordered)
139+
result = cat.astype(dtype, copy=copy)
140+
expected = Categorical(
141+
data, categories=cat.categories, ordered=dtype_ordered)
142+
tm.assert_categorical_equal(result, expected)
103143

104-
cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
105-
tm.assert_categorical_equal(cat, cat.astype('category'))
106-
tm.assert_almost_equal(np.array(cat), cat.astype('object'))
144+
# non-standard categories
145+
dtype = CategoricalDtype(list('adc'), dtype_ordered)
146+
result = cat.astype(dtype, copy=copy)
147+
expected = Categorical(data, dtype=dtype)
148+
tm.assert_categorical_equal(result, expected)
107149

108-
pytest.raises(ValueError, lambda: cat.astype(float))
150+
if dtype_ordered is False:
151+
# dtype='category' can't specify ordered, so only test once
152+
result = cat.astype('category', copy=copy)
153+
expected = cat
154+
tm.assert_categorical_equal(result, expected)

pandas/tests/series/test_dtypes.py

+39
Original file line numberDiff line numberDiff line change
@@ -322,6 +322,45 @@ def cmp(a, b):
322322
lambda x: x.astype('object').astype(Categorical)]:
323323
pytest.raises(TypeError, lambda: invalid(s))
324324

325+
@pytest.mark.parametrize('copy', [True, False])
326+
@pytest.mark.parametrize('name', [None, 'foo'])
327+
@pytest.mark.parametrize('dtype_ordered', [True, False])
328+
@pytest.mark.parametrize('series_ordered', [True, False])
329+
def test_astype_categorical_to_categorical(self, copy, name, dtype_ordered,
330+
series_ordered):
331+
# GH 10696/18593
332+
s_data = list('abcaacbab')
333+
s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered)
334+
s = Series(s_data, dtype=s_dtype, name=name)
335+
336+
# unspecified categories
337+
dtype = CategoricalDtype(ordered=dtype_ordered)
338+
result = s.astype(dtype, copy=copy)
339+
exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
340+
expected = Series(s_data, name=name, dtype=exp_dtype)
341+
tm.assert_series_equal(result, expected)
342+
343+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
344+
result = s.astype('category', ordered=dtype_ordered)
345+
tm.assert_series_equal(result, expected)
346+
347+
# different categories
348+
dtype = CategoricalDtype(list('adc'), dtype_ordered)
349+
result = s.astype(dtype, copy=copy)
350+
expected = Series(s_data, name=name, dtype=dtype)
351+
tm.assert_series_equal(result, expected)
352+
353+
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
354+
result = s.astype(
355+
'category', categories=list('adc'), ordered=dtype_ordered)
356+
tm.assert_series_equal(result, expected)
357+
358+
if dtype_ordered is False:
359+
# not specifying ordered, so only test once
360+
expected = s
361+
result = s.astype('category', copy=copy)
362+
tm.assert_series_equal(result, expected)
363+
325364
def test_astype_categoricaldtype(self):
326365
s = Series(['a', 'b', 'a'])
327366
result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))

0 commit comments

Comments
 (0)