Skip to content

BUG: Fix Series.astype and Categorical.astype to update existing Categorical data #18710

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 13, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ Conversion
- Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`)
- Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`)
- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`)
- Bug in :meth:`Series.astype` and ``Categorical.astype()`` where an existing categorical data does not get updated (:issue:`10696`, :issue:`18593`)


Indexing
Expand Down
9 changes: 6 additions & 3 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -436,9 +436,12 @@ def astype(self, dtype, copy=True):
"""
if is_categorical_dtype(dtype):
if copy is True:
return self.copy()
return self
# GH 10696/18593
dtype = self.dtype._update_dtype(dtype)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we might want to add some explicit tests for _update_dtype at some point (separate PR)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added them in the other PR during the initial implementation actually:

@pytest.mark.parametrize('dtype', [
CategoricalDtype(list('abc'), False),
CategoricalDtype(list('abc'), True)])
@pytest.mark.parametrize('new_dtype', [
'category',
CategoricalDtype(None, False),
CategoricalDtype(None, True),
CategoricalDtype(list('abc'), False),
CategoricalDtype(list('abc'), True),
CategoricalDtype(list('cba'), False),
CategoricalDtype(list('cba'), True),
CategoricalDtype(list('wxyz'), False),
CategoricalDtype(list('wxyz'), True)])
def test_update_dtype(self, dtype, new_dtype):
if isinstance(new_dtype, string_types) and new_dtype == 'category':
expected_categories = dtype.categories
expected_ordered = dtype.ordered
else:
expected_categories = new_dtype.categories
if expected_categories is None:
expected_categories = dtype.categories
expected_ordered = new_dtype.ordered
result = dtype._update_dtype(new_dtype)
tm.assert_index_equal(result.categories, expected_categories)
assert result.ordered is expected_ordered

self = self.copy() if copy else self
if dtype == self.dtype:
return self
return self._set_dtype(dtype)
return np.array(self, dtype=dtype, copy=copy)

@cache_readonly
Expand Down
35 changes: 10 additions & 25 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
import pandas.core.dtypes.concat as _concat

from pandas.core.dtypes.generic import ABCSeries, ABCDatetimeIndex
from pandas.core.common import is_null_slice
from pandas.core.common import is_null_slice, _any_not_none
import pandas.core.algorithms as algos

from pandas.core.index import Index, MultiIndex, _ensure_index
Expand Down Expand Up @@ -573,7 +573,6 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
raise TypeError(msg)

# may need to convert to categorical
# this is only called for non-categoricals
if self.is_categorical_astype(dtype):

# deprecated 17636
Expand All @@ -589,13 +588,16 @@ def _astype(self, dtype, copy=False, errors='raise', values=None,
"CategoricalDtype instead",
FutureWarning, stacklevel=7)

kwargs = kwargs.copy()
categories = getattr(dtype, 'categories', None)
ordered = getattr(dtype, 'ordered', False)
categories = kwargs.get('categories', None)
ordered = kwargs.get('ordered', None)
if _any_not_none(categories, ordered):
dtype = CategoricalDtype(categories, ordered)

kwargs.setdefault('categories', categories)
kwargs.setdefault('ordered', ordered)
return self.make_block(Categorical(self.values, **kwargs))
if is_categorical_dtype(self.values):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think you need all of this logic, wouldn't

values = self.values.astype(dtype, copy=copy)

return self.make_block(values, dtype=dtype)

be enough (if values is a Categorical already or dtype is a CDT, it will infer correctly, and if its not it will as well).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that quite works, since self.values can be a different object depending on what self is: if self is already categorical, then self.values is a Categorical, otherwise self.values is a numpy array.

In the numpy case, self.values.astype raises TypeError: data type not understood when a CDT is passed as the dtype.

Likewise, self.make_block(Categorical(self.values, dtype=dtype)) also doesn't work by itself. In the Categorical case, the constructor ignores the dtype parameter when the input data is already Categorical, so no update occurs.

Seems like the two paths are necessary? Or am I overlooking something?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok seems reasonable then

# GH 10696/18593: update an existing categorical efficiently
return self.make_block(self.values.astype(dtype, copy=copy))

return self.make_block(Categorical(self.values, dtype=dtype))

# astype processing
dtype = np.dtype(dtype)
Expand Down Expand Up @@ -2427,23 +2429,6 @@ def take_nd(self, indexer, axis=0, new_mgr_locs=None, fill_tuple=None):

return self.make_block_same_class(new_values, new_mgr_locs)

def _astype(self, dtype, copy=False, errors='raise', values=None,
klass=None, mgr=None):
"""
Coerce to the new type (if copy=True, return a new copy)
raise on an except if raise == True
"""

if self.is_categorical_astype(dtype):
values = self.values
else:
values = np.asarray(self.values).astype(dtype, copy=False)

if copy:
values = values.copy()

return self.make_block(values)

def to_native_types(self, slicer=None, na_rep='', quoting=None, **kwargs):
""" convert to our native types format, slicing if desired """

Expand Down
54 changes: 49 additions & 5 deletions pandas/tests/categorical/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,54 @@ def test_codes_dtypes(self):
result = result.remove_categories(['foo%05d' % i for i in range(300)])
assert result.codes.dtype == 'int8'

def test_astype_categorical(self):
@pytest.mark.parametrize('ordered', [True, False])
def test_astype(self, ordered):
# string
cat = Categorical(list('abbaaccc'), ordered=ordered)
result = cat.astype(object)
expected = np.array(cat)
tm.assert_numpy_array_equal(result, expected)

msg = 'could not convert string to float'
with tm.assert_raises_regex(ValueError, msg):
cat.astype(float)

# numeric
cat = Categorical([0, 1, 2, 2, 1, 0, 1, 0, 2], ordered=ordered)
result = cat.astype(object)
expected = np.array(cat, dtype=object)
tm.assert_numpy_array_equal(result, expected)

result = cat.astype(int)
expected = np.array(cat, dtype=np.int)
tm.assert_numpy_array_equal(result, expected)

result = cat.astype(float)
expected = np.array(cat, dtype=np.float)
tm.assert_numpy_array_equal(result, expected)

@pytest.mark.parametrize('dtype_ordered', [True, False])
@pytest.mark.parametrize('cat_ordered', [True, False])
def test_astype_category(self, dtype_ordered, cat_ordered):
# GH 10696/18593
data = list('abcaacbab')
cat = Categorical(data, categories=list('bac'), ordered=cat_ordered)

# standard categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(
data, categories=cat.categories, ordered=dtype_ordered)
tm.assert_categorical_equal(result, expected)

cat = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
tm.assert_categorical_equal(cat, cat.astype('category'))
tm.assert_almost_equal(np.array(cat), cat.astype('object'))
# non-standard categories
dtype = CategoricalDtype(list('adc'), dtype_ordered)
result = cat.astype(dtype)
expected = Categorical(data, dtype=dtype)
tm.assert_categorical_equal(result, expected)

pytest.raises(ValueError, lambda: cat.astype(float))
if dtype_ordered is False:
# dtype='category' can't specify ordered, so only test once
result = cat.astype('category')
expected = cat
tm.assert_categorical_equal(result, expected)
9 changes: 4 additions & 5 deletions pandas/tests/indexes/test_category.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,19 +411,18 @@ def test_astype(self):
result = IntervalIndex.from_intervals(result.values)
tm.assert_index_equal(result, expected)

@pytest.mark.parametrize('copy', [True, False])
@pytest.mark.parametrize('name', [None, 'foo'])
@pytest.mark.parametrize('dtype_ordered', [True, False])
@pytest.mark.parametrize('index_ordered', [True, False])
def test_astype_category(self, copy, name, dtype_ordered, index_ordered):
def test_astype_category(self, name, dtype_ordered, index_ordered):
# GH 18630
index = self.create_index(ordered=index_ordered)
if name:
index = index.rename(name)

# standard categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = index.astype(dtype, copy=copy)
result = index.astype(dtype)
expected = CategoricalIndex(index.tolist(),
name=name,
categories=index.categories,
Expand All @@ -432,13 +431,13 @@ def test_astype_category(self, copy, name, dtype_ordered, index_ordered):

# non-standard categories
dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered)
result = index.astype(dtype, copy=copy)
result = index.astype(dtype)
expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype)
tm.assert_index_equal(result, expected)

if dtype_ordered is False:
# dtype='category' can't specify ordered, so only test once
result = index.astype('category', copy=copy)
result = index.astype('category')
expected = index
tm.assert_index_equal(result, expected)

Expand Down
38 changes: 38 additions & 0 deletions pandas/tests/series/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,6 +322,44 @@ def cmp(a, b):
lambda x: x.astype('object').astype(Categorical)]:
pytest.raises(TypeError, lambda: invalid(s))

@pytest.mark.parametrize('name', [None, 'foo'])
@pytest.mark.parametrize('dtype_ordered', [True, False])
@pytest.mark.parametrize('series_ordered', [True, False])
def test_astype_categorical_to_categorical(self, name, dtype_ordered,
series_ordered):
# GH 10696/18593
s_data = list('abcaacbab')
s_dtype = CategoricalDtype(list('bac'), ordered=series_ordered)
s = Series(s_data, dtype=s_dtype, name=name)

# unspecified categories
dtype = CategoricalDtype(ordered=dtype_ordered)
result = s.astype(dtype)
exp_dtype = CategoricalDtype(s_dtype.categories, dtype_ordered)
expected = Series(s_data, name=name, dtype=exp_dtype)
tm.assert_series_equal(result, expected)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = s.astype('category', ordered=dtype_ordered)
tm.assert_series_equal(result, expected)

# different categories
dtype = CategoricalDtype(list('adc'), dtype_ordered)
result = s.astype(dtype)
expected = Series(s_data, name=name, dtype=dtype)
tm.assert_series_equal(result, expected)

with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
result = s.astype(
'category', categories=list('adc'), ordered=dtype_ordered)
tm.assert_series_equal(result, expected)

if dtype_ordered is False:
# not specifying ordered, so only test once
expected = s
result = s.astype('category')
tm.assert_series_equal(result, expected)

def test_astype_categoricaldtype(self):
s = Series(['a', 'b', 'a'])
result = s.astype(CategoricalDtype(['a', 'b'], ordered=True))
Expand Down