From 3c37bb7d66d20a32b5aa5bcec91556b9e61df787 Mon Sep 17 00:00:00 2001 From: jschendel Date: Thu, 7 Dec 2017 00:11:50 -0700 Subject: [PATCH 1/6] BUG: Ensure Index.astype('category') returns a CategoricalIndex --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/core/dtypes/common.py | 2 +- pandas/core/indexes/base.py | 4 ++++ pandas/core/indexes/category.py | 7 ++++++- pandas/core/indexes/datetimes.py | 5 +++++ pandas/core/indexes/multi.py | 3 ++- pandas/core/indexes/numeric.py | 11 ++++++++--- pandas/core/indexes/period.py | 5 +++++ pandas/core/indexes/timedeltas.py | 8 +++++++- pandas/tests/indexes/common.py | 18 ++++++++++++++++++ pandas/tests/indexes/test_interval.py | 10 +++++++++- pandas/tests/indexes/test_multi.py | 10 ++++++++++ 12 files changed, 76 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index b319c8bb79bb3..d76bfb29a8da3 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -259,6 +259,7 @@ Conversion - Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) +- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index would not be converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) Indexing diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index f60c0d5ffdca0..5b1335c1a834e 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1934,7 +1934,7 @@ def pandas_dtype(dtype): except TypeError: pass - elif dtype.startswith('interval[') or dtype.startswith('Interval['): + elif dtype.startswith('interval') or dtype.startswith('Interval'): try: return IntervalDtype.construct_from_string(dtype) except TypeError: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 938fd7130faa5..a80c5994940a6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1053,6 +1053,10 @@ def _to_embed(self, keep_tz=False, dtype=None): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): + if is_categorical_dtype(dtype): + from pandas.core.indexes.category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) return Index(self.values.astype(dtype, copy=copy), name=self.name, dtype=dtype) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 26ffb01b9577f..055ae823b4587 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -9,7 +9,8 @@ _ensure_platform_int, is_list_like, is_interval_dtype, - is_scalar) + is_scalar, + pandas_dtype) from pandas.core.common import (_asarray_tuplesafe, _values_from_object) from pandas.core.dtypes.missing import array_equivalent, isna @@ -341,9 +342,13 @@ def __array__(self, dtype=None): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) if is_interval_dtype(dtype): from pandas import IntervalIndex return IntervalIndex.from_intervals(np.array(self)) + elif is_categorical_dtype(dtype) and (dtype == self.dtype): + # fastpath if dtype is the same current + return self.copy() if copy else self return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy) @cache_readonly diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 290c77dd7f040..36fcf058d938b 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -20,6 +20,7 @@ is_period_dtype, is_bool_dtype, is_string_dtype, + is_categorical_dtype, is_string_like, is_list_like, is_scalar, @@ -915,6 +916,10 @@ def astype(self, dtype, copy=True): elif copy is True: return self.copy() return self + elif is_categorical_dtype(dtype): + from pandas.core.indexes.category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) elif is_string_dtype(dtype): return Index(self.format(), name=self.name, dtype=object) elif is_period_dtype(dtype): diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 0cbb87c65ccd7..621e1950bde71 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -17,6 +17,7 @@ is_object_dtype, is_iterator, is_list_like, + pandas_dtype, is_scalar) from pandas.core.dtypes.missing import isna, array_equivalent from pandas.errors import PerformanceWarning, UnsortedIndexError @@ -2715,7 +2716,7 @@ def difference(self, other): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): - if not is_object_dtype(np.dtype(dtype)): + if not is_object_dtype(pandas_dtype(dtype)): raise TypeError('Setting %s dtype to anything other than object ' 'is not supported' % self.__class__) elif copy is True: diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 72aeafbe7e1ab..8d8c422dcf560 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -7,6 +7,7 @@ is_float_dtype, is_object_dtype, is_integer_dtype, + is_categorical_dtype, is_bool, is_bool_dtype, is_scalar) @@ -321,10 +322,14 @@ def astype(self, dtype, copy=True): values = self._values.astype(dtype, copy=copy) elif is_object_dtype(dtype): values = self._values.astype('object', copy=copy) + elif is_categorical_dtype(dtype): + from pandas.core.indexes.category import CategoricalIndex + return CategoricalIndex(self, name=self.name, dtype=dtype, + copy=copy) else: - raise TypeError('Setting %s dtype to anything other than ' - 'float64 or object is not supported' % - self.__class__) + raise TypeError('Setting {cls} dtype to anything other than ' + 'float64, object, or category is not supported' + .format(cls=self.__class__)) return Index(values, name=self.name, dtype=dtype) @Appender(_index_shared_docs['_convert_scalar_indexer']) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 8b541bdce39ed..c2490b27190ec 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -16,6 +16,7 @@ is_timedelta64_dtype, is_period_dtype, is_bool_dtype, + is_categorical_dtype, pandas_dtype, _ensure_object) from pandas.core.dtypes.dtypes import PeriodDtype @@ -517,6 +518,10 @@ def astype(self, dtype, copy=True, how='start'): return self.to_timestamp(how=how).tz_localize(dtype.tz) elif is_period_dtype(dtype): return self.asfreq(freq=dtype.freq) + elif is_categorical_dtype(dtype): + from pandas.core.indexes.category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) raise TypeError('Cannot cast PeriodIndex to dtype %s' % dtype) @Substitution(klass='PeriodIndex') diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 77e05ccf4db22..4c008ab1f96b3 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -12,6 +12,8 @@ is_object_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, + is_categorical_dtype, + pandas_dtype, _ensure_int64) from pandas.core.dtypes.missing import isna from pandas.core.dtypes.generic import ABCSeries @@ -479,7 +481,7 @@ def to_pytimedelta(self): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): - dtype = np.dtype(dtype) + dtype = pandas_dtype(dtype) if is_object_dtype(dtype): return self._box_values_as_index() @@ -498,6 +500,10 @@ def astype(self, dtype, copy=True): elif is_integer_dtype(dtype): return Index(self.values.astype('i8', copy=copy), dtype='i8', name=self.name) + elif is_categorical_dtype(dtype): + from pandas.core.indexes.category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) raise TypeError('Cannot cast TimedeltaIndex to dtype %s' % dtype) def union(self, other): diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index c1ee18526cc01..ef531d4c2ebc4 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -13,6 +13,7 @@ from pandas.core.indexes.base import InvalidIndexError from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin from pandas.core.dtypes.common import needs_i8_conversion +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas._libs.tslib import iNaT import pandas.util.testing as tm @@ -1058,3 +1059,20 @@ def test_putmask_with_wrong_mask(self): with pytest.raises(ValueError): index.putmask('foo', 1) + + def test_astype_category(self): + # GH 18630 + index = self.create_index() + + expected = CategoricalIndex(index.values) + result = index.astype('category', copy=True) + tm.assert_index_equal(result, expected) + + expected = CategoricalIndex(index.values, name='foo') + result = index.rename('foo').astype('category', copy=False) + tm.assert_index_equal(result, expected) + + dtype = CategoricalDtype(index.unique()[:-1], ordered=True) + expected = CategoricalIndex(index.values, dtype=dtype) + result = index.astype(dtype) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index 1850ff2795a24..dad64707271d6 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -6,6 +6,7 @@ from pandas import ( Interval, IntervalIndex, Index, isna, notna, interval_range, Timestamp, Timedelta, compat, date_range, timedelta_range, DateOffset) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.compat import lzip from pandas.tseries.offsets import Day from pandas._libs.interval import IntervalTree @@ -376,8 +377,15 @@ def test_astype(self, closed): tm.assert_index_equal(result, idx) assert result.equals(idx) - result = idx.astype('category') + def test_astype_category(self, closed): + # GH 18630 + idx = self.create_index(closed=closed) expected = pd.Categorical(idx, ordered=True) + + result = idx.astype('category') + tm.assert_categorical_equal(result, expected) + + result = idx.astype(CategoricalDtype()) tm.assert_categorical_equal(result, expected) @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index e86b786e0d717..5d3249360fd7d 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -16,6 +16,7 @@ compat, date_range, period_range) from pandas.compat import PY3, long, lrange, lzip, range, u, PYPY from pandas.errors import PerformanceWarning, UnsortedIndexError +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.indexes.base import InvalidIndexError from pandas._libs import lib from pandas._libs.lib import Timestamp @@ -554,6 +555,15 @@ def test_astype(self): with tm.assert_raises_regex(TypeError, "^Setting.*dtype.*object"): self.index.astype(np.dtype(int)) + def test_astype_category(self): + # GH 18630 + msg = 'Setting .* dtype to anything other than object is not supported' + with tm.assert_raises_regex(TypeError, msg): + self.index.astype('category') + + with tm.assert_raises_regex(TypeError, msg): + self.index.astype(CategoricalDtype()) + def test_constructor_single_level(self): result = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], labels=[[0, 1, 2, 3]], names=['first']) From 6d953e4a5eb5593d38d26a7e18438f4cb198635f Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 8 Dec 2017 01:03:01 -0700 Subject: [PATCH 2/6] review updates --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/indexes/category.py | 22 +++++++++++++---- pandas/core/indexes/interval.py | 5 ++-- pandas/core/indexes/multi.py | 12 ++++++--- pandas/tests/indexes/common.py | 28 ++++++++++++++------- pandas/tests/indexes/test_category.py | 35 ++++++++++++++++++++++++--- pandas/tests/indexes/test_interval.py | 12 --------- pandas/tests/indexes/test_multi.py | 17 +++++++------ 8 files changed, 91 insertions(+), 42 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index d76bfb29a8da3..29c6dcaef19ab 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -259,7 +259,7 @@ Conversion - Fixed a bug where creating a Series from an array that contains both tz-naive and tz-aware values will result in a Series whose dtype is tz-aware instead of object (:issue:`16406`) - Adding a ``Period`` object to a ``datetime`` or ``Timestamp`` object will now correctly raise a ``TypeError`` (:issue:`17983`) - Fixed a bug where ``FY5253`` date offsets could incorrectly raise an ``AssertionError`` in arithmetic operatons (:issue:`14774`) -- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index would not be converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) +- Bug in :meth:`Index.astype` with a categorical dtype where the resultant index is not converted to a :class:`CategoricalIndex` for all types of index (:issue:`18630`) Indexing diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 055ae823b4587..9273e46d13be7 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -4,6 +4,7 @@ from pandas import compat from pandas.compat.numpy import function as nv from pandas.core.dtypes.generic import ABCCategorical, ABCSeries +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.common import ( is_categorical_dtype, _ensure_platform_int, @@ -166,8 +167,6 @@ def _create_categorical(self, data, categories=None, ordered=None, data = Categorical(data, categories=categories, ordered=ordered, dtype=dtype) else: - from pandas.core.dtypes.dtypes import CategoricalDtype - if categories is not None: data = data.set_categories(categories, ordered=ordered) elif ordered is not None and ordered != data.ordered: @@ -346,9 +345,22 @@ def astype(self, dtype, copy=True): if is_interval_dtype(dtype): from pandas import IntervalIndex return IntervalIndex.from_intervals(np.array(self)) - elif is_categorical_dtype(dtype) and (dtype == self.dtype): - # fastpath if dtype is the same current - return self.copy() if copy else self + elif is_categorical_dtype(dtype): + # want to maintain existing categories/ordered if they are None + if dtype.categories is None: + new_categories = self.categories + else: + new_categories = dtype.categories + if dtype.ordered is None: + new_ordered = self.ordered + else: + new_ordered = dtype.ordered + dtype = CategoricalDtype(new_categories, new_ordered) + + # fastpath if dtypes are equal + if dtype == self.dtype: + return self.copy() if copy else self + return super(CategoricalIndex, self).astype(dtype=dtype, copy=copy) @cache_readonly diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index a32e79920db41..7d5da4139d06f 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -632,8 +632,9 @@ def astype(self, dtype, copy=True): elif is_object_dtype(dtype): return Index(self.values, dtype=object) elif is_categorical_dtype(dtype): - from pandas import Categorical - return Categorical(self, ordered=True) + from pandas.core.indexes.category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) raise ValueError('Cannot cast IntervalIndex to dtype {dtype}' .format(dtype=dtype)) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 621e1950bde71..c20c6e1f75a24 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -14,6 +14,7 @@ from pandas.core.dtypes.common import ( _ensure_int64, _ensure_platform_int, + is_categorical_dtype, is_object_dtype, is_iterator, is_list_like, @@ -2716,9 +2717,14 @@ def difference(self, other): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): - if not is_object_dtype(pandas_dtype(dtype)): - raise TypeError('Setting %s dtype to anything other than object ' - 'is not supported' % self.__class__) + dtype = pandas_dtype(dtype) + if is_categorical_dtype(dtype): + msg = '> 1 ndim Categorical are not supported at this time' + raise NotImplementedError(msg) + elif not is_object_dtype(dtype): + msg = ('Setting {cls} dtype to anything other than object ' + 'is not supported').format(cls=self.__class__) + raise TypeError(msg) elif copy is True: return self._shallow_copy() return self diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index ef531d4c2ebc4..07e84ad60ef51 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -1060,19 +1060,29 @@ def test_putmask_with_wrong_mask(self): with pytest.raises(ValueError): index.putmask('foo', 1) - def test_astype_category(self): + @pytest.mark.parametrize('copy', [True, False]) + @pytest.mark.parametrize('name', [None, 'foo']) + @pytest.mark.parametrize('ordered', [True, False]) + def test_astype_category(self, copy, name, ordered): # GH 18630 index = self.create_index() + if name: + index = index.rename(name) - expected = CategoricalIndex(index.values) - result = index.astype('category', copy=True) + # standard categories + dtype = CategoricalDtype(ordered=ordered) + result = index.astype(dtype, copy=copy) + expected = CategoricalIndex(index.values, name=name, ordered=ordered) tm.assert_index_equal(result, expected) - expected = CategoricalIndex(index.values, name='foo') - result = index.rename('foo').astype('category', copy=False) + # non-standard categories + dtype = CategoricalDtype(index.unique().tolist()[:-1], ordered) + result = index.astype(dtype, copy=copy) + expected = CategoricalIndex(index.values, name=name, dtype=dtype) tm.assert_index_equal(result, expected) - dtype = CategoricalDtype(index.unique()[:-1], ordered=True) - expected = CategoricalIndex(index.values, dtype=dtype) - result = index.astype(dtype) - tm.assert_index_equal(result, expected) + if ordered is False: + # dtype='category' defaults to ordered=False, so only test once + result = index.astype('category', copy=copy) + expected = CategoricalIndex(index.values, name=name) + tm.assert_index_equal(result, expected) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index c2eee4e437347..5529af791d500 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -388,9 +388,6 @@ def test_delete(self): def test_astype(self): ci = self.create_index() - result = ci.astype('category') - tm.assert_index_equal(result, ci, exact=True) - result = ci.astype(object) tm.assert_index_equal(result, Index(np.array(ci))) @@ -414,6 +411,38 @@ def test_astype(self): result = IntervalIndex.from_intervals(result.values) tm.assert_index_equal(result, expected) + @pytest.mark.parametrize('copy', [True, False]) + @pytest.mark.parametrize('name', [None, 'foo']) + @pytest.mark.parametrize('dtype_ordered', [True, False]) + @pytest.mark.parametrize('index_ordered', [True, False]) + def test_astype_category(self, copy, name, dtype_ordered, index_ordered): + # GH 18630 + index = self.create_index(ordered=index_ordered) + if name: + index = index.rename(name) + + # standard categories + dtype = CategoricalDtype(ordered=dtype_ordered) + result = index.astype(dtype, copy=copy) + expected = CategoricalIndex(index.tolist(), + name=name, + categories=index.categories, + ordered=dtype_ordered) + tm.assert_index_equal(result, expected) + + # non-standard categories + dtype = CategoricalDtype(index.unique().tolist()[:-1], dtype_ordered) + result = index.astype(dtype, copy=copy) + expected = CategoricalIndex(index.tolist(), name=name, dtype=dtype) + tm.assert_index_equal(result, expected) + + if dtype_ordered is False: + # dtype='category' defaults to ordered=False, so only test once + result = index.astype('category', copy=copy) + expected = CategoricalIndex( + index.tolist(), categories=index.categories, name=name) + tm.assert_index_equal(result, expected) + def test_reindex_base(self): # Determined by cat ordering. idx = CategoricalIndex(list("cab"), categories=list("cab")) diff --git a/pandas/tests/indexes/test_interval.py b/pandas/tests/indexes/test_interval.py index dad64707271d6..abad930793d7f 100644 --- a/pandas/tests/indexes/test_interval.py +++ b/pandas/tests/indexes/test_interval.py @@ -6,7 +6,6 @@ from pandas import ( Interval, IntervalIndex, Index, isna, notna, interval_range, Timestamp, Timedelta, compat, date_range, timedelta_range, DateOffset) -from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.compat import lzip from pandas.tseries.offsets import Day from pandas._libs.interval import IntervalTree @@ -377,17 +376,6 @@ def test_astype(self, closed): tm.assert_index_equal(result, idx) assert result.equals(idx) - def test_astype_category(self, closed): - # GH 18630 - idx = self.create_index(closed=closed) - expected = pd.Categorical(idx, ordered=True) - - result = idx.astype('category') - tm.assert_categorical_equal(result, expected) - - result = idx.astype(CategoricalDtype()) - tm.assert_categorical_equal(result, expected) - @pytest.mark.parametrize('klass', [list, tuple, np.array, pd.Series]) def test_where(self, closed, klass): idx = self.create_index(closed=closed) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 5d3249360fd7d..510ca6ac83ec0 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -555,14 +555,17 @@ def test_astype(self): with tm.assert_raises_regex(TypeError, "^Setting.*dtype.*object"): self.index.astype(np.dtype(int)) - def test_astype_category(self): + @pytest.mark.parametrize('ordered', [True, False]) + def test_astype_category(self, ordered): # GH 18630 - msg = 'Setting .* dtype to anything other than object is not supported' - with tm.assert_raises_regex(TypeError, msg): - self.index.astype('category') - - with tm.assert_raises_regex(TypeError, msg): - self.index.astype(CategoricalDtype()) + msg = '> 1 ndim Categorical are not supported at this time' + with tm.assert_raises_regex(NotImplementedError, msg): + self.index.astype(CategoricalDtype(ordered=ordered)) + + if ordered is False: + # dtype='category' defaults to ordered=False, so only test once + with tm.assert_raises_regex(NotImplementedError, msg): + self.index.astype('category') def test_constructor_single_level(self): result = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux']], From 20b55049952ef67331c41e417b49122e82f5d14e Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 8 Dec 2017 11:44:27 -0700 Subject: [PATCH 3/6] Make CI.astype('category') not change anything --- pandas/core/indexes/category.py | 12 ++++++------ pandas/tests/indexes/test_category.py | 5 ++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 9273e46d13be7..2a67dd07dbedb 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -341,21 +341,21 @@ def __array__(self, dtype=None): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): + if isinstance(dtype, compat.string_types) and dtype == 'category': + # GH 18630: CI.astype('category') should not change anything + return self.copy() if copy else self + dtype = pandas_dtype(dtype) if is_interval_dtype(dtype): from pandas import IntervalIndex return IntervalIndex.from_intervals(np.array(self)) elif is_categorical_dtype(dtype): - # want to maintain existing categories/ordered if they are None + # GH 18630: keep current categories if None (ordered can't be None) if dtype.categories is None: new_categories = self.categories else: new_categories = dtype.categories - if dtype.ordered is None: - new_ordered = self.ordered - else: - new_ordered = dtype.ordered - dtype = CategoricalDtype(new_categories, new_ordered) + dtype = CategoricalDtype(new_categories, dtype.ordered) # fastpath if dtypes are equal if dtype == self.dtype: diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 5529af791d500..ae9e011d76597 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -437,10 +437,9 @@ def test_astype_category(self, copy, name, dtype_ordered, index_ordered): tm.assert_index_equal(result, expected) if dtype_ordered is False: - # dtype='category' defaults to ordered=False, so only test once + # dtype='category' can't specify ordered, so only test once result = index.astype('category', copy=copy) - expected = CategoricalIndex( - index.tolist(), categories=index.categories, name=name) + expected = index tm.assert_index_equal(result, expected) def test_reindex_base(self): From a90acecb15c028e56d2e373f6a0c1280c26c7fa4 Mon Sep 17 00:00:00 2001 From: jschendel Date: Fri, 8 Dec 2017 17:07:37 -0700 Subject: [PATCH 4/6] Fix tests broken by II.astype('category') changes --- pandas/tests/reshape/test_tile.py | 21 ++++++++++++--------- pandas/tests/test_algos.py | 5 +++-- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/pandas/tests/reshape/test_tile.py b/pandas/tests/reshape/test_tile.py index 4edce8af92f84..c27af7a5bf8e4 100644 --- a/pandas/tests/reshape/test_tile.py +++ b/pandas/tests/reshape/test_tile.py @@ -4,9 +4,8 @@ import numpy as np from pandas.compat import zip -from pandas import (Series, Index, isna, - to_datetime, DatetimeIndex, Timestamp, - Interval, IntervalIndex, Categorical, +from pandas import (Series, isna, to_datetime, DatetimeIndex, + Timestamp, Interval, IntervalIndex, Categorical, cut, qcut, date_range) import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -29,7 +28,8 @@ def test_bins(self): result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - expected = intervals.take([0, 0, 0, 1, 2, 0]).astype('category') + intervals = intervals.take([0, 0, 0, 1, 2, 0]) + expected = Categorical(intervals, ordered=True) tm.assert_categorical_equal(result, expected) tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) @@ -38,7 +38,8 @@ def test_right(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=True, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - expected = intervals.astype('category').take([0, 0, 0, 2, 3, 0, 0]) + expected = Categorical(intervals, ordered=True) + expected = expected.take([0, 0, 0, 2, 3, 0, 0]) tm.assert_categorical_equal(result, expected) tm.assert_almost_equal(bins, np.array([0.1905, 2.575, 4.95, 7.325, 9.7])) @@ -47,7 +48,8 @@ def test_noright(self): data = np.array([.2, 1.4, 2.5, 6.2, 9.7, 2.1, 2.575]) result, bins = cut(data, 4, right=False, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3), closed='left') - expected = intervals.take([0, 0, 0, 2, 3, 0, 1]).astype('category') + intervals = intervals.take([0, 0, 0, 2, 3, 0, 1]) + expected = Categorical(intervals, ordered=True) tm.assert_categorical_equal(result, expected) tm.assert_almost_equal(bins, np.array([0.2, 2.575, 4.95, 7.325, 9.7095])) @@ -56,7 +58,8 @@ def test_arraylike(self): data = [.2, 1.4, 2.5, 6.2, 9.7, 2.1] result, bins = cut(data, 3, retbins=True) intervals = IntervalIndex.from_breaks(bins.round(3)) - expected = intervals.take([0, 0, 0, 1, 2, 0]).astype('category') + intervals = intervals.take([0, 0, 0, 1, 2, 0]) + expected = Categorical(intervals, ordered=True) tm.assert_categorical_equal(result, expected) tm.assert_almost_equal(bins, np.array([0.1905, 3.36666667, 6.53333333, 9.7])) @@ -249,8 +252,8 @@ def test_qcut_nas(self): def test_qcut_index(self): result = qcut([0, 2], 2) - expected = Index([Interval(-0.001, 1), Interval(1, 2)]).astype( - 'category') + intervals = [Interval(-0.001, 1), Interval(1, 2)] + expected = Categorical(intervals, ordered=True) tm.assert_categorical_equal(result, expected) def test_round_frac(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 69c8f90a57e9c..7ef77e4c78e10 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -19,6 +19,7 @@ import pandas.core.algorithms as algos from pandas.core.common import _asarray_tuplesafe import pandas.util.testing as tm +from pandas.core.dtypes.dtypes import CategoricalDtype as CDT from pandas.compat.numpy import np_array_datetime64_compat from pandas.util.testing import assert_almost_equal @@ -565,8 +566,8 @@ def test_value_counts(self): # assert isinstance(factor, n) result = algos.value_counts(factor) breaks = [-1.194, -0.535, 0.121, 0.777, 1.433] - expected_index = IntervalIndex.from_breaks(breaks).astype('category') - expected = Series([1, 1, 1, 1], index=expected_index) + index = IntervalIndex.from_breaks(breaks).astype(CDT(ordered=True)) + expected = Series([1, 1, 1, 1], index=index) tm.assert_series_equal(result.sort_index(), expected.sort_index()) def test_value_counts_bins(self): From afcc50a37f11d171d37026c2e0397d0c6136564d Mon Sep 17 00:00:00 2001 From: jschendel Date: Sat, 9 Dec 2017 13:08:49 -0700 Subject: [PATCH 5/6] Move CI imports --- pandas/core/indexes/base.py | 2 +- pandas/core/indexes/datetimes.py | 2 +- pandas/core/indexes/interval.py | 2 +- pandas/core/indexes/numeric.py | 2 +- pandas/core/indexes/period.py | 2 +- pandas/core/indexes/timedeltas.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index a80c5994940a6..9557261e61463 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -1054,7 +1054,7 @@ def _to_embed(self, keep_tz=False, dtype=None): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): if is_categorical_dtype(dtype): - from pandas.core.indexes.category import CategoricalIndex + from .category import CategoricalIndex return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) return Index(self.values.astype(dtype, copy=copy), name=self.name, diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 36fcf058d938b..38e8c24de4bdf 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -36,6 +36,7 @@ from pandas.core.algorithms import checked_add_with_arr from pandas.core.indexes.base import Index, _index_shared_docs +from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.numeric import Int64Index, Float64Index import pandas.compat as compat from pandas.tseries.frequencies import ( @@ -917,7 +918,6 @@ def astype(self, dtype, copy=True): return self.copy() return self elif is_categorical_dtype(dtype): - from pandas.core.indexes.category import CategoricalIndex return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) elif is_string_dtype(dtype): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 7d5da4139d06f..292b0f638f821 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -29,6 +29,7 @@ Interval, IntervalMixin, IntervalTree, intervals_to_interval_bounds) +from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.datetimes import date_range from pandas.core.indexes.timedeltas import timedelta_range from pandas.core.indexes.multi import MultiIndex @@ -632,7 +633,6 @@ def astype(self, dtype, copy=True): elif is_object_dtype(dtype): return Index(self.values, dtype=object) elif is_categorical_dtype(dtype): - from pandas.core.indexes.category import CategoricalIndex return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) raise ValueError('Cannot cast IntervalIndex to dtype {dtype}' diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 8d8c422dcf560..5fc9cb47362d6 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -17,6 +17,7 @@ from pandas.core import algorithms from pandas.core.indexes.base import ( Index, InvalidIndexError, _index_shared_docs) +from pandas.core.indexes.category import CategoricalIndex from pandas.util._decorators import Appender, cache_readonly import pandas.core.dtypes.concat as _concat import pandas.core.indexes.base as ibase @@ -323,7 +324,6 @@ def astype(self, dtype, copy=True): elif is_object_dtype(dtype): values = self._values.astype('object', copy=copy) elif is_categorical_dtype(dtype): - from pandas.core.indexes.category import CategoricalIndex return CategoricalIndex(self, name=self.name, dtype=dtype, copy=copy) else: diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index c2490b27190ec..64756906d8a63 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -24,6 +24,7 @@ import pandas.tseries.frequencies as frequencies from pandas.tseries.frequencies import get_freq_code as _gfc +from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.datetimes import DatetimeIndex, Int64Index, Index from pandas.core.indexes.timedeltas import TimedeltaIndex from pandas.core.indexes.datetimelike import DatelikeOps, DatetimeIndexOpsMixin @@ -519,7 +520,6 @@ def astype(self, dtype, copy=True, how='start'): elif is_period_dtype(dtype): return self.asfreq(freq=dtype.freq) elif is_categorical_dtype(dtype): - from pandas.core.indexes.category import CategoricalIndex return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) raise TypeError('Cannot cast PeriodIndex to dtype %s' % dtype) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 4c008ab1f96b3..25c764b138465 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -20,6 +20,7 @@ from pandas.core.common import _maybe_box, _values_from_object from pandas.core.indexes.base import Index +from pandas.core.indexes.category import CategoricalIndex from pandas.core.indexes.numeric import Int64Index import pandas.compat as compat from pandas.compat import u @@ -501,7 +502,6 @@ def astype(self, dtype, copy=True): return Index(self.values.astype('i8', copy=copy), dtype='i8', name=self.name) elif is_categorical_dtype(dtype): - from pandas.core.indexes.category import CategoricalIndex return CategoricalIndex(self.values, name=self.name, dtype=dtype, copy=copy) raise TypeError('Cannot cast TimedeltaIndex to dtype %s' % dtype) From 60421314618f39a77f5d4a94a4b3f2462118e7fa Mon Sep 17 00:00:00 2001 From: jschendel Date: Mon, 11 Dec 2017 00:39:34 -0700 Subject: [PATCH 6/6] refactor dtype update --- pandas/core/dtypes/dtypes.py | 27 ++++++++++++++++++++++ pandas/core/indexes/category.py | 18 +++------------ pandas/tests/dtypes/test_dtypes.py | 36 ++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 15 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 040b735f8de2c..3a8edf9f066ee 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -340,6 +340,33 @@ def _validate_categories(categories, fastpath=False): return categories + def _update_dtype(self, dtype): + """ + Returns a CategoricalDtype with categories and ordered taken from dtype + if specified, otherwise falling back to self if unspecified + + Parameters + ---------- + dtype : CategoricalDtype + + Returns + ------- + new_dtype : CategoricalDtype + """ + if isinstance(dtype, compat.string_types) and dtype == 'category': + # dtype='category' should not change anything + return self + elif not self.is_dtype(dtype): + msg = ('a CategoricalDtype must be passed to perform an update, ' + 'got {dtype!r}').format(dtype=dtype) + raise ValueError(msg) + + # dtype is CDT: keep current categories if None (ordered can't be None) + new_categories = dtype.categories + if new_categories is None: + new_categories = self.categories + return CategoricalDtype(new_categories, dtype.ordered) + @property def categories(self): """ diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 2a67dd07dbedb..241907a54f393 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -10,8 +10,7 @@ _ensure_platform_int, is_list_like, is_interval_dtype, - is_scalar, - pandas_dtype) + is_scalar) from pandas.core.common import (_asarray_tuplesafe, _values_from_object) from pandas.core.dtypes.missing import array_equivalent, isna @@ -341,23 +340,12 @@ def __array__(self, dtype=None): @Appender(_index_shared_docs['astype']) def astype(self, dtype, copy=True): - if isinstance(dtype, compat.string_types) and dtype == 'category': - # GH 18630: CI.astype('category') should not change anything - return self.copy() if copy else self - - dtype = pandas_dtype(dtype) if is_interval_dtype(dtype): from pandas import IntervalIndex return IntervalIndex.from_intervals(np.array(self)) elif is_categorical_dtype(dtype): - # GH 18630: keep current categories if None (ordered can't be None) - if dtype.categories is None: - new_categories = self.categories - else: - new_categories = dtype.categories - dtype = CategoricalDtype(new_categories, dtype.ordered) - - # fastpath if dtypes are equal + # GH 18630 + dtype = self.dtype._update_dtype(dtype) if dtype == self.dtype: return self.copy() if copy else self diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 84e6f0d4f5a7a..d8e16482a414e 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -9,6 +9,7 @@ from pandas import ( Series, Categorical, CategoricalIndex, IntervalIndex, date_range) +from pandas.compat import string_types from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, IntervalDtype, CategoricalDtype) @@ -123,6 +124,41 @@ def test_tuple_categories(self): result = CategoricalDtype(categories) assert all(result.categories == categories) + @pytest.mark.parametrize('dtype', [ + CategoricalDtype(list('abc'), False), + CategoricalDtype(list('abc'), True)]) + @pytest.mark.parametrize('new_dtype', [ + 'category', + CategoricalDtype(None, False), + CategoricalDtype(None, True), + CategoricalDtype(list('abc'), False), + CategoricalDtype(list('abc'), True), + CategoricalDtype(list('cba'), False), + CategoricalDtype(list('cba'), True), + CategoricalDtype(list('wxyz'), False), + CategoricalDtype(list('wxyz'), True)]) + def test_update_dtype(self, dtype, new_dtype): + if isinstance(new_dtype, string_types) and new_dtype == 'category': + expected_categories = dtype.categories + expected_ordered = dtype.ordered + else: + expected_categories = new_dtype.categories + if expected_categories is None: + expected_categories = dtype.categories + expected_ordered = new_dtype.ordered + + result = dtype._update_dtype(new_dtype) + tm.assert_index_equal(result.categories, expected_categories) + assert result.ordered is expected_ordered + + @pytest.mark.parametrize('bad_dtype', [ + 'foo', object, np.int64, PeriodDtype('Q'), IntervalDtype(object)]) + def test_update_dtype_errors(self, bad_dtype): + dtype = CategoricalDtype(list('abc'), False) + msg = 'a CategoricalDtype must be passed to perform an update, ' + with tm.assert_raises_regex(ValueError, msg): + dtype._update_dtype(bad_dtype) + class TestDatetimeTZDtype(Base):