diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 090998570a358..572c2b1dcd8fd 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -96,12 +96,14 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df -You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``: +You can also specify differently ordered categories or make the resulting data +ordered by passing a :class:`~pandas.CategoricalDtype`: .. ipython:: python s = pd.Series(["a","b","c","a"]) - s_cat = s.astype("category", categories=["b","c","d"], ordered=False) + cat_type = pd.CategoricalDtype(categories=["b", "c", "d"], ordered=False) + s_cat = s.astype(cat_type) s_cat Categorical data has a specific ``category`` :ref:`dtype <basics.dtypes>`: @@ -141,6 +143,20 @@ constructor to save the factorize step during normal constructor mode: splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) + +CategoricalDtype +---------------- + +A categorical's type is fully described by 1.) its categories (an iterable), +and 2.) its orderedness (a boolean). +This information can be stored in a :class:`~pandas.CategoricalDtype` and passed to +any place pandas expects a ``dtype``, for example :func:`pandas.read_csv`, +:func:`pandas.DataFrame.astype`, the Series constructor, etc. + +As a convenience, you can use the string ``'category'`` in place of a +:class:`pandas.CategoricalDtype` when you want the default behavior of +the categories being unordered, and equal to the set of values present in the array. 
+ Description ----------- diff --git a/pandas/core/api.py b/pandas/core/api.py index b5e1de2063c7e..d12287f6ab5a3 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -5,6 +5,7 @@ import numpy as np from pandas.core.algorithms import factorize, match, unique, value_counts +from pandas.types.dtypes import CategoricalDtype from pandas.types.missing import isnull, notnull from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 43beefffd448e..6083c15e51e95 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -470,6 +470,12 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): + kwargs = kwargs.copy() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + # should we raise if CategoricalDtype and passed in kwargs? 
+ kwargs.setdefault('categories', categories) + kwargs.setdefault('ordered', ordered) return self.make_block(Categorical(self.values, **kwargs)) # astype processing diff --git a/pandas/core/series.py b/pandas/core/series.py index 105e39562f561..bba7cd4e326a2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2837,7 +2837,9 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): - subarr = Categorical(arr) + # dtype may lack these attributes (e.g. the string 'category') + subarr = Categorical(arr, getattr(dtype, 'categories', None), + ordered=getattr(dtype, 'ordered', False)) elif dtype is not None and raise_cast_failure: raise else: diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ed7b0fda19cb7..18c8d8eaaa450 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -162,6 +162,24 @@ def test_constructor_categorical(self): self.assertTrue(is_categorical_dtype(s)) self.assertTrue(is_categorical_dtype(s.dtype)) + def test_constructor_categorical_dtype(self): + result = pd.Series(['a', 'b'], + dtype=pd.CategoricalDtype(['a', 'b', 'c'], + ordered=True)) + self.assertTrue(is_categorical_dtype(result)) + tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c'])) + self.assertTrue(result.cat.ordered) + + result = pd.Series(['a', 'b'], dtype=pd.CategoricalDtype(['b', 'a'])) + self.assertTrue(is_categorical_dtype(result)) + tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a'])) + self.assertFalse(result.cat.ordered) + + result = pd.Series(['a', 'b', 'c'], + dtype=pd.CategoricalDtype(['a', 'b'])) + expected = pd.Series(pd.Categorical(['a', 'b', np.nan])) + tm.assert_series_equal(result, expected) + def test_constructor_maskedarray(self): data = ma.masked_all((3, ), dtype=float) result = Series(data) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 9a406dfa10c35..38ec58c974f97 100644 --- 
a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -8,7 +8,7 @@ from numpy import nan import numpy as np -from pandas import Series +from pandas import Series, CategoricalDtype, Categorical, Index from pandas.tseries.index import Timestamp from pandas.tseries.tdi import Timedelta @@ -149,6 +149,23 @@ def test_astype_dict(self): self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str}) self.assertRaises(KeyError, s.astype, {0: str}) + def test_astype_categorical(self): + s = Series(['a', 'b', 'a']) + result = s.astype(CategoricalDtype(['a', 'b'], ordered=True)) + expected = Series(Categorical(['a', 'b', 'a'], ordered=True)) + assert_series_equal(result, expected) + + result = s.astype(CategoricalDtype(['a', 'b'], ordered=False)) + expected = Series(Categorical(['a', 'b', 'a'], ordered=False)) + assert_series_equal(result, expected) + + result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False)) + expected = Series(Categorical(['a', 'b', 'a'], + categories=['a', 'b', 'c'], + ordered=False)) + assert_series_equal(result, expected) + tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c'])) + def test_complexx(self): # GH4819 # complex access for ndarray compat diff --git a/pandas/types/common.py b/pandas/types/common.py index 754ff80924c07..50b2bc253e11a 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -319,7 +319,10 @@ def is_complex_dtype(arr_or_dtype): def _coerce_to_dtype(dtype): """ coerce a string / np.dtype to a dtype """ if is_categorical_dtype(dtype): - dtype = CategoricalDtype() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + # pass through categories/ordered (defaults when dtype lacks them) + dtype = CategoricalDtype(categories=categories, ordered=ordered) elif is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): diff --git a/pandas/types/dtypes.py b/pandas/types/dtypes.py index 5b6d7905d4095..58d0410f45090 100644 --- 
a/pandas/types/dtypes.py +++ b/pandas/types/dtypes.py @@ -1,6 +1,7 @@ """ define extension dtypes """ import re +import reprlib import numpy as np from pandas import compat @@ -98,11 +99,54 @@ class CategoricalDtypeType(type): class CategoricalDtype(ExtensionDtype): """ + Type for categorical data with the categories and orderedness, + but not the values. + + .. versionadded:: 0.20.0 + + Parameters + ---------- + categories : iterable or None, default None + ordered : bool, default False + + Examples + -------- + >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> s = Series(['a', 'a', 'b', 'b', 'a']) + >>> s.astype(t) + 0 a + 1 a + 2 b + 3 b + 4 a + dtype: category + Categories (2, object): [b < a] + + Notes + ----- + An instance of ``CategoricalDtype`` compares equal with any other + instance of ``CategoricalDtype``, regardless of categories or ordered. + In addition, they compare equal to the string ``'category'``. + To check whether two instances of a ``CategoricalDtype`` match, + use the ``is`` operator. + + >>> t1 = CategoricalDtype(['a', 'b'], ordered=True) + >>> t2 = CategoricalDtype(['a', 'c'], ordered=False) + >>> t1 == t2 + True + >>> t1 == 'category' + True + >>> t1 is t2 + False + >>> t1 is CategoricalDtype(['a', 'b'], ordered=True) + True + A np.dtype duck-typed class, suitable for holding a custom categorical dtype. THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object """ + # TODO: Document public vs. 
private API name = 'category' type = CategoricalDtypeType kind = 'O' @@ -110,13 +154,18 @@ class CategoricalDtype(ExtensionDtype): base = np.dtype('O') _cache = {} - def __new__(cls): + def __new__(cls, categories=None, ordered=False): + categories_ = categories if categories is None else tuple(categories) + t = (categories_, ordered) try: - return cls._cache[cls.name] + return cls._cache[t] except KeyError: c = object.__new__(cls) - cls._cache[cls.name] = c + c.ordered = ordered + # store the tuple snapshot, not the caller's mutable list + c.categories = categories_ + cls._cache[t] = c return c def __hash__(self): @@ -129,6 +178,15 @@ def __eq__(self, other): return isinstance(other, CategoricalDtype) + # def __unicode__(self): + # tpl = 'CategoricalDtype({!r}, ordered={})' + # return tpl.format(reprlib.repr(self.categories), self.ordered) + + # def __repr__(self): + # """ return the base repr for the categories """ + # tpl = 'CategoricalDtype({!r}, ordered={})' + # return tpl.format(reprlib.repr(self.categories), self.ordered) + @classmethod def construct_from_string(cls, string): """ attempt to construct this type from a string, raise a TypeError if