From 9b2d05f3ae1e529e9fe6c90d5021c7cdaf4e34fe Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 19 Nov 2016 20:24:30 -0600 Subject: [PATCH 1/3] API: CategoricalType for specifying categoricals Simple implementation for now. --- pandas/core/api.py | 2 +- pandas/core/categorical.py | 45 ++++++++++++++++++++++++++++++ pandas/core/internals.py | 6 ++++ pandas/tests/series/test_dtypes.py | 8 +++++- 4 files changed, 59 insertions(+), 2 deletions(-) diff --git a/pandas/core/api.py b/pandas/core/api.py index b5e1de2063c7e..56f648c46fbdc 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -6,7 +6,7 @@ from pandas.core.algorithms import factorize, match, unique, value_counts from pandas.types.missing import isnull, notnull -from pandas.core.categorical import Categorical +from pandas.core.categorical import Categorical, CategoricalType from pandas.core.groupby import Grouper from pandas.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index fd1a23a5bab7f..516cb16192922 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -2066,3 +2066,48 @@ def _factorize_from_iterables(iterables): # For consistency, it should return a list of 2 lists. return [[], []] return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables])) + + +class CategoricalType(CategoricalDtype): + """ + Type for categorical data with the categories and orderedness, + but not the values + + Parameters + ---------- + categories : list or None + ordered : bool, default False + + Notes + ----- + `categories=None` implies infer in whatever operation you're + doing. 
+ + Examples + -------- + >>> t = CategoricalType(categories=['b', 'a'], ordered=True) + >>> s = Series(['a', 'a', 'b', 'b', 'a']) + >>> s.astype(t) + 0 a + 1 a + 2 b + 3 b + 4 a + dtype: category + Categories (2, object): [b < a] + """ + dtype = 'category' + name = 'category' + + def __new__(cls, categories=None, ordered=False): + self = object.__new__(cls) + self.categories = categories + self.ordered = ordered + # XXX: this is just for the repr, will move to base type + self._categorical = Categorical(None, categories=categories, + ordered=ordered) + return self + + def __repr__(self): + return "".format( + self._categorical._repr_categories()) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 43beefffd448e..6083c15e51e95 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -470,6 +470,12 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): + kwargs = kwargs.copy() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + # should we raise if CategoricalType and passed in kwargs? 
+ kwargs.setdefault('categories', categories) + kwargs.setdefault('ordered', ordered) return self.make_block(Categorical(self.values, **kwargs)) # astype processing diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 9a406dfa10c35..b3cea6f6c956e 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -8,7 +8,7 @@ from numpy import nan import numpy as np -from pandas import Series +from pandas import Series, CategoricalType, Categorical from pandas.tseries.index import Timestamp from pandas.tseries.tdi import Timedelta @@ -149,6 +149,12 @@ def test_astype_dict(self): self.assertRaises(KeyError, s.astype, {'abc': str, 'def': str}) self.assertRaises(KeyError, s.astype, {0: str}) + def test_astype_categorical(self): + s = Series(['a', 'b', 'a']) + result = s.astype(CategoricalType(['a', 'b'], ordered=True)) + expected = Series(Categorical(['a', 'b', 'a'], ordered=True)) + assert_series_equal(result, expected) + def test_complexx(self): # GH4819 # complex access for ndarray compat From 9777fcf03024646371a9b8cebfc5429e479d9e5f Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 20 Nov 2016 06:52:36 -0600 Subject: [PATCH 2/3] reuse CategoricalDtype --- doc/source/categorical.rst | 20 +++++++++- pandas/core/api.py | 3 +- pandas/core/categorical.py | 45 --------------------- pandas/tests/series/test_dtypes.py | 15 ++++++- pandas/types/dtypes.py | 64 ++++++++++++++++++++++++++++-- 5 files changed, 94 insertions(+), 53 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 090998570a358..572c2b1dcd8fd 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -96,12 +96,14 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df -You can also specify differently ordered 
categories or make the resulting data +ordered by passing a :class:`CategoricalDtype`: .. ipython:: python s = pd.Series(["a","b","c","a"]) - s_cat = s.astype("category", categories=["b","c","d"], ordered=False) + cat_type = pd.CategoricalDtype(categories=["b", "c", "d"], ordered=False) + s_cat = s.astype(cat_type) s_cat Categorical data has a specific ``category`` :ref:`dtype `: @@ -141,6 +143,20 @@ constructor to save the factorize step during normal constructor mode: splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) + +CategoricalDtype +---------------- + +A categorical's type is fully described by 1.) its categories (an iterable), +and 2.) its orderedness (a boolean). +This information can be stored in a :class:`~pandas.CategoricalDtype` and passed to +any place pandas expects a `dtype`, for example :func:`pandas.read_csv`, +:func:`pandas.DataFrame.astype`, the Series constructor, etc. + +As a convenience, you can use the string `'category'` in place of a +:class:`pandas.CategoricalDtype` when you want the default behavior of +the categories being unordered, and equal to the set of values present in the array. 
+ Description ----------- diff --git a/pandas/core/api.py b/pandas/core/api.py index 56f648c46fbdc..d12287f6ab5a3 100644 --- a/pandas/core/api.py +++ b/pandas/core/api.py @@ -5,8 +5,9 @@ import numpy as np from pandas.core.algorithms import factorize, match, unique, value_counts +from pandas.types.dtypes import CategoricalDtype from pandas.types.missing import isnull, notnull -from pandas.core.categorical import Categorical, CategoricalType +from pandas.core.categorical import Categorical from pandas.core.groupby import Grouper from pandas.formats.format import set_eng_float_format from pandas.core.index import (Index, CategoricalIndex, Int64Index, diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 516cb16192922..fd1a23a5bab7f 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -2066,48 +2066,3 @@ def _factorize_from_iterables(iterables): # For consistency, it should return a list of 2 lists. return [[], []] return map(list, lzip(*[_factorize_from_iterable(it) for it in iterables])) - - -class CategoricalType(CategoricalDtype): - """ - Type for categorical data with the categories and orderedness, - but not the values - - Parameters - ---------- - categories : list or None - ordered : bool, default False - - Notes - ----- - `categories=None` implies infer in whatever operation you're - doing. 
- - Examples - -------- - >>> t = CategoricalType(categories=['b', 'a'], ordered=True) - >>> s = Series(['a', 'a', 'b', 'b', 'a']) - >>> s.astype(t) - 0 a - 1 a - 2 b - 3 b - 4 a - dtype: category - Categories (2, object): [b < a] - """ - dtype = 'category' - name = 'category' - - def __new__(cls, categories=None, ordered=False): - self = object.__new__(cls) - self.categories = categories - self.ordered = ordered - # XXX: this is just for the repr, will move to base type - self._categorical = Categorical(None, categories=categories, - ordered=ordered) - return self - - def __repr__(self): - return "".format( - self._categorical._repr_categories()) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index b3cea6f6c956e..38ec58c974f97 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -8,7 +8,7 @@ from numpy import nan import numpy as np -from pandas import Series, CategoricalType, Categorical +from pandas import Series, CategoricalDtype, Categorical, Index from pandas.tseries.index import Timestamp from pandas.tseries.tdi import Timedelta @@ -151,10 +151,21 @@ def test_astype_dict(self): def test_astype_categorical(self): s = Series(['a', 'b', 'a']) - result = s.astype(CategoricalType(['a', 'b'], ordered=True)) + result = s.astype(CategoricalDtype(['a', 'b'], ordered=True)) expected = Series(Categorical(['a', 'b', 'a'], ordered=True)) assert_series_equal(result, expected) + result = s.astype(CategoricalDtype(['a', 'b'], ordered=False)) + expected = Series(Categorical(['a', 'b', 'a'], ordered=False)) + assert_series_equal(result, expected) + + result = s.astype(CategoricalDtype(['a', 'b', 'c'], ordered=False)) + expected = Series(Categorical(['a', 'b', 'a'], + categories=['a', 'b', 'c'], + ordered=False)) + assert_series_equal(result, expected) + tm.assert_index_equal(result.cat.categories, Index(['a', 'b', 'c'])) + def test_complexx(self): # GH4819 # complex access for ndarray compat diff 
--git a/pandas/types/dtypes.py b/pandas/types/dtypes.py index 5b6d7905d4095..58d0410f45090 100644 --- a/pandas/types/dtypes.py +++ b/pandas/types/dtypes.py @@ -1,6 +1,7 @@ """ define extension dtypes """ import re +import reprlib import numpy as np from pandas import compat @@ -98,11 +99,54 @@ class CategoricalDtypeType(type): class CategoricalDtype(ExtensionDtype): """ + Type for categorical data with the categories and orderedness, + but not the values + + .. versionadded:: 0.20.0 + + Parameters + ---------- + categories : list or None + ordered : bool, default False + + Examples + -------- + >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> s = Series(['a', 'a', 'b', 'b', 'a']) + >>> s.astype(t) + 0 a + 1 a + 2 b + 3 b + 4 a + dtype: category + Categories (2, object): [b < a] + + Notes + ----- + An instance of ``CategoricalDtype`` compares equal with any other + instance of ``CategoricalDtype``, regardless of categories or ordered. + In addition they compare equal to the string ``'category'``. + To check whether two instances of a ``CategoricalDtype`` match, + use the ``is`` operator. + + >>> t1 = CategoricalDtype(['a', 'b'], ordered=True) + >>> t2 = CategoricalDtype(['a', 'c'], ordered=False) + >>> t1 == t2 + True + >>> t1 == 'category' + True + >>> t1 is t2 + False + >>> t1 is CategoricalDtype(['a', 'b'], ordered=True) + True + A np.dtype duck-typed class, suitable for holding a custom categorical dtype. THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object """ + # TODO: Document public vs. 
private API name = 'category' type = CategoricalDtypeType kind = 'O' @@ -110,13 +154,18 @@ class CategoricalDtype(ExtensionDtype): base = np.dtype('O') _cache = {} - def __new__(cls): + def __new__(cls, categories=None, ordered=False): + categories_ = categories if categories is None else tuple(categories) + t = (categories_, ordered) try: - return cls._cache[cls.name] + return cls._cache[t] except KeyError: c = object.__new__(cls) - cls._cache[cls.name] = c + c.categories = categories + c.ordered = ordered + + cls._cache[t] = c return c def __hash__(self): @@ -129,6 +178,15 @@ def __eq__(self, other): return isinstance(other, CategoricalDtype) + # def __unicode__(self): + # tpl = 'CategoricalDtype({!r}, ordered={})' + # return tpl.format(reprlib.repr(self.categories), self.ordered) + + # def __repr__(self): + # """ return the base repr for the categories """ + # tpl = 'CategoricalDtype({!r}, ordered={})' + # return tpl.format(reprlib.repr(self.categories), self.ordered) + @classmethod def construct_from_string(cls, string): """ attempt to construct this type from a string, raise a TypeError if From 2a6a0e140d01366fed30fa7562656e0a74a7f983 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sun, 20 Nov 2016 19:58:13 -0600 Subject: [PATCH 3/3] Series ctor --- pandas/core/series.py | 3 ++- pandas/tests/series/test_constructors.py | 18 ++++++++++++++++++ pandas/types/common.py | 5 ++++- 3 files changed, 24 insertions(+), 2 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 105e39562f561..bba7cd4e326a2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2837,7 +2837,8 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): - subarr = Categorical(arr) + subarr = Categorical(arr, dtype.categories, + ordered=dtype.ordered) elif dtype is not None and raise_cast_failure: raise else: diff --git a/pandas/tests/series/test_constructors.py 
b/pandas/tests/series/test_constructors.py index ed7b0fda19cb7..18c8d8eaaa450 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -162,6 +162,24 @@ def test_constructor_categorical(self): self.assertTrue(is_categorical_dtype(s)) self.assertTrue(is_categorical_dtype(s.dtype)) + def test_constructor_categorical_dtype(self): + result = pd.Series(['a', 'b'], + dtype=pd.CategoricalDtype(['a', 'b', 'c'], + ordered=True)) + self.assertTrue(is_categorical_dtype(result)) + tm.assert_index_equal(result.cat.categories, pd.Index(['a', 'b', 'c'])) + self.assertTrue(result.cat.ordered) + + result = pd.Series(['a', 'b'], dtype=pd.CategoricalDtype(['b', 'a'])) + self.assertTrue(is_categorical_dtype(result)) + tm.assert_index_equal(result.cat.categories, pd.Index(['b', 'a'])) + self.assertFalse(result.cat.ordered) + + result = pd.Series(['a', 'b', 'c'], + dtype=pd.CategoricalDtype(['a', 'b'])) + expected = pd.Series(pd.Categorical(['a', 'b', np.nan])) + tm.assert_series_equal(result, expected) + def test_constructor_maskedarray(self): data = ma.masked_all((3, ), dtype=float) result = Series(data) diff --git a/pandas/types/common.py b/pandas/types/common.py index 754ff80924c07..50b2bc253e11a 100644 --- a/pandas/types/common.py +++ b/pandas/types/common.py @@ -319,7 +319,10 @@ def is_complex_dtype(arr_or_dtype): def _coerce_to_dtype(dtype): """ coerce a string / np.dtype to a dtype """ if is_categorical_dtype(dtype): - dtype = CategoricalDtype() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + # TODO: pass thru categories, ordered + dtype = CategoricalDtype(categories=categories, ordered=ordered) elif is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype):