From 790cd42ab29e683120fc4a2e26f440fa4c6e99d7 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 24 Aug 2017 16:04:02 -0500 Subject: [PATCH 1/7] ENH: Parametrized CategoricalDtype We extended the CategoricalDtype to accept optional categories and ordered arguments. ```python pd.CategoricalDtype(categories=['a', 'b'], ordered=True) ``` CategoricalDtype is now part of the public API. This allows users to specify the desired categories and orderedness of an operation ahead of time. The current behavior, which is still possible with categories=None, the default, is to infer the categories from whatever is present. This change will make it easy to implement support for specifying categories that are known ahead of time in other places, e.g. .astype, .read_csv, and the Series constructor. Closes #14711 Closes #15078 Closes #14676 --- doc/source/advanced.rst | 4 +- doc/source/api.rst | 5 +- doc/source/categorical.rst | 101 +++++- doc/source/merging.rst | 11 +- doc/source/whatsnew/v0.21.0.txt | 26 ++ pandas/core/categorical.py | 317 +++++++++--------- pandas/core/dtypes/common.py | 38 ++- pandas/core/dtypes/dtypes.py | 204 ++++++++++- pandas/core/indexes/base.py | 15 +- pandas/core/indexes/category.py | 54 ++- pandas/core/indexes/interval.py | 3 +- pandas/core/indexes/multi.py | 2 +- pandas/core/indexes/range.py | 2 +- pandas/core/internals.py | 20 +- pandas/core/series.py | 3 +- pandas/core/sorting.py | 3 +- pandas/core/util/hashing.py | 2 +- pandas/tests/dtypes/test_common.py | 10 +- pandas/tests/dtypes/test_dtypes.py | 111 +++++- pandas/tests/frame/test_analytics.py | 3 + pandas/tests/indexes/test_category.py | 10 +- .../tests/io/json/test_json_table_schema.py | 5 +- pandas/tests/io/test_parquet.py | 4 + pandas/tests/io/test_pytables.py | 10 +- pandas/tests/reshape/test_merge.py | 4 +- pandas/tests/series/test_analytics.py | 11 +- pandas/tests/series/test_constructors.py | 21 ++ pandas/tests/series/test_dtypes.py | 34 +- pandas/tests/test_algos.py | 72 ++-- 
pandas/tests/test_categorical.py | 141 +++++++- pandas/util/testing.py | 7 +- 31 files changed, 970 insertions(+), 283 deletions(-) diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 3bda8c7eacb61..799d04859cc2a 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -638,9 +638,11 @@ and allows efficient indexing and storage of an index with a large number of dup .. ipython:: python + from pandas.api.types import CategoricalDtype + df = pd.DataFrame({'A': np.arange(6), 'B': list('aabbca')}) - df['B'] = df['B'].astype('category', categories=list('cab')) + df['B'] = df['B'].astype(CategoricalDtype(list('cab'))) df df.dtypes df.B.cat.categories diff --git a/doc/source/api.rst b/doc/source/api.rst index 6b3e6bedcb24b..b822b7943f1d6 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -646,7 +646,10 @@ strings and apply several methods to it. These can be accessed like Categorical ~~~~~~~~~~~ -If the Series is of dtype ``category``, ``Series.cat`` can be used to change the the categorical +.. autoclass:: api.types.CategoricalDtype + :members: categories, ordered + +If the Series is of dtype ``CategoricalDtype``, ``Series.cat`` can be used to change the categorical data. This accessor is similar to the ``Series.dt`` or ``Series.str`` and has the following usable methods and properties: diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 65361886436d6..55b5d93e94943 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -89,12 +89,22 @@ By passing a :class:`pandas.Categorical` object to a `Series` or assigning it to df["B"] = raw_cat df -You can also specify differently ordered categories or make the resulting data ordered, by passing these arguments to ``astype()``: +Anywhere above we passed a keyword ``dtype='category'``, we used the default behavior of + +1. categories are inferred from the data +2. categories are unordered. 
+ +To control those behaviors, instead of passing ``'category'``, use an instance +of :class:`~pandas.api.types.CategoricalDtype`. .. ipython:: python - s = pd.Series(["a","b","c","a"]) - s_cat = s.astype("category", categories=["b","c","d"], ordered=False) + from pandas.api.types import CategoricalDtype + + s = pd.Series(["a", "b", "c", "a"]) + cat_type = CategoricalDtype(categories=["b", "c", "d"], + ordered=True) + s_cat = s.astype(cat_type) s_cat Categorical data has a specific ``category`` :ref:`dtype `: @@ -133,6 +143,73 @@ constructor to save the factorize step during normal constructor mode: splitter = np.random.choice([0,1], 5, p=[0.5,0.5]) s = pd.Series(pd.Categorical.from_codes(splitter, categories=["train", "test"])) +.. _categorical.categoricaldtype: + +CategoricalDtype +---------------- + +.. versionchanged:: 0.21.0 + +A categorical's type is fully described by + +1. ``categories``: a sequence of unique values and no missing values +2. ``ordered``: a boolean + +This information can be stored in a :class:`~pandas.api.types.CategoricalDtype`. +The ``categories`` argument is optional, which implies that the actual categories +should be inferred from whatever is present in the data when the +:class:`pandas.Categorical` is created. The categories are assumed to be unordered +by default. + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + CategoricalDtype(['a', 'b', 'c']) + CategoricalDtype(['a', 'b', 'c'], ordered=True) + CategoricalDtype() + +A :class:`~pandas.api.types.CategoricalDtype` can be used in any place pandas +expects a `dtype`. For example :func:`pandas.read_csv`, +:func:`pandas.DataFrame.astype`, or in the Series constructor. + +.. note:: + + As a convenience, you can use the string ``'category'`` in place of a + :class:`~pandas.api.types.CategoricalDtype` when you want the default behavior of + the categories being unordered, and equal to the set of values present in the + array. 
In other words, ``dtype='category'`` is equivalent to + ``dtype=CategoricalDtype()``. + +Equality Semantics +~~~~~~~~~~~~~~~~~~ + +Two instances of :class:`~pandas.api.types.CategoricalDtype` compare equal +whenever they have the same categories and orderedness. When comparing two +unordered categoricals, the order of the ``categories`` is not considered. + +.. ipython:: python + + c1 = CategoricalDtype(['a', 'b', 'c'], ordered=False) + + # Equal, since order is not considered when ordered=False + c1 == CategoricalDtype(['b', 'c', 'a'], ordered=False) + + # Unequal, since the second CategoricalDtype is ordered + c1 == CategoricalDtype(['a', 'b', 'c'], ordered=True) + +All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. + +.. ipython:: python + + c1 == 'category' + +.. warning:: + + Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``, + and since all instances of ``CategoricalDtype`` compare equal to ``'category'``, + all instances of ``CategoricalDtype`` compare equal to a ``CategoricalDtype(None)``. Description ----------- @@ -184,7 +261,7 @@ It's also possible to pass in the categories in a specific order: .. ipython:: python - s = pd.Series(list('babc')).astype('category', categories=list('abcd')) + s = pd.Series(list('babc')).astype(CategoricalDtype(list('abcd'))) s # categories @@ -297,7 +374,9 @@ meaning and certain operations are possible. If the categorical is unordered, `` s = pd.Series(pd.Categorical(["a","b","c","a"], ordered=False)) s.sort_values(inplace=True) - s = pd.Series(["a","b","c","a"]).astype('category', ordered=True) + s = pd.Series(["a","b","c","a"]).astype( + CategoricalDtype(ordered=True) + ) s.sort_values(inplace=True) s s.min(), s.max() @@ -397,9 +476,15 @@ categories or a categorical with any list-like object, will raise a TypeError. .. 
ipython:: python - cat = pd.Series([1,2,3]).astype("category", categories=[3,2,1], ordered=True) - cat_base = pd.Series([2,2,2]).astype("category", categories=[3,2,1], ordered=True) - cat_base2 = pd.Series([2,2,2]).astype("category", ordered=True) + cat = pd.Series([1,2,3]).astype( + CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base = pd.Series([2,2,2]).astype( + CategoricalDtype([3, 2, 1], ordered=True) + ) + cat_base2 = pd.Series([2,2,2]).astype( + CategoricalDtype(ordered=True) + ) cat cat_base diff --git a/doc/source/merging.rst b/doc/source/merging.rst index 72787ea97a782..ad40c75a62722 100644 --- a/doc/source/merging.rst +++ b/doc/source/merging.rst @@ -830,8 +830,10 @@ The left frame. .. ipython:: python + from pandas.api.types import CategoricalDtype + X = pd.Series(np.random.choice(['foo', 'bar'], size=(10,))) - X = X.astype('category', categories=['foo', 'bar']) + X = X.astype(CategoricalDtype(categories=['foo', 'bar'])) left = pd.DataFrame({'X': X, 'Y': np.random.choice(['one', 'two', 'three'], size=(10,))}) @@ -842,8 +844,11 @@ The right frame. .. ipython:: python - right = pd.DataFrame({'X': pd.Series(['foo', 'bar']).astype('category', categories=['foo', 'bar']), - 'Z': [1, 2]}) + right = pd.DataFrame({ + 'X': pd.Series(['foo', 'bar'], + dtype=CategoricalDtype(['foo', 'bar'])), + 'Z': [1, 2] + }) right right.dtypes diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 5a353544a4283..e73572c296eac 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -10,6 +10,8 @@ users upgrade to this version. Highlights include: - Integration with `Apache Parquet `__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here `. +- New user-facing :class:`pandas.api.types.CategoricalDtype` for specifying + categoricals independent of the data, see :ref:`here <whatsnew_0210.enhancements.categorical_dtype>`. Check the :ref:`API Changes ` and :ref:`deprecations ` before updating. 
@@ -89,6 +91,30 @@ This does not raise any obvious exceptions, but also does not create a new colum Setting a list-like data structure into a new attribute now raise a ``UserWarning`` about the potential for unexpected behavior. See :ref:`Attribute Access `. +.. _whatsnew_0210.enhancements.categorical_dtype: + +``CategoricalDtype`` for specifying categoricals +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:class:`pandas.api.types.CategoricalDtype` has been added to the public API and +expanded to include the ``categories`` and ``ordered`` attributes. A +``CategoricalDtype`` can be used to specify the set of categories and +orderedness of an array, independent of the data themselves. This can be useful, +e.g., when converting string data to a ``Categorical`` (:issue:`14711`, :issue:`15078`): + +.. ipython:: python + + from pandas.api.types import CategoricalDtype + + s = pd.Series(['a', 'b', 'c', 'a']) # strings + dtype = CategoricalDtype(categories=['a', 'b', 'c', 'd'], ordered=True) + s.astype(dtype) + +The ``.dtype`` property of a ``Categorical``, ``CategoricalIndex`` or a +``Series`` with categorical type will now return an instance of ``CategoricalDtype``. + +See the :ref:`CategoricalDtype docs <categorical.categoricaldtype>` for more. + .. _whatsnew_0210.enhancements.other: Other Enhancements diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index ddca93f07ad5e..7e92255ef0419 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -23,7 +23,7 @@ is_datetimelike, is_categorical, is_categorical_dtype, - is_integer_dtype, is_bool, + is_integer_dtype, is_list_like, is_sequence, is_scalar) from pandas.core.common import is_null_slice, _maybe_box_datetimelike @@ -139,33 +139,6 @@ def maybe_to_categorical(array): setter to change values in the categorical. """ -_categories_doc = """The categories of this categorical. - -Setting assigns new values to each category (effectively a rename of -each individual category). 
- -The assigned value has to be a list-like object. All items must be unique and -the number of items in the new categories must be the same as the number of -items in the old categories. - -Assigning to `categories` is a inplace operation! - -Raises ------- -ValueError - If the new categories do not validate as categories or if the number of new - categories is unequal the number of old categories - -See also --------- -rename_categories -reorder_categories -add_categories -remove_categories -remove_unused_categories -set_categories -""" - class Categorical(PandasObject): """ @@ -192,6 +165,10 @@ class Categorical(PandasObject): ordered : boolean, (default False) Whether or not this categorical is treated as a ordered categorical. If not given, the resulting categorical will not be ordered. + dtype : CategoricalDtype + An instance of ``CategoricalDtype`` to use for this categorical + + .. versionadded:: 0.21.0 Attributes ---------- @@ -202,6 +179,11 @@ class Categorical(PandasObject): categorical, read only. ordered : boolean Whether or not this Categorical is ordered. + dtype : CategoricalDtype + The instance of ``CategoricalDtype`` storing the ``categories`` + and ``ordered``. + + .. versionadded:: 0.21.0 Raises ------ @@ -211,7 +193,6 @@ class Categorical(PandasObject): If an explicit ``ordered=True`` is given but no `categories` and the `values` are not sortable. - Examples -------- >>> from pandas import Categorical @@ -223,17 +204,17 @@ class Categorical(PandasObject): [a, b, c, a, b, c] Categories (3, object): [a < b < c] + Only ordered `Categoricals` can be sorted (according to the order + of the categories) and have a min and max value. + >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a'], ordered=True) >>> a.min() 'c' - """ - dtype = CategoricalDtype() - """The dtype (always "category")""" - """Whether or not this Categorical is ordered. 
- Only ordered `Categoricals` can be sorted (according to the order - of the categories) and have a min and max value. + Notes + ----- + See the :ref:`user guide ` for more. See also -------- @@ -246,24 +227,39 @@ class Categorical(PandasObject): # For comparisons, so that numpy uses our implementation if the compare # ops, which raise __array_priority__ = 1000 + _dtype = CategoricalDtype() _typ = 'categorical' - def __init__(self, values, categories=None, ordered=False, fastpath=False): + def __init__(self, values, categories=None, ordered=None, dtype=None, + fastpath=False): - self._validate_ordered(ordered) + if dtype is not None: + if isinstance(dtype, compat.string_types): + if dtype == 'category': + dtype = CategoricalDtype(categories, ordered) + else: + raise ValueError("Unknown `dtype` {}".format(dtype)) + elif categories is not None or ordered is not None: + raise ValueError("Cannot specify both `dtype` and `categories`" + " or `ordered`.") + + categories = dtype.categories + ordered = dtype.ordered + + if ordered is None: + ordered = False if fastpath: - # fast path + if dtype is None: + dtype = CategoricalDtype(categories, ordered) self._codes = coerce_indexer_dtype(values, categories) - self._categories = self._validate_categories( - categories, fastpath=isinstance(categories, ABCIndexClass)) - self._ordered = ordered + self._dtype = dtype return # sanitize input if is_categorical_dtype(values): - # we are either a Series or a CategoricalIndex + # we are either a Series, CategoricalIndex if isinstance(values, (ABCSeries, ABCCategoricalIndex)): values = values._values @@ -313,7 +309,8 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): raise NotImplementedError("> 1 ndim Categorical are not " "supported at this time") - categories = self._validate_categories(categories) + if dtype is None or isinstance(dtype, str): + dtype = CategoricalDtype(categories, ordered) else: # there were two ways if categories are present @@ -325,12 
+322,15 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): # make sure that we always have the same type here, no matter what # we get passed in - categories = self._validate_categories(categories) - codes = _get_codes_for_values(values, categories) + if dtype is None or isinstance(dtype, str): + dtype = CategoricalDtype(categories, ordered) + + codes = _get_codes_for_values(values, dtype.categories) # TODO: check for old style usage. These warnings should be removes # after 0.18/ in 2016 - if is_integer_dtype(values) and not is_integer_dtype(categories): + if (is_integer_dtype(values) and + not is_integer_dtype(dtype.categories)): warn("Values and categories have different dtypes. Did you " "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) @@ -341,9 +341,57 @@ def __init__(self, values, categories=None, ordered=False, fastpath=False): "mean to use\n'Categorical.from_codes(codes, " "categories)'?", RuntimeWarning, stacklevel=2) - self.set_ordered(ordered or False, inplace=True) - self._categories = categories - self._codes = coerce_indexer_dtype(codes, categories) + self._dtype = dtype + self._codes = coerce_indexer_dtype(codes, dtype.categories) + + @property + def categories(self): + """The categories of this categorical. + + Setting assigns new values to each category (effectively a rename of + each individual category). + + The assigned value has to be a list-like object. All items must be + unique and the number of items in the new categories must be the same + as the number of items in the old categories. + + Assigning to `categories` is a inplace operation! 
+ + Raises + ------ + ValueError + If the new categories do not validate as categories or if the + number of new categories is unequal the number of old categories + + See also + -------- + rename_categories + reorder_categories + add_categories + remove_categories + remove_unused_categories + set_categories + """ + return self.dtype.categories + + @categories.setter + def categories(self, categories): + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (self.dtype.categories is not None and + len(self.dtype.categories) != len(new_dtype.categories)): + raise ValueError("new categories need to have the same number of " + "items as the old categories!") + self._dtype = new_dtype + + @property + def ordered(self): + """Whether the categories have an ordered relationship""" + return self.dtype.ordered + + @property + def dtype(self): + """The :ref:`~pandas.api.types.CategoricalDtype` for this instance""" + return self._dtype def __dir__(self): # Avoid IPython warnings for deprecated properties @@ -492,7 +540,7 @@ def from_codes(cls, codes, categories, ordered=False): raise ValueError( "codes need to be convertible to an arrays of integers") - categories = cls._validate_categories(categories) + categories = CategoricalDtype._validate_categories(categories) if len(codes) and (codes.max() >= len(categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " @@ -535,69 +583,6 @@ def _get_labels(self): labels = property(fget=_get_labels, fset=_set_codes) - _categories = None - - @classmethod - def _validate_ordered(cls, ordered): - """ - Validates that we have a valid ordered parameter. If - it is not a boolean, a TypeError will be raised. - - Parameters - ---------- - ordered : object - The parameter to be verified. - - Raises - ------ - TypeError - If 'ordered' is not a boolean. 
- """ - if not is_bool(ordered): - raise TypeError("'ordered' must either be 'True' or 'False'") - - @classmethod - def _validate_categories(cls, categories, fastpath=False): - """ - Validates that we have good categories - - Parameters - ---------- - fastpath : boolean (default: False) - Don't perform validation of the categories for uniqueness or nulls - - """ - if not isinstance(categories, ABCIndexClass): - dtype = None - if not hasattr(categories, "dtype"): - if not is_list_like(categories): - raise TypeError("`categories` must be list-like. " - "Got {} instead".format(repr(categories))) - categories = _convert_to_list_like(categories) - # On categories with NaNs, int values would be converted to - # float. Use "object" dtype to prevent this. - if isna(categories).any(): - without_na = np.array([x for x in categories - if notna(x)]) - with_na = np.array(categories) - if with_na.dtype != without_na.dtype: - dtype = "object" - - from pandas import Index - categories = Index(categories, dtype=dtype) - - if not fastpath: - - # Categories cannot contain NaN. - if categories.hasnans: - raise ValueError('Categorial categories cannot be null') - - # Categories must be unique. 
- if not categories.is_unique: - raise ValueError('Categorical categories must be unique') - - return categories - def _set_categories(self, categories, fastpath=False): """ Sets new categories @@ -608,21 +593,17 @@ def _set_categories(self, categories, fastpath=False): """ - categories = self._validate_categories(categories, fastpath=fastpath) - if (not fastpath and self._categories is not None and - len(categories) != len(self._categories)): + if fastpath: + new_dtype = CategoricalDtype._from_fastpath(categories, + self.ordered) + else: + new_dtype = CategoricalDtype(categories, ordered=self.ordered) + if (not fastpath and self.dtype.categories is not None and + len(new_dtype.categories) != len(self.dtype.categories)): raise ValueError("new categories need to have the same number of " "items than the old categories!") - self._categories = categories - - def _get_categories(self): - """ Gets the categories """ - # categories is an Index, which is immutable -> no need to copy - return self._categories - - categories = property(fget=_get_categories, fset=_set_categories, - doc=_categories_doc) + self._dtype = new_dtype def _codes_for_groupby(self, sort): """ @@ -664,7 +645,21 @@ def _codes_for_groupby(self, sort): return self.reorder_categories(cat.categories) - _ordered = None + def _set_dtype(self, dtype): + """Internal method for directly updating the CategoricalDtype + + Parameters + ---------- + dtype : CategoricalDtype + + Notes + ----- + We don't do any validation here. It's assumed that the dtype is + a (valid) instance of `CategoricalDtype`. 
+ """ + codes = _recode_for_categories(self.codes, self.categories, + dtype.categories) + return type(self)(codes, dtype=dtype, fastpath=True) def set_ordered(self, value, inplace=False): """ @@ -679,9 +674,9 @@ def set_ordered(self, value, inplace=False): of this categorical with ordered set to the value """ inplace = validate_bool_kwarg(inplace, 'inplace') - self._validate_ordered(value) + new_dtype = CategoricalDtype(self.categories, ordered=value) cat = self if inplace else self.copy() - cat._ordered = value + cat._dtype = new_dtype if not inplace: return cat @@ -711,12 +706,6 @@ def as_unordered(self, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') return self.set_ordered(False, inplace=inplace) - def _get_ordered(self): - """ Gets the ordered attribute """ - return self._ordered - - ordered = property(fget=_get_ordered) - def set_categories(self, new_categories, ordered=None, rename=False, inplace=False): """ Sets the categories to the specified new_categories. @@ -769,22 +758,21 @@ def set_categories(self, new_categories, ordered=None, rename=False, remove_unused_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - new_categories = self._validate_categories(new_categories) + if ordered is None: + ordered = self.dtype.ordered + new_dtype = CategoricalDtype(new_categories, ordered=ordered) + cat = self if inplace else self.copy() if rename: - if (cat._categories is not None and - len(new_categories) < len(cat._categories)): + if (cat.dtype.categories is not None and + len(new_dtype.categories) < len(cat.dtype.categories)): # remove all _codes which are larger and set to -1/NaN - self._codes[self._codes >= len(new_categories)] = -1 + self._codes[self._codes >= len(new_dtype.categories)] = -1 else: codes = _recode_for_categories(self.codes, self.categories, - new_categories) + new_dtype.categories) cat._codes = codes - cat._categories = new_categories - - if ordered is None: - ordered = self.ordered - cat.set_ordered(ordered, 
inplace=True) + cat._dtype = new_dtype if not inplace: return cat @@ -864,7 +852,7 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False): set_categories """ inplace = validate_bool_kwarg(inplace, 'inplace') - if set(self._categories) != set(new_categories): + if set(self.dtype.categories) != set(new_categories): raise ValueError("items in new_categories are not the same as in " "old categories") return self.set_categories(new_categories, ordered=ordered, @@ -905,15 +893,17 @@ def add_categories(self, new_categories, inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') if not is_list_like(new_categories): new_categories = [new_categories] - already_included = set(new_categories) & set(self._categories) + already_included = set(new_categories) & set(self.dtype.categories) if len(already_included) != 0: msg = ("new categories must not include old categories: %s" % str(already_included)) raise ValueError(msg) - new_categories = list(self._categories) + list(new_categories) + new_categories = list(self.dtype.categories) + list(new_categories) + new_dtype = CategoricalDtype(new_categories, self.ordered) + cat = self if inplace else self.copy() - cat._categories = self._validate_categories(new_categories) - cat._codes = coerce_indexer_dtype(cat._codes, new_categories) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(cat._codes, new_dtype.categories) if not inplace: return cat @@ -953,8 +943,9 @@ def remove_categories(self, removals, inplace=False): removals = [removals] removal_set = set(list(removals)) - not_included = removal_set - set(self._categories) - new_categories = [c for c in self._categories if c not in removal_set] + not_included = removal_set - set(self.dtype.categories) + new_categories = [c for c in self.dtype.categories + if c not in removal_set] # GH 10156 if any(isna(removals)): @@ -996,8 +987,11 @@ def remove_unused_categories(self, inplace=False): if idx.size != 0 and idx[0] == -1: # na sentinel idx, inv 
= idx[1:], inv - 1 - cat._categories = cat.categories.take(idx) - cat._codes = coerce_indexer_dtype(inv, self._categories) + new_categories = cat.dtype.categories.take(idx) + new_dtype = CategoricalDtype._from_fastpath(new_categories, + ordered=self.ordered) + cat._dtype = new_dtype + cat._codes = coerce_indexer_dtype(inv, new_dtype.categories) if not inplace: return cat @@ -1098,7 +1092,7 @@ def __setstate__(self, state): # Provide compatibility with pre-0.15.0 Categoricals. if '_categories' not in state and '_levels' in state: - state['_categories'] = self._validate_categories(state.pop( + state['_categories'] = self.dtype._validate_categories(state.pop( '_levels')) if '_codes' not in state and 'labels' in state: state['_codes'] = coerce_indexer_dtype( @@ -1113,6 +1107,11 @@ def __setstate__(self, state): else: state['_ordered'] = False + # 0.21.0 CategoricalDtype change + if '_dtype' not in state: + state['_dtype'] = CategoricalDtype(state['_categories'], + state['_ordered']) + for k, v in compat.iteritems(state): setattr(self, k, v) @@ -1122,7 +1121,7 @@ def T(self): @property def nbytes(self): - return self._codes.nbytes + self._categories.values.nbytes + return self._codes.nbytes + self.dtype.categories.values.nbytes def memory_usage(self, deep=False): """ @@ -1147,7 +1146,8 @@ def memory_usage(self, deep=False): -------- numpy.ndarray.nbytes """ - return self._codes.nbytes + self._categories.memory_usage(deep=deep) + return self._codes.nbytes + self.dtype.categories.memory_usage( + deep=deep) @Substitution(klass='Categorical') @Appender(_shared_docs['searchsorted']) @@ -1278,7 +1278,7 @@ def value_counts(self, dropna=True): count = bincount(np.where(mask, code, ncat)) ix = np.append(ix, -1) - ix = self._constructor(ix, categories=cat, ordered=obj.ordered, + ix = self._constructor(ix, dtype=self.dtype, fastpath=True) return Series(count, index=CategoricalIndex(ix), dtype='int64') @@ -1991,8 +1991,7 @@ def is_dtype_equal(self, other): """ try: - return 
(self.categories.equals(other.categories) and - self.ordered == other.ordered) + return hash(self.dtype) == hash(other.dtype) except (AttributeError, TypeError): return False diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index c47e61dc446be..f60c0d5ffdca0 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -692,6 +692,40 @@ def is_dtype_equal(source, target): return False +def is_dtype_union_equal(source, target): + """ + Check whether two arrays have compatible dtypes to do a union. + numpy types are checked with ``is_dtype_equal``. Extension types are + checked separately. + + Parameters + ---------- + source : The first dtype to compare + target : The second dtype to compare + + Returns + ---------- + boolean : Whether or not the two dtypes are equal. + + >>> is_dtype_equal("int", int) + True + + >>> is_dtype_equal(CategoricalDtype(['a', 'b'], + ... CategoricalDtype(['b', 'c'])) + True + + >>> is_dtype_equal(CategoricalDtype(['a', 'b'], + ... CategoricalDtype(['b', 'c'], ordered=True)) + False + """ + source = _get_dtype(source) + target = _get_dtype(target) + if is_categorical_dtype(source) and is_categorical_dtype(target): + # ordered False for both + return source.ordered is target.ordered + return is_dtype_equal(source, target) + + def is_any_int_dtype(arr_or_dtype): """ DEPRECATED: This function will be removed in a future version. 
@@ -1671,7 +1705,9 @@ def _coerce_to_dtype(dtype): """ if is_categorical_dtype(dtype): - dtype = CategoricalDtype() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + dtype = CategoricalDtype(categories=categories, ordered=ordered) elif is_datetime64tz_dtype(dtype): dtype = DatetimeTZDtype(dtype) elif is_period_dtype(dtype): diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index dc2c56ea476f9..8be7870be67f2 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,6 +3,7 @@ import re import numpy as np from pandas import compat +from pandas.core.dtypes.generic import ABCIndexClass class ExtensionDtype(object): @@ -110,37 +111,148 @@ class CategoricalDtypeType(type): class CategoricalDtype(ExtensionDtype): """ - A np.dtype duck-typed class, suitable for holding a custom categorical - dtype. - - THIS IS NOT A REAL NUMPY DTYPE, but essentially a sub-class of np.object + Type for categorical data with the categories and orderedness + + .. versionchanged:: 0.21.0 + + Parameters + ---------- + categories : sequence, optional + Must be unique, and must not contain any nulls. + ordered : bool, default False + + Notes + ----- + This class is useful for specifying the type of a ``Categorical`` + independent of the values. See :ref:`categorical.categoricaldtype` + for more. + + Examples + -------- + >>> t = CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) + 0 a + 1 b + 2 a + 3 NaN + dtype: category + Categories (2, object): [b < a] + + See Also + -------- + Categorical """ + # TODO: Document public vs. 
private API name = 'category' type = CategoricalDtypeType kind = 'O' str = '|O08' base = np.dtype('O') - _metadata = [] + _metadata = ['categories', 'ordered'] _cache = {} - def __new__(cls): + def __init__(self, categories=None, ordered=False): + self._finalize(categories, ordered, fastpath=False) - try: - return cls._cache[cls.name] - except KeyError: - c = object.__new__(cls) - cls._cache[cls.name] = c - return c + @classmethod + def _from_fastpath(cls, categories=None, ordered=False): + self = cls.__new__(cls) + self._finalize(categories, ordered, fastpath=True) + return self + + def _finalize(self, categories, ordered, fastpath=False): + from pandas.core.indexes.base import Index + + if categories is not None: + categories = Index(categories, tupleize_cols=False) + # validation + self._validate_categories(categories) + self._validate_ordered(ordered) + self._categories = categories + self._ordered = ordered + + def __setstate__(self, state): + self._categories = state.pop('categories', None) + self._ordered = state.pop('ordered', False) def __hash__(self): - # make myself hashable - return hash(str(self)) + # _hash_categories returns a uint64, so use the negative + # space for when we have unknown categories to avoid a conflict + if self.categories is None: + if self.ordered: + return -1 + else: + return -2 + # We *do* want to include the real self.ordered here + return int(self._hash_categories(self.categories, self.ordered)) def __eq__(self, other): if isinstance(other, compat.string_types): return other == self.name - return isinstance(other, CategoricalDtype) + if not (hasattr(other, 'ordered') and hasattr(other, 'categories')): + return False + elif self.categories is None or other.categories is None: + # We're forced into a suboptimal corner thanks to math and + # backwards compatibility. We require that `CDT(...) == 'category'` + # for all CDTs **including** `CDT(None, ...)`. Therefore, *all* + # CDT(., .) = CDT(None, False) and *all* + # CDT(., .) 
= CDT(None, True). + return True + elif self.ordered: + return other.ordered and self.categories.equals(other.categories) + elif other.ordered: + return False + else: + # both unordered; this could probably be optimized / cached + return hash(self) == hash(other) + + def __unicode__(self): + tpl = u'CategoricalDtype(categories={}ordered={})' + if self.categories is None: + data = u"None, " + else: + data = self.categories._format_data(name=self.__class__.__name__) + return tpl.format(data, self.ordered) + + @staticmethod + def _hash_categories(categories, ordered=True): + from pandas.core.util.hashing import ( + hash_array, _combine_hash_arrays, hash_tuples + ) + + if len(categories) and isinstance(categories[0], tuple): + # assumes if any individual category is a tuple, then all our. ATM + # I don't really want to support just some of the categories being + # tuples. + categories = list(categories) # breaks if a np.array of categories + cat_array = hash_tuples(categories) + else: + if categories.dtype == 'O': + types = [type(x) for x in categories] + if not len(set(types)) == 1: + # TODO: hash_array doesn't handle mixed types. It casts + # everything to a str first, which means we treat + # {'1', '2'} the same as {'1', 2} + # find a better solution + cat_array = np.array([hash(x) for x in categories]) + hashed = hash((tuple(categories), ordered)) + return hashed + cat_array = hash_array(np.asarray(categories), categorize=False) + if ordered: + cat_array = np.vstack([ + cat_array, np.arange(len(cat_array), dtype=cat_array.dtype) + ]) + else: + cat_array = [cat_array] + hashed = _combine_hash_arrays(iter(cat_array), + num_items=len(cat_array)) + if len(hashed) == 0: + # bug in Numpy<1.12 for length 0 arrays. 
Just return the correct + # value of 0 + return 0 + else: + return np.bitwise_xor.reduce(hashed) @classmethod def construct_from_string(cls, string): @@ -154,6 +266,68 @@ def construct_from_string(cls, string): raise TypeError("cannot construct a CategoricalDtype") + @staticmethod + def _validate_ordered(ordered): + """ + Validates that we have a valid ordered parameter. If + it is not a boolean, a TypeError will be raised. + + Parameters + ---------- + ordered : object + The parameter to be verified. + + Raises + ------ + TypeError + If 'ordered' is not a boolean. + """ + from pandas.core.dtypes.common import is_bool + if not is_bool(ordered): + raise TypeError("'ordered' must either be 'True' or 'False'") + + @staticmethod + def _validate_categories(categories, fastpath=False): + """ + Validates that we have good categories + + Parameters + ---------- + categories : array-like + fastpath : bool + Whether to skip nan and uniqueness checks + + Returns + ------- + categories : Index + """ + from pandas import Index + + if not isinstance(categories, ABCIndexClass): + categories = Index(categories) + + if not fastpath: + + if categories.hasnans: + raise ValueError('Categorial categories cannot be null') + + if not categories.is_unique: + raise ValueError('Categorical categories must be unique') + + return categories + + @property + def categories(self): + """ + An ``Index`` containing the unique categories allowed. 
+ """ + return self._categories + + @property + def ordered(self): + """Whether the categories have an ordered relationship""" + return self._ordered + class DatetimeTZDtypeType(type): """ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 008828cf4f309..3ce7403f8d726 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -27,6 +27,7 @@ is_integer, is_float, is_dtype_equal, + is_dtype_union_equal, is_object_dtype, is_categorical_dtype, is_interval_dtype, @@ -847,7 +848,7 @@ def _formatter_func(self): """ return default_pprint - def _format_data(self): + def _format_data(self, name=None): """ Return the formatted data as a unicode string """ @@ -856,9 +857,11 @@ def _format_data(self): display_width, _ = get_console_size() if display_width is None: display_width = get_option('display.width') or 80 + if name is None: + name = self.__class__.__name__ - space1 = "\n%s" % (' ' * (len(self.__class__.__name__) + 1)) - space2 = "\n%s" % (' ' * (len(self.__class__.__name__) + 2)) + space1 = "\n%s" % (' ' * (len(name) + 1)) + space2 = "\n%s" % (' ' * (len(name) + 2)) n = len(self) sep = ',' @@ -2170,7 +2173,11 @@ def union(self, other): if len(self) == 0: return other._get_consensus_name(self) - if not is_dtype_equal(self.dtype, other.dtype): + # TODO: is_dtype_union_equal is a hack around + # 1. buggy set ops with duplicates (GH #13432) + # 2. 
CategoricalIndex lacking setops (GH #10186) + # Once those are fixed, this workaround can be removed + if not is_dtype_union_equal(self.dtype, other.dtype): this = self.astype('O') other = other.astype('O') return this.union(other) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index ef1dc4d971f37..5464bc10b18e5 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -58,16 +58,18 @@ def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=False, **kwargs): if fastpath: - return cls._simple_new(data, name=name) + return cls._simple_new(data, name=name, dtype=dtype) if name is None and hasattr(data, 'name'): name = data.name if isinstance(data, ABCCategorical): - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) elif isinstance(data, CategoricalIndex): data = data._data - data = cls._create_categorical(cls, data, categories, ordered) + data = cls._create_categorical(cls, data, categories, ordered, + dtype) else: # don't allow scalars @@ -114,7 +116,8 @@ def _create_from_codes(self, codes, categories=None, ordered=None, return CategoricalIndex(cat, name=name) @staticmethod - def _create_categorical(self, data, categories=None, ordered=None): + def _create_categorical(self, data, categories=None, ordered=None, + dtype=None): """ *this is an internal non-public method* @@ -125,6 +128,7 @@ def _create_categorical(self, data, categories=None, ordered=None): data : data for new Categorical categories : optional categories, defaults to existing ordered : optional ordered attribute, defaults to existing + dtype : CategoricalDtype, defaults to existing Returns ------- @@ -135,22 +139,30 @@ def _create_categorical(self, data, categories=None, ordered=None): data = data.values if not isinstance(data, ABCCategorical): - ordered = False if ordered is None else ordered + if ordered is 
None and dtype is None: + ordered = False from pandas.core.categorical import Categorical - data = Categorical(data, categories=categories, ordered=ordered) + data = Categorical(data, categories=categories, ordered=ordered, + dtype=dtype) else: + from pandas.core.dtypes.dtypes import CategoricalDtype + if categories is not None: - data = data.set_categories(categories) - if ordered is not None: + data = data.set_categories(categories, ordered=ordered) + elif ordered is not None and ordered != data.ordered: data = data.set_ordered(ordered) + if isinstance(dtype, CategoricalDtype): + # we want to silently ignore dtype='category' + data = data._set_dtype(dtype) return data @classmethod def _simple_new(cls, values, name=None, categories=None, ordered=None, - **kwargs): + dtype=None, **kwargs): result = object.__new__(cls) - values = cls._create_categorical(cls, values, categories, ordered) + values = cls._create_categorical(cls, values, categories, ordered, + dtype=dtype) result._data = values result.name = name for k, v in compat.iteritems(kwargs): @@ -161,16 +173,28 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, categories=None, ordered=None, - **kwargs): + dtype=None, **kwargs): # categories and ordered can't be part of attributes, # as these are properties + # we want to reuse self.dtype if possible, i.e. neither are + # overridden. 
+ if dtype is not None and (categories is not None or + ordered is not None): + raise TypeError("Cannot specify both `dtype` and `categories` " + "or `ordered`") + + if categories is None and ordered is None: + dtype = self.dtype if dtype is None else dtype + return super(CategoricalIndex, self)._shallow_copy( + values=values, dtype=dtype, **kwargs) if categories is None: categories = self.categories if ordered is None: ordered = self.ordered - return super(CategoricalIndex, - self)._shallow_copy(values=values, categories=categories, - ordered=ordered, **kwargs) + + return super(CategoricalIndex, self)._shallow_copy( + values=values, categories=categories, + ordered=ordered, **kwargs) def _is_dtype_compat(self, other): """ @@ -236,7 +260,7 @@ def _format_attrs(self): ('ordered', self.ordered)] if self.name is not None: attrs.append(('name', ibase.default_pprint(self.name))) - attrs.append(('dtype', "'%s'" % self.dtype)) + attrs.append(('dtype', "'%s'" % self.dtype.name)) max_seq_items = get_option('display.max_seq_items') or len(self) if len(self) > max_seq_items: attrs.append(('length', len(self))) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index c0a9c139722f5..c36ef020faf31 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -950,9 +950,10 @@ def _format_native_types(self, na_rep='', quoting=None, **kwargs): na_rep=na_rep, justify='all').get_result() - def _format_data(self): + def _format_data(self, name=None): # TODO: integrate with categorical and make generic + # name argument is unused here; just for compat with base / categorical n = len(self) max_seq_items = min((get_option( 'display.max_seq_items') or n) // 10, 10) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ea613a27b6521..9de69c9c3e97c 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -490,7 +490,7 @@ def _format_attrs(self): def _format_space(self): return "\n%s" % (' ' 
* (len(self.__class__.__name__) + 1)) - def _format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index b759abaed4e56..81600f1baa842 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -189,7 +189,7 @@ def _format_attrs(self): attrs.append(('name', ibase.default_pprint(self.name))) return attrs - def _format_data(self): + def _format_data(self, name=None): # we are formatting thru the attributes return None diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 83b382ec0ed72..e510ca87e44aa 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -139,14 +139,14 @@ def is_categorical_astype(self, dtype): validate that we have a astypeable to categorical, returns a boolean if we are a categorical """ - if is_categorical_dtype(dtype): - if dtype == CategoricalDtype(): - return True - + if dtype is Categorical or dtype is CategoricalDtype: # this is a pd.Categorical, but is not # a valid type for astypeing raise TypeError("invalid type {0} for astype".format(dtype)) + elif is_categorical_dtype(dtype): + return True + return False def external_values(self, dtype=None): @@ -548,6 +548,18 @@ def _astype(self, dtype, copy=False, errors='raise', values=None, # may need to convert to categorical # this is only called for non-categoricals if self.is_categorical_astype(dtype): + if (('categories' in kwargs or 'ordered' in kwargs) and + isinstance(dtype, CategoricalDtype)): + raise TypeError("Cannot specify a CategoricalDtype and also " + "`categories` or `ordered`. 
Use " + "`dtype=CategoricalDtype(categories, ordered)`" + " instead.") + kwargs = kwargs.copy() + categories = getattr(dtype, 'categories', None) + ordered = getattr(dtype, 'ordered', False) + + kwargs.setdefault('categories', categories) + kwargs.setdefault('ordered', ordered) return self.make_block(Categorical(self.values, **kwargs)) # astype processing diff --git a/pandas/core/series.py b/pandas/core/series.py index ac11c5f908fdc..bc84bd09f0b44 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2987,7 +2987,8 @@ def _try_cast(arr, take_fast_path): subarr = np.array(subarr, dtype=dtype, copy=copy) except (ValueError, TypeError): if is_categorical_dtype(dtype): - subarr = Categorical(arr) + subarr = Categorical(arr, dtype.categories, + ordered=dtype.ordered) elif dtype is not None and raise_cast_failure: raise else: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 12e8d8aba9177..27252b9616a44 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -2,7 +2,6 @@ import numpy as np from pandas.compat import long, string_types, PY3 -from pandas.core.categorical import Categorical from pandas.core.dtypes.common import ( _ensure_platform_int, _ensure_int64, @@ -183,6 +182,8 @@ def indexer_from_factorized(labels, shape, compress=True): def lexsort_indexer(keys, orders=None, na_position='last'): + from pandas.core.categorical import Categorical + labels = [] shape = [] if isinstance(orders, bool): diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index 07e993d7ef509..0c82773b75c28 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -260,7 +260,7 @@ def hash_array(vals, encoding='utf8', hash_key=None, categorize=True): # For categoricals, we hash the categories, then remap the codes to the # hash values. (This check is above the complex check so that we don't ask - # numpy if categorical is a subdtype of complex, as it will choke. 
+ # numpy if categorical is a subdtype of complex, as it will choke). if is_categorical_dtype(dtype): return _hash_categorical(vals, encoding, hash_key) diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py index 8a36f234484b4..e0be34b14a97d 100644 --- a/pandas/tests/dtypes/test_common.py +++ b/pandas/tests/dtypes/test_common.py @@ -545,10 +545,12 @@ def test_is_complex_dtype(): (pd.Index([1, 2]), np.dtype('int64')), (pd.Index(['a', 'b']), np.dtype(object)), ('category', 'category'), - (pd.Categorical(['a', 'b']).dtype, CategoricalDtype()), - (pd.Categorical(['a', 'b']), CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype()), - (pd.CategoricalIndex(['a', 'b']), CategoricalDtype()), + (pd.Categorical(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.Categorical(['a', 'b']), CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']).dtype, CategoricalDtype(['a', 'b'])), + (pd.CategoricalIndex(['a', 'b']), CategoricalDtype(['a', 'b'])), + (CategoricalDtype(), CategoricalDtype()), + (CategoricalDtype(['a', 'b']), CategoricalDtype()), (pd.DatetimeIndex([1, 2]), np.dtype(' Date: Tue, 19 Sep 2017 16:27:46 -0500 Subject: [PATCH 2/7] update --- pandas/core/categorical.py | 1 + pandas/tests/dtypes/test_dtypes.py | 3 +++ pandas/util/testing.py | 4 +++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 7e92255ef0419..6ae333618c2ab 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -222,6 +222,7 @@ class Categorical(PandasObject): Categorical.order Categorical.min Categorical.max + pandas.api.types.CategoricalDtype """ # For comparisons, so that numpy uses our implementation if the compare diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index ce06f013dc85d..0314723745e05 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -615,6 +615,9 
@@ def test_invalid_raises(self): with tm.assert_raises_regex(TypeError, 'ordered'): CategoricalDtype(['a', 'b'], ordered='foo') + with tm.assert_raises_regex(TypeError, 'collection'): + CategoricalDtype('category') + def test_mixed(self): a = CategoricalDtype(['a', 'b', 1, 2]) b = CategoricalDtype(['a', 'b', '1', '2']) diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 65f095a1406ca..23dba96e74a2e 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -1244,9 +1244,11 @@ def assert_series_equal(left, right, check_dtype=True, obj='{obj}.index'.format(obj=obj)) if check_dtype: + # We want to skip exact dtype checking when `check_categorical` + # is False. We'll still raise if only one is a `Categorical`, + # regardless of `check_categorical` if (is_categorical_dtype(left) and is_categorical_dtype(right) and not check_categorical): - # compat with pandas 0.21.0 CategoricalDtype pass else: assert_attr_equal('dtype', left, right) From 416d1d7b9ce81287a1b7f2802751dca1fdeb5084 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 20 Sep 2017 10:47:09 -0500 Subject: [PATCH 3/7] Consistent CategoricalDtype use in Categorical init Get a valid instance of `CategoricalDtype` as early as possible, and use that throughout. --- pandas/core/categorical.py | 46 ++++++++++++++++++++---------- pandas/core/dtypes/dtypes.py | 13 +++++++++ pandas/tests/dtypes/test_dtypes.py | 27 ++++++++++++++++++ pandas/tests/test_categorical.py | 41 ++++++++++++++++++++++++++ 4 files changed, 112 insertions(+), 15 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 6ae333618c2ab..f5c2d0306a42c 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -234,6 +234,21 @@ class Categorical(PandasObject): def __init__(self, values, categories=None, ordered=None, dtype=None, fastpath=False): + # Ways of specifying the dtype (prioritized ordered) + # 1. dtype is a CategoricalDtype + # a.) 
with known categories, use dtype.categories + # b.) else with Categorical values, use values.dtype + # c.) else, infer from values + # d.) specifying dtype=CategoricalDtype and categories is an error + # 2. dtype is a string 'category' + # a.) use categories, ordered + # b.) use values.dtype + # c.) infer from values + # 3. dtype is None + # a.) use categories, ordered + # b.) use values.dtype + # c.) infer from values + if dtype is not None: if isinstance(dtype, compat.string_types): if dtype == 'category': @@ -247,12 +262,16 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, categories = dtype.categories ordered = dtype.ordered - if ordered is None: - ordered = False + elif is_categorical(values): + dtype = values.dtype._from_categorical_dtype(values.dtype, + categories, ordered) + else: + dtype = CategoricalDtype(categories, ordered) + + # At this point, dtype is always a CategoricalDtype + # if dtype.categories is None, we are inferring if fastpath: - if dtype is None: - dtype = CategoricalDtype(categories, ordered) self._codes = coerce_indexer_dtype(values, categories) self._dtype = dtype return @@ -260,7 +279,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # sanitize input if is_categorical_dtype(values): - # we are either a Series, CategoricalIndex + # we are either a Series or a CategoricalIndex if isinstance(values, (ABCSeries, ABCCategoricalIndex)): values = values._values @@ -271,6 +290,7 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, values = values.get_values() elif isinstance(values, (ABCIndexClass, ABCSeries)): + # we'll do inference later pass else: @@ -288,12 +308,12 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # "object" dtype to prevent this. In the end objects will be # casted to int/... in the category assignment step. 
if len(values) == 0 or isna(values).any(): - dtype = 'object' + sanitize_dtype = 'object' else: - dtype = None - values = _sanitize_array(values, None, dtype=dtype) + sanitize_dtype = None + values = _sanitize_array(values, None, dtype=sanitize_dtype) - if categories is None: + if dtype.categories is None: try: codes, categories = factorize(values, sort=True) except TypeError: @@ -310,7 +330,8 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, raise NotImplementedError("> 1 ndim Categorical are not " "supported at this time") - if dtype is None or isinstance(dtype, str): + if dtype.categories is None: + # we're inferring from values dtype = CategoricalDtype(categories, ordered) else: @@ -321,11 +342,6 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # - the new one, where each value is also in the categories array # (or np.nan) - # make sure that we always have the same type here, no matter what - # we get passed in - if dtype is None or isinstance(dtype, str): - dtype = CategoricalDtype(categories, ordered) - codes = _get_codes_for_values(values, dtype.categories) # TODO: check for old style usage. 
These warnings should be removes diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8be7870be67f2..d2487905caced 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -160,9 +160,22 @@ def _from_fastpath(cls, categories=None, ordered=False): self._finalize(categories, ordered, fastpath=True) return self + @classmethod + def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): + if categories is ordered is None: + return dtype + if categories is None: + categories = dtype.categories + if ordered is None: + ordered = dtype.ordered + return cls(categories, ordered) + def _finalize(self, categories, ordered, fastpath=False): from pandas.core.indexes.base import Index + if ordered is None: + ordered = False + if categories is not None: categories = Index(categories, tupleize_cols=False) # validation diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0314723745e05..be3e5fdc467d3 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -622,3 +622,30 @@ def test_mixed(self): a = CategoricalDtype(['a', 'b', 1, 2]) b = CategoricalDtype(['a', 'b', '1', '2']) assert hash(a) != hash(b) + + def test_from_categorical_dtype_identity(self): + c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) + # Identity test for no changes + c2 = CategoricalDtype._from_categorical_dtype(c1) + assert c2 is c1 + + def test_from_categorical_dtype_categories(self): + c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) + # override categories + result = CategoricalDtype._from_categorical_dtype( + c1, categories=[2, 3]) + assert result == CategoricalDtype([2, 3], ordered=True) + + def test_from_categorical_dtype_ordered(self): + c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) + # override ordered + result = CategoricalDtype._from_categorical_dtype( + c1, ordered=False) + assert result == CategoricalDtype([1, 2, 3], 
ordered=False) + + def test_from_categorical_dtype_both(self): + c1 = Categorical([1, 2], categories=[1, 2, 3], ordered=True) + # override ordered + result = CategoricalDtype._from_categorical_dtype( + c1, categories=[1, 2], ordered=False) + assert result == CategoricalDtype([1, 2], ordered=False) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 94f163028ba74..afd9e2dc2df2f 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -488,6 +488,37 @@ def test_constructor_str_unknown(self): with tm.assert_raises_regex(ValueError, "Unknown `dtype`"): Categorical([1, 2], dtype="foo") + def test_constructor_from_categorical_with_dtype(self): + dtype = CategoricalDtype(['a', 'b', 'c'], ordered=True) + values = Categorical(['a', 'b', 'd']) + result = Categorical(values, dtype=dtype) + # We use dtype.categories, not values.categories + expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'], + ordered=True) + tm.assert_categorical_equal(result, expected) + + def test_constructor_from_categorical_with_unknown_dtype(self): + dtype = CategoricalDtype(None, ordered=True) + values = Categorical(['a', 'b', 'd']) + result = Categorical(values, dtype=dtype) + # We use values.categories, not dtype.categories + expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'd'], + ordered=True) + tm.assert_categorical_equal(result, expected) + + def test_contructor_from_categorical_string(self): + values = Categorical(['a', 'b', 'd']) + # use categories, ordered + result = Categorical(values, categories=['a', 'b', 'c'], ordered=True, + dtype='category') + expected = Categorical(['a', 'b', 'd'], categories=['a', 'b', 'c'], + ordered=True) + tm.assert_categorical_equal(result, expected) + + # No string + result = Categorical(values, categories=['a', 'b', 'c'], ordered=True) + tm.assert_categorical_equal(result, expected) + def test_from_codes(self): # too few categories @@ -932,6 +963,16 @@ def 
test_set_dtype_nans(self): tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype='int8')) + def test_set_categories(self): + cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd']) + result = cat._set_categories(['a', 'b', 'c', 'd', 'e']) + expected = Categorical(['a', 'b', 'c'], categories=list('abcde')) + tm.assert_categorical_equal(result, expected) + + # fastpath + result = cat._set_categories(['a', 'b', 'c', 'd', 'e'], fastpath=True) + tm.assert_categorical_equal(result, expected) + @pytest.mark.parametrize('values, categories, new_categories', [ # No NaNs, same cats, same order (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],), From e6c05a005181042153c746fe0f62ee549529b631 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 22 Sep 2017 17:30:18 -0500 Subject: [PATCH 4/7] PEP8 fixes --- pandas/tests/io/test_parquet.py | 2 -- pandas/tests/test_categorical.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 4382958474c9f..af382a05fee45 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -7,8 +7,6 @@ import numpy as np import pandas as pd -from pandas.compat import PY3 -from distutils.version import LooseVersion from pandas.compat import PY3, is_platform_windows from pandas.io.parquet import (to_parquet, read_parquet, get_engine, PyArrowImpl, FastParquetImpl) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index afd9e2dc2df2f..d3a62cae685ed 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -963,7 +963,7 @@ def test_set_dtype_nans(self): tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype='int8')) - def test_set_categories(self): + def test_set_categories_private(self): cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd']) result = cat._set_categories(['a', 'b', 'c', 'd', 'e']) expected = Categorical(['a', 'b', 
'c'], categories=list('abcde')) From 41172ce0969a9306f225bc09142db07a0097037e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 23 Sep 2017 06:08:35 -0500 Subject: [PATCH 5/7] Final doc fixups --- doc/source/categorical.rst | 6 ++++-- doc/source/whatsnew/v0.21.0.txt | 3 ++- pandas/tests/test_categorical.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 55b5d93e94943..d2b53420764b2 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -207,8 +207,10 @@ All instances of ``CategoricalDtype`` compare equal to the string ``'category'`` .. warning:: Since ``dtype='category'`` is essentially ``CategoricalDtype(None, False)``, - and since all instances ``CategoricalDtype`` compare equal to ``'`category'``, - all instances of ``CategoricalDtype`` compare equal to a ``CategoricalDtype(None)`` + and since all instances ``CategoricalDtype`` compare equal to ``'category'``, + all instances of ``CategoricalDtype`` compare equal to a + ``CategoricalDtype(None, False)``, regardless of ``categories`` or + ``ordered``. Description ----------- diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index e73572c296eac..82b793f7f84c3 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -100,7 +100,8 @@ Setting a list-like data structure into a new attribute now raise a ``UserWarnin expanded to include the ``categories`` and ``ordered`` attributes. A ``CategoricalDtype`` can be used to specify the set of categories and orderedness of an array, independent of the data themselves. This can be useful, -e.g., when converting string data to a ``Categorical`` (:issue:`14711`, :issue:`15078`): +e.g., when converting string data to a ``Categorical`` (:issue:`14711`, +:issue:`15078`, :issue:`16015`): .. 
ipython:: python diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index d3a62cae685ed..9b124ba1f276d 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -199,7 +199,7 @@ def test_is_equal_dtype(self): assert not c1.is_dtype_equal(c1.astype(object)) assert c1.is_dtype_equal(CategoricalIndex(c1)) assert (c1.is_dtype_equal( - CategoricalIndex(c1, categories=list('cab')))) # XXX: changed + CategoricalIndex(c1, categories=list('cab')))) assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True)) def test_constructor(self): @@ -4216,7 +4216,7 @@ def test_categorical_index_preserver(self): # wrong catgories df3 = DataFrame({'A': a, - 'B': pd.Categorical(b, categories=list('abe')) # XXX + 'B': pd.Categorical(b, categories=list('abe')) }).set_index('B') pytest.raises(TypeError, lambda: pd.concat([df2, df3])) From 141e5094e13fecd68f1fc124e06e0349c29adc2a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 23 Sep 2017 07:56:56 -0500 Subject: [PATCH 6/7] Fixup set_categories inplace test --- pandas/core/categorical.py | 13 ++++++++++++- pandas/tests/test_categorical.py | 12 +++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index f5c2d0306a42c..48180f5e3217a 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -601,13 +601,24 @@ def _get_labels(self): labels = property(fget=_get_labels, fset=_set_codes) def _set_categories(self, categories, fastpath=False): - """ Sets new categories + """ Sets new categories inplace Parameters ---------- fastpath : boolean (default: False) Don't perform validation of the categories for uniqueness or nulls + Examples + -------- + >>> c = Categorical(['a', 'b']) + >>> c + [a, b] + Categories (2, object): [a, b] + + >>> c._set_categories(pd.Index(['a', 'c'])) + >>> c + [a, c] + Categories (2, object): [a, c] """ if fastpath: diff --git 
a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 9b124ba1f276d..71f43d1922085 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -965,13 +965,15 @@ def test_set_dtype_nans(self): def test_set_categories_private(self): cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd']) - result = cat._set_categories(['a', 'b', 'c', 'd', 'e']) - expected = Categorical(['a', 'b', 'c'], categories=list('abcde')) - tm.assert_categorical_equal(result, expected) + cat._set_categories(['a', 'c', 'd', 'e']) + expected = Categorical(['a', 'c', 'd'], categories=list('acde')) + tm.assert_categorical_equal(cat, expected) # fastpath - result = cat._set_categories(['a', 'b', 'c', 'd', 'e'], fastpath=True) - tm.assert_categorical_equal(result, expected) + cat = Categorical(['a', 'b', 'c'], categories=['a', 'b', 'c', 'd']) + cat._set_categories(['a', 'c', 'd', 'e'], fastpath=True) + expected = Categorical(['a', 'c', 'd'], categories=list('acde')) + tm.assert_categorical_equal(cat, expected) @pytest.mark.parametrize('values, categories, new_categories', [ # No NaNs, same cats, same order From 43f90cc13786b57b89709cdb7dd8d2c023adaee6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Sat, 23 Sep 2017 11:33:03 -0500 Subject: [PATCH 7/7] PEP8 --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index af382a05fee45..ecd4e8f719014 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from pandas.compat import PY3, is_platform_windows +from pandas.compat import PY3 from pandas.io.parquet import (to_parquet, read_parquet, get_engine, PyArrowImpl, FastParquetImpl) from pandas.util import testing as tm