From d9a62787518fd596226ab035850ac65c12c1a903 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 22 Dec 2018 20:50:21 +0000 Subject: [PATCH 01/10] Add dtype to Categorical.from_codes --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/arrays/categorical.py | 24 ++++---- pandas/core/indexes/category.py | 3 +- .../arrays/categorical/test_constructors.py | 60 ++++++++++++------- 4 files changed, 54 insertions(+), 34 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 4bc50695e1ecd..28fee434f63b9 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -403,6 +403,7 @@ Other Enhancements - :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). +- :meth:`Categorical.from_codes` now can take a dtype parameter (:issue:`24398`). - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). - Compatibility with Matplotlib 3.0 (:issue:`22790`). - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e145a479cd3cb..164912db42379 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -603,7 +603,7 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, return cls(codes, dtype=dtype, fastpath=True) @classmethod - def from_codes(cls, codes, categories, ordered=False): + def from_codes(cls, codes, categories=None, ordered=None, dtype=None): """ Make a Categorical type from codes and categories arrays. 
@@ -621,10 +621,19 @@ def from_codes(cls, codes, categories, ordered=False): categories or -1 for NaN categories : index-like The categories for the categorical. Items need to be unique. - ordered : boolean, (default False) + ordered : boolean, optional Whether or not this categorical is treated as a ordered categorical. If not given, the resulting categorical will be unordered. + + .. versionchanged:: 0.24.0 + + The default value has been changed to ``None``. Previously + the default value was ``False``. + dtype : CategoricalDtype, optional + An instance of ``CategoricalDtype`` to use for this categorical. + + .. versionadded:: 0.24.0 """ dtype = CategoricalDtype._from_values_or_dtype(codes, categories, ordered) @@ -1265,8 +1274,7 @@ def shift(self, periods, fill_value=None): else: codes[periods:] = fill_value - return self.from_codes(codes, categories=self.categories, - ordered=self.ordered) + return self.from_codes(codes, dtype=self.dtype) def __array__(self, dtype=None): """ @@ -1887,9 +1895,7 @@ def take_nd(self, indexer, allow_fill=None, fill_value=None): codes = take(self._codes, indexer, allow_fill=allow_fill, fill_value=fill_value) - result = type(self).from_codes(codes, - categories=dtype.categories, - ordered=dtype.ordered) + result = type(self).from_codes(codes, dtype=dtype) return result take = take_nd @@ -2078,9 +2084,7 @@ def __setitem__(self, key, value): new_codes = _recode_for_categories( value.codes, value.categories, self.categories ) - value = Categorical.from_codes(new_codes, - categories=self.categories, - ordered=self.ordered) + value = Categorical.from_codes(new_codes, dtype=self.dtype) rvalue = value if is_list_like(value) else [value] diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f76085f9889dd..e43b64827d02a 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -148,8 +148,7 @@ def _create_from_codes(self, codes, dtype=None, name=None): dtype = self.dtype if name is 
None: name = self.name - cat = Categorical.from_codes(codes, categories=dtype.categories, - ordered=dtype.ordered) + cat = Categorical.from_codes(codes, dtype=dtype) return CategoricalIndex(cat, name=name) @classmethod diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index f8e9e393091e5..d5a3829f2842b 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -417,33 +417,44 @@ def test_constructor_with_categorical_categories(self): def test_from_codes(self): # too few categories + dtype = CategoricalDtype(categories=[1, 2]) with pytest.raises(ValueError): - Categorical.from_codes([1, 2], [1, 2]) + Categorical.from_codes([1, 2], categories=dtype.categories) + with pytest.raises(ValueError): + Categorical.from_codes([1, 2], dtype=dtype) # no int codes with pytest.raises(ValueError): - Categorical.from_codes(["a"], [1, 2]) + Categorical.from_codes(["a"], categories=dtype.categories) + with pytest.raises(ValueError): + Categorical.from_codes(["a"], dtype=dtype) # no unique categories with pytest.raises(ValueError): - Categorical.from_codes([0, 1, 2], ["a", "a", "b"]) + Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) # NaN categories included with pytest.raises(ValueError): - Categorical.from_codes([0, 1, 2], ["a", "b", np.nan]) + Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) # too negative + dtype = CategoricalDtype(categories=["a", "b", "c"]) + with pytest.raises(ValueError): + Categorical.from_codes([-2, 1, 2], categories=dtype.categories) with pytest.raises(ValueError): - Categorical.from_codes([-2, 1, 2], ["a", "b", "c"]) + Categorical.from_codes([-2, 1, 2], dtype=dtype) exp = Categorical(["a", "b", "c"], ordered=False) - res = Categorical.from_codes([0, 1, 2], ["a", "b", "c"]) + res = Categorical.from_codes([0, 1, 2], categories=dtype.categories) + tm.assert_categorical_equal(exp, res) + + 
res = Categorical.from_codes([0, 1, 2], dtype=dtype) tm.assert_categorical_equal(exp, res) - # Not available in earlier numpy versions - if hasattr(np.random, "choice"): - codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) - Categorical.from_codes(codes, categories=["train", "test"]) + codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) + dtype = CategoricalDtype(categories=["train", "test"]) + Categorical.from_codes(codes, categories=dtype.categories) + Categorical.from_codes(codes, dtype=dtype) def test_from_codes_with_categorical_categories(self): # GH17884 @@ -464,22 +475,30 @@ def test_from_codes_with_categorical_categories(self): def test_from_codes_with_nan_code(self): # GH21767 codes = [1, 2, np.nan] - categories = ['a', 'b', 'c'] + dtype = CategoricalDtype(categories=['a', 'b', 'c']) with pytest.raises(ValueError): - Categorical.from_codes(codes, categories) + Categorical.from_codes(codes, categories=dtype.categories) + with pytest.raises(ValueError): + Categorical.from_codes(codes, dtype=dtype) def test_from_codes_with_float(self): # GH21767 codes = [1.0, 2.0, 0] # integer, but in float dtype - categories = ['a', 'b', 'c'] + dtype = CategoricalDtype(categories=['a', 'b', 'c']) with tm.assert_produces_warning(FutureWarning): - cat = Categorical.from_codes(codes, categories) + cat = Categorical.from_codes(codes, dtype.categories) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) + + with tm.assert_produces_warning(FutureWarning): + cat = Categorical.from_codes(codes, dtype=dtype) tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) codes = [1.1, 2.0, 0] # non-integer with pytest.raises(ValueError): - Categorical.from_codes(codes, categories) + Categorical.from_codes(codes, dtype.categories) + with pytest.raises(ValueError): + Categorical.from_codes(codes, dtype=dtype) @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): @@ -515,14 +534,11 @@ def 
test_from_inferred_categories_coerces(self): expected = Categorical([1, 1, 2, np.nan]) tm.assert_categorical_equal(result, expected) - def test_construction_with_ordered(self): + @pytest.mark.parametrize('ordered', [None, True, False]) + def test_construction_with_ordered(self, ordered): # GH 9347, 9190 - cat = Categorical([0, 1, 2]) - assert not cat.ordered - cat = Categorical([0, 1, 2], ordered=False) - assert not cat.ordered - cat = Categorical([0, 1, 2], ordered=True) - assert cat.ordered + cat = Categorical([0, 1, 2], ordered=ordered) + assert cat.ordered == bool(ordered) @pytest.mark.xfail(reason="Imaginary values not supported in Categorical") def test_constructor_imaginary(self): From 5dfca0516b2f96e5f03cfbf233006300f85630ba Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 22 Dec 2018 21:50:10 +0000 Subject: [PATCH 02/10] changes according to comments --- pandas/core/arrays/categorical.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 164912db42379..35731e7bd25d9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -619,10 +619,10 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): codes : array-like, integers An integer array, where each integer points to a category in categories or -1 for NaN - categories : index-like + categories : index-like, optional The categories for the categorical. Items need to be unique. - ordered : boolean, optional - Whether or not this categorical is treated as a ordered + ordered : bool, optional + Whether or not this categorical is treated as an ordered categorical. If not given, the resulting categorical will be unordered. 
From b97e8a6056af5d12ef273eda1bef539b99f3e7dd Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 22 Dec 2018 23:28:32 +0000 Subject: [PATCH 03/10] add match messages --- pandas/core/arrays/categorical.py | 4 +- .../arrays/categorical/test_constructors.py | 62 ++++++++++++------- pandas/tests/indexes/test_category.py | 2 +- 3 files changed, 42 insertions(+), 26 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 35731e7bd25d9..3d51ff6bde880 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -636,7 +636,7 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): .. versionadded:: 0.24.0 """ dtype = CategoricalDtype._from_values_or_dtype(codes, categories, - ordered) + ordered, dtype) codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): @@ -652,7 +652,7 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): raise ValueError(msg) try: - codes = coerce_indexer_dtype(codes, categories) + codes = coerce_indexer_dtype(codes, dtype.categories) except (ValueError, TypeError): raise ValueError( "codes need to be convertible to an arrays of integers") diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index d5a3829f2842b..20abf9900417f 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -77,7 +77,9 @@ def test_constructor_unsortable(self): assert not factor.ordered # this however will raise as cannot be sorted - with pytest.raises(TypeError): + msg = ("'values' is not ordered, please explicitly specify the " + "categories order by passing in a categories argument.") + with pytest.raises(TypeError, match=msg): Categorical(arr, ordered=True) def test_constructor_interval(self): @@ -99,10 +101,11 @@ def test_constructor(self): tm.assert_numpy_array_equal(c2.__array__(), exp_arr) # categories must be unique 
- with pytest.raises(ValueError): + msg = "Categorical categories must be unique" + with pytest.raises(ValueError, match=msg): Categorical([1, 2], [1, 2, 2]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Categorical(["a", "b"], ["a", "b", "b"]) # The default should be unordered @@ -211,21 +214,23 @@ def test_constructor(self): def test_constructor_not_sequence(self): # https://github.com/pandas-dev/pandas/issues/16022 - with pytest.raises(TypeError): + msg = r"^Parameter 'categories' must be list-like, was" + with pytest.raises(TypeError, match=msg): Categorical(['a', 'b'], categories='a') def test_constructor_with_null(self): # Cannot have NaN in categories - with pytest.raises(ValueError): + msg = "Categorial categories cannot be null" + with pytest.raises(ValueError, match=msg): Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"]) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Categorical(DatetimeIndex(['nat', '20160101']), categories=[NaT, Timestamp('20160101')]) @@ -347,13 +352,14 @@ def test_constructor_with_dtype(self, ordered): def test_constructor_dtype_and_others_raises(self): dtype = CategoricalDtype(['a', 'b'], ordered=True) - with pytest.raises(ValueError, match="Cannot"): + msg = "Cannot specify `categories` or `ordered` together with `dtype`." 
+ with pytest.raises(ValueError, match=msg): Categorical(['a', 'b'], categories=['a', 'b'], dtype=dtype) - with pytest.raises(ValueError, match="Cannot"): + with pytest.raises(ValueError, match=msg): Categorical(['a', 'b'], ordered=True, dtype=dtype) - with pytest.raises(ValueError, match="Cannot"): + with pytest.raises(ValueError, match=msg): Categorical(['a', 'b'], ordered=False, dtype=dtype) @pytest.mark.parametrize('categories', [ @@ -418,30 +424,35 @@ def test_from_codes(self): # too few categories dtype = CategoricalDtype(categories=[1, 2]) - with pytest.raises(ValueError): + msg = "codes need to be between " + with pytest.raises(ValueError, match=msg): Categorical.from_codes([1, 2], categories=dtype.categories) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Categorical.from_codes([1, 2], dtype=dtype) # no int codes - with pytest.raises(ValueError): + msg = "codes need to be array-like integers" + with pytest.raises(ValueError, match=msg): Categorical.from_codes(["a"], categories=dtype.categories) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Categorical.from_codes(["a"], dtype=dtype) # no unique categories - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="Categorical categories must be unique"): Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) # NaN categories included - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="Categorial categories cannot be null"): Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) # too negative dtype = CategoricalDtype(categories=["a", "b", "c"]) - with pytest.raises(ValueError): + msg = r"codes need to be between -1 and len\(categories\)-1" + with pytest.raises(ValueError, match=msg): Categorical.from_codes([-2, 1, 2], categories=dtype.categories) - with pytest.raises(ValueError): + with pytest.raises(ValueError, match=msg): Categorical.from_codes([-2, 1, 2], dtype=dtype) exp = 
Categorical(["a", "b", "c"], ordered=False) @@ -469,16 +480,19 @@ def test_from_codes_with_categorical_categories(self): tm.assert_categorical_equal(result, expected) # non-unique Categorical still raises - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="Categorical categories must be unique"): Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a'])) def test_from_codes_with_nan_code(self): # GH21767 codes = [1, 2, np.nan] dtype = CategoricalDtype(categories=['a', 'b', 'c']) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="codes need to be array-like integers"): Categorical.from_codes(codes, categories=dtype.categories) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype=dtype) def test_from_codes_with_float(self): @@ -495,9 +509,11 @@ def test_from_codes_with_float(self): tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) codes = [1.1, 2.0, 0] # non-integer - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype.categories) - with pytest.raises(ValueError): + with pytest.raises(ValueError, + match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype=dtype) @pytest.mark.parametrize('dtype', [None, 'category']) diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 8518c1fa369c2..d85568ce67d16 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -158,7 +158,7 @@ def test_construction_with_categorical_dtype(self): tm.assert_index_equal(result, expected, exact=True) # error when combining categories/ordered and dtype kwargs - msg = 'Cannot specify `categories` or `ordered` together with `dtype`.' + msg = "Cannot specify `categories` or `ordered` together with `dtype`." 
with pytest.raises(ValueError, match=msg): CategoricalIndex(data, categories=cats, dtype=dtype) From d86e75420467e467629674d9b6c84012e33c73a3 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 23 Dec 2018 09:13:01 +0000 Subject: [PATCH 04/10] Add a constructor example --- pandas/core/arrays/categorical.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 3d51ff6bde880..58921c5b5e874 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -634,6 +634,13 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): An instance of ``CategoricalDtype`` to use for this categorical. .. versionadded:: 0.24.0 + + Examples + -------- + >>> dtype = pd.api.types.CategoricalDtype(['a', 'b'], ordered=True) + >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) + [a, b, a, b] + Categories (2, object): [a < b] """ dtype = CategoricalDtype._from_values_or_dtype(codes, categories, ordered, dtype) From 6cf8203ec1353162473532131ea6eca48ead40aa Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 23 Dec 2018 16:24:11 +0000 Subject: [PATCH 05/10] deprecate categories and ordered parameters --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/arrays/categorical.py | 35 +++++--- pandas/core/groupby/grouper.py | 15 ++-- pandas/core/indexes/multi.py | 12 +-- pandas/io/packers.py | 6 +- pandas/io/pytables.py | 7 +- .../arrays/categorical/test_constructors.py | 83 ++++++------------- .../tests/arrays/categorical/test_subclass.py | 28 ++++--- pandas/tests/arrays/test_period.py | 5 +- pandas/tests/groupby/test_categorical.py | 42 +++++----- pandas/tests/indexes/test_category.py | 3 +- pandas/tests/io/test_stata.py | 13 +-- pandas/tests/reshape/test_cut.py | 8 +- pandas/tests/reshape/test_pivot.py | 18 ++-- pandas/tests/test_algos.py | 6 +- pandas/tests/util/test_hashing.py | 11 ++- 16 files changed, 138 insertions(+), 155 deletions(-) diff --git 
a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 28fee434f63b9..99bbfc8f0cf26 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1283,6 +1283,7 @@ Deprecations - :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) - The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. (:issue:`19715`) - :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) +- :meth:`Categorical.from_codes` has deprecated parameters ``categories`` and ``ordered``. Supply a :class:`~pandas.api.types.CategoricalDtype` to new parameter ``dtype`` instead. (:issue:`24398`) - :func:`pandas.read_table` is deprecated. Instead, use :func:`read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) - :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 58921c5b5e874..460498425a4ce 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -605,9 +605,9 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, @classmethod def from_codes(cls, codes, categories=None, ordered=None, dtype=None): """ - Make a Categorical type from codes and categories arrays. + Make a Categorical type from codes and CategoricalDtype. 
- This constructor is useful if you already have codes and categories and + This constructor is useful if you already have codes and the dtype and so do not need the (computation intensive) factorization step, which is usually done on the constructor. @@ -621,19 +621,21 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): categories or -1 for NaN categories : index-like, optional The categories for the categorical. Items need to be unique. + + .. deprecated:: 0.24.0 + Use ``dtype`` instead. ordered : bool, optional Whether or not this categorical is treated as an ordered categorical. If not given, the resulting categorical will be unordered. - .. versionchanged:: 0.24.0 - - The default value has been changed to ``None``. Previously - the default value was ``False``. - dtype : CategoricalDtype, optional + .. deprecated:: 0.24.0 + Use ``dtype`` instead. + dtype : CategoricalDtype An instance of ``CategoricalDtype`` to use for this categorical. .. versionadded:: 0.24.0 + dtype will be required in the future. Examples -------- @@ -642,8 +644,18 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): [a, b, a, b] Categories (2, object): [a < b] """ - dtype = CategoricalDtype._from_values_or_dtype(codes, categories, - ordered, dtype) + if dtype is not None: + if categories is not None or ordered is not None: + raise ValueError("Cannot specify `categories` or `ordered` " + "together with `dtype`.") + elif categories is None and dtype is None: + raise ValueError("Must specify `dtype`.") + else: + msg = u("The 'categories' and 'ordered' keyword are deprecated " + "and will be removed in a future version. 
Please use " + "'dtype' instead.") + warn(msg, FutureWarning, stacklevel=2) + dtype = CategoricalDtype(categories, ordered) codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): @@ -1211,9 +1223,8 @@ def map(self, mapper): """ new_categories = self.categories.map(mapper) try: - return self.from_codes(self._codes.copy(), - categories=new_categories, - ordered=self.ordered) + new_dtype = CategoricalDtype(new_categories, ordered=self.ordered) + return self.from_codes(self._codes.copy(), dtype=new_dtype) except ValueError: # NA values are represented in self._codes with -1 # np.take causes NA values to take final element in new_categories diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d8df227d4911a..df28a34868589 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -14,6 +14,7 @@ from pandas.core.dtypes.common import ( ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable, is_list_like, is_scalar, is_timedelta64_dtype) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCSeries import pandas.core.algorithms as algorithms @@ -292,7 +293,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, from pandas.core.groupby.categorical import recode_for_groupby self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed) - categories = self.grouper.categories + dtype = CategoricalDtype(self.grouper.categories, + ordered=self.grouper.ordered) # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes @@ -300,13 +302,10 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if observed: codes = algorithms.unique1d(self.grouper.codes) else: - codes = np.arange(len(categories)) + codes = np.arange(len(dtype.categories)) self._group_index = CategoricalIndex( - Categorical.from_codes( - codes=codes, - categories=categories, - 
ordered=self.grouper.ordered)) + Categorical.from_codes(codes=codes, dtype=dtype)) # we are done if isinstance(self.grouper, Grouping): @@ -395,8 +394,8 @@ def _make_labels(self): @cache_readonly def groups(self): - return self.index.groupby(Categorical.from_codes(self.labels, - self.group_index)) + return self.index.groupby( + Categorical(self.labels, self.group_index, fastpath=True)) def _get_grouper(obj, key=None, axis=0, level=None, sort=True, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 8d26080a0361d..cf8f712b21bfc 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -18,7 +18,8 @@ ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable, is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, pandas_dtype) -from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype +from pandas.core.dtypes.dtypes import ( + CategoricalDtype, ExtensionDtype, PandasExtensionDtype) from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.missing import array_equivalent, isna @@ -2026,13 +2027,14 @@ def _get_codes_for_sorting(self): """ from pandas.core.arrays import Categorical - def cats(level_codes): - return np.arange(np.array(level_codes).max() + 1 if + def as_dtype(level_codes): + cats = np.arange(np.array(level_codes).max() + 1 if len(level_codes) else 0, dtype=level_codes.dtype) + return CategoricalDtype(cats, ordered=True) - return [Categorical.from_codes(level_codes, cats(level_codes), - ordered=True) + return [Categorical.from_codes(level_codes, + dtype=as_dtype(level_codes)) for level_codes in self.codes] def sortlevel(self, level=0, ascending=True, sort_remaining=True): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index b83eab7d0eba0..0971b17292114 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -55,6 +55,7 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype, 
needs_i8_conversion, pandas_dtype) +from pandas.core.dtypes.dtypes import CategoricalDtype as CDT from pandas import ( # noqa:F401 Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, @@ -621,9 +622,8 @@ def decode(obj): name=obj[u'name']) elif typ == u'category': from_codes = globals()[obj[u'klass']].from_codes - return from_codes(codes=obj[u'codes'], - categories=obj[u'categories'], - ordered=obj[u'ordered']) + dtype = CDT(obj[u'categories'], ordered=obj[u'ordered']) + return from_codes(codes=obj[u'codes'], dtype=dtype) elif typ == u'interval': return Interval(obj[u'left'], obj[u'right'], obj[u'closed']) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index b115529f696b8..11ea4403703d6 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -24,6 +24,7 @@ ensure_int64, ensure_object, ensure_platform_int, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_list_like, is_timedelta64_dtype) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import array_equivalent from pandas import ( @@ -2206,10 +2207,8 @@ def convert(self, values, nan_rep, encoding, errors): categories = categories[~mask] codes[codes != -1] -= mask.astype(int).cumsum().values - self.data = Categorical.from_codes(codes, - categories=categories, - ordered=self.ordered) - + dtype = CategoricalDtype(categories, ordered=self.ordered) + self.data = Categorical.from_codes(codes, dtype=dtype) else: try: diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 20abf9900417f..dea9d9382c0fc 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -21,18 +21,13 @@ class TestCategoricalConstructors(object): def test_validate_ordered(self): # see gh-14058 exp_msg = "'ordered' must either be 'True' or 'False'" - exp_err = TypeError - # This should be a boolean. 
+ # This should be a boolean or None. ordered = np.array([0, 1, 2]) - with pytest.raises(exp_err, match=exp_msg): + with pytest.raises(TypeError, match=exp_msg): Categorical([1, 2, 3], ordered=ordered) - with pytest.raises(exp_err, match=exp_msg): - Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'], - ordered=ordered) - def test_constructor_empty(self): # GH 17248 c = Categorical([]) @@ -421,76 +416,41 @@ def test_constructor_with_categorical_categories(self): tm.assert_categorical_equal(result, expected) def test_from_codes(self): + dtype = CategoricalDtype(categories=[1, 2]) + + # no dtype or categories + msg = 'Must specify `dtype`.' + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([1, 2]) # too few categories - dtype = CategoricalDtype(categories=[1, 2]) msg = "codes need to be between " - with pytest.raises(ValueError, match=msg): - Categorical.from_codes([1, 2], categories=dtype.categories) with pytest.raises(ValueError, match=msg): Categorical.from_codes([1, 2], dtype=dtype) # no int codes msg = "codes need to be array-like integers" - with pytest.raises(ValueError, match=msg): - Categorical.from_codes(["a"], categories=dtype.categories) with pytest.raises(ValueError, match=msg): Categorical.from_codes(["a"], dtype=dtype) - # no unique categories - with pytest.raises(ValueError, - match="Categorical categories must be unique"): - Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) - - # NaN categories included - with pytest.raises(ValueError, - match="Categorial categories cannot be null"): - Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) - # too negative dtype = CategoricalDtype(categories=["a", "b", "c"]) msg = r"codes need to be between -1 and len\(categories\)-1" - with pytest.raises(ValueError, match=msg): - Categorical.from_codes([-2, 1, 2], categories=dtype.categories) with pytest.raises(ValueError, match=msg): Categorical.from_codes([-2, 1, 2], dtype=dtype) exp = Categorical(["a", "b", "c"], 
ordered=False) - res = Categorical.from_codes([0, 1, 2], categories=dtype.categories) - tm.assert_categorical_equal(exp, res) - res = Categorical.from_codes([0, 1, 2], dtype=dtype) tm.assert_categorical_equal(exp, res) codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) dtype = CategoricalDtype(categories=["train", "test"]) - Categorical.from_codes(codes, categories=dtype.categories) Categorical.from_codes(codes, dtype=dtype) - def test_from_codes_with_categorical_categories(self): - # GH17884 - expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) - - result = Categorical.from_codes( - [0, 1], categories=Categorical(['a', 'b', 'c'])) - tm.assert_categorical_equal(result, expected) - - result = Categorical.from_codes( - [0, 1], categories=CategoricalIndex(['a', 'b', 'c'])) - tm.assert_categorical_equal(result, expected) - - # non-unique Categorical still raises - with pytest.raises(ValueError, - match="Categorical categories must be unique"): - Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a'])) - def test_from_codes_with_nan_code(self): # GH21767 codes = [1, 2, np.nan] dtype = CategoricalDtype(categories=['a', 'b', 'c']) - with pytest.raises(ValueError, - match="codes need to be array-like integers"): - Categorical.from_codes(codes, categories=dtype.categories) with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype=dtype) @@ -500,28 +460,34 @@ def test_from_codes_with_float(self): codes = [1.0, 2.0, 0] # integer, but in float dtype dtype = CategoricalDtype(categories=['a', 'b', 'c']) - with tm.assert_produces_warning(FutureWarning): - cat = Categorical.from_codes(codes, dtype.categories) - tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) - - with tm.assert_produces_warning(FutureWarning): + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): cat = Categorical.from_codes(codes, dtype=dtype) tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], 
dtype='i1')) codes = [1.1, 2.0, 0] # non-integer - with pytest.raises(ValueError, - match="codes need to be array-like integers"): - Categorical.from_codes(codes, dtype.categories) with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype=dtype) + def test_from_codes_deprecated(self, ordered): + # GH24398 + cats = ['a', 'b'] + with tm.assert_produces_warning(FutureWarning): + Categorical.from_codes([0, 1], categories=cats) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + Categorical.from_codes([0, 1], categories=cats, ordered=True) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + Categorical.from_codes([0, 1], categories=cats, ordered=False) + @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): cats = ['a', 'b'] codes = np.array([0, 0, 1, 1], dtype='i8') result = Categorical._from_inferred_categories(cats, codes, dtype) - expected = Categorical.from_codes(codes, cats) + expected = Categorical.from_codes(codes, + dtype=CategoricalDtype(cats)) tm.assert_categorical_equal(result, expected) @pytest.mark.parametrize('dtype', [None, 'category']) @@ -529,7 +495,8 @@ def test_from_inferred_categories_sorts(self, dtype): cats = ['b', 'a'] codes = np.array([0, 1, 1, 1], dtype='i8') result = Categorical._from_inferred_categories(cats, codes, dtype) - expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b']) + expected = Categorical.from_codes([1, 0, 0, 0], + dtype=CategoricalDtype(['a', 'b'])) tm.assert_categorical_equal(result, expected) def test_from_inferred_categories_dtype(self): diff --git a/pandas/tests/arrays/categorical/test_subclass.py b/pandas/tests/arrays/categorical/test_subclass.py index 7e90f8d51a3ef..3dca568817a38 100644 --- a/pandas/tests/arrays/categorical/test_subclass.py +++ b/pandas/tests/arrays/categorical/test_subclass.py @@ -1,25 +1,29 @@ # -*- coding: utf-8 -*- from pandas import 
Categorical +from pandas.api.types import CategoricalDtype import pandas.util.testing as tm class TestCategoricalSubclassing(object): def test_constructor(self): - sc = tm.SubclassedCategorical(['a', 'b', 'c']) - assert isinstance(sc, tm.SubclassedCategorical) - tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c'])) + subclassed = tm.SubclassedCategorical(['a', 'b', 'c']) + assert isinstance(subclassed, tm.SubclassedCategorical) + tm.assert_categorical_equal(subclassed, Categorical(['a', 'b', 'c'])) def test_from_codes(self): - sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c']) - assert isinstance(sc, tm.SubclassedCategorical) - exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c']) - tm.assert_categorical_equal(sc, exp) + dtype = CategoricalDtype(['a', 'b', 'c']) + subclassed = tm.SubclassedCategorical.from_codes([1, 0, 2], + dtype=dtype) + assert isinstance(subclassed, tm.SubclassedCategorical) + + expected = Categorical.from_codes([1, 0, 2], dtype=dtype) + tm.assert_categorical_equal(subclassed, expected) def test_map(self): - sc = tm.SubclassedCategorical(['a', 'b', 'c']) - res = sc.map(lambda x: x.upper()) - assert isinstance(res, tm.SubclassedCategorical) - exp = Categorical(['A', 'B', 'C']) - tm.assert_categorical_equal(res, exp) + subclassed = tm.SubclassedCategorical(['a', 'b', 'c']) + result = subclassed.map(lambda x: x.upper()) + assert isinstance(result, tm.SubclassedCategorical) + expected = Categorical(['A', 'B', 'C']) + tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index affe3b3854490..7b1775366ef4c 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -7,6 +7,7 @@ from pandas.core.dtypes.dtypes import PeriodDtype, registry import pandas as pd +from pandas.api.types import CategoricalDtype as CDT from pandas.core.arrays import PeriodArray, period_array import pandas.util.testing as tm @@ -129,8 +130,8 
@@ def test_astype_copies(): def test_astype_categorical(): arr = period_array(['2000', '2001', '2001', None], freq='D') result = arr.astype('category') - categories = pd.PeriodIndex(['2000', '2001'], freq='D') - expected = pd.Categorical.from_codes([0, 1, 1, -1], categories=categories) + dtype = CDT(categories=pd.PeriodIndex(['2000', '2001'], freq='D')) + expected = pd.Categorical.from_codes([0, 1, 1, -1], dtype=dtype) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 144b64025e1c0..8bc167f602f91 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -11,6 +11,7 @@ import pandas as pd from pandas import ( Categorical, CategoricalIndex, DataFrame, Index, MultiIndex, Series, qcut) +from pandas.api.types import CategoricalDtype import pandas.util.testing as tm from pandas.util.testing import ( assert_equal, assert_frame_equal, assert_series_equal) @@ -22,10 +23,9 @@ def cartesian_product_for_groupers(result, args, names): def f(a): if isinstance(a, (CategoricalIndex, Categorical)): - categories = a.categories - a = Categorical.from_codes(np.arange(len(categories)), - categories=categories, - ordered=a.ordered) + dtype = CategoricalDtype(a.categories, ordered=a.ordered) + a = Categorical.from_codes(np.arange(len(dtype.categories)), + dtype=dtype) return a index = pd.MultiIndex.from_product(map(f, args), names=names) @@ -148,17 +148,17 @@ def f(x): # more basic levels = ['foo', 'bar', 'baz', 'qux'] + dtype = CategoricalDtype(levels, ordered=True) codes = np.random.randint(0, 4, size=100) - cats = Categorical.from_codes(codes, levels, ordered=True) + cats = Categorical.from_codes(codes, dtype=dtype) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() - exp_idx = CategoricalIndex(levels, categories=cats.categories, - 
ordered=True) + exp_idx = CategoricalIndex(levels, dtype=dtype) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) @@ -177,8 +177,7 @@ def f(x): assert_frame_equal(desc_result, expected) # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), - levels, ordered=True) + expc = Categorical.from_codes(np.arange(4).repeat(8), dtype=dtype) exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index .get_level_values(0)), exp) @@ -423,9 +422,10 @@ def test_observed_groups(observed): def test_datetime(): # GH9049: ensure backward compatibility levels = pd.date_range('2014-01-01', periods=4) + dtype = CategoricalDtype(levels, ordered=True) codes = np.random.randint(0, 4, size=100) - cats = Categorical.from_codes(codes, levels, ordered=True) + cats = Categorical.from_codes(codes, dtype=dtype) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats, observed=False).mean() @@ -452,8 +452,7 @@ def test_datetime(): expected.index.get_level_values(0)) # GH 10460 - expc = Categorical.from_codes( - np.arange(4).repeat(8), levels, ordered=True) + expc = Categorical.from_codes(np.arange(4).repeat(8), dtype=dtype) exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index .get_level_values(0)), exp) @@ -466,9 +465,9 @@ def test_datetime(): def test_categorical_index(): s = np.random.RandomState(12345) - levels = ['foo', 'bar', 'baz', 'qux'] + dtype = CategoricalDtype(['foo', 'bar', 'baz', 'qux'], ordered=True) codes = s.randint(0, 4, size=20) - cats = Categorical.from_codes(codes, levels, ordered=True) + cats = Categorical.from_codes(codes, dtype=dtype) df = DataFrame( np.repeat( np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) @@ -478,16 +477,14 @@ def test_categorical_index(): result = df.set_index('cats').groupby(level=0, observed=False).sum() expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], 
levels, ordered=True), name='cats') + Categorical.from_codes([0, 1, 2, 3], dtype=dtype), name='cats') assert_frame_equal(result, expected) # with a cat column, should produce a cat index result = df.groupby('cats', observed=False).sum() expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( - Categorical.from_codes( - [0, 1, 2, 3], levels, ordered=True), name='cats') + Categorical.from_codes([0, 1, 2, 3], dtype=dtype), name='cats') assert_frame_equal(result, expected) @@ -638,7 +635,8 @@ def test_categorical_no_compress(): data = Series(np.random.randn(9)) codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) - cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) + dtype = CategoricalDtype([0, 1, 2], ordered=True) + cats = Categorical.from_codes(codes, dtype=dtype) result = data.groupby(cats, observed=False).mean() exp = data.groupby(codes, observed=False).mean() @@ -648,12 +646,12 @@ def test_categorical_no_compress(): assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) - cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) + dtype = CategoricalDtype([0, 1, 2, 3], ordered=True) + cats = Categorical.from_codes(codes, dtype=dtype) result = data.groupby(cats, observed=False).mean() exp = data.groupby(codes, observed=False).mean().reindex(cats.categories) - exp.index = CategoricalIndex(exp.index, categories=cats.categories, - ordered=cats.ordered) + exp.index = CategoricalIndex(exp.index, dtype=cats.dtype) assert_series_equal(result, exp) cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index d85568ce67d16..869ac7793bd55 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -458,8 +458,9 @@ def test_astype(self): right=[2, 4], closed='right') + dtype = CategoricalDtype(categories=ii, ordered=True) ci = 
CategoricalIndex(Categorical.from_codes( - [0, 1, -1], categories=ii, ordered=True)) + [0, 1, -1], dtype=dtype)) result = ci.astype('interval') expected = ii.take([0, 1, -1]) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index ce9be6a7857bf..16bb012a4e503 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -19,6 +19,7 @@ from pandas.core.dtypes.common import is_categorical_dtype import pandas as pd +from pandas.api.types import CategoricalDtype as CDT from pandas.core.frame import DataFrame, Series import pandas.util.testing as tm @@ -958,7 +959,7 @@ def test_categorical_with_stata_missing_values(self, version): 'file', ['dta19_115', 'dta19_117']) def test_categorical_order(self, file): # Directly construct using expected codes - # Format is is_cat, col_name, labels (in order), underlying data + # Format is is_cat, col_name, categories (in order), underlying data expected = [(True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)), (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]), @@ -973,11 +974,13 @@ def test_categorical_order(self, file): (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))] cols = [] - for is_cat, col, labels, codes in expected: + for is_cat, col, categories, codes in expected: if is_cat: - cols.append((col, pd.Categorical.from_codes(codes, labels))) + dtype = CDT(categories) + cols.append((col, pd.Categorical.from_codes(codes, + dtype=dtype))) else: - cols.append((col, pd.Series(labels, dtype=np.float32))) + cols.append((col, pd.Series(categories, dtype=np.float32))) expected = DataFrame.from_dict(OrderedDict(cols)) # Read with and with out categoricals, ensure order is identical @@ -1005,7 +1008,7 @@ def test_categorical_sorting(self, file): parsed.index = np.arange(parsed.shape[0]) codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4] categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] - cat = pd.Categorical.from_codes(codes=codes, categories=categories) + 
cat = pd.Categorical.from_codes(codes=codes, dtype=CDT(categories)) expected = pd.Series(cat, name='srh') tm.assert_series_equal(expected, parsed["srh"], check_categorical=False) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 6833460fa515b..44c4b85da3a69 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -76,8 +76,7 @@ def test_bins_from_interval_index(): tm.assert_categorical_equal(result, expected) expected = Categorical.from_codes(np.append(c.codes, -1), - categories=c.categories, - ordered=True) + dtype=CDT(c.categories, ordered=True)) result = cut(range(6), bins=expected.categories) tm.assert_categorical_equal(result, expected) @@ -230,8 +229,9 @@ def test_cut_out_of_bounds(): lambda labels: Categorical(["Medium"] + 4 * ["Small"] + ["Medium", "Large"], categories=labels, ordered=True)), - (lambda labels: Categorical.from_codes([0, 1, 2], labels), - lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels)) + (lambda labels: Categorical.from_codes([0, 1, 2], dtype=CDT(labels)), + lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], + dtype=CDT(labels))) ]) def test_cut_pass_labels(get_labels, get_expected): bins = [0, 25, 50, 100] diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f0d1ad57ba829..770a8679c7889 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -151,13 +151,12 @@ def test_pivot_with_non_observable_dropna(self, dropna): 'B': range(5)}) result = df.pivot_table(index='A', values='B', dropna=dropna) + dtype = pd.api.types.CategoricalDtype(categories=['low', 'high'], + ordered=True) expected = pd.DataFrame( {'B': [2, 3]}, - index=pd.Index( - pd.Categorical.from_codes([0, 1], - categories=['low', 'high'], - ordered=True), - name='A')) + index=pd.Index(pd.Categorical.from_codes([0, 1], dtype=dtype), + name='A')) tm.assert_frame_equal(result, expected) @@ -169,13 +168,12 @@ def 
test_pivot_with_non_observable_dropna(self, dropna): 'B': range(5)}) result = df.pivot_table(index='A', values='B', dropna=dropna) + dtype = pd.api.types.CategoricalDtype(['low', 'high', 'left'], + ordered=True) expected = pd.DataFrame( {'B': [2, 3, 0]}, - index=pd.Index( - pd.Categorical.from_codes([0, 1, 2], - categories=['low', 'high', 'left'], - ordered=True), - name='A')) + index=pd.Index(pd.Categorical.from_codes([0, 1, 2], dtype=dtype), + name='A')) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 5951f5802f50e..abfe22dd9f129 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -671,9 +671,9 @@ def test_large(self): def test_categorical_from_codes(self): # GH 16639 vals = np.array([0, 1, 2, 0]) - cats = ['a', 'b', 'c'] - Sd = Series(Categorical(1).from_codes(vals, cats)) - St = Series(Categorical(1).from_codes(np.array([0, 1]), cats)) + dtype = CDT(['a', 'b', 'c']) + Sd = Series(Categorical(1).from_codes(vals, dtype=dtype)) + St = Series(Categorical(1).from_codes(np.array([0, 1]), dtype=dtype)) expected = np.array([True, True, False, True]) result = algos.isin(Sd, St) tm.assert_numpy_array_equal(expected, result) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index d36de931e2610..62f5ff6cf81ae 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -5,6 +5,7 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series +from pandas.api.types import CategoricalDtype as CDT from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples from pandas.util import hash_array, hash_pandas_object import pandas.util.testing as tm @@ -243,14 +244,12 @@ def test_categorical_consistency(s1, categorize): def test_categorical_with_nan_consistency(): - c = pd.Categorical.from_codes( - [-1, 0, 1, 2, 3, 4], - categories=pd.date_range("2012-01-01", periods=5, name="B")) + dtype = 
CDT(pd.date_range("2012-01-01", periods=5, name="B")) + c = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], dtype=dtype) expected = hash_array(c, categorize=False) - c = pd.Categorical.from_codes( - [-1, 0], - categories=[pd.Timestamp("2012-01-01")]) + c = pd.Categorical.from_codes([-1, 0], + dtype=CDT([pd.Timestamp("2012-01-01")])) result = hash_array(c, categorize=False) assert result[0] in expected From fccb54df64ec95af04c680e61127d95c7fcb2304 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 8 Jan 2019 00:36:34 +0000 Subject: [PATCH 06/10] Revert "deprecate categories and ordered parameters" This reverts commit 6cf8203ec1353162473532131ea6eca48ead40aa. --- doc/source/whatsnew/v0.24.0.rst | 1 - pandas/core/arrays/categorical.py | 35 +++----- pandas/core/groupby/grouper.py | 15 ++-- pandas/core/indexes/multi.py | 12 ++- pandas/io/packers.py | 6 +- pandas/io/pytables.py | 7 +- .../arrays/categorical/test_constructors.py | 83 +++++++++++++------ .../tests/arrays/categorical/test_subclass.py | 28 +++---- pandas/tests/arrays/test_period.py | 5 +- pandas/tests/groupby/test_categorical.py | 42 +++++----- pandas/tests/indexes/test_category.py | 3 +- pandas/tests/io/test_stata.py | 13 ++- pandas/tests/reshape/test_cut.py | 8 +- pandas/tests/reshape/test_pivot.py | 18 ++-- pandas/tests/test_algos.py | 6 +- pandas/tests/util/test_hashing.py | 11 +-- 16 files changed, 155 insertions(+), 138 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 99bbfc8f0cf26..28fee434f63b9 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1283,7 +1283,6 @@ Deprecations - :meth:`Series.compress` is deprecated. Use ``Series[condition]`` instead (:issue:`18262`) - The signature of :meth:`Series.to_csv` has been uniformed to that of :meth:`DataFrame.to_csv`: the name of the first argument is now ``path_or_buf``, the order of subsequent arguments has changed, the ``header`` argument now defaults to ``True``. 
(:issue:`19715`) - :meth:`Categorical.from_codes` has deprecated providing float values for the ``codes`` argument. (:issue:`21767`) -- :meth:`Categorical.from_codes` has deprecated parameters ``categories`` and ``ordered``. Supply a :class:`~pandas.api.types.CategoricalDtype` to new parameter ``dtype`` instead. (:issue:`24398`) - :func:`pandas.read_table` is deprecated. Instead, use :func:`read_csv` passing ``sep='\t'`` if necessary (:issue:`21948`) - :meth:`Series.str.cat` has deprecated using arbitrary list-likes *within* list-likes. A list-like container may still contain many ``Series``, ``Index`` or 1-dimensional ``np.ndarray``, or alternatively, only scalar values. (:issue:`21950`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 460498425a4ce..58921c5b5e874 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -605,9 +605,9 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, @classmethod def from_codes(cls, codes, categories=None, ordered=None, dtype=None): """ - Make a Categorical type from codes and CategoricalDtype. + Make a Categorical type from codes and categories arrays. - This constructor is useful if you already have codes and the dtype and + This constructor is useful if you already have codes and categories and so do not need the (computation intensive) factorization step, which is usually done on the constructor. @@ -621,21 +621,19 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): categories or -1 for NaN categories : index-like, optional The categories for the categorical. Items need to be unique. - - .. deprecated:: 0.24.0 - Use ``dtype`` instead. ordered : bool, optional Whether or not this categorical is treated as an ordered categorical. If not given, the resulting categorical will be unordered. - .. deprecated:: 0.24.0 - Use ``dtype`` instead. - dtype : CategoricalDtype + .. 
versionchanged:: 0.24.0 + + The default value has been changed to ``None``. Previously + the default value was ``False``. + dtype : CategoricalDtype, optional An instance of ``CategoricalDtype`` to use for this categorical. .. versionadded:: 0.24.0 - dtype will be required in the future. Examples -------- @@ -644,18 +642,8 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): [a, b, a, b] Categories (2, object): [a < b] """ - if dtype is not None: - if categories is not None or ordered is not None: - raise ValueError("Cannot specify `categories` or `ordered` " - "together with `dtype`.") - elif categories is None and dtype is None: - raise ValueError("Must specify `dtype`.") - else: - msg = u("The 'categories' and 'ordered' keyword are deprecated " - "and will be removed in a future version. Please use " - "'dtype' instead.") - warn(msg, FutureWarning, stacklevel=2) - dtype = CategoricalDtype(categories, ordered) + dtype = CategoricalDtype._from_values_or_dtype(codes, categories, + ordered, dtype) codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): @@ -1223,8 +1211,9 @@ def map(self, mapper): """ new_categories = self.categories.map(mapper) try: - new_dtype = CategoricalDtype(new_categories, ordered=self.ordered) - return self.from_codes(self._codes.copy(), dtype=new_dtype) + return self.from_codes(self._codes.copy(), + categories=new_categories, + ordered=self.ordered) except ValueError: # NA values are represented in self._codes with -1 # np.take causes NA values to take final element in new_categories diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index df28a34868589..d8df227d4911a 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -14,7 +14,6 @@ from pandas.core.dtypes.common import ( ensure_categorical, is_categorical_dtype, is_datetime64_dtype, is_hashable, is_list_like, is_scalar, is_timedelta64_dtype) -from pandas.core.dtypes.dtypes import CategoricalDtype from 
pandas.core.dtypes.generic import ABCSeries import pandas.core.algorithms as algorithms @@ -293,8 +292,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, from pandas.core.groupby.categorical import recode_for_groupby self.grouper, self.all_grouper = recode_for_groupby( self.grouper, self.sort, observed) - dtype = CategoricalDtype(self.grouper.categories, - ordered=self.grouper.ordered) + categories = self.grouper.categories # we make a CategoricalIndex out of the cat grouper # preserving the categories / ordered attributes @@ -302,10 +300,13 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if observed: codes = algorithms.unique1d(self.grouper.codes) else: - codes = np.arange(len(dtype.categories)) + codes = np.arange(len(categories)) self._group_index = CategoricalIndex( - Categorical.from_codes(codes=codes, dtype=dtype)) + Categorical.from_codes( + codes=codes, + categories=categories, + ordered=self.grouper.ordered)) # we are done if isinstance(self.grouper, Grouping): @@ -394,8 +395,8 @@ def _make_labels(self): @cache_readonly def groups(self): - return self.index.groupby( - Categorical(self.labels, self.group_index, fastpath=True)) + return self.index.groupby(Categorical.from_codes(self.labels, + self.group_index)) def _get_grouper(obj, key=None, axis=0, level=None, sort=True, diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index cf8f712b21bfc..8d26080a0361d 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -18,8 +18,7 @@ ensure_int64, ensure_platform_int, is_categorical_dtype, is_hashable, is_integer, is_iterator, is_list_like, is_object_dtype, is_scalar, pandas_dtype) -from pandas.core.dtypes.dtypes import ( - CategoricalDtype, ExtensionDtype, PandasExtensionDtype) +from pandas.core.dtypes.dtypes import ExtensionDtype, PandasExtensionDtype from pandas.core.dtypes.generic import ABCDataFrame from pandas.core.dtypes.missing import array_equivalent, isna 
@@ -2027,14 +2026,13 @@ def _get_codes_for_sorting(self): """ from pandas.core.arrays import Categorical - def as_dtype(level_codes): - cats = np.arange(np.array(level_codes).max() + 1 if + def cats(level_codes): + return np.arange(np.array(level_codes).max() + 1 if len(level_codes) else 0, dtype=level_codes.dtype) - return CategoricalDtype(cats, ordered=True) - return [Categorical.from_codes(level_codes, - dtype=as_dtype(level_codes)) + return [Categorical.from_codes(level_codes, cats(level_codes), + ordered=True) for level_codes in self.codes] def sortlevel(self, level=0, ascending=True, sort_remaining=True): diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 0971b17292114..b83eab7d0eba0 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -55,7 +55,6 @@ from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64tz_dtype, is_object_dtype, needs_i8_conversion, pandas_dtype) -from pandas.core.dtypes.dtypes import CategoricalDtype as CDT from pandas import ( # noqa:F401 Categorical, CategoricalIndex, DataFrame, DatetimeIndex, Float64Index, @@ -622,8 +621,9 @@ def decode(obj): name=obj[u'name']) elif typ == u'category': from_codes = globals()[obj[u'klass']].from_codes - dtype = CDT(obj[u'categories'], ordered=obj[u'ordered']) - return from_codes(codes=obj[u'codes'], dtype=dtype) + return from_codes(codes=obj[u'codes'], + categories=obj[u'categories'], + ordered=obj[u'ordered']) elif typ == u'interval': return Interval(obj[u'left'], obj[u'right'], obj[u'closed']) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 11ea4403703d6..b115529f696b8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -24,7 +24,6 @@ ensure_int64, ensure_object, ensure_platform_int, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_list_like, is_timedelta64_dtype) -from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import array_equivalent from pandas import ( @@ -2207,8 
+2206,10 @@ def convert(self, values, nan_rep, encoding, errors): categories = categories[~mask] codes[codes != -1] -= mask.astype(int).cumsum().values - dtype = CategoricalDtype(categories, ordered=self.ordered) - self.data = Categorical.from_codes(codes, dtype=dtype) + self.data = Categorical.from_codes(codes, + categories=categories, + ordered=self.ordered) + else: try: diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index dea9d9382c0fc..20abf9900417f 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -21,13 +21,18 @@ class TestCategoricalConstructors(object): def test_validate_ordered(self): # see gh-14058 exp_msg = "'ordered' must either be 'True' or 'False'" + exp_err = TypeError - # This should be a boolean or None. + # This should be a boolean. ordered = np.array([0, 1, 2]) - with pytest.raises(TypeError, match=exp_msg): + with pytest.raises(exp_err, match=exp_msg): Categorical([1, 2, 3], ordered=ordered) + with pytest.raises(exp_err, match=exp_msg): + Categorical.from_codes([0, 0, 1], categories=['a', 'b', 'c'], + ordered=ordered) + def test_constructor_empty(self): # GH 17248 c = Categorical([]) @@ -416,41 +421,76 @@ def test_constructor_with_categorical_categories(self): tm.assert_categorical_equal(result, expected) def test_from_codes(self): - dtype = CategoricalDtype(categories=[1, 2]) - - # no dtype or categories - msg = 'Must specify `dtype`.' 
- with pytest.raises(ValueError, match=msg): - Categorical.from_codes([1, 2]) # too few categories + dtype = CategoricalDtype(categories=[1, 2]) msg = "codes need to be between " + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([1, 2], categories=dtype.categories) with pytest.raises(ValueError, match=msg): Categorical.from_codes([1, 2], dtype=dtype) # no int codes msg = "codes need to be array-like integers" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes(["a"], categories=dtype.categories) with pytest.raises(ValueError, match=msg): Categorical.from_codes(["a"], dtype=dtype) + # no unique categories + with pytest.raises(ValueError, + match="Categorical categories must be unique"): + Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"]) + + # NaN categories included + with pytest.raises(ValueError, + match="Categorial categories cannot be null"): + Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan]) + # too negative dtype = CategoricalDtype(categories=["a", "b", "c"]) msg = r"codes need to be between -1 and len\(categories\)-1" + with pytest.raises(ValueError, match=msg): + Categorical.from_codes([-2, 1, 2], categories=dtype.categories) with pytest.raises(ValueError, match=msg): Categorical.from_codes([-2, 1, 2], dtype=dtype) exp = Categorical(["a", "b", "c"], ordered=False) + res = Categorical.from_codes([0, 1, 2], categories=dtype.categories) + tm.assert_categorical_equal(exp, res) + res = Categorical.from_codes([0, 1, 2], dtype=dtype) tm.assert_categorical_equal(exp, res) codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) dtype = CategoricalDtype(categories=["train", "test"]) + Categorical.from_codes(codes, categories=dtype.categories) Categorical.from_codes(codes, dtype=dtype) + def test_from_codes_with_categorical_categories(self): + # GH17884 + expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) + + result = Categorical.from_codes( + [0, 1], categories=Categorical(['a', 'b', 'c'])) + 
tm.assert_categorical_equal(result, expected) + + result = Categorical.from_codes( + [0, 1], categories=CategoricalIndex(['a', 'b', 'c'])) + tm.assert_categorical_equal(result, expected) + + # non-unique Categorical still raises + with pytest.raises(ValueError, + match="Categorical categories must be unique"): + Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a'])) + def test_from_codes_with_nan_code(self): # GH21767 codes = [1, 2, np.nan] dtype = CategoricalDtype(categories=['a', 'b', 'c']) + with pytest.raises(ValueError, + match="codes need to be array-like integers"): + Categorical.from_codes(codes, categories=dtype.categories) with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype=dtype) @@ -460,34 +500,28 @@ def test_from_codes_with_float(self): codes = [1.0, 2.0, 0] # integer, but in float dtype dtype = CategoricalDtype(categories=['a', 'b', 'c']) - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning(FutureWarning): + cat = Categorical.from_codes(codes, dtype.categories) + tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) + + with tm.assert_produces_warning(FutureWarning): cat = Categorical.from_codes(codes, dtype=dtype) tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype='i1')) codes = [1.1, 2.0, 0] # non-integer + with pytest.raises(ValueError, + match="codes need to be array-like integers"): + Categorical.from_codes(codes, dtype.categories) with pytest.raises(ValueError, match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype=dtype) - def test_from_codes_deprecated(self, ordered): - # GH24398 - cats = ['a', 'b'] - with tm.assert_produces_warning(FutureWarning): - Categorical.from_codes([0, 1], categories=cats) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - Categorical.from_codes([0, 1], categories=cats, ordered=True) - - with 
tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - Categorical.from_codes([0, 1], categories=cats, ordered=False) - @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): cats = ['a', 'b'] codes = np.array([0, 0, 1, 1], dtype='i8') result = Categorical._from_inferred_categories(cats, codes, dtype) - expected = Categorical.from_codes(codes, - dtype=CategoricalDtype(cats)) + expected = Categorical.from_codes(codes, cats) tm.assert_categorical_equal(result, expected) @pytest.mark.parametrize('dtype', [None, 'category']) @@ -495,8 +529,7 @@ def test_from_inferred_categories_sorts(self, dtype): cats = ['b', 'a'] codes = np.array([0, 1, 1, 1], dtype='i8') result = Categorical._from_inferred_categories(cats, codes, dtype) - expected = Categorical.from_codes([1, 0, 0, 0], - dtype=CategoricalDtype(['a', 'b'])) + expected = Categorical.from_codes([1, 0, 0, 0], ['a', 'b']) tm.assert_categorical_equal(result, expected) def test_from_inferred_categories_dtype(self): diff --git a/pandas/tests/arrays/categorical/test_subclass.py b/pandas/tests/arrays/categorical/test_subclass.py index 3dca568817a38..7e90f8d51a3ef 100644 --- a/pandas/tests/arrays/categorical/test_subclass.py +++ b/pandas/tests/arrays/categorical/test_subclass.py @@ -1,29 +1,25 @@ # -*- coding: utf-8 -*- from pandas import Categorical -from pandas.api.types import CategoricalDtype import pandas.util.testing as tm class TestCategoricalSubclassing(object): def test_constructor(self): - subclassed = tm.SubclassedCategorical(['a', 'b', 'c']) - assert isinstance(subclassed, tm.SubclassedCategorical) - tm.assert_categorical_equal(subclassed, Categorical(['a', 'b', 'c'])) + sc = tm.SubclassedCategorical(['a', 'b', 'c']) + assert isinstance(sc, tm.SubclassedCategorical) + tm.assert_categorical_equal(sc, Categorical(['a', 'b', 'c'])) def test_from_codes(self): - dtype = CategoricalDtype(['a', 'b', 'c']) - subclassed = tm.SubclassedCategorical.from_codes([1, 0, 
2], - dtype=dtype) - assert isinstance(subclassed, tm.SubclassedCategorical) - - expected = Categorical.from_codes([1, 0, 2], dtype=dtype) - tm.assert_categorical_equal(subclassed, expected) + sc = tm.SubclassedCategorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + assert isinstance(sc, tm.SubclassedCategorical) + exp = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c']) + tm.assert_categorical_equal(sc, exp) def test_map(self): - subclassed = tm.SubclassedCategorical(['a', 'b', 'c']) - result = subclassed.map(lambda x: x.upper()) - assert isinstance(result, tm.SubclassedCategorical) - expected = Categorical(['A', 'B', 'C']) - tm.assert_categorical_equal(result, expected) + sc = tm.SubclassedCategorical(['a', 'b', 'c']) + res = sc.map(lambda x: x.upper()) + assert isinstance(res, tm.SubclassedCategorical) + exp = Categorical(['A', 'B', 'C']) + tm.assert_categorical_equal(res, exp) diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py index 7b1775366ef4c..affe3b3854490 100644 --- a/pandas/tests/arrays/test_period.py +++ b/pandas/tests/arrays/test_period.py @@ -7,7 +7,6 @@ from pandas.core.dtypes.dtypes import PeriodDtype, registry import pandas as pd -from pandas.api.types import CategoricalDtype as CDT from pandas.core.arrays import PeriodArray, period_array import pandas.util.testing as tm @@ -130,8 +129,8 @@ def test_astype_copies(): def test_astype_categorical(): arr = period_array(['2000', '2001', '2001', None], freq='D') result = arr.astype('category') - dtype = CDT(categories=pd.PeriodIndex(['2000', '2001'], freq='D')) - expected = pd.Categorical.from_codes([0, 1, 1, -1], dtype=dtype) + categories = pd.PeriodIndex(['2000', '2001'], freq='D') + expected = pd.Categorical.from_codes([0, 1, 1, -1], categories=categories) tm.assert_categorical_equal(result, expected) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8bc167f602f91..144b64025e1c0 100644 --- 
a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -11,7 +11,6 @@ import pandas as pd from pandas import ( Categorical, CategoricalIndex, DataFrame, Index, MultiIndex, Series, qcut) -from pandas.api.types import CategoricalDtype import pandas.util.testing as tm from pandas.util.testing import ( assert_equal, assert_frame_equal, assert_series_equal) @@ -23,9 +22,10 @@ def cartesian_product_for_groupers(result, args, names): def f(a): if isinstance(a, (CategoricalIndex, Categorical)): - dtype = CategoricalDtype(a.categories, ordered=a.ordered) - a = Categorical.from_codes(np.arange(len(dtype.categories)), - dtype=dtype) + categories = a.categories + a = Categorical.from_codes(np.arange(len(categories)), + categories=categories, + ordered=a.ordered) return a index = pd.MultiIndex.from_product(map(f, args), names=names) @@ -148,17 +148,17 @@ def f(x): # more basic levels = ['foo', 'bar', 'baz', 'qux'] - dtype = CategoricalDtype(levels, ordered=True) codes = np.random.randint(0, 4, size=100) - cats = Categorical.from_codes(codes, dtype=dtype) + cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats, observed=False).mean() expected = data.groupby(np.asarray(cats), observed=False).mean() - exp_idx = CategoricalIndex(levels, dtype=dtype) + exp_idx = CategoricalIndex(levels, categories=cats.categories, + ordered=True) expected = expected.reindex(exp_idx) assert_frame_equal(result, expected) @@ -177,7 +177,8 @@ def f(x): assert_frame_equal(desc_result, expected) # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), dtype=dtype) + expc = Categorical.from_codes(np.arange(4).repeat(8), + levels, ordered=True) exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index .get_level_values(0)), exp) @@ -422,10 +423,9 @@ def test_observed_groups(observed): def test_datetime(): # GH9049: ensure backward compatibility levels = 
pd.date_range('2014-01-01', periods=4) - dtype = CategoricalDtype(levels, ordered=True) codes = np.random.randint(0, 4, size=100) - cats = Categorical.from_codes(codes, dtype=dtype) + cats = Categorical.from_codes(codes, levels, ordered=True) data = DataFrame(np.random.randn(100, 4)) result = data.groupby(cats, observed=False).mean() @@ -452,7 +452,8 @@ def test_datetime(): expected.index.get_level_values(0)) # GH 10460 - expc = Categorical.from_codes(np.arange(4).repeat(8), dtype=dtype) + expc = Categorical.from_codes( + np.arange(4).repeat(8), levels, ordered=True) exp = CategoricalIndex(expc) tm.assert_index_equal((desc_result.stack().index .get_level_values(0)), exp) @@ -465,9 +466,9 @@ def test_datetime(): def test_categorical_index(): s = np.random.RandomState(12345) - dtype = CategoricalDtype(['foo', 'bar', 'baz', 'qux'], ordered=True) + levels = ['foo', 'bar', 'baz', 'qux'] codes = s.randint(0, 4, size=20) - cats = Categorical.from_codes(codes, dtype=dtype) + cats = Categorical.from_codes(codes, levels, ordered=True) df = DataFrame( np.repeat( np.arange(20), 4).reshape(-1, 4), columns=list('abcd')) @@ -477,14 +478,16 @@ def test_categorical_index(): result = df.set_index('cats').groupby(level=0, observed=False).sum() expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( - Categorical.from_codes([0, 1, 2, 3], dtype=dtype), name='cats') + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') assert_frame_equal(result, expected) # with a cat column, should produce a cat index result = df.groupby('cats', observed=False).sum() expected = df[list('abcd')].groupby(cats.codes, observed=False).sum() expected.index = CategoricalIndex( - Categorical.from_codes([0, 1, 2, 3], dtype=dtype), name='cats') + Categorical.from_codes( + [0, 1, 2, 3], levels, ordered=True), name='cats') assert_frame_equal(result, expected) @@ -635,8 +638,7 @@ def test_categorical_no_compress(): data = 
Series(np.random.randn(9)) codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) - dtype = CategoricalDtype([0, 1, 2], ordered=True) - cats = Categorical.from_codes(codes, dtype=dtype) + cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True) result = data.groupby(cats, observed=False).mean() exp = data.groupby(codes, observed=False).mean() @@ -646,12 +648,12 @@ def test_categorical_no_compress(): assert_series_equal(result, exp) codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3]) - dtype = CategoricalDtype([0, 1, 2, 3], ordered=True) - cats = Categorical.from_codes(codes, dtype=dtype) + cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True) result = data.groupby(cats, observed=False).mean() exp = data.groupby(codes, observed=False).mean().reindex(cats.categories) - exp.index = CategoricalIndex(exp.index, dtype=cats.dtype) + exp.index = CategoricalIndex(exp.index, categories=cats.categories, + ordered=cats.ordered) assert_series_equal(result, exp) cats = Categorical(["a", "a", "a", "b", "b", "b", "c", "c", "c"], diff --git a/pandas/tests/indexes/test_category.py b/pandas/tests/indexes/test_category.py index 869ac7793bd55..d85568ce67d16 100644 --- a/pandas/tests/indexes/test_category.py +++ b/pandas/tests/indexes/test_category.py @@ -458,9 +458,8 @@ def test_astype(self): right=[2, 4], closed='right') - dtype = CategoricalDtype(categories=ii, ordered=True) ci = CategoricalIndex(Categorical.from_codes( - [0, 1, -1], dtype=dtype)) + [0, 1, -1], categories=ii, ordered=True)) result = ci.astype('interval') expected = ii.take([0, 1, -1]) diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 16bb012a4e503..ce9be6a7857bf 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -19,7 +19,6 @@ from pandas.core.dtypes.common import is_categorical_dtype import pandas as pd -from pandas.api.types import CategoricalDtype as CDT from pandas.core.frame import DataFrame, Series import pandas.util.testing as tm @@ -959,7 +958,7 
@@ def test_categorical_with_stata_missing_values(self, version): 'file', ['dta19_115', 'dta19_117']) def test_categorical_order(self, file): # Directly construct using expected codes - # Format is is_cat, col_name, categories (in order), underlying data + # Format is is_cat, col_name, labels (in order), underlying data expected = [(True, 'ordered', ['a', 'b', 'c', 'd', 'e'], np.arange(5)), (True, 'reverse', ['a', 'b', 'c', 'd', 'e'], np.arange(5)[::-1]), @@ -974,13 +973,11 @@ def test_categorical_order(self, file): (True, 'int32_mixed', ['d', 2, 'e', 'b', 'a'], np.arange(5))] cols = [] - for is_cat, col, categories, codes in expected: + for is_cat, col, labels, codes in expected: if is_cat: - dtype = CDT(categories) - cols.append((col, pd.Categorical.from_codes(codes, - dtype=dtype))) + cols.append((col, pd.Categorical.from_codes(codes, labels))) else: - cols.append((col, pd.Series(categories, dtype=np.float32))) + cols.append((col, pd.Series(labels, dtype=np.float32))) expected = DataFrame.from_dict(OrderedDict(cols)) # Read with and with out categoricals, ensure order is identical @@ -1008,7 +1005,7 @@ def test_categorical_sorting(self, file): parsed.index = np.arange(parsed.shape[0]) codes = [-1, -1, 0, 1, 1, 1, 2, 2, 3, 4] categories = ["Poor", "Fair", "Good", "Very good", "Excellent"] - cat = pd.Categorical.from_codes(codes=codes, dtype=CDT(categories)) + cat = pd.Categorical.from_codes(codes=codes, categories=categories) expected = pd.Series(cat, name='srh') tm.assert_series_equal(expected, parsed["srh"], check_categorical=False) diff --git a/pandas/tests/reshape/test_cut.py b/pandas/tests/reshape/test_cut.py index 44c4b85da3a69..6833460fa515b 100644 --- a/pandas/tests/reshape/test_cut.py +++ b/pandas/tests/reshape/test_cut.py @@ -76,7 +76,8 @@ def test_bins_from_interval_index(): tm.assert_categorical_equal(result, expected) expected = Categorical.from_codes(np.append(c.codes, -1), - dtype=CDT(c.categories, ordered=True)) + categories=c.categories, + 
ordered=True) result = cut(range(6), bins=expected.categories) tm.assert_categorical_equal(result, expected) @@ -229,9 +230,8 @@ def test_cut_out_of_bounds(): lambda labels: Categorical(["Medium"] + 4 * ["Small"] + ["Medium", "Large"], categories=labels, ordered=True)), - (lambda labels: Categorical.from_codes([0, 1, 2], dtype=CDT(labels)), - lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], - dtype=CDT(labels))) + (lambda labels: Categorical.from_codes([0, 1, 2], labels), + lambda labels: Categorical.from_codes([1] + 4 * [0] + [1, 2], labels)) ]) def test_cut_pass_labels(get_labels, get_expected): bins = [0, 25, 50, 100] diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 770a8679c7889..f0d1ad57ba829 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -151,12 +151,13 @@ def test_pivot_with_non_observable_dropna(self, dropna): 'B': range(5)}) result = df.pivot_table(index='A', values='B', dropna=dropna) - dtype = pd.api.types.CategoricalDtype(categories=['low', 'high'], - ordered=True) expected = pd.DataFrame( {'B': [2, 3]}, - index=pd.Index(pd.Categorical.from_codes([0, 1], dtype=dtype), - name='A')) + index=pd.Index( + pd.Categorical.from_codes([0, 1], + categories=['low', 'high'], + ordered=True), + name='A')) tm.assert_frame_equal(result, expected) @@ -168,12 +169,13 @@ def test_pivot_with_non_observable_dropna(self, dropna): 'B': range(5)}) result = df.pivot_table(index='A', values='B', dropna=dropna) - dtype = pd.api.types.CategoricalDtype(['low', 'high', 'left'], - ordered=True) expected = pd.DataFrame( {'B': [2, 3, 0]}, - index=pd.Index(pd.Categorical.from_codes([0, 1, 2], dtype=dtype), - name='A')) + index=pd.Index( + pd.Categorical.from_codes([0, 1, 2], + categories=['low', 'high', 'left'], + ordered=True), + name='A')) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index abfe22dd9f129..5951f5802f50e 
100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -671,9 +671,9 @@ def test_large(self): def test_categorical_from_codes(self): # GH 16639 vals = np.array([0, 1, 2, 0]) - dtype = CDT(['a', 'b', 'c']) - Sd = Series(Categorical(1).from_codes(vals, dtype=dtype)) - St = Series(Categorical(1).from_codes(np.array([0, 1]), dtype=dtype)) + cats = ['a', 'b', 'c'] + Sd = Series(Categorical(1).from_codes(vals, cats)) + St = Series(Categorical(1).from_codes(np.array([0, 1]), cats)) expected = np.array([True, True, False, True]) result = algos.isin(Sd, St) tm.assert_numpy_array_equal(expected, result) diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 62f5ff6cf81ae..d36de931e2610 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -5,7 +5,6 @@ import pandas as pd from pandas import DataFrame, Index, MultiIndex, Series -from pandas.api.types import CategoricalDtype as CDT from pandas.core.util.hashing import _hash_scalar, hash_tuple, hash_tuples from pandas.util import hash_array, hash_pandas_object import pandas.util.testing as tm @@ -244,12 +243,14 @@ def test_categorical_consistency(s1, categorize): def test_categorical_with_nan_consistency(): - dtype = CDT(pd.date_range("2012-01-01", periods=5, name="B")) - c = pd.Categorical.from_codes([-1, 0, 1, 2, 3, 4], dtype=dtype) + c = pd.Categorical.from_codes( + [-1, 0, 1, 2, 3, 4], + categories=pd.date_range("2012-01-01", periods=5, name="B")) expected = hash_array(c, categorize=False) - c = pd.Categorical.from_codes([-1, 0], - dtype=CDT([pd.Timestamp("2012-01-01")])) + c = pd.Categorical.from_codes( + [-1, 0], + categories=[pd.Timestamp("2012-01-01")]) result = hash_array(c, categorize=False) assert result[0] in expected From 700223548f0c17bcde2fafbeda0c88d181c25ac1 Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 8 Jan 2019 00:44:33 +0000 Subject: [PATCH 07/10] clean-up Categorical.from_codes --- pandas/core/arrays/categorical.py | 
29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 58921c5b5e874..a656ed1f9d95d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -605,11 +605,11 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, @classmethod def from_codes(cls, codes, categories=None, ordered=None, dtype=None): """ - Make a Categorical type from codes and categories arrays. + Make a Categorical type from codes and categories or dtype. - This constructor is useful if you already have codes and categories and - so do not need the (computation intensive) factorization step, which is - usually done on the constructor. + This constructor is useful if you already have codes and + categories/dtype and so do not need the (computation intensive) + factorization step, which is usually done on the constructor. If your data does not follow this convention, please use the normal constructor. @@ -618,9 +618,12 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): ---------- codes : array-like, integers An integer array, where each integer points to a category in - categories or -1 for NaN + categories or dtype.categories, or else is -1 for NaN categories : index-like, optional The categories for the categorical. Items need to be unique. + .. versionchanged:: 0.24.0 + + The `categories` parameter has been made optional. ordered : bool, optional Whether or not this categorical is treated as an ordered categorical. If not given, the resulting categorical will be @@ -630,8 +633,9 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): The default value has been changed to ``None``. Previously the default value was ``False``. - dtype : CategoricalDtype, optional - An instance of ``CategoricalDtype`` to use for this categorical. 
+ dtype : CategoricalDtype or the string "category", optional + If :class:`CategoricalDtype`, cannot be used together with + `categories` or `ordered`. .. versionadded:: 0.24.0 @@ -642,8 +646,9 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): [a, b, a, b] Categories (2, object): [a < b] """ - dtype = CategoricalDtype._from_values_or_dtype(codes, categories, - ordered, dtype) + dtype = CategoricalDtype._from_values_or_dtype(categories=categories, + ordered=ordered, + dtype=dtype) codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): @@ -658,12 +663,6 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): if msg: raise ValueError(msg) - try: - codes = coerce_indexer_dtype(codes, dtype.categories) - except (ValueError, TypeError): - raise ValueError( - "codes need to be convertible to an arrays of integers") - if len(codes) and ( codes.max() >= len(dtype.categories) or codes.min() < -1): raise ValueError("codes need to be between -1 and " From 6008c08a331e877d554ec0400641fe9008385b4b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 8 Jan 2019 08:45:51 -0600 Subject: [PATCH 08/10] updates --- doc/source/whatsnew/v0.24.0.rst | 2 +- pandas/core/arrays/categorical.py | 17 ++++++----------- .../arrays/categorical/test_constructors.py | 15 ++++++++++----- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 28fee434f63b9..cf5973921fe8a 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -403,7 +403,7 @@ Other Enhancements - :meth:`pandas.api.types.is_list_like` has gained a keyword ``allow_sets`` which is ``True`` by default; if ``False``, all instances of ``set`` will not be considered "list-like" anymore (:issue:`23061`) - :meth:`Index.to_frame` now supports overriding column name(s) (:issue:`22580`). -- :meth:`Categorical.from_codes` now can take a dtype parameter (:issue:`24398`). 
+- :meth:`Categorical.from_codes` now can take a ``dtype`` parameter as an alternative to passing ``categories`` and ``ordered`` (:issue:`24398`). - New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`). - Compatibility with Matplotlib 3.0 (:issue:`22790`). - Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index a656ed1f9d95d..d1b09e138fbbc 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -621,27 +621,22 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): categories or dtype.categories, or else is -1 for NaN categories : index-like, optional The categories for the categorical. Items need to be unique. - .. versionchanged:: 0.24.0 - - The `categories` parameter has been made optional. ordered : bool, optional Whether or not this categorical is treated as an ordered - categorical. If not given, the resulting categorical will be - unordered. - - .. versionchanged:: 0.24.0 - - The default value has been changed to ``None``. Previously - the default value was ``False``. + categorical. If not given here or in `dtype`, the resulting + categorical will be unordered. dtype : CategoricalDtype or the string "category", optional If :class:`CategoricalDtype`, cannot be used together with `categories` or `ordered`. .. versionadded:: 0.24.0 + When `dtype` is provided, neither `categories` nor `ordered` + should be provided. 
+ Examples -------- - >>> dtype = pd.api.types.CategoricalDtype(['a', 'b'], ordered=True) + >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True) >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype) [a, b, a, b] Categories (2, object): [a < b] diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index 20abf9900417f..ffe2493e98acd 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -462,11 +462,6 @@ def test_from_codes(self): res = Categorical.from_codes([0, 1, 2], dtype=dtype) tm.assert_categorical_equal(exp, res) - codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) - dtype = CategoricalDtype(categories=["train", "test"]) - Categorical.from_codes(codes, categories=dtype.categories) - Categorical.from_codes(codes, dtype=dtype) - def test_from_codes_with_categorical_categories(self): # GH17884 expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) @@ -516,6 +511,16 @@ def test_from_codes_with_float(self): match="codes need to be array-like integers"): Categorical.from_codes(codes, dtype=dtype) + def test_from_codes_with_dtype_raises(self): + msg = 'Cannot specify' + with pytest.raises(ValueError, match=msg): + Categorical([0, 1], categories=['a', 'b'], + dtype=CategoricalDtype(['a', 'b'])) + + with pytest.raises(ValueError, match=msg): + Categorical([0, 1], ordered=True, + dtype=CategoricalDtype(['a', 'b'])) + @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): cats = ['a', 'b'] From 71445304114bf03a2e7e2d43c440c5be48278089 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 8 Jan 2019 08:51:51 -0600 Subject: [PATCH 09/10] Fixups * Bug in test not using from_codes * Raise when neither provided --- pandas/core/arrays/categorical.py | 6 ++++++ .../tests/arrays/categorical/test_constructors.py | 13 +++++++++---- 2 files changed, 15 insertions(+), 4 
deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d1b09e138fbbc..b4f22dbfb11c0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -621,6 +621,8 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): categories or dtype.categories, or else is -1 for NaN categories : index-like, optional The categories for the categorical. Items need to be unique. + If the categories are not given here, then theey must be provided + in `dtype`. ordered : bool, optional Whether or not this categorical is treated as an ordered categorical. If not given here or in `dtype`, the resulting @@ -644,6 +646,10 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): dtype = CategoricalDtype._from_values_or_dtype(categories=categories, ordered=ordered, dtype=dtype) + if dtype.categories is None: + msg = ("The categories must be provided in 'categories' or " + "'dtype'. Both were None.") + raise ValueError(msg) codes = np.asarray(codes) # #21767 if not is_integer_dtype(codes): diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index ffe2493e98acd..25c299692ceca 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -514,12 +514,17 @@ def test_from_codes_with_float(self): def test_from_codes_with_dtype_raises(self): msg = 'Cannot specify' with pytest.raises(ValueError, match=msg): - Categorical([0, 1], categories=['a', 'b'], - dtype=CategoricalDtype(['a', 'b'])) + Categorical.from_codes([0, 1], categories=['a', 'b'], + dtype=CategoricalDtype(['a', 'b'])) with pytest.raises(ValueError, match=msg): - Categorical([0, 1], ordered=True, - dtype=CategoricalDtype(['a', 'b'])) + Categorical.from_codes([0, 1], ordered=True, + dtype=CategoricalDtype(['a', 'b'])) + + def test_from_codes_neither(self): + msg = "Both were None" + with 
pytest.raises(ValueError, match=msg): + Categorical.from_codes([0, 1]) @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): From 0459ad0986e445e69819fa573def6861e0d79b3e Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 8 Jan 2019 08:54:10 -0600 Subject: [PATCH 10/10] Update categorical.py --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index b4f22dbfb11c0..f88249d0fa6b2 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -621,7 +621,7 @@ def from_codes(cls, codes, categories=None, ordered=None, dtype=None): categories or dtype.categories, or else is -1 for NaN categories : index-like, optional The categories for the categorical. Items need to be unique. - If the categories are not given here, then theey must be provided + If the categories are not given here, then they must be provided in `dtype`. ordered : bool, optional Whether or not this categorical is treated as an ordered