From c5858674a4faae0e12d8cf057790afa4f24991fa Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 11 May 2023 20:05:18 +0100 Subject: [PATCH 1/9] ENH: allow more specific categorical dtype strings, e.g. `category[string]'. --- doc/source/user_guide/categorical.rst | 15 ++++++- doc/source/whatsnew/v2.1.0.rst | 21 +++++++-- pandas/core/arrays/categorical.py | 2 +- pandas/core/dtypes/dtypes.py | 62 +++++++++++++++++++++------ 4 files changed, 79 insertions(+), 21 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index e486235f044f5..ea6e240777e75 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -138,7 +138,18 @@ behavior: 1. Categories are inferred from the data. 2. Categories are unordered. -To control those behaviors, instead of passing ``'category'``, use an instance +It is also possible to give a dtype inside brackets to ensure the dtype of the categories, like this: + +.. ipython:: python + + s = pd.Series(["a", "b", "c", "a"], dtype="category[string]") + s.dtype.categories + +.. versionadded:: 2.1.0 + + The ability to specify the categories dtype in the dtype string was added in :ref:`v2.1.0 <whatsnew_210.enhancements.category_subtype>` + +To control those behaviors even more, instead of passing ``'category'``, use an instance of :class:`~pandas.api.types.CategoricalDtype`. .. ipython:: python @@ -146,7 +157,7 @@ of :class:`~pandas.api.types.CategoricalDtype`. from pandas.api.types import CategoricalDtype s = pd.Series(["a", "b", "c", "a"]) - cat_type = CategoricalDtype(categories=["b", "c", "d"], ordered=True) + cat_type = CategoricalDtype(["b", "c", "d"], ordered=True, categories_dtype="string") s_cat = s.astype(cat_type) s_cat diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 2c5263f447951..b6b60cf8bbf28 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -14,12 +14,24 @@ including other versions of pandas.
Enhancements ~~~~~~~~~~~~ -.. _whatsnew_210.enhancements.enhancement1: +.. _whatsnew_210.enhancements.category_subtype: + +Specific categorical string dtypes like ``dtype="category[string]"`` now works +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When giving the string ``"category"`` as a dtype it is now possible to specify the dtype +of the categories as part of the dtype string: + +.. ipython:: python + + ser = pd.Series(["a", "b", np.nan], dtype="category[string]") + ser.dtype.categories + +The expression inside the brackets can be any string that Pandas accepts for a dtype and +whose data can be stored in an :class:`Index` (:issue:`48515`). -enhancement1 -^^^^^^^^^^^^ -.. _whatsnew_210.enhancements.enhancement2: +.. _whatsnew_210.enhancements.map_na_action: ``map(func, na_action="ignore")`` now works for all array types ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -99,6 +111,7 @@ Other enhancements - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) +- :class:`CategoricalDtype` has gotten a new parameter and attribute named :meth:`CategoricalDtype.categories_dtype` (:issue:`48515`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6eb21fae29612..bfa827962a040 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -453,7 +453,7 @@ def __init__( ) from err # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + dtype = CategoricalDtype(categories, dtype.ordered, dtype.categories_dtype) elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes 
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 2d0ec66dbc9cb..5be48dc3efc0f 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -152,13 +152,17 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): ---------- categories : sequence, optional Must be unique, and must not contain any nulls. - The categories are stored in an Index, - and if an index is provided the dtype of that index will be used. + The categories are stored in an Index. ordered : bool or None, default False Whether or not this categorical is treated as a ordered categorical. None can be used to maintain the ordered value of existing categoricals when used in operations that combine categoricals, e.g. astype, and will resolve to False if there is no existing ordered to maintain. + categories_dtype : dtype, optional + If given, will be the dtype of the categories. + If not given, the categories dtype will be inferred. + + .. versionadded:: 2.1.0 Attributes ---------- @@ -181,14 +185,14 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): Examples -------- - >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True) + >>> t = pd.CategoricalDtype(['b', 'a'], ordered=True, categories_dtype="string") >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) 0 a 1 b 2 a 3 NaN dtype: category - Categories (2, object): ['b' < 'a'] + Categories (2, string): ['b' < 'a'] An empty CategoricalDtype with a specific dtype can be created by providing an empty index. 
As follows, @@ -205,8 +209,19 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): base = np.dtype("O") _metadata = ("categories", "ordered") _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} + _categories_dtype: Dtype | None = None + _match = re.compile(r"category\[(?P<categories_dtype>.+)\]") + + def __init__( + self, + categories=None, + ordered: Ordered = False, + categories_dtype: Dtype | None = None, + ) -> None: + if categories_dtype is not None: + from pandas.core.dtypes.common import pandas_dtype - def __init__(self, categories=None, ordered: Ordered = False) -> None: + self._categories_dtype = pandas_dtype(categories_dtype) self._finalize(categories, ordered, fastpath=False) @classmethod @@ -352,12 +367,31 @@ def construct_from_string(cls, string: str_type) -> CategoricalDtype: raise TypeError( f"'construct_from_string' expects a string, got {type(string)}" ) - if string != cls.name: - raise TypeError(f"Cannot construct a 'CategoricalDtype' from '{string}'") # need ordered=None to ensure that operations specifying dtype="category" don't # override the ordered value for existing categoricals - return cls(ordered=None) + + if string == cls.name: + return cls(ordered=None) + + msg = f"Cannot construct a '{cls.__name__}' from '{string}'" + match = cls._match.match(string) + if match: + d = match.groupdict() + try: + return cls(categories_dtype=d["categories_dtype"]) + except (KeyError, TypeError, ValueError) as err: + # keyError is if "categories_dtype" key is not found + # TypeError if we pass a nonsense; + raise TypeError(msg) from err + raise TypeError(msg) + + @property + def categories_dtype(self) -> Dtype: + if self.categories is None: + return self._categories_dtype + + return self.categories.dtype def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None: if ordered is not None: @@ -451,18 +485,16 @@ def __eq__(self, other: Any) -> bool: def __repr__(self) -> str_type: if self.categories is None: data = "None" - dtype = "None"
else: data = self.categories._format_data(name=type(self).__name__) if data is None: # self.categories is RangeIndex data = str(self.categories._range) data = data.rstrip(", ") - dtype = self.categories.dtype return ( f"CategoricalDtype(categories={data}, ordered={self.ordered}, " - f"categories_dtype={dtype})" + f"categories_dtype={self.categories_dtype})" ) @cache_readonly @@ -537,8 +569,7 @@ def validate_ordered(ordered: Ordered) -> None: if not is_bool(ordered): raise TypeError("'ordered' must either be 'True' or 'False'") - @staticmethod - def validate_categories(categories, fastpath: bool = False) -> Index: + def validate_categories(self, categories, fastpath: bool = False) -> Index: """ Validates that we have good categories @@ -558,8 +589,11 @@ def validate_categories(categories, fastpath: bool = False) -> Index: raise TypeError( f"Parameter 'categories' must be list-like, was {repr(categories)}" ) + dtype = self._categories_dtype if not isinstance(categories, ABCIndex): - categories = Index._with_infer(categories, tupleize_cols=False) + categories = Index._with_infer(categories, dtype=dtype, tupleize_cols=False) + elif dtype is not None: + categories = categories.astype(dtype) if not fastpath: if categories.hasnans: From 7ad0624ce2eaba3e59750e93d6817c4178eeae99 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Thu, 11 May 2023 23:36:30 +0100 Subject: [PATCH 2/9] fix precommit --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b6b60cf8bbf28..ed99eff90a2a2 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -108,10 +108,10 @@ Other enhancements - Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. 
(:issue:`52084`) - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`) - Performance improvement in :func:`read_csv` (:issue:`52632`) with ``engine="c"`` +- :class:`CategoricalDtype` has gotten a new parameter and attribute named :meth:`CategoricalDtype.categories_dtype` (:issue:`48515`) - :meth:`Categorical.from_codes` has gotten a ``validate`` parameter (:issue:`50975`) - Added ``engine_kwargs`` parameter to :meth:`DataFrame.to_excel` (:issue:`53220`) - Performance improvement in :func:`concat` with homogeneous ``np.float64`` or ``np.float32`` dtypes (:issue:`52685`) -- :class:`CategoricalDtype` has gotten a new parameter and attribute named :meth:`CategoricalDtype.categories_dtype` (:issue:`48515`) - Performance improvement in :meth:`DataFrame.filter` when ``items`` is given (:issue:`52941`) - From d4aa357e64443a136003f419bcb0488c4787c489 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Fri, 12 May 2023 06:41:15 +0100 Subject: [PATCH 3/9] git doc test (interim) --- pandas/core/dtypes/dtypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 5be48dc3efc0f..68842bb1e2838 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -185,14 +185,14 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): Examples -------- - >>> t = pd.CategoricalDtype(['b', 'a'], ordered=True, categories_dtype="string") + >>> t = pd.CategoricalDtype(['b', 'a'], ordered=True) >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) 0 a 1 b 2 a 3 NaN dtype: category - Categories (2, string): ['b' < 'a'] + Categories (2, object): ['b' < 'a'] An empty CategoricalDtype with a specific dtype can be created by providing an empty index. 
As follows, From 6c9a33ea8e8bd4be20eb8e492f653894c36f7961 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Mon, 22 May 2023 21:09:26 +0100 Subject: [PATCH 4/9] add categories_dtype to dtype string --- pandas/_libs/lib.pyx | 6 ++ pandas/core/arrays/categorical.py | 2 +- pandas/core/dtypes/common.py | 5 +- pandas/core/dtypes/dtypes.py | 45 ++++++++---- .../arrays/categorical/test_operators.py | 4 +- pandas/tests/arrays/categorical/test_repr.py | 70 +++++++++---------- pandas/tests/dtypes/test_dtypes.py | 2 +- pandas/tests/frame/methods/test_to_records.py | 5 +- pandas/tests/frame/test_reductions.py | 9 +-- pandas/tests/groupby/test_function.py | 4 +- pandas/tests/groupby/test_groupby.py | 2 +- pandas/tests/groupby/test_raises.py | 64 ++++++++++------- .../tests/indexes/categorical/test_formats.py | 24 +++---- pandas/tests/series/methods/test_nlargest.py | 3 +- pandas/tests/series/test_arithmetic.py | 2 +- pandas/tests/series/test_repr.py | 38 +++++----- 16 files changed, 158 insertions(+), 127 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index bc2886e5b531c..a634290fd3d62 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1388,6 +1388,12 @@ cdef object _try_infer_map(object dtype): val = getattr(dtype, attr, None) if val in _TYPE_MAP: return _TYPE_MAP[val] + + # CategoricalDtype may have name category[dtype], so not caught above + name = getattr(dtype, "name", None) + if name.startswith("category["): + return _TYPE_MAP["category"] + return None diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bfa827962a040..011d9c72ff996 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1999,7 +1999,7 @@ def _repr_categories_info(self) -> str: def _repr_footer(self) -> str: info = self._repr_categories_info() - return f"Length: {len(self)}\n{info}" + return f"Length: {len(self)}, dtype: {self.dtype}\n{info}" def _get_repr( self, length: bool = True, na_rep: str = 
"NaN", footer: bool = True diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 3931b12e06f9b..376a6ee61079c 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -502,9 +502,8 @@ def is_categorical_dtype(arr_or_dtype) -> bool: FutureWarning, stacklevel=find_stack_level(), ) - if isinstance(arr_or_dtype, ExtensionDtype): - # GH#33400 fastpath for dtype object - return arr_or_dtype.name == "category" + if isinstance(arr_or_dtype, CategoricalDtype): + return True if arr_or_dtype is None: return False diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 68842bb1e2838..8ed97ad10cf22 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -202,7 +202,6 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): """ # TODO: Document public vs. private API - name = "category" type: type[CategoricalDtypeType] = CategoricalDtypeType kind: str_type = "O" str = "|O08" @@ -315,12 +314,12 @@ def _from_values_or_dtype( if dtype is not None: # The dtype argument takes precedence over values.dtype (if any) if isinstance(dtype, str): - if dtype == "category": + if dtype.startswith("category"): if ordered is None and cls.is_dtype(values): # GH#49309 preserve orderedness ordered = values.dtype.ordered - - dtype = CategoricalDtype(categories, ordered) + cat_dtype = cls._get_categories_dtype_from_string(dtype) + dtype = CategoricalDtype(categories, ordered, cat_dtype) else: raise ValueError(f"Unknown dtype {repr(dtype)}") elif categories is not None or ordered is not None: @@ -371,20 +370,27 @@ def construct_from_string(cls, string: str_type) -> CategoricalDtype: # need ordered=None to ensure that operations specifying dtype="category" don't # override the ordered value for existing categoricals - if string == cls.name: + if string == "category": return cls(ordered=None) msg = f"Cannot construct a '{cls.__name__}' from '{string}'" + categories_dtype = 
cls._get_categories_dtype_from_string(string) + if categories_dtype is None: + raise TypeError(msg) + try: + return cls(categories_dtype=categories_dtype) + except (KeyError, TypeError, ValueError) as err: + # keyError is if "categories_dtype" key is not found + # TypeError if we pass a nonsense; + raise TypeError(msg) from err + + @classmethod + def _get_categories_dtype_from_string(cls, string: str_type) -> str_type | None: match = cls._match.match(string) - if match: - d = match.groupdict() - try: - return cls(categories_dtype=d["categories_dtype"]) - except (KeyError, TypeError, ValueError) as err: - # keyError is if "categories_dtype" key is not found - # TypeError if we pass a nonsense; - raise TypeError(msg) from err - raise TypeError(msg) + if match is None: + return None + d = match.groupdict() + return d.get("categories_dtype") @property def categories_dtype(self) -> Dtype: @@ -435,7 +441,7 @@ def __eq__(self, other: Any) -> bool: 6) Any other comparison returns False """ if isinstance(other, str): - return other == self.name + return other == self.name or other == "category" elif other is self: return True elif not (hasattr(other, "ordered") and hasattr(other, "categories")): @@ -497,6 +503,15 @@ def __repr__(self) -> str_type: f"categories_dtype={self.categories_dtype})" ) + @property + def name(self) -> str_type: + if self.categories is not None: + return f"category[{self.categories.dtype}]" + elif self.categories_dtype is not None: + return f"category[{self.categories_dtype}]" + else: + return "category" + @cache_readonly def _hash_categories(self) -> int: from pandas.core.util.hashing import ( diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py index de88960280102..13f67b0d9ea6c 100644 --- a/pandas/tests/arrays/categorical/test_operators.py +++ b/pandas/tests/arrays/categorical/test_operators.py @@ -169,7 +169,7 @@ def test_comparison_with_unknown_scalars(self): # for unequal comps, but 
not for equal/not equal cat = Categorical([1, 2, 3], ordered=True) - msg = "Invalid comparison between dtype=category and int" + msg = r"Invalid comparison between dtype=category\[int64\] and int" with pytest.raises(TypeError, match=msg): cat < 4 with pytest.raises(TypeError, match=msg): @@ -398,6 +398,6 @@ def test_numeric_like_ops_series_arith(self, op, str_rep): def test_numeric_like_ops_series_invalid(self): # invalid ufunc s = Series(Categorical([1, 2, 3, 4])) - msg = "Object with dtype category cannot perform the numpy op log" + msg = r"Object with dtype category\[int64\] cannot perform the numpy op log" with pytest.raises(TypeError, match=msg): np.log(s) diff --git a/pandas/tests/arrays/categorical/test_repr.py b/pandas/tests/arrays/categorical/test_repr.py index cdf5d967d9c3d..101efc28279a4 100644 --- a/pandas/tests/arrays/categorical/test_repr.py +++ b/pandas/tests/arrays/categorical/test_repr.py @@ -30,7 +30,7 @@ def test_big_print(self): factor = Categorical.from_codes(codes, dtype=dtype) expected = [ "['a', 'b', 'c', 'a', 'b', ..., 'b', 'c', 'a', 'b', 'c']", - "Length: 600", + "Length: 600, dtype: category[object]", "Categories (3, object): ['a', 'b', 'c']", ] expected = "\n".join(expected) @@ -60,7 +60,7 @@ def test_print_none_width(self): a = Series(Categorical([1, 2, 3, 4])) exp = ( "0 1\n1 2\n2 3\n3 4\n" - "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + "dtype: category[int64]\nCategories (4, int64): [1, 2, 3, 4]" ) with option_context("display.width", None): @@ -70,7 +70,7 @@ def test_unicode_print(self): c = Categorical(["aaaaa", "bb", "cccc"] * 20) expected = """\ ['aaaaa', 'bb', 'cccc', 'aaaaa', 'bb', ..., 'bb', 'cccc', 'aaaaa', 'bb', 'cccc'] -Length: 60 +Length: 60, dtype: category[object] Categories (3, object): ['aaaaa', 'bb', 'cccc']""" assert repr(c) == expected @@ -78,7 +78,7 @@ def test_unicode_print(self): c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) expected = """\ ['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 
'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう'] -Length: 60 +Length: 60, dtype: category[object] Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 assert repr(c) == expected @@ -88,7 +88,7 @@ def test_unicode_print(self): with option_context("display.unicode.east_asian_width", True): c = Categorical(["ああああ", "いいいいい", "ううううううう"] * 20) expected = """['ああああ', 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', ..., 'いいいいい', 'ううううううう', 'ああああ', 'いいいいい', 'ううううううう'] -Length: 60 +Length: 60, dtype: category[object] Categories (3, object): ['ああああ', 'いいいいい', 'ううううううう']""" # noqa: E501 assert repr(c) == expected @@ -108,14 +108,14 @@ def test_categorical_repr(self): c = Categorical([1, 2, 3, 4, 5] * 10) exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] -Length: 50 +Length: 50, dtype: category[int64] Categories (5, int64): [1, 2, 3, 4, 5]""" assert repr(c) == exp c = Categorical(np.arange(20, dtype=np.int64)) exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] -Length: 20 +Length: 20, dtype: category[int64] Categories (20, int64): [0, 1, 2, 3, ..., 16, 17, 18, 19]""" assert repr(c) == exp @@ -135,14 +135,14 @@ def test_categorical_repr_ordered(self): c = Categorical([1, 2, 3, 4, 5] * 10, ordered=True) exp = """[1, 2, 3, 4, 5, ..., 1, 2, 3, 4, 5] -Length: 50 +Length: 50, dtype: category[int64] Categories (5, int64): [1 < 2 < 3 < 4 < 5]""" assert repr(c) == exp c = Categorical(np.arange(20, dtype=np.int64), ordered=True) exp = """[0, 1, 2, 3, 4, ..., 15, 16, 17, 18, 19] -Length: 20 +Length: 20, dtype: category[int64] Categories (20, int64): [0 < 1 < 2 < 3 ... 
16 < 17 < 18 < 19]""" assert repr(c) == exp @@ -249,7 +249,7 @@ def test_categorical_repr_int_with_nan(self): s = Series([1, 2, np.nan], dtype="object").astype("category") s_exp = """0 1\n1 2\n2 NaN -dtype: category +dtype: category[int64] Categories (2, int64): [1, 2]""" assert repr(s) == s_exp @@ -328,7 +328,7 @@ def test_categorical_repr_timedelta(self): idx = timedelta_range("1 hours", periods=20) c = Categorical(idx) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] -Length: 20 +Length: 20, dtype: category[timedelta64[ns]] Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501 @@ -337,7 +337,7 @@ def test_categorical_repr_timedelta(self): c = Categorical(idx.append(idx), categories=idx) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] -Length: 40 +Length: 40, dtype: category[timedelta64[ns]] Categories (20, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00]""" # noqa: E501 @@ -361,7 +361,7 @@ def test_categorical_repr_timedelta_ordered(self): idx = timedelta_range("1 hours", periods=20) c = Categorical(idx, ordered=True) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] -Length: 20 +Length: 20, dtype: category[timedelta64[ns]] Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < 3 days 01:00:00 ... 
16 days 01:00:00 < 17 days 01:00:00 < 18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501 @@ -370,7 +370,7 @@ def test_categorical_repr_timedelta_ordered(self): c = Categorical(idx.append(idx), categories=idx, ordered=True) exp = """[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, 4 days 01:00:00, ..., 15 days 01:00:00, 16 days 01:00:00, 17 days 01:00:00, 18 days 01:00:00, 19 days 01:00:00] -Length: 40 +Length: 40, dtype: category[timedelta64[ns]] Categories (20, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < 3 days 01:00:00 ... 16 days 01:00:00 < 17 days 01:00:00 < 18 days 01:00:00 < 19 days 01:00:00]""" # noqa: E501 @@ -379,20 +379,20 @@ def test_categorical_repr_timedelta_ordered(self): def test_categorical_index_repr(self): idx = CategoricalIndex(Categorical([1, 2, 3])) - exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=False, dtype='category[int64]')""" # noqa: E501 assert repr(idx) == exp i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64))) - exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=False, dtype='category[int64]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_ordered(self): i = CategoricalIndex(Categorical([1, 2, 3], ordered=True)) - exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex([1, 2, 3], categories=[1, 2, 3], ordered=True, dtype='category[int64]')""" # noqa: E501 assert repr(i) == exp i = CategoricalIndex(Categorical(np.arange(10, dtype=np.int64), ordered=True)) - exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 
6, 7, 8, 9], ordered=True, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], categories=[0, 1, 2, 3, ..., 6, 7, 8, 9], ordered=True, dtype='category[int64]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_datetime(self): @@ -401,7 +401,7 @@ def test_categorical_index_repr_datetime(self): exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', '2011-01-01 13:00:00'], - categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=False, dtype='category[datetime64[ns]]')""" # noqa: E501 assert repr(i) == exp @@ -410,7 +410,7 @@ def test_categorical_index_repr_datetime(self): exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], - categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=False, dtype='category[datetime64[ns, US/Eastern]]')""" # noqa: E501 assert repr(i) == exp @@ -420,7 +420,7 @@ def test_categorical_index_repr_datetime_ordered(self): exp = """CategoricalIndex(['2011-01-01 09:00:00', '2011-01-01 10:00:00', '2011-01-01 11:00:00', '2011-01-01 12:00:00', '2011-01-01 13:00:00'], - categories=[2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00, 2011-01-01 
10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00], ordered=True, dtype='category[datetime64[ns]]')""" # noqa: E501 assert repr(i) == exp @@ -429,7 +429,7 @@ def test_categorical_index_repr_datetime_ordered(self): exp = """CategoricalIndex(['2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], - categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category[datetime64[ns, US/Eastern]]')""" # noqa: E501 assert repr(i) == exp @@ -439,7 +439,7 @@ def test_categorical_index_repr_datetime_ordered(self): '2011-01-01 13:00:00-05:00', '2011-01-01 09:00:00-05:00', '2011-01-01 10:00:00-05:00', '2011-01-01 11:00:00-05:00', '2011-01-01 12:00:00-05:00', '2011-01-01 13:00:00-05:00'], - categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00], ordered=True, dtype='category[datetime64[ns, US/Eastern]]')""" # noqa: E501 assert repr(i) == exp @@ -447,24 +447,24 @@ def test_categorical_index_repr_period(self): # test all length idx = period_range("2011-01-01 09:00", freq="H", periods=1) i = CategoricalIndex(Categorical(idx)) - exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01-01 09:00'], categories=[2011-01-01 09:00], ordered=False, dtype='category[period[H]]')""" # 
noqa: E501 assert repr(i) == exp idx = period_range("2011-01-01 09:00", freq="H", periods=2) i = CategoricalIndex(Categorical(idx)) - exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00], ordered=False, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp idx = period_range("2011-01-01 09:00", freq="H", periods=3) i = CategoricalIndex(Categorical(idx)) - exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00'], categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00], ordered=False, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp idx = period_range("2011-01-01 09:00", freq="H", periods=5) i = CategoricalIndex(Categorical(idx)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], - categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp @@ -473,13 +473,13 @@ def test_categorical_index_repr_period(self): '2011-01-01 12:00', '2011-01-01 13:00', '2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], - categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category')""" # noqa: E501 + 
categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=False, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp idx = period_range("2011-01", freq="M", periods=5) i = CategoricalIndex(Categorical(idx)) - exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=False, dtype='category[period[M]]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_period_ordered(self): @@ -487,19 +487,19 @@ def test_categorical_index_repr_period_ordered(self): i = CategoricalIndex(Categorical(idx, ordered=True)) exp = """CategoricalIndex(['2011-01-01 09:00', '2011-01-01 10:00', '2011-01-01 11:00', '2011-01-01 12:00', '2011-01-01 13:00'], - categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00], ordered=True, dtype='category[period[H]]')""" # noqa: E501 assert repr(i) == exp idx = period_range("2011-01", freq="M", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) - exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['2011-01', '2011-02', '2011-03', '2011-04', '2011-05'], categories=[2011-01, 2011-02, 2011-03, 2011-04, 2011-05], ordered=True, dtype='category[period[M]]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_timedelta(self): idx = timedelta_range("1 days", periods=5) i = CategoricalIndex(Categorical(idx)) - exp = 
"""CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=False, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=False, dtype='category[timedelta64[ns]]')""" # noqa: E501 assert repr(i) == exp idx = timedelta_range("1 hours", periods=10) @@ -508,14 +508,14 @@ def test_categorical_index_repr_timedelta(self): '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', '9 days 01:00:00'], - categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=False, dtype='category')""" # noqa: E501 + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=False, dtype='category[timedelta64[ns]]')""" # noqa: E501 assert repr(i) == exp def test_categorical_index_repr_timedelta_ordered(self): idx = timedelta_range("1 days", periods=5) i = CategoricalIndex(Categorical(idx, ordered=True)) - exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=True, dtype='category')""" # noqa: E501 + exp = """CategoricalIndex(['1 days', '2 days', '3 days', '4 days', '5 days'], categories=[1 days, 2 days, 3 days, 4 days, 5 days], ordered=True, dtype='category[timedelta64[ns]]')""" # noqa: E501 assert repr(i) == exp idx = timedelta_range("1 hours", periods=10) @@ -524,7 +524,7 @@ def test_categorical_index_repr_timedelta_ordered(self): '3 days 01:00:00', '4 days 01:00:00', '5 days 01:00:00', '6 days 01:00:00', '7 days 01:00:00', '8 days 01:00:00', '9 days 01:00:00'], - categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 
01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=True, dtype='category')""" # noqa: E501 + categories=[0 days 01:00:00, 1 days 01:00:00, 2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00], ordered=True, dtype='category[timedelta64[ns]]')""" # noqa: E501 assert repr(i) == exp diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 155c61508b706..52c331cbab70e 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -1019,7 +1019,7 @@ def test_from_categorical_dtype_both(self): def test_str_vs_repr(self, ordered): c1 = CategoricalDtype(["a", "b"], ordered=ordered) - assert str(c1) == "category" + assert str(c1) == "category[object]" # Py2 will have unicode prefixes pat = ( r"CategoricalDtype\(categories=\[.*\], ordered={ordered}, " diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index f2eea452764a6..97230c63af2ab 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -374,7 +374,10 @@ def test_to_records_with_categorical(self): "index": False, "column_dtypes": {"A": "int32", "B": CategoricalDtype(["a", "b"])}, }, - (ValueError, "Invalid dtype category specified for column B"), + ( + ValueError, + "Invalid dtype category\\[object\\] specified for column B", + ), ), # Check that bad types raise ( diff --git a/pandas/tests/frame/test_reductions.py b/pandas/tests/frame/test_reductions.py index b4a4324593d22..068e5549cf5d7 100644 --- a/pandas/tests/frame/test_reductions.py +++ b/pandas/tests/frame/test_reductions.py @@ -1322,15 +1322,12 @@ def test_any_all_np_func(self, func, data, expected): data = DataFrame(data) if any(isinstance(x, CategoricalDtype) for x in data.dtypes): - with pytest.raises( - TypeError, match="dtype category does not support reduction" - ): + msg = "dtype category\\[int64\\] does not 
support reduction" + with pytest.raises(TypeError, match=msg): func(data) # method version - with pytest.raises( - TypeError, match="dtype category does not support reduction" - ): + with pytest.raises(TypeError, match=msg): getattr(DataFrame(data), func.__name__)(axis=None) else: msg = "'(any|all)' with datetime64 dtypes is deprecated" diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 98fce9d668e44..21d232dfe515a 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -258,7 +258,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): elif method in ("sum", "mean", "median", "prod"): msg = "|".join( [ - "category type does not support sum operations", + "category\\[object\\] type does not support sum operations", "[Cc]ould not convert", "can't multiply sequence by non-int of type 'str'", ] @@ -276,7 +276,7 @@ def _check(self, df, method, expected_columns, expected_columns_numeric): [ "[Cc]ould not convert", "Categorical is not ordered", - "category type does not support", + "category\\[object\\] type does not support", "can't multiply sequence", "function is not implemented for this dtype", f"Cannot perform {method} with non-ordered Categorical", diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0c6661b49d917..c7268173b2eb8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -1999,7 +1999,7 @@ def get_categorical_invalid_expected(): elif is_per: msg = "Period type does not support" else: - msg = "category type does not support" + msg = "category\\[.+\\] type does not support" if op == "skew": msg = "|".join([msg, "does not support reduction 'skew'"]) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 6fb903b02b62f..348cc1510b8fb 100644 --- a/pandas/tests/groupby/test_raises.py +++ 
b/pandas/tests/groupby/test_raises.py @@ -379,25 +379,25 @@ def test_groupby_raises_category( "cumcount": (None, ""), "cummax": ( (NotImplementedError, TypeError), - "(category type does not support cummax operations|" + "(category\\[object\\] type does not support cummax operations|" "category dtype not supported|" "cummax is not supported for category dtype)", ), "cummin": ( (NotImplementedError, TypeError), - "(category type does not support cummin operations|" + "(category\\[object\\] type does not support cummin operations|" "category dtype not supported|" "cummin is not supported for category dtype)", ), "cumprod": ( (NotImplementedError, TypeError), - "(category type does not support cumprod operations|" + "(category\\[object\\] type does not support cumprod operations|" "category dtype not supported|" "cumprod is not supported for category dtype)", ), "cumsum": ( (NotImplementedError, TypeError), - "(category type does not support cumsum operations|" + "(category\\[object\\] type does not support cumsum operations|" "category dtype not supported|" "cumsum is not supported for category dtype)", ), @@ -423,7 +423,7 @@ def test_groupby_raises_category( "|".join( [ "'Categorical' .* does not support reduction 'mean'", - "category dtype does not support aggregation 'mean'", + r"category\[object\] dtype does not support aggregation 'mean'", ] ), ), @@ -432,7 +432,7 @@ def test_groupby_raises_category( "|".join( [ "'Categorical' .* does not support reduction 'median'", - "category dtype does not support aggregation 'median'", + r"category\[object\] dtype does not support aggregation 'median'", ] ), ), @@ -443,7 +443,10 @@ def test_groupby_raises_category( TypeError, r"unsupported operand type\(s\) for /: 'Categorical' and 'Categorical'", ), - "prod": (TypeError, "category type does not support prod operations"), + "prod": ( + TypeError, + r"category\[object\] type does not support prod operations", + ), "quantile": (TypeError, "No matching signature found"), 
"rank": (None, ""), "sem": ( @@ -451,7 +454,7 @@ def test_groupby_raises_category( "|".join( [ "'Categorical' .* does not support reduction 'sem'", - "category dtype does not support aggregation 'sem'", + r"category\[object\] dtype does not support aggregation 'sem'", ] ), ), @@ -462,7 +465,7 @@ def test_groupby_raises_category( "|".join( [ "dtype category does not support reduction 'skew'", - "category type does not support skew operations", + r"category\[object\] type does not support skew operations", ] ), ), @@ -471,17 +474,17 @@ def test_groupby_raises_category( "|".join( [ "'Categorical' .* does not support reduction 'std'", - "category dtype does not support aggregation 'std'", + r"category\[object\] dtype does not support aggregation 'std'", ] ), ), - "sum": (TypeError, "category type does not support sum operations"), + "sum": (TypeError, r"category\[object\] type does not support sum operations"), "var": ( TypeError, "|".join( [ "'Categorical' .* does not support reduction 'var'", - "category dtype does not support aggregation 'var'", + r"category\[object\] dtype does not support aggregation 'var'", ] ), ), @@ -519,10 +522,10 @@ def test_groupby_raises_category_np( gb = gb["d"] klass, msg = { - np.sum: (TypeError, "category type does not support sum operations"), + np.sum: (TypeError, r"category\[object\] type does not support sum operations"), np.mean: ( TypeError, - "category dtype does not support aggregation 'mean'", + r"category\[object\] dtype does not support aggregation 'mean'", ), }[groupby_func_np] @@ -572,25 +575,25 @@ def test_groupby_raises_category_on_category( (NotImplementedError, TypeError), "(cummax is not supported for category dtype|" "category dtype not supported|" - "category type does not support cummax operations)", + r"category\[object\] type does not support cummax operations)", ), "cummin": ( (NotImplementedError, TypeError), "(cummin is not supported for category dtype|" "category dtype not supported|" - "category type does not 
support cummin operations)", + r"category\[object\] type does not support cummin operations)", ), "cumprod": ( (NotImplementedError, TypeError), "(cumprod is not supported for category dtype|" "category dtype not supported|" - "category type does not support cumprod operations)", + r"category\[object\] type does not support cumprod operations)", ), "cumsum": ( (NotImplementedError, TypeError), "(cumsum is not supported for category dtype|" "category dtype not supported|" - "category type does not support cumsum operations)", + r"category\[object\] type does not support cumsum operations)", ), "diff": (TypeError, "unsupported operand type"), "ffill": (None, ""), @@ -610,13 +613,22 @@ def test_groupby_raises_category_on_category( else (None, ""), "last": (None, ""), "max": (None, ""), - "mean": (TypeError, "category dtype does not support aggregation 'mean'"), - "median": (TypeError, "category dtype does not support aggregation 'median'"), + "mean": ( + TypeError, + r"category\[object\] dtype does not support aggregation 'mean'", + ), + "median": ( + TypeError, + r"category\[object\] dtype does not support aggregation 'median'", + ), "min": (None, ""), "ngroup": (None, ""), "nunique": (None, ""), "pct_change": (TypeError, "unsupported operand type"), - "prod": (TypeError, "category type does not support prod operations"), + "prod": ( + TypeError, + r"category\[object\] type does not support prod operations", + ), "quantile": (TypeError, ""), "rank": (None, ""), "sem": ( @@ -624,7 +636,7 @@ def test_groupby_raises_category_on_category( "|".join( [ "'Categorical' .* does not support reduction 'sem'", - "category dtype does not support aggregation 'sem'", + r"category\[object\] dtype does not support aggregation 'sem'", ] ), ), @@ -634,7 +646,7 @@ def test_groupby_raises_category_on_category( TypeError, "|".join( [ - "category type does not support skew operations", + r"category\[object\] type does not support skew operations", "dtype category does not support reduction 
'skew'", ] ), @@ -644,17 +656,17 @@ def test_groupby_raises_category_on_category( "|".join( [ "'Categorical' .* does not support reduction 'std'", - "category dtype does not support aggregation 'std'", + r"category\[object\] dtype does not support aggregation 'std'", ] ), ), - "sum": (TypeError, "category type does not support sum operations"), + "sum": (TypeError, r"category\[object\] type does not support sum operations"), "var": ( TypeError, "|".join( [ "'Categorical' .* does not support reduction 'var'", - "category dtype does not support aggregation 'var'", + r"category\[object\] dtype does not support aggregation 'var'", ] ), ), diff --git a/pandas/tests/indexes/categorical/test_formats.py b/pandas/tests/indexes/categorical/test_formats.py index 7dbcaaa8d4ba6..19a10153c5391 100644 --- a/pandas/tests/indexes/categorical/test_formats.py +++ b/pandas/tests/indexes/categorical/test_formats.py @@ -16,7 +16,7 @@ def test_format_different_scalar_lengths(self): def test_string_categorical_index_repr(self): # short idx = CategoricalIndex(["a", "bb", "ccc"]) - expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 + expected = """CategoricalIndex(['a', 'bb', 'ccc'], categories=['a', 'bb', 'ccc'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected # multiple lines @@ -24,7 +24,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - categories=['a', 'bb', 'ccc'], ordered=False, dtype='category')""" # noqa: E501 + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected @@ -33,7 +33,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['a', 'bb', 'ccc', 'a', 'bb', 
'ccc', 'a', 'bb', 'ccc', 'a', ... 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc', 'a', 'bb', 'ccc'], - categories=['a', 'bb', 'ccc'], ordered=False, dtype='category', length=300)""" # noqa: E501 + categories=['a', 'bb', 'ccc'], ordered=False, dtype='category[object]', length=300)""" # noqa: E501 assert repr(idx) == expected @@ -41,13 +41,13 @@ def test_string_categorical_index_repr(self): idx = CategoricalIndex(list("abcdefghijklmmo")) expected = """CategoricalIndex(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'm', 'o'], - categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o'], ordered=False, dtype='category')""" # noqa: E501 + categories=['a', 'b', 'c', 'd', ..., 'k', 'l', 'm', 'o'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected # short idx = CategoricalIndex(["あ", "いい", "ううう"]) - expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected # multiple lines @@ -55,7 +55,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected @@ -64,7 +64,7 @@ def test_string_categorical_index_repr(self): expected = """CategoricalIndex(['あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', ... 
'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]', length=300)""" # noqa: E501 assert repr(idx) == expected @@ -72,7 +72,7 @@ def test_string_categorical_index_repr(self): idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], - categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 + categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected @@ -80,7 +80,7 @@ def test_string_categorical_index_repr(self): with cf.option_context("display.unicode.east_asian_width", True): # short idx = CategoricalIndex(["あ", "いい", "ううう"]) - expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + expected = """CategoricalIndex(['あ', 'いい', 'ううう'], categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected # multiple lines @@ -89,7 +89,7 @@ def test_string_categorical_index_repr(self): 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category')""" # noqa: E501 + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected @@ -100,7 +100,7 @@ def test_string_categorical_index_repr(self): ... 
'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう', 'あ', 'いい', 'ううう'], - categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category', length=300)""" # noqa: E501 + categories=['あ', 'いい', 'ううう'], ordered=False, dtype='category[object]', length=300)""" # noqa: E501 assert repr(idx) == expected @@ -108,6 +108,6 @@ def test_string_categorical_index_repr(self): idx = CategoricalIndex(list("あいうえおかきくけこさしすせそ")) expected = """CategoricalIndex(['あ', 'い', 'う', 'え', 'お', 'か', 'き', 'く', 'け', 'こ', 'さ', 'し', 'す', 'せ', 'そ'], - categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category')""" # noqa: E501 + categories=['あ', 'い', 'う', 'え', ..., 'し', 'す', 'せ', 'そ'], ordered=False, dtype='category[object]')""" # noqa: E501 assert repr(idx) == expected diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index ecc5d3060c0a2..99ae73df68418 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -93,12 +93,11 @@ class TestSeriesNLargestNSmallest: # Series([3., 2, 1, 2, 5], dtype='complex256'), Series([3.0, 2, 1, 2, 5], dtype="complex128"), Series(list("abcde")), - Series(list("abcde"), dtype="category"), ], ) def test_nlargest_error(self, r): dt = r.dtype - msg = f"Cannot use method 'n(largest|smallest)' with dtype {dt}" + msg = rf"Cannot use method 'n(largest|smallest)' with dtype {dt}" args = 2, len(r), 0, -1 methods = r.nlargest, r.nsmallest for method, arg in product(methods, args): diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py index a0edfae606e3f..795bc8847b4aa 100644 --- a/pandas/tests/series/test_arithmetic.py +++ b/pandas/tests/series/test_arithmetic.py @@ -576,7 +576,7 @@ def test_unequal_categorical_comparison_raises_type_error(self): # for unequal comps, but not for equal/not equal cat = Series(Categorical(list("abc"), ordered=True)) - msg = "Invalid comparison between dtype=category and str" + msg = 
r"Invalid comparison between dtype=category\[object\] and str" with pytest.raises(TypeError, match=msg): cat < "d" with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py index c42b9f056878d..0fdffb8ee6271 100644 --- a/pandas/tests/series/test_repr.py +++ b/pandas/tests/series/test_repr.py @@ -290,7 +290,7 @@ def test_categorical_repr(self): a = Series(Categorical([1, 2, 3, 4])) exp = ( "0 1\n1 2\n2 3\n3 4\n" - "dtype: category\nCategories (4, int64): [1, 2, 3, 4]" + "dtype: category[int64]\nCategories (4, int64): [1, 2, 3, 4]" ) assert exp == a.__str__() @@ -300,7 +300,7 @@ def test_categorical_repr(self): "0 a\n1 b\n" " ..\n" "48 a\n49 b\n" - "Length: 50, dtype: category\nCategories (2, object): ['a', 'b']" + "Length: 50, dtype: category[object]\nCategories (2, object): ['a', 'b']" ) with option_context("display.max_rows", 5): assert exp == repr(a) @@ -309,7 +309,7 @@ def test_categorical_repr(self): a = Series(Categorical(["a", "b"], categories=levs, ordered=True)) exp = ( "0 a\n1 b\n" - "dtype: category\n" + "dtype: category[object]\n" "Categories (26, object): ['a' < 'b' < 'c' < 'd' ... 
'w' < 'x' < 'y' < 'z']" ) assert exp == a.__str__() @@ -319,7 +319,7 @@ def test_categorical_series_repr(self): exp = """0 1 1 2 2 3 -dtype: category +dtype: category[int64] Categories (3, int64): [1, 2, 3]""" assert repr(s) == exp @@ -335,7 +335,7 @@ def test_categorical_series_repr(self): 7 7 8 8 9 9 -dtype: category +dtype: category[{np.int_().dtype}] Categories (10, {np.int_().dtype}): [0, 1, 2, 3, ..., 6, 7, 8, 9]""" assert repr(s) == exp @@ -345,7 +345,7 @@ def test_categorical_series_repr_ordered(self): exp = """0 1 1 2 2 3 -dtype: category +dtype: category[int64] Categories (3, int64): [1 < 2 < 3]""" assert repr(s) == exp @@ -361,7 +361,7 @@ def test_categorical_series_repr_ordered(self): 7 7 8 8 9 9 -dtype: category +dtype: category[{np.int_().dtype}] Categories (10, {np.int_().dtype}): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]""" assert repr(s) == exp @@ -374,7 +374,7 @@ def test_categorical_series_repr_datetime(self): 2 2011-01-01 11:00:00 3 2011-01-01 12:00:00 4 2011-01-01 13:00:00 -dtype: category +dtype: category[datetime64[ns]] Categories (5, datetime64[ns]): [2011-01-01 09:00:00, 2011-01-01 10:00:00, 2011-01-01 11:00:00, 2011-01-01 12:00:00, 2011-01-01 13:00:00]""" # noqa: E501 @@ -387,7 +387,7 @@ def test_categorical_series_repr_datetime(self): 2 2011-01-01 11:00:00-05:00 3 2011-01-01 12:00:00-05:00 4 2011-01-01 13:00:00-05:00 -dtype: category +dtype: category[datetime64[ns, US/Eastern]] Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00, 2011-01-01 10:00:00-05:00, 2011-01-01 11:00:00-05:00, 2011-01-01 12:00:00-05:00, 2011-01-01 13:00:00-05:00]""" # noqa: E501 @@ -402,7 +402,7 @@ def test_categorical_series_repr_datetime_ordered(self): 2 2011-01-01 11:00:00 3 2011-01-01 12:00:00 4 2011-01-01 13:00:00 -dtype: category +dtype: category[datetime64[ns]] Categories (5, datetime64[ns]): [2011-01-01 09:00:00 < 2011-01-01 10:00:00 < 2011-01-01 11:00:00 < 2011-01-01 12:00:00 < 2011-01-01 13:00:00]""" # noqa: E501 @@ -415,7 +415,7 @@ def 
test_categorical_series_repr_datetime_ordered(self): 2 2011-01-01 11:00:00-05:00 3 2011-01-01 12:00:00-05:00 4 2011-01-01 13:00:00-05:00 -dtype: category +dtype: category[datetime64[ns, US/Eastern]] Categories (5, datetime64[ns, US/Eastern]): [2011-01-01 09:00:00-05:00 < 2011-01-01 10:00:00-05:00 < 2011-01-01 11:00:00-05:00 < 2011-01-01 12:00:00-05:00 < 2011-01-01 13:00:00-05:00]""" # noqa: E501 @@ -430,7 +430,7 @@ def test_categorical_series_repr_period(self): 2 2011-01-01 11:00 3 2011-01-01 12:00 4 2011-01-01 13:00 -dtype: category +dtype: category[period[H]] Categories (5, period[H]): [2011-01-01 09:00, 2011-01-01 10:00, 2011-01-01 11:00, 2011-01-01 12:00, 2011-01-01 13:00]""" # noqa: E501 @@ -443,7 +443,7 @@ def test_categorical_series_repr_period(self): 2 2011-03 3 2011-04 4 2011-05 -dtype: category +dtype: category[period[M]] Categories (5, period[M]): [2011-01, 2011-02, 2011-03, 2011-04, 2011-05]""" assert repr(s) == exp @@ -456,7 +456,7 @@ def test_categorical_series_repr_period_ordered(self): 2 2011-01-01 11:00 3 2011-01-01 12:00 4 2011-01-01 13:00 -dtype: category +dtype: category[period[H]] Categories (5, period[H]): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 < 2011-01-01 13:00]""" # noqa: E501 @@ -469,7 +469,7 @@ def test_categorical_series_repr_period_ordered(self): 2 2011-03 3 2011-04 4 2011-05 -dtype: category +dtype: category[period[M]] Categories (5, period[M]): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]""" assert repr(s) == exp @@ -482,7 +482,7 @@ def test_categorical_series_repr_timedelta(self): 2 3 days 3 4 days 4 5 days -dtype: category +dtype: category[timedelta64[ns]] Categories (5, timedelta64[ns]): [1 days, 2 days, 3 days, 4 days, 5 days]""" assert repr(s) == exp @@ -499,7 +499,7 @@ def test_categorical_series_repr_timedelta(self): 7 7 days 01:00:00 8 8 days 01:00:00 9 9 days 01:00:00 -dtype: category +dtype: category[timedelta64[ns]] Categories (10, timedelta64[ns]): [0 days 01:00:00, 1 days 01:00:00, 
2 days 01:00:00, 3 days 01:00:00, ..., 6 days 01:00:00, 7 days 01:00:00, 8 days 01:00:00, 9 days 01:00:00]""" # noqa: E501 @@ -514,7 +514,7 @@ def test_categorical_series_repr_timedelta_ordered(self): 2 3 days 3 4 days 4 5 days -dtype: category +dtype: category[timedelta64[ns]] Categories (5, timedelta64[ns]): [1 days < 2 days < 3 days < 4 days < 5 days]""" assert repr(s) == exp @@ -531,7 +531,7 @@ def test_categorical_series_repr_timedelta_ordered(self): 7 7 days 01:00:00 8 8 days 01:00:00 9 9 days 01:00:00 -dtype: category +dtype: category[timedelta64[ns]] Categories (10, timedelta64[ns]): [0 days 01:00:00 < 1 days 01:00:00 < 2 days 01:00:00 < 3 days 01:00:00 ... 6 days 01:00:00 < 7 days 01:00:00 < 8 days 01:00:00 < 9 days 01:00:00]""" # noqa: E501 From 358f654076856290287108395156eebdd9a10516 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 27 May 2023 10:37:23 +0100 Subject: [PATCH 5/9] update dtype name --- pandas/core/arrays/categorical.py | 20 ++++++++++---------- pandas/core/dtypes/dtypes.py | 22 +++++++++++----------- pandas/core/generic.py | 4 ++-- pandas/core/indexes/category.py | 20 ++++++++++---------- pandas/core/reshape/tile.py | 2 +- 5 files changed, 34 insertions(+), 34 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 011d9c72ff996..c38a318e4d0d1 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -453,7 +453,7 @@ def __init__( ) from err # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered, dtype.categories_dtype) + dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes @@ -2524,7 +2524,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['a', 'b', 'c'] >>> s.cat.categories @@ -2537,7 +2537,7 @@ class 
CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 a 4 a 5 a - dtype: category + dtype: category[object] Categories (3, object): ['c', 'b', 'a'] >>> s.cat.reorder_categories(list("cba")) @@ -2547,7 +2547,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['c', 'b', 'a'] >>> s.cat.add_categories(["d", "e"]) @@ -2557,7 +2557,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (5, object): ['a', 'b', 'c', 'd', 'e'] >>> s.cat.remove_categories(["a", "c"]) @@ -2567,7 +2567,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 NaN 4 NaN 5 NaN - dtype: category + dtype: category[object] Categories (1, object): ['b'] >>> s1 = s.cat.add_categories(["d", "e"]) @@ -2578,7 +2578,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['a', 'b', 'c'] >>> s.cat.set_categories(list("abcde")) @@ -2588,7 +2588,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (5, object): ['a', 'b', 'c', 'd', 'e'] >>> s.cat.as_ordered() @@ -2598,7 +2598,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['a' < 'b' < 'c'] >>> s.cat.as_unordered() @@ -2608,7 +2608,7 @@ class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin): 3 c 4 c 5 c - dtype: category + dtype: category[object] Categories (3, object): ['a', 'b', 'c'] """ diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 8ed97ad10cf22..1996332f5fa08 100644 --- a/pandas/core/dtypes/dtypes.py +++ 
b/pandas/core/dtypes/dtypes.py @@ -185,13 +185,13 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): Examples -------- - >>> t = pd.CategoricalDtype(['b', 'a'], ordered=True) + >>> t = pd.CategoricalDtype(categories=['b', 'a'], ordered=True) >>> pd.Series(['a', 'b', 'a', 'c'], dtype=t) 0 a 1 b 2 a 3 NaN - dtype: category + dtype: category[object] Categories (2, object): ['b' < 'a'] An empty CategoricalDtype with a specific dtype can be created @@ -208,7 +208,7 @@ class CategoricalDtype(PandasExtensionDtype, ExtensionDtype): base = np.dtype("O") _metadata = ("categories", "ordered") _cache_dtypes: dict[str_type, PandasExtensionDtype] = {} - _categories_dtype: Dtype | None = None + _categories_dtype: Dtype _match = re.compile(r"category\[(?P<categories_dtype>.+)\]") def __init__( @@ -392,13 +392,6 @@ def _get_categories_dtype_from_string(cls, string: str_type) -> str_type | None: d = match.groupdict() return d.get("categories_dtype") - @property - def categories_dtype(self) -> Dtype: - if self.categories is None: - return self._categories_dtype - - return self.categories.dtype - def _finalize(self, categories, ordered: Ordered, fastpath: bool = False) -> None: if ordered is not None: self.validate_ordered(ordered) @@ -604,7 +597,7 @@ def validate_categories(self, categories, fastpath: bool = False) -> Index: raise TypeError( f"Parameter 'categories' must be list-like, was {repr(categories)}" ) - dtype = self._categories_dtype + dtype = self.categories_dtype if not isinstance(categories, ABCIndex): categories = Index._with_infer(categories, dtype=dtype, tupleize_cols=False) elif dtype is not None: @@ -662,6 +655,13 @@ def categories(self) -> Index: """ return self._categories + @property + def categories_dtype(self) -> Dtype | None: + try: + return self.categories.dtype + except AttributeError: + return getattr(self, "_categories_dtype", None) + @property def ordered(self) -> Ordered: """ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index
bcfbfa1a2b713..5a712797f7a4d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6343,7 +6343,7 @@ def astype( >>> ser.astype('category') 0 1 1 2 - dtype: category + dtype: category[int32] Categories (2, int32): [1, 2] Convert to ordered categorical type with custom ordering: @@ -6354,7 +6354,7 @@ def astype( >>> ser.astype(cat_dtype) 0 1 1 2 - dtype: category + dtype: category[int64] Categories (2, int64): [2 < 1] Create a series of dates: diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index d2ef607635abb..9714983f57caa 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -147,16 +147,16 @@ class CategoricalIndex(NDArrayBackedExtensionIndex): Examples -------- - >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"]) + >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"], ordered=True) CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['a', 'b', 'c'], ordered=False, dtype='category') + categories=['a', 'b', 'c'], ordered=True, dtype='category[object]') ``CategoricalIndex`` can also be instantiated from a ``Categorical``: - >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"]) + >>> c = pd.Categorical(["a", "b", "c", "a", "b", "c"], ordered=True) >>> pd.CategoricalIndex(c) CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['a', 'b', 'c'], ordered=False, dtype='category') + categories=['a', 'b', 'c'], ordered=True, dtype='category[object]') Ordered ``CategoricalIndex`` can have a min and max value. @@ -165,7 +165,7 @@ class CategoricalIndex(NDArrayBackedExtensionIndex): ...
) >>> ci CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], - categories=['c', 'b', 'a'], ordered=True, dtype='category') + categories=['c', 'b', 'a'], ordered=True, dtype='category[object]') >>> ci.min() 'c' """ @@ -438,13 +438,13 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): >>> idx = pd.CategoricalIndex(['a', 'b', 'c']) >>> idx CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], - ordered=False, dtype='category') + ordered=False, dtype='category[object]') >>> idx.map(lambda x: x.upper()) CategoricalIndex(['A', 'B', 'C'], categories=['A', 'B', 'C'], - ordered=False, dtype='category') + ordered=False, dtype='category[object]') >>> idx.map({'a': 'first', 'b': 'second', 'c': 'third'}) CategoricalIndex(['first', 'second', 'third'], categories=['first', - 'second', 'third'], ordered=False, dtype='category') + 'second', 'third'], ordered=False, dtype='category[object]') If the mapping is one-to-one the ordering of the categories is preserved: @@ -452,10 +452,10 @@ def map(self, mapper, na_action: Literal["ignore"] | None = None): >>> idx = pd.CategoricalIndex(['a', 'b', 'c'], ordered=True) >>> idx CategoricalIndex(['a', 'b', 'c'], categories=['a', 'b', 'c'], - ordered=True, dtype='category') + ordered=True, dtype='category[object]') >>> idx.map({'a': 3, 'b': 2, 'c': 1}) CategoricalIndex([3, 2, 1], categories=[3, 2, 1], ordered=True, - dtype='category') + dtype='category[int64]') If the mapping is not one-to-one an :class:`~pandas.Index` is returned: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 83d004c8b8e3e..2e44e8eba16d5 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -199,7 +199,7 @@ def cut( c (4.667, 7.333] d (7.333, 10.0] e (7.333, 10.0] - dtype: category + dtype: category[interval[float64, right]] Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ... Passing a Series as an input returns a Series with mapping value. 
From 55d2bab77e5cdff6e6c1fc7851de5223c0ec2a24 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 27 May 2023 10:51:00 +0100 Subject: [PATCH 6/9] update --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/arrays/categorical.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index ed99eff90a2a2..5ff6adefb8d3a 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -25,7 +25,7 @@ of the categories as part of the dtype string: .. ipython:: python ser = pd.Series(["a", "b", np.nan], dtype="category[string]") - ser.dtype.categories + ser.dtype The expression inside the brackets can be any string that Pandas accepts for a dtype and whose data can be stored in an :class:`Index` (:issue:`48515`). diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c38a318e4d0d1..c85cab583b883 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -453,7 +453,7 @@ def __init__( ) from err # we're inferring from values - dtype = CategoricalDtype(categories, dtype.ordered) + dtype = CategoricalDtype(categories, dtype.ordered, dtype.categories_dtype) elif isinstance(values.dtype, CategoricalDtype): old_codes = extract_array(values)._codes From e54e1fb2ee7684b4494676c1374e9537d1dedea4 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 27 May 2023 11:21:37 +0100 Subject: [PATCH 7/9] improve docs --- doc/source/user_guide/categorical.rst | 9 ++++++++- doc/source/whatsnew/v2.1.0.rst | 17 ++++++++++++++++- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index ea6e240777e75..92881d8a3c550 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -268,11 +268,18 @@ unordered categoricals, the order of the ``categories`` is not considered. 
# Unequal, since the second CategoricalDtype is ordered c1 == CategoricalDtype(["a", "b", "c"], ordered=True) -All instances of ``CategoricalDtype`` compare equal to the string ``'category'``. +All instances of ``CategoricalDtype`` compare equal to the string ``'category'`` or the +string ``'category'`` with the string dtype for the categories inside square brackets. .. ipython:: python c1 == "category" + c1 == "category[object]" + +.. versionadded:: 2.1.0 + + The ability to specify the categories dtype inside square brackets in the dtype + string was added in :ref:`v2.1.0 <whatsnew_210.enhancements.category_subtype>` Description ----------- diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 5ff6adefb8d3a..c7b26e2a28d3c 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -25,11 +25,26 @@ of the categories as part of the dtype string: .. ipython:: python ser = pd.Series(["a", "b", np.nan], dtype="category[string]") - ser.dtype + ser The expression inside the brackets can be any string that Pandas accepts for a dtype and whose data can be stored in an :class:`Index` (:issue:`48515`). +The categories dtype will also now be part of the dtype repr: + +.. ipython:: python + + df = pd.DataFrame({"a": ser, "b": pd.array([1, 2, 3], dtype="category[Int8]")}) + df.dtypes + +We can now also compare categorical dtypes to a string with the dtype of the categories inside brackets in order to get more precise comparisons: + +.. ipython:: python + + ser.dtype == "category[string]" + ser.dtype == "category" # also works, but doesn't check the categories dtype + ser.dtype == "category[object]" # False, wrong categories dtype + ..
_whatsnew_210.enhancements.map_na_action: From cddf02a6dd4faf5c79f37f27c4b104eb8f7a35bc Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sat, 27 May 2023 12:09:38 +0100 Subject: [PATCH 8/9] fix CI --- pandas/core/dtypes/dtypes.py | 2 +- pandas/core/indexes/category.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 1996332f5fa08..0c4f87550e883 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -434,7 +434,7 @@ def __eq__(self, other: Any) -> bool: 6) Any other comparison returns False """ if isinstance(other, str): - return other == self.name or other == "category" + return other in [self.name, "category"] elif other is self: return True elif not (hasattr(other, "ordered") and hasattr(other, "categories")): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 9714983f57caa..fb68f207d287c 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -147,7 +147,7 @@ class CategoricalIndex(NDArrayBackedExtensionIndex): Examples -------- - >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"], orderer=True) + >>> pd.CategoricalIndex(["a", "b", "c", "a", "b", "c"], ordered=True) CategoricalIndex(['a', 'b', 'c', 'a', 'b', 'c'], categories=['a', 'b', 'c'], ordered=True, dtype='category[object]') From 0a1e4389dbb71eba8ba9d272369a331d8138c2d0 Mon Sep 17 00:00:00 2001 From: Terji Petersen Date: Sun, 28 May 2023 11:38:00 +0100 Subject: [PATCH 9/9] fix asv test --- asv_bench/benchmarks/pandas_vb_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 4bd56ccb1b5ce..d1dcbbb0063f1 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -44,7 +44,7 @@ pd.UInt16Dtype, pd.UInt32Dtype, pd.UInt64Dtype, - pd.CategoricalDtype, + pd.CategoricalDtype(), 
pd.IntervalDtype, pd.DatetimeTZDtype("ns", "UTC"), pd.PeriodDtype("D"),