From 5d82b0259ea91f6bbe00b46e6b0913a2fb0d1d3d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 22 Oct 2020 21:31:00 +0000 Subject: [PATCH 01/38] PERF/ENH: add fast astyping for categorical input --- pandas/core/generic.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e000fe5fa733d..9f24d724933c7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -66,6 +66,7 @@ ensure_str, is_bool, is_bool_dtype, + is_categorical, is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, @@ -5794,6 +5795,12 @@ def astype( for i in range(len(self.columns)) ] + elif is_categorical(self): + new_data = pd.Categorical.from_codes( + self.cat.codes, categories=self.cat.categories.astype(dtype) + ) + return self._constructor(new_data).__finalize__(self, method="astype") + else: # else, only a single dtype is given new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) From c18ae4e685f9ba66ab5f7ddcdfa3cb07872c291f Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 03:51:02 +0000 Subject: [PATCH 02/38] replace is_categorical -> is_categorical_dtype --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9f24d724933c7..c27194195bd8a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -66,7 +66,7 @@ ensure_str, is_bool, is_bool_dtype, - is_categorical, + is_categorical_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, @@ -5795,7 +5795,7 @@ def astype( for i in range(len(self.columns)) ] - elif is_categorical(self): + elif is_categorical_dtype(self): new_data = pd.Categorical.from_codes( self.cat.codes, categories=self.cat.categories.astype(dtype) ) From d7c05759079e779eaa801b0bb07a7d6ffe80681e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 04:08:03 +0000 Subject: [PATCH 03/38] ASV: add astyping benchmark --- asv_bench/benchmarks/categoricals.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index a0b24342091ec..0208cef4c9838 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -67,6 +67,24 @@ def time_existing_series(self): pd.Categorical(self.series) +class AsType: + def setup(self): + N = 10 ** 6 + + self.df = pd.DataFrame( + np.random.default_rng() + .choice(np.array(list("abcde")), 4 * N) + .reshape(N, 4), + columns=list("ABCD"), + ) + + for col in self.df.columns: + self.df[col] = self.df[col].astype("category") + + def astype_unicode(self): + [self.df[col].astype("unicode") for col in self.df.columns] + + class Concat: def setup(self): N = 10 ** 5 From 856995fa65ec563a6a9b08b236c8e994c8d625c4 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 04:11:49 +0000 Subject: [PATCH 04/38] DOC: whatsnew --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index cdd8e50d98077..71b46c5e43540 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -347,6 +347,7 @@ Performance improvements - Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`) - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) +- Performance improvement in :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`) .. --------------------------------------------------------------------------- From 57817a45be2eed886352b91ea9251c5fd5cd4627 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 15:41:30 +0000 Subject: [PATCH 05/38] feedback: move change core/generic -> internals --- pandas/core/generic.py | 7 ------- pandas/core/internals/blocks.py | 7 ++++++- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index c27194195bd8a..e000fe5fa733d 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -66,7 +66,6 @@ ensure_str, is_bool, is_bool_dtype, - is_categorical_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, is_dict_like, @@ -5795,12 +5794,6 @@ def astype( for i in range(len(self.columns)) ] - elif is_categorical_dtype(self): - new_data = pd.Categorical.from_codes( - self.cat.codes, categories=self.cat.categories.astype(dtype) - ) - return self._constructor(new_data).__finalize__(self, method="astype") - else: # else, only a single dtype is given new_data = self._mgr.astype(dtype=dtype, copy=copy, errors=errors) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 24b00199611bf..64fcce15132d6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -592,7 +592,12 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): if is_categorical_dtype(self.values.dtype): # GH 10696/18593: update an existing categorical efficiently - return self.make_block(self.values.astype(dtype, copy=copy)) + return self.make_block( + Categorical.from_codes( + self.cat.codes, categories=self.cat.categories.astype(dtype) + ), + copy=copy, + ) return self.make_block(Categorical(self.values, dtype=dtype)) From 1050d9e9bcda40863b5461d1507a765b3950491b Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 15:54:14 +0000 Subject: [PATCH 06/38] rewrite categorical check in Block.astype --- pandas/core/internals/blocks.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 64fcce15132d6..2311e2547493b 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -588,18 +588,12 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): dtype = pandas_dtype(dtype) # may need to convert to categorical - if is_categorical_dtype(dtype): - - if is_categorical_dtype(self.values.dtype): - # GH 10696/18593: update an existing categorical efficiently - return self.make_block( - Categorical.from_codes( - self.cat.codes, categories=self.cat.categories.astype(dtype) - ), - copy=copy, + if is_categorical_dtype(self.values.dtype): + return self.make_block( + Categorical.from_codes( + self.values.codes, categories=self.values.categories.astype(dtype) ) - - return self.make_block(Categorical(self.values, dtype=dtype)) + ) dtype = pandas_dtype(dtype) From c8c05cc877afd65fa4de72dfa84128b3e2777070 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 18:04:41 +0000 Subject: [PATCH 07/38] rewrite the fix --- pandas/core/internals/blocks.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 2311e2547493b..be5f0802ef7b6 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -588,7 +588,19 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): dtype = pandas_dtype(dtype) # may need to convert to categorical - if is_categorical_dtype(self.values.dtype): + if is_categorical_dtype(dtype): + + if is_categorical_dtype(self.values.dtype): + # GH 10696/18593: update an existing categorical efficiently + return self.make_block(self.values.astype(dtype, copy=copy)) + + return self.make_block(Categorical(self.values, dtype=dtype)) + + elif ( + copy + and is_categorical_dtype(self.values.dtype) + and not is_object_dtype(dtype) + ): return self.make_block( Categorical.from_codes( self.values.codes, categories=self.values.categories.astype(dtype) From b8141c4c99ded351ca7cf86b23a537e88b523c54 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 18:05:43 +0000 Subject: [PATCH 08/38] rewrite the fix --- pandas/core/internals/blocks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index be5f0802ef7b6..5c4839784a44c 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -596,10 +596,10 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): return self.make_block(Categorical(self.values, dtype=dtype)) - elif ( - copy - and is_categorical_dtype(self.values.dtype) + elif ( # GH8628 + is_categorical_dtype(self.values.dtype) and not is_object_dtype(dtype) + and copy is True ): return self.make_block( Categorical.from_codes( From 3d3bcf1cc8be2b77b6dd04deb6aa6318e22dd2bc Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 18:32:15 +0000 Subject: [PATCH 09/38] fix handling of strings --- pandas/core/internals/blocks.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 5c4839784a44c..648961da68c0e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -46,6 +46,7 @@ is_re, is_re_compilable, is_sparse, + is_string_like_dtype, is_timedelta64_dtype, pandas_dtype, ) @@ -598,7 +599,7 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): elif ( # GH8628 is_categorical_dtype(self.values.dtype) - and not is_object_dtype(dtype) + and not (is_object_dtype(dtype) or is_string_like_dtype(dtype)) and copy is True ): return self.make_block( From 3714d096132b7a644b34e824b9fe99cff0fd509e Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 18:42:27 +0000 Subject: [PATCH 10/38] improve readability --- pandas/core/internals/blocks.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 648961da68c0e..a5a8d6d8e4e23 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -602,11 +602,10 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): and not (is_object_dtype(dtype) or is_string_like_dtype(dtype)) and copy is True ): - return self.make_block( - Categorical.from_codes( - self.values.codes, categories=self.values.categories.astype(dtype) - ) + new_data = Categorical.from_codes( + self.values.codes, categories=self.values.categories.astype(dtype) ) + return self.make_block(new_data) dtype = pandas_dtype(dtype) From f8f501f3cb8e3f12cbcc3ad59f92a8478542d03a Mon Sep 17 00:00:00 2001 From: arw2019 Date: Fri, 23 Oct 2020 19:23:28 +0000 Subject: [PATCH 11/38] add more special casing... --- pandas/core/internals/blocks.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index a5a8d6d8e4e23..c5fd35eb164a3 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -33,6 +33,7 @@ is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, + is_datetime_or_timedelta_dtype, is_dtype_equal, is_extension_array_dtype, is_float, @@ -599,7 +600,12 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): elif ( # GH8628 is_categorical_dtype(self.values.dtype) - and not (is_object_dtype(dtype) or is_string_like_dtype(dtype)) + and not ( + is_object_dtype(dtype) + or is_string_like_dtype(dtype) + or is_extension_array_dtype(dtype) + or is_datetime_or_timedelta_dtype(dtype) + ) and copy is True ): new_data = Categorical.from_codes( From 2ec7ded625be253368c0e2a646f037332f56be3d Mon Sep 17 00:00:00 2001 From: arw2019 Date: Mon, 26 Oct 2020 03:15:11 +0000 Subject: [PATCH 12/38] feedback: move changes to Categorical.astype --- pandas/core/arrays/categorical.py | 5 ++++- pandas/core/internals/blocks.py | 17 ----------------- 2 files changed, 4 insertions(+), 18 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 081a363ce03c6..39fdc354be24a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -413,7 +413,10 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: return array(self, dtype=dtype, copy=copy) if is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") - return np.array(self, dtype=dtype, copy=copy) + + new_categories = self.categories.astype(dtype) + obj = Categorical.from_codes(self.codes, categories=new_categories) + return np.array(obj.categories[self.codes], copy=copy) @cache_readonly def itemsize(self) -> int: diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c5fd35eb164a3..24b00199611bf 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -33,7 +33,6 @@ is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetime_or_timedelta_dtype, is_dtype_equal, is_extension_array_dtype, is_float, @@ -47,7 +46,6 @@ is_re, is_re_compilable, is_sparse, - is_string_like_dtype, is_timedelta64_dtype, pandas_dtype, ) @@ -598,21 +596,6 @@ def astype(self, dtype, copy: bool = False, errors: str = "raise"): return self.make_block(Categorical(self.values, dtype=dtype)) - elif ( # GH8628 - is_categorical_dtype(self.values.dtype) - and not ( - is_object_dtype(dtype) - or is_string_like_dtype(dtype) - or is_extension_array_dtype(dtype) - or is_datetime_or_timedelta_dtype(dtype) - ) - and copy is True - ): - new_data = Categorical.from_codes( - self.values.codes, categories=self.values.categories.astype(dtype) - ) - return self.make_block(new_data) - dtype = pandas_dtype(dtype) # astype processing From cd110bc7bb6aeb30fc60607e508891925b9ad171 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Thu, 29 Oct 2020 04:50:15 +0000 Subject: [PATCH 13/38] feedback: add comment in Categorical.astype --- pandas/core/arrays/categorical.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 39fdc354be24a..795117eaf164c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -414,6 +414,8 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: if is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") + # xref GH8628 + # PERF: astype category codes instead of astyping each entry new_categories = self.categories.astype(dtype) obj = Categorical.from_codes(self.codes, categories=new_categories) return np.array(obj.categories[self.codes], copy=copy) From 113a569f990302d52661ae8dbfec78be3b8223c1 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sat, 31 Oct 2020 20:47:05 +0000 Subject: [PATCH 14/38] CLN: remove unnecessary line --- pandas/core/arrays/categorical.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 795117eaf164c..0fb3ec74e11b8 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -417,8 +417,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: # xref GH8628 # PERF: astype category codes instead of astyping each entry new_categories = self.categories.astype(dtype) - obj = Categorical.from_codes(self.codes, categories=new_categories) - return np.array(obj.categories[self.codes], copy=copy) + return np.array(new_categories[self.codes], copy=copy) @cache_readonly def itemsize(self) -> int: From 341ceb66f27d15bbac41eb13268ccc8f8c4885ec Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 1 Nov 2020 02:34:19 +0000 Subject: [PATCH 15/38] TST: error msg changed --- pandas/tests/arrays/categorical/test_dtypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 47ce9cb4089f9..9c18243a9e451 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -116,8 +116,8 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = "could not convert string to float" - with pytest.raises(ValueError, match=msg): + msg = "Cannot cast Index to dtype float64" + with pytest.raises(TypeError, match=msg): cat.astype(float) # numeric From c5f3fd4a02b4ca4786b0017fc5bbe5dbddf788db Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 1 Nov 2020 04:06:37 +0000 Subject: [PATCH 16/38] ASV: feedback --- asv_bench/benchmarks/categoricals.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 0208cef4c9838..04a8386008af2 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -1,3 +1,5 @@ +import string +import sys import warnings import numpy as np @@ -69,15 +71,26 @@ def time_existing_series(self): class AsType: def setup(self): - N = 10 ** 6 + N = 10 ** 5 + + random_pick = np.random.default_rng().choice + + categories = { + "str": list(string.ascii_letters), + "int": np.random.randint(2 ** 16, size=154), + "float": sys.maxsize * np.random.random((38,)), + "timestamp": [ + pd.Timestamp(x, unit="s") for x in np.random.randint(2 ** 18, size=578) + ], + } self.df = pd.DataFrame( - np.random.default_rng() - .choice(np.array(list("abcde")), 4 * N) - .reshape(N, 4), - columns=list("ABCD"), + {col: random_pick(cats, N) for col, cats in categories.items()} ) + for col in ("int", "float", "timestamp"): + self.df[col + "as_str"] = self.df[col].astype(str) + for col in self.df.columns: self.df[col] = self.df[col].astype("category") From 9943bb9c15a873c2af8dcd3210c6ea95cebe58b5 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 1 Nov 2020 04:21:34 +0000 Subject: [PATCH 17/38] ASV: more --- asv_bench/benchmarks/categoricals.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 04a8386008af2..f3b005b704014 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -89,13 +89,25 @@ def setup(self): ) for col in ("int", "float", "timestamp"): - self.df[col + "as_str"] = self.df[col].astype(str) + self.df[col + "_as_str"] = self.df[col].astype(str) for col in self.df.columns: self.df[col] = self.df[col].astype("category") - def astype_unicode(self): - [self.df[col].astype("unicode") for col in self.df.columns] + def astype_str(self): + [self.df[col].astype("str") for col in "int float timestamp".split()] + + def astype_int(self): + [self.df[col].astype("int") for col in "int_as_str timestamp".split()] + + def astype_float(self): + [ + self.df[col].astype("float") + for col in "float_as_str int int_as_str timestamp".split() + ] + + def astype_datetime(self): + self.df["float"].astype(pd.DatetimeTZDtype(tz="US/Pacific")) class Concat: From c720536f21e65a607184c3148a967e38c4d72d79 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Sun, 1 Nov 2020 04:29:07 +0000 Subject: [PATCH 18/38] DOC: mention Series.astype --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 87b0e23997d4e..3c6e1c786e41d 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -347,7 +347,7 @@ Performance improvements - Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`) - Reduced peak memory usage in :meth:`DataFrame.to_pickle` when using ``protocol=5`` in python 3.8+ (:issue:`34244`) - Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`) -- Performance improvement in :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`) +- Performance improvement in :meth:`Series.astype` and :meth:`DataFrame.astype` for :class:`Categorical` (:issue:`8628`) .. --------------------------------------------------------------------------- From 190c015665dfed0d7ee460997f31236ee35ea7b4 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 8 Nov 2020 04:51:06 +0000 Subject: [PATCH 19/38] BUG: fix empty input cases --- pandas/core/arrays/categorical.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 2f57fd34eab03..4928cade67188 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -416,7 +416,10 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: # xref GH8628 # PERF: astype category codes instead of astyping each entry - new_categories = self.categories.astype(dtype) + if len(self.codes) == 0 or len(self.categories) == 0: + return array(self, dtype=dtype, copy=copy) + + new_categories = np.append(self.categories.astype(dtype), [np.nan]) return np.array(new_categories[self.codes], copy=copy) @cache_readonly From f9a3040ac383c1c8a970c39556e0a9f17eee4448 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 8 Nov 2020 05:55:32 +0000 Subject: [PATCH 20/38] BUG: fix array vs. np.array usage --- pandas/core/arrays/categorical.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 4928cade67188..e14c745631093 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -406,21 +406,23 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: # GH 10696/18593 dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self - if dtype == self.dtype: - return self - return self._set_dtype(dtype) - if is_extension_array_dtype(dtype): - return array(self, dtype=dtype, copy=copy) - if is_integer_dtype(dtype) and self.isna().any(): + result = self if dtype is self.dtype else self._set_dtype(dtype) + + elif is_extension_array_dtype(dtype): + result = array(self, dtype=dtype, copy=copy) + + elif is_integer_dtype(dtype) and self.isna().any(): raise ValueError("Cannot convert float NaN to integer") - # xref GH8628 - # PERF: astype category codes instead of astyping each entry - if len(self.codes) == 0 or len(self.categories) == 0: - return array(self, dtype=dtype, copy=copy) + elif len(self.codes) == 0 or len(self.categories) == 0: + result = np.array(self, dtype=dtype, copy=copy) - new_categories = np.append(self.categories.astype(dtype), [np.nan]) - return np.array(new_categories[self.codes], copy=copy) + else: + # PERF (GH8628): astype category codes instead of astyping array + new_categories = np.append(self.categories.astype(dtype), [np.nan]) + result = np.array(new_categories[self.codes], dtype=dtype, copy=copy) + + return result @cache_readonly def itemsize(self) -> int: From 568aa7f76b57d6540fc412916fe01e7a47858838 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 8 Nov 2020 06:23:57 +0000 Subject: [PATCH 21/38] BUG: astype to same dtype; use na_value_for_dtype --- pandas/core/arrays/categorical.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index e14c745631093..763b4fe4b22fa 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -38,7 +38,12 @@ ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna +from pandas.core.dtypes.missing import ( + is_valid_nat_for_dtype, + isna, + na_value_for_dtype, + notna, +) from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names @@ -400,7 +405,10 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: If copy is set to False and dtype is categorical, the original object is returned. """ - if is_categorical_dtype(dtype): + if self.dtype is dtype: + result = self.copy() if copy else self + + elif is_categorical_dtype(dtype): dtype = cast(Union[str, CategoricalDtype], dtype) # GH 10696/18593 @@ -419,7 +427,9 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: else: # PERF (GH8628): astype category codes instead of astyping array - new_categories = np.append(self.categories.astype(dtype), [np.nan]) + new_categories = np.append( + self.categories.astype(dtype), [na_value_for_dtype(dtype)] + ) result = np.array(new_categories[self.codes], dtype=dtype, copy=copy) return result From 6860e483205cd744c3200dc4ce3703cec2251a15 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 8 Nov 2020 06:47:39 +0000 Subject: [PATCH 22/38] TST/BUG: fix error message --- pandas/tests/series/test_dtypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 2fbed92567f71..9369f182e23d7 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -84,8 +84,8 @@ def test_astype_categorical_to_other(self): expected = s tm.assert_series_equal(s.astype("category"), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) - msg = r"could not convert string to float|invalid literal for float\(\)" - with pytest.raises(ValueError, match=msg): + msg = r"Cannot cast Index to dtype float64" + with pytest.raises(TypeError, match=msg): s.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) From f2aa2ef55908a6d00fb857118cb237b9fef004da Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Sun, 8 Nov 2020 06:50:18 +0000 Subject: [PATCH 23/38] CLN: simplify elif --- pandas/core/arrays/categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 763b4fe4b22fa..c324e0b1261a9 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -414,7 +414,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: # GH 10696/18593 dtype = self.dtype.update_dtype(dtype) self = self.copy() if copy else self - result = self if dtype is self.dtype else self._set_dtype(dtype) + result = self._set_dtype(dtype) elif is_extension_array_dtype(dtype): result = array(self, dtype=dtype, copy=copy) @@ -426,7 +426,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: result = np.array(self, dtype=dtype, copy=copy) else: - # PERF (GH8628): astype category codes instead of astyping array + # GH8628 (PERF): astype category codes instead of astyping array new_categories = np.append( self.categories.astype(dtype), [na_value_for_dtype(dtype)] ) From a323544886b638e35ee5d6864581e60e0f10b8fb Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 9 Nov 2020 17:58:52 +0000 Subject: [PATCH 24/38] TST: revert error message --- pandas/tests/series/test_dtypes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 2fed80603df33..e738cbb69cb3a 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -68,8 +68,8 @@ def test_astype_categorical_to_other(self): expected = s tm.assert_series_equal(s.astype("category"), expected) tm.assert_series_equal(s.astype(CategoricalDtype()), expected) - msg = r"Cannot cast Index to dtype float64" - with pytest.raises(TypeError, match=msg): + msg = r"Cannot cast object dtype to float64" + with pytest.raises(ValueError, match=msg): s.astype("float64") cat = Series(Categorical(["a", "b", "b", "a", "a", "c", "c", "c"])) From 07b2a6523d455330abc6af69a2680706f1f9b949 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 9 Nov 2020 18:02:01 +0000 Subject: [PATCH 25/38] REF/ERR: use take_1d + catch casting error --- pandas/core/arrays/categorical.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ae4ed19684951..74455baf4c24d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -38,12 +38,7 @@ ) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import ( - is_valid_nat_for_dtype, - isna, - na_value_for_dtype, - notna, -) +from pandas.core.dtypes.missing import is_valid_nat_for_dtype, isna, notna from pandas.core import ops from pandas.core.accessor import PandasDelegate, delegate_names @@ -427,10 +422,15 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: else: # GH8628 (PERF): astype category codes instead of astyping array - new_categories = np.append( - self.categories.astype(dtype), [na_value_for_dtype(dtype)] - ) - result = np.array(new_categories[self.codes], dtype=dtype, copy=copy) + try: + astyped_categories = np.array( + self.categories.to_numpy(), dtype=dtype, copy=copy + ) + except (TypeError, ValueError): + raise ValueError( + f"Cannot cast {self.categories.dtype} dtype to {dtype}" + ) + result = np.array(take_1d(astyped_categories, self._codes), copy=copy) return result From 229bfc7541cecc3b1d2f363f661218ca56ca266f Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 9 Nov 2020 18:40:19 +0000 Subject: [PATCH 26/38] ERR: fix casting error message --- pandas/core/arrays/categorical.py | 2 +- pandas/tests/arrays/categorical/test_dtypes.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 74455baf4c24d..8b9e4c33ba261 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -428,7 +428,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: ) except (TypeError, ValueError): raise ValueError( - f"Cannot cast {self.categories.dtype} dtype to {dtype}" + f"Cannot cast {self.categories.dtype} dtype to {dtype.__name__}" ) result = np.array(take_1d(astyped_categories, self._codes), copy=copy) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 945e9678e9933..9799113d9e0c6 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -127,8 +127,8 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = "Cannot cast Index to dtype float64" - with pytest.raises(TypeError, match=msg): + msg = "Cannot cast object dtype to float" + with pytest.raises(ValueError, match=msg): cat.astype(float) # numeric From da12be0ca49671b3e0658947d0d06771c5600bda Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 9 Nov 2020 20:05:42 +0000 Subject: [PATCH 27/38] ERR/TST: revert message change in method & test --- pandas/core/arrays/categorical.py | 2 +- pandas/tests/arrays/categorical/test_dtypes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 8b9e4c33ba261..74455baf4c24d 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -428,7 +428,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: ) except (TypeError, ValueError): raise ValueError( - f"Cannot cast {self.categories.dtype} dtype to {dtype.__name__}" + f"Cannot cast {self.categories.dtype} dtype to {dtype}" ) result = np.array(take_1d(astyped_categories, self._codes), copy=copy) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 9799113d9e0c6..ad21d068c6173 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -127,7 +127,7 @@ def test_astype(self, ordered): expected = np.array(cat) tm.assert_numpy_array_equal(result, expected) - msg = "Cannot cast object dtype to float" + msg = r"Cannot cast object dtype to " with pytest.raises(ValueError, match=msg): cat.astype(float) From 93f3e1a5020fadf0996f9972bd50e51853a42953 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 10 Nov 2020 01:00:46 +0000 Subject: [PATCH 28/38] REF (feedback): astype categories then extract numpy array --- pandas/core/arrays/categorical.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 74455baf4c24d..0d3c83b004616 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -423,13 +423,12 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: else: # GH8628 (PERF): astype category codes instead of astyping array try: - astyped_categories = np.array( - self.categories.to_numpy(), dtype=dtype, copy=copy - ) + astyped_categories = self.categories.astype(dtype=dtype, copy=copy) except (TypeError, ValueError): raise ValueError( f"Cannot cast {self.categories.dtype} dtype to {dtype}" ) + astyped_categories = extract_array(astyped_categories, extract_numpy=True) result = np.array(take_1d(astyped_categories, self._codes), copy=copy) return result From 9cb5fe33471ad44615bbe9264e86caf56bce6975 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 10 Nov 2020 03:38:41 +0000 Subject: [PATCH 29/38] CLN (feedback): remove np.array from take_1d --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 0d3c83b004616..780ceae4023c4 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -429,7 +429,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: f"Cannot cast {self.categories.dtype} dtype to {dtype}" ) astyped_categories = extract_array(astyped_categories, extract_numpy=True) - result = np.array(take_1d(astyped_categories, self._codes), copy=copy) + result = take_1d(astyped_categories, self._codes) return result From f55964e87804c9e48981a9cba9b5297cee2ad890 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 10 Nov 2020 03:56:27 +0000 Subject: [PATCH 30/38] CI (feedback): use ensure_platform_int in cast --- pandas/core/arrays/categorical.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 780ceae4023c4..33ca0a04407fe 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -429,7 +429,9 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: f"Cannot cast {self.categories.dtype} dtype to {dtype}" ) astyped_categories = extract_array(astyped_categories, extract_numpy=True) - result = take_1d(astyped_categories, self._codes) + result = take_1d( + astyped_categories, libalgos.ensure_platform_int(self._codes) + ) return result From b342135a2adf29e8a103e8b8e8b926380c218b10 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 10 Nov 2020 04:04:53 +0000 Subject: [PATCH 31/38] COMMENT: add TODO re consolidating EA/ndarray cases --- pandas/core/arrays/categorical.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 33ca0a04407fe..05ea46299b465 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -411,6 +411,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: self = self.copy() if copy else self result = self._set_dtype(dtype) + # TODO: consolidate with ndarray case? elif is_extension_array_dtype(dtype): result = array(self, dtype=dtype, copy=copy) From 73e044270fd752e5707250db03de5f206cf2a9f1 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Tue, 10 Nov 2020 05:08:29 +0000 Subject: [PATCH 32/38] CLN: shorten variable name --- pandas/core/arrays/categorical.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 05ea46299b465..6d8ff5cc10b1c 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -424,15 +424,14 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: else: # GH8628 (PERF): astype category codes instead of astyping array try: - astyped_categories = self.categories.astype(dtype=dtype, copy=copy) + astyped_cats = self.categories.astype(dtype=dtype, copy=copy) except (TypeError, ValueError): raise ValueError( f"Cannot cast {self.categories.dtype} dtype to {dtype}" ) - astyped_categories = extract_array(astyped_categories, extract_numpy=True) - result = take_1d( - astyped_categories, libalgos.ensure_platform_int(self._codes) - ) + + astyped_cats = extract_array(astyped_cats, extract_numpy=True) + result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes)) return result From d195d91fb9cfc3cc63190ba9e12d62552e95b5d6 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 11 Nov 2020 22:09:47 +0000 Subject: [PATCH 33/38] TST/CI (32bit): fix up int conversions --- pandas/tests/arrays/categorical/test_dtypes.py | 2 +- pandas/tests/series/test_dtypes.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index ad21d068c6173..38460b32fc3b7 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -139,7 +139,7 @@ def test_astype(self, ordered): result = cat.astype(int) expected = np.array(cat, dtype=int) - tm.assert_numpy_array_equal(result, expected) + tm.assert_numpy_array_equal(result, expected, check_dtype=False) result = cat.astype(float) expected = np.array(cat, dtype=float) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index e738cbb69cb3a..570951760dcb0 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -76,7 +76,8 @@ def test_astype_categorical_to_other(self): exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) - exp2 = Series([1, 2, 3, 4]).astype(int) + res = s2.astype("int") + exp2 = Series([1, 2, 3, 4]).astype(res.dtype) tm.assert_series_equal(s2.astype("int"), exp2) # object don't sort correctly, so just compare that we have the same From 3351cb1040fa0bc02711133ebb1b10e9bf2306c0 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 11 Nov 2020 22:37:33 +0000 Subject: [PATCH 34/38] fix merge error --- pandas/core/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 317bdb1fcc797..24c1ae971686e 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -54,7 +54,7 @@ from pandas.compat._optional import import_optional_dependency from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError, InvalidIndexError -from pandas.util._decorators import Appender, doc, rewrite_axis_style_signature +from pandas.util._decorators import doc, rewrite_axis_style_signature from pandas.util._validators import ( validate_bool_kwarg, validate_fillna_kwargs, @@ -10861,6 +10861,7 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): cls.all = all # type: ignore[assignment] @doc( + NDFrame.mad, desc="Return the mean absolute deviation of the values " "over the requested axis.", name1=name1, @@ -10869,7 +10870,6 @@ def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): see_also="", examples="", ) - @Appender(NDFrame.mad.__doc__) def mad(self, axis=None, skipna=None, level=None): return NDFrame.mad(self, axis, skipna, level) From 13fa0866c977bb957c04af3b653e536bb42bd794 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Tue, 17 Nov 2020 15:24:07 +0000 Subject: [PATCH 35/38] TST: use np.intp in dtype tests --- pandas/tests/arrays/categorical/test_dtypes.py | 4 ++-- pandas/tests/series/test_dtypes.py | 3 +-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 38460b32fc3b7..13946aef808e4 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -138,8 +138,8 @@ def test_astype(self, ordered): tm.assert_numpy_array_equal(result, expected) result = cat.astype(int) - expected = np.array(cat, dtype=int) - tm.assert_numpy_array_equal(result, expected, check_dtype=False) + expected = np.array(cat, dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) result = cat.astype(float) expected = np.array(cat, dtype=float) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index be5278f53a2d5..3c73107880653 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -68,8 +68,7 @@ def test_astype_categorical_to_other(self): exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) - res = s2.astype("int") - exp2 = Series([1, 2, 3, 4]).astype(res.dtype) + exp2 = Series([1, 2, 3, 4]).astype(np.intp) tm.assert_series_equal(s2.astype("int"), exp2) # object don't sort correctly, so just compare that we have the same From 10168945b672c7f7c8931ebabd7cf698271772a7 Mon Sep 17 00:00:00 2001 From: arw2019 Date: Wed, 18 Nov 2020 02:28:41 +0000 Subject: [PATCH 36/38] CI: fix 32bit again --- pandas/tests/arrays/categorical/test_dtypes.py | 2 +- pandas/tests/series/test_dtypes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_dtypes.py b/pandas/tests/arrays/categorical/test_dtypes.py index 13946aef808e4..12654388de904 100644 --- a/pandas/tests/arrays/categorical/test_dtypes.py +++ b/pandas/tests/arrays/categorical/test_dtypes.py @@ -138,7 +138,7 @@ def test_astype(self, ordered): tm.assert_numpy_array_equal(result, expected) result = cat.astype(int) - expected = np.array(cat, dtype=np.intp) + expected = np.array(cat, dtype="int64") tm.assert_numpy_array_equal(result, expected) result = cat.astype(float) diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py index 3c73107880653..865ae565b6501 100644 --- a/pandas/tests/series/test_dtypes.py +++ b/pandas/tests/series/test_dtypes.py @@ -68,7 +68,7 @@ def test_astype_categorical_to_other(self): exp = Series(["a", "b", "b", "a", "a", "c", "c", "c"]) tm.assert_series_equal(cat.astype("str"), exp) s2 = Series(Categorical(["1", "2", "3", "4"])) - exp2 = Series([1, 2, 3, 4]).astype(np.intp) + exp2 = Series([1, 2, 3, 4]).astype("int64") tm.assert_series_equal(s2.astype("int"), exp2) # object don't sort correctly, so just compare that we have the same From a9544b3c55e53211152a80ba058830a305534218 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 18 Nov 2020 10:42:21 -0500 Subject: [PATCH 37/38] DOC: add note re: CategoricalIndex TypeError catch --- pandas/core/arrays/categorical.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index fe8b7c206f38d..10f05fd46fd90 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -425,10 +425,12 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: # GH8628 (PERF): astype category codes instead of astyping array try: astyped_cats = self.categories.astype(dtype=dtype, copy=copy) - except (TypeError, ValueError): - raise ValueError( - f"Cannot cast {self.categories.dtype} dtype to {dtype}" - ) + except ( + TypeError, # downstream error msg for CategoricalIndex is misleading + ValueError, + ): + msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" + raise ValueError(msg) astyped_cats = extract_array(astyped_cats, extract_numpy=True) result = take_1d(astyped_cats, libalgos.ensure_platform_int(self._codes)) From 7e9fc329fca908309612cff33260f2fef8b170c5 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Wed, 18 Nov 2020 11:19:12 -0500 Subject: [PATCH 38/38] CI: fix merge error --- pandas/core/indexes/period.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index d52a8ae935688..0b0f985697da9 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -6,7 +6,7 @@ from pandas._libs import index as libindex from pandas._libs.tslibs import BaseOffset, Period, Resolution, Tick from pandas._libs.tslibs.parsing import DateParseError, parse_time_string -from pandas._typing import DtypeObj, Label +from pandas._typing import DtypeObj from pandas.errors import InvalidIndexError from pandas.util._decorators import Appender, cache_readonly, doc