From 682d4a96d5c666de12f44f4afa8963f6e43d763b Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Wed, 2 Oct 2019 17:56:30 -0500 Subject: [PATCH 01/10] BUG: Allow cast from cat to extension dtype --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/arrays/categorical.py | 4 +++- pandas/tests/extension/test_categorical.py | 7 +++++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2668734031ee1..d9d38dc1c9dc1 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -173,6 +173,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) +- Bug in :meth:`Categorical.astype` not allowing for casting to extension dtypes (:issue:`28668`) - :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 33d1de01fa3db..26dced886ffce 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -57,7 +57,7 @@ ) from pandas.core.base import NoNewAttributesMixin, PandasObject, _shared_docs import pandas.core.common as com -from pandas.core.construction import extract_array, sanitize_array +from pandas.core.construction import array, extract_array, sanitize_array from pandas.core.missing import interpolate_2d from pandas.core.sorting import nargsort @@ -523,6 +523,8 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: if is_integer_dtype(dtype) and self.isna().any(): msg = "Cannot convert float NaN to integer" raise ValueError(msg) + if is_extension_array_dtype(dtype): + return array(self, dtype=dtype, copy=copy) return np.array(self, dtype=dtype, copy=copy) @cache_readonly diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index c342777b0ebc4..92a65ce38a0ee 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -207,6 +207,13 @@ def test_cast_nan_to_int(self, cls, values): with pytest.raises((ValueError, TypeError), match=msg): s.astype(int) + def test_cast_category_to_interval(self): + # GH 28668 + expected = pd.Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval") + result = expected.astype("category").astype("interval") + + tm.assert_series_equal(result, expected) + class TestArithmeticOps(base.BaseArithmeticOpsTests): def test_arith_series_with_scalar(self, data, all_arithmetic_operators): From b91ba053eb9853e9a2327467e4d92254b5941579 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Thu, 3 Oct 2019 17:18:52 -0500 Subject: [PATCH 02/10] Ignore type check --- pandas/core/arrays/categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 26dced886ffce..1c01fc31bca4e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -524,7 +524,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: msg = "Cannot convert float NaN to integer" raise ValueError(msg) if is_extension_array_dtype(dtype): - return array(self, dtype=dtype, copy=copy) + return array(self, dtype=dtype, copy=copy) # type: ignore # GH 28770 return np.array(self, dtype=dtype, copy=copy) @cache_readonly From 10745401194d3f5b25d67ef058b1ee9ab42996e0 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Fri, 4 Oct 2019 08:17:34 -0500 Subject: [PATCH 03/10] Move condition --- pandas/core/arrays/categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 1c01fc31bca4e..b7431033dae59 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -520,11 +520,11 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: if dtype == self.dtype: return self return self._set_dtype(dtype) + if is_extension_array_dtype(dtype): + return array(self, dtype=dtype, copy=copy) # type: ignore # GH 28770 if is_integer_dtype(dtype) and self.isna().any(): msg = "Cannot convert float NaN to integer" raise ValueError(msg) - if is_extension_array_dtype(dtype): - return array(self, dtype=dtype, copy=copy) # type: ignore # GH 28770 return np.array(self, dtype=dtype, copy=copy) @cache_readonly From 580a30fd1375a7bfb33a5a99415146bec1993422 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Fri, 4 Oct 2019 16:28:25 -0500 Subject: [PATCH 04/10] Parametrize test over dtypes --- pandas/tests/extension/test_categorical.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 92a65ce38a0ee..a7daa153d8b39 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -207,10 +207,18 @@ def test_cast_nan_to_int(self, cls, values): with pytest.raises((ValueError, TypeError), match=msg): s.astype(int) - def test_cast_category_to_interval(self): + @pytest.mark.parametrize( + "values", + [ + pd.Series([pd.Period("2019"), pd.Period("2020")], dtype="period[A-DEC]"), + pd.Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval"), + pd.Series([1, np.nan], dtype="Int64"), + ], + ) + def test_cast_category_to_extension_dtype(self, values): # GH 28668 - expected = pd.Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval") - result = expected.astype("category").astype("interval") + expected = values + result = expected.astype("category").astype(expected.dtype) tm.assert_series_equal(result, expected) From 1fdc8da733295416afd4c3450060a7aeadadfda1 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Sat, 5 Oct 2019 16:08:31 -0500 Subject: [PATCH 05/10] Update test --- pandas/tests/extension/test_categorical.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index a7daa153d8b39..bbd963ceb44e3 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -208,16 +208,15 @@ def test_cast_nan_to_int(self, cls, values): s.astype(int) @pytest.mark.parametrize( - "values", + "expected", [ pd.Series([pd.Period("2019"), pd.Period("2020")], dtype="period[A-DEC]"), pd.Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval"), pd.Series([1, np.nan], dtype="Int64"), ], ) - def test_cast_category_to_extension_dtype(self, values): + def test_cast_category_to_extension_dtype(self, expected): # GH 28668 - expected = values result = expected.astype("category").astype(expected.dtype) tm.assert_series_equal(result, expected) From d24c5c15de265a7f2671a5939e3cd3244cffe154 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Sat, 5 Oct 2019 22:10:59 -0500 Subject: [PATCH 06/10] Add merge test --- pandas/tests/reshape/merge/test_merge.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index a04f093ee7818..ca09413251f0a 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2094,3 +2094,20 @@ def test_merge_equal_cat_dtypes2(): # Categorical is unordered, so don't check ordering. tm.assert_frame_equal(result, expected, check_categorical=False) + + +def test_merge_on_cat_and_ext_array(): + # GH 28668 + df1 = DataFrame( + {"a": Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval")} + ) + df2 = df1.copy() + df2["a"] = df1["a"].astype("category") + + joined1 = pd.merge(df1, df2, how="inner", on="a") + joined2 = pd.merge(df2, df1, how="inner", on="a") + + # dtype depends on order of join so this is necessary for now + joined2["a"] = joined2["a"].astype(joined1["a"].dtype) + + assert_frame_equal(joined1, joined2) From 377810a24983e45314114424669a226977c27b45 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Sat, 5 Oct 2019 22:59:34 -0500 Subject: [PATCH 07/10] Add datetime extension dtype test --- pandas/tests/extension/test_categorical.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index bbd963ceb44e3..03f7e739ae74b 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -210,6 +210,7 @@ def test_cast_nan_to_int(self, cls, values): @pytest.mark.parametrize( "expected", [ + pd.Series(["2019", "2020"], dtype=pd.DatetimeTZDtype(tz="UTC")), pd.Series([pd.Period("2019"), pd.Period("2020")], dtype="period[A-DEC]"), pd.Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval"), pd.Series([1, np.nan], dtype="Int64"), From 4ed091531953b009c4242374ae254dbe48e6aead Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Sun, 6 Oct 2019 18:07:23 -0500 Subject: [PATCH 08/10] Add to test cases --- pandas/tests/extension/test_categorical.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 03f7e739ae74b..e70e4f2fe501b 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -210,7 +210,8 @@ def test_cast_nan_to_int(self, cls, values): @pytest.mark.parametrize( "expected", [ - pd.Series(["2019", "2020"], dtype=pd.DatetimeTZDtype(tz="UTC")), + pd.Series(["2019", "2020"], dtype="datetime64[ns, UTC]"), + pd.Series([0, 0], dtype="timedelta64[ns]"), pd.Series([pd.Period("2019"), pd.Period("2020")], dtype="period[A-DEC]"), pd.Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval"), pd.Series([1, np.nan], dtype="Int64"), From 300e03700d2c07566c23785dba34d870adc9543d Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Sun, 6 Oct 2019 18:14:34 -0500 Subject: [PATCH 09/10] Clean up merge test --- pandas/tests/reshape/merge/test_merge.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py index 772fcb77713f2..4de8bba169438 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -2098,19 +2098,16 @@ def test_merge_equal_cat_dtypes2(): def test_merge_on_cat_and_ext_array(): # GH 28668 - df1 = DataFrame( + right = DataFrame( {"a": Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval")} ) - df2 = df1.copy() - df2["a"] = df2["a"].astype("category") - - joined1 = pd.merge(df1, df2, how="inner", on="a") - joined2 = pd.merge(df2, df1, how="inner", on="a") + left = right.copy() + left["a"] = left["a"].astype("category") - # dtype depends on order of join so this is necessary for now - joined2["a"] = joined2["a"].astype(joined1["a"].dtype) + result = pd.merge(left, right, how="inner", on="a") + expected = right.copy() - assert_frame_equal(joined1, joined2) + assert_frame_equal(result, expected) def test_merge_multiindex_columns(): From 261854062c4e5a3cc4cbd6aa64211db7241e57e6 Mon Sep 17 00:00:00 2001 From: Daniel Saxton <> Date: Sun, 6 Oct 2019 19:37:34 -0500 Subject: [PATCH 10/10] Add merge note --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 518a22875247e..f83aa0ce87d3b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -218,6 +218,7 @@ Categorical - Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`) - Bug in :meth:`Categorical.astype` where ``NaN`` values were handled incorrectly when casting to int (:issue:`28406`) - Bug in :meth:`Categorical.astype` not allowing for casting to extension dtypes (:issue:`28668`) +- Bug where :func:`merge` was unable to join on categorical and extension dtype columns (:issue:`28668`) - :meth:`Categorical.searchsorted` and :meth:`CategoricalIndex.searchsorted` now work on unordered categoricals also (:issue:`21667`) - Added test to assert roundtripping to parquet with :func:`DataFrame.to_parquet` or :func:`read_parquet` will preserve Categorical dtypes for string types (:issue:`27955`) -