def common_dtype_categorical_compat(
    objs: list[Index | ArrayLike], dtype: DtypeObj
) -> DtypeObj:
    """
    Update the result of find_common_type to account for NAs in a Categorical.

    An integer dtype has no way of representing a missing value.  When the
    proposed common dtype is a signed/unsigned integer numpy dtype and at
    least one categorical-dtype object in ``objs`` contains NAs, the result
    is replaced by ``float64``.  In every other case ``dtype`` is returned
    unchanged.

    Parameters
    ----------
    objs : list[np.ndarray | ExtensionArray | Index]
        The objects whose common dtype was computed.  Entries without a
        ``dtype`` attribute, or whose dtype is not categorical, are ignored.
    dtype : np.dtype or ExtensionDtype
        The candidate common dtype, e.g. the result of ``find_common_type``.

    Returns
    -------
    np.dtype or ExtensionDtype
    """
    # GH#38240

    # TODO: more generally, could do `not can_hold_na(dtype)`
    if not (isinstance(dtype, np.dtype) and dtype.kind in ["i", "u"]):
        # Only integer numpy dtypes need adjusting.
        return dtype

    for obj in objs:
        # Inspect the actual dtype object; we don't want to accidentally
        #  allow e.g. the "categorical" str here
        obj_dtype = getattr(obj, "dtype", None)
        if not isinstance(obj_dtype, CategoricalDtype):
            continue

        if isinstance(obj, (ABCIndex, ABCSeries)):
            # Index/Series expose the public ``hasnans``; this check may
            #  already be cached.  A Series has no ``_hasnans``, so it must
            #  not fall through to the Categorical branch below.
            hasnas = obj.hasnans
        else:
            # Categorical
            hasnas = cast("Categorical", obj)._hasnans

        if hasnas:
            # see test_union_int_categorical_with_nan
            # NOTE(review): float64 cannot represent every int64/uint64
            #  value exactly, so this cast may be lossy for very large ints.
            return np.dtype(np.float64)

    return dtype
def test_union_int_categorical_with_nan():
    """
    Union of an integer Index with an integer-category CategoricalIndex
    holding NaN gives a float64 Index, whichever side calls ``union``.
    """
    cat_index = CategoricalIndex([1, 2, np.nan])
    # Precondition: the categories themselves are integers; the NaN lives
    #  only in the codes.
    assert cat_index.categories.dtype.kind == "i"

    int_index = Index([1, 2])
    expected = Index([1, 2, np.nan], dtype=np.float64)

    # Non-categorical index as the caller
    tm.assert_index_equal(int_index.union(cat_index), expected)

    # Categorical index as the caller
    tm.assert_index_equal(cat_index.union(int_index), expected)