diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 8209525721b98..8ae9a680e4bdd 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -604,7 +604,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`DataFrame.loc` when setting :class:`Series` with extension dtype into NumPy dtype (:issue:`55604`) - Bug in :meth:`Index.difference` not returning a unique set of values when ``other`` is empty or ``other`` is considered non-comparable (:issue:`55113`) -- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`) +- Bug in setting :class:`Categorical` values into a :class:`DataFrame` with numpy dtypes raising ``RecursionError`` (:issue:`52927`, :issue:`56376`) Missing ^^^^^^^ diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d5144174d3c71..aad5a44de651c 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1749,6 +1749,7 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: tipo = _maybe_infer_dtype_type(element) + casted: Any # For mypy if dtype.kind in "iu": if isinstance(element, range): if _dtype_can_hold_range(element, dtype): @@ -1780,9 +1781,9 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: return casted raise LossySetitemError - elif isinstance(element, ABCExtensionArray) and isinstance( - element.dtype, CategoricalDtype - ): + elif isinstance( + element, (ABCExtensionArray, ABCIndex, ABCSeries) + ) and isinstance(element.dtype, CategoricalDtype): # GH#52927 setting Categorical value into non-EA frame # TODO: general-case for EAs? try: @@ -1834,6 +1835,23 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: raise LossySetitemError if tipo is not None: + if isinstance( + element, (ABCExtensionArray, ABCIndex, ABCSeries) + ) and isinstance(element.dtype, CategoricalDtype): + # GH#52927,56376 setting Categorical value into non-EA frame + # TODO: general-case for EAs? + try: + casted = element.astype(dtype) + except (ValueError, TypeError): + raise LossySetitemError + # Check for cases of either + # a) lossy overflow/rounding or + # b) semantic changes like dt64->int64 + comp = casted == element + if not comp.all(): + raise LossySetitemError + return casted + # TODO: itemsize check? if tipo.kind not in "iuf": # Anything other than float/integer we cannot hold @@ -1841,7 +1859,7 @@ def np_can_hold_element(dtype: np.dtype, element: Any) -> Any: if not isinstance(tipo, np.dtype): # i.e. nullable IntegerDtype or FloatingDtype; # we can put this into an ndarray losslessly iff it has no NAs - if element._hasna: + if element._hasna: # type: ignore[union-attr] raise LossySetitemError return element elif tipo.itemsize > dtype.itemsize or tipo.kind != dtype.kind: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 18d6834e6191c..553393abb7b23 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -5504,6 +5504,14 @@ def putmask(self, mask, value) -> Index: # See also: Block.coerce_to_target_dtype dtype = self._find_common_type_compat(value) + + # Prevent an infinite putmask loop GH56376 + if dtype == self.dtype: + raise AssertionError( + "Something has gone wrong, please report a bug at " + "https://github.com/pandas-dev/pandas/issues" + ) from err + return self.astype(dtype).putmask(mask, value) values = self._values.copy() diff --git a/pandas/tests/indexes/test_indexing.py b/pandas/tests/indexes/test_indexing.py index 1ea47f636ac9b..35705f74d325c 100644 --- a/pandas/tests/indexes/test_indexing.py +++ b/pandas/tests/indexes/test_indexing.py @@ -26,12 +26,15 @@ from pandas import ( NA, + Categorical, + CategoricalDtype, DatetimeIndex, Index, IntervalIndex, MultiIndex, NaT, PeriodIndex, + Series, TimedeltaIndex, ) import pandas._testing as tm @@ -303,6 +306,35 @@ def test_putmask_with_wrong_mask(self, index): index.putmask("foo", fill) +def test_putmask_categorical(): + # Check that putmask can use categorical values in various forms GH56376 + index = Index([2, 1, 0], dtype="int64") + dtype = CategoricalDtype(categories=np.asarray([1, 2, 3], dtype="float64")) + + value = Categorical([1.0, 2.0, 3.0], dtype=dtype) + result = index.putmask([True, True, False], value) + expected = Index([1, 2, 0], dtype="int64") + tm.assert_index_equal(result, expected) + + value = Series([1.0, 2.0, 3.0], dtype=dtype) + result = index.putmask([True, True, False], value) + tm.assert_index_equal(result, expected) + + value = Index([1.0, 2.0, 3.0], dtype=dtype) + result = index.putmask([True, True, False], value) + tm.assert_index_equal(result, expected) + + +def test_putmask_infinite_loop(): + # Check that putmask won't get stuck in an infinite loop GH56376 + index = Index([1, 2, 0], dtype="int64") + dtype = CategoricalDtype(categories=np.asarray([1, 2, 3], dtype="float64")) + value = Index([1.0, np.nan, 3.0], dtype=dtype) + + with pytest.raises(AssertionError, match="please report a bug"): + index.putmask([True, True, False], value) + + @pytest.mark.parametrize( "idx", [Index([1, 2, 3]), Index([0.1, 0.2, 0.3]), Index(["a", "b", "c"])] ) diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py index ce7dde3c4cb42..ec4a4ee87c3a9 100644 --- a/pandas/tests/indexing/test_loc.py +++ b/pandas/tests/indexing/test_loc.py @@ -1677,12 +1677,17 @@ def test_loc_setitem_range_key(self, frame_or_series): expected = frame_or_series([0, 1, 10, 9, 11], index=obj.index) tm.assert_equal(obj, expected) - def test_loc_setitem_numpy_frame_categorical_value(self): + @pytest.mark.parametrize("dtype", ["int64", "float64"]) + def test_loc_setitem_numpy_frame_categorical_value(self, dtype): # GH#52927 - df = DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + df = DataFrame({"a": [1, 1, 1, 1, 1], "b": ["a", "a", "a", "a", "a"]}).astype( + {"a": dtype} + ) df.loc[1:2, "a"] = Categorical([2, 2], categories=[1, 2]) - expected = DataFrame({"a": [1, 2, 2, 1, 1], "b": ["a", "a", "a", "a", "a"]}) + expected = DataFrame( + {"a": [1, 2, 2, 1, 1], "b": ["a", "a", "a", "a", "a"]} + ).astype({"a": dtype}) tm.assert_frame_equal(df, expected)