From d3c0b9cb362d1d1b1c9b6642f14d44aeda200c67 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 4 Nov 2021 15:36:11 -0700 Subject: [PATCH 1/3] BUG: failure to cast all-int floating dtypes when setting into int dtypes --- pandas/core/dtypes/cast.py | 8 +++ pandas/core/indexes/numeric.py | 12 ++++ pandas/core/internals/blocks.py | 8 +++ pandas/core/series.py | 17 ++++++ .../dtypes/cast/test_can_hold_element.py | 13 +++++ .../tests/indexing/multiindex/test_setitem.py | 17 +++++- pandas/tests/indexing/test_iloc.py | 11 +++- pandas/tests/series/indexing/test_setitem.py | 57 +++++++++++++++---- pandas/tests/series/indexing/test_where.py | 10 +--- 9 files changed, 133 insertions(+), 20 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 16512305e07ec..201e0501b79e3 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2204,6 +2204,14 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: if tipo.kind not in ["i", "u"]: if is_float(element) and element.is_integer(): return True + + if isinstance(element, np.ndarray) and element.dtype.kind == "f": + # If all can be losslessly cast to integers, then we can hold them + # We do something similar in putmask_smart + casted = element.astype(dtype) + comp = casted == element + return comp.all() + # Anything other than integer we cannot hold return False elif dtype.itemsize < tipo.itemsize: diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 40aa70a2ada2f..4d8c411478993 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -425,6 +425,18 @@ def asi8(self) -> npt.NDArray[np.int64]: ) return self._values.view(self._default_dtype) + def _validate_fill_value(self, value): + # e.g. np.array([1.0]) we want np.array([1], dtype=self.dtype) + # see TestSetitemFloatNDarrayIntoIntegerSeries + super()._validate_fill_value(value) + if hasattr(value, "dtype") and is_float_dtype(value.dtype): + converted = value.astype(self.dtype) + if (converted == value).all(): + # See also: can_hold_element + return converted + raise TypeError + return value + class Int64Index(IntegerIndex): _index_descr_args = { diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 751cf41a09f14..f6fcf8a7e45ac 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1198,6 +1198,14 @@ def where(self, other, cond, errors="raise") -> list[Block]: values, icond.sum(), other # type: ignore[arg-type] ) if alt is not other: + if is_list_like(other) and len(other) < len(values): + # call np.where with other to get the appropriate ValueError + np.where(~icond, values, other) + raise NotImplementedError( + "This should not be reached; call to np.where above is " + "expected to raise ValueError. Please report a bug at " + "github.com/pandas-dev/pandas" + ) result = values.copy() np.putmask(result, icond, alt) else: diff --git a/pandas/core/series.py b/pandas/core/series.py index b67f16008bb13..8e345cbfb4fa6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1096,9 +1096,26 @@ def __setitem__(self, key, value) -> None: if com.is_bool_indexer(key): key = check_bool_indexer(self.index, key) key = np.asarray(key, dtype=bool) + + if ( + is_list_like(value) + and len(value) != len(self) + and not isinstance(value, Series) + and not is_object_dtype(self.dtype) + ): + # Series will be reindexed to have matching length inside + # _where call below + # GH#44265 + indexer = key.nonzero()[0] + self._set_values(indexer, value) + return + + # otherwise with listlike other we interpret series[mask] = other + # as series[mask] = other[mask] try: self._where(~key, value, inplace=True) except InvalidIndexError: + # test_where_dups self.iloc[key] = value return diff --git a/pandas/tests/dtypes/cast/test_can_hold_element.py b/pandas/tests/dtypes/cast/test_can_hold_element.py index c4776f2a1e143..3a486f795f23e 100644 --- a/pandas/tests/dtypes/cast/test_can_hold_element.py +++ b/pandas/tests/dtypes/cast/test_can_hold_element.py @@ -40,3 +40,16 @@ def test_can_hold_element_range(any_int_numpy_dtype): rng = range(10 ** 10, 10 ** 10) assert len(rng) == 0 assert can_hold_element(arr, rng) + + +def test_can_hold_element_int_values_float_ndarray(): + arr = np.array([], dtype=np.int64) + + element = np.array([1.0, 2.0]) + assert can_hold_element(arr, element) + + assert not can_hold_element(arr, element + 0.5) + + # integer but not losslessly castable to int64 + element = np.array([3, 2 ** 65], dtype=np.float64) + assert not can_hold_element(arr, element) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index 5d0aeba4aebbc..bd6c41ac07ff2 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -196,16 +196,29 @@ def test_multiindex_assignment(self): df.loc[4, "d"] = arr tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) + def test_multiindex_assignment_single_dtype(self): + # GH3777 part 2b # single dtype + arr = np.array([0.0, 1.0]) + df = DataFrame( np.random.randint(5, 10, size=9).reshape(3, 3), columns=list("abc"), index=[[4, 4, 8], [8, 10, 12]], + dtype=np.int64, ) + # arr can be losslessly cast to int, so this setitem is inplace df.loc[4, "c"] = arr - exp = Series(arr, index=[8, 10], name="c", dtype="float64") - tm.assert_series_equal(df.loc[4, "c"], exp) + exp = Series(arr, index=[8, 10], name="c", dtype="int64") + result = df.loc[4, "c"] + tm.assert_series_equal(result, exp) + + # arr + 0.5 cannot be cast losslessly to int, so we upcast + df.loc[4, "c"] = arr + 0.5 + result = df.loc[4, "c"] + exp = exp + 0.5 + tm.assert_series_equal(result, exp) # scalar ok df.loc[4, "c"] = 10 diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py index 046d349b92f3f..d446d606d726f 100644 --- a/pandas/tests/indexing/test_iloc.py +++ b/pandas/tests/indexing/test_iloc.py @@ -515,9 +515,18 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks( # but on a DataFrame with multiple blocks df = DataFrame([[0, 1], [2, 3]], columns=["B", "B"]) + # setting float values that can be held by existing integer arrays + # is inplace df.iloc[:, 0] = df.iloc[:, 0].astype("f8") + if not using_array_manager: + assert len(df._mgr.blocks) == 1 + + # if the assigned values cannot be held by existing integer arrays, + # we cast + df.iloc[:, 0] = df.iloc[:, 0] + 0.5 if not using_array_manager: assert len(df._mgr.blocks) == 2 + expected = df.copy() # assign back to self @@ -892,7 +901,7 @@ def test_iloc_with_boolean_operation(self): tm.assert_frame_equal(result, expected) result.iloc[[False, False, True, True]] /= 2 - expected = DataFrame([[0.0, 4.0], [8.0, 12.0], [4.0, 5.0], [6.0, np.nan]]) + expected = DataFrame([[0, 4.0], [8, 12.0], [4, 5.0], [6, np.nan]]) tm.assert_frame_equal(result, expected) def test_iloc_getitem_singlerow_slice_categoricaldtype_gives_series(self): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 5f0710dfbb85a..c73849e689df1 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -620,23 +620,27 @@ def test_mask_key(self, obj, key, expected, val, indexer_sli): mask[key] = True obj = obj.copy() + + if is_list_like(val) and len(val) < mask.sum(): + msg = "boolean index did not match indexed array along dimension" + with pytest.raises(IndexError, match=msg): + indexer_sli(obj)[mask] = val + return + indexer_sli(obj)[mask] = val tm.assert_series_equal(obj, expected) def test_series_where(self, obj, key, expected, val, is_inplace): - if is_list_like(val) and len(val) < len(obj): - # Series.where is not valid here - if isinstance(val, range): - return - - # FIXME: The remaining TestSetitemDT64IntoInt that go through here - # are relying on technically-incorrect behavior because Block.where - # uses np.putmask instead of expressions.where in those cases, - # which has different length-checking semantics. - mask = np.zeros(obj.shape, dtype=bool) mask[key] = True + if is_list_like(val) and len(val) < len(obj): + # Series.where is not valid here + msg = "operands could not be broadcast together with shapes" + with pytest.raises(ValueError, match=msg): + obj.where(~mask, val) + return + orig = obj obj = obj.copy() arr = obj._values @@ -1014,6 +1018,39 @@ def inplace(self): return True +@pytest.mark.parametrize( + "val", + [ + np.array([2.0, 3.0]), + np.array([2.5, 3.5]), + np.array([2 ** 65, 2 ** 65 + 1], dtype=np.float64), # all ints, but can't cast + ], +) +class TestSetitemFloatNDarrayIntoIntegerSeries(SetitemCastingEquivalents): + @pytest.fixture + def obj(self): + return Series(range(5), dtype=np.int64) + + @pytest.fixture + def key(self): + return slice(0, 2) + + @pytest.fixture + def inplace(self, val): + # NB: this condition is based on currently-harcoded "val" cases + return val[0] == 2 + + @pytest.fixture + def expected(self, val, inplace): + if inplace: + dtype = np.int64 + else: + dtype = np.float64 + res_values = np.array(range(5), dtype=dtype) + res_values[:2] = val + return Series(res_values) + + def test_setitem_int_as_positional_fallback_deprecation(): # GH#42215 deprecated falling back to positional on __setitem__ with an # int not contained in the index diff --git a/pandas/tests/series/indexing/test_where.py b/pandas/tests/series/indexing/test_where.py index fc9d3a1e1e6ab..4886968cdc245 100644 --- a/pandas/tests/series/indexing/test_where.py +++ b/pandas/tests/series/indexing/test_where.py @@ -88,7 +88,7 @@ def test_where_unsafe(): s = Series(np.arange(10)) mask = s > 5 - msg = "cannot assign mismatch length to masked array" + msg = "cannot set using a list-like indexer with a different length than the value" with pytest.raises(ValueError, match=msg): s[mask] = [5, 4, 3, 2, 1] @@ -161,13 +161,9 @@ def test_where_error(): tm.assert_series_equal(s, expected) # failures - msg = "cannot assign mismatch length to masked array" + msg = "cannot set using a list-like indexer with a different length than the value" with pytest.raises(ValueError, match=msg): s[[True, False]] = [0, 2, 3] - msg = ( - "NumPy boolean array indexing assignment cannot assign 0 input " - "values to the 1 output values where the mask is true" - ) with pytest.raises(ValueError, match=msg): s[[True, False]] = [] @@ -309,7 +305,7 @@ def test_broadcast(size, mask, item, box): ) s = Series(data) - s[selection] = box(item) + s[selection] = item tm.assert_series_equal(s, expected) s = Series(data) From 64b40ca1c18e85bcbb0b7c0a776a4fe7b856a111 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Nov 2021 09:03:05 -0700 Subject: [PATCH 2/3] whatsnew --- doc/source/whatsnew/v1.4.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index a1bb3261d0445..a7d2de0315f68 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -535,6 +535,7 @@ Indexing - Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) - Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`) - Bug in :meth:`DataFrame.__setitem__` incorrectly writing into an existing column's array rather than setting a new array when the new dtype and the old dtype match (:issue:`43406`) +- Bug in setting floating-dtype values into a :class:`Series` with integer dtype failing to set inplace when those values can be losslessly converted to integers (:issue:`44316`) - Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`) - Bug in :meth:`DataFrame.sort_index` where ``ignore_index=True`` was not being respected when the index was already sorted (:issue:`43591`) - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`) From b8c22ff9fa9b97e83bd0bdf0ed17fad630c7af69 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 5 Nov 2021 11:37:27 -0700 Subject: [PATCH 3/3] extra inplace-ness check --- pandas/tests/indexing/multiindex/test_setitem.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexing/multiindex/test_setitem.py b/pandas/tests/indexing/multiindex/test_setitem.py index bd6c41ac07ff2..b97aaf6c551d8 100644 --- a/pandas/tests/indexing/multiindex/test_setitem.py +++ b/pandas/tests/indexing/multiindex/test_setitem.py @@ -196,7 +196,7 @@ def test_multiindex_assignment(self): df.loc[4, "d"] = arr tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d")) - def test_multiindex_assignment_single_dtype(self): + def test_multiindex_assignment_single_dtype(self, using_array_manager): # GH3777 part 2b # single dtype arr = np.array([0.0, 1.0]) @@ -207,12 +207,18 @@ def test_multiindex_assignment_single_dtype(self): index=[[4, 4, 8], [8, 10, 12]], dtype=np.int64, ) + view = df["c"].iloc[:2].values # arr can be losslessly cast to int, so this setitem is inplace df.loc[4, "c"] = arr exp = Series(arr, index=[8, 10], name="c", dtype="int64") result = df.loc[4, "c"] tm.assert_series_equal(result, exp) + if not using_array_manager: + # FIXME(ArrayManager): this correctly preserves dtype, + # but incorrectly is not inplace. + # extra check for inplace-ness + tm.assert_numpy_array_equal(view, exp.values) # arr + 0.5 cannot be cast losslessly to int, so we upcast df.loc[4, "c"] = arr + 0.5