Skip to content

BUG: failure to cast all-int floating dtypes when setting into int dtypes #44316

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.4.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,7 @@ Indexing
- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`)
- Bug in indexing on a non-unique object-dtype :class:`Index` with an NA scalar (e.g. ``np.nan``) (:issue:`43711`)
- Bug in :meth:`DataFrame.__setitem__` incorrectly writing into an existing column's array rather than setting a new array when the new dtype and the old dtype match (:issue:`43406`)
- Bug in setting floating-dtype values into a :class:`Series` with integer dtype failing to set inplace when those values can be losslessly converted to integers (:issue:`44316`)
- Bug in :meth:`Series.__setitem__` with object dtype when setting an array with matching size and dtype='datetime64[ns]' or dtype='timedelta64[ns]' incorrectly converting the datetime/timedeltas to integers (:issue:`43868`)
- Bug in :meth:`DataFrame.sort_index` where ``ignore_index=True`` was not being respected when the index was already sorted (:issue:`43591`)
- Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.datetime64("NaT")`` and ``np.timedelta64("NaT")`` (:issue:`43869`)
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -2205,6 +2205,14 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
if tipo.kind not in ["i", "u"]:
if is_float(element) and element.is_integer():
return True

if isinstance(element, np.ndarray) and element.dtype.kind == "f":
# If all can be losslessly cast to integers, then we can hold them
# We do something similar in putmask_smart
casted = element.astype(dtype)
comp = casted == element
return comp.all()

# Anything other than integer we cannot hold
return False
elif dtype.itemsize < tipo.itemsize:
Expand Down
12 changes: 12 additions & 0 deletions pandas/core/indexes/numeric.py
Original file line number Diff line number Diff line change
Expand Up @@ -425,6 +425,18 @@ def asi8(self) -> npt.NDArray[np.int64]:
)
return self._values.view(self._default_dtype)

def _validate_fill_value(self, value):
    """
    Validate ``value`` for setting, casting all-integral floats to ``self.dtype``.

    The parent class's validation runs first (its return value is not used
    here). If ``value`` exposes a floating ``dtype`` it is cast to
    ``self.dtype``; the cast result is returned only when it compares equal
    to the original everywhere, i.e. the conversion lost nothing.

    Parameters
    ----------
    value : object
        Candidate value. Only objects with a floating ``dtype`` attribute
        are converted; anything else is returned unchanged.

    Returns
    -------
    object
        ``value`` cast to ``self.dtype`` when it is floating and the cast
        is lossless, otherwise ``value`` itself.

    Raises
    ------
    TypeError
        If ``value`` has a floating dtype and the cast to ``self.dtype``
        does not round-trip exactly.
    """
    # e.g. np.array([1.0]) we want np.array([1], dtype=self.dtype)
    #  see TestSetitemFloatNDarrayIntoIntegerSeries
    super()._validate_fill_value(value)
    if hasattr(value, "dtype") and is_float_dtype(value.dtype):
        converted = value.astype(self.dtype)
        if (converted == value).all():
            # See also: can_hold_element
            return converted
        # Same exception type as before; the message only aids debugging
        # if the error ever surfaces to a user.
        raise TypeError(
            "Cannot losslessly cast floating-dtype values to this dtype"
        )
    return value


class Int64Index(IntegerIndex):
_index_descr_args = {
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1193,6 +1193,14 @@ def where(self, other, cond) -> list[Block]:
values, icond.sum(), other # type: ignore[arg-type]
)
if alt is not other:
if is_list_like(other) and len(other) < len(values):
# call np.where with other to get the appropriate ValueError
np.where(~icond, values, other)
raise NotImplementedError(
"This should not be reached; call to np.where above is "
"expected to raise ValueError. Please report a bug at "
"github.com/pandas-dev/pandas"
)
result = values.copy()
np.putmask(result, icond, alt)
else:
Expand Down
13 changes: 13 additions & 0 deletions pandas/tests/dtypes/cast/test_can_hold_element.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,16 @@ def test_can_hold_element_range(any_int_numpy_dtype):
rng = range(10 ** 10, 10 ** 10)
assert len(rng) == 0
assert can_hold_element(arr, rng)


def test_can_hold_element_int_values_float_ndarray():
    """
    Check ``can_hold_element`` for float64 ndarrays against an int64 array.

    A float ndarray is holdable only when every entry converts to int64
    without loss.
    """
    int64_holder = np.array([], dtype=np.int64)

    # Whole-number floats can be held.
    whole_floats = np.array([1.0, 2.0])
    assert can_hold_element(int64_holder, whole_floats)

    # Fractional values cannot.
    fractional = whole_floats + 0.5
    assert not can_hold_element(int64_holder, fractional)

    # integer but not losslessly castable to int64
    too_large = np.array([3, 2 ** 65], dtype=np.float64)
    assert not can_hold_element(int64_holder, too_large)
23 changes: 21 additions & 2 deletions pandas/tests/indexing/multiindex/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,16 +196,35 @@ def test_multiindex_assignment(self):
df.loc[4, "d"] = arr
tm.assert_series_equal(df.loc[4, "d"], Series(arr, index=[8, 10], name="d"))

def test_multiindex_assignment_single_dtype(self, using_array_manager):
# GH3777 part 2b
# single dtype
arr = np.array([0.0, 1.0])

df = DataFrame(
np.random.randint(5, 10, size=9).reshape(3, 3),
columns=list("abc"),
index=[[4, 4, 8], [8, 10, 12]],
dtype=np.int64,
)
view = df["c"].iloc[:2].values

# arr can be losslessly cast to int, so this setitem is inplace
df.loc[4, "c"] = arr
exp = Series(arr, index=[8, 10], name="c", dtype="float64")
tm.assert_series_equal(df.loc[4, "c"], exp)
exp = Series(arr, index=[8, 10], name="c", dtype="int64")
result = df.loc[4, "c"]
tm.assert_series_equal(result, exp)
if not using_array_manager:
# FIXME(ArrayManager): this correctly preserves dtype,
# but incorrectly is not inplace.
# extra check for inplace-ness
tm.assert_numpy_array_equal(view, exp.values)

# arr + 0.5 cannot be cast losslessly to int, so we upcast
df.loc[4, "c"] = arr + 0.5
result = df.loc[4, "c"]
exp = exp + 0.5
tm.assert_series_equal(result, exp)

# scalar ok
df.loc[4, "c"] = 10
Expand Down
11 changes: 10 additions & 1 deletion pandas/tests/indexing/test_iloc.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,9 +515,18 @@ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(
# but on a DataFrame with multiple blocks
df = DataFrame([[0, 1], [2, 3]], columns=["B", "B"])

# setting float values that can be held by existing integer arrays
# is inplace
df.iloc[:, 0] = df.iloc[:, 0].astype("f8")
if not using_array_manager:
assert len(df._mgr.blocks) == 1

# if the assigned values cannot be held by existing integer arrays,
# we cast
df.iloc[:, 0] = df.iloc[:, 0] + 0.5
if not using_array_manager:
assert len(df._mgr.blocks) == 2

expected = df.copy()

# assign back to self
Expand Down Expand Up @@ -892,7 +901,7 @@ def test_iloc_with_boolean_operation(self):
tm.assert_frame_equal(result, expected)

result.iloc[[False, False, True, True]] /= 2
expected = DataFrame([[0.0, 4.0], [8.0, 12.0], [4.0, 5.0], [6.0, np.nan]])
expected = DataFrame([[0, 4.0], [8, 12.0], [4, 5.0], [6, np.nan]])
tm.assert_frame_equal(result, expected)

def test_iloc_getitem_singlerow_slice_categoricaldtype_gives_series(self):
Expand Down
57 changes: 47 additions & 10 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -620,23 +620,27 @@ def test_mask_key(self, obj, key, expected, val, indexer_sli):
mask[key] = True

obj = obj.copy()

if is_list_like(val) and len(val) < mask.sum():
msg = "boolean index did not match indexed array along dimension"
with pytest.raises(IndexError, match=msg):
indexer_sli(obj)[mask] = val
return

indexer_sli(obj)[mask] = val
tm.assert_series_equal(obj, expected)

def test_series_where(self, obj, key, expected, val, is_inplace):
if is_list_like(val) and len(val) < len(obj):
# Series.where is not valid here
if isinstance(val, range):
return

# FIXME: The remaining TestSetitemDT64IntoInt that go through here
# are relying on technically-incorrect behavior because Block.where
# uses np.putmask instead of expressions.where in those cases,
# which has different length-checking semantics.

mask = np.zeros(obj.shape, dtype=bool)
mask[key] = True

if is_list_like(val) and len(val) < len(obj):
# Series.where is not valid here
msg = "operands could not be broadcast together with shapes"
with pytest.raises(ValueError, match=msg):
obj.where(~mask, val)
return

orig = obj
obj = obj.copy()
arr = obj._values
Expand Down Expand Up @@ -1014,6 +1018,39 @@ def inplace(self):
return True


@pytest.mark.parametrize(
    "val",
    [
        np.array([2.0, 3.0]),
        np.array([2.5, 3.5]),
        np.array([2 ** 65, 2 ** 65 + 1], dtype=np.float64),  # all ints, but can't cast
    ],
)
class TestSetitemFloatNDarrayIntoIntegerSeries(SetitemCastingEquivalents):
    """
    Set a two-element float64 ndarray into the first two slots of an int64 Series.

    Of the parametrized ``val`` cases only the first is treated as an
    inplace set that keeps int64; for the others the expected result is
    float64.
    """

    @pytest.fixture
    def obj(self):
        # int64 Series holding 0..4
        values = range(5)
        return Series(values, dtype=np.int64)

    @pytest.fixture
    def key(self):
        # the first two positions
        return slice(0, 2)

    @pytest.fixture
    def inplace(self, val):
        # NB: this condition is based on currently-hardcoded "val" cases
        first = val[0]
        return first == 2

    @pytest.fixture
    def expected(self, val, inplace):
        # int64 is kept for the inplace case, float64 is used otherwise
        dtype = np.int64 if inplace else np.float64
        res_values = np.array(range(5), dtype=dtype)
        res_values[:2] = val
        return Series(res_values)


def test_setitem_int_as_positional_fallback_deprecation():
# GH#42215 deprecated falling back to positional on __setitem__ with an
# int not contained in the index
Expand Down