Skip to content

BUG: Special-case setting nan into integer series #54527

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Aug 28, 2023
1 change: 1 addition & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def array_equivalent_object(
right: npt.NDArray[np.object_],
) -> bool: ...
def has_infs(arr: np.ndarray) -> bool: ... # const floating[:]
def has_only_ints_or_nan(arr: np.ndarray) -> bool: ... # const floating[:]
def get_reverse_indexer(
indexer: np.ndarray, # const intp_t[:]
length: int,
Expand Down
16 changes: 16 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,22 @@ def has_infs(floating[:] arr) -> bool:
return ret


@cython.boundscheck(False)
@cython.wraparound(False)
def has_only_ints_or_nan(floating[:] arr) -> bool:
cdef:
floating val
intp_t i

for i in range(len(arr)):
val = arr[i]
if (val != val) or (val == <int64_t>val):
continue
else:
return False
return True
Comment on lines +535 to +546
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So one thing that this doesn't yet do, is checking for the proper range for lower bitwidth integers.

For example setting [1000.0, np.nan] into a int8 Series should still raise the warning because 1000 is too big, but this helper function will now say it are only integers or NaN.



def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len):
cdef:
Py_ssize_t i, n = len(indices)
Expand Down
29 changes: 29 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,10 @@
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
is_float_dtype,
is_integer_dtype,
is_list_like,
is_scalar,
is_string_dtype,
)
from pandas.core.dtypes.dtypes import (
Expand Down Expand Up @@ -454,6 +457,32 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
and will receive the same block
"""
new_dtype = find_result_type(self.values.dtype, other)

# In a future version of pandas, the default will be that
# setting `nan` into an integer series won't raise.
if is_scalar(other) and is_integer_dtype(self.values.dtype):
try:
is_nan = bool(np.isnan(other))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should check for all NA values since we hopefully end up with NA only?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Setting pd.NA or None also cast to float with NaN nowadays, so agreed that those should be catched here as well.

But setting pd.NaT currently casts to object dtype, and that one is maybe not something we want to allow? (i.e. continue to warn for that is fine?) So maybe add a and other is not pd.NaT or something?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I had that earlier, but @phofl brought up that in the future that would also be treated as NA here and so wouldn't upcast

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but before you were only checking for nan with np.isnan, I think? (plus checking for not being NaT)
To be clear I think we should also check in general with isna so we also check for None and pd.NA. I would maybe just continue disallowing NaT (that currently upcasts to object dtype, not to float)

except TypeError:
# ufunc 'isnan' not supported for the input types
# boolean value of NA is ambiguous
is_nan = False
try:
is_nat = np.isnat(other)
except TypeError:
# ufunc 'isnat' is only defined for np.datetime64 and np.timedelta64
is_nat = False
if is_nan and not is_nat:
warn_on_upcast = False
elif (
isinstance(other, np.ndarray)
and other.ndim == 1
and is_integer_dtype(self.values.dtype)
and is_float_dtype(other.dtype)
and lib.has_only_ints_or_nan(other)
):
warn_on_upcast = False

if warn_on_upcast:
warnings.warn(
f"Setting an item of incompatible dtype is deprecated "
Expand Down
10 changes: 2 additions & 8 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,18 +337,12 @@ def test_setitem(self, float_frame, using_copy_on_write):
def test_setitem2(self):
# dtype changing GH4204
df = DataFrame([[0, 0]])
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
):
df.iloc[0] = np.nan
df.iloc[0] = np.nan
expected = DataFrame([[np.nan, np.nan]])
tm.assert_frame_equal(df, expected)

df = DataFrame([[0, 0]])
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
):
df.loc[0] = np.nan
df.loc[0] = np.nan
tm.assert_frame_equal(df, expected)

def test_setitem_boolean(self, float_frame):
Expand Down
16 changes: 4 additions & 12 deletions pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,8 +830,7 @@ def test_coercion_with_loc(self, expected):
start_data, expected_result, warn = expected

start_dataframe = DataFrame({"foo": start_data})
with tm.assert_produces_warning(warn, match="incompatible dtype"):
start_dataframe.loc[0, ["foo"]] = None
start_dataframe.loc[0, ["foo"]] = None

expected_dataframe = DataFrame({"foo": expected_result})
tm.assert_frame_equal(start_dataframe, expected_dataframe)
Expand All @@ -841,8 +840,7 @@ def test_coercion_with_setitem_and_dataframe(self, expected):
start_data, expected_result, warn = expected

start_dataframe = DataFrame({"foo": start_data})
with tm.assert_produces_warning(warn, match="incompatible dtype"):
start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None

expected_dataframe = DataFrame({"foo": expected_result})
tm.assert_frame_equal(start_dataframe, expected_dataframe)
Expand All @@ -852,10 +850,7 @@ def test_none_coercion_loc_and_dataframe(self, expected):
start_data, expected_result, warn = expected

start_dataframe = DataFrame({"foo": start_data})
with tm.assert_produces_warning(warn, match="incompatible dtype"):
start_dataframe.loc[
start_dataframe["foo"] == start_dataframe["foo"][0]
] = None
start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None

expected_dataframe = DataFrame({"foo": expected_result})
tm.assert_frame_equal(start_dataframe, expected_dataframe)
Expand All @@ -869,10 +864,7 @@ def test_none_coercion_mixed_dtypes(self):
"d": ["a", "b", "c"],
}
)
with tm.assert_produces_warning(
FutureWarning, match="item of incompatible dtype"
):
start_dataframe.iloc[0] = None
start_dataframe.iloc[0] = None

exp = DataFrame(
{
Expand Down
27 changes: 11 additions & 16 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,11 @@ def test_setitem_series_object_dtype(self, indexer, ser_index):
expected = Series([Series([42], index=[ser_index]), 0], dtype="object")
tm.assert_series_equal(ser, expected)

@pytest.mark.parametrize(
"index, exp_value, warn", [(0, 42, None), (1, np.nan, FutureWarning)]
)
def test_setitem_series(self, index, exp_value, warn):
@pytest.mark.parametrize("index, exp_value", [(0, 42), (1, np.nan)])
def test_setitem_series(self, index, exp_value):
# GH#38303
ser = Series([0, 0])
with tm.assert_produces_warning(warn, match="item of incompatible dtype"):
ser.loc[0] = Series([42], index=[index])
ser.loc[0] = Series([42], index=[index])
expected = Series([exp_value, 0])
tm.assert_series_equal(ser, expected)

Expand Down Expand Up @@ -575,15 +572,15 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype):
[
(NA, NA, "Int64", "Int64", 1, None),
(NA, NA, "Int64", "Int64", 2, None),
(NA, np.nan, "int64", "float64", 1, FutureWarning),
(NA, np.nan, "int64", "float64", 1, None),
(NA, np.nan, "int64", "float64", 2, None),
(NaT, NaT, "int64", "object", 1, FutureWarning),
(NaT, NaT, "int64", "object", 2, None),
(np.nan, NA, "Int64", "Int64", 1, None),
(np.nan, NA, "Int64", "Int64", 2, None),
(np.nan, NA, "Float64", "Float64", 1, None),
(np.nan, NA, "Float64", "Float64", 2, None),
(np.nan, np.nan, "int64", "float64", 1, FutureWarning),
(np.nan, np.nan, "int64", "float64", 1, None),
(np.nan, np.nan, "int64", "float64", 2, None),
],
)
Expand Down Expand Up @@ -884,7 +881,7 @@ def test_index_putmask(self, obj, key, expected, warn, val):
Series([2, 3, 4, 5, 6, 7, 8, 9, 10]),
Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]),
slice(None, None, 2),
FutureWarning,
None,
id="int_series_slice_key_step",
),
pytest.param(
Expand All @@ -899,15 +896,15 @@ def test_index_putmask(self, obj, key, expected, warn, val):
Series(np.arange(10)),
Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]),
slice(None, 5),
FutureWarning,
None,
id="int_series_slice_key",
),
pytest.param(
# changes dtype GH#4463
Series([1, 2, 3]),
Series([np.nan, 2, 3]),
0,
FutureWarning,
None,
id="int_series_int_key",
),
pytest.param(
Expand Down Expand Up @@ -1134,7 +1131,7 @@ def warn(self):
"obj,expected,warn",
[
# For numeric series, we should coerce to NaN.
(Series([1, 2, 3]), Series([np.nan, 2, 3]), FutureWarning),
(Series([1, 2, 3]), Series([np.nan, 2, 3]), None),
(Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None),
# For datetime series, we should coerce to NaT.
(
Expand Down Expand Up @@ -1584,13 +1581,11 @@ def test_20643_comment():
expected = Series([np.nan, 1, 2], index=["a", "b", "c"])

ser = orig.copy()
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
ser.iat[0] = None
ser.iat[0] = None
tm.assert_series_equal(ser, expected)

ser = orig.copy()
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
ser.iloc[0] = None
ser.iloc[0] = None
tm.assert_series_equal(ser, expected)


Expand Down
12 changes: 1 addition & 11 deletions pandas/tests/series/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,17 +206,7 @@ def test_convert_dtypes(
# Test that it is a copy
copy = series.copy(deep=True)

if result.notna().sum() > 0 and result.dtype in [
"int8",
"uint8",
"int16",
"uint16",
"int32",
"uint32",
"int64",
"uint64",
"interval[int64, right]",
]:
if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]:
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
result[result.notna()] = np.nan
else:
Expand Down