Skip to content

BUG: Special-case setting nan into integer series #54527

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Aug 28, 2023
1 change: 1 addition & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ def array_equivalent_object(
right: npt.NDArray[np.object_],
) -> bool: ...
def has_infs(arr: np.ndarray) -> bool: ... # const floating[:]
def has_only_ints_or_nan(arr: np.ndarray) -> bool: ... # const floating[:]
def get_reverse_indexer(
indexer: np.ndarray, # const intp_t[:]
length: int,
Expand Down
16 changes: 16 additions & 0 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,22 @@ def has_infs(floating[:] arr) -> bool:
return ret


@cython.boundscheck(False)
@cython.wraparound(False)
def has_only_ints_or_nan(floating[:] arr) -> bool:
cdef:
floating val
intp_t i

for i in range(len(arr)):
val = arr[i]
if (val != val) or (val == <int64_t>val):
continue
else:
return False
return True
Comment on lines +535 to +546
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So one thing that this doesn't yet do, is checking for the proper range for lower bitwidth integers.

For example setting [1000.0, np.nan] into a int8 Series should still raise the warning because 1000 is too big, but this helper function will now say it are only integers or NaN.



def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len):
cdef:
Py_ssize_t i, n = len(indices)
Expand Down
23 changes: 23 additions & 0 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from pandas._config import using_copy_on_write

from pandas._libs import (
NaT,
internals as libinternals,
lib,
writers,
Expand Down Expand Up @@ -59,7 +60,10 @@
from pandas.core.dtypes.common import (
ensure_platform_int,
is_1d_only_ea_dtype,
is_float_dtype,
is_integer_dtype,
is_list_like,
is_scalar,
is_string_dtype,
)
from pandas.core.dtypes.dtypes import (
Expand Down Expand Up @@ -453,6 +457,25 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
and will receive the same block
"""
new_dtype = find_result_type(self.values.dtype, other)

# In a future version of pandas, the default will be that
# setting `nan` into an integer series won't raise.
if (
is_scalar(other)
and is_integer_dtype(self.values.dtype)
and isna(other)
and other is not NaT
):
warn_on_upcast = False
elif (
isinstance(other, np.ndarray)
and other.ndim == 1
and is_integer_dtype(self.values.dtype)
and is_float_dtype(other.dtype)
and lib.has_only_ints_or_nan(other)
):
warn_on_upcast = False

if warn_on_upcast:
warnings.warn(
f"Setting an item of incompatible dtype is deprecated "
Expand Down
30 changes: 13 additions & 17 deletions pandas/tests/frame/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,18 +337,12 @@ def test_setitem(self, float_frame, using_copy_on_write):
def test_setitem2(self):
# dtype changing GH4204
df = DataFrame([[0, 0]])
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
):
df.iloc[0] = np.nan
df.iloc[0] = np.nan
expected = DataFrame([[np.nan, np.nan]])
tm.assert_frame_equal(df, expected)

df = DataFrame([[0, 0]])
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
):
df.loc[0] = np.nan
df.loc[0] = np.nan
tm.assert_frame_equal(df, expected)

def test_setitem_boolean(self, float_frame):
Expand Down Expand Up @@ -1579,9 +1573,7 @@ def test_setitem(self, uint64_frame):
# With NaN: because uint64 has no NaN element,
# the column should be cast to object.
df2 = df.copy()
with tm.assert_produces_warning(
FutureWarning, match="Setting an item of incompatible dtype"
):
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
df2.iloc[1, 1] = pd.NaT
df2.iloc[1, 2] = pd.NaT
result = df2["B"]
Expand Down Expand Up @@ -1901,19 +1893,19 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key):
class TestSetitemValidation:
# This is adapted from pandas/tests/arrays/masked/test_indexing.py
# but checks for warnings instead of errors.
def _check_setitem_invalid(self, df, invalid, indexer):
def _check_setitem_invalid(self, df, invalid, indexer, warn):
msg = "Setting an item of incompatible dtype is deprecated"
msg = re.escape(msg)

orig_df = df.copy()

# iloc
with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
df.iloc[indexer, 0] = invalid
df = orig_df.copy()

# loc
with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
df.loc[indexer, "a"] = invalid
df = orig_df.copy()

Expand All @@ -1934,16 +1926,20 @@ def _check_setitem_invalid(self, df, invalid, indexer):
@pytest.mark.parametrize("indexer", _indexers)
def test_setitem_validation_scalar_bool(self, invalid, indexer):
df = DataFrame({"a": [True, False, False]}, dtype="bool")
self._check_setitem_invalid(df, invalid, indexer)
self._check_setitem_invalid(df, invalid, indexer, FutureWarning)

@pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)])
@pytest.mark.parametrize("indexer", _indexers)
def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer):
df = DataFrame({"a": [1, 2, 3]}, dtype=any_int_numpy_dtype)
self._check_setitem_invalid(df, invalid, indexer)
if isna(invalid) and invalid is not pd.NaT:
warn = None
else:
warn = FutureWarning
self._check_setitem_invalid(df, invalid, indexer, warn)

@pytest.mark.parametrize("invalid", _invalid_scalars + [True])
@pytest.mark.parametrize("indexer", _indexers)
def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer):
df = DataFrame({"a": [1, 2, None]}, dtype=float_numpy_dtype)
self._check_setitem_invalid(df, invalid, indexer)
self._check_setitem_invalid(df, invalid, indexer, FutureWarning)
16 changes: 4 additions & 12 deletions pandas/tests/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -830,8 +830,7 @@ def test_coercion_with_loc(self, expected):
start_data, expected_result, warn = expected

start_dataframe = DataFrame({"foo": start_data})
with tm.assert_produces_warning(warn, match="incompatible dtype"):
start_dataframe.loc[0, ["foo"]] = None
start_dataframe.loc[0, ["foo"]] = None

expected_dataframe = DataFrame({"foo": expected_result})
tm.assert_frame_equal(start_dataframe, expected_dataframe)
Expand All @@ -841,8 +840,7 @@ def test_coercion_with_setitem_and_dataframe(self, expected):
start_data, expected_result, warn = expected

start_dataframe = DataFrame({"foo": start_data})
with tm.assert_produces_warning(warn, match="incompatible dtype"):
start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None

expected_dataframe = DataFrame({"foo": expected_result})
tm.assert_frame_equal(start_dataframe, expected_dataframe)
Expand All @@ -852,10 +850,7 @@ def test_none_coercion_loc_and_dataframe(self, expected):
start_data, expected_result, warn = expected

start_dataframe = DataFrame({"foo": start_data})
with tm.assert_produces_warning(warn, match="incompatible dtype"):
start_dataframe.loc[
start_dataframe["foo"] == start_dataframe["foo"][0]
] = None
start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None

expected_dataframe = DataFrame({"foo": expected_result})
tm.assert_frame_equal(start_dataframe, expected_dataframe)
Expand All @@ -869,10 +864,7 @@ def test_none_coercion_mixed_dtypes(self):
"d": ["a", "b", "c"],
}
)
with tm.assert_produces_warning(
FutureWarning, match="item of incompatible dtype"
):
start_dataframe.iloc[0] = None
start_dataframe.iloc[0] = None

exp = DataFrame(
{
Expand Down
21 changes: 13 additions & 8 deletions pandas/tests/series/indexing/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
Timestamp,
concat,
date_range,
isna,
period_range,
timedelta_range,
)
Expand Down Expand Up @@ -456,25 +457,25 @@ def test_setitem_dict_and_set_disallowed_multiindex(self, key):
class TestSetitemValidation:
# This is adapted from pandas/tests/arrays/masked/test_indexing.py
# but checks for warnings instead of errors.
def _check_setitem_invalid(self, ser, invalid, indexer):
def _check_setitem_invalid(self, ser, invalid, indexer, warn):
msg = "Setting an item of incompatible dtype is deprecated"
msg = re.escape(msg)

orig_ser = ser.copy()

with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
ser[indexer] = invalid
ser = orig_ser.copy()

with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
ser.iloc[indexer] = invalid
ser = orig_ser.copy()

with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
ser.loc[indexer] = invalid
ser = orig_ser.copy()

with tm.assert_produces_warning(FutureWarning, match=msg):
with tm.assert_produces_warning(warn, match=msg):
ser[:] = invalid

_invalid_scalars = [
Expand All @@ -494,16 +495,20 @@ def _check_setitem_invalid(self, ser, invalid, indexer):
@pytest.mark.parametrize("indexer", _indexers)
def test_setitem_validation_scalar_bool(self, invalid, indexer):
ser = Series([True, False, False], dtype="bool")
self._check_setitem_invalid(ser, invalid, indexer)
self._check_setitem_invalid(ser, invalid, indexer, FutureWarning)

@pytest.mark.parametrize("invalid", _invalid_scalars + [True, 1.5, np.float64(1.5)])
@pytest.mark.parametrize("indexer", _indexers)
def test_setitem_validation_scalar_int(self, invalid, any_int_numpy_dtype, indexer):
ser = Series([1, 2, 3], dtype=any_int_numpy_dtype)
self._check_setitem_invalid(ser, invalid, indexer)
if isna(invalid) and invalid is not NaT:
warn = None
else:
warn = FutureWarning
self._check_setitem_invalid(ser, invalid, indexer, warn)

@pytest.mark.parametrize("invalid", _invalid_scalars + [True])
@pytest.mark.parametrize("indexer", _indexers)
def test_setitem_validation_scalar_float(self, invalid, float_numpy_dtype, indexer):
ser = Series([1, 2, None], dtype=float_numpy_dtype)
self._check_setitem_invalid(ser, invalid, indexer)
self._check_setitem_invalid(ser, invalid, indexer, FutureWarning)
29 changes: 12 additions & 17 deletions pandas/tests/series/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,14 +191,11 @@ def test_setitem_series_object_dtype(self, indexer, ser_index):
expected = Series([Series([42], index=[ser_index]), 0], dtype="object")
tm.assert_series_equal(ser, expected)

@pytest.mark.parametrize(
"index, exp_value, warn", [(0, 42, None), (1, np.nan, FutureWarning)]
)
def test_setitem_series(self, index, exp_value, warn):
@pytest.mark.parametrize("index, exp_value", [(0, 42), (1, np.nan)])
def test_setitem_series(self, index, exp_value):
# GH#38303
ser = Series([0, 0])
with tm.assert_produces_warning(warn, match="item of incompatible dtype"):
ser.loc[0] = Series([42], index=[index])
ser.loc[0] = Series([42], index=[index])
expected = Series([exp_value, 0])
tm.assert_series_equal(ser, expected)

Expand Down Expand Up @@ -575,15 +572,15 @@ def test_setitem_keep_precision(self, any_numeric_ea_dtype):
[
(NA, NA, "Int64", "Int64", 1, None),
(NA, NA, "Int64", "Int64", 2, None),
(NA, np.nan, "int64", "float64", 1, FutureWarning),
(NA, np.nan, "int64", "float64", 1, None),
(NA, np.nan, "int64", "float64", 2, None),
(NaT, NaT, "int64", "object", 1, FutureWarning),
(NaT, NaT, "int64", "object", 2, None),
(np.nan, NA, "Int64", "Int64", 1, None),
(np.nan, NA, "Int64", "Int64", 2, None),
(np.nan, NA, "Float64", "Float64", 1, None),
(np.nan, NA, "Float64", "Float64", 2, None),
(np.nan, np.nan, "int64", "float64", 1, FutureWarning),
(np.nan, np.nan, "int64", "float64", 1, None),
(np.nan, np.nan, "int64", "float64", 2, None),
],
)
Expand All @@ -592,7 +589,7 @@ def test_setitem_enlarge_with_na(
):
# GH#32346
ser = Series([1, 2], dtype=dtype)
with tm.assert_produces_warning(warn, match="item of incompatible dtype"):
with tm.assert_produces_warning(warn, match="incompatible dtype"):
ser[indexer] = na
expected_values = [1, target_na] if indexer == 1 else [1, 2, target_na]
expected = Series(expected_values, dtype=target_dtype)
Expand Down Expand Up @@ -884,7 +881,7 @@ def test_index_putmask(self, obj, key, expected, warn, val):
Series([2, 3, 4, 5, 6, 7, 8, 9, 10]),
Series([np.nan, 3, np.nan, 5, np.nan, 7, np.nan, 9, np.nan]),
slice(None, None, 2),
FutureWarning,
None,
id="int_series_slice_key_step",
),
pytest.param(
Expand All @@ -899,15 +896,15 @@ def test_index_putmask(self, obj, key, expected, warn, val):
Series(np.arange(10)),
Series([np.nan, np.nan, np.nan, np.nan, np.nan, 5, 6, 7, 8, 9]),
slice(None, 5),
FutureWarning,
None,
id="int_series_slice_key",
),
pytest.param(
# changes dtype GH#4463
Series([1, 2, 3]),
Series([np.nan, 2, 3]),
0,
FutureWarning,
None,
id="int_series_int_key",
),
pytest.param(
Expand Down Expand Up @@ -1134,7 +1131,7 @@ def warn(self):
"obj,expected,warn",
[
# For numeric series, we should coerce to NaN.
(Series([1, 2, 3]), Series([np.nan, 2, 3]), FutureWarning),
(Series([1, 2, 3]), Series([np.nan, 2, 3]), None),
(Series([1.0, 2.0, 3.0]), Series([np.nan, 2.0, 3.0]), None),
# For datetime series, we should coerce to NaT.
(
Expand Down Expand Up @@ -1584,13 +1581,11 @@ def test_20643_comment():
expected = Series([np.nan, 1, 2], index=["a", "b", "c"])

ser = orig.copy()
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
ser.iat[0] = None
ser.iat[0] = None
tm.assert_series_equal(ser, expected)

ser = orig.copy()
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
ser.iloc[0] = None
ser.iloc[0] = None
tm.assert_series_equal(ser, expected)


Expand Down
12 changes: 1 addition & 11 deletions pandas/tests/series/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,17 +206,7 @@ def test_convert_dtypes(
# Test that it is a copy
copy = series.copy(deep=True)

if result.notna().sum() > 0 and result.dtype in [
"int8",
"uint8",
"int16",
"uint16",
"int32",
"uint32",
"int64",
"uint64",
"interval[int64, right]",
]:
if result.notna().sum() > 0 and result.dtype in ["interval[int64, right]"]:
with tm.assert_produces_warning(FutureWarning, match="incompatible dtype"):
result[result.notna()] = np.nan
else:
Expand Down