Skip to content

Commit 6dff995

Browse files
authored
BUG: DataFrame(floatdata, dtype=inty) does unsafe casting (#41578)
1 parent 7424f8a commit 6dff995

File tree

5 files changed

+64
-9
lines changed

5 files changed

+64
-9
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -834,6 +834,7 @@ Missing
834834
- Bug in :func:`isna`, and :meth:`Series.isna`, :meth:`Index.isna`, :meth:`DataFrame.isna` (and the corresponding ``notna`` functions) not recognizing ``Decimal("NaN")`` objects (:issue:`39409`)
835835
- Bug in :meth:`DataFrame.fillna` not accepting dictionary for ``downcast`` keyword (:issue:`40809`)
836836
- Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`)
837+
- Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting the ``NaN`` to a meaningless integer instead of retaining it (:issue:`26919`)
837838

838839
MultiIndex
839840
^^^^^^^^^^

pandas/core/dtypes/cast.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
DtypeObj,
4141
Scalar,
4242
)
43+
from pandas.errors import IntCastingNaNError
4344
from pandas.util._exceptions import find_stack_level
4445
from pandas.util._validators import validate_bool_kwarg
4546

@@ -1167,9 +1168,7 @@ def astype_nansafe(
11671168
raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")
11681169

11691170
elif np.issubdtype(arr.dtype, np.floating) and np.issubdtype(dtype, np.integer):
1170-
1171-
if not np.isfinite(arr).all():
1172-
raise ValueError("Cannot convert non-finite values (NA or inf) to integer")
1171+
return astype_float_to_int_nansafe(arr, dtype, copy)
11731172

11741173
elif is_object_dtype(arr):
11751174

@@ -1207,6 +1206,19 @@ def astype_nansafe(
12071206
return arr.astype(dtype, copy=copy)
12081207

12091208

1209+
def astype_float_to_int_nansafe(
1210+
values: np.ndarray, dtype: np.dtype, copy: bool
1211+
) -> np.ndarray:
1212+
"""
1213+
astype with a check preventing converting NaN to a meaningless integer value.
1214+
"""
1215+
if not np.isfinite(values).all():
1216+
raise IntCastingNaNError(
1217+
"Cannot convert non-finite values (NA or inf) to integer"
1218+
)
1219+
return values.astype(dtype, copy=copy)
1220+
1221+
12101222
def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> ArrayLike:
12111223
"""
12121224
Cast array (ndarray or ExtensionArray) to the new dtype.
@@ -1946,6 +1958,17 @@ def construct_1d_ndarray_preserving_na(
19461958
):
19471959
# TODO(numpy#12550): special-case can be removed
19481960
subarr = construct_1d_object_array_from_listlike(list(values))
1961+
elif (
1962+
dtype is not None
1963+
and dtype.kind in ["i", "u"]
1964+
and isinstance(values, np.ndarray)
1965+
and values.dtype.kind == "f"
1966+
):
1967+
# error: Argument 2 to "astype_float_to_int_nansafe" has incompatible
1968+
# type "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]"
1969+
return astype_float_to_int_nansafe(
1970+
values, dtype, copy=copy # type: ignore[arg-type]
1971+
)
19491972
else:
19501973
# error: Argument "dtype" to "array" has incompatible type
19511974
# "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any],

pandas/core/internals/construction.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
DtypeObj,
2222
Manager,
2323
)
24+
from pandas.errors import IntCastingNaNError
2425

2526
from pandas.core.dtypes.cast import (
2627
construct_1d_arraylike_from_scalar,
@@ -315,10 +316,11 @@ def ndarray_to_mgr(
315316
values = construct_1d_ndarray_preserving_na(
316317
flat, dtype=dtype, copy=False
317318
)
318-
except Exception as err:
319-
# e.g. ValueError when trying to cast object dtype to float64
320-
msg = f"failed to cast to '{dtype}' (Exception was: {err})"
321-
raise ValueError(msg) from err
319+
except IntCastingNaNError:
320+
# following Series, we ignore the dtype and retain floating
321+
# values instead of casting nans to meaningless ints
322+
pass
323+
322324
values = values.reshape(shape)
323325

324326
# _prep_ndarray ensures that values.ndim == 2 at this point

pandas/errors/__init__.py

+9
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@
1212
)
1313

1414

15+
class IntCastingNaNError(ValueError):
16+
"""
17+
Raised when attempting an astype operation on an array with NaN to an integer
18+
dtype.
19+
"""
20+
21+
pass
22+
23+
1524
class NullFrequencyError(ValueError):
1625
"""
1726
Error raised when a null `freq` attribute is used in an operation

pandas/tests/frame/test_constructors.py

+22-2
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,18 @@
6666

6767

6868
class TestDataFrameConstructors:
69+
def test_construct_ndarray_with_nas_and_int_dtype(self):
70+
# GH#26919 match Series by not casting np.nan to meaningless int
71+
arr = np.array([[1, np.nan], [2, 3]])
72+
df = DataFrame(arr, dtype="i8")
73+
assert df.values.dtype == arr.dtype
74+
assert isna(df.iloc[0, 1])
75+
76+
# check this matches Series behavior
77+
ser = Series(arr[0], dtype="i8", name=0)
78+
expected = df.iloc[0]
79+
tm.assert_series_equal(ser, expected)
80+
6981
def test_construct_from_list_of_datetimes(self):
7082
df = DataFrame([datetime.now(), datetime.now()])
7183
assert df[0].dtype == np.dtype("M8[ns]")
@@ -851,9 +863,17 @@ def _check_basic_constructor(self, empty):
851863
assert len(frame.index) == 3
852864
assert len(frame.columns) == 1
853865

854-
# cast type
855866
frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
856-
assert frame.values.dtype == np.int64
867+
if empty is np.ones:
868+
# passing dtype casts
869+
assert frame.values.dtype == np.int64
870+
else:
871+
# i.e. ma.masked_all
872+
# Since we have NaNs, refuse to cast to int dtype, which would take NaN
873+
# to meaningless integers. This matches Series behavior. GH#26919
874+
assert frame.isna().all().all()
875+
assert frame.values.dtype == np.float64
876+
assert isna(frame.values).all()
857877

858878
# wrong size axis labels
859879
msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"

0 commit comments

Comments
 (0)