Skip to content

Commit d53dfd8

Browse files
jbrockmendel authored and JulianWgs committed
DEPR: DataFrame(floaty, dtype=inty) match Series (pandas-dev#41770)
1 parent deb471a commit d53dfd8

File tree

10 files changed

+90
-27
lines changed

10 files changed

+90
-27
lines changed

asv_bench/benchmarks/frame_methods.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -652,7 +652,9 @@ class Rank:
652652
]
653653

654654
def setup(self, dtype):
655-
self.df = DataFrame(np.random.randn(10000, 10), columns=range(10), dtype=dtype)
655+
self.df = DataFrame(
656+
np.random.randn(10000, 10).astype(dtype), columns=range(10), dtype=dtype
657+
)
656658

657659
def time_rank(self, dtype):
658660
self.df.rank()

doc/source/whatsnew/v1.3.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -700,6 +700,7 @@ Deprecations
700700
- Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`)
701701
- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`)
702702
- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`)
703+
- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`)
703704
- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`)
704705
- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`)
705706
- Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`)

pandas/core/construction.py

+18
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@
2424
Dtype,
2525
DtypeObj,
2626
)
27+
from pandas.errors import IntCastingNaNError
2728

2829
from pandas.core.dtypes.base import (
2930
ExtensionDtype,
@@ -511,7 +512,24 @@ def sanitize_array(
511512
# possibility of nan -> garbage
512513
try:
513514
subarr = _try_cast(data, dtype, copy, True)
515+
except IntCastingNaNError:
516+
subarr = np.array(data, copy=copy)
514517
except ValueError:
518+
if not raise_cast_failure:
519+
# i.e. called via DataFrame constructor
520+
warnings.warn(
521+
"In a future version, passing float-dtype values and an "
522+
"integer dtype to DataFrame will retain floating dtype "
523+
"if they cannot be cast losslessly (matching Series behavior). "
524+
"To retain the old behavior, use DataFrame(data).astype(dtype)",
525+
FutureWarning,
526+
stacklevel=4,
527+
)
528+
# GH#40110 until the deprecation is enforced, we _dont_
529+
# ignore the dtype for DataFrame, and _do_ cast even though
530+
# it is lossy.
531+
dtype = cast(np.dtype, dtype)
532+
return np.array(data, dtype=dtype, copy=copy)
515533
subarr = np.array(data, copy=copy)
516534
else:
517535
# we will try to copy by-definition here

pandas/core/dtypes/cast.py

+18 -1
Original file line number | Diff line number | Diff line change
@@ -2088,7 +2088,13 @@ def maybe_cast_to_integer_array(
20882088
if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
20892089
raise OverflowError("Trying to coerce negative values to unsigned integers")
20902090

2091-
if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype):
2091+
if is_float_dtype(arr.dtype):
2092+
if not np.isfinite(arr).all():
2093+
raise IntCastingNaNError(
2094+
"Cannot convert non-finite values (NA or inf) to integer"
2095+
)
2096+
raise ValueError("Trying to coerce float values to integers")
2097+
if is_object_dtype(arr.dtype):
20922098
raise ValueError("Trying to coerce float values to integers")
20932099

20942100
if casted.dtype < arr.dtype:
@@ -2102,6 +2108,17 @@ def maybe_cast_to_integer_array(
21022108
)
21032109
return casted
21042110

2111+
if arr.dtype.kind in ["m", "M"]:
2112+
# test_constructor_maskedarray_nonfloat
2113+
warnings.warn(
2114+
f"Constructing Series or DataFrame from {arr.dtype} values and "
2115+
f"dtype={dtype} is deprecated and will raise in a future version. "
2116+
"Use values.view(dtype) instead",
2117+
FutureWarning,
2118+
stacklevel=find_stack_level(),
2119+
)
2120+
return casted
2121+
21052122
# No known cases that get here, but raising explicitly to cover our bases.
21062123
raise ValueError(f"values cannot be losslessly cast to {dtype}")
21072124

pandas/core/internals/construction.py

+6 -18
Original file line number | Diff line number | Diff line change
@@ -22,11 +22,9 @@
2222
DtypeObj,
2323
Manager,
2424
)
25-
from pandas.errors import IntCastingNaNError
2625

2726
from pandas.core.dtypes.cast import (
2827
construct_1d_arraylike_from_scalar,
29-
construct_1d_ndarray_preserving_na,
3028
maybe_cast_to_datetime,
3129
maybe_convert_platform,
3230
maybe_infer_to_datetimelike,
@@ -303,22 +301,12 @@ def ndarray_to_mgr(
303301
shape = values.shape
304302
flat = values.ravel()
305303

306-
if not is_integer_dtype(dtype):
307-
# TODO: skipping integer_dtype is needed to keep the tests passing,
308-
# not clear it is correct
309-
# Note: we really only need _try_cast, but keeping to exposed funcs
310-
values = sanitize_array(
311-
flat, None, dtype=dtype, copy=copy, raise_cast_failure=True
312-
)
313-
else:
314-
try:
315-
values = construct_1d_ndarray_preserving_na(
316-
flat, dtype=dtype, copy=False
317-
)
318-
except IntCastingNaNError:
319-
# following Series, we ignore the dtype and retain floating
320-
# values instead of casting nans to meaningless ints
321-
pass
304+
# GH#40110 see similar check inside sanitize_array
305+
rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")
306+
307+
values = sanitize_array(
308+
flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf
309+
)
322310

323311
values = values.reshape(shape)
324312

pandas/tests/frame/methods/test_sort_index.py

+2 -2
Original file line number | Diff line number | Diff line change
@@ -603,15 +603,15 @@ def test_sort_index_level_large_cardinality(self):
603603

604604
# GH#2684 (int64)
605605
index = MultiIndex.from_arrays([np.arange(4000)] * 3)
606-
df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)
606+
df = DataFrame(np.random.randn(4000).astype("int64"), index=index)
607607

608608
# it works!
609609
result = df.sort_index(level=0)
610610
assert result.index._lexsort_depth == 3
611611

612612
# GH#2684 (int32)
613613
index = MultiIndex.from_arrays([np.arange(4000)] * 3)
614-
df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)
614+
df = DataFrame(np.random.randn(4000).astype("int32"), index=index)
615615

616616
# it works!
617617
result = df.sort_index(level=0)

pandas/tests/frame/methods/test_to_csv.py

+4 -2
Original file line number | Diff line number | Diff line change
@@ -714,7 +714,9 @@ def create_cols(name):
714714
np.random.randn(100, 5), dtype="float64", columns=create_cols("float")
715715
)
716716
df_int = DataFrame(
717-
np.random.randn(100, 5), dtype="int64", columns=create_cols("int")
717+
np.random.randn(100, 5).astype("int64"),
718+
dtype="int64",
719+
columns=create_cols("int"),
718720
)
719721
df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool"))
720722
df_object = DataFrame(
@@ -765,7 +767,7 @@ def test_to_csv_dups_cols(self):
765767
tm.assert_frame_equal(result, df)
766768

767769
df_float = DataFrame(np.random.randn(1000, 3), dtype="float64")
768-
df_int = DataFrame(np.random.randn(1000, 3), dtype="int64")
770+
df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
769771
df_bool = DataFrame(True, index=df_float.index, columns=range(3))
770772
df_object = DataFrame("foo", index=df_float.index, columns=range(3))
771773
df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))

pandas/tests/frame/test_constructors.py

+33 -1
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
import functools
1111
import itertools
1212
import re
13+
import warnings
1314

1415
import numpy as np
1516
import numpy.ma as ma
@@ -999,7 +1000,17 @@ def test_constructor_maskedarray_nonfloat(self):
9991000
assert isna(frame).values.all()
10001001

10011002
# cast type
1002-
frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
1003+
msg = r"datetime64\[ns\] values and dtype=int64"
1004+
with tm.assert_produces_warning(FutureWarning, match=msg):
1005+
with warnings.catch_warnings():
1006+
warnings.filterwarnings(
1007+
"ignore",
1008+
category=DeprecationWarning,
1009+
message="elementwise comparison failed",
1010+
)
1011+
frame = DataFrame(
1012+
mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64
1013+
)
10031014
assert frame.values.dtype == np.int64
10041015

10051016
# Check non-masked values
@@ -2484,6 +2495,27 @@ def test_nested_list_columns(self):
24842495
tm.assert_frame_equal(result, expected)
24852496

24862497

2498+
class TestDataFrameConstructorWithDtypeCoercion:
2499+
def test_floating_values_integer_dtype(self):
2500+
# GH#40110 make DataFrame behavior with arraylike floating data and
2501+
# inty dtype match Series behavior
2502+
2503+
arr = np.random.randn(10, 5)
2504+
2505+
msg = "if they cannot be cast losslessly"
2506+
with tm.assert_produces_warning(FutureWarning, match=msg):
2507+
DataFrame(arr, dtype="i8")
2508+
2509+
with tm.assert_produces_warning(None):
2510+
# if they can be cast losslessly, no warning
2511+
DataFrame(arr.round(), dtype="i8")
2512+
2513+
# with NaNs, we already have the correct behavior, so no warning
2514+
arr[0, 0] = np.nan
2515+
with tm.assert_produces_warning(None):
2516+
DataFrame(arr, dtype="i8")
2517+
2518+
24872519
class TestDataFrameConstructorWithDatetimeTZ:
24882520
@pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
24892521
def test_construction_preserves_tzaware_dtypes(self, tz):

pandas/tests/frame/test_nonunique_indexes.py

+1 -1
Original file line number | Diff line number | Diff line change
@@ -294,7 +294,7 @@ def test_multi_dtype2(self):
294294
def test_dups_across_blocks(self, using_array_manager):
295295
# dups across blocks
296296
df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
297-
df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
297+
df_int = DataFrame(np.random.randn(10, 3).astype("int64"))
298298
df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
299299
df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
300300
df_dt = DataFrame(

pandas/tests/indexing/test_coercion.py

+4 -1
Original file line number | Diff line number | Diff line change
@@ -134,7 +134,10 @@ def test_setitem_series_int8(self, val, exp_dtype, request):
134134
)
135135
request.node.add_marker(mark)
136136

137-
exp = pd.Series([1, val, 3, 4], dtype=np.int8)
137+
warn = None if exp_dtype is np.int8 else FutureWarning
138+
msg = "Values are too large to be losslessly cast to int8"
139+
with tm.assert_produces_warning(warn, match=msg):
140+
exp = pd.Series([1, val, 3, 4], dtype=np.int8)
138141
self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
139142

140143
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)