DEPR: DataFrame(floaty, dtype=inty) match Series (pandas-dev#41770)

jbrockmendel · JulianWgs · commit d53dfd88e5cf · 2021-07-03T13:10:22.000+02:00
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -652,7 +652,9 @@ class Rank:
     ]
 
     def setup(self, dtype):
-        self.df = DataFrame(np.random.randn(10000, 10), columns=range(10), dtype=dtype)
+        self.df = DataFrame(
+            np.random.randn(10000, 10).astype(dtype), columns=range(10), dtype=dtype
+        )
 
     def time_rank(self, dtype):
         self.df.rank()
diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
@@ -700,6 +700,7 @@ Deprecations
 - Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`)
 - Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype.  Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`,:issue:`33401`)
 - Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`)
+- Deprecated behavior of :class:`DataFrame` construction with floating-dtype data and an integer ``dtype`` casting to integer even when the cast is lossy; in a future version the data will remain floating, matching :class:`Series` behavior (:issue:`41770`)
 - Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`)
 - In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`)
 - Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`)
diff --git a/pandas/core/construction.py b/pandas/core/construction.py
@@ -24,6 +24,7 @@
     Dtype,
     DtypeObj,
 )
+from pandas.errors import IntCastingNaNError
 
 from pandas.core.dtypes.base import (
     ExtensionDtype,
@@ -511,7 +512,24 @@ def sanitize_array(
             # possibility of nan -> garbage
             try:
                 subarr = _try_cast(data, dtype, copy, True)
+            except IntCastingNaNError:
+                subarr = np.array(data, copy=copy)
             except ValueError:
+                if not raise_cast_failure:
+                    # i.e. called via DataFrame constructor
+                    warnings.warn(
+                        "In a future version, passing float-dtype values and an "
+                        "integer dtype to DataFrame will retain floating dtype "
+                        "if they cannot be cast losslessly (matching Series behavior). "
+                        "To retain the old behavior, use DataFrame(data).astype(dtype)",
+                        FutureWarning,
+                        stacklevel=4,
+                    )
+                    # GH#40110 until the deprecation is enforced, we _don't_
+                    #  ignore the dtype for DataFrame, and _do_ cast even though
+                    #  it is lossy.
+                    dtype = cast(np.dtype, dtype)
+                    return np.array(data, dtype=dtype, copy=copy)
                 subarr = np.array(data, copy=copy)
         else:
             # we will try to copy by-definition here
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -2088,7 +2088,13 @@ def maybe_cast_to_integer_array(
     if is_unsigned_integer_dtype(dtype) and (arr < 0).any():
         raise OverflowError("Trying to coerce negative values to unsigned integers")
 
-    if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype):
+    if is_float_dtype(arr.dtype):
+        if not np.isfinite(arr).all():
+            raise IntCastingNaNError(
+                "Cannot convert non-finite values (NA or inf) to integer"
+            )
+        raise ValueError("Trying to coerce float values to integers")
+    if is_object_dtype(arr.dtype):
         raise ValueError("Trying to coerce float values to integers")
 
     if casted.dtype < arr.dtype:
@@ -2102,6 +2108,17 @@ def maybe_cast_to_integer_array(
         )
         return casted
 
+    if arr.dtype.kind in ["m", "M"]:
+        # datetime64/timedelta64 values; see test_constructor_maskedarray_nonfloat
+        warnings.warn(
+            f"Constructing Series or DataFrame from {arr.dtype} values and "
+            f"dtype={dtype} is deprecated and will raise in a future version. "
+            "Use values.view(dtype) instead",
+            FutureWarning,
+            stacklevel=find_stack_level(),
+        )
+        return casted
+
     # No known cases that get here, but raising explicitly to cover our bases.
     raise ValueError(f"values cannot be losslessly cast to {dtype}")
 
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -22,11 +22,9 @@
     DtypeObj,
     Manager,
 )
-from pandas.errors import IntCastingNaNError
 
 from pandas.core.dtypes.cast import (
     construct_1d_arraylike_from_scalar,
-    construct_1d_ndarray_preserving_na,
     maybe_cast_to_datetime,
     maybe_convert_platform,
     maybe_infer_to_datetimelike,
@@ -303,22 +301,12 @@ def ndarray_to_mgr(
         shape = values.shape
         flat = values.ravel()
 
-        if not is_integer_dtype(dtype):
-            # TODO: skipping integer_dtype is needed to keep the tests passing,
-            #  not clear it is correct
-            # Note: we really only need _try_cast, but keeping to exposed funcs
-            values = sanitize_array(
-                flat, None, dtype=dtype, copy=copy, raise_cast_failure=True
-            )
-        else:
-            try:
-                values = construct_1d_ndarray_preserving_na(
-                    flat, dtype=dtype, copy=False
-                )
-            except IntCastingNaNError:
-                # following Series, we ignore the dtype and retain floating
-                # values instead of casting nans to meaningless ints
-                pass
+        # GH#40110 see similar check inside sanitize_array
+        rcf = not (is_integer_dtype(dtype) and values.dtype.kind == "f")
+
+        values = sanitize_array(
+            flat, None, dtype=dtype, copy=copy, raise_cast_failure=rcf
+        )
 
         values = values.reshape(shape)
 
diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py
@@ -603,15 +603,15 @@ def test_sort_index_level_large_cardinality(self):
 
         # GH#2684 (int64)
         index = MultiIndex.from_arrays([np.arange(4000)] * 3)
-        df = DataFrame(np.random.randn(4000), index=index, dtype=np.int64)
+        df = DataFrame(np.random.randn(4000).astype("int64"), index=index)
 
         # it works!
         result = df.sort_index(level=0)
         assert result.index._lexsort_depth == 3
 
         # GH#2684 (int32)
         index = MultiIndex.from_arrays([np.arange(4000)] * 3)
-        df = DataFrame(np.random.randn(4000), index=index, dtype=np.int32)
+        df = DataFrame(np.random.randn(4000).astype("int32"), index=index)
 
         # it works!
         result = df.sort_index(level=0)
diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py
@@ -714,7 +714,9 @@ def create_cols(name):
             np.random.randn(100, 5), dtype="float64", columns=create_cols("float")
         )
         df_int = DataFrame(
-            np.random.randn(100, 5), dtype="int64", columns=create_cols("int")
+            np.random.randn(100, 5).astype("int64"),
+            dtype="int64",
+            columns=create_cols("int"),
         )
         df_bool = DataFrame(True, index=df_float.index, columns=create_cols("bool"))
         df_object = DataFrame(
@@ -765,7 +767,7 @@ def test_to_csv_dups_cols(self):
             tm.assert_frame_equal(result, df)
 
         df_float = DataFrame(np.random.randn(1000, 3), dtype="float64")
-        df_int = DataFrame(np.random.randn(1000, 3), dtype="int64")
+        df_int = DataFrame(np.random.randn(1000, 3)).astype("int64")
         df_bool = DataFrame(True, index=df_float.index, columns=range(3))
         df_object = DataFrame("foo", index=df_float.index, columns=range(3))
         df_dt = DataFrame(Timestamp("20010101"), index=df_float.index, columns=range(3))
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
@@ -10,6 +10,7 @@
 import functools
 import itertools
 import re
+import warnings
 
 import numpy as np
 import numpy.ma as ma
@@ -999,7 +1000,17 @@ def test_constructor_maskedarray_nonfloat(self):
         assert isna(frame).values.all()
 
         # cast type
-        frame = DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
+        msg = r"datetime64\[ns\] values and dtype=int64"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    category=DeprecationWarning,
+                    message="elementwise comparison failed",
+                )
+                frame = DataFrame(
+                    mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64
+                )
         assert frame.values.dtype == np.int64
 
         # Check non-masked values
@@ -2484,6 +2495,27 @@ def test_nested_list_columns(self):
         tm.assert_frame_equal(result, expected)
 
 
+class TestDataFrameConstructorWithDtypeCoercion:
+    def test_floating_values_integer_dtype(self):
+        # GH#40110 make DataFrame behavior with arraylike floating data and
+        #  inty dtype match Series behavior
+
+        arr = np.random.randn(10, 5)
+
+        msg = "if they cannot be cast losslessly"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            DataFrame(arr, dtype="i8")
+
+        with tm.assert_produces_warning(None):
+            # if they can be cast losslessly, no warning
+            DataFrame(arr.round(), dtype="i8")
+
+        # with NaNs, we already have the correct behavior, so no warning
+        arr[0, 0] = np.nan
+        with tm.assert_produces_warning(None):
+            DataFrame(arr, dtype="i8")
+
+
 class TestDataFrameConstructorWithDatetimeTZ:
     @pytest.mark.parametrize("tz", ["US/Eastern", "dateutil/US/Eastern"])
     def test_construction_preserves_tzaware_dtypes(self, tz):
diff --git a/pandas/tests/frame/test_nonunique_indexes.py b/pandas/tests/frame/test_nonunique_indexes.py
@@ -294,7 +294,7 @@ def test_multi_dtype2(self):
     def test_dups_across_blocks(self, using_array_manager):
         # dups across blocks
         df_float = DataFrame(np.random.randn(10, 3), dtype="float64")
-        df_int = DataFrame(np.random.randn(10, 3), dtype="int64")
+        df_int = DataFrame(np.random.randn(10, 3).astype("int64"))
         df_bool = DataFrame(True, index=df_float.index, columns=df_float.columns)
         df_object = DataFrame("foo", index=df_float.index, columns=df_float.columns)
         df_dt = DataFrame(
diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
@@ -134,7 +134,10 @@ def test_setitem_series_int8(self, val, exp_dtype, request):
             )
             request.node.add_marker(mark)
 
-        exp = pd.Series([1, val, 3, 4], dtype=np.int8)
+        warn = None if exp_dtype is np.int8 else FutureWarning
+        msg = "Values are too large to be losslessly cast to int8"
+        with tm.assert_produces_warning(warn, match=msg):
+            exp = pd.Series([1, val, 3, 4], dtype=np.int8)
         self._assert_setitem_series_conversion(obj, val, exp, exp_dtype)
 
     @pytest.mark.parametrize(

Original file line number	Diff line number	Diff line change
`@@ -652,7 +652,9 @@ class Rank:`
`652`	`652`	`]`
`653`	`653`
`654`	`654`	`def setup(self, dtype):`
`655`		`- self.df = DataFrame(np.random.randn(10000, 10), columns=range(10), dtype=dtype)`
	`655`	`+ self.df = DataFrame(`
	`656`	`+ np.random.randn(10000, 10).astype(dtype), columns=range(10), dtype=dtype`
	`657`	`+ )`
`656`	`658`
`657`	`659`	`def time_rank(self, dtype):`
`658`	`660`	`self.df.rank()`