diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 6dff4a017e2a9..6f0bb3091133f 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -600,31 +600,35 @@ def time_frame_agg(self, dtype, method): class Cumulative: - param_names = ["dtype", "method"] + param_names = ["dtype", "method", "with_nans"] params = [ ["float64", "int64", "Float64", "Int64"], ["cummin", "cummax", "cumsum"], + [True, False], ] - def setup(self, dtype, method): + def setup(self, dtype, method, with_nans): + if with_nans and dtype == "int64": + raise NotImplementedError("Construction of df would raise") + N = 500_000 - vals = np.random.randint(-10, 10, (N, 5)) - null_vals = vals.astype(float, copy=True) - null_vals[::2, :] = np.nan - null_vals[::3, :] = np.nan - df = DataFrame(vals, columns=list("abcde"), dtype=dtype) - null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) keys = np.random.randint(0, 100, size=N) - df["key"] = keys - null_df["key"] = keys - self.df = df - self.null_df = null_df + vals = np.random.randint(-10, 10, (N, 5)) - def time_frame_transform(self, dtype, method): - self.df.groupby("key").transform(method) + if with_nans: + null_vals = vals.astype(float, copy=True) + null_vals[::2, :] = np.nan + null_vals[::3, :] = np.nan + df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype) + df["key"] = keys + self.df = df + else: + df = DataFrame(vals, columns=list("abcde")).astype(dtype, copy=False) + df["key"] = keys + self.df = df - def time_frame_transform_many_nulls(self, dtype, method): - self.null_df.groupby("key").transform(method) + def time_frame_transform(self, dtype, method, with_nans): + self.df.groupby("key").transform(method) class RankWithTies: diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 07a4e4af3dbe7..4c23642987a84 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -489,6 +489,7 @@ Removal of prior version 
deprecations/changes - Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`) - Changed behavior of setitem-like operations (``__setitem__``, ``fillna``, ``where``, ``mask``, ``replace``, ``insert``, fill_value for ``shift``) on an object with :class:`DatetimeTZDtype` when using a value with a non-matching timezone, the value will be cast to the object's timezone instead of casting both to object-dtype (:issue:`44243`) - Changed behavior of :class:`Index`, :class:`Series`, :class:`DataFrame` constructors with floating-dtype data and a :class:`DatetimeTZDtype`, the data are now interpreted as UTC-times instead of wall-times, consistent with how integer-dtype data are treated (:issue:`45573`) +- Changed behavior of :class:`Series` and :class:`DataFrame` constructors with integer dtype and floating-point data containing ``NaN``; this now raises ``IntCastingNaNError`` (:issue:`40110`) - Removed the deprecated ``base`` and ``loffset`` arguments from :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` and :class:`pandas.Grouper`. Use ``offset`` or ``origin`` instead (:issue:`31809`) - Changed behavior of :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype and an incompatible ``fill_value``; this now casts to ``object`` dtype instead of raising, consistent with the behavior with other dtypes (:issue:`45746`) - Change the default argument of ``regex`` for :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal. 
(:issue:`36695`, :issue:`24804`) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 3ae509e74074e..d40c334ab1840 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -14,7 +14,6 @@ cast, overload, ) -import warnings import numpy as np from numpy import ma @@ -29,7 +28,6 @@ T, ) from pandas.errors import IntCastingNaNError -from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.base import ( ExtensionDtype, @@ -577,16 +575,7 @@ def sanitize_array( subarr = maybe_cast_to_integer_array(data, dtype) except IntCastingNaNError: - warnings.warn( - "In a future version, passing float-dtype values containing NaN " - "and an integer dtype will raise IntCastingNaNError " - "(subclass of ValueError) instead of silently ignoring the " - "passed dtype. To retain the old behavior, call Series(arr) or " - "DataFrame(arr) without passing a dtype.", - FutureWarning, - stacklevel=find_stack_level(), - ) - subarr = np.array(data, copy=copy) + raise except ValueError: # Pre-2.0, we would have different behavior for Series vs DataFrame. 
# DataFrame would call np.array(data, dtype=dtype, copy=copy), diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 17a76decce3c7..c1d40949b858b 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -18,6 +18,7 @@ import pytest import pytz +from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td from pandas.core.dtypes.common import is_integer_dtype @@ -105,16 +106,13 @@ def test_constructor_dict_with_tzaware_scalar(self): def test_construct_ndarray_with_nas_and_int_dtype(self): # GH#26919 match Series by not casting np.nan to meaningless int arr = np.array([[1, np.nan], [2, 3]]) - with tm.assert_produces_warning(FutureWarning): - df = DataFrame(arr, dtype="i8") - assert df.values.dtype == arr.dtype - assert isna(df.iloc[0, 1]) + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): + DataFrame(arr, dtype="i8") # check this matches Series behavior - with tm.assert_produces_warning(FutureWarning): - ser = Series(arr[0], dtype="i8", name=0) - expected = df.iloc[0] - tm.assert_series_equal(ser, expected) + with pytest.raises(IntCastingNaNError, match=msg): + Series(arr[0], dtype="i8", name=0) def test_construct_from_list_of_datetimes(self): df = DataFrame([datetime.now(), datetime.now()]) @@ -966,21 +964,16 @@ def _check_basic_constructor(self, empty): assert len(frame.index) == 3 assert len(frame.columns) == 1 - warn = None if empty is np.ones else FutureWarning - with tm.assert_produces_warning(warn): + if empty is not np.ones: + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): + DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64) + return + else: frame = DataFrame( mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64 ) - if empty is np.ones: - # passing dtype casts assert 
frame.values.dtype == np.int64 - else: - # i.e. ma.masked_all - # Since we have NaNs, refuse to cast to int dtype, which would take NaN - # to meaningless integers. This matches Series behavior. GH#26919 - assert frame.isna().all().all() - assert frame.values.dtype == np.float64 - assert isna(frame.values).all() # wrong size axis labels msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)" @@ -1741,11 +1734,10 @@ def test_constructor_mix_series_nonseries(self, float_frame): DataFrame({"A": float_frame["A"], "B": list(float_frame["B"])[:-2]}) def test_constructor_miscast_na_int_dtype(self): - msg = "float-dtype values containing NaN and an integer dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): - df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64) - expected = DataFrame([[np.nan, 1], [1, 0]]) - tm.assert_frame_equal(df, expected) + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + + with pytest.raises(IntCastingNaNError, match=msg): + DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64) def test_constructor_column_duplicates(self): # it works! 
#2079 @@ -2735,16 +2727,16 @@ def test_floating_values_integer_dtype(self): # with NaNs, we go through a different path with a different warning arr[0, 0] = np.nan - msg = "passing float-dtype values containing NaN" - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): DataFrame(arr, dtype="i8") - with tm.assert_produces_warning(FutureWarning, match=msg): + with pytest.raises(IntCastingNaNError, match=msg): Series(arr[0], dtype="i8") # The future (raising) behavior matches what we would get via astype: msg = r"Cannot convert non-finite values \(NA or inf\) to integer" - with pytest.raises(ValueError, match=msg): + with pytest.raises(IntCastingNaNError, match=msg): DataFrame(arr).astype("i8") - with pytest.raises(ValueError, match=msg): + with pytest.raises(IntCastingNaNError, match=msg): Series(arr[0]).astype("i8") diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 37348bb743537..95baf0cd3d9de 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -15,6 +15,7 @@ lib, ) from pandas.compat import is_numpy_dev +from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td from pandas.core.dtypes.common import ( @@ -670,10 +671,9 @@ def test_constructor_sanitize(self): s = Series(np.array([1.0, 1.0, 8.0]), dtype="i8") assert s.dtype == np.dtype("i8") - msg = "float-dtype values containing NaN and an integer dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): - ser = Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8") - assert ser.dtype == np.dtype("f8") + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): + Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8") def test_constructor_copy(self): # GH15125 @@ -809,18 +809,17 
@@ def test_constructor_floating_data_int_dtype(self, frame_or_series): res = frame_or_series(list(arr), dtype="i8") tm.assert_equal(res, expected) - # When we have NaNs, we silently ignore the integer dtype + # pre-2.0, when we had NaNs, we silently ignored the integer dtype arr[0] = np.nan expected = frame_or_series(arr) - msg = "passing float-dtype values containing NaN and an integer dtype" - with tm.assert_produces_warning(FutureWarning, match=msg): - obj = frame_or_series(arr, dtype="i8") - tm.assert_equal(obj, expected) - with tm.assert_produces_warning(FutureWarning, match=msg): + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): + frame_or_series(arr, dtype="i8") + + with pytest.raises(IntCastingNaNError, match=msg): # same behavior if we pass list instead of the ndarray - obj = frame_or_series(list(arr), dtype="i8") - tm.assert_equal(obj, expected) + frame_or_series(list(arr), dtype="i8") # float array that can be losslessly cast to integers arr = np.array([1.0, 2.0], dtype="float64") @@ -854,13 +853,13 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp # Updated: make sure we treat this list the same as we would treat the # equivalent ndarray vals = [1, 2, np.nan] - msg = "In a future version, passing float-dtype values containing NaN" - with tm.assert_produces_warning(FutureWarning, match=msg): - res = Series(vals, dtype=any_int_numpy_dtype) - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = Series(np.array(vals), dtype=any_int_numpy_dtype) - tm.assert_series_equal(res, expected) - assert np.isnan(expected.iloc[-1]) + # pre-2.0 this would return with a float dtype, in 2.0 we raise + + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" + with pytest.raises(IntCastingNaNError, match=msg): + Series(vals, dtype=any_int_numpy_dtype) + with pytest.raises(IntCastingNaNError, match=msg): + Series(np.array(vals), 
dtype=any_int_numpy_dtype) def test_constructor_dtype_no_cast(self): # see gh-1572 diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index a7f4269fa62b1..1396ab262a79a 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from pandas.errors import IntCastingNaNError import pandas.util._test_decorators as td import pandas as pd @@ -100,13 +101,13 @@ def test_construct_dask_float_array_int_dtype_match_ndarray(): expected = Series(arr, dtype="i8") tm.assert_series_equal(res, expected) - msg = "In a future version, passing float-dtype values containing NaN" + msg = r"Cannot convert non-finite values \(NA or inf\) to integer" arr[2] = np.nan - with tm.assert_produces_warning(FutureWarning, match=msg): - res = Series(darr, dtype="i8") - with tm.assert_produces_warning(FutureWarning, match=msg): - expected = Series(arr, dtype="i8") - tm.assert_series_equal(res, expected) + with pytest.raises(IntCastingNaNError, match=msg): + Series(darr, dtype="i8") + # which is the same as we get with a numpy input + with pytest.raises(IntCastingNaNError, match=msg): + Series(arr, dtype="i8") def test_xarray(df):