Skip to content

DEPR: Enforce Series(float_with_nan, dtype=inty) #49605

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 10, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 12 additions & 5 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,22 +608,29 @@ class Cumulative:

def setup(self, dtype, method):
N = 500_000
keys = np.random.randint(0, 100, size=N)
vals = np.random.randint(-10, 10, (N, 5))
null_vals = vals.astype(float, copy=True)
null_vals[::2, :] = np.nan
null_vals[::3, :] = np.nan
df = DataFrame(vals, columns=list("abcde"), dtype=dtype)
null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype)
keys = np.random.randint(0, 100, size=N)

df = DataFrame(vals, columns=list("abcde")).astype(dtype, copy=False)
df["key"] = keys
null_df["key"] = keys
self.df = df
self.null_df = null_df

if dtype != "int64":
# Would raise on DataFrame construction with int64
null_df = DataFrame(null_vals, columns=list("abcde"), dtype=dtype)
null_df["key"] = keys
self.null_df = null_df

def time_frame_transform(self, dtype, method):
self.df.groupby("key").transform(method)

def time_frame_transform_many_nulls(self, dtype, method):
if dtype == "int64":
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this may make the benchmark tracking look odd: the prior benchmark took some finite time, whereas this one would now complete almost instantly.

Could this benchmark be wholly refactored like:

    param_names = ["dtype", "method", "with_nans"]
    params = [
        ["float64", "int64", "Float64", "Int64"],
        ["cummin", "cummax", "cumsum"],
        [True, False]
    ]

    def setup(self, dtype, method, with_nans):
          vals = np.random.randint(-10, 10, (N, 5))
          if with_nans:
              if dtype == "int64":
                  raise NotImplementedError
              else:
                  vals = vals.astype(float, copy=True)
                  vals[::2, :] = np.nan
                  vals[::3, :] = np.nan
          ....

    def time_frame_transform(self, dtype, method, with_nans):

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure

# We can't instantiate null_df, so skip this case
return
self.null_df.groupby("key").transform(method)


Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ Removal of prior version deprecations/changes
- Changed behavior of :class:`Index` constructor when passed a ``SparseArray`` or ``SparseDtype`` to retain that dtype instead of casting to ``numpy.ndarray`` (:issue:`43930`)
- Changed behavior of setitem-like operations (``__setitem__``, ``fillna``, ``where``, ``mask``, ``replace``, ``insert``, fill_value for ``shift``) on an object with :class:`DatetimeTZDtype` when using a value with a non-matching timezone, the value will be cast to the object's timezone instead of casting both to object-dtype (:issue:`44243`)
- Changed behavior of :class:`Index`, :class:`Series`, :class:`DataFrame` constructors with floating-dtype data and a :class:`DatetimeTZDtype`, the data are now interpreted as UTC-times instead of wall-times, consistent with how integer-dtype data are treated (:issue:`45573`)
- Changed behavior of :class:`Series` and :class:`DataFrame` constructors with integer dtype and floating-point data containing ``NaN``; this now raises ``IntCastingNaNError`` (:issue:`40110`)
- Removed the deprecated ``base`` and ``loffset`` arguments from :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample` and :class:`pandas.Grouper`. Use ``offset`` or ``origin`` instead (:issue:`31809`)
- Changed behavior of :meth:`Series.fillna` and :meth:`DataFrame.fillna` with ``timedelta64[ns]`` dtype and an incompatible ``fill_value``; this now casts to ``object`` dtype instead of raising, consistent with the behavior with other dtypes (:issue:`45746`)
- Change the default argument of ``regex`` for :meth:`Series.str.replace` from ``True`` to ``False``. Additionally, a single character ``pat`` with ``regex=True`` is now treated as a regular expression instead of a string literal. (:issue:`36695`, :issue:`24804`)
Expand Down
13 changes: 1 addition & 12 deletions pandas/core/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
cast,
overload,
)
import warnings

import numpy as np
from numpy import ma
Expand All @@ -29,7 +28,6 @@
T,
)
from pandas.errors import IntCastingNaNError
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.base import (
ExtensionDtype,
Expand Down Expand Up @@ -577,16 +575,7 @@ def sanitize_array(
subarr = maybe_cast_to_integer_array(data, dtype)

except IntCastingNaNError:
warnings.warn(
"In a future version, passing float-dtype values containing NaN "
"and an integer dtype will raise IntCastingNaNError "
"(subclass of ValueError) instead of silently ignoring the "
"passed dtype. To retain the old behavior, call Series(arr) or "
"DataFrame(arr) without passing a dtype.",
FutureWarning,
stacklevel=find_stack_level(),
)
subarr = np.array(data, copy=copy)
raise
except ValueError:
# Pre-2.0, we would have different behavior for Series vs DataFrame.
# DataFrame would call np.array(data, dtype=dtype, copy=copy),
Expand Down
50 changes: 21 additions & 29 deletions pandas/tests/frame/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import pytest
import pytz

from pandas.errors import IntCastingNaNError
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_integer_dtype
Expand Down Expand Up @@ -105,16 +106,13 @@ def test_constructor_dict_with_tzaware_scalar(self):
def test_construct_ndarray_with_nas_and_int_dtype(self):
# GH#26919 match Series by not casting np.nan to meaningless int
arr = np.array([[1, np.nan], [2, 3]])
with tm.assert_produces_warning(FutureWarning):
df = DataFrame(arr, dtype="i8")
assert df.values.dtype == arr.dtype
assert isna(df.iloc[0, 1])
msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
with pytest.raises(IntCastingNaNError, match=msg):
DataFrame(arr, dtype="i8")

# check this matches Series behavior
with tm.assert_produces_warning(FutureWarning):
ser = Series(arr[0], dtype="i8", name=0)
expected = df.iloc[0]
tm.assert_series_equal(ser, expected)
with pytest.raises(IntCastingNaNError, match=msg):
Series(arr[0], dtype="i8", name=0)

def test_construct_from_list_of_datetimes(self):
df = DataFrame([datetime.now(), datetime.now()])
Expand Down Expand Up @@ -966,21 +964,16 @@ def _check_basic_constructor(self, empty):
assert len(frame.index) == 3
assert len(frame.columns) == 1

warn = None if empty is np.ones else FutureWarning
with tm.assert_produces_warning(warn):
if empty is not np.ones:
msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
with pytest.raises(IntCastingNaNError, match=msg):
DataFrame(mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64)
return
else:
frame = DataFrame(
mat, columns=["A", "B", "C"], index=[1, 2], dtype=np.int64
)
if empty is np.ones:
# passing dtype casts
assert frame.values.dtype == np.int64
else:
# i.e. ma.masked_all
# Since we have NaNs, refuse to cast to int dtype, which would take NaN
# to meaningless integers. This matches Series behavior. GH#26919
assert frame.isna().all().all()
assert frame.values.dtype == np.float64
assert isna(frame.values).all()

# wrong size axis labels
msg = r"Shape of passed values is \(2, 3\), indices imply \(1, 3\)"
Expand Down Expand Up @@ -1741,11 +1734,10 @@ def test_constructor_mix_series_nonseries(self, float_frame):
DataFrame({"A": float_frame["A"], "B": list(float_frame["B"])[:-2]})

def test_constructor_miscast_na_int_dtype(self):
msg = "float-dtype values containing NaN and an integer dtype"
with tm.assert_produces_warning(FutureWarning, match=msg):
df = DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64)
expected = DataFrame([[np.nan, 1], [1, 0]])
tm.assert_frame_equal(df, expected)
msg = r"Cannot convert non-finite values \(NA or inf\) to integer"

with pytest.raises(IntCastingNaNError, match=msg):
DataFrame([[np.nan, 1], [1, 0]], dtype=np.int64)

def test_constructor_column_duplicates(self):
# it works! #2079
Expand Down Expand Up @@ -2735,16 +2727,16 @@ def test_floating_values_integer_dtype(self):

# with NaNs, we go through a different path that raises IntCastingNaNError
arr[0, 0] = np.nan
msg = "passing float-dtype values containing NaN"
with tm.assert_produces_warning(FutureWarning, match=msg):
msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
with pytest.raises(IntCastingNaNError, match=msg):
DataFrame(arr, dtype="i8")
with tm.assert_produces_warning(FutureWarning, match=msg):
with pytest.raises(IntCastingNaNError, match=msg):
Series(arr[0], dtype="i8")
# The raising behavior matches what we would get via astype:
msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
with pytest.raises(ValueError, match=msg):
with pytest.raises(IntCastingNaNError, match=msg):
DataFrame(arr).astype("i8")
with pytest.raises(ValueError, match=msg):
with pytest.raises(IntCastingNaNError, match=msg):
Series(arr[0]).astype("i8")


Expand Down
37 changes: 18 additions & 19 deletions pandas/tests/series/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
lib,
)
from pandas.compat import is_numpy_dev
from pandas.errors import IntCastingNaNError
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import (
Expand Down Expand Up @@ -670,10 +671,9 @@ def test_constructor_sanitize(self):
s = Series(np.array([1.0, 1.0, 8.0]), dtype="i8")
assert s.dtype == np.dtype("i8")

msg = "float-dtype values containing NaN and an integer dtype"
with tm.assert_produces_warning(FutureWarning, match=msg):
ser = Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8")
assert ser.dtype == np.dtype("f8")
msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
with pytest.raises(IntCastingNaNError, match=msg):
Series(np.array([1.0, 1.0, np.nan]), copy=True, dtype="i8")

def test_constructor_copy(self):
# GH15125
Expand Down Expand Up @@ -809,18 +809,17 @@ def test_constructor_floating_data_int_dtype(self, frame_or_series):
res = frame_or_series(list(arr), dtype="i8")
tm.assert_equal(res, expected)

# When we have NaNs, we silently ignore the integer dtype
# pre-2.0, when we had NaNs, we silently ignored the integer dtype
arr[0] = np.nan
expected = frame_or_series(arr)
msg = "passing float-dtype values containing NaN and an integer dtype"
with tm.assert_produces_warning(FutureWarning, match=msg):
obj = frame_or_series(arr, dtype="i8")
tm.assert_equal(obj, expected)

with tm.assert_produces_warning(FutureWarning, match=msg):
msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
with pytest.raises(IntCastingNaNError, match=msg):
frame_or_series(arr, dtype="i8")

with pytest.raises(IntCastingNaNError, match=msg):
# same behavior if we pass list instead of the ndarray
obj = frame_or_series(list(arr), dtype="i8")
tm.assert_equal(obj, expected)
frame_or_series(list(arr), dtype="i8")

# float array that can be losslessly cast to integers
arr = np.array([1.0, 2.0], dtype="float64")
Expand Down Expand Up @@ -854,13 +853,13 @@ def test_constructor_invalid_coerce_ints_with_float_nan(self, any_int_numpy_dtyp
# Updated: make sure we treat this list the same as we would treat the
# equivalent ndarray
vals = [1, 2, np.nan]
msg = "In a future version, passing float-dtype values containing NaN"
with tm.assert_produces_warning(FutureWarning, match=msg):
res = Series(vals, dtype=any_int_numpy_dtype)
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = Series(np.array(vals), dtype=any_int_numpy_dtype)
tm.assert_series_equal(res, expected)
assert np.isnan(expected.iloc[-1])
# pre-2.0 this would return with a float dtype, in 2.0 we raise

msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
with pytest.raises(IntCastingNaNError, match=msg):
Series(vals, dtype=any_int_numpy_dtype)
with pytest.raises(IntCastingNaNError, match=msg):
Series(np.array(vals), dtype=any_int_numpy_dtype)

def test_constructor_dtype_no_cast(self):
# see gh-1572
Expand Down
13 changes: 7 additions & 6 deletions pandas/tests/test_downstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
import pytest

from pandas.errors import IntCastingNaNError
import pandas.util._test_decorators as td

import pandas as pd
Expand Down Expand Up @@ -100,13 +101,13 @@ def test_construct_dask_float_array_int_dtype_match_ndarray():
expected = Series(arr, dtype="i8")
tm.assert_series_equal(res, expected)

msg = "In a future version, passing float-dtype values containing NaN"
msg = r"Cannot convert non-finite values \(NA or inf\) to integer"
arr[2] = np.nan
with tm.assert_produces_warning(FutureWarning, match=msg):
res = Series(darr, dtype="i8")
with tm.assert_produces_warning(FutureWarning, match=msg):
expected = Series(arr, dtype="i8")
tm.assert_series_equal(res, expected)
with pytest.raises(IntCastingNaNError, match=msg):
Series(darr, dtype="i8")
# which is the same as we get with a numpy input
with pytest.raises(IntCastingNaNError, match=msg):
Series(arr, dtype="i8")


def test_xarray(df):
Expand Down