Skip to content

DEPR: Remove silent dropping of nuisance columns in window ops #50576

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jan 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,7 @@ class Groupby:
["sum", "median", "mean", "max", "min", "kurt", "sum"],
[
("rolling", {"window": 2}),
("rolling", {"window": "30s", "on": "C"}),
("rolling", {"window": "30s"}),
("expanding", {}),
],
)
Expand All @@ -304,9 +304,10 @@ def setup(self, method, window_kwargs):
{
"A": [str(i) for i in range(N)] * 10,
"B": list(range(N)) * 10,
"C": pd.date_range(start="1900-01-01", freq="1min", periods=N * 10),
}
)
if isinstance(kwargs.get("window", None), str):
df.index = pd.date_range(start="1900-01-01", freq="1min", periods=N * 10)
self.groupby_window = getattr(df.groupby("A"), window)(**kwargs)

def time_method(self, method, window_kwargs):
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,7 @@ Removal of prior version deprecations/changes
- Changed default of ``numeric_only`` to ``False`` in all DataFrame methods with that argument (:issue:`46096`, :issue:`46906`)
- Changed default of ``numeric_only`` to ``False`` in :meth:`Series.rank` (:issue:`47561`)
- Enforced deprecation of silently dropping nuisance columns in groupby and resample operations when ``numeric_only=False`` (:issue:`41475`)
- Enforced deprecation of silently dropping nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`ExponentialMovingWindow` ops. These operations now raise :class:`.errors.DataError` (:issue:`42834`)
- Changed behavior in setting values with ``df.loc[:, foo] = bar`` or ``df.iloc[:, foo] = bar``, these now always attempt to set values inplace before falling back to casting (:issue:`45333`)
- Changed default of ``numeric_only`` in various :class:`.DataFrameGroupBy` methods; all methods now default to ``numeric_only=False`` (:issue:`46072`)
- Changed default of ``numeric_only`` to ``False`` in :class:`.Resampler` methods (:issue:`47177`)
Expand Down
33 changes: 9 additions & 24 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
Sized,
cast,
)
import warnings

import numpy as np

Expand All @@ -37,7 +36,6 @@
from pandas.compat._optional import import_optional_dependency
from pandas.errors import DataError
from pandas.util._decorators import doc
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
ensure_float64,
Expand Down Expand Up @@ -473,24 +471,23 @@ def _apply_blockwise(
obj = notna(obj).astype(int)
obj._mgr = obj._mgr.consolidate()

def hfunc(values: ArrayLike) -> ArrayLike:
values = self._prep_values(values)
return homogeneous_func(values)

if self.axis == 1:
obj = obj.T

taker = []
res_values = []
for i, arr in enumerate(obj._iter_column_arrays()):
# GH#42736 operate column-wise instead of block-wise
# As of 2.0, nuisance columns raise DataError instead of being silently dropped
try:
res = hfunc(arr)
except (TypeError, NotImplementedError):
pass
else:
res_values.append(res)
taker.append(i)
arr = self._prep_values(arr)
except (TypeError, NotImplementedError) as err:
raise DataError(
f"Cannot aggregate non-numeric type: {arr.dtype}"
) from err
res = homogeneous_func(arr)
res_values.append(res)
taker.append(i)

index = self._slice_axis_for_step(
obj.index, res_values[0] if len(res_values) > 0 else None
Expand All @@ -505,18 +502,6 @@ def hfunc(values: ArrayLike) -> ArrayLike:
if self.axis == 1:
df = df.T

if 0 != len(res_values) != len(obj.columns):
# GH#42738 ignore_failures dropped nuisance columns
dropped = obj.columns.difference(obj.columns.take(taker))
warnings.warn(
"Dropping of nuisance columns in rolling operations "
"is deprecated; in a future version this will raise TypeError. "
"Select only valid columns before calling the operation. "
f"Dropped columns were {dropped}",
FutureWarning,
stacklevel=find_stack_level(),
)

return self._resolve_output(df, obj)

def _apply_tablewise(
Expand Down
19 changes: 8 additions & 11 deletions pandas/tests/window/test_api.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import numpy as np
import pytest

from pandas.errors import SpecificationError
from pandas.errors import (
DataError,
SpecificationError,
)

from pandas import (
DataFrame,
Expand Down Expand Up @@ -66,18 +69,12 @@ def tests_skip_nuisance(step):
tm.assert_frame_equal(result, expected)


def test_skip_sum_object_raises(step):
def test_sum_object_str_raises(step):
df = DataFrame({"A": range(5), "B": range(5, 10), "C": "foo"})
r = df.rolling(window=3, step=step)
msg = r"nuisance columns.*Dropped columns were Index\(\['C'\], dtype='object'\)"
with tm.assert_produces_warning(FutureWarning, match=msg):
# GH#42738
result = r.sum()
expected = DataFrame(
{"A": [np.nan, np.nan, 3, 6, 9], "B": [np.nan, np.nan, 18, 21, 24]},
columns=list("AB"),
)[::step]
tm.assert_frame_equal(result, expected)
with pytest.raises(DataError, match="Cannot aggregate non-numeric type: object"):
# GH#42738, enforced in 2.0
r.sum()


def test_agg(step):
Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/window/test_dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def test_dataframe_dtypes(method, expected_data, dtypes, min_periods, step):
rolled = df.rolling(2, min_periods=min_periods, step=step)

if dtypes in ("m8[ns]", "M8[ns]", "datetime64[ns, UTC]") and method != "count":
msg = "No numeric types to aggregate"
msg = "Cannot aggregate non-numeric type"
with pytest.raises(DataError, match=msg):
getattr(rolled, method)()
else:
Expand Down
8 changes: 3 additions & 5 deletions pandas/tests/window/test_ewm.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,9 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods):
halflife = halflife_with_times
data = np.arange(10.0)
data[::2] = np.nan
df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)})
with tm.assert_produces_warning(FutureWarning, match="nuisance columns"):
# GH#42738
result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean()
expected = df.ewm(halflife=1.0, min_periods=min_periods).mean()
df = DataFrame({"A": data})
result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean()
expected = df.ewm(halflife=1.0, min_periods=min_periods).mean()
tm.assert_frame_equal(result, expected)


Expand Down
39 changes: 6 additions & 33 deletions pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1125,13 +1125,6 @@ def test_methods(self, method, expected_data):
)
tm.assert_frame_equal(result, expected)

with tm.assert_produces_warning(FutureWarning, match="nuisance"):
# GH#42738
expected = df.groupby("A", group_keys=True).apply(
lambda x: getattr(x.ewm(com=1.0), method)()
)
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"method, expected_data",
[["corr", [np.nan, 1.0, 1.0, 1]], ["cov", [np.nan, 0.5, 0.928571, 1.385714]]],
Expand Down Expand Up @@ -1160,13 +1153,9 @@ def test_pairwise_methods(self, method, expected_data):
def test_times(self, times_frame):
# GH 40951
halflife = "23 days"
with tm.assert_produces_warning(FutureWarning, match="nuisance"):
# GH#42738
result = (
times_frame.groupby("A")
.ewm(halflife=halflife, times=times_frame["C"])
.mean()
)
# GH#42738
times = times_frame.pop("C")
result = times_frame.groupby("A").ewm(halflife=halflife, times=times).mean()
expected = DataFrame(
{
"B": [
Expand Down Expand Up @@ -1200,29 +1189,13 @@ def test_times(self, times_frame):
)
tm.assert_frame_equal(result, expected)

def test_times_vs_apply(self, times_frame):
# GH 40951
halflife = "23 days"
with tm.assert_produces_warning(FutureWarning, match="nuisance"):
# GH#42738
result = (
times_frame.groupby("A")
.ewm(halflife=halflife, times=times_frame["C"])
.mean()
)
expected = times_frame.groupby("A", group_keys=True).apply(
lambda x: x.ewm(halflife=halflife, times=x["C"]).mean()
)
tm.assert_frame_equal(result, expected)

def test_times_array(self, times_frame):
# GH 40951
halflife = "23 days"
times = times_frame.pop("C")
gb = times_frame.groupby("A")
with tm.assert_produces_warning(FutureWarning, match="nuisance"):
# GH#42738
result = gb.ewm(halflife=halflife, times=times_frame["C"]).mean()
expected = gb.ewm(halflife=halflife, times=times_frame["C"].values).mean()
result = gb.ewm(halflife=halflife, times=times).mean()
expected = gb.ewm(halflife=halflife, times=times.values).mean()
tm.assert_frame_equal(result, expected)

def test_dont_mutate_obj_after_slicing(self):
Expand Down
22 changes: 8 additions & 14 deletions pandas/tests/window/test_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,35 +253,32 @@ def test_invalid_engine_kwargs(self, grouper, method):
def test_cython_vs_numba(
self, grouper, method, nogil, parallel, nopython, ignore_na, adjust
):
df = DataFrame({"B": range(4)})
if grouper == "None":
grouper = lambda x: x
warn = FutureWarning
else:
df["A"] = ["a", "b", "a", "b"]
grouper = lambda x: x.groupby("A")
warn = None
if method == "sum":
adjust = True
df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)})
ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na)

engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
with tm.assert_produces_warning(warn, match="nuisance"):
# GH#42738
result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs)
expected = getattr(ewm, method)(engine="cython")
result = getattr(ewm, method)(engine="numba", engine_kwargs=engine_kwargs)
expected = getattr(ewm, method)(engine="cython")

tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("grouper", ["None", "groupby"])
def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na):
# GH 40951

df = DataFrame({"B": [0, 0, 1, 1, 2, 2]})
if grouper == "None":
grouper = lambda x: x
warn = FutureWarning
else:
grouper = lambda x: x.groupby("A")
warn = None
df["A"] = ["a", "b", "a", "b", "b", "a"]

halflife = "23 days"
times = to_datetime(
Expand All @@ -294,17 +291,14 @@ def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_
"2020-01-03",
]
)
df = DataFrame({"A": ["a", "b", "a", "b", "b", "a"], "B": [0, 0, 1, 1, 2, 2]})
ewm = grouper(df).ewm(
halflife=halflife, adjust=True, ignore_na=ignore_na, times=times
)

engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

with tm.assert_produces_warning(warn, match="nuisance"):
# GH#42738
result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
expected = ewm.mean(engine="cython")
result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs)
expected = ewm.mean(engine="cython")

tm.assert_frame_equal(result, expected)

Expand Down