-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
DEPR: dropping nuisance columns in rolling aggregations #42834
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
121e2df
0c8f332
eb52049
ff82b73
083d21b
4bcbd40
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -32,6 +32,7 @@ | |
from pandas.compat._optional import import_optional_dependency | ||
from pandas.compat.numpy import function as nv | ||
from pandas.util._decorators import doc | ||
from pandas.util._exceptions import find_stack_level | ||
|
||
from pandas.core.dtypes.common import ( | ||
ensure_float64, | ||
|
@@ -436,6 +437,16 @@ def hfunc2d(values: ArrayLike) -> ArrayLike: | |
new_mgr = mgr.apply_2d(hfunc2d, ignore_failures=True) | ||
else: | ||
new_mgr = mgr.apply(hfunc, ignore_failures=True) | ||
|
||
if 0 != len(new_mgr.items) != len(mgr.items): | ||
# GH#42738 ignore_failures dropped nuisance columns | ||
warnings.warn( | ||
"Dropping of nuisance columns in rolling operations " | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you list the columns that are problematic in the error message? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. updated + green |
||
"is deprecated; in a future version this will raise TypeError. " | ||
"Select only valid columns before calling the operation.", | ||
FutureWarning, | ||
stacklevel=find_stack_level(), | ||
) | ||
out = obj._constructor(new_mgr) | ||
|
||
return self._resolve_output(out, obj) | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -116,8 +116,10 @@ def test_ewma_with_times_equal_spacing(halflife_with_times, times, min_periods): | |||
data = np.arange(10.0) | ||||
data[::2] = np.nan | ||||
df = DataFrame({"A": data, "time_col": date_range("2000", freq="D", periods=10)}) | ||||
result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() | ||||
expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() | ||||
with tm.assert_produces_warning(FutureWarning, match="nuisance columns"): | ||||
# GH#42738 | ||||
result = df.ewm(halflife=halflife, min_periods=min_periods, times=times).mean() | ||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also should probably add a deprecation here too that allows times to be a string of a column label (which is already kinda flawed because a column label isn't always a string). A column of times will always be a nuisance column I think pandas/pandas/core/window/ewm.py Line 317 in a5f8c9a
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can this be done separately? this is less obvious to me There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think so, yes. |
||||
expected = df.ewm(halflife=1.0, min_periods=min_periods).mean() | ||||
tm.assert_frame_equal(result, expected) | ||||
|
||||
|
||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -170,26 +170,39 @@ def test_invalid_engine_kwargs(self, grouper): | |
engine="cython", engine_kwargs={"nopython": True} | ||
) | ||
|
||
@pytest.mark.parametrize( | ||
"grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] | ||
) | ||
@pytest.mark.parametrize("grouper", ["None", "groupby"]) | ||
def test_cython_vs_numba( | ||
self, grouper, nogil, parallel, nopython, ignore_na, adjust | ||
): | ||
if grouper == "None": | ||
grouper = lambda x: x | ||
warn = FutureWarning | ||
else: | ||
grouper = lambda x: x.groupby("A") | ||
warn = None | ||
|
||
df = DataFrame({"A": ["a", "b", "a", "b"], "B": range(4)}) | ||
ewm = grouper(df).ewm(com=1.0, adjust=adjust, ignore_na=ignore_na) | ||
|
||
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} | ||
result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) | ||
expected = ewm.mean(engine="cython") | ||
with tm.assert_produces_warning(warn, match="nuisance"): | ||
# GH#42738 | ||
result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) | ||
expected = ewm.mean(engine="cython") | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
@pytest.mark.parametrize( | ||
"grouper", [lambda x: x, lambda x: x.groupby("A")], ids=["None", "groupby"] | ||
) | ||
@pytest.mark.parametrize("grouper", ["None", "groupby"]) | ||
def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_na): | ||
# GH 40951 | ||
|
||
if grouper == "None": | ||
grouper = lambda x: x | ||
warn = FutureWarning | ||
else: | ||
grouper = lambda x: x.groupby("A") | ||
warn = None | ||
|
||
halflife = "23 days" | ||
times = to_datetime( | ||
[ | ||
|
@@ -207,8 +220,12 @@ def test_cython_vs_numba_times(self, grouper, nogil, parallel, nopython, ignore_ | |
) | ||
|
||
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython} | ||
result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) | ||
expected = ewm.mean(engine="cython") | ||
|
||
# TODO: why only in these cases? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Because column "A" is string dtype and in the non-group case there's an attempt to agg over the column, while in the groupby case, the grouping column should be dropped internally before hand |
||
with tm.assert_produces_warning(warn, match="nuisance"): | ||
# GH#42738 | ||
result = ewm.mean(engine="numba", engine_kwargs=engine_kwargs) | ||
expected = ewm.mean(engine="cython") | ||
|
||
tm.assert_frame_equal(result, expected) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good to include Expanding and EWM as well.