From 8ac452cafd6f5710b495d205036fae3f9d615e65 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Sat, 4 Jun 2022 09:59:04 -0400 Subject: [PATCH 1/2] ENH: Add numeric_only to window ops --- doc/source/whatsnew/v1.5.0.rst | 3 + pandas/core/window/doc.py | 9 + pandas/core/window/ewm.py | 67 ++++-- pandas/core/window/expanding.py | 103 +++++++-- pandas/core/window/rolling.py | 310 +++++++++++++++++++++----- pandas/tests/window/conftest.py | 6 + pandas/tests/window/test_ewm.py | 37 +++ pandas/tests/window/test_expanding.py | 34 +++ pandas/tests/window/test_rolling.py | 33 +++ 9 files changed, 515 insertions(+), 87 deletions(-) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 8a7ad077c2a90..d7ebf1abb85da 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -651,6 +651,9 @@ gained the ``numeric_only`` argument. - :meth:`.Resampler.sem` - :meth:`.Resampler.std` - :meth:`.Resampler.var` +- :meth:`DataFrame.rolling` operations +- :meth:`DataFrame.expanding` operations +- :meth:`DataFrame.ewm` operations .. _whatsnew_150.deprecations.other: diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py index 930c12841e4e4..61cfa29ffc481 100644 --- a/pandas/core/window/doc.py +++ b/pandas/core/window/doc.py @@ -29,6 +29,15 @@ def create_section_header(header: str) -> str: """ ).replace("\n", "", 1) +kwargs_numeric_only = dedent( + """ + numeric_only : bool, default False + Include only float, int, boolean columns. + + .. versionadded:: 1.5.0\n + """ +).replace("\n", "", 1) + args_compat = dedent( """ *args diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index d2b4db75f839b..bceb4622152f1 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -26,7 +26,10 @@ from pandas.util._decorators import doc from pandas.util._exceptions import find_stack_level -from pandas.core.dtypes.common import is_datetime64_ns_dtype +from pandas.core.dtypes.common import ( + is_datetime64_ns_dtype, + is_numeric_dtype, +) from pandas.core.dtypes.missing import isna import pandas.core.common as common # noqa: PDF018 @@ -45,6 +48,7 @@ args_compat, create_section_header, kwargs_compat, + kwargs_numeric_only, numba_notes, template_header, template_returns, @@ -518,6 +522,7 @@ def aggregate(self, func, *args, **kwargs): @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -531,7 +536,14 @@ def aggregate(self, func, *args, **kwargs): aggregation_description="(exponential weighted moment) mean", agg_method="mean", ) - def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): + def mean( + self, + numeric_only: bool = False, + *args, + engine=None, + engine_kwargs=None, + **kwargs, + ): if maybe_use_numba(engine): if self.method == "single": func = generate_numba_ewm_func @@ -545,7 +557,7 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): deltas=tuple(self._deltas), normalize=True, ) - return self._apply(ewm_func) + return self._apply(ewm_func, name="mean") elif engine in ("cython", None): if engine_kwargs is not None: raise ValueError("cython engine does not accept engine_kwargs") @@ -560,13 +572,14 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): deltas=deltas, normalize=True, ) - return self._apply(window_func) + return self._apply(window_func, name="mean", numeric_only=numeric_only) else: raise ValueError("engine must be either 'numba' or 'cython'") @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -580,7 +593,14 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): aggregation_description="(exponential weighted moment) sum", agg_method="sum", ) - def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): + def sum( + self, + numeric_only: bool = False, + *args, + engine=None, + engine_kwargs=None, + **kwargs, + ): if not self.adjust: raise NotImplementedError("sum is not implemented with adjust=False") if maybe_use_numba(engine): @@ -596,7 +616,7 @@ def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): deltas=tuple(self._deltas), normalize=False, ) - return self._apply(ewm_func) + return self._apply(ewm_func, name="sum") elif engine in ("cython", None): if engine_kwargs is not None: raise ValueError("cython engine does not accept engine_kwargs") @@ -611,7 +631,7 @@ def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): deltas=deltas, normalize=False, ) - return self._apply(window_func) + return self._apply(window_func, name="sum", numeric_only=numeric_only) else: raise ValueError("engine must be either 'numba' or 'cython'") @@ -624,6 +644,7 @@ def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): Use a standard estimation bias correction. """ ).replace("\n", "", 1), + kwargs_numeric_only, args_compat, kwargs_compat, create_section_header("Returns"), @@ -634,9 +655,18 @@ def sum(self, *args, engine=None, engine_kwargs=None, **kwargs): aggregation_description="(exponential weighted moment) standard deviation", agg_method="std", ) - def std(self, bias: bool = False, *args, **kwargs): + def std(self, bias: bool = False, numeric_only: bool = False, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - return zsqrt(self.var(bias=bias, **kwargs)) + if ( + numeric_only + and self._selected_obj.ndim == 1 + and not is_numeric_dtype(self._selected_obj.dtype) + ): + # Raise directly so error message says std instead of var + raise NotImplementedError( + f"{type(self).__name__}.std does not implement numeric_only" + ) + return zsqrt(self.var(bias=bias, numeric_only=numeric_only, **kwargs)) def vol(self, bias: bool = False, *args, **kwargs): warnings.warn( @@ -658,6 +688,7 @@ def vol(self, bias: bool = False, *args, **kwargs): Use a standard estimation bias correction. """ ).replace("\n", "", 1), + kwargs_numeric_only, args_compat, kwargs_compat, create_section_header("Returns"), @@ -668,7 +699,7 @@ def vol(self, bias: bool = False, *args, **kwargs): aggregation_description="(exponential weighted moment) variance", agg_method="var", ) - def var(self, bias: bool = False, *args, **kwargs): + def var(self, bias: bool = False, numeric_only: bool = False, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = window_aggregations.ewmcov wfunc = partial( @@ -682,7 +713,7 @@ def var(self, bias: bool = False, *args, **kwargs): def var_func(values, begin, end, min_periods): return wfunc(values, begin, end, min_periods, values) - return self._apply(var_func) + return self._apply(var_func, name="var", numeric_only=numeric_only) @doc( template_header, @@ -703,6 +734,7 @@ def var_func(values, begin, end, min_periods): Use a standard estimation bias correction. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -717,6 +749,7 @@ def cov( other: DataFrame | Series | None = None, pairwise: bool | None = None, bias: bool = False, + numeric_only: bool = False, **kwargs, ): from pandas import Series @@ -752,7 +785,9 @@ def cov_func(x, y): ) return Series(result, index=x.index, name=x.name) - return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) + return self._apply_pairwise( + self._selected_obj, other, pairwise, cov_func, numeric_only + ) @doc( template_header, @@ -771,6 +806,7 @@ def cov_func(x, y): observations will be used. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -784,6 +820,7 @@ def corr( self, other: DataFrame | Series | None = None, pairwise: bool | None = None, + numeric_only: bool = False, **kwargs, ): from pandas import Series @@ -825,7 +862,9 @@ def _cov(X, Y): result = cov / zsqrt(x_var * y_var) return Series(result, index=x.index, name=x.name) - return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) + return self._apply_pairwise( + self._selected_obj, other, pairwise, cov_func, numeric_only + ) class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow): @@ -921,6 +960,7 @@ def corr( self, other: DataFrame | Series | None = None, pairwise: bool | None = None, + numeric_only: bool = False, **kwargs, ): return NotImplementedError @@ -930,6 +970,7 @@ def cov( other: DataFrame | Series | None = None, pairwise: bool | None = None, bias: bool = False, + numeric_only: bool = False, **kwargs, ): return NotImplementedError diff --git a/pandas/core/window/expanding.py b/pandas/core/window/expanding.py index 36a1da0dbf837..7f9dfece959de 100644 --- a/pandas/core/window/expanding.py +++ b/pandas/core/window/expanding.py @@ -29,6 +29,7 @@ args_compat, create_section_header, kwargs_compat, + kwargs_numeric_only, numba_notes, template_header, template_returns, @@ -192,8 +193,8 @@ def aggregate(self, func, *args, **kwargs): aggregation_description="count of non NaN observations", agg_method="count", ) - def count(self): - return super().count() + def count(self, numeric_only: bool = False): + return super().count(numeric_only=numeric_only) @doc( template_header, @@ -228,6 +229,7 @@ def apply( @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -243,17 +245,24 @@ def apply( ) def sum( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_expanding_func("sum", args, kwargs) - return super().sum(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().sum( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -269,17 +278,24 @@ def sum( ) def max( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_expanding_func("max", args, kwargs) - return super().max(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().max( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -295,17 +311,24 @@ def max( ) def min( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_expanding_func("min", args, kwargs) - return super().min(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().min( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -321,17 +344,24 @@ def min( ) def mean( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_expanding_func("mean", args, kwargs) - return super().mean(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().mean( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), @@ -346,11 +376,17 @@ def mean( ) def median( self, + numeric_only: bool = False, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): - return super().median(engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().median( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, @@ -362,6 +398,7 @@ def median( is ``N - ddof``, where ``N`` represents the number of elements.\n """ ).replace("\n", "", 1), + kwargs_numeric_only, args_compat, window_agg_numba_parameters("1.4"), kwargs_compat, @@ -402,6 +439,7 @@ def median( def std( self, ddof: int = 1, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -409,7 +447,11 @@ def std( ): nv.validate_expanding_func("std", args, kwargs) return super().std( - ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ddof=ddof, + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, ) @doc( @@ -422,6 +464,7 @@ def std( is ``N - ddof``, where ``N`` represents the number of elements.\n """ ).replace("\n", "", 1), + kwargs_numeric_only, args_compat, window_agg_numba_parameters("1.4"), kwargs_compat, @@ -462,6 +505,7 @@ def std( def var( self, ddof: int = 1, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -469,7 +513,11 @@ def var( ): nv.validate_expanding_func("var", args, kwargs) return super().var( - ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ddof=ddof, + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, ) @doc( @@ -482,6 +530,7 @@ def var( is ``N - ddof``, where ``N`` represents the number of elements.\n """ ).replace("\n", "", 1), + kwargs_numeric_only, args_compat, kwargs_compat, create_section_header("Returns"), @@ -513,6 +562,7 @@ def sem(self, ddof: int = 1, *args, **kwargs): @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -525,12 +575,13 @@ def sem(self, ddof: int = 1, *args, **kwargs): aggregation_description="unbiased skewness", agg_method="skew", ) - def skew(self, **kwargs): - return super().skew(**kwargs) + def skew(self, numeric_only: bool = False, **kwargs): + return super().skew(numeric_only=numeric_only, **kwargs) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -565,8 +616,8 @@ def skew(self, **kwargs): aggregation_description="Fisher's definition of kurtosis without bias", agg_method="kurt", ) - def kurt(self, **kwargs): - return super().kurt(**kwargs) + def kurt(self, numeric_only: bool = False, **kwargs): + return super().kurt(numeric_only=numeric_only, **kwargs) @doc( template_header, @@ -587,6 +638,7 @@ def kurt(self, **kwargs): * midpoint: (`i` + `j`) / 2. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -600,11 +652,13 @@ def quantile( self, quantile: float, interpolation: str = "linear", + numeric_only: bool = False, **kwargs, ): return super().quantile( quantile=quantile, interpolation=interpolation, + numeric_only=numeric_only, **kwargs, ) @@ -628,6 +682,7 @@ def quantile( form. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -674,12 +729,14 @@ def rank( method: WindowingRankType = "average", ascending: bool = True, pct: bool = False, + numeric_only: bool = False, **kwargs, ): return super().rank( method=method, ascending=ascending, pct=pct, + numeric_only=numeric_only, **kwargs, ) @@ -703,6 +760,7 @@ def rank( is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -717,9 +775,16 @@ def cov( other: DataFrame | Series | None = None, pairwise: bool | None = None, ddof: int = 1, + numeric_only: bool = False, **kwargs, ): - return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) + return super().cov( + other=other, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) @doc( template_header, @@ -738,6 +803,7 @@ def cov( observations will be used. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -782,9 +848,16 @@ def corr( other: DataFrame | Series | None = None, pairwise: bool | None = None, ddof: int = 1, + numeric_only: bool = False, **kwargs, ): - return super().corr(other=other, pairwise=pairwise, ddof=ddof, **kwargs) + return super().corr( + other=other, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) class ExpandingGroupby(BaseWindowGroupby, Expanding): diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 9e8f95cf340c4..0ca8557cfb8ea 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -42,6 +42,7 @@ is_bool, is_integer, is_list_like, + is_numeric_dtype, is_scalar, needs_i8_conversion, ) @@ -84,6 +85,7 @@ args_compat, create_section_header, kwargs_compat, + kwargs_numeric_only, kwargs_scipy, numba_notes, template_header, @@ -258,18 +260,53 @@ def _slice_axis_for_step(self, index: Index, result: Sized | None = None) -> Ind else index[:: self.step] ) - def _create_data(self, obj: NDFrameT) -> NDFrameT: + def _validate_numeric_only(self, name: str, numeric_only: bool) -> None: + """ + Validate numeric_only argument, raising if invalid for the input. + + Parameters + ---------- + name : str + Name of the operator (kernel). + numeric_only : bool + Value passed by user. + """ + if ( + self._selected_obj.ndim == 1 + and numeric_only + and not is_numeric_dtype(self._selected_obj.dtype) + ): + raise NotImplementedError( + f"{type(self).__name__}.{name} does not implement numeric_only" + ) + + def _make_numeric_only(self, obj: NDFrameT) -> NDFrameT: + """Subset DataFrame to numeric columns. + + Parameters + ---------- + obj : DataFrame + + Returns + ------- + obj subset to numeric-only columns. + """ + result = obj.select_dtypes(include=["number"], exclude=["timedelta"]) + return result + + def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT: """ Split data into blocks & return conformed data. """ # filter out the on from the object if self.on is not None and not isinstance(self.on, Index) and obj.ndim == 2: obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) - if self.axis == 1: + if numeric_only or self.axis == 1: # GH: 20649 in case of mixed dtype and axis=1 we have to convert everything # to float to calculate the complete row at once. We exclude all non-numeric # dtypes. - obj = obj.select_dtypes(include=["number"], exclude=["timedelta"]) + obj = self._make_numeric_only(obj) + if self.axis == 1: obj = obj.astype("float64", copy=False) obj._mgr = obj._mgr.consolidate() return obj @@ -451,16 +488,20 @@ def _apply_series( return obj._constructor(result, index=index, name=obj.name) def _apply_blockwise( - self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None + self, + homogeneous_func: Callable[..., ArrayLike], + name: str, + numeric_only: bool = False, ) -> DataFrame | Series: """ Apply the given function to the DataFrame broken down into homogeneous sub-frames. """ + self._validate_numeric_only(name, numeric_only) if self._selected_obj.ndim == 1: return self._apply_series(homogeneous_func, name) - obj = self._create_data(self._selected_obj) + obj = self._create_data(self._selected_obj, numeric_only) if name == "count": # GH 12541: Special case for count where we support date-like types obj = notna(obj).astype(int) @@ -513,14 +554,17 @@ def hfunc(values: ArrayLike) -> ArrayLike: return self._resolve_output(df, obj) def _apply_tablewise( - self, homogeneous_func: Callable[..., ArrayLike], name: str | None = None + self, + homogeneous_func: Callable[..., ArrayLike], + name: str | None = None, + numeric_only: bool = False, ) -> DataFrame | Series: """ Apply the given function to the DataFrame across the entire object """ if self._selected_obj.ndim == 1: raise ValueError("method='table' not applicable for Series objects.") - obj = self._create_data(self._selected_obj) + obj = self._create_data(self._selected_obj, numeric_only) values = self._prep_values(obj.to_numpy()) values = values.T if self.axis == 1 else values result = homogeneous_func(values) @@ -541,23 +585,28 @@ def _apply_pairwise( other: DataFrame | Series | None, pairwise: bool | None, func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series], + numeric_only: bool, ) -> DataFrame | Series: """ Apply the given pairwise function given 2 pandas objects (DataFrame/Series) """ + target = self._create_data(target, numeric_only) if other is None: other = target # only default unset pairwise = True if pairwise is None else pairwise elif not isinstance(other, (ABCDataFrame, ABCSeries)): raise ValueError("other must be a DataFrame or Series") + elif other.ndim == 2 and numeric_only: + other = self._make_numeric_only(other) return flex_binary_moment(target, other, func, pairwise=bool(pairwise)) def _apply( self, func: Callable[..., Any], - name: str | None = None, + name: str, + numeric_only: bool = False, numba_args: tuple[Any, ...] = (), **kwargs, ): @@ -610,9 +659,9 @@ def calc(x): return result if self.method == "single": - return self._apply_blockwise(homogeneous_func, name) + return self._apply_blockwise(homogeneous_func, name, numeric_only) else: - return self._apply_tablewise(homogeneous_func, name) + return self._apply_tablewise(homogeneous_func, name, numeric_only) def _numba_apply( self, @@ -699,13 +748,15 @@ def __init__( def _apply( self, func: Callable[..., Any], - name: str | None = None, + name: str, + numeric_only: bool = False, numba_args: tuple[Any, ...] = (), **kwargs, ) -> DataFrame | Series: result = super()._apply( func, name, + numeric_only, numba_args, **kwargs, ) @@ -761,14 +812,14 @@ def _apply_pairwise( other: DataFrame | Series | None, pairwise: bool | None, func: Callable[[DataFrame | Series, DataFrame | Series], DataFrame | Series], + numeric_only: bool, ) -> DataFrame | Series: """ Apply the given pairwise function given 2 pandas objects (DataFrame/Series) """ # Manually drop the grouping column first target = target.drop(columns=self._grouper.names, errors="ignore") - target = self._create_data(target) - result = super()._apply_pairwise(target, other, pairwise, func) + result = super()._apply_pairwise(target, other, pairwise, func, numeric_only) # 1) Determine the levels + codes of the groupby levels if other is not None and not all( len(group) == len(other) for group in self._grouper.indices.values() @@ -839,7 +890,7 @@ def _apply_pairwise( result.index = result_index return result - def _create_data(self, obj: NDFrameT) -> NDFrameT: + def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT: """ Split data into blocks & return conformed data. """ @@ -851,7 +902,7 @@ def _create_data(self, obj: NDFrameT) -> NDFrameT: np.int64 ) obj = obj.take(groupby_order) - return super()._create_data(obj) + return super()._create_data(obj, numeric_only) def _gotitem(self, key, ndim, subset=None): # we are setting the index on the actual object @@ -1137,7 +1188,8 @@ def _center_window(self, result: np.ndarray, offset: int) -> np.ndarray: def _apply( self, func: Callable[[np.ndarray, int, int], np.ndarray], - name: str | None = None, + name: str, + numeric_only: bool = False, numba_args: tuple[Any, ...] = (), **kwargs, ): @@ -1150,6 +1202,8 @@ def _apply( ---------- func : callable function to apply name : str, + numeric_only : bool, default False + Whether to only operate on bool, int, and float columns numba_args : tuple unused **kwargs @@ -1185,7 +1239,7 @@ def calc(x): return result - return self._apply_blockwise(homogeneous_func, name)[:: self.step] + return self._apply_blockwise(homogeneous_func, name, numeric_only)[:: self.step] @doc( _shared_docs["aggregate"], @@ -1232,6 +1286,7 @@ def aggregate(self, func, *args, **kwargs): @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, kwargs_scipy, create_section_header("Returns"), template_returns, @@ -1241,17 +1296,23 @@ def aggregate(self, func, *args, **kwargs): aggregation_description="weighted window sum", agg_method="sum", ) - def sum(self, *args, **kwargs): + def sum(self, numeric_only: bool = False, *args, **kwargs): nv.validate_window_func("sum", args, kwargs) window_func = window_aggregations.roll_weighted_sum # error: Argument 1 to "_apply" of "Window" has incompatible type # "Callable[[ndarray, ndarray, int], ndarray]"; expected # "Callable[[ndarray, int, int], ndarray]" - return self._apply(window_func, name="sum", **kwargs) # type: ignore[arg-type] + return self._apply( + window_func, # type: ignore[arg-type] + name="sum", + numeric_only=numeric_only, + **kwargs, + ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, kwargs_scipy, create_section_header("Returns"), template_returns, @@ -1261,18 +1322,24 @@ def sum(self, *args, **kwargs): aggregation_description="weighted window mean", agg_method="mean", ) - def mean(self, *args, **kwargs): + def mean(self, numeric_only: bool = False, *args, **kwargs): nv.validate_window_func("mean", args, kwargs) window_func = window_aggregations.roll_weighted_mean # error: Argument 1 to "_apply" of "Window" has incompatible type # "Callable[[ndarray, ndarray, int], ndarray]"; expected # "Callable[[ndarray, int, int], ndarray]" - return self._apply(window_func, name="mean", **kwargs) # type: ignore[arg-type] + return self._apply( + window_func, # type: ignore[arg-type] + name="mean", + numeric_only=numeric_only, + **kwargs, + ) @doc( template_header, ".. versionadded:: 1.0.0 \n\n", create_section_header("Parameters"), + kwargs_numeric_only, kwargs_scipy, create_section_header("Returns"), template_returns, @@ -1282,16 +1349,17 @@ def mean(self, *args, **kwargs): aggregation_description="weighted window variance", agg_method="var", ) - def var(self, ddof: int = 1, *args, **kwargs): + def var(self, ddof: int = 1, numeric_only: bool = False, *args, **kwargs): nv.validate_window_func("var", args, kwargs) window_func = partial(window_aggregations.roll_weighted_var, ddof=ddof) kwargs.pop("name", None) - return self._apply(window_func, name="var", **kwargs) + return self._apply(window_func, name="var", numeric_only=numeric_only, **kwargs) @doc( template_header, ".. versionadded:: 1.0.0 \n\n", create_section_header("Parameters"), + kwargs_numeric_only, kwargs_scipy, create_section_header("Returns"), template_returns, @@ -1301,15 +1369,17 @@ def var(self, ddof: int = 1, *args, **kwargs): aggregation_description="weighted window standard deviation", agg_method="std", ) - def std(self, ddof: int = 1, *args, **kwargs): + def std(self, ddof: int = 1, numeric_only: bool = False, *args, **kwargs): nv.validate_window_func("std", args, kwargs) - return zsqrt(self.var(ddof=ddof, name="std", **kwargs)) + return zsqrt( + self.var(ddof=ddof, name="std", numeric_only=numeric_only, **kwargs) + ) class RollingAndExpandingMixin(BaseWindow): - def count(self): + def count(self, numeric_only: bool = False): window_func = window_aggregations.roll_sum - return self._apply(window_func, name="count") + return self._apply(window_func, name="count", numeric_only=numeric_only) def apply( self, @@ -1350,6 +1420,7 @@ def apply( return self._apply( apply_func, + name="apply", numba_args=numba_args, ) @@ -1380,6 +1451,7 @@ def apply_func(values, begin, end, min_periods, raw=raw): def sum( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -1400,10 +1472,11 @@ def sum( return self._numba_apply(sliding_sum, engine_kwargs) window_func = window_aggregations.roll_sum - return self._apply(window_func, name="sum", **kwargs) + return self._apply(window_func, name="sum", numeric_only=numeric_only, **kwargs) def max( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -1424,10 +1497,11 @@ def max( return self._numba_apply(sliding_min_max, engine_kwargs, True) window_func = window_aggregations.roll_max - return self._apply(window_func, name="max", **kwargs) + return self._apply(window_func, name="max", numeric_only=numeric_only, **kwargs) def min( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -1448,10 +1522,11 @@ def min( return self._numba_apply(sliding_min_max, engine_kwargs, False) window_func = window_aggregations.roll_min - return self._apply(window_func, name="min", **kwargs) + return self._apply(window_func, name="min", numeric_only=numeric_only, **kwargs) def mean( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -1472,10 +1547,13 @@ def mean( return self._numba_apply(sliding_mean, engine_kwargs) window_func = window_aggregations.roll_mean - return self._apply(window_func, name="mean", **kwargs) + return self._apply( + window_func, name="mean", numeric_only=numeric_only, **kwargs + ) def median( self, + numeric_only: bool = False, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, @@ -1493,11 +1571,14 @@ def median( engine_kwargs=engine_kwargs, ) window_func = window_aggregations.roll_median_c - return self._apply(window_func, name="median", **kwargs) + return self._apply( + window_func, name="median", numeric_only=numeric_only, **kwargs + ) def std( self, ddof: int = 1, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -1519,12 +1600,14 @@ def zsqrt_func(values, begin, end, min_periods): return self._apply( zsqrt_func, name="std", + numeric_only=numeric_only, **kwargs, ) def var( self, ddof: int = 1, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -1542,29 +1625,43 @@ def var( return self._apply( window_func, name="var", + numeric_only=numeric_only, **kwargs, ) - def skew(self, **kwargs): + def skew(self, numeric_only: bool = False, **kwargs): window_func = window_aggregations.roll_skew return self._apply( window_func, name="skew", + numeric_only=numeric_only, **kwargs, ) - def sem(self, ddof: int = 1, *args, **kwargs): - return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) + def sem(self, ddof: int = 1, numeric_only: bool = False, *args, **kwargs): + nv.validate_rolling_func("sem", args, kwargs) + # Raise here so error message says sem instead of std + self._validate_numeric_only("sem", numeric_only) + return self.std(numeric_only=numeric_only, **kwargs) / ( + self.count(numeric_only=numeric_only) - ddof + ).pow(0.5) - def kurt(self, **kwargs): + def kurt(self, numeric_only: bool = False, **kwargs): window_func = window_aggregations.roll_kurt return self._apply( window_func, name="kurt", + numeric_only=numeric_only, **kwargs, ) - def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): + def quantile( + self, + quantile: float, + interpolation: str = "linear", + numeric_only: bool = False, + **kwargs, + ): if quantile == 1.0: window_func = window_aggregations.roll_max elif quantile == 0.0: @@ -1576,13 +1673,16 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): interpolation=interpolation, ) - return self._apply(window_func, name="quantile", **kwargs) + return self._apply( + window_func, name="quantile", numeric_only=numeric_only, **kwargs + ) def rank( self, method: WindowingRankType = "average", ascending: bool = True, pct: bool = False, + numeric_only: bool = False, **kwargs, ): window_func = partial( @@ -1592,13 +1692,16 @@ def rank( percentile=pct, ) - return self._apply(window_func, name="rank", **kwargs) + return self._apply( + window_func, name="rank", numeric_only=numeric_only, **kwargs + ) def cov( self, other: DataFrame | Series | None = None, pairwise: bool | None = None, ddof: int = 1, + numeric_only: bool = False, **kwargs, ): if self.step is not None: @@ -1636,13 +1739,16 @@ def cov_func(x, y): result = (mean_x_y - mean_x * mean_y) * (count_x_y / (count_x_y - ddof)) return Series(result, index=x.index, name=x.name) - return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func) + return self._apply_pairwise( + self._selected_obj, other, pairwise, cov_func, numeric_only + ) def corr( self, other: DataFrame | Series | None = None, pairwise: bool | None = None, ddof: int = 1, + numeric_only: bool = False, **kwargs, ): if self.step is not None: @@ -1690,7 +1796,9 @@ def corr_func(x, y): result = numerator / denominator return Series(result, index=x.index, name=x.name) - return self._apply_pairwise(self._selected_obj, other, pairwise, corr_func) + return self._apply_pairwise( + self._selected_obj, other, pairwise, corr_func, numeric_only + ) class Rolling(RollingAndExpandingMixin): @@ -1815,6 +1923,8 @@ def aggregate(self, func, *args, **kwargs): @doc( template_header, + create_section_header("Parameters"), + kwargs_numeric_only, create_section_header("Returns"), template_returns, create_section_header("See Also"), @@ -1847,7 +1957,7 @@ def aggregate(self, func, *args, **kwargs): aggregation_description="count of non NaN observations", agg_method="count", ) - def count(self): + def count(self, numeric_only: bool = False): if self.min_periods is None: warnings.warn( ( @@ -1862,7 +1972,7 @@ def count(self): result = super().count() self.min_periods = None else: - result = super().count() + result = super().count(numeric_only) return result @doc( @@ -1898,6 +2008,7 @@ def apply( @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -1961,17 +2072,24 @@ def apply( ) def sum( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_rolling_func("sum", args, kwargs) - return super().sum(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().sum( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -1987,17 +2105,24 @@ def sum( ) def max( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_rolling_func("max", args, kwargs) - return super().max(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().max( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -2028,17 +2153,24 @@ def max( ) def min( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_rolling_func("min", args, kwargs) - return super().min(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().min( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, args_compat, window_agg_numba_parameters(), kwargs_compat, @@ -2076,17 +2208,24 @@ def min( ) def mean( self, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): nv.validate_rolling_func("mean", args, kwargs) - return super().mean(*args, engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().mean( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, window_agg_numba_parameters(), kwargs_compat, create_section_header("Returns"), @@ -2116,11 +2255,17 @@ def mean( ) def median( self, + numeric_only: bool = False, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, **kwargs, ): - return super().median(engine=engine, engine_kwargs=engine_kwargs, **kwargs) + return super().median( + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, + ) @doc( template_header, @@ -2132,6 +2277,7 @@ def median( is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), + kwargs_numeric_only, args_compat, window_agg_numba_parameters("1.4"), kwargs_compat, @@ -2171,6 +2317,7 @@ def median( def std( self, ddof: int = 1, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -2178,7 +2325,11 @@ def std( ): nv.validate_rolling_func("std", args, kwargs) return super().std( - ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ddof=ddof, + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, ) @doc( @@ -2191,6 +2342,7 @@ def std( is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), + kwargs_numeric_only, args_compat, window_agg_numba_parameters("1.4"), kwargs_compat, @@ -2230,6 +2382,7 @@ def std( def var( self, ddof: int = 1, + numeric_only: bool = False, *args, engine: str | None = None, engine_kwargs: dict[str, bool] | None = None, @@ -2237,12 +2390,17 @@ def var( ): nv.validate_rolling_func("var", args, kwargs) return super().var( - ddof=ddof, engine=engine, engine_kwargs=engine_kwargs, **kwargs + ddof=ddof, + numeric_only=numeric_only, + engine=engine, + engine_kwargs=engine_kwargs, + **kwargs, ) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -2255,8 +2413,8 @@ def var( aggregation_description="unbiased skewness", agg_method="skew", ) - def skew(self, **kwargs): - return super().skew(**kwargs) + def skew(self, numeric_only: bool = False, **kwargs): + return super().skew(numeric_only=numeric_only, **kwargs) @doc( template_header, @@ -2268,6 +2426,7 @@ def skew(self, **kwargs): is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), + kwargs_numeric_only, args_compat, kwargs_compat, create_section_header("Returns"), @@ -2292,12 +2451,18 @@ def skew(self, **kwargs): aggregation_description="standard error of mean", agg_method="sem", ) - def sem(self, ddof: int = 1, *args, **kwargs): - return self.std(*args, **kwargs) / (self.count() - ddof).pow(0.5) + def sem(self, ddof: int = 1, numeric_only: bool = False, *args, **kwargs): + nv.validate_rolling_func("sem", args, kwargs) + # Raise here so error message says sem instead of std + self._validate_numeric_only("sem", numeric_only) + return self.std(numeric_only=numeric_only, **kwargs) / ( + self.count(numeric_only) - ddof + ).pow(0.5) @doc( template_header, create_section_header("Parameters"), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -2332,8 +2497,8 @@ def sem(self, ddof: int = 1, *args, **kwargs): aggregation_description="Fisher's definition of kurtosis without bias", agg_method="kurt", ) - def kurt(self, **kwargs): - return super().kurt(**kwargs) + def kurt(self, numeric_only: bool = False, **kwargs): + return super().kurt(numeric_only=numeric_only, **kwargs) @doc( template_header, @@ -2354,6 +2519,7 @@ def kurt(self, **kwargs): * midpoint: (`i` + `j`) / 2. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -2382,10 +2548,17 @@ def kurt(self, **kwargs): aggregation_description="quantile", agg_method="quantile", ) - def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): + def quantile( + self, + quantile: float, + interpolation: str = "linear", + numeric_only: bool = False, + **kwargs, + ): return super().quantile( quantile=quantile, interpolation=interpolation, + numeric_only=numeric_only, **kwargs, ) @@ -2409,6 +2582,7 @@ def quantile(self, quantile: float, interpolation: str = "linear", **kwargs): form. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -2455,12 +2629,14 @@ def rank( method: WindowingRankType = "average", ascending: bool = True, pct: bool = False, + numeric_only: bool = False, **kwargs, ): return super().rank( method=method, ascending=ascending, pct=pct, + numeric_only=numeric_only, **kwargs, ) @@ -2484,6 +2660,7 @@ def rank( is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -2498,9 +2675,16 @@ def cov( other: DataFrame | Series | None = None, pairwise: bool | None = None, ddof: int = 1, + numeric_only: bool = False, **kwargs, ): - return super().cov(other=other, pairwise=pairwise, ddof=ddof, **kwargs) + return super().cov( + other=other, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) @doc( template_header, @@ -2522,6 +2706,7 @@ def cov( is ``N - ddof``, where ``N`` represents the number of elements. """ ).replace("\n", "", 1), + kwargs_numeric_only, kwargs_compat, create_section_header("Returns"), template_returns, @@ -2623,9 +2808,16 @@ def corr( other: DataFrame | Series | None = None, pairwise: bool | None = None, ddof: int = 1, + numeric_only: bool = False, **kwargs, ): - return super().corr(other=other, pairwise=pairwise, ddof=ddof, **kwargs) + return super().corr( + other=other, + pairwise=pairwise, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) Rolling.__doc__ = Window.__doc__ diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 8977d1a0d9d1b..d05bb3d51bcde 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -84,6 +84,12 @@ def ignore_na(request): return request.param +@pytest.fixture(params=[True, False]) +def numeric_only(request): + """numeric_only keyword argument""" + return request.param + + @pytest.fixture(params=[pytest.param("numba", marks=td.skip_if_no("numba")), "cython"]) def engine(request): """engine keyword argument for rolling.apply""" diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 66cd36d121750..83b86fba6432b 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -666,3 +666,40 @@ def test_ewm_pairwise_cov_corr(func, frame): result.index = result.index.droplevel(1) expected = getattr(frame[1].ewm(span=10, min_periods=5), func)(frame[5]) tm.assert_series_equal(result, expected, check_names=False) + + +def test_numeric_only_frame(arithmetic_win_operators, numeric_only): + # GH#46560 + kernel = arithmetic_win_operators + df = DataFrame({"a": [1], "b": 2, "c": 3}) + df["c"] = df["c"].astype(object) + ewm = df.ewm(span=2, min_periods=1) + op = getattr(ewm, kernel, None) + if op is not None: + result = op(numeric_only=numeric_only) + + columns = ["a", "b"] if numeric_only else ["a", "b", "c"] + expected = df[columns].agg([kernel]).reset_index(drop=True).astype(float) + assert list(expected.columns) == columns + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [int, object]) +def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype): + # GH#46560 + kernel = arithmetic_win_operators + ser = Series([1], dtype=dtype) + ewm = ser.ewm(span=2, min_periods=1) + op = getattr(ewm, kernel, None) + if op is None: + # Nothing to test + return + if numeric_only and dtype is object: + msg = f"ExponentialMovingWindow.{kernel} does not implement numeric_only" + with pytest.raises(NotImplementedError, match=msg): + op(numeric_only=numeric_only) + else: + result = op(numeric_only=numeric_only) + expected = ser.agg([kernel]).reset_index(drop=True).astype(float) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 7ba81e84dfe3e..7f04593846fc1 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -651,3 +651,37 @@ def mean_w_arg(x, const): result = df.expanding().apply(mean_w_arg, raw=raw, kwargs={"const": 20}) tm.assert_frame_equal(result, expected) + + +def test_numeric_only_frame(arithmetic_win_operators, numeric_only): + # GH#46560 + kernel = arithmetic_win_operators + df = DataFrame({"a": [1], "b": 2, "c": 3}) + df["c"] = df["c"].astype(object) + expanding = df.expanding() + op = getattr(expanding, kernel, None) + if op is not None: + result = op(numeric_only=numeric_only) + + columns = ["a", "b"] if numeric_only else ["a", "b", "c"] + expected = df[columns].agg([kernel]).reset_index(drop=True).astype(float) + assert list(expected.columns) == columns + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [int, object]) +def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype): + # GH#46560 + kernel = arithmetic_win_operators + ser = Series([1], dtype=dtype) + expanding = ser.expanding() + op = getattr(expanding, kernel) + if numeric_only and dtype is object: + msg = f"Expanding.{kernel} does not implement numeric_only" + with pytest.raises(NotImplementedError, match=msg): + op(numeric_only=numeric_only) + else: + result = op(numeric_only=numeric_only) + expected = ser.agg([kernel]).reset_index(drop=True).astype(float) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 4c26cfb95fd85..6a45f5f7bb1da 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1871,3 +1871,36 @@ def test_rolling_skew_kurt_floating_artifacts(): assert (result[-2:] == 0).all() result = r.kurt() assert (result[-2:] == -3).all() + + +def test_numeric_only_frame(arithmetic_win_operators, numeric_only): + # GH#46560 + kernel = arithmetic_win_operators + df = DataFrame({"a": [1], "b": 2, "c": 3}) + df["c"] = df["c"].astype(object) + rolling = df.rolling(2, min_periods=1) + op = getattr(rolling, kernel) + result = op(numeric_only=numeric_only) + + columns = ["a", "b"] if numeric_only else ["a", "b", "c"] + expected = df[columns].agg([kernel]).reset_index(drop=True).astype(float) + assert list(expected.columns) == columns + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [int, object]) +def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype): + # GH#46560 + kernel = arithmetic_win_operators + ser = Series([1], dtype=dtype) + rolling = ser.rolling(2, min_periods=1) + op = getattr(rolling, kernel) + if numeric_only and dtype is object: + msg = f"Rolling.{kernel} does not implement numeric_only" + with pytest.raises(NotImplementedError, match=msg): + op(numeric_only=numeric_only) + else: + result = op(numeric_only=numeric_only) + expected = ser.agg([kernel]).reset_index(drop=True).astype(float) + tm.assert_series_equal(result, expected) From 5f736c2144ba5b5fa321e0871fa992830886429b Mon Sep 17 00:00:00 2001 From: richard Date: Tue, 7 Jun 2022 22:13:21 -0400 Subject: [PATCH 2/2] Fix corr/cov for Series; add tests --- pandas/core/window/ewm.py | 4 +++ pandas/core/window/rolling.py | 4 ++- pandas/tests/window/test_ewm.py | 46 +++++++++++++++++++++++++++ pandas/tests/window/test_expanding.py | 46 +++++++++++++++++++++++++++ pandas/tests/window/test_rolling.py | 46 +++++++++++++++++++++++++++ 5 files changed, 145 insertions(+), 1 deletion(-) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index bceb4622152f1..a153761f377b3 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -754,6 +754,8 @@ def cov( ): from pandas import Series + self._validate_numeric_only("cov", numeric_only) + def cov_func(x, y): x_array = self._prep_values(x) y_array = self._prep_values(y) @@ -825,6 +827,8 @@ def corr( ): from pandas import Series + self._validate_numeric_only("corr", numeric_only) + def cov_func(x, y): x_array = self._prep_values(x) y_array = self._prep_values(y) diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 0ca8557cfb8ea..4d506fbf896b6 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -301,7 +301,7 @@ def _create_data(self, obj: NDFrameT, numeric_only: bool = False) -> NDFrameT: # filter out the on from the object if self.on is not None and not isinstance(self.on, Index) and obj.ndim == 2: obj = obj.reindex(columns=obj.columns.difference([self.on]), copy=False) - if numeric_only or self.axis == 1: + if obj.ndim > 1 and (numeric_only or self.axis == 1): # GH: 20649 in case of mixed dtype and axis=1 we have to convert everything # to float to calculate the complete row at once. We exclude all non-numeric # dtypes. @@ -1706,6 +1706,7 @@ def cov( ): if self.step is not None: raise NotImplementedError("step not implemented for cov") + self._validate_numeric_only("cov", numeric_only) from pandas import Series @@ -1753,6 +1754,7 @@ def corr( ): if self.step is not None: raise NotImplementedError("step not implemented for corr") + self._validate_numeric_only("corr", numeric_only) from pandas import Series diff --git a/pandas/tests/window/test_ewm.py b/pandas/tests/window/test_ewm.py index 83b86fba6432b..e0051ee6d51c6 100644 --- a/pandas/tests/window/test_ewm.py +++ b/pandas/tests/window/test_ewm.py @@ -685,6 +685,28 @@ def test_numeric_only_frame(arithmetic_win_operators, numeric_only): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("kernel", ["corr", "cov"]) +@pytest.mark.parametrize("use_arg", [True, False]) +def test_numeric_only_corr_cov_frame(kernel, numeric_only, use_arg): + # GH#46560 + df = DataFrame({"a": [1, 2, 3], "b": 2, "c": 3}) + df["c"] = df["c"].astype(object) + arg = (df,) if use_arg else () + ewm = df.ewm(span=2, min_periods=1) + op = getattr(ewm, kernel) + result = op(*arg, numeric_only=numeric_only) + + # Compare result to op using float dtypes, dropping c when numeric_only is True + columns = ["a", "b"] if numeric_only else ["a", "b", "c"] + df2 = df[columns].astype(float) + arg2 = (df2,) if use_arg else () + ewm2 = df2.ewm(span=2, min_periods=1) + op2 = getattr(ewm2, kernel) + expected = op2(*arg2, numeric_only=numeric_only) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", [int, object]) def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype): # GH#46560 @@ -703,3 +725,27 @@ def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype): result = op(numeric_only=numeric_only) expected = ser.agg([kernel]).reset_index(drop=True).astype(float) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("kernel", ["corr", "cov"]) +@pytest.mark.parametrize("use_arg", [True, False]) +@pytest.mark.parametrize("dtype", [int, object]) +def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): + # GH#46560 + ser = Series([1, 2, 3], dtype=dtype) + arg = (ser,) if use_arg else () + ewm = ser.ewm(span=2, min_periods=1) + op = getattr(ewm, kernel) + if numeric_only and dtype is object: + msg = f"ExponentialMovingWindow.{kernel} does not implement numeric_only" + with pytest.raises(NotImplementedError, match=msg): + op(*arg, numeric_only=numeric_only) + else: + result = op(*arg, numeric_only=numeric_only) + + ser2 = ser.astype(float) + arg2 = (ser2,) if use_arg else () + ewm2 = ser2.ewm(span=2, min_periods=1) + op2 = getattr(ewm2, kernel) + expected = op2(*arg2, numeric_only=numeric_only) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_expanding.py b/pandas/tests/window/test_expanding.py index 7f04593846fc1..e0c9294c445f2 100644 --- a/pandas/tests/window/test_expanding.py +++ b/pandas/tests/window/test_expanding.py @@ -670,6 +670,28 @@ def test_numeric_only_frame(arithmetic_win_operators, numeric_only): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("kernel", ["corr", "cov"]) +@pytest.mark.parametrize("use_arg", [True, False]) +def test_numeric_only_corr_cov_frame(kernel, numeric_only, use_arg): + # GH#46560 + df = DataFrame({"a": [1, 2, 3], "b": 2, "c": 3}) + df["c"] = df["c"].astype(object) + arg = (df,) if use_arg else () + expanding = df.expanding() + op = getattr(expanding, kernel) + result = op(*arg, numeric_only=numeric_only) + + # Compare result to op using float dtypes, dropping c when numeric_only is True + columns = ["a", "b"] if numeric_only else ["a", "b", "c"] + df2 = df[columns].astype(float) + arg2 = (df2,) if use_arg else () + expanding2 = df2.expanding() + op2 = getattr(expanding2, kernel) + expected = op2(*arg2, numeric_only=numeric_only) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", [int, object]) def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype): # GH#46560 @@ -685,3 +707,27 @@ def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype): result = op(numeric_only=numeric_only) expected = ser.agg([kernel]).reset_index(drop=True).astype(float) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("kernel", ["corr", "cov"]) +@pytest.mark.parametrize("use_arg", [True, False]) +@pytest.mark.parametrize("dtype", [int, object]) +def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): + # GH#46560 + ser = Series([1, 2, 3], dtype=dtype) + arg = (ser,) if use_arg else () + expanding = ser.expanding() + op = getattr(expanding, kernel) + if numeric_only and dtype is object: + msg = f"Expanding.{kernel} does not implement numeric_only" + with pytest.raises(NotImplementedError, match=msg): + op(*arg, numeric_only=numeric_only) + else: + result = op(*arg, numeric_only=numeric_only) + + ser2 = ser.astype(float) + arg2 = (ser2,) if use_arg else () + expanding2 = ser2.expanding() + op2 = getattr(expanding2, kernel) + expected = op2(*arg2, numeric_only=numeric_only) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 6a45f5f7bb1da..785603f6e05f0 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -1889,6 +1889,28 @@ def test_numeric_only_frame(arithmetic_win_operators, numeric_only): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("kernel", ["corr", "cov"]) +@pytest.mark.parametrize("use_arg", [True, False]) +def test_numeric_only_corr_cov_frame(kernel, numeric_only, use_arg): + # GH#46560 + df = DataFrame({"a": [1, 2, 3], "b": 2, "c": 3}) + df["c"] = df["c"].astype(object) + arg = (df,) if use_arg else () + rolling = df.rolling(2, min_periods=1) + op = getattr(rolling, kernel) + result = op(*arg, numeric_only=numeric_only) + + # Compare result to op using float dtypes, dropping c when numeric_only is True + columns = ["a", "b"] if numeric_only else ["a", "b", "c"] + df2 = df[columns].astype(float) + arg2 = (df2,) if use_arg else () + rolling2 = df2.rolling(2, min_periods=1) + op2 = getattr(rolling2, kernel) + expected = op2(*arg2, numeric_only=numeric_only) + + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", [int, object]) def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype): # GH#46560 @@ -1904,3 +1926,27 @@ def test_numeric_only_series(arithmetic_win_operators, numeric_only, dtype): result = op(numeric_only=numeric_only) expected = ser.agg([kernel]).reset_index(drop=True).astype(float) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("kernel", ["corr", "cov"]) +@pytest.mark.parametrize("use_arg", [True, False]) +@pytest.mark.parametrize("dtype", [int, object]) +def test_numeric_only_corr_cov_series(kernel, use_arg, numeric_only, dtype): + # GH#46560 + ser = Series([1, 2, 3], dtype=dtype) + arg = (ser,) if use_arg else () + rolling = ser.rolling(2, min_periods=1) + op = getattr(rolling, kernel) + if numeric_only and dtype is object: + msg = f"Rolling.{kernel} does not implement numeric_only" + with pytest.raises(NotImplementedError, match=msg): + op(*arg, numeric_only=numeric_only) + else: + result = op(*arg, numeric_only=numeric_only) + + ser2 = ser.astype(float) + arg2 = (ser2,) if use_arg else () + rolling2 = ser2.rolling(2, min_periods=1) + op2 = getattr(rolling2, kernel) + expected = op2(*arg2, numeric_only=numeric_only) + tm.assert_series_equal(result, expected)