From fa750f5fdaafb39b84de4a57c08bd856da1c0b6b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Mar 2023 08:57:41 -0700 Subject: [PATCH 1/2] REF: use WrappedCythonOp for GroupBy.std, sem --- pandas/_libs/groupby.pyi | 1 + pandas/_libs/groupby.pyx | 12 +++- pandas/core/groupby/groupby.py | 95 ++++++----------------------- pandas/core/groupby/ops.py | 23 +++++-- pandas/tests/groupby/test_raises.py | 20 ++++-- 5 files changed, 64 insertions(+), 87 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index e3ca9c44d5664..1a69f42296aa3 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -86,6 +86,7 @@ def group_var( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., + name: str = ..., ) -> None: ... def group_mean( out: np.ndarray, # floating[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 0c378acbc6dc3..ead49151db066 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -3,6 +3,7 @@ from cython cimport ( Py_ssize_t, floating, ) +from libc.math cimport sqrt from libc.stdlib cimport ( free, malloc, @@ -819,6 +820,7 @@ def group_var( const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, + str name="var", ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) @@ -827,6 +829,8 @@ def group_var( int64_t[:, ::1] nobs Py_ssize_t len_values = len(values), len_labels = len(labels) bint isna_entry, uses_mask = mask is not None + bint is_std = name == "std" + bint is_sem = name == "sem" assert min_count == -1, "'min_count' only used in sum and prod" @@ -876,7 +880,13 @@ def group_var( else: out[i, j] = NAN else: - out[i, j] /= (ct - ddof) + if is_std: + out[i, j] = sqrt(out[i, j] / (ct - ddof)) + elif is_sem: + out[i, j] = sqrt(out[i, j] / (ct - ddof) / ct) + else: + # just "var" + out[i, j] /= (ct - ddof) @cython.wraparound(False) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 96f39bb99e544..6d24d73f68b97 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -95,10 +95,8 @@ class providing the base-class of operations. 
BaseMaskedArray, BooleanArray, Categorical, - DatetimeArray, ExtensionArray, FloatingArray, - TimedeltaArray, ) from pandas.core.base import ( PandasObject, @@ -1933,32 +1931,12 @@ def std( return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof)) else: - - def _preprocessing(values): - if isinstance(values, BaseMaskedArray): - return values._data, None - return values, None - - def _postprocessing( - vals, inference, nullable: bool = False, result_mask=None - ) -> ArrayLike: - if nullable: - if result_mask.ndim == 2: - result_mask = result_mask[:, 0] - return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_)) - return np.sqrt(vals) - - result = self._get_cythonized_result( - libgroupby.group_var, - cython_dtype=np.dtype(np.float64), + return self._cython_agg_general( + "std", + alt=lambda x: Series(x).std(ddof=ddof), numeric_only=numeric_only, - needs_counts=True, - pre_processing=_preprocessing, - post_processing=_postprocessing, ddof=ddof, - how="std", ) - return result @final @Substitution(name="groupby") @@ -2185,18 +2163,12 @@ def sem(self, ddof: int = 1, numeric_only: bool = False): f"{type(self).__name__}.sem called with " f"numeric_only={numeric_only} and dtype {self.obj.dtype}" ) - result = self.std(ddof=ddof, numeric_only=numeric_only) - - if result.ndim == 1: - result /= np.sqrt(self.count()) - else: - cols = result.columns.difference(self.exclusions).unique() - counts = self.count() - result_ilocs = result.columns.get_indexer_for(cols) - count_ilocs = counts.columns.get_indexer_for(cols) - - result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs]) - return result + return self._cython_agg_general( + "sem", + alt=lambda x: Series(x).sem(ddof=ddof), + numeric_only=numeric_only, + ddof=ddof, + ) @final @Substitution(name="groupby") @@ -3630,7 +3602,6 @@ def _get_cythonized_result( base_func: Callable, cython_dtype: np.dtype, numeric_only: bool = False, - needs_counts: bool = False, pre_processing=None, post_processing=None, how: str = "any_all", @@ -3646,8 +3617,6 @@ def _get_cythonized_result( Type of the array that will be modified by the Cython call. numeric_only : bool, default False Whether only numeric datatypes should be computed - needs_counts : bool, default False - Whether the counts should be a part of the Cython call pre_processing : function, default None Function to be applied to `values` prior to passing to Cython. 
Function should return a tuple where the first element is the @@ -3694,14 +3663,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: inferences = None - if needs_counts: - counts = np.zeros(ngroups, dtype=np.int64) - func = partial(func, counts=counts) - - is_datetimelike = values.dtype.kind in ["m", "M"] vals = values - if is_datetimelike and how == "std": - vals = vals.view("i8") if pre_processing: vals, inferences = pre_processing(vals) @@ -3710,46 +3672,23 @@ def blk_func(values: ArrayLike) -> ArrayLike: vals = vals.reshape((-1, 1)) func = partial(func, values=vals) - if how != "std" or isinstance(values, BaseMaskedArray): - mask = isna(values).view(np.uint8) - if mask.ndim == 1: - mask = mask.reshape(-1, 1) - func = partial(func, mask=mask) - - if how != "std": - is_nullable = isinstance(values, BaseMaskedArray) - func = partial(func, nullable=is_nullable) + mask = isna(values).view(np.uint8) + if mask.ndim == 1: + mask = mask.reshape(-1, 1) + func = partial(func, mask=mask) - elif isinstance(values, BaseMaskedArray): - result_mask = np.zeros(result.shape, dtype=np.bool_) - func = partial(func, result_mask=result_mask) + is_nullable = isinstance(values, BaseMaskedArray) + func = partial(func, nullable=is_nullable) # Call func to modify result in place - if how == "std": - func(**kwargs, is_datetimelike=is_datetimelike) - else: - func(**kwargs) + func(**kwargs) if values.ndim == 1: assert result.shape[1] == 1, result.shape result = result[:, 0] if post_processing: - pp_kwargs: dict[str, bool | np.ndarray] = {} - pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray) - if how == "std" and pp_kwargs["nullable"]: - pp_kwargs["result_mask"] = result_mask - - result = post_processing(result, inferences, **pp_kwargs) - - if how == "std" and is_datetimelike: - values = cast("DatetimeArray | TimedeltaArray", values) - unit = values.unit - with warnings.catch_warnings(): - # suppress "RuntimeWarning: invalid value encountered in cast" - warnings.filterwarnings("ignore") - result = result.astype(np.int64, copy=False) - result = result.view(f"m8[{unit}]") + result = post_processing(result, inferences, nullable=is_nullable) return result.T diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 0acc7fe29b5db..22d8bad6b5225 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -131,6 +131,8 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: "mean": "group_mean", "median": "group_median_float64", "var": "group_var", + "std": functools.partial(libgroupby.group_var, name="std"), + "sem": functools.partial(libgroupby.group_var, name="sem"), "first": "group_nth", "last": "group_last", "ohlc": "group_ohlc", @@ -158,7 +160,10 @@ def _get_cython_function( # see if there is a fused-type version of function # only valid for numeric - f = getattr(libgroupby, ftype) + if callable(ftype): + f = ftype + else: + f = getattr(libgroupby, ftype) if is_numeric: return f elif dtype == np.dtype(object): @@ -168,6 +173,9 @@ def _get_cython_function( f"function is not implemented for this dtype: " f"[how->{how},dtype->{dtype_str}]" ) + elif how in ["std", "sem"]: + # We have a partial object that does not have __signatures__ + return f if "object" not in f.__signatures__: # raise NotImplementedError here rather than TypeError later raise NotImplementedError( @@ -196,7 +204,7 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ how = self.how - if how == "median": + if how in ["median", "std", "sem"]: # median only has a float64 
implementation # We should only get here with is_numeric, as non-numeric cases # should raise in _get_cython_function @@ -314,7 +322,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: if how in ["sum", "cumsum", "sum", "prod", "cumprod"]: if dtype == np.dtype(bool): return np.dtype(np.int64) - elif how in ["mean", "median", "var"]: + elif how in ["mean", "median", "var", "std", "sem"]: if is_float_dtype(dtype) or is_complex_dtype(dtype): return dtype elif is_numeric_dtype(dtype): @@ -413,6 +421,11 @@ def _reconstruct_ea_result( elif isinstance(values, (DatetimeArray, TimedeltaArray, PeriodArray)): # In to_cython_values we took a view as M8[ns] assert res_values.dtype == "M8[ns]" + if self.how in ["std", "sem"]: + new_dtype = f"m8[{values.unit}]" + res_values = res_values.view(new_dtype) + return TimedeltaArray(res_values) + res_values = res_values.view(values._ndarray.dtype) return values._from_backing_data(res_values) @@ -556,7 +569,9 @@ def _call_cython_op( result_mask=result_mask, is_datetimelike=is_datetimelike, ) - elif self.how in ["var", "ohlc", "prod", "median"]: + elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]: + if self.how in ["std", "sem"]: + kwargs["is_datetimelike"] = is_datetimelike func( result, counts, diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index f29e2545cde14..60d4f98aa33f7 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -399,14 +399,20 @@ def test_groupby_raises_category( "prod": (TypeError, "category type does not support prod operations"), "quantile": (TypeError, "No matching signature found"), "rank": (None, ""), - "sem": (ValueError, "Cannot cast object dtype to float64"), + "sem": ( + TypeError, + "'Categorical' with dtype category does not support reduction 'sem'", + ), "shift": (None, ""), "size": (None, ""), "skew": ( TypeError, "'Categorical' with dtype category does not support reduction 'skew'", ), - "std": (ValueError, "Cannot cast object dtype to float64"), + "std": ( + TypeError, + "'Categorical' with dtype category does not support reduction 'std'", + ), "sum": (TypeError, "category type does not support sum operations"), "var": ( TypeError, @@ -594,14 +600,20 @@ def test_groupby_raises_category_on_category( "prod": (TypeError, "category type does not support prod operations"), "quantile": (TypeError, ""), "rank": (None, ""), - "sem": (ValueError, "Cannot cast object dtype to float64"), + "sem": ( + TypeError, + "'Categorical' with dtype category does not support reduction 'sem'", + ), "shift": (None, ""), "size": (None, ""), "skew": ( TypeError, "'Categorical' with dtype category does not support reduction 'skew'", ), - "std": (ValueError, "Cannot cast object dtype to float64"), + "std": ( + TypeError, + "'Categorical' with dtype category does not support reduction 'std'", + ), "sum": (TypeError, "category type does not support sum operations"), "var": ( TypeError, From 59ba0824c0eb3c6bde6bf45fa4cb6c7879c8a9d0 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 17 Mar 2023 10:15:37 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/core/groupby/ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 22d8bad6b5225..e4ae25ff26207 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -122,7 +122,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None: self.how = how self.has_dropped_na = has_dropped_na - _CYTHON_FUNCTIONS 
= { + _CYTHON_FUNCTIONS: dict[str, dict] = { "aggregate": { "sum": "group_sum", "prod": "group_prod", @@ -422,6 +422,8 @@ def _reconstruct_ea_result( # In to_cython_values we took a view as M8[ns] assert res_values.dtype == "M8[ns]" if self.how in ["std", "sem"]: + if isinstance(values, PeriodArray): + raise TypeError("'std' and 'sem' are not valid for PeriodDtype") new_dtype = f"m8[{values.unit}]" res_values = res_values.view(new_dtype) return TimedeltaArray(res_values)
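Not part of the patch itself: the short sketch below (plain pandas/NumPy, assuming a build that includes this change) illustrates the identities the reworked group_var kernel encodes -- std = sqrt(SS / (count - ddof)) and sem = sqrt(SS / (count - ddof) / count), i.e. sem == std / sqrt(count) -- which is why a single Cython kernel can serve var, std, and sem via the added "name" argument.

# Sketch only, not from the patch: verify GroupBy.std/sem against the
# var-based identities the is_std / is_sem branches in group_var implement.
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["a", "a", "b", "b", "b"],
        "val": [1.0, 3.0, 2.0, 4.0, 9.0],
    }
)
gb = df.groupby("key")["val"]

# std is the square root of var (same ddof): the is_std branch.
pd.testing.assert_series_equal(gb.std(ddof=1), np.sqrt(gb.var(ddof=1)))

# sem additionally divides by the group count before the square root,
# equivalently std / sqrt(count): the is_sem branch.
pd.testing.assert_series_equal(gb.sem(ddof=1), gb.std(ddof=1) / np.sqrt(gb.count()))

Because both reductions reuse the group_var loop, they inherit its mask/result_mask handling for nullable dtypes, which is what lets the deleted _get_cythonized_result special cases in groupby.py go away.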