From 507f230be5bffbb6bc640ca2b5405895a2a80652 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 1 Mar 2022 15:39:27 -0800 Subject: [PATCH 1/7] REF: simplify groupby.ops --- pandas/core/groupby/ops.py | 79 +++++++++++++++++++++----------------- 1 file changed, 44 insertions(+), 35 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 132c1d11dfe73..8a2f41c1b19b0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -123,14 +123,14 @@ def __init__(self, kind: str, how: str): "min": "group_min", "max": "group_max", "mean": "group_mean", - "median": "group_median", + "median": "group_median_float64", "var": "group_var", "first": "group_nth", "last": "group_last", "ohlc": "group_ohlc", }, "transform": { - "cumprod": "group_cumprod", + "cumprod": "group_cumprod_float64", "cumsum": "group_cumsum", "cummin": "group_cummin", "cummax": "group_cummax", @@ -161,52 +161,54 @@ def _get_cython_function( if is_numeric: return f elif dtype == object: - if "object" not in f.__signatures__: + if how in ["median", "cumprod"]: + # no fused types -> no __signatures__ + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" + ) + elif "object" not in f.__signatures__: # raise NotImplementedError here rather than TypeError later raise NotImplementedError( f"function is not implemented for this dtype: " f"[how->{how},dtype->{dtype_str}]" ) return f + else: + raise NotImplementedError( + "This should not be reached. Please report a bug at " + "github.com/pandas-dev/pandas/", + dtype, + ) - def get_cython_func_and_vals(self, values: np.ndarray, is_numeric: bool): + def _get_cython_vals(self, values: np.ndarray) -> np.ndarray: """ - Find the appropriate cython function, casting if necessary. + Cast numeric dtypes to float64 for functions that only support that. Parameters ---------- values : np.ndarray - is_numeric : bool Returns ------- - func : callable values : np.ndarray """ how = self.how - kind = self.kind if how in ["median", "cumprod"]: # these two only have float64 implementations - if is_numeric: - values = ensure_float64(values) - else: - raise NotImplementedError( - f"function is not implemented for this dtype: " - f"[how->{how},dtype->{values.dtype.name}]" - ) - func = getattr(libgroupby, f"group_{how}_float64") - return func, values - - func = self._get_cython_function(kind, how, values.dtype, is_numeric) + # We should only get here with is_numeric, as non-numeric cases + # should raise in _get_cython_function + values = ensure_float64(values) - if values.dtype.kind in ["i", "u"]: + elif values.dtype.kind in ["i", "u"]: if how in ["add", "var", "prod", "mean", "ohlc"]: # result may still include NaN, so we have to cast values = ensure_float64(values) - return func, values + return values + # TODO: general case implementation overridable by EAs. def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): """ Check if we can do this operation with our cython functions. @@ -235,6 +237,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False): # are not setup for dim transforming raise NotImplementedError(f"{dtype} dtype not supported") elif is_datetime64_any_dtype(dtype): + # TODO: same for period_dtype? no for these methods with Period # we raise NotImplemented if this is an invalid operation # entirely, e.g. adding datetimes if how in ["add", "prod", "cumsum", "cumprod"]: @@ -262,7 +265,7 @@ def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape: out_shape = (ngroups,) + values.shape[1:] return out_shape - def get_out_dtype(self, dtype: np.dtype) -> np.dtype: + def _get_out_dtype(self, dtype: np.dtype) -> np.dtype: how = self.how if how == "rank": @@ -282,6 +285,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype: def _get_result_dtype(self, dtype: ExtensionDtype) -> ExtensionDtype: ... # pragma: no cover + # TODO: general case implementation overridable by EAs. def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj: """ Get the desired dtype of a result based on the @@ -329,7 +333,6 @@ def _ea_wrap_cython_operation( If we have an ExtensionArray, unwrap, call _cython_operation, and re-wrap if appropriate. """ - # TODO: general case implementation overridable by EAs. if isinstance(values, BaseMaskedArray) and self.uses_mask(): return self._masked_ea_wrap_cython_operation( values, @@ -357,7 +360,8 @@ def _ea_wrap_cython_operation( return self._reconstruct_ea_result(values, res_values) - def _ea_to_cython_values(self, values: ExtensionArray): + # TODO: general case implementation overridable by EAs. + def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray: # GH#43682 if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)): # All of the functions implemented here are ordinal, so we can @@ -378,15 +382,15 @@ def _ea_to_cython_values(self, values: ExtensionArray): ) return npvalues - def _reconstruct_ea_result(self, values, res_values): + # TODO: general case implementation overridable by EAs. + def _reconstruct_ea_result( + self, values: ExtensionArray, res_values: np.ndarray + ) -> ExtensionArray: """ Construct an ExtensionArray result from an ndarray result. """ - # TODO: allow EAs to override this logic - if isinstance( - values.dtype, (BooleanDtype, IntegerDtype, FloatingDtype, StringDtype) - ): + if isinstance(values.dtype, (BaseMaskedDtype, StringDtype)): dtype = self._get_result_dtype(values.dtype) cls = dtype.construct_array_type() return cls._from_sequence(res_values, dtype=dtype) @@ -429,13 +433,17 @@ def _masked_ea_wrap_cython_operation( ) dtype = self._get_result_dtype(orig_values.dtype) - assert isinstance(dtype, BaseMaskedDtype) - cls = dtype.construct_array_type() + assert res_values.dtype == dtype.type, (res_values.dtype, dtype) + # TODO: avoid cast as res_values *should* already have the right + # dtype; last attempt ran into trouble on 32bit linux build + # res_values = res_values.astype(dtype.type, copy=False) if self.kind != "aggregate": - return cls(res_values.astype(dtype.type, copy=False), mask) + out_mask = mask else: - return cls(res_values.astype(dtype.type, copy=False), result_mask) + out_mask = result_mask + + return orig_values._maybe_mask_result(res_values, out_mask) @final def _cython_op_ndim_compat( @@ -521,8 +529,9 @@ def _call_cython_op( result_mask = result_mask.T out_shape = self._get_output_shape(ngroups, values) - func, values = self.get_cython_func_and_vals(values, is_numeric) - out_dtype = self.get_out_dtype(values.dtype) + func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric) + values = self._get_cython_vals(values) + out_dtype = self._get_out_dtype(values.dtype) result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) if self.kind == "aggregate": From a68b093943ddf23dcaad5f59ed68e8b0339998e1 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 1 Mar 2022 16:45:17 -0800 Subject: [PATCH 2/7] mypy fixup --- pandas/core/groupby/ops.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8a2f41c1b19b0..7d8901308db52 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -398,7 +398,8 @@ def _reconstruct_ea_result( elif needs_i8_conversion(values.dtype): assert res_values.dtype.kind != "f" # just to be on the safe side i8values = res_values.view("i8") - return type(values)(i8values, dtype=values.dtype) + # error: Too many arguments for "ExtensionArray" + return type(values)(i8values, dtype=values.dtype) # type: ignore[call-arg] raise NotImplementedError From 03c3c803ac2641696b1aa8df53f18560b341fd23 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Mar 2022 08:56:13 -0800 Subject: [PATCH 3/7] troubleshoot 32bit build --- pandas/_libs/groupby.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d250a69c1985b..48e1d9b8cf283 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1141,6 +1141,11 @@ def group_nth( if uses_mask: isna_entry = mask[i, j] + if isna_entry: + # set out[i, j] to 0 to be deterministic, as + # it was initialized with np.empty. Also ensures + # we can downcast out if appropriate. + out[i, j] = 0 else: isna_entry = checknull(val) From 36720aac48d697952004d9b5cf1cd4ebe3b7d9d1 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Mar 2022 09:57:15 -0800 Subject: [PATCH 4/7] troubleshoot 32bit build --- pandas/core/groupby/ops.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7d8901308db52..a4ef3a3cea6fd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -607,6 +607,14 @@ def _call_cython_op( # "rank" is the only member of cast_blocklist we get here res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) + if self.how == "first": + # troubleshooting 32bit linux build + assert res_dtype == orig_values.dtype, (res_dtype, orig_values.dtype) + assert op_result.dtype == res_dtype, ( + op_result.dtype, + res_dtype, + result, + ) else: op_result = result From 365733e2841aba11b10e31a64240c8f1220048b2 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Mar 2022 11:09:18 -0800 Subject: [PATCH 5/7] troubleshoot 32bit build --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index a4ef3a3cea6fd..51c486aedd0b5 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -607,7 +607,7 @@ def _call_cython_op( # "rank" is the only member of cast_blocklist we get here res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) - if self.how == "first": + if self.how == "first" and result_mask is not None and self.uses_mask(): # troubleshooting 32bit linux build assert res_dtype == orig_values.dtype, (res_dtype, orig_values.dtype) assert op_result.dtype == res_dtype, ( From 6e0efdd2bc6e979402ff0435328161ff22abf061 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Mar 2022 13:05:30 -0800 Subject: [PATCH 6/7] troubleshoot 32bit linux build --- pandas/_libs/groupby.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 48e1d9b8cf283..4bd4c892195cd 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1176,6 +1176,12 @@ def group_nth( if uses_mask: isna_entry = mask[i, j] + if isna_entry: + # set out[i, j] to 0 to be deterministic, as + # it was initialized with np.empty. Also ensures + # we can downcast out if appropriate. + out[i, j] = 0 + else: isna_entry = _treat_as_na(val, True) # TODO: Sure we always want is_datetimelike=True? From b4cf263b56a726f33aea5a21c6a428dc6c459c3f Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 2 Mar 2022 18:00:15 -0800 Subject: [PATCH 7/7] punt on 32bit build --- pandas/_libs/groupby.pyx | 11 ----------- pandas/core/groupby/ops.py | 11 +---------- 2 files changed, 1 insertion(+), 21 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 4bd4c892195cd..d250a69c1985b 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1141,11 +1141,6 @@ def group_nth( if uses_mask: isna_entry = mask[i, j] - if isna_entry: - # set out[i, j] to 0 to be deterministic, as - # it was initialized with np.empty. Also ensures - # we can downcast out if appropriate. - out[i, j] = 0 else: isna_entry = checknull(val) @@ -1176,12 +1171,6 @@ def group_nth( if uses_mask: isna_entry = mask[i, j] - if isna_entry: - # set out[i, j] to 0 to be deterministic, as - # it was initialized with np.empty. Also ensures - # we can downcast out if appropriate. - out[i, j] = 0 - else: isna_entry = _treat_as_na(val, True) # TODO: Sure we always want is_datetimelike=True? diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 51c486aedd0b5..e984279c17a65 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -434,10 +434,9 @@ def _masked_ea_wrap_cython_operation( ) dtype = self._get_result_dtype(orig_values.dtype) - assert res_values.dtype == dtype.type, (res_values.dtype, dtype) # TODO: avoid cast as res_values *should* already have the right # dtype; last attempt ran into trouble on 32bit linux build - # res_values = res_values.astype(dtype.type, copy=False) + res_values = res_values.astype(dtype.type, copy=False) if self.kind != "aggregate": out_mask = mask @@ -607,14 +606,6 @@ def _call_cython_op( # "rank" is the only member of cast_blocklist we get here res_dtype = self._get_result_dtype(orig_values.dtype) op_result = maybe_downcast_to_dtype(result, res_dtype) - if self.how == "first" and result_mask is not None and self.uses_mask(): - # troubleshooting 32bit linux build - assert res_dtype == orig_values.dtype, (res_dtype, orig_values.dtype) - assert op_result.dtype == res_dtype, ( - op_result.dtype, - res_dtype, - result, - ) else: op_result = result