From 64c6ef2463cb017492f8e6701921b97eec123feb Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 27 Mar 2021 14:36:00 -0700 Subject: [PATCH 1/6] merge --- pandas/_libs/groupby.pyx | 18 +++--------------- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/ops.py | 4 ++-- pandas/tests/groupby/test_libgroupby.py | 7 +++---- 4 files changed, 9 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index e23fa9b82f12e..8e3f72e8aa808 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -514,7 +514,7 @@ group_add_complex128 = _group_add['double complex'] @cython.wraparound(False) @cython.boundscheck(False) -def _group_prod(floating[:, ::1] out, +def group_prod(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[:] labels, @@ -560,14 +560,10 @@ def _group_prod(floating[:, ::1] out, out[i, j] = prodx[i, j] -group_prod_float32 = _group_prod['float'] -group_prod_float64 = _group_prod['double'] - - @cython.wraparound(False) @cython.boundscheck(False) @cython.cdivision(True) -def _group_var(floating[:, ::1] out, +def group_var(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[:] labels, @@ -619,13 +615,9 @@ def _group_var(floating[:, ::1] out, out[i, j] /= (ct - ddof) -group_var_float32 = _group_var['float'] -group_var_float64 = _group_var['double'] - - @cython.wraparound(False) @cython.boundscheck(False) -def _group_mean(floating[:, ::1] out, +def group_mean(floating[:, ::1] out, int64_t[::1] counts, ndarray[floating, ndim=2] values, const intp_t[::1] labels, @@ -675,10 +667,6 @@ def _group_mean(floating[:, ::1] out, out[i, j] = sumx[i, j] / count -group_mean_float32 = _group_mean['float'] -group_mean_float64 = _group_mean['double'] - - @cython.wraparound(False) @cython.boundscheck(False) def group_ohlc(floating[:, ::1] out, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1ee38834c5758..d5a97eb9011b0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1602,7 +1602,7 @@ def std(self, ddof: int = 1): Standard deviation of values within each group. """ return self._get_cythonized_result( - "group_var_float64", + "group_var", aggregate=True, needs_counts=True, needs_values=True, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 99b9aea4f82df..023531cac430b 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -488,8 +488,8 @@ def _get_cython_func_and_vals( raise else: if values.dtype.kind in ["i", "u"]: - if how in ["ohlc"]: - # The output may still include nans, so we have to cast + if how in ["add", "var", "prod", "mean", "ohlc"]: + # For OHLC the output may still include nans, so we have to cast values = ensure_float64(values) return func, values diff --git a/pandas/tests/groupby/test_libgroupby.py b/pandas/tests/groupby/test_libgroupby.py index d776c34f5b5ec..7a9cadb6c8232 100644 --- a/pandas/tests/groupby/test_libgroupby.py +++ b/pandas/tests/groupby/test_libgroupby.py @@ -4,8 +4,7 @@ from pandas._libs.groupby import ( group_cumprod_float64, group_cumsum, - group_var_float32, - group_var_float64, + group_var, ) from pandas.core.dtypes.common import ensure_platform_int @@ -102,7 +101,7 @@ def test_group_var_constant(self): class TestGroupVarFloat64(GroupVarTestMixin): __test__ = True - algo = staticmethod(group_var_float64) + algo = staticmethod(group_var) dtype = np.float64 rtol = 1e-5 @@ -124,7 +123,7 @@ def test_group_var_large_inputs(self): class TestGroupVarFloat32(GroupVarTestMixin): __test__ = True - algo = staticmethod(group_var_float32) + algo = staticmethod(group_var) dtype = np.float32 rtol = 1e-2 From 9f5689484c8e1139e3207140e6e0ca9e33cb1871 Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 27 Mar 2021 20:45:23 -0700 Subject: [PATCH 2/6] handle ohlc explicitly --- pandas/_libs/groupby.pyx | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 8e3f72e8aa808..cb4d8c51a3974 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -515,10 +515,10 @@ group_add_complex128 = _group_add['double complex'] @cython.wraparound(False) @cython.boundscheck(False) def group_prod(floating[:, ::1] out, - int64_t[::1] counts, - ndarray[floating, ndim=2] values, - const intp_t[:] labels, - Py_ssize_t min_count=0): + int64_t[::1] counts, + ndarray[floating, ndim=2] values, + const intp_t[:] labels, + Py_ssize_t min_count=0): """ Only aggregates on axis=0 """ @@ -564,11 +564,11 @@ def group_prod(floating[:, ::1] out, @cython.boundscheck(False) @cython.cdivision(True) def group_var(floating[:, ::1] out, - int64_t[::1] counts, - ndarray[floating, ndim=2] values, - const intp_t[:] labels, - Py_ssize_t min_count=-1, - int64_t ddof=1): + int64_t[::1] counts, + ndarray[floating, ndim=2] values, + const intp_t[:] labels, + Py_ssize_t min_count=-1, + int64_t ddof=1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, ct, oldmean @@ -618,10 +618,10 @@ def group_var(floating[:, ::1] out, @cython.wraparound(False) @cython.boundscheck(False) def group_mean(floating[:, ::1] out, - int64_t[::1] counts, - ndarray[floating, ndim=2] values, - const intp_t[::1] labels, - Py_ssize_t min_count=-1): + int64_t[::1] counts, + ndarray[floating, ndim=2] values, + const intp_t[::1] labels, + Py_ssize_t min_count=-1): cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) floating val, count, y, t From 6c700a883247bcafeea0effce5f86d3a0db29235 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 28 Mar 2021 11:55:49 -0700 Subject: [PATCH 3/6] REF: better use of fused types for group_add --- pandas/_libs/groupby.pyx | 16 +++++----------- pandas/core/groupby/ops.py | 9 ++++----- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index cb4d8c51a3974..b43a0d2eced93 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -455,11 +455,11 @@ ctypedef fused complexfloating_t: @cython.wraparound(False) @cython.boundscheck(False) -def _group_add(complexfloating_t[:, ::1] out, - int64_t[::1] counts, - ndarray[complexfloating_t, ndim=2] values, - const intp_t[:] labels, - Py_ssize_t min_count=0): +def group_add(complexfloating_t[:, ::1] out, + int64_t[::1] counts, + ndarray[complexfloating_t, ndim=2] values, + const intp_t[:] labels, + Py_ssize_t min_count=0): """ Only aggregates on axis=0 using Kahan summation """ @@ -506,12 +506,6 @@ def _group_add(complexfloating_t[:, ::1] out, out[i, j] = sumx[i, j] -group_add_float32 = _group_add['float32_t'] -group_add_float64 = _group_add['float64_t'] -group_add_complex64 = _group_add['float complex'] -group_add_complex128 = _group_add['double complex'] - - @cython.wraparound(False) @cython.boundscheck(False) def group_prod(floating[:, ::1] out, diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 023531cac430b..c1ef201df363d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -489,7 +489,7 @@ def _get_cython_func_and_vals( else: if values.dtype.kind in ["i", "u"]: if how in ["add", "var", "prod", "mean", "ohlc"]: - # For OHLC the output may still include nans, so we have to cast + # result may still include NaN, so we have to cast values = ensure_float64(values) return func, values @@ -623,10 +623,9 @@ def _cython_operation( values = ensure_float64(values) else: values = ensure_int_or_float(values) - elif is_numeric and not is_complex_dtype(dtype): - values = ensure_float64(values) - else: - values = values.astype(object) + elif is_numeric: + if not is_complex_dtype(dtype): + values = ensure_float64(values) arity = self._cython_arity.get(how, 1) From e3064ef61c7d211c2bdf888412e2f96299cff0af Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 28 Mar 2021 20:44:05 -0700 Subject: [PATCH 4/6] REF: try/except in fewer cases in _get_cython_function --- pandas/core/groupby/ops.py | 56 +++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index c1ef201df363d..ac2127e2604a1 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -128,31 +128,22 @@ def _get_cython_function(kind: str, how: str, dtype: np.dtype, is_numeric: bool) # see if there is a fused-type version of function # only valid for numeric f = getattr(libgroupby, ftype, None) - if f is not None and is_numeric: - return f - - # otherwise find dtype-specific version, falling back to object - for dt in [dtype_str, "object"]: - f2 = getattr(libgroupby, f"{ftype}_{dt}", None) - if f2 is not None: - return f2 - - if hasattr(f, "__signatures__"): - # inspect what fused types are implemented - if dtype_str == "object" and "object" not in f.__signatures__: - # disallow this function so we get a NotImplementedError below - # instead of a TypeError at runtime - f = None - - func = f - - if func is None: - raise NotImplementedError( - f"function is not implemented for this dtype: " - f"[how->{how},dtype->{dtype_str}]" - ) + if f is not None: + if is_numeric: + return f + elif dtype == object: + if "object" not in f.__signatures__: + # raise NotImplementedError here rather than TypeError later + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" + ) + return f - return func + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{dtype_str}]" + ) class BaseGrouper: @@ -472,6 +463,18 @@ def _get_cython_func_and_vals( func : callable values : np.ndarray """ + if how in ["median", "cumprod"]: + # these two only have float64 implementations + if is_numeric: + values = ensure_float64(values) + else: + raise NotImplementedError( + f"function is not implemented for this dtype: " + f"[how->{how},dtype->{values.dtype.name}]" + ) + func = getattr(libgroupby, f"group_{how}_float64") + return func, values + try: func = _get_cython_function(kind, how, values.dtype, is_numeric) except NotImplementedError: @@ -660,7 +663,10 @@ def _cython_operation( if kind == "aggregate": result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) counts = np.zeros(self.ngroups, dtype=np.int64) - result = self._aggregate(result, counts, values, codes, func, min_count) + try: + result = self._aggregate(result, counts, values, codes, func, min_count) + except TypeError as err: + raise TypeError(values.dtype) from err elif kind == "transform": result = maybe_fill(np.empty(values.shape, dtype=out_dtype)) From 6994ce81b097f52fb3e607559f8cfd3b6ee38e95 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 29 Mar 2021 09:44:51 -0700 Subject: [PATCH 5/6] simplify get_cython_function_and_values --- pandas/core/groupby/ops.py | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index ac2127e2604a1..f45a9a8e309c9 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -475,25 +475,12 @@ def _get_cython_func_and_vals( func = getattr(libgroupby, f"group_{how}_float64") return func, values - try: - func = _get_cython_function(kind, how, values.dtype, is_numeric) - except NotImplementedError: - if is_numeric: - try: - values = ensure_float64(values) - except TypeError: - if lib.infer_dtype(values, skipna=False) == "complex": - values = values.astype(complex) - else: - raise - func = _get_cython_function(kind, how, values.dtype, is_numeric) - else: - raise - else: - if values.dtype.kind in ["i", "u"]: - if how in ["add", "var", "prod", "mean", "ohlc"]: - # result may still include NaN, so we have to cast - values = ensure_float64(values) + func = _get_cython_function(kind, how, values.dtype, is_numeric) + + if values.dtype.kind in ["i", "u"]: + if how in ["add", "var", "prod", "mean", "ohlc"]: + # result may still include NaN, so we have to cast + values = ensure_float64(values) return func, values From 00f9b9c3c31eb81da9484b6bb296df1078116163 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 29 Mar 2021 10:07:28 -0700 Subject: [PATCH 6/6] REF: simplify groupby.ops dispatch --- pandas/core/groupby/ops.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f45a9a8e309c9..a128e494c2d98 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -650,10 +650,7 @@ def _cython_operation( if kind == "aggregate": result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) counts = np.zeros(self.ngroups, dtype=np.int64) - try: - result = self._aggregate(result, counts, values, codes, func, min_count) - except TypeError as err: - raise TypeError(values.dtype) from err + result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": result = maybe_fill(np.empty(values.shape, dtype=out_dtype))