From 507f230be5bffbb6bc640ca2b5405895a2a80652 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 1 Mar 2022 15:39:27 -0800
Subject: [PATCH 1/7] REF: simplify groupby.ops

---
 pandas/core/groupby/ops.py | 79 +++++++++++++++++++++-----------------
 1 file changed, 44 insertions(+), 35 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 132c1d11dfe73..8a2f41c1b19b0 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -123,14 +123,14 @@ def __init__(self, kind: str, how: str):
             "min": "group_min",
             "max": "group_max",
             "mean": "group_mean",
-            "median": "group_median",
+            "median": "group_median_float64",
             "var": "group_var",
             "first": "group_nth",
             "last": "group_last",
             "ohlc": "group_ohlc",
         },
         "transform": {
-            "cumprod": "group_cumprod",
+            "cumprod": "group_cumprod_float64",
             "cumsum": "group_cumsum",
             "cummin": "group_cummin",
             "cummax": "group_cummax",
@@ -161,52 +161,54 @@ def _get_cython_function(
         if is_numeric:
             return f
         elif dtype == object:
-            if "object" not in f.__signatures__:
+            if how in ["median", "cumprod"]:
+                # no fused types -> no __signatures__
+                raise NotImplementedError(
+                    f"function is not implemented for this dtype: "
+                    f"[how->{how},dtype->{dtype_str}]"
+                )
+            elif "object" not in f.__signatures__:
                 # raise NotImplementedError here rather than TypeError later
                 raise NotImplementedError(
                     f"function is not implemented for this dtype: "
                     f"[how->{how},dtype->{dtype_str}]"
                 )
             return f
+        else:
+            raise NotImplementedError(
+                "This should not be reached. Please report a bug at "
+                "github.com/pandas-dev/pandas/",
+                dtype,
+            )
 
-    def get_cython_func_and_vals(self, values: np.ndarray, is_numeric: bool):
+    def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
         """
-        Find the appropriate cython function, casting if necessary.
+        Cast numeric dtypes to float64 for functions that only support that.
 
         Parameters
         ----------
         values : np.ndarray
-        is_numeric : bool
 
         Returns
         -------
-        func : callable
         values : np.ndarray
         """
         how = self.how
-        kind = self.kind
 
         if how in ["median", "cumprod"]:
             # these two only have float64 implementations
-            if is_numeric:
-                values = ensure_float64(values)
-            else:
-                raise NotImplementedError(
-                    f"function is not implemented for this dtype: "
-                    f"[how->{how},dtype->{values.dtype.name}]"
-                )
-            func = getattr(libgroupby, f"group_{how}_float64")
-            return func, values
-
-        func = self._get_cython_function(kind, how, values.dtype, is_numeric)
+            # We should only get here with is_numeric, as non-numeric cases
+            #  should raise in _get_cython_function
+            values = ensure_float64(values)
 
-        if values.dtype.kind in ["i", "u"]:
+        elif values.dtype.kind in ["i", "u"]:
             if how in ["add", "var", "prod", "mean", "ohlc"]:
                 # result may still include NaN, so we have to cast
                 values = ensure_float64(values)
 
-        return func, values
+        return values
 
+    # TODO: general case implementation overridable by EAs.
     def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
         """
         Check if we can do this operation with our cython functions.
@@ -235,6 +237,7 @@ def _disallow_invalid_ops(self, dtype: DtypeObj, is_numeric: bool = False):
             #  are not setup for dim transforming
             raise NotImplementedError(f"{dtype} dtype not supported")
         elif is_datetime64_any_dtype(dtype):
+            # TODO: same for period_dtype?  no for these methods with Period
             # we raise NotImplemented if this is an invalid operation
             #  entirely, e.g. adding datetimes
             if how in ["add", "prod", "cumsum", "cumprod"]:
@@ -262,7 +265,7 @@ def _get_output_shape(self, ngroups: int, values: np.ndarray) -> Shape:
             out_shape = (ngroups,) + values.shape[1:]
         return out_shape
 
-    def get_out_dtype(self, dtype: np.dtype) -> np.dtype:
+    def _get_out_dtype(self, dtype: np.dtype) -> np.dtype:
         how = self.how
 
         if how == "rank":
@@ -282,6 +285,7 @@ def _get_result_dtype(self, dtype: np.dtype) -> np.dtype:
     def _get_result_dtype(self, dtype: ExtensionDtype) -> ExtensionDtype:
         ...  # pragma: no cover
 
+    # TODO: general case implementation overridable by EAs.
     def _get_result_dtype(self, dtype: DtypeObj) -> DtypeObj:
         """
         Get the desired dtype of a result based on the
@@ -329,7 +333,6 @@ def _ea_wrap_cython_operation(
         If we have an ExtensionArray, unwrap, call _cython_operation, and
         re-wrap if appropriate.
         """
-        # TODO: general case implementation overridable by EAs.
         if isinstance(values, BaseMaskedArray) and self.uses_mask():
             return self._masked_ea_wrap_cython_operation(
                 values,
@@ -357,7 +360,8 @@ def _ea_wrap_cython_operation(
 
         return self._reconstruct_ea_result(values, res_values)
 
-    def _ea_to_cython_values(self, values: ExtensionArray):
+    # TODO: general case implementation overridable by EAs.
+    def _ea_to_cython_values(self, values: ExtensionArray) -> np.ndarray:
         # GH#43682
         if isinstance(values, (DatetimeArray, PeriodArray, TimedeltaArray)):
             # All of the functions implemented here are ordinal, so we can
@@ -378,15 +382,15 @@ def _ea_to_cython_values(self, values: ExtensionArray):
             )
         return npvalues
 
-    def _reconstruct_ea_result(self, values, res_values):
+    # TODO: general case implementation overridable by EAs.
+    def _reconstruct_ea_result(
+        self, values: ExtensionArray, res_values: np.ndarray
+    ) -> ExtensionArray:
         """
         Construct an ExtensionArray result from an ndarray result.
         """
-        # TODO: allow EAs to override this logic
 
-        if isinstance(
-            values.dtype, (BooleanDtype, IntegerDtype, FloatingDtype, StringDtype)
-        ):
+        if isinstance(values.dtype, (BaseMaskedDtype, StringDtype)):
             dtype = self._get_result_dtype(values.dtype)
             cls = dtype.construct_array_type()
             return cls._from_sequence(res_values, dtype=dtype)
@@ -429,13 +433,17 @@ def _masked_ea_wrap_cython_operation(
         )
 
         dtype = self._get_result_dtype(orig_values.dtype)
-        assert isinstance(dtype, BaseMaskedDtype)
-        cls = dtype.construct_array_type()
+        assert res_values.dtype == dtype.type, (res_values.dtype, dtype)
+        # TODO: avoid cast as res_values *should* already have the right
+        #  dtype; last attempt ran into trouble on 32bit linux build
+        # res_values = res_values.astype(dtype.type, copy=False)
 
         if self.kind != "aggregate":
-            return cls(res_values.astype(dtype.type, copy=False), mask)
+            out_mask = mask
         else:
-            return cls(res_values.astype(dtype.type, copy=False), result_mask)
+            out_mask = result_mask
+
+        return orig_values._maybe_mask_result(res_values, out_mask)
 
     @final
     def _cython_op_ndim_compat(
@@ -521,8 +529,9 @@ def _call_cython_op(
                 result_mask = result_mask.T
 
         out_shape = self._get_output_shape(ngroups, values)
-        func, values = self.get_cython_func_and_vals(values, is_numeric)
-        out_dtype = self.get_out_dtype(values.dtype)
+        func = self._get_cython_function(self.kind, self.how, values.dtype, is_numeric)
+        values = self._get_cython_vals(values)
+        out_dtype = self._get_out_dtype(values.dtype)
 
         result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
         if self.kind == "aggregate":

From a68b093943ddf23dcaad5f59ed68e8b0339998e1 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 1 Mar 2022 16:45:17 -0800
Subject: [PATCH 2/7] mypy fixup

---
 pandas/core/groupby/ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 8a2f41c1b19b0..7d8901308db52 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -398,7 +398,8 @@ def _reconstruct_ea_result(
         elif needs_i8_conversion(values.dtype):
             assert res_values.dtype.kind != "f"  # just to be on the safe side
             i8values = res_values.view("i8")
-            return type(values)(i8values, dtype=values.dtype)
+            # error: Too many arguments for "ExtensionArray"
+            return type(values)(i8values, dtype=values.dtype)  # type: ignore[call-arg]
 
         raise NotImplementedError
 

From 03c3c803ac2641696b1aa8df53f18560b341fd23 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 2 Mar 2022 08:56:13 -0800
Subject: [PATCH 3/7] troubleshoot 32bit build

---
 pandas/_libs/groupby.pyx | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index d250a69c1985b..48e1d9b8cf283 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1141,6 +1141,11 @@ def group_nth(
 
                 if uses_mask:
                     isna_entry = mask[i, j]
+                    if isna_entry:
+                        # set out[i, j] to 0 to be deterministic, as
+                        #  it was initialized with np.empty. Also ensures
+                        #  we can downcast out if appropriate.
+                        out[i, j] = 0
                 else:
                     isna_entry = checknull(val)
 

From 36720aac48d697952004d9b5cf1cd4ebe3b7d9d1 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 2 Mar 2022 09:57:15 -0800
Subject: [PATCH 4/7] troubleshoot 32bit build

---
 pandas/core/groupby/ops.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7d8901308db52..a4ef3a3cea6fd 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -607,6 +607,14 @@ def _call_cython_op(
             # "rank" is the only member of cast_blocklist we get here
             res_dtype = self._get_result_dtype(orig_values.dtype)
             op_result = maybe_downcast_to_dtype(result, res_dtype)
+            if self.how == "first":
+                # troubleshooting 32bit linux build
+                assert res_dtype == orig_values.dtype, (res_dtype, orig_values.dtype)
+                assert op_result.dtype == res_dtype, (
+                    op_result.dtype,
+                    res_dtype,
+                    result,
+                )
         else:
             op_result = result
 

From 365733e2841aba11b10e31a64240c8f1220048b2 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 2 Mar 2022 11:09:18 -0800
Subject: [PATCH 5/7] troubleshoot 32bit build

---
 pandas/core/groupby/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index a4ef3a3cea6fd..51c486aedd0b5 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -607,7 +607,7 @@ def _call_cython_op(
             # "rank" is the only member of cast_blocklist we get here
             res_dtype = self._get_result_dtype(orig_values.dtype)
             op_result = maybe_downcast_to_dtype(result, res_dtype)
-            if self.how == "first":
+            if self.how == "first" and result_mask is not None and self.uses_mask():
                 # troubleshooting 32bit linux build
                 assert res_dtype == orig_values.dtype, (res_dtype, orig_values.dtype)
                 assert op_result.dtype == res_dtype, (

From 6e0efdd2bc6e979402ff0435328161ff22abf061 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 2 Mar 2022 13:05:30 -0800
Subject: [PATCH 6/7] troubleshoot 32bit linux build

---
 pandas/_libs/groupby.pyx | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 48e1d9b8cf283..4bd4c892195cd 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1176,6 +1176,12 @@ def group_nth(
 
                     if uses_mask:
                         isna_entry = mask[i, j]
+                        if isna_entry:
+                            # set out[i, j] to 0 to be deterministic, as
+                            #  it was initialized with np.empty. Also ensures
+                            #  we can downcast out if appropriate.
+                            out[i, j] = 0
+
                     else:
                         isna_entry = _treat_as_na(val, True)
                         # TODO: Sure we always want is_datetimelike=True?

From b4cf263b56a726f33aea5a21c6a428dc6c459c3f Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 2 Mar 2022 18:00:15 -0800
Subject: [PATCH 7/7] punt on 32bit build

---
 pandas/_libs/groupby.pyx   | 11 -----------
 pandas/core/groupby/ops.py | 11 +----------
 2 files changed, 1 insertion(+), 21 deletions(-)

diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index 4bd4c892195cd..d250a69c1985b 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -1141,11 +1141,6 @@ def group_nth(
 
                 if uses_mask:
                     isna_entry = mask[i, j]
-                    if isna_entry:
-                        # set out[i, j] to 0 to be deterministic, as
-                        #  it was initialized with np.empty. Also ensures
-                        #  we can downcast out if appropriate.
-                        out[i, j] = 0
                 else:
                     isna_entry = checknull(val)
 
@@ -1176,12 +1171,6 @@ def group_nth(
 
                     if uses_mask:
                         isna_entry = mask[i, j]
-                        if isna_entry:
-                            # set out[i, j] to 0 to be deterministic, as
-                            #  it was initialized with np.empty. Also ensures
-                            #  we can downcast out if appropriate.
-                            out[i, j] = 0
-
                     else:
                         isna_entry = _treat_as_na(val, True)
                         # TODO: Sure we always want is_datetimelike=True?
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 51c486aedd0b5..e984279c17a65 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -434,10 +434,9 @@ def _masked_ea_wrap_cython_operation(
         )
 
         dtype = self._get_result_dtype(orig_values.dtype)
-        assert res_values.dtype == dtype.type, (res_values.dtype, dtype)
         # TODO: avoid cast as res_values *should* already have the right
         #  dtype; last attempt ran into trouble on 32bit linux build
-        # res_values = res_values.astype(dtype.type, copy=False)
+        res_values = res_values.astype(dtype.type, copy=False)
 
         if self.kind != "aggregate":
             out_mask = mask
@@ -607,14 +606,6 @@ def _call_cython_op(
             # "rank" is the only member of cast_blocklist we get here
             res_dtype = self._get_result_dtype(orig_values.dtype)
             op_result = maybe_downcast_to_dtype(result, res_dtype)
-            if self.how == "first" and result_mask is not None and self.uses_mask():
-                # troubleshooting 32bit linux build
-                assert res_dtype == orig_values.dtype, (res_dtype, orig_values.dtype)
-                assert op_result.dtype == res_dtype, (
-                    op_result.dtype,
-                    res_dtype,
-                    result,
-                )
         else:
             op_result = result