From a392dd06e23578d7f45923703e89e342dbcc27b4 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Tue, 9 Mar 2021 09:37:30 +0100
Subject: [PATCH 1/5] PERF: reduce overhead in groupby _cython_operation

---
 pandas/core/dtypes/missing.py |  8 ++++----
 pandas/core/groupby/ops.py    | 37 +++++++++++++++++++----------------
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index b4a77337ce9f2..4c847650f54ac 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -536,12 +536,12 @@ def infer_fill_value(val):
     return np.nan
 
 
-def maybe_fill(arr, fill_value=np.nan):
+def maybe_fill(arr: np.ndarray):
     """
-    if we have a compatible fill_value and arr dtype, then fill
+    Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype.
     """
-    if isna_compat(arr, fill_value):
-        arr.fill(fill_value)
+    if arr.dtype.kind not in ("u", "i", "b"):
+        arr.fill(np.nan)
     return arr
 
 
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 008ee4dff4f7b..9883ba4ab1b1f 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -31,7 +31,7 @@
 import pandas._libs.groupby as libgroupby
 import pandas._libs.reduction as libreduction
 from pandas._typing import (
-    ArrayLike,
+    DtypeObj,
     F,
     FrameOrSeries,
     Shape,
@@ -46,7 +46,6 @@
     maybe_downcast_to_dtype,
 )
 from pandas.core.dtypes.common import (
-    ensure_float,
     ensure_float64,
     ensure_int64,
     ensure_int_or_float,
@@ -490,7 +489,9 @@ def _get_cython_func_and_vals(
         return func, values
 
     @final
-    def _disallow_invalid_ops(self, values: ArrayLike, how: str):
+    def _disallow_invalid_ops(
+        self, dtype: DtypeObj, how: str, is_numeric: bool = False
+    ):
         """
         Check if we can do this operation with our cython functions.
 
@@ -500,7 +501,9 @@ def _disallow_invalid_ops(self, values: ArrayLike, how: str):
             This is either not a valid function for this dtype, or
             valid but not implemented in cython.
         """
-        dtype = values.dtype
+        if is_numeric:
+            # never an invalid op for those dtypes, so return early as fastpath
+            return
 
         if is_categorical_dtype(dtype) or is_sparse(dtype):
             # categoricals are only 1d, so we
@@ -587,32 +590,34 @@ def _cython_operation(
             #  as we can have 1D ExtensionArrays that we need to treat as 2D
             assert axis == 1, axis
 
+        dtype = values.dtype
+        is_numeric = is_numeric_dtype(dtype)
+
         # can we do this operation with our cython functions
         # if not raise NotImplementedError
-        self._disallow_invalid_ops(values, how)
+        self._disallow_invalid_ops(values, how, is_numeric)
 
-        if is_extension_array_dtype(values.dtype):
+        if is_extension_array_dtype(dtype):
             return self._ea_wrap_cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )
 
-        is_datetimelike = needs_i8_conversion(values.dtype)
-        is_numeric = is_numeric_dtype(values.dtype)
+        is_datetimelike = needs_i8_conversion(dtype)
 
         if is_datetimelike:
             values = values.view("int64")
             is_numeric = True
-        elif is_bool_dtype(values.dtype):
+        elif is_bool_dtype(dtype):
             values = ensure_int_or_float(values)
-        elif is_integer_dtype(values):
+        elif is_integer_dtype(dtype):
             # we use iNaT for the missing value on ints
             # so pre-convert to guard this condition
             if (values == iNaT).any():
                 values = ensure_float64(values)
             else:
                 values = ensure_int_or_float(values)
-        elif is_numeric and not is_complex_dtype(values):
-            values = ensure_float64(ensure_float(values))
+        elif is_numeric and not is_complex_dtype(dtype):
+            values = ensure_float64(values)
         else:
             values = values.astype(object)
 
@@ -647,20 +652,18 @@ def _cython_operation(
         codes, _, _ = self.group_info
 
         if kind == "aggregate":
-            result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan)
+            result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
             counts = np.zeros(self.ngroups, dtype=np.int64)
             result = self._aggregate(result, counts, values, codes, func, min_count)
         elif kind == "transform":
-            result = maybe_fill(
-                np.empty(values.shape, dtype=out_dtype), fill_value=np.nan
-            )
+            result = maybe_fill(np.empty(values.shape, dtype=out_dtype))
 
             # TODO: min_count
             result = self._transform(
                 result, values, codes, func, is_datetimelike, **kwargs
             )
 
-        if is_integer_dtype(result) and not is_datetimelike:
+        if is_integer_dtype(result.dtype) and not is_datetimelike:
             mask = result == iNaT
             if mask.any():
                 result = result.astype("float64")

From c738c6435c5e29d17a56ac8a4003e4275cc34197 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 12 Mar 2021 09:07:14 +0100
Subject: [PATCH 2/5] fix passing dtype

---
 pandas/core/groupby/ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 76fb578370c1e..4ead4b44200ae 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -602,7 +602,7 @@ def _cython_operation(
 
         # can we do this operation with our cython functions
         # if not raise NotImplementedError
-        self._disallow_invalid_ops(values, how, is_numeric)
+        self._disallow_invalid_ops(dtype, how, is_numeric)
 
         if is_extension_array_dtype(dtype):
             # error: Incompatible return value type (got "Tuple[ndarray,

From 606c4c25e2d28961eef5e45bb0ecc085c88c747d Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 12 Mar 2021 10:08:11 +0100
Subject: [PATCH 3/5] remove type ignore

---
 pandas/core/groupby/ops.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 4ead4b44200ae..4f5b49f9c860c 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -692,9 +692,7 @@ def _cython_operation(
             # e.g. if we are int64 and need to restore to datetime64/timedelta64
             # "rank" is the only member of cython_cast_blocklist we get here
             dtype = maybe_cast_result_dtype(orig_values.dtype, how)
-            # error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type
-            # "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]"
-            result = maybe_downcast_to_dtype(result, dtype)  # type: ignore[arg-type]
+            result = maybe_downcast_to_dtype(result, dtype)
 
         return result
 

From 27f27d13b6bbf64f1a23c62ee8ca17d6ea6f4fa6 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Fri, 12 Mar 2021 19:36:34 +0100
Subject: [PATCH 4/5] add return type annotation

---
 pandas/core/dtypes/missing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 2993be182911b..a67f8e295bdf3 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -556,7 +556,7 @@ def infer_fill_value(val):
     return np.nan
 
 
-def maybe_fill(arr: np.ndarray):
+def maybe_fill(arr: np.ndarray) -> np.ndarray:
     """
     Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype.
     """

From cf0e90ab69242aa9bc0dda4c3878b11bf04d0c25 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 15 Mar 2021 14:16:50 +0100
Subject: [PATCH 5/5] update typing error

---
 pandas/core/groupby/ops.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7bd7f5eab2b63..74e96015b4544 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -685,7 +685,9 @@ def _cython_operation(
             # e.g. if we are int64 and need to restore to datetime64/timedelta64
             # "rank" is the only member of cython_cast_blocklist we get here
             dtype = maybe_cast_result_dtype(orig_values.dtype, how)
-            result = maybe_downcast_to_dtype(result, dtype)
+            # error: Incompatible types in assignment (expression has type
+            # "Union[ExtensionArray, ndarray]", variable has type "ndarray")
+            result = maybe_downcast_to_dtype(result, dtype)  # type: ignore[assignment]
 
         return result