From a392dd06e23578d7f45923703e89e342dbcc27b4 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 9 Mar 2021 09:37:30 +0100 Subject: [PATCH 1/5] PERF: reduce overhead in groupby _cython_operation --- pandas/core/dtypes/missing.py | 8 ++++---- pandas/core/groupby/ops.py | 37 +++++++++++++++++++---------------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index b4a77337ce9f2..4c847650f54ac 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -536,12 +536,12 @@ def infer_fill_value(val): return np.nan -def maybe_fill(arr, fill_value=np.nan): +def maybe_fill(arr: np.ndarray): """ - if we have a compatible fill_value and arr dtype, then fill + Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype. """ - if isna_compat(arr, fill_value): - arr.fill(fill_value) + if arr.dtype.kind not in ("u", "i", "b"): + arr.fill(np.nan) return arr diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 008ee4dff4f7b..9883ba4ab1b1f 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -31,7 +31,7 @@ import pandas._libs.groupby as libgroupby import pandas._libs.reduction as libreduction from pandas._typing import ( - ArrayLike, + DtypeObj, F, FrameOrSeries, Shape, @@ -46,7 +46,6 @@ maybe_downcast_to_dtype, ) from pandas.core.dtypes.common import ( - ensure_float, ensure_float64, ensure_int64, ensure_int_or_float, @@ -490,7 +489,9 @@ def _get_cython_func_and_vals( return func, values @final - def _disallow_invalid_ops(self, values: ArrayLike, how: str): + def _disallow_invalid_ops( + self, dtype: DtypeObj, how: str, is_numeric: bool = False + ): """ Check if we can do this operation with our cython functions. @@ -500,7 +501,9 @@ def _disallow_invalid_ops(self, values: ArrayLike, how: str): This is either not a valid function for this dtype, or valid but not implemented in cython. """ - dtype = values.dtype + if is_numeric: + # never an invalid op for those dtypes, so return early as fastpath + return if is_categorical_dtype(dtype) or is_sparse(dtype): # categoricals are only 1d, so we @@ -587,32 +590,34 @@ def _cython_operation( # as we can have 1D ExtensionArrays that we need to treat as 2D assert axis == 1, axis + dtype = values.dtype + is_numeric = is_numeric_dtype(dtype) + # can we do this operation with our cython functions # if not raise NotImplementedError - self._disallow_invalid_ops(values, how) + self._disallow_invalid_ops(values, how, is_numeric) - if is_extension_array_dtype(values.dtype): + if is_extension_array_dtype(dtype): return self._ea_wrap_cython_operation( kind, values, how, axis, min_count, **kwargs ) - is_datetimelike = needs_i8_conversion(values.dtype) - is_numeric = is_numeric_dtype(values.dtype) + is_datetimelike = needs_i8_conversion(dtype) if is_datetimelike: values = values.view("int64") is_numeric = True - elif is_bool_dtype(values.dtype): + elif is_bool_dtype(dtype): values = ensure_int_or_float(values) - elif is_integer_dtype(values): + elif is_integer_dtype(dtype): # we use iNaT for the missing value on ints # so pre-convert to guard this condition if (values == iNaT).any(): values = ensure_float64(values) else: values = ensure_int_or_float(values) - elif is_numeric and not is_complex_dtype(values): - values = ensure_float64(ensure_float(values)) + elif is_numeric and not is_complex_dtype(dtype): + values = ensure_float64(values) else: values = values.astype(object) @@ -647,20 +652,18 @@ def _cython_operation( codes, _, _ = self.group_info if kind == "aggregate": - result = maybe_fill(np.empty(out_shape, dtype=out_dtype), fill_value=np.nan) + result = maybe_fill(np.empty(out_shape, dtype=out_dtype)) counts = np.zeros(self.ngroups, dtype=np.int64) result = self._aggregate(result, counts, values, codes, func, min_count) elif kind == "transform": - result = maybe_fill( - np.empty(values.shape, dtype=out_dtype), fill_value=np.nan - ) + result = maybe_fill(np.empty(values.shape, dtype=out_dtype)) # TODO: min_count result = self._transform( result, values, codes, func, is_datetimelike, **kwargs ) - if is_integer_dtype(result) and not is_datetimelike: + if is_integer_dtype(result.dtype) and not is_datetimelike: mask = result == iNaT if mask.any(): result = result.astype("float64") From c738c6435c5e29d17a56ac8a4003e4275cc34197 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Mar 2021 09:07:14 +0100 Subject: [PATCH 2/5] fix passing dtype --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 76fb578370c1e..4ead4b44200ae 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -602,7 +602,7 @@ def _cython_operation( # can we do this operation with our cython functions # if not raise NotImplementedError - self._disallow_invalid_ops(values, how, is_numeric) + self._disallow_invalid_ops(dtype, how, is_numeric) if is_extension_array_dtype(dtype): # error: Incompatible return value type (got "Tuple[ndarray, From 606c4c25e2d28961eef5e45bb0ecc085c88c747d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Mar 2021 10:08:11 +0100 Subject: [PATCH 3/5] remove type ignore --- pandas/core/groupby/ops.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4ead4b44200ae..4f5b49f9c860c 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -692,9 +692,7 @@ def _cython_operation( # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) - # error: Argument 2 to "maybe_downcast_to_dtype" has incompatible type - # "Union[dtype[Any], ExtensionDtype]"; expected "Union[str, dtype[Any]]" - result = maybe_downcast_to_dtype(result, dtype) # type: ignore[arg-type] + result = maybe_downcast_to_dtype(result, dtype) return result From 27f27d13b6bbf64f1a23c62ee8ca17d6ea6f4fa6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Mar 2021 19:36:34 +0100 Subject: [PATCH 4/5] add return type annotation --- pandas/core/dtypes/missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 2993be182911b..a67f8e295bdf3 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -556,7 +556,7 @@ def infer_fill_value(val): return np.nan -def maybe_fill(arr: np.ndarray): +def maybe_fill(arr: np.ndarray) -> np.ndarray: """ Fill numpy.ndarray with NaN, unless we have a integer or boolean dtype. """ From cf0e90ab69242aa9bc0dda4c3878b11bf04d0c25 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Mar 2021 14:16:50 +0100 Subject: [PATCH 5/5] update typing error --- pandas/core/groupby/ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7bd7f5eab2b63..74e96015b4544 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -685,7 +685,9 @@ def _cython_operation( # e.g. if we are int64 and need to restore to datetime64/timedelta64 # "rank" is the only member of cython_cast_blocklist we get here dtype = maybe_cast_result_dtype(orig_values.dtype, how) - result = maybe_downcast_to_dtype(result, dtype) + # error: Incompatible types in assignment (expression has type + # "Union[ExtensionArray, ndarray]", variable has type "ndarray") + result = maybe_downcast_to_dtype(result, dtype) # type: ignore[assignment] return result