ENH: Add support for groupby.ohlc for ea dtypes (#48081)

phofl · web-flow · commit d9e2fb5e4be0 · 2022-08-15T10:47:37.000-07:00
* ENH: Add support for groupby.ohlc for ea dtypes

* Fix type
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -287,6 +287,7 @@ Other enhancements
 - ``times`` argument in :class:`.ExponentialMovingWindow` now accepts ``np.timedelta64`` (:issue:`47003`)
 - :class:`.DataError`, :class:`.SpecificationError`, :class:`.SettingWithCopyError`, :class:`.SettingWithCopyWarning`, :class:`.NumExprClobberingError`, :class:`.UndefinedVariableError`, :class:`.IndexingError`, :class:`.PyperclipException`, :class:`.PyperclipWindowsException`, :class:`.CSSWarning`, :class:`.PossibleDataLossError`, :class:`.ClosedFileError`, :class:`.IncompatibilityWarning`, :class:`.AttributeConflictWarning`, :class:`.DatabaseError, :class:`.PossiblePrecisionLoss, :class:`.ValueLabelTypeMismatch, :class:`.InvalidColumnName, and :class:`.CategoricalConversionWarning` are now exposed in ``pandas.errors`` (:issue:`27656`)
 - Added ``check_like`` argument to :func:`testing.assert_series_equal` (:issue:`47247`)
+- Add support for :meth:`GroupBy.ohlc` for extension array dtypes (:issue:`37493`)
 - Allow reading compressed SAS files with :func:`read_sas` (e.g., ``.sas7bdat.gz`` files)
 - :meth:`DatetimeIndex.astype` now supports casting timezone-naive indexes to ``datetime64[s]``, ``datetime64[ms]``, and ``datetime64[us]``, and timezone-aware indexes to the corresponding ``datetime64[unit, tzname]`` dtypes (:issue:`47579`)
 - :class:`Series` reducers (e.g. ``min``, ``max``, ``sum``, ``mean``) will now successfully operate when the dtype is numeric and ``numeric_only=True`` is provided; previously this would raise a ``NotImplementedError`` (:issue:`47500`)
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
@@ -86,11 +86,13 @@ def group_mean(
     result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_ohlc(
-    out: np.ndarray,  # floating[:, ::1]
+    out: np.ndarray,  # floatingintuint_t[:, ::1]
     counts: np.ndarray,  # int64_t[::1]
-    values: np.ndarray,  # ndarray[floating, ndim=2]
+    values: np.ndarray,  # ndarray[floatingintuint_t, ndim=2]
     labels: np.ndarray,  # const intp_t[:]
     min_count: int = ...,
+    mask: np.ndarray | None = ...,
+    result_mask: np.ndarray | None = ...,
 ) -> None: ...
 def group_quantile(
     out: npt.NDArray[np.float64],
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -834,21 +834,32 @@ def group_mean(
                     out[i, j] = sumx[i, j] / count
 
 
+ctypedef fused int64float_t:
+    float32_t
+    float64_t
+    int64_t
+    uint64_t
+
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_ohlc(
-    floating[:, ::1] out,
+    int64float_t[:, ::1] out,
     int64_t[::1] counts,
-    ndarray[floating, ndim=2] values,
+    ndarray[int64float_t, ndim=2] values,
     const intp_t[::1] labels,
     Py_ssize_t min_count=-1,
+    const uint8_t[:, ::1] mask=None,
+    uint8_t[:, ::1] result_mask=None,
 ) -> None:
     """
     Only aggregates on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, lab
-        floating val
+        int64float_t val
+        uint8_t[::1] first_element_set
+        bint isna_entry, uses_mask = not mask is None
 
     assert min_count == -1, "'min_count' only used in sum and prod"
 
@@ -862,7 +873,15 @@ def group_ohlc(
 
     if K > 1:
         raise NotImplementedError("Argument 'values' must have only one dimension")
-    out[:] = np.nan
+
+    if int64float_t is float32_t or int64float_t is float64_t:
+        out[:] = np.nan
+    else:
+        out[:] = 0
+
+    first_element_set = np.zeros((<object>counts).shape, dtype=np.uint8)
+    if uses_mask:
+        result_mask[:] = True
 
     with nogil:
         for i in range(N):
@@ -872,11 +891,22 @@ def group_ohlc(
 
             counts[lab] += 1
             val = values[i, 0]
-            if val != val:
+
+            if uses_mask:
+                isna_entry = mask[i, 0]
+            elif int64float_t is float32_t or int64float_t is float64_t:
+                isna_entry = val != val
+            else:
+                isna_entry = False
+
+            if isna_entry:
                 continue
 
-            if out[lab, 0] != out[lab, 0]:
+            if not first_element_set[lab]:
                 out[lab, 0] = out[lab, 1] = out[lab, 2] = out[lab, 3] = val
+                first_element_set[lab] = True
+                if uses_mask:
+                    result_mask[lab] = False
             else:
                 out[lab, 1] = max(out[lab, 1], val)
                 out[lab, 2] = min(out[lab, 2], val)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -157,6 +157,7 @@ def __init__(self, kind: str, how: str, has_dropped_na: bool) -> None:
         "first",
         "rank",
         "sum",
+        "ohlc",
     }
 
     _cython_arity = {"ohlc": 4}  # OHLC
@@ -219,13 +220,13 @@ def _get_cython_vals(self, values: np.ndarray) -> np.ndarray:
             values = ensure_float64(values)
 
         elif values.dtype.kind in ["i", "u"]:
-            if how in ["var", "prod", "mean", "ohlc"] or (
+            if how in ["var", "prod", "mean"] or (
                 self.kind == "transform" and self.has_dropped_na
             ):
                 # result may still include NaN, so we have to cast
                 values = ensure_float64(values)
 
-            elif how == "sum":
+            elif how in ["sum", "ohlc"]:
                 # Avoid overflow during group op
                 if values.dtype.kind == "i":
                     values = ensure_int64(values)
@@ -480,6 +481,9 @@ def _masked_ea_wrap_cython_operation(
             **kwargs,
         )
 
+        if self.how == "ohlc":
+            result_mask = np.tile(result_mask, (4, 1)).T
+
         # res_values should already have the correct dtype, we just need to
         #  wrap in a MaskedArray
         return orig_values._maybe_mask_result(res_values, result_mask)
@@ -592,6 +596,8 @@ def _call_cython_op(
                     min_count=min_count,
                     is_datetimelike=is_datetimelike,
                 )
+            elif self.how == "ohlc":
+                func(result, counts, values, comp_ids, min_count, mask, result_mask)
             else:
                 func(result, counts, values, comp_ids, min_count)
         else:
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -564,6 +564,22 @@ def test_order_aggregate_multiple_funcs():
     tm.assert_index_equal(result, expected)
 
 
+def test_ohlc_ea_dtypes(any_numeric_ea_dtype):
+    # GH#37493
+    df = DataFrame(
+        {"a": [1, 1, 2, 3, 4, 4], "b": [22, 11, pd.NA, 10, 20, pd.NA]},
+        dtype=any_numeric_ea_dtype,
+    )
+    result = df.groupby("a").ohlc()
+    expected = DataFrame(
+        [[22, 22, 11, 11], [pd.NA] * 4, [10] * 4, [20] * 4],
+        columns=MultiIndex.from_product([["b"], ["open", "high", "low", "close"]]),
+        index=Index([1, 2, 3, 4], dtype=any_numeric_ea_dtype, name="a"),
+        dtype=any_numeric_ea_dtype,
+    )
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize("dtype", [np.int64, np.uint64])
 @pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
 def test_uint64_type_handling(dtype, how):