pandas-dev · AdamOrmondroyd · Mar 2, 2023 · Mar 2, 2023 · Mar 2, 2023 · Mar 3, 2023
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -223,6 +223,10 @@ Plotting
 - Bug in :meth:`Series.plot` when invoked with ``color=None`` (:issue:`51953`)
 -
 
+Groupby
+- Bug in :meth:`GroupBy.mean`, :meth:`GroupBy.median`, :meth:`GroupBy.std`, :meth:`GroupBy.var`, :meth:`GroupBy.sem`, :meth:`GroupBy.sum`, :meth:`GroupBy.prod`, :meth:`GroupBy.min`, :meth:`GroupBy.max` not using the corresponding overridden methods of subclasses of :class:`Series` or :class:`DataFrame` (:issue:`51757`)
+-
+
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Bug in :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmax` return wrong dtype when used on empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)

@@ -284,7 +284,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
                 )
 
                 # result is a dict whose keys are the elements of result_index
-                result = Series(result, index=self.grouper.result_index)
+                result = self._obj_1d_constructor(
+                    result, index=self.grouper.result_index
+                )
                 result = self._wrap_aggregated_output(result)
                 return result
 
@@ -681,14 +683,20 @@ def value_counts(
 
         index_names = self.grouper.names + [self.obj.name]
 
+        constructor_1d = (
+            self._obj_1d_constructor
+            if isinstance(self._obj_1d_constructor, Series)
+            else Series
+        )
+
         if is_categorical_dtype(val.dtype) or (
             bins is not None and not np.iterable(bins)
         ):
             # scalar bins cannot be done at top level
             # in a backward compatible way
             # GH38672 relates to categorical dtype
             ser = self.apply(
-                Series.value_counts,
+                constructor_1d.value_counts,
                 normalize=normalize,
                 sort=sort,
                 ascending=ascending,
@@ -707,7 +715,7 @@ def value_counts(
             llab = lambda lab, inc: lab[inc]
         else:
             # lab is a Categorical with categories an IntervalIndex
-            cat_ser = cut(Series(val), bins, include_lowest=True)
+            cat_ser = cut(self.obj._constructor(val), bins, include_lowest=True)
             cat_obj = cast("Categorical", cat_ser._values)
             lev = cat_obj.categories
             lab = lev.take(
@@ -1308,9 +1316,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
         elif relabeling:
             # this should be the only (non-raising) case with relabeling
             # used reordered index of columns
-            result = cast(DataFrame, result)
+            result = cast(self.obj._constructor, result)
             result = result.iloc[:, order]
-            result = cast(DataFrame, result)
+            result = cast(self.obj._constructor, result)
             # error: Incompatible types in assignment (expression has type
             # "Optional[List[str]]", variable has type
             # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
@@ -1353,7 +1361,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
                 else:
                     # GH#32040, GH#35246
                     # e.g. test_groupby_as_index_select_column_sum_empty_df
-                    result = cast(DataFrame, result)
+                    result = cast(self._constructor, result)
                     result.columns = self._obj_with_exclusions.columns.copy()
 
         if not self.as_index:
@@ -1481,7 +1489,7 @@ def _wrap_applied_output_series(
         is_transform: bool,
     ) -> DataFrame | Series:
         kwargs = first_not_none._construct_axes_dict()
-        backup = Series(**kwargs)
+        backup = self._obj_1d_constructor(**kwargs)
         values = [x if (x is not None) else backup for x in values]
 
         all_indexed_same = all_indexes_same(x.index for x in values)
@@ -1876,7 +1884,9 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
 
         if not len(results):
             # concat would raise
-            res_df = DataFrame([], columns=columns, index=self.grouper.result_index)
+            res_df = self.obj._constructor(
+                [], columns=columns, index=self.grouper.result_index
+            )
         else:
             res_df = concat(results, keys=columns, axis=1)
 

@@ -1859,14 +1859,26 @@ def mean(
         Name: B, dtype: float64
         """
 
+        if not (
+            type(self.obj).mean is Series.mean or type(self.obj).mean is DataFrame.mean
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).mean(numeric_only=numeric_only)
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_mean
 
             return self._numba_agg_general(sliding_mean, engine_kwargs)
         else:
             result = self._cython_agg_general(
                 "mean",
-                alt=lambda x: Series(x).mean(numeric_only=numeric_only),
+                alt=lambda x: self._obj_1d_constructor(x).mean(
+                    numeric_only=numeric_only
+                ),
                 numeric_only=numeric_only,
             )
             return result.__finalize__(self.obj, method="groupby")
@@ -1892,9 +1904,20 @@ def median(self, numeric_only: bool = False):
         Series or DataFrame
             Median of values within each group.
         """
+        if not (
+            type(self.obj).median is Series.median
+            or type(self.obj).median is DataFrame.median
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).median(numeric_only=numeric_only)
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         result = self._cython_agg_general(
             "median",
-            alt=lambda x: Series(x).median(numeric_only=numeric_only),
+            alt=lambda x: self._obj_1d_constructor(x).median(numeric_only=numeric_only),
             numeric_only=numeric_only,
         )
         return result.__finalize__(self.obj, method="groupby")
@@ -1950,6 +1973,16 @@ def std(
         Series or DataFrame
             Standard deviation of values within each group.
         """
+        if not (
+            type(self.obj).std is Series.std or type(self.obj).std is DataFrame.std
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).std(numeric_only=numeric_only)
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_var
 
@@ -2013,14 +2046,24 @@ def var(
         Series or DataFrame
             Variance of values within each group.
         """
+        if not (
+            type(self.obj).var is Series.var or type(self.obj).var is DataFrame.var
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).var(numeric_only=numeric_only)
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_var
 
             return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
         else:
             return self._cython_agg_general(
                 "var",
-                alt=lambda x: Series(x).var(ddof=ddof),
+                alt=lambda x: self._obj_1d_constructor(x).var(ddof=ddof),
                 numeric_only=numeric_only,
                 ddof=ddof,
             )
@@ -2184,6 +2227,17 @@ def sem(self, ddof: int = 1, numeric_only: bool = False):
         Series or DataFrame
             Standard error of the mean of values within each group.
         """
+        # TODO: think sem() needs considering more closely
+        if not (
+            type(self.obj).sem is Series.sem or type(self.obj).sem is DataFrame.sem
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).sem(numeric_only=numeric_only)
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
             raise TypeError(
                 f"{type(self).__name__}.sem called with "
@@ -2236,6 +2290,16 @@ def sum(
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
     ):
+        if not (
+            type(self.obj).sum is Series.sum or type(self.obj).sum is DataFrame.sum
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).sum(numeric_only=numeric_only)
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_sum
 
@@ -2260,6 +2324,16 @@ def sum(
     @final
     @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
     def prod(self, numeric_only: bool = False, min_count: int = 0):
+        if not (
+            type(self.obj).prod is Series.prod or type(self.obj).prod is DataFrame.prod
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).prod(numeric_only=numeric_only)
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         return self._agg_general(
             numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
         )
@@ -2273,6 +2347,16 @@ def min(
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
     ):
+        if not (
+            type(self.obj).min is Series.min or type(self.obj).min is DataFrame.min
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).min(numeric_only=numeric_only)
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_min_max
 
@@ -2294,6 +2378,16 @@ def max(
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
     ):
+        if not (
+            type(self.obj).max is Series.max or type(self.obj).max is DataFrame.max
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).max(numeric_only=numeric_only)
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_min_max
 

diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py
@@ -103,3 +103,95 @@ def test_groupby_resample_preserves_subclass(obj):
     # Confirm groupby.resample() preserves dataframe type
     result = df.groupby("Buyer").resample("5D").sum()
     assert isinstance(result, obj)
+
+
+def test_groupby_overridden_methods():  # GH#51757
+    class UnitSeries(Series):  # each reduction returns its own sentinel int
+        @property
+        def _constructor(self):
+            return UnitSeries
+
+        @property
+        def _constructor_expanddim(self):  # Series -> DataFrame hook
+            return UnitDataFrame
+
+        def mean(self, *args, **kwargs):
+            return 1
+
+        def median(self, *args, **kwargs):
+            return 2
+
+        def std(self, *args, **kwargs):
+            return 3
+
+        def var(self, *args, **kwargs):
+            return 4
+
+        def sem(self, *args, **kwargs):
+            return 5
+
+        def prod(self, *args, **kwargs):
+            return 6
+
+        def min(self, *args, **kwargs):
+            return 7
+
+        def max(self, *args, **kwargs):
+            return 8
+
+    class UnitDataFrame(DataFrame):  # same sentinels as UnitSeries
+        @property
+        def _constructor(self):
+            return UnitDataFrame
+
+        @property
+        def _constructor_sliced(self):  # DataFrame -> Series hook
+            return UnitSeries
+
+        def mean(self, *args, **kwargs):
+            return 1
+
+        def median(self, *args, **kwargs):
+            return 2
+
+        def std(self, *args, **kwargs):
+            return 3
+
+        def var(self, *args, **kwargs):
+            return 4
+
+        def sem(self, *args, **kwargs):
+            return 5
+
+        def prod(self, *args, **kwargs):
+            return 6
+
+        def min(self, *args, **kwargs):
+            return 7
+
+        def max(self, *args, **kwargs):
+            return 8
+
+    columns = ["a", "b"]
+    data = np.random.rand(4, 2)  # values are irrelevant; only sentinels are checked
+    udf = UnitDataFrame(data, columns=columns)
+    udf["group"] = np.ones(4, dtype=int)
+    udf.loc[2:, "group"] = 2  # labels 0-1 stay in group 1, labels 2-3 -> group 2
+
+    assert np.all(udf.groupby("group").mean() == 1)  # aggregation path
+    assert np.all(udf.groupby("group").median() == 2)
+    assert np.all(udf.groupby("group").std() == 3)
+    assert np.all(udf.groupby("group").var() == 4)
+    assert np.all(udf.groupby("group").sem() == 5)
+    assert np.all(udf.groupby("group").prod() == 6)
+    assert np.all(udf.groupby("group").min() == 7)
+    assert np.all(udf.groupby("group").max() == 8)
+
+    assert np.all(udf.groupby("group").transform("mean") == 1)  # transform path
+    assert np.all(udf.groupby("group").transform("median") == 2)
+    assert np.all(udf.groupby("group").transform("std") == 3)
+    assert np.all(udf.groupby("group").transform("var") == 4)
+    assert np.all(udf.groupby("group").transform("sem") == 5)
+    assert np.all(udf.groupby("group").transform("prod") == 6)
+    assert np.all(udf.groupby("group").transform("min") == 7)
+    assert np.all(udf.groupby("group").transform("max") == 8)