pandas-dev · AdamOrmondroyd · Mar 2, 2023 · Mar 2, 2023 · Mar 2, 2023 · Mar 3, 2023
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -189,6 +189,10 @@ Plotting
 -
 -
 
+Groupby
+- Bug in :meth:`GroupBy.mean`, :meth:`GroupBy.median`, :meth:`GroupBy.std`, :meth:`GroupBy.var`, :meth:`GroupBy.sem`, :meth:`GroupBy.prod`, :meth:`GroupBy.min`, :meth:`GroupBy.max` not dispatching to the corresponding methods overridden by subclasses of :class:`Series` or :class:`DataFrame` (:issue:`51757`)
+-
+
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 - Bug in :meth:`DataFrameGroupBy.idxmin`, :meth:`SeriesGroupBy.idxmin`, :meth:`DataFrameGroupBy.idxmax`, :meth:`SeriesGroupBy.idxmax` return wrong dtype when used on empty DataFrameGroupBy or SeriesGroupBy (:issue:`51423`)

@@ -284,7 +284,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
                 )
 
                 # result is a dict whose keys are the elements of result_index
-                result = Series(result, index=self.grouper.result_index)
+                result = self._obj_1d_constructor(
+                    result, index=self.grouper.result_index
+                )
                 result = self._wrap_aggregated_output(result)
                 return result
 
@@ -681,14 +683,20 @@ def value_counts(
 
         index_names = self.grouper.names + [self.obj.name]
 
+        constructor_1d = (
+            self._obj_1d_constructor
+            if isinstance(self._obj_1d_constructor, Series)
+            else Series
+        )
+
         if is_categorical_dtype(val.dtype) or (
             bins is not None and not np.iterable(bins)
         ):
             # scalar bins cannot be done at top level
             # in a backward compatible way
             # GH38672 relates to categorical dtype
             ser = self.apply(
-                Series.value_counts,
+                constructor_1d.value_counts,
                 normalize=normalize,
                 sort=sort,
                 ascending=ascending,
@@ -707,7 +715,7 @@ def value_counts(
             llab = lambda lab, inc: lab[inc]
         else:
             # lab is a Categorical with categories an IntervalIndex
-            cat_ser = cut(Series(val), bins, include_lowest=True)
+            cat_ser = cut(self.obj._constructor(val), bins, include_lowest=True)
             cat_obj = cast("Categorical", cat_ser._values)
             lev = cat_obj.categories
             lab = lev.take(
@@ -1290,9 +1298,9 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
         elif relabeling:
             # this should be the only (non-raising) case with relabeling
             # used reordered index of columns
-            result = cast(DataFrame, result)
+            result = cast(self.obj._constructor, result)
             result = result.iloc[:, order]
-            result = cast(DataFrame, result)
+            result = cast(self.obj._constructor, result)
             # error: Incompatible types in assignment (expression has type
             # "Optional[List[str]]", variable has type
             # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
@@ -1335,7 +1343,7 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs)
                 else:
                     # GH#32040, GH#35246
                     # e.g. test_groupby_as_index_select_column_sum_empty_df
-                    result = cast(DataFrame, result)
+                    result = cast(self._obj_1d_constructor, result)
                     result.columns = self._obj_with_exclusions.columns.copy()
 
         if not self.as_index:
@@ -1463,7 +1471,7 @@ def _wrap_applied_output_series(
         is_transform: bool,
     ) -> DataFrame | Series:
         kwargs = first_not_none._construct_axes_dict()
-        backup = Series(**kwargs)
+        backup = self._obj_1d_constructor(**kwargs)
         values = [x if (x is not None) else backup for x in values]
 
         all_indexed_same = all_indexes_same(x.index for x in values)
@@ -1858,7 +1866,9 @@ def _apply_to_column_groupbys(self, func) -> DataFrame:
 
         if not len(results):
             # concat would raise
-            res_df = DataFrame([], columns=columns, index=self.grouper.result_index)
+            res_df = self.obj._constructor(
+                [], columns=columns, index=self.grouper.result_index
+            )
         else:
             res_df = concat(results, keys=columns, axis=1)
 

@@ -1629,7 +1629,7 @@ def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
 
     @final
     @property
-    def _obj_1d_constructor(self) -> Callable:
+    def _obj_1d_constructor(self):
         # GH28330 preserve subclassed Series/DataFrames
         if isinstance(self.obj, DataFrame):
             return self.obj._constructor_sliced
@@ -1837,14 +1837,24 @@ def mean(
         Name: B, dtype: float64
         """
 
+        if not (self.obj.mean is Series.mean or self.obj.mean is DataFrame.mean):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).mean()
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_mean
 
             return self._numba_agg_general(sliding_mean, engine_kwargs)
         else:
             result = self._cython_agg_general(
                 "mean",
-                alt=lambda x: Series(x).mean(numeric_only=numeric_only),
+                alt=lambda x: self._obj_1d_constructor(x).mean(
+                    numeric_only=numeric_only
+                ),
                 numeric_only=numeric_only,
             )
             return result.__finalize__(self.obj, method="groupby")
@@ -1870,9 +1880,19 @@ def median(self, numeric_only: bool = False):
         Series or DataFrame
             Median of values within each group.
         """
+        if not (
+            self.obj.median is Series.median or self.obj.median is DataFrame.median
+        ):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).median()
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         result = self._cython_agg_general(
             "median",
-            alt=lambda x: Series(x).median(numeric_only=numeric_only),
+            alt=lambda x: self._obj_1d_constructor(x).median(numeric_only=numeric_only),
             numeric_only=numeric_only,
         )
         return result.__finalize__(self.obj, method="groupby")
@@ -1928,6 +1948,14 @@ def std(
         Series or DataFrame
             Standard deviation of values within each group.
         """
+        if not (self.obj.std is Series.std or self.obj.std is DataFrame.std):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).std()
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_var
 
@@ -2011,14 +2039,22 @@ def var(
         Series or DataFrame
             Variance of values within each group.
         """
+        if not (self.obj.var is Series.var or self.obj.var is DataFrame.var):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).var()
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_var
 
             return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
         else:
             return self._cython_agg_general(
                 "var",
-                alt=lambda x: Series(x).var(ddof=ddof),
+                alt=lambda x: self._obj_1d_constructor(x).var(ddof=ddof),
                 numeric_only=numeric_only,
                 ddof=ddof,
             )
@@ -2180,6 +2216,15 @@ def sem(self, ddof: int = 1, numeric_only: bool = False):
         Series or DataFrame
             Standard error of the mean of values within each group.
         """
+        # TODO: think sem() needs considering more closely
+        if not (self.obj.sem is Series.sem or self.obj.sem is DataFrame.sem):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).sem()
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
             raise TypeError(
                 f"{type(self).__name__}.sem called with "
@@ -2238,6 +2283,14 @@ def sum(
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
     ):
+        if not (self.obj.sum is Series.sum or self.obj.sum is DataFrame.sum):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).sum()
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_sum
 
@@ -2262,6 +2315,14 @@ def sum(
     @final
     @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
     def prod(self, numeric_only: bool = False, min_count: int = 0):
+        if not (self.obj.prod is Series.prod or self.obj.prod is DataFrame.prod):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).prod()
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         return self._agg_general(
             numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
         )
@@ -2275,6 +2336,14 @@ def min(
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
     ):
+        if not (self.obj.min is Series.min or self.obj.min is DataFrame.min):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).min()
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_min_max
 
@@ -2296,6 +2365,14 @@ def max(
         engine: str | None = None,
         engine_kwargs: dict[str, bool] | None = None,
     ):
+        if not (self.obj.max is Series.max or self.obj.max is DataFrame.max):
+
+            def f(df, *args, **kwargs):
+                return self.obj._constructor(df).max()
+
+            result = self.agg(f)
+            return result.__finalize__(self.obj, method="groupby")
+
         if maybe_use_numba(engine):
             from pandas.core._numba.kernels import sliding_min_max
 

diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py
@@ -103,3 +103,95 @@ def test_groupby_resample_preserves_subclass(obj):
     # Confirm groupby.resample() preserves dataframe type
     result = df.groupby("Buyer").resample("5D").sum()
     assert isinstance(result, obj)
+
+
+def test_groupby_overridden_methods():
+    # GH#51757
+    # Groupby reductions must dispatch to the methods overridden by
+    # subclasses of Series/DataFrame rather than the built-in fast paths.
+    # Every override returns a distinct constant, so each assertion below
+    # identifies exactly which implementation produced the result.
+    class ConstantReductions:
+        # Mixin shared by both subclasses so the two sets of overrides
+        # cannot drift apart.
+        def mean(self, *args, **kwargs):
+            return 1
+
+        def median(self, *args, **kwargs):
+            return 2
+
+        def std(self, *args, **kwargs):
+            return 3
+
+        def var(self, *args, **kwargs):
+            return 4
+
+        def sem(self, *args, **kwargs):
+            return 5
+
+        def prod(self, *args, **kwargs):
+            return 6
+
+        def min(self, *args, **kwargs):
+            return 7
+
+        def max(self, *args, **kwargs):
+            return 8
+
+    class UnitSeries(ConstantReductions, Series):
+        @property
+        def _constructor(self):
+            return UnitSeries
+
+        @property
+        def _constructor_expanddim(self):
+            # Expanding a Series to 2D must yield the matching frame subclass.
+            return UnitDataFrame
+
+    class UnitDataFrame(ConstantReductions, DataFrame):
+        @property
+        def _constructor(self):
+            return UnitDataFrame
+
+        @property
+        def _constructor_sliced(self):
+            # Column selection goes down a dimension, which pandas routes
+            # through _constructor_sliced (not _constructor_expanddim).
+            return UnitSeries
+
+    params = ["a", "b"]
+    # Values are irrelevant: every overridden reduction ignores its input.
+    data = np.random.rand(4, 2)
+    udf = UnitDataFrame(data, columns=params)
+    udf["group"] = np.ones(4, dtype=int)
+    udf.loc[2:, "group"] = 2
+
+    # DataFrameGroupBy: each reduction must return the constant of the
+    # corresponding override for every group and column.
+    assert np.all(udf.groupby("group").mean() == 1)
+    assert np.all(udf.groupby("group").median() == 2)
+    assert np.all(udf.groupby("group").std() == 3)
+    assert np.all(udf.groupby("group").var() == 4)
+    assert np.all(udf.groupby("group").sem() == 5)
+    assert np.all(udf.groupby("group").prod() == 6)
+    assert np.all(udf.groupby("group").min() == 7)
+    assert np.all(udf.groupby("group").max() == 8)
+
+    # SeriesGroupBy: iterating a DataFrame yields column labels, not Series,
+    # so select each column explicitly; a Series has no "group" column to
+    # look up by name, so the grouping keys are passed as a Series.
+    keys = udf["group"]
+    for col in params:
+        useries = udf[col]
+        # Guard against silently exercising a plain Series, which would
+        # make the assertions below test nothing about subclasses.
+        assert isinstance(useries, UnitSeries)
+
+        assert np.all(useries.groupby(keys).mean() == 1)
+        assert np.all(useries.groupby(keys).median() == 2)
+        assert np.all(useries.groupby(keys).std() == 3)
+        assert np.all(useries.groupby(keys).var() == 4)
+        assert np.all(useries.groupby(keys).sem() == 5)
+        assert np.all(useries.groupby(keys).prod() == 6)
+        assert np.all(useries.groupby(keys).min() == 7)
+        assert np.all(useries.groupby(keys).max() == 8)