Skip to content

Commit aa3e611

Browse files
authored
BUG: Preserve Series/DataFrame subclasses through groupby operations (#33884)
1 parent f9809fe commit aa3e611

File tree

7 files changed

+171
-36
lines changed

7 files changed

+171
-36
lines changed

doc/source/whatsnew/v1.1.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -809,6 +809,7 @@ Groupby/resample/rolling
809809
- Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`)
810810
- Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`)
811811
- Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`)
812+
- Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`)
812813

813814
Reshaping
814815
^^^^^^^^^

pandas/core/groupby/generic.py

+47-31
Original file line number | Diff line number | Diff line change
@@ -327,7 +327,7 @@ def _aggregate_multiple_funcs(self, arg):
327327
# let higher level handle
328328
return results
329329

330-
return DataFrame(results, columns=columns)
330+
return self.obj._constructor_expanddim(results, columns=columns)
331331

332332
def _wrap_series_output(
333333
self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index,
@@ -356,10 +356,12 @@ def _wrap_series_output(
356356

357357
result: Union[Series, DataFrame]
358358
if len(output) > 1:
359-
result = DataFrame(indexed_output, index=index)
359+
result = self.obj._constructor_expanddim(indexed_output, index=index)
360360
result.columns = columns
361361
else:
362-
result = Series(indexed_output[0], index=index, name=columns[0])
362+
result = self.obj._constructor(
363+
indexed_output[0], index=index, name=columns[0]
364+
)
363365

364366
return result
365367

@@ -418,7 +420,9 @@ def _wrap_transformed_output(
418420
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
419421
if len(keys) == 0:
420422
# GH #6265
421-
return Series([], name=self._selection_name, index=keys, dtype=np.float64)
423+
return self.obj._constructor(
424+
[], name=self._selection_name, index=keys, dtype=np.float64
425+
)
422426

423427
def _get_index() -> Index:
424428
if self.grouper.nkeys > 1:
@@ -430,7 +434,9 @@ def _get_index() -> Index:
430434
if isinstance(values[0], dict):
431435
# GH #823 #24880
432436
index = _get_index()
433-
result = self._reindex_output(DataFrame(values, index=index))
437+
result = self._reindex_output(
438+
self.obj._constructor_expanddim(values, index=index)
439+
)
434440
# if self.observed is False,
435441
# keep all-NaN rows created while re-indexing
436442
result = result.stack(dropna=self.observed)
@@ -444,7 +450,9 @@ def _get_index() -> Index:
444450
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
445451
else:
446452
# GH #6265 #24880
447-
result = Series(data=values, index=_get_index(), name=self._selection_name)
453+
result = self.obj._constructor(
454+
data=values, index=_get_index(), name=self._selection_name
455+
)
448456
return self._reindex_output(result)
449457

450458
def _aggregate_named(self, func, *args, **kwargs):
@@ -520,7 +528,7 @@ def _transform_general(
520528

521529
result = concat(results).sort_index()
522530
else:
523-
result = Series(dtype=np.float64)
531+
result = self.obj._constructor(dtype=np.float64)
524532

525533
# we will only try to coerce the result type if
526534
# we have a numeric dtype, as these are *always* user-defined funcs
@@ -543,7 +551,7 @@ def _transform_fast(self, result, func_nm: str) -> Series:
543551
out = algorithms.take_1d(result._values, ids)
544552
if cast:
545553
out = maybe_cast_result(out, self.obj, how=func_nm)
546-
return Series(out, index=self.obj.index, name=self.obj.name)
554+
return self.obj._constructor(out, index=self.obj.index, name=self.obj.name)
547555

548556
def filter(self, func, dropna=True, *args, **kwargs):
549557
"""
@@ -644,7 +652,7 @@ def nunique(self, dropna: bool = True) -> Series:
644652
res, out = np.zeros(len(ri), dtype=out.dtype), res
645653
res[ids[idx]] = out
646654

647-
result = Series(res, index=ri, name=self._selection_name)
655+
result = self.obj._constructor(res, index=ri, name=self._selection_name)
648656
return self._reindex_output(result, fill_value=0)
649657

650658
@doc(Series.describe)
@@ -746,7 +754,7 @@ def value_counts(
746754

747755
if is_integer_dtype(out):
748756
out = ensure_int64(out)
749-
return Series(out, index=mi, name=self._selection_name)
757+
return self.obj._constructor(out, index=mi, name=self._selection_name)
750758

751759
# for compat. with libgroupby.value_counts need to ensure every
752760
# bin is present at every index level, null filled with zeros
@@ -778,7 +786,7 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray:
778786

779787
if is_integer_dtype(out):
780788
out = ensure_int64(out)
781-
return Series(out, index=mi, name=self._selection_name)
789+
return self.obj._constructor(out, index=mi, name=self._selection_name)
782790

783791
def count(self) -> Series:
784792
"""
@@ -797,7 +805,7 @@ def count(self) -> Series:
797805
minlength = ngroups or 0
798806
out = np.bincount(ids[mask], minlength=minlength)
799807

800-
result = Series(
808+
result = self.obj._constructor(
801809
out,
802810
index=self.grouper.result_index,
803811
name=self._selection_name,
@@ -1195,11 +1203,11 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame:
11951203
if cannot_agg:
11961204
result_columns = result_columns.drop(cannot_agg)
11971205

1198-
return DataFrame(result, columns=result_columns)
1206+
return self.obj._constructor(result, columns=result_columns)
11991207

12001208
def _wrap_applied_output(self, keys, values, not_indexed_same=False):
12011209
if len(keys) == 0:
1202-
return DataFrame(index=keys)
1210+
return self.obj._constructor(index=keys)
12031211

12041212
key_names = self.grouper.names
12051213

@@ -1209,7 +1217,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
12091217
if first_not_none is None:
12101218
# GH9684. If all values are None, then this will throw an error.
12111219
# We'd prefer it return an empty dataframe.
1212-
return DataFrame()
1220+
return self.obj._constructor()
12131221
elif isinstance(first_not_none, DataFrame):
12141222
return self._concat_objects(keys, values, not_indexed_same=not_indexed_same)
12151223
elif self.grouper.groupings is not None:
@@ -1240,13 +1248,13 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
12401248

12411249
# make Nones an empty object
12421250
if first_not_none is None:
1243-
return DataFrame()
1251+
return self.obj._constructor()
12441252
elif isinstance(first_not_none, NDFrame):
12451253

12461254
# this is to silence a DeprecationWarning
12471255
# TODO: Remove when default dtype of empty Series is object
12481256
kwargs = first_not_none._construct_axes_dict()
1249-
if first_not_none._constructor is Series:
1257+
if isinstance(first_not_none, Series):
12501258
backup = create_series_with_explicit_dtype(
12511259
**kwargs, dtype_if_empty=object
12521260
)
@@ -1313,7 +1321,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13131321
or isinstance(key_index, MultiIndex)
13141322
):
13151323
stacked_values = np.vstack([np.asarray(v) for v in values])
1316-
result = DataFrame(
1324+
result = self.obj._constructor(
13171325
stacked_values, index=key_index, columns=index
13181326
)
13191327
else:
@@ -1330,15 +1338,17 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13301338
result.columns = index
13311339
elif isinstance(v, ABCSeries):
13321340
stacked_values = np.vstack([np.asarray(v) for v in values])
1333-
result = DataFrame(
1341+
result = self.obj._constructor(
13341342
stacked_values.T, index=v.index, columns=key_index
13351343
)
13361344
else:
13371345
# GH#1738: values is list of arrays of unequal lengths
13381346
# fall through to the outer else clause
13391347
# TODO: sure this is right? we used to do this
13401348
# after raising AttributeError above
1341-
return Series(values, index=key_index, name=self._selection_name)
1349+
return self.obj._constructor_sliced(
1350+
values, index=key_index, name=self._selection_name
1351+
)
13421352

13431353
# if we have date/time like in the original, then coerce dates
13441354
# as we are stacking can easily have object dtypes here
@@ -1355,7 +1365,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False):
13551365
# self._selection_name not passed through to Series as the
13561366
# result should not take the name of original selection
13571367
# of columns
1358-
return Series(values, index=key_index)
1368+
return self.obj._constructor_sliced(values, index=key_index)
13591369

13601370
else:
13611371
# Handle cases like BinGrouper
@@ -1385,7 +1395,9 @@ def _transform_general(
13851395
if cache_key not in NUMBA_FUNC_CACHE:
13861396
NUMBA_FUNC_CACHE[cache_key] = numba_func
13871397
# Return the result as a DataFrame for concatenation later
1388-
res = DataFrame(res, index=group.index, columns=group.columns)
1398+
res = self.obj._constructor(
1399+
res, index=group.index, columns=group.columns
1400+
)
13891401
else:
13901402
# Try slow path and fast path.
13911403
try:
@@ -1408,7 +1420,7 @@ def _transform_general(
14081420
r.columns = group.columns
14091421
r.index = group.index
14101422
else:
1411-
r = DataFrame(
1423+
r = self.obj._constructor(
14121424
np.concatenate([res.values] * len(group.index)).reshape(
14131425
group.shape
14141426
),
@@ -1484,7 +1496,9 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame:
14841496
res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm)
14851497
output.append(res)
14861498

1487-
return DataFrame._from_arrays(output, columns=result.columns, index=obj.index)
1499+
return self.obj._constructor._from_arrays(
1500+
output, columns=result.columns, index=obj.index
1501+
)
14881502

14891503
def _define_paths(self, func, *args, **kwargs):
14901504
if isinstance(func, str):
@@ -1546,7 +1560,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame:
15461560
if len(output) < len(obj.columns):
15471561
columns = columns.take(inds)
15481562

1549-
return DataFrame(output, index=obj.index, columns=columns)
1563+
return self.obj._constructor(output, index=obj.index, columns=columns)
15501564

15511565
def filter(self, func, dropna=True, *args, **kwargs):
15521566
"""
@@ -1661,9 +1675,11 @@ def _wrap_frame_output(self, result, obj) -> DataFrame:
16611675
result_index = self.grouper.levels[0]
16621676

16631677
if self.axis == 0:
1664-
return DataFrame(result, index=obj.columns, columns=result_index).T
1678+
return self.obj._constructor(
1679+
result, index=obj.columns, columns=result_index
1680+
).T
16651681
else:
1666-
return DataFrame(result, index=obj.index, columns=result_index)
1682+
return self.obj._constructor(result, index=obj.index, columns=result_index)
16671683

16681684
def _get_data_to_aggregate(self) -> BlockManager:
16691685
obj = self._obj_with_exclusions
@@ -1707,7 +1723,7 @@ def _wrap_aggregated_output(
17071723
indexed_output = {key.position: val for key, val in output.items()}
17081724
columns = Index(key.label for key in output)
17091725

1710-
result = DataFrame(indexed_output)
1726+
result = self.obj._constructor(indexed_output)
17111727
result.columns = columns
17121728

17131729
if not self.as_index:
@@ -1740,7 +1756,7 @@ def _wrap_transformed_output(
17401756
indexed_output = {key.position: val for key, val in output.items()}
17411757
columns = Index(key.label for key in output)
17421758

1743-
result = DataFrame(indexed_output)
1759+
result = self.obj._constructor(indexed_output)
17441760
result.columns = columns
17451761
result.index = self.obj.index
17461762

@@ -1750,14 +1766,14 @@ def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFra
17501766
if not self.as_index:
17511767
index = np.arange(blocks[0].values.shape[-1])
17521768
mgr = BlockManager(blocks, axes=[items, index])
1753-
result = DataFrame(mgr)
1769+
result = self.obj._constructor(mgr)
17541770

17551771
self._insert_inaxis_grouper_inplace(result)
17561772
result = result._consolidate()
17571773
else:
17581774
index = self.grouper.result_index
17591775
mgr = BlockManager(blocks, axes=[items, index])
1760-
result = DataFrame(mgr)
1776+
result = self.obj._constructor(mgr)
17611777

17621778
if self.axis == 1:
17631779
result = result.T

pandas/core/groupby/groupby.py

+15-4
Original file line number | Diff line number | Diff line change
@@ -1185,6 +1185,14 @@ class GroupBy(_GroupBy[FrameOrSeries]):
11851185
more
11861186
"""
11871187

1188+
@property
1189+
def _obj_1d_constructor(self) -> Type["Series"]:
1190+
# GH28330 preserve subclassed Series/DataFrames
1191+
if isinstance(self.obj, DataFrame):
1192+
return self.obj._constructor_sliced
1193+
assert isinstance(self.obj, Series)
1194+
return self.obj._constructor
1195+
11881196
def _bool_agg(self, val_test, skipna):
11891197
"""
11901198
Shared func to call any / all Cython GroupBy implementations.
@@ -1423,8 +1431,11 @@ def size(self):
14231431
"""
14241432
result = self.grouper.size()
14251433

1426-
if isinstance(self.obj, Series):
1427-
result.name = self.obj.name
1434+
# GH28330 preserve subclassed Series/DataFrames through calls
1435+
if issubclass(self.obj._constructor, Series):
1436+
result = self._obj_1d_constructor(result, name=self.obj.name)
1437+
else:
1438+
result = self._obj_1d_constructor(result)
14281439
return self._reindex_output(result, fill_value=0)
14291440

14301441
@classmethod
@@ -2110,7 +2121,7 @@ def ngroup(self, ascending: bool = True):
21102121
"""
21112122
with _group_selection_context(self):
21122123
index = self._selected_obj.index
2113-
result = Series(self.grouper.group_info[0], index)
2124+
result = self._obj_1d_constructor(self.grouper.group_info[0], index)
21142125
if not ascending:
21152126
result = self.ngroups - 1 - result
21162127
return result
@@ -2172,7 +2183,7 @@ def cumcount(self, ascending: bool = True):
21722183
with _group_selection_context(self):
21732184
index = self._selected_obj.index
21742185
cumcounts = self._cumcount_array(ascending=ascending)
2175-
return Series(cumcounts, index)
2186+
return self._obj_1d_constructor(cumcounts, index)
21762187

21772188
@Substitution(name="groupby")
21782189
@Appender(_common_see_also)

pandas/core/reshape/concat.py

+3-1
Original file line number | Diff line number | Diff line change
@@ -469,7 +469,9 @@ def get_result(self):
469469
# combine as columns in a frame
470470
else:
471471
data = dict(zip(range(len(self.objs)), self.objs))
472-
cons = DataFrame
472+
473+
# GH28330 Preserves subclassed objects through concat
474+
cons = self.objs[0]._constructor_expanddim
473475

474476
index, columns = self.new_axes
475477
df = cons(data, index=index)

pandas/tests/frame/test_subclass.py

+14
Original file line number | Diff line number | Diff line change
@@ -682,3 +682,17 @@ def test_asof(self):
682682

683683
result = df.asof("1989-12-31")
684684
assert isinstance(result, tm.SubclassedSeries)
685+
686+
def test_idxmin_preserves_subclass(self):
687+
# GH 28330
688+
689+
df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
690+
result = df.idxmin()
691+
assert isinstance(result, tm.SubclassedSeries)
692+
693+
def test_idxmax_preserves_subclass(self):
694+
# GH 28330
695+
696+
df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
697+
result = df.idxmax()
698+
assert isinstance(result, tm.SubclassedSeries)

0 commit comments

Comments (0)