diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 173cc6b6b483c..8ddb714fae90e 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -272,6 +272,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.rolling` not allowing for rolling over datetimes when ``axis=1`` (:issue: `28192`) - Bug in :meth:`DataFrame.groupby` not offering selection by column name when ``axis=1`` (:issue:`27614`) - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) +- Bug in groupby/resampling where subclasses were not returned from groupby operations (:issue:`28330`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index f8f1455561c03..b535e7e2f4288 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -337,7 +337,7 @@ def _aggregate_item_by_item(self, func, *args, **kwargs): if not len(result_columns) and errors is not None: raise errors - return DataFrame(result, columns=result_columns) + return self.obj._constructor(result, columns=result_columns) def _decide_output_index(self, output, labels): if len(output) == len(labels): @@ -356,7 +356,7 @@ def _decide_output_index(self, output, labels): def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: - return DataFrame(index=keys) + return self.obj._constructor(index=keys) key_names = self.grouper.names @@ -372,7 +372,7 @@ def first_not_none(values): if v is None: # GH9684. If all values are None, then this will throw an error. # We'd prefer it return an empty dataframe. 
- return DataFrame() + return self.obj._constructor() elif isinstance(v, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: @@ -401,7 +401,7 @@ def first_not_none(values): # make Nones an empty object v = first_not_none(values) if v is None: - return DataFrame() + return self.obj._constructor() elif isinstance(v, NDFrame): values = [ x if x is not None else v._constructor(**v._construct_axes_dict()) @@ -467,7 +467,7 @@ def first_not_none(values): or isinstance(key_index, MultiIndex) ): stacked_values = np.vstack([np.asarray(v) for v in values]) - result = DataFrame( + result = self.obj._constructor( stacked_values, index=key_index, columns=index ) else: @@ -484,14 +484,16 @@ def first_not_none(values): result.columns = index else: stacked_values = np.vstack([np.asarray(v) for v in values]) - result = DataFrame( + result = self.obj._constructor( stacked_values.T, index=v.index, columns=key_index ) except (ValueError, AttributeError): # GH1738: values is list of arrays of unequal lengths fall # through to the outer else caluse - return Series(values, index=key_index, name=self._selection_name) + return self.obj._constructor_sliced( + values, index=key_index, name=self._selection_name + ) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here @@ -510,7 +512,7 @@ def first_not_none(values): # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns - return Series(values, index=key_index)._convert( + return self.obj._constructor_sliced(values, index=key_index)._convert( datetime=True, coerce=coerce ) @@ -554,7 +556,7 @@ def _transform_general(self, func, *args, **kwargs): r.columns = group.columns r.index = group.index else: - r = DataFrame( + r = self.obj._constructor( np.concatenate([res.values] * len(group.index)).reshape( group.shape ), @@ -681,7 +683,7 @@ 
def _transform_item_by_item(self, obj, wrapper): if len(output) < len(obj.columns): columns = columns.take(inds) - return DataFrame(output, index=obj.index, columns=columns) + return self.obj._constructor(output, index=obj.index, columns=columns) def filter(self, func, dropna=True, *args, **kwargs): """ @@ -875,7 +877,7 @@ def aggregate(self, func=None, *args, **kwargs): result = self._aggregate_named(func, *args, **kwargs) index = Index(sorted(result), name=self.grouper.names[0]) - ret = Series(result, index=index) + ret = self.obj._constructor(result, index=index) if not self.as_index: # pragma: no cover print("Warning, ignoring as_index=True") @@ -943,20 +945,20 @@ def _aggregate_multiple_funcs(self, arg, _level): # let higher level handle if _level: return results - return DataFrame(results, columns=columns) + return self.obj._constructor_expanddim(results, columns=columns) def _wrap_output(self, output, index, names=None): """ common agg/transform wrapping logic """ output = output[self._selection_name] if names is not None: - return DataFrame(output, index=index, columns=names) + return self.obj._constructor_expanddim(output, index=index, columns=names) else: name = self._selection_name if name is None: name = self._selected_obj.name - return Series(output, index=index, name=name) + return self.obj._constructor(output, index=index, name=name) def _wrap_aggregated_output(self, output, names=None): result = self._wrap_output( @@ -970,7 +973,7 @@ def _wrap_transformed_output(self, output, names=None): def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 - return Series([], name=self._selection_name, index=keys) + return self.obj._constructor([], name=self._selection_name, index=keys) def _get_index(): if self.grouper.nkeys > 1: @@ -982,7 +985,9 @@ def _get_index(): if isinstance(values[0], dict): # GH #823 #24880 index = _get_index() - result = 
self._reindex_output(DataFrame(values, index=index)) + result = self._reindex_output( + self.obj._constructor_expanddim(values, index=index) + ) # if self.observed is False, # keep all-NaN rows created while re-indexing result = result.stack(dropna=self.observed) @@ -996,7 +1001,9 @@ def _get_index(): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: # GH #6265 #24880 - result = Series(data=values, index=_get_index(), name=self._selection_name) + result = self.obj._constructor( + data=values, index=_get_index(), name=self._selection_name + ) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): @@ -1052,7 +1059,7 @@ def transform(self, func, *args, **kwargs): result = concat(results).sort_index() else: - result = Series() + result = self.obj._constructor() # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* udfs @@ -1078,7 +1085,7 @@ def _transform_fast(self, func, func_nm): out = algorithms.take_1d(func()._values, ids) if cast: out = self._try_cast(out, self.obj) - return Series(out, index=self.obj.index, name=self.obj.name) + return self.obj._constructor(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): # noqa """ @@ -1193,7 +1200,7 @@ def nunique(self, dropna=True): res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - return Series(res, index=ri, name=self._selection_name) + return self.obj._constructor(res, index=ri, name=self._selection_name) @Appender(Series.describe.__doc__) def describe(self, **kwargs): @@ -1233,7 +1240,7 @@ def value_counts( else: # lab is a Categorical with categories an IntervalIndex - lab = cut(Series(val), bins, include_lowest=True) + lab = cut(self.obj._constructor(val), bins, include_lowest=True) lev = lab.cat.categories lab = lev.take(lab.cat.codes) llab = lambda lab, inc: lab[inc]._multiindex.codes[-1] @@ -1293,7 +1300,7 @@ def value_counts( if 
is_integer_dtype(out): out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) + return self.obj._constructor(out, index=mi, name=self._selection_name) # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros @@ -1322,7 +1329,7 @@ def value_counts( if is_integer_dtype(out): out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) + return self.obj._constructor(out, index=mi, name=self._selection_name) def count(self): """ @@ -1341,7 +1348,7 @@ def count(self): minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - return Series( + return self.obj._constructor( out, index=self.grouper.result_index, name=self._selection_name, @@ -1513,9 +1520,11 @@ def _wrap_generic_output(self, result, obj): result_index = self.grouper.levels[0] if self.axis == 0: - return DataFrame(result, index=obj.columns, columns=result_index).T + return self.obj._constructor( + result, index=obj.columns, columns=result_index + ).T else: - return DataFrame(result, index=obj.index, columns=result_index) + return self.obj._constructor(result, index=obj.index, columns=result_index) def _get_data_to_aggregate(self): obj = self._obj_with_exclusions @@ -1548,12 +1557,12 @@ def _wrap_aggregated_output(self, output, names=None): output_keys = self._decide_output_index(output, agg_labels) if not self.as_index: - result = DataFrame(output, columns=output_keys) + result = self.obj._constructor(output, columns=output_keys) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index - result = DataFrame(output, index=index, columns=output_keys) + result = self.obj._constructor(output, index=index, columns=output_keys) if self.axis == 1: result = result.T @@ -1561,20 +1570,20 @@ def _wrap_aggregated_output(self, output, names=None): return self._reindex_output(result)._convert(datetime=True) def 
_wrap_transformed_output(self, output, names=None): - return DataFrame(output, index=self.obj.index) + return self.obj._constructor(output, index=self.obj.index) def _wrap_agged_blocks(self, items, blocks): if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, [items, index]) - result = DataFrame(mgr) + result = self.obj._constructor(mgr) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index mgr = BlockManager(blocks, [items, index]) - result = DataFrame(mgr) + result = self.obj._constructor(mgr) if self.axis == 1: result = result.T diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index bec5cbc5fecb8..d00b3ce64f021 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -134,6 +134,23 @@ def func(dataf): result = df.groupby("X", squeeze=False).count() assert isinstance(result, DataFrame) + # https://github.com/pandas-dev/pandas/issues/28330 + # Test groupby operations on subclassed dataframes/series + + cdf = tm.SubclassedDataFrame( + [ + {"val1": 1, "val2": 20}, + {"val1": 1, "val2": 19}, + {"val1": 2, "val2": 27}, + {"val1": 2, "val2": 12}, + ] + ) + result = cdf.groupby("val1").sum() + assert isinstance(result, tm.SubclassedDataFrame) + assert isinstance(result, DataFrame) + assert isinstance(result["val2"], tm.SubclassedSeries) + assert isinstance(result["val2"], Series) + def test_inconsistent_return_type(): # GH5592