From 7da070306484b2bb48115aa86eb571c499ad518f Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 14:38:28 -0400 Subject: [PATCH 01/33] Add tests to confirm groupby ops preserve subclasses --- pandas/tests/groupby/test_groupby.py | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c88d16e34eab8..3b88135a9e042 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -195,6 +195,45 @@ def f(grp): tm.assert_series_equal(result, e) +@pytest.mark.parametrize( + "obj", + [ + tm.SubclassedDataFrame( + dict(A=[0, 0, 0, 1, 1, 1], B=range(6)), + index=["A", "B", "C", "D", "E", "F"]), + tm.SubclassedSeries([0, 0, 0, 1, 1, 1], + index=["A", "B", "C", "D", "E", "F"]), + ], +) +def test_groupby_preserves_subclass(obj, groupby_func): + # GH28330 -- preserve subclass through groupby operations + + if isinstance(obj, Series) and groupby_func in {"corrwith"}: + pytest.skip("Not applicable") + + grouped = obj.groupby(np.repeat([0, 1], 3)) + + # Groups should preserve subclass type + assert isinstance(grouped.get_group(0), type(obj)) + + args = [] + if groupby_func in {"fillna", "nth"}: + args.append(0) + elif groupby_func == "corrwith": + args.append(obj) + elif groupby_func == "tshift": + args.extend([0, 0]) + + result = getattr(grouped, groupby_func)(*args) + + # Reduction or transformation kernels should preserve type + slices = groupby_func in {"cumcount", "ngroup", "size"} + if isinstance(obj, DataFrame) and slices: + assert isinstance(result, type(obj._constructor_sliced(dtype=object))) + else: + assert isinstance(result, type(obj)) + + def test_pass_args_kwargs(ts, tsframe): def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) From f6e1a892f90445762a41a067b738e29f4b1972ca Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 15:17:21 -0400 Subject: [PATCH 02/33] Update SeriesGroupBy constructor calls to preserve subclassed Series --- pandas/core/groupby/generic.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ddf553dd1dd62..b707e972cb745 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -344,7 +344,7 @@ def _aggregate_multiple_funcs(self, arg): # let higher level handle return results - return DataFrame(results, columns=columns) + return self.obj._constructor_expanddim(results, columns=columns) def _wrap_series_output( self, output: Mapping[base.OutputKey, Union[Series, np.ndarray]], index: Index, @@ -373,10 +373,10 @@ def _wrap_series_output( result: Union[Series, DataFrame] if len(output) > 1: - result = DataFrame(indexed_output, index=index) + result = self.obj._constructor_expanddim(indexed_output, index=index) result.columns = columns else: - result = Series(indexed_output[0], index=index, name=columns[0]) + result = self.obj._constructor(indexed_output[0], index=index, name=columns[0]) return result @@ -435,7 +435,7 @@ def _wrap_transformed_output( def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 - return Series([], name=self._selection_name, index=keys, dtype=np.float64) + return self.obj._constructor([], name=self._selection_name, index=keys, dtype=np.float64) def _get_index() -> Index: if self.grouper.nkeys > 1: @@ -447,7 +447,7 @@ def _get_index() -> Index: if isinstance(values[0], dict): # GH #823 #24880 index = _get_index() - result = self._reindex_output(DataFrame(values, index=index)) + result = self._reindex_output(self.obj._constructor_expanddim(values, index=index)) # if self.observed is False, # keep all-NaN rows created while re-indexing result = result.stack(dropna=self.observed) @@ -461,7 +461,7 @@ def _get_index() -> Index: return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: # GH #6265 #24880 - result = Series(data=values, index=_get_index(), name=self._selection_name) + result = self.obj._constructor(data=values, index=_get_index(), name=self._selection_name) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): @@ -541,7 +541,7 @@ def _transform_general( result = concat(results).sort_index() else: - result = Series(dtype=np.float64) + result = self.obj.constructor(dtype=np.float64) # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs @@ -564,7 +564,7 @@ def _transform_fast(self, result, func_nm: str) -> Series: out = algorithms.take_1d(result._values, ids) if cast: out = maybe_cast_result(out, self.obj, how=func_nm) - return Series(out, index=self.obj.index, name=self.obj.name) + return self.obj._constructor(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): """ @@ -665,7 +665,7 @@ def nunique(self, dropna: bool = True) -> Series: res, out = np.zeros(len(ri), dtype=out.dtype), res res[ids[idx]] = out - result = Series(res, index=ri, name=self._selection_name) + result = self.obj._constructor(res, index=ri, name=self._selection_name) return self._reindex_output(result, fill_value=0) @doc(Series.describe) @@ -767,7 +767,7 @@ def value_counts( if is_integer_dtype(out): out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) + return self.obj._constructor(out, index=mi, name=self._selection_name) # for compat. with libgroupby.value_counts need to ensure every # bin is present at every index level, null filled with zeros @@ -799,7 +799,7 @@ def build_codes(lev_codes: np.ndarray) -> np.ndarray: if is_integer_dtype(out): out = ensure_int64(out) - return Series(out, index=mi, name=self._selection_name) + return self.obj._constructor(out, index=mi, name=self._selection_name) def count(self) -> Series: """ @@ -818,7 +818,7 @@ def count(self) -> Series: minlength = ngroups or 0 out = np.bincount(ids[mask], minlength=minlength) - result = Series( + result = self.obj._constructor( out, index=self.grouper.result_index, name=self._selection_name, From ee6de4351b313a1b00268e07d6248f1070fde49b Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 16:05:04 -0400 Subject: [PATCH 03/33] Fix concat() method to preserve subclassed DataFrames --- pandas/core/reshape/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index a868e663b06a5..83900f537418f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -469,7 +469,7 @@ def get_result(self): # combine as columns in a frame else: data = dict(zip(range(len(self.objs)), self.objs)) - cons = DataFrame + cons = self.objs[0]._constructor_expanddim index, columns = self.new_axes df = cons(data, index=index) From 3f9f4c4cb2f16f4cb27f32cb3caae39800cbf5b9 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 16:12:49 -0400 Subject: [PATCH 04/33] Add GH28330 comment to concat.py --- pandas/core/reshape/concat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 83900f537418f..57acf1a005668 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -467,8 +467,10 @@ def get_result(self): return result.__finalize__(self, method="concat") # combine as columns in a frame - else: + else: data = dict(zip(range(len(self.objs)), self.objs)) + + # GH28330 Preserves subclassed objects through concat cons = self.objs[0]._constructor_expanddim index, columns = self.new_axes From 01128268965f3b5c14d91f976f9f49af6bf98bef Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 17:11:31 -0400 Subject: [PATCH 05/33] Preserve subclassing in DataFrame.idxmin() and DataFrame.idxmax() calls --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4b4801f4e8c58..8ecf1fafeb399 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8524,7 +8524,7 @@ def idxmin(self, axis=0, skipna=True) -> Series: indices = nanops.nanargmin(self.values, axis=axis, skipna=skipna) index = self._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] - return Series(result, index=self._get_agg_axis(axis)) + return self._constructor_sliced(result, index=self._get_agg_axis(axis)) def idxmax(self, axis=0, skipna=True) -> Series: """ @@ -8591,7 +8591,7 @@ def idxmax(self, axis=0, skipna=True) -> Series: indices = nanops.nanargmax(self.values, axis=axis, skipna=skipna) index = self._get_axis(axis) result = [index[i] if i >= 0 else np.nan for i in indices] - return Series(result, index=self._get_agg_axis(axis)) + return self._constructor_sliced(result, index=self._get_agg_axis(axis)) def _get_agg_axis(self, axis_num: int) -> Index: """ From 422e702e928d9a7d1e78a7e027fe9e70a39340e0 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 17:48:28 -0400 Subject: [PATCH 06/33] GH28330 Fix GroupBy.size() to preserve subclassed types --- pandas/core/groupby/groupby.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6924c7d320bc4..c7c85ebd3998a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1351,8 +1351,12 @@ def size(self): """ result = self.grouper.size() + # GH28330 preserve subclassed Series/DataFrames through calls if isinstance(self.obj, Series): + result = self.obj._constructor(result) result.name = self.obj.name + elif isinstance(self.obj, DataFrame): + result = self.obj._constructor_sliced(result) return self._reindex_output(result, fill_value=0) @classmethod From 53ac3977220ea448dcddf23413f1c5fd6de7b33d Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 17:56:44 -0400 Subject: [PATCH 07/33] GH28330 Fix GroupBy.ngroup() to preserve subclassed types --- pandas/core/groupby/groupby.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c7c85ebd3998a..041eabd47f17a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2051,7 +2051,13 @@ def ngroup(self, ascending: bool = True): """ with _group_selection_context(self): index = self._selected_obj.index - result = Series(self.grouper.group_info[0], index) + + # GH28330 preserve subclassed Series/DataFrames through calls + if isinstance(self._selected_obj, Series): + result = self.obj._constructor(self.grouper.group_info[0], index) + elif isinstance(self._selected_obj, DataFrame): + result = self.obj._constructor_sliced(self.grouper.group_info[0], index) + if not ascending: result = self.ngroups - 1 - result return result From 2bc2520bad2ed3cf393e063a0ed235a8cb209018 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 18:01:50 -0400 Subject: [PATCH 08/33] GH28330 Fix GroupBy.cumcount() to preserve subclassed types --- pandas/core/groupby/groupby.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 041eabd47f17a..acf05e92a9e46 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2119,7 +2119,12 @@ def cumcount(self, ascending: bool = True): with _group_selection_context(self): index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) - return Series(cumcounts, index) + + # GH28330 preserve subclassed Series/DataFrames through calls + if isinstance(self._selected_obj, Series): + return self.obj._constructor(cumcounts, index) + elif isinstance(self._selected_obj, DataFrame): + return self.obj._constructor_sliced(cumcounts, index) @Substitution(name="groupby") @Appender(_common_see_also) From 8d9a88550537e847e6de2525122f6be3b96cbcf3 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 18:02:41 -0400 Subject: [PATCH 09/33] GH28330 Fix constructor calls to preserve subclasses through groupby() --- pandas/core/groupby/generic.py | 40 +++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b707e972cb745..6d1f011cd70c0 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1062,7 +1062,7 @@ def _cython_agg_blocks( deleted_items: List[np.ndarray] = [] # Some object-dtype blocks might be split into List[Block[T], Block[U]] split_items: List[np.ndarray] = [] - split_frames: List[DataFrame] = [] + split_frames: List[type(self.obj)] = [] no_result = object() for block in data.blocks: @@ -1101,7 +1101,7 @@ def _cython_agg_blocks( deleted_items.append(locs) continue else: - result = cast(DataFrame, result) + result = cast(type(self.obj), result) # unwrap DataFrame to get array if len(result._mgr.blocks) != 1: # We've split an object block! Everything we've assumed @@ -1231,11 +1231,11 @@ def _aggregate_item_by_item(self, func, *args, **kwargs) -> DataFrame: if cannot_agg: result_columns = result_columns.drop(cannot_agg) - return DataFrame(result, columns=result_columns) + return self.obj._constructor(result, columns=result_columns) def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: - return DataFrame(index=keys) + return self.obj._constructor(index=keys) key_names = self.grouper.names @@ -1245,7 +1245,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if first_not_none is None: # GH9684. If all values are None, then this will throw an error. # We'd prefer it return an empty dataframe. - return DataFrame() + return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) elif self.grouper.groupings is not None: @@ -1276,7 +1276,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # make Nones an empty object if first_not_none is None: - return DataFrame() + return self.obj._constructor() elif isinstance(first_not_none, NDFrame): # this is to silence a DeprecationWarning @@ -1349,7 +1349,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): or isinstance(key_index, MultiIndex) ): stacked_values = np.vstack([np.asarray(v) for v in values]) - result = DataFrame( + result = self.obj._constructor( stacked_values, index=key_index, columns=index ) else: @@ -1366,7 +1366,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): result.columns = index elif isinstance(v, ABCSeries): stacked_values = np.vstack([np.asarray(v) for v in values]) - result = DataFrame( + result = self.obj._constructor( stacked_values.T, index=v.index, columns=key_index ) else: @@ -1374,7 +1374,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # fall through to the outer else clause # TODO: sure this is right? we used to do this # after raising AttributeError above - return Series(values, index=key_index, name=self._selection_name) + return self.obj._constructor_sliced(values, index=key_index, name=self._selection_name) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here @@ -1391,7 +1391,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # self._selection_name not passed through to Series as the # result should not take the name of original selection # of columns - return Series(values, index=key_index) + return self.obj._constructor_sliced(values, index=key_index) else: # Handle cases like BinGrouper @@ -1425,7 +1425,7 @@ def _transform_general( if cache_key not in NUMBA_FUNC_CACHE: NUMBA_FUNC_CACHE[cache_key] = numba_func # Return the result as a DataFrame for concatenation later - res = DataFrame(res, index=group.index, columns=group.columns) + res = self.obj._constructor(res, index=group.index, columns=group.columns) else: # Try slow path and fast path. try: @@ -1448,7 +1448,7 @@ def _transform_general( r.columns = group.columns r.index = group.index else: - r = DataFrame( + r = self.obj._constructor( np.concatenate([res.values] * len(group.index)).reshape( group.shape ), @@ -1524,7 +1524,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm) output.append(res) - return DataFrame._from_arrays(output, columns=result.columns, index=obj.index) + return type(self.obj)._from_arrays(output, columns=result.columns, index=obj.index) def _define_paths(self, func, *args, **kwargs): if isinstance(func, str): @@ -1586,7 +1586,7 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: if len(output) < len(obj.columns): columns = columns.take(inds) - return DataFrame(output, index=obj.index, columns=columns) + return self.obj._constructor(output, index=obj.index, columns=columns) def filter(self, func, dropna=True, *args, **kwargs): """ @@ -1701,9 +1701,9 @@ def _wrap_frame_output(self, result, obj) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: - return DataFrame(result, index=obj.columns, columns=result_index).T + return self.obj._constructor(result, index=obj.columns, columns=result_index).T else: - return DataFrame(result, index=obj.index, columns=result_index) + return self.obj._constructor(result, index=obj.index, columns=result_index) def _get_data_to_aggregate(self) -> BlockManager: obj = self._obj_with_exclusions @@ -1747,7 +1747,7 @@ def _wrap_aggregated_output( indexed_output = {key.position: val for key, val in output.items()} columns = Index(key.label for key in output) - result = DataFrame(indexed_output) + result = self.obj._constructor(indexed_output) result.columns = columns if not self.as_index: @@ -1780,7 +1780,7 @@ def _wrap_transformed_output( indexed_output = {key.position: val for key, val in output.items()} columns = Index(key.label for key in output) - result = DataFrame(indexed_output) + result = self.obj._constructor(indexed_output) result.columns = columns result.index = self.obj.index @@ -1790,14 +1790,14 @@ def _wrap_agged_blocks(self, blocks: "Sequence[Block]", items: Index) -> DataFra if not self.as_index: index = np.arange(blocks[0].values.shape[-1]) mgr = BlockManager(blocks, axes=[items, index]) - result = DataFrame(mgr) + result = self.obj._constructor(mgr) self._insert_inaxis_grouper_inplace(result) result = result._consolidate() else: index = self.grouper.result_index mgr = BlockManager(blocks, axes=[items, index]) - result = DataFrame(mgr) + result = self.obj._constructor(mgr) if self.axis == 1: result = result.T From e4d7fa8d096748651a28c1f41d9071aba543573c Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 18:49:16 -0400 Subject: [PATCH 10/33] Fix typo -- Series.constructor() to Series._constructor() --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6d1f011cd70c0..dfbb8ad934784 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -541,7 +541,7 @@ def _transform_general( result = concat(results).sort_index() else: - result = self.obj.constructor(dtype=np.float64) + result = self.obj._constructor(dtype=np.float64) # we will only try to coerce the result type if # we have a numeric dtype, as these are *always* user-defined funcs From 1dbe986ccfc70bd4de575854f00b8df54e17e611 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 20:05:59 -0400 Subject: [PATCH 11/33] Remove DeprecationWarning due to empty Series construction --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index dfbb8ad934784..b893c6d1f9bdc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1287,7 +1287,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): **kwargs, dtype_if_empty=object ) else: - backup = first_not_none._constructor(**kwargs) + backup = first_not_none._constructor(**kwargs, dtype=object) values = [x if (x is not None) else backup for x in values] From c998422c2f43ee1466032f79eb24f5e9d099a63a Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 20:10:07 -0400 Subject: [PATCH 12/33] BUG: GH28330 Preserve subclassing with groupby operations --- pandas/tests/groupby/test_groupby.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3b88135a9e042..3f45c55b4c0da 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -198,11 +198,8 @@ def f(grp): @pytest.mark.parametrize( "obj", [ - tm.SubclassedDataFrame( - dict(A=[0, 0, 0, 1, 1, 1], B=range(6)), - index=["A", "B", "C", "D", "E", "F"]), - tm.SubclassedSeries([0, 0, 0, 1, 1, 1], - index=["A", "B", "C", "D", "E", "F"]), + tm.SubclassedDataFrame({"A": np.arange(0, 10)}), + tm.SubclassedSeries(np.arange(0, 10), name="A") ], ) def test_groupby_preserves_subclass(obj, groupby_func): @@ -211,7 +208,7 @@ def test_groupby_preserves_subclass(obj, groupby_func): if isinstance(obj, Series) and groupby_func in {"corrwith"}: pytest.skip("Not applicable") - grouped = obj.groupby(np.repeat([0, 1], 3)) + grouped = obj.groupby(np.arange(0, 10)) # Groups should preserve subclass type assert isinstance(grouped.get_group(0), type(obj)) From abdb86147e3216c73097bdfa7e9b040649277cd6 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 20:14:37 -0400 Subject: [PATCH 13/33] BUG: GH28330 Preserve subclassing with groupby operations --- doc/source/whatsnew/v1.1.0.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 597a0c5386cf0..1d89338efe6c7 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -699,7 +699,8 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) - +- Bug where subclasses were not preserved through groupby operations (:issue:`28330`) + Reshaping ^^^^^^^^^ From 5b83062207322f1996046d3afe25cd245641ccf2 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 20:25:59 -0400 Subject: [PATCH 14/33] Fix formatting of .py files with black --- pandas/core/groupby/generic.py | 32 +++++++++++++++++++++------- pandas/core/reshape/concat.py | 2 +- pandas/tests/groupby/test_groupby.py | 6 +++--- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 57c34e8e633c1..9c361c234ec7e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -376,7 +376,9 @@ def _wrap_series_output( result = self.obj._constructor_expanddim(indexed_output, index=index) result.columns = columns else: - result = self.obj._constructor(indexed_output[0], index=index, name=columns[0]) + result = self.obj._constructor( + indexed_output[0], index=index, name=columns[0] + ) return result @@ -435,7 +437,9 @@ def _wrap_transformed_output( def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: # GH #6265 - return self.obj._constructor([], name=self._selection_name, index=keys, dtype=np.float64) + return self.obj._constructor( + [], name=self._selection_name, index=keys, dtype=np.float64 + ) def _get_index() -> Index: if self.grouper.nkeys > 1: @@ -447,7 +451,9 @@ def _get_index() -> Index: if isinstance(values[0], dict): # GH #823 #24880 index = _get_index() - result = self._reindex_output(self.obj._constructor_expanddim(values, index=index)) + result = self._reindex_output( + self.obj._constructor_expanddim(values, index=index) + ) # if self.observed is False, # keep all-NaN rows created while re-indexing result = result.stack(dropna=self.observed) @@ -461,7 +467,9 @@ def _get_index() -> Index: return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: # GH #6265 #24880 - result = self.obj._constructor(data=values, index=_get_index(), name=self._selection_name) + result = self.obj._constructor( + data=values, index=_get_index(), name=self._selection_name + ) return self._reindex_output(result) def _aggregate_named(self, func, *args, **kwargs): @@ -1375,7 +1383,9 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # fall through to the outer else clause # TODO: sure this is right? we used to do this # after raising AttributeError above - return self.obj._constructor_sliced(values, index=key_index, name=self._selection_name) + return self.obj._constructor_sliced( + values, index=key_index, name=self._selection_name + ) # if we have date/time like in the original, then coerce dates # as we are stacking can easily have object dtypes here @@ -1426,7 +1436,9 @@ def _transform_general( if cache_key not in NUMBA_FUNC_CACHE: NUMBA_FUNC_CACHE[cache_key] = numba_func # Return the result as a DataFrame for concatenation later - res = self.obj._constructor(res, index=group.index, columns=group.columns) + res = self.obj._constructor( + res, index=group.index, columns=group.columns + ) else: # Try slow path and fast path. try: @@ -1525,7 +1537,9 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm) output.append(res) - return type(self.obj)._from_arrays(output, columns=result.columns, index=obj.index) + return type(self.obj)._from_arrays( + output, columns=result.columns, index=obj.index + ) def _define_paths(self, func, *args, **kwargs): if isinstance(func, str): @@ -1702,7 +1716,9 @@ def _wrap_frame_output(self, result, obj) -> DataFrame: result_index = self.grouper.levels[0] if self.axis == 0: - return self.obj._constructor(result, index=obj.columns, columns=result_index).T + return self.obj._constructor( + result, index=obj.columns, columns=result_index + ).T else: return self.obj._constructor(result, index=obj.index, columns=result_index) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 57acf1a005668..052c9b2707ded 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -467,7 +467,7 @@ def get_result(self): return result.__finalize__(self, method="concat") # combine as columns in a frame - else: + else: data = dict(zip(range(len(self.objs)), self.objs)) # GH28330 Preserves subclassed objects through concat diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3f45c55b4c0da..21ff0dd0f9521 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -199,7 +199,7 @@ def f(grp): "obj", [ tm.SubclassedDataFrame({"A": np.arange(0, 10)}), - tm.SubclassedSeries(np.arange(0, 10), name="A") + tm.SubclassedSeries(np.arange(0, 10), name="A"), ], ) def test_groupby_preserves_subclass(obj, groupby_func): @@ -207,12 +207,12 @@ def test_groupby_preserves_subclass(obj, groupby_func): if isinstance(obj, Series) and groupby_func in {"corrwith"}: pytest.skip("Not applicable") - + grouped = obj.groupby(np.arange(0, 10)) # Groups should preserve subclass type assert isinstance(grouped.get_group(0), type(obj)) - + args = [] if groupby_func in {"fillna", "nth"}: args.append(0) From b6ea7317082c69eb2cfccf766ab827488906746f Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Wed, 29 Apr 2020 21:54:33 -0400 Subject: [PATCH 15/33] Removed trailing whitespace in doc/source/whatsnew/v1.1.0.rst --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1d89338efe6c7..1db6834c2899e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -700,7 +700,7 @@ Groupby/resample/rolling - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) - Bug where subclasses were not preserved through groupby operations (:issue:`28330`) - + Reshaping ^^^^^^^^^ From 0cdf0eaffc290ede857cc14a3f118f71f02fd356 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Thu, 30 Apr 2020 10:47:16 -0400 Subject: [PATCH 16/33] Update DataFrameGroupBy._cython_agg_blocks() to pass mypy --- pandas/core/groupby/generic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9c361c234ec7e..8e09545f4cf8d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1070,7 +1070,7 @@ def _cython_agg_blocks( deleted_items: List[np.ndarray] = [] # Some object-dtype blocks might be split into List[Block[T], Block[U]] split_items: List[np.ndarray] = [] - split_frames: List[type(self.obj)] = [] + split_frames: List[DataFrame] = [] no_result = object() for block in data.blocks: @@ -1109,7 +1109,8 @@ def _cython_agg_blocks( deleted_items.append(locs) continue else: - result = cast(type(self.obj), result) + # Cast to DataFrame-like type + result = self.obj._constructor(result) # unwrap DataFrame to get array if len(result._mgr.blocks) != 1: # We've split an object block! Everything we've assumed From a70c21a357a8e7084b3ad13ab0d3f514f9f6205e Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Thu, 30 Apr 2020 11:35:44 -0400 Subject: [PATCH 17/33] Remove unused import of typing.cast from pandas/core/groupby/generic.py --- pandas/core/groupby/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 8e09545f4cf8d..5a61226718399 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -23,7 +23,6 @@ Tuple, Type, Union, - cast, ) import warnings From c03d459ace998c1692de029ca594a9f6dd7b7ae4 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Thu, 30 Apr 2020 16:39:41 -0400 Subject: [PATCH 18/33] Move tests to test_groupby_subclass.py --- pandas/tests/groupby/test_groupby.py | 36 --------- pandas/tests/groupby/test_groupby_subclass.py | 77 +++++++++++++++++++ 2 files changed, 77 insertions(+), 36 deletions(-) create mode 100644 pandas/tests/groupby/test_groupby_subclass.py diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 21ff0dd0f9521..c88d16e34eab8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -195,42 +195,6 @@ def f(grp): tm.assert_series_equal(result, e) -@pytest.mark.parametrize( - "obj", - [ - tm.SubclassedDataFrame({"A": np.arange(0, 10)}), - tm.SubclassedSeries(np.arange(0, 10), name="A"), - ], -) -def test_groupby_preserves_subclass(obj, groupby_func): - # GH28330 -- preserve subclass through groupby operations - - if isinstance(obj, Series) and groupby_func in {"corrwith"}: - pytest.skip("Not applicable") - - grouped = obj.groupby(np.arange(0, 10)) - - # Groups should preserve subclass type - assert isinstance(grouped.get_group(0), type(obj)) - - args = [] - if groupby_func in {"fillna", "nth"}: - args.append(0) - elif groupby_func == "corrwith": - args.append(obj) - elif groupby_func == "tshift": - args.extend([0, 0]) - - result = getattr(grouped, groupby_func)(*args) - - # Reduction or transformation kernels should preserve type - slices = groupby_func in {"cumcount", "ngroup", "size"} - if isinstance(obj, DataFrame) and slices: - assert isinstance(result, type(obj._constructor_sliced(dtype=object))) - else: - assert isinstance(result, type(obj)) - - def test_pass_args_kwargs(ts, tsframe): def f(x, q=None, axis=0): return np.percentile(x, q, axis=axis) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py new file mode 100644 index 0000000000000..39131188fbd2d --- /dev/null +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -0,0 +1,77 @@ +from datetime import datetime + +import numpy as np +import pytest + +from pandas import DataFrame, Series +import pandas._testing as tm + + +@pytest.mark.parametrize( + "obj", + [ + tm.SubclassedDataFrame({"A": np.arange(0, 10)}), + tm.SubclassedSeries(np.arange(0, 10), name="A"), + ], +) +def test_groupby_preserves_subclass(obj, groupby_func): + # GH28330 -- preserve subclass through groupby operations + + if isinstance(obj, Series) and groupby_func in {"corrwith"}: + pytest.skip("Not applicable") + + grouped = obj.groupby(np.arange(0, 10)) + + # Groups should preserve subclass type + assert isinstance(grouped.get_group(0), type(obj)) + + args = [] + if groupby_func in {"fillna", "nth"}: + args.append(0) + elif groupby_func == "corrwith": + args.append(obj) + elif groupby_func == "tshift": + args.extend([0, 0]) + + result1 = getattr(grouped, groupby_func)(*args) + result2 = grouped.agg(groupby_func, *args) + + # Reduction or transformation kernels should preserve type + slices = groupby_func in {"cumcount", "ngroup", "size"} + if isinstance(obj, DataFrame) and slices: + assert isinstance(result1, type(obj._constructor_sliced(dtype=object))) + else: + assert isinstance(result1, type(obj)) + + # Confirm .agg() groupby operations return same results + if isinstance(result1, DataFrame): + tm.assert_frame_equal(result1, result2) + else: + tm.assert_series_equal(result1, result2) + + +@pytest.mark.parametrize( + "obj", [DataFrame, tm.SubclassedDataFrame], +) +def test_groupby_resample_preserves_subclass(obj): + # GH28330 -- preserve subclass through groupby.resample() + + df = obj( + { + "Buyer": "Carl Carl Carl Carl Joe Carl".split(), + "Quantity": [18, 3, 5, 1, 9, 3], + "Date": [ + datetime(2013, 9, 1, 13, 0), + datetime(2013, 9, 1, 13, 5), + datetime(2013, 10, 1, 20, 0), + datetime(2013, 10, 3, 10, 0), + datetime(2013, 12, 2, 12, 0), + datetime(2013, 9, 2, 14, 0), + ], + } + ) + df = df.set_index("Date") + + # Confirm groupby.resample() preserves dataframe type + result = df.groupby("Buyer").resample("5D").sum() + assert isinstance(result, obj) From 9e42c7958b62a2075f3658390f1d4474b5670c45 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Thu, 30 Apr 2020 16:53:28 -0400 Subject: [PATCH 19/33] Add tests for DataFrame.idxmin() and DataFrame.idxmax() with subclasses --- pandas/tests/frame/test_subclass.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py index 16bf651829a04..f4f7cc5c1a7d6 100644 --- a/pandas/tests/frame/test_subclass.py +++ b/pandas/tests/frame/test_subclass.py @@ -573,3 +573,17 @@ def test_subclassed_boolean_reductions(self, all_boolean_reductions): df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) result = getattr(df, all_boolean_reductions)() assert isinstance(result, tm.SubclassedSeries) + + def test_idxmin_preserves_subclass(self): + # GH 28330 + + df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + result = df.idxmin() + assert isinstance(result, tm.SubclassedSeries) + + def test_idxmax_preserves_subclass(self): + # GH 28330 + + df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}) + result = df.idxmax() + assert isinstance(result, tm.SubclassedSeries) From 9fbc645d0a8397da543b4a19eb71e27155995c29 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Thu, 30 Apr 2020 17:07:14 -0400 Subject: [PATCH 20/33] Add test to confirm concat() preserves subclassed types --- pandas/tests/reshape/test_concat.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py index 7c01664df0607..6b5f294eaf1a7 100644 --- a/pandas/tests/reshape/test_concat.py +++ b/pandas/tests/reshape/test_concat.py @@ -2802,3 +2802,17 @@ def test_concat_multiindex_datetime_object_index(): ) result = concat([s, s2], axis=1) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "obj", + [ + tm.SubclassedDataFrame({"A": np.arange(0, 10)}), + tm.SubclassedSeries(np.arange(0, 10), name="A"), + ], +) +def test_concat_preserves_subclass(obj): + # GH28330 -- preserve subclass + + result = concat([obj, obj]) + assert isinstance(result, type(obj)) From 5750d724e4bfbe7db1c31a8ed6fe5a30435dcb9c Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Thu, 30 Apr 2020 17:17:00 -0400 Subject: [PATCH 21/33] Update whatsnew entry bugfix --- doc/source/whatsnew/v1.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 92d5efa410d68..61b91d948d9d5 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -701,7 +701,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` where a ``ValueError`` would be raised when grouping by a categorical column with read-only categories and ``sort=False`` (:issue:`33410`) - Bug in :meth:`GroupBy.first` and :meth:`GroupBy.last` where None is not preserved in object dtype (:issue:`32800`) - Bug in :meth:`Rolling.min` and :meth:`Rolling.max`: Growing memory usage after multiple calls when using a fixed window (:issue:`30726`) -- Bug where subclasses were not preserved through groupby operations (:issue:`28330`) +- Bug in :meth:`GroupBy.agg`, :meth:`GroupBy.transform`, and :meth:`GroupBy.resample` where subclasses are not preserved (:issue:`28330`) Reshaping ^^^^^^^^^ From 8eee73c151896bbafaec4035931c9299dad9ae1a Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Thu, 30 Apr 2020 20:26:33 -0400 Subject: [PATCH 22/33] Revert unnecessary changes in GroupBy() --- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 14 ++------------ 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 5a61226718399..d948e1926cc34 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1296,7 +1296,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): **kwargs, dtype_if_empty=object ) else: - backup = first_not_none._constructor(**kwargs, dtype=object) + backup = first_not_none._constructor(**kwargs) values = [x if (x is not None) else backup for x in values] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index acf05e92a9e46..8722fa4efaa31 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2051,13 +2051,7 @@ def ngroup(self, ascending: bool = True): """ with _group_selection_context(self): index = self._selected_obj.index - - # GH28330 preserve subclassed Series/DataFrames through calls - if isinstance(self._selected_obj, Series): - result = self.obj._constructor(self.grouper.group_info[0], index) - elif isinstance(self._selected_obj, DataFrame): - result = self.obj._constructor_sliced(self.grouper.group_info[0], index) - + result = Series(self.grouper.group_info[0], index) if not ascending: result = self.ngroups - 1 - result return result @@ -2119,12 +2113,8 @@ def cumcount(self, ascending: bool = True): with _group_selection_context(self): index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) + return Series(cumcounts, index) - # GH28330 preserve subclassed Series/DataFrames through calls - if isinstance(self._selected_obj, Series): - return self.obj._constructor(cumcounts, index) - elif isinstance(self._selected_obj, DataFrame): - return self.obj._constructor_sliced(cumcounts, index) @Substitution(name="groupby") @Appender(_common_see_also) From 4b304c17f8a41d0e4952f728f808faee6b503cd3 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Thu, 30 Apr 2020 20:31:21 -0400 Subject: [PATCH 23/33] Fix test to expect Series from GroupBy.ngroup() and GroupBy.cumcount() --- pandas/tests/groupby/test_groupby_subclass.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index 39131188fbd2d..ada9931d7219f 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -36,10 +36,12 @@ def test_groupby_preserves_subclass(obj, groupby_func): result1 = getattr(grouped, groupby_func)(*args) result2 = grouped.agg(groupby_func, *args) - # Reduction or transformation kernels should preserve type - slices = groupby_func in {"cumcount", "ngroup", "size"} - if isinstance(obj, DataFrame) and slices: + # Reduction or transformation kernels should preserve type; + # ngroup() and cumcount() always return Series + if isinstance(obj, DataFrame) and groupby_func in {"size"}: assert isinstance(result1, type(obj._constructor_sliced(dtype=object))) + elif groupby_func in {"ngroup", "cumcount"}: + assert isinstance(result1, Series) else: assert isinstance(result1, type(obj)) From b3e039a500abc110b0e67029ac4db41ab8f800bb Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Thu, 30 Apr 2020 20:35:23 -0400 Subject: [PATCH 24/33] Fix formatting of groupby.py --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8722fa4efaa31..c7c85ebd3998a 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2115,7 +2115,6 @@ def cumcount(self, ascending: bool = True): cumcounts = self._cumcount_array(ascending=ascending) return Series(cumcounts, index) - @Substitution(name="groupby") @Appender(_common_see_also) def rank( From a490e389258d4d060bb2bb8544462ce68e05d188 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Fri, 1 May 2020 12:26:03 -0400 Subject: [PATCH 25/33] Avoid DeprecationWarning by checking for instance of Series --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index d948e1926cc34..8f9b0dfdd81d4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1291,7 +1291,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() - if first_not_none._constructor is Series: + if isinstance(first_not_none, Series): backup = create_series_with_explicit_dtype( **kwargs, dtype_if_empty=object ) From 0244b36d8a77647746502aebc09037fcc346d901 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Sun, 3 May 2020 14:59:52 -0400 Subject: [PATCH 26/33] Remove unnecessary constructor call in DataFrameGroupBy._cython_agg_blocks() --- pandas/core/groupby/generic.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ab3f1de713468..3b473a760a7ce 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1078,8 +1078,9 @@ def _cython_agg_blocks( deleted_items.append(locs) continue else: - # Cast to DataFrame-like type - result = self.obj._constructor(result) + # Ensure result is DataFrame-like type + assert isinstance(result, type(self.obj)) + # unwrap DataFrame to get array if len(result._mgr.blocks) != 1: # We've split an object block! Everything we've assumed From dcd4692811dc095f431cc0836f1aad34d66d3b9f Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Sun, 3 May 2020 16:23:46 -0400 Subject: [PATCH 27/33] Fix mypy static typing issue in DataFrameGroupBy._cython_agg_blocks() --- pandas/core/groupby/generic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 3b473a760a7ce..e085581d90e84 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -23,6 +23,7 @@ Tuple, Type, Union, + cast, ) import warnings @@ -1079,7 +1080,8 @@ def _cython_agg_blocks( continue else: # Ensure result is DataFrame-like type - assert isinstance(result, type(self.obj)) + if not isinstance(result, DataFrame): + result = cast(DataFrame, result) # unwrap DataFrame to get array if len(result._mgr.blocks) != 1: From a92c51bfe96dc18a9bf33af7ac86ef0258772d86 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Sun, 3 May 2020 16:24:38 -0400 Subject: [PATCH 28/33] Ensure consistent return types for GroupBy.size(), ngroup(), and cumcount() --- pandas/core/groupby/groupby.py | 20 +++++++++++++++---- pandas/tests/groupby/test_groupby_subclass.py | 10 ++++------ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3747cf7a9ad74..71200a3adce00 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1421,9 +1421,10 @@ def size(self): result = self.grouper.size() # GH28330 preserve subclassed Series/DataFrames through calls - if isinstance(self.obj, Series): - result = self.obj._constructor(result) + if self.obj._constructor is Series: result.name = self.obj.name + elif isinstance(self.obj, Series): + result = self.obj._constructor(result, name=self.obj.name) elif isinstance(self.obj, DataFrame): result = self.obj._constructor_sliced(result) return self._reindex_output(result, fill_value=0) @@ -2120,7 +2121,13 @@ def ngroup(self, ascending: bool = True): """ with _group_selection_context(self): index = self._selected_obj.index - result = Series(self.grouper.group_info[0], index) + + # GH28330 preserve subclassed Series/DataFrames + if isinstance(self.obj, Series): + result = self.obj._constructor(self.grouper.group_info[0], index) + elif isinstance(self.obj, DataFrame): + result = self.obj._constructor_sliced(self.grouper.group_info[0], index) + if not ascending: result = self.ngroups - 1 - result return result @@ -2182,7 +2189,12 @@ def cumcount(self, ascending: bool = True): with _group_selection_context(self): index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) - return Series(cumcounts, index) + + # GH28330 preserve subclassed Series/DataFrames + if isinstance(self.obj, Series): + return self.obj._constructor(cumcounts, index) + elif isinstance(self.obj, DataFrame): + return self.obj._constructor_sliced(cumcounts, index) @Substitution(name="groupby") @Appender(_common_see_also) diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py index ada9931d7219f..6adae19005c3a 100644 --- a/pandas/tests/groupby/test_groupby_subclass.py +++ b/pandas/tests/groupby/test_groupby_subclass.py @@ -36,12 +36,10 @@ def test_groupby_preserves_subclass(obj, groupby_func): result1 = getattr(grouped, groupby_func)(*args) result2 = grouped.agg(groupby_func, *args) - # Reduction or transformation kernels should preserve type; - # ngroup() and cumcount() always return Series - if isinstance(obj, DataFrame) and groupby_func in {"size"}: - assert isinstance(result1, type(obj._constructor_sliced(dtype=object))) - elif groupby_func in {"ngroup", "cumcount"}: - assert isinstance(result1, Series) + # Reduction or transformation kernels should preserve type + slices = {"ngroup", "cumcount", "size"} + if isinstance(obj, DataFrame) and groupby_func in slices: + assert isinstance(result1, obj._constructor_sliced) else: assert isinstance(result1, type(obj)) From cf3b978d2872eecd83adde5fb07742cfcda35eec Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Mon, 11 May 2020 17:06:36 -0400 Subject: [PATCH 29/33] Revert DataFrameGroupBy._cython_agg_blocks() back to origin/master --- pandas/core/groupby/generic.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e085581d90e84..4996062fa23a3 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1079,10 +1079,7 @@ def _cython_agg_blocks( deleted_items.append(locs) continue else: - # Ensure result is DataFrame-like type - if not isinstance(result, DataFrame): - result = cast(DataFrame, result) - + result = cast(DataFrame, result) # unwrap DataFrame to get array if len(result._mgr.blocks) != 1: # We've split an object block! Everything we've assumed From f08cf597f81b8460f1ac210de0f67660288b30e4 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Mon, 11 May 2020 18:29:21 -0400 Subject: [PATCH 30/33] Add GroupBy._constructor() to facilitate preserving subclassed types --- pandas/core/groupby/groupby.py | 35 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 71200a3adce00..4cd21376288f1 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1182,6 +1182,16 @@ class GroupBy(_GroupBy[FrameOrSeries]): more """ + @property + def _constructor(self) -> Type["Series"]: + # GH28330 preserve subclassed Series/DataFrames + if isinstance(self.obj, DataFrame): + return self.obj._constructor_sliced + elif isinstance(self.obj, Series): + return self.obj._constructor + else: + return Series + def _bool_agg(self, val_test, skipna): """ Shared func to call any / all Cython GroupBy implementations. @@ -1421,12 +1431,10 @@ def size(self): result = self.grouper.size() # GH28330 preserve subclassed Series/DataFrames through calls - if self.obj._constructor is Series: - result.name = self.obj.name - elif isinstance(self.obj, Series): - result = self.obj._constructor(result, name=self.obj.name) - elif isinstance(self.obj, DataFrame): - result = self.obj._constructor_sliced(result) + if issubclass(self.obj._constructor, Series): + result = self._constructor(result, name=self.obj.name) + else: + result = self._constructor(result) return self._reindex_output(result, fill_value=0) @classmethod @@ -2121,13 +2129,7 @@ def ngroup(self, ascending: bool = True): """ with _group_selection_context(self): index = self._selected_obj.index - - # GH28330 preserve subclassed Series/DataFrames - if isinstance(self.obj, Series): - result = self.obj._constructor(self.grouper.group_info[0], index) - elif isinstance(self.obj, DataFrame): - result = self.obj._constructor_sliced(self.grouper.group_info[0], index) - + result = self._constructor(self.grouper.group_info[0], index) if not ascending: result = self.ngroups - 1 - result return result @@ -2189,12 +2191,7 @@ def cumcount(self, ascending: bool = True): with _group_selection_context(self): index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) - - # GH28330 preserve subclassed Series/DataFrames - if isinstance(self.obj, Series): - return self.obj._constructor(cumcounts, index) - elif isinstance(self.obj, DataFrame): - return self.obj._constructor_sliced(cumcounts, index) + return self._constructor(cumcounts, index) @Substitution(name="groupby") @Appender(_common_see_also) From d2a7de21aa0a895aa0546acbcf8076bb5b8570ec Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Tue, 12 May 2020 09:01:53 -0400 Subject: [PATCH 31/33] Change DataFrameGroupBy._transform_fast() to use _constructor property --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4996062fa23a3..04a3524fed480 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1507,7 +1507,7 @@ def _transform_fast(self, result: DataFrame, func_nm: str) -> DataFrame: res = maybe_cast_result(res, obj.iloc[:, i], how=func_nm) output.append(res) - return type(self.obj)._from_arrays( + return self.obj._constructor._from_arrays( output, columns=result.columns, index=obj.index ) From 37ea97f35da5cc7b2bfab743307aec8f0625e69f Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Tue, 12 May 2020 14:48:12 -0400 Subject: [PATCH 32/33] Restructure GroupBy._constructor() to remove else statement --- pandas/core/groupby/groupby.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 4cd21376288f1..0475fc50fe4cd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1187,10 +1187,8 @@ def _constructor(self) -> Type["Series"]: # GH28330 preserve subclassed Series/DataFrames if isinstance(self.obj, DataFrame): return self.obj._constructor_sliced - elif isinstance(self.obj, Series): - return self.obj._constructor - else: - return Series + assert isinstance(self.obj, Series) + return self.obj._constructor def _bool_agg(self, val_test, skipna): """ From f1570da44abaaafe3d596f7a8264cb14ecdf8c45 Mon Sep 17 00:00:00 2001 From: Jack Greisman Date: Tue, 12 May 2020 21:20:30 -0400 Subject: [PATCH 33/33] Rename GroupBy._constructor property to GroupBy._obj_1d_constructor --- pandas/core/groupby/groupby.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 0475fc50fe4cd..08d18397d7225 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1183,7 +1183,7 @@ class GroupBy(_GroupBy[FrameOrSeries]): """ @property - def _constructor(self) -> Type["Series"]: + def _obj_1d_constructor(self) -> Type["Series"]: # GH28330 preserve subclassed Series/DataFrames if isinstance(self.obj, DataFrame): return self.obj._constructor_sliced @@ -1430,9 +1430,9 @@ def size(self): # GH28330 preserve subclassed Series/DataFrames through calls if issubclass(self.obj._constructor, Series): - result = self._constructor(result, name=self.obj.name) + result = self._obj_1d_constructor(result, name=self.obj.name) else: - result = self._constructor(result) + result = self._obj_1d_constructor(result) return self._reindex_output(result, fill_value=0) @classmethod @@ -2127,7 +2127,7 @@ def ngroup(self, ascending: bool = True): """ with _group_selection_context(self): index = self._selected_obj.index - result = self._constructor(self.grouper.group_info[0], index) + result = self._obj_1d_constructor(self.grouper.group_info[0], index) if not ascending: result = self.ngroups - 1 - result return result @@ -2189,7 +2189,7 @@ def cumcount(self, ascending: bool = True): with _group_selection_context(self): index = self._selected_obj.index cumcounts = self._cumcount_array(ascending=ascending) - return self._constructor(cumcounts, index) + return self._obj_1d_constructor(cumcounts, index) @Substitution(name="groupby") @Appender(_common_see_also)