From c01577c7a6ad47ecbe3fdbaf135f5cc75b744ce7 Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 18 Aug 2020 17:37:45 -0400 Subject: [PATCH 1/5] CLN/BUG: Clean/Simplify _wrap_applied_output --- doc/source/whatsnew/v1.2.0.rst | 3 +- pandas/core/groupby/generic.py | 86 ++++++++---------------------- pandas/core/indexes/api.py | 7 +-- pandas/tests/groupby/test_apply.py | 7 +-- 4 files changed, 32 insertions(+), 71 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 42f95d88d74ac..0d77f1e96cf1c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -249,9 +249,8 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.count` and :meth:`SeriesGroupBy.sum` returning ``NaN`` for missing categories when grouped on multiple ``Categoricals``. Now returning ``0`` (:issue:`35028`) - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) -- -- - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) +- Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on. Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1f0cdbd07560f..cd82c5159bbfc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1209,51 +1209,21 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): first_not_none = next(com.not_none(*values), None) if first_not_none is None: - # GH9684. If all values are None, then this will throw an error. - # We'd prefer it return an empty dataframe. + # GH9684 - All values are None, return an empty frame. 
return self.obj._constructor() elif isinstance(first_not_none, DataFrame): return self._concat_objects(keys, values, not_indexed_same=not_indexed_same) else: - if len(self.grouper.groupings) > 1: - key_index = self.grouper.result_index + key_index = self.grouper.result_index if self.as_index else None - else: - ping = self.grouper.groupings[0] - if len(keys) == ping.ngroups: - key_index = ping.group_index - key_index.name = key_names[0] - - key_lookup = Index(keys) - indexer = key_lookup.get_indexer(key_index) - - # reorder the values - values = [values[i] for i in indexer] - - # update due to the potential reorder - first_not_none = next(com.not_none(*values), None) - else: - - key_index = Index(keys, name=key_names[0]) - - # don't use the key indexer - if not self.as_index: - key_index = None - - # make Nones an empty object - if first_not_none is None: - return self.obj._constructor() - elif isinstance(first_not_none, NDFrame): + if isinstance(first_not_none, Series): # this is to silence a DeprecationWarning # TODO: Remove when default dtype of empty Series is object kwargs = first_not_none._construct_axes_dict() - if isinstance(first_not_none, Series): - backup = create_series_with_explicit_dtype( - **kwargs, dtype_if_empty=object - ) - else: - backup = first_not_none._constructor(**kwargs) + backup = create_series_with_explicit_dtype( + **kwargs, dtype_if_empty=object + ) values = [x if (x is not None) else backup for x in values] @@ -1262,7 +1232,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if isinstance(v, (np.ndarray, Index, Series)) or not self.as_index: if isinstance(v, Series): applied_index = self._selected_obj._get_axis(self.axis) - all_indexed_same = all_indexes_same([x.index for x in values]) + all_indexed_same = all_indexes_same((x.index for x in values)) singular_series = len(values) == 1 and applied_index.nlevels == 1 # GH3596 @@ -1294,7 +1264,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): 
# GH 8467 return self._concat_objects(keys, values, not_indexed_same=True) - if self.axis == 0 and isinstance(v, ABCSeries): # GH6124 if the list of Series have a consistent name, # then propagate that name to the result. index = v.index.copy() @@ -1307,34 +1276,25 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(names) == 1: index.name = list(names)[0] - # normally use vstack as its faster than concat - # and if we have mi-columns - if ( - isinstance(v.index, MultiIndex) - or key_index is None - or isinstance(key_index, MultiIndex) - ): - stacked_values = np.vstack([np.asarray(v) for v in values]) - result = self.obj._constructor( - stacked_values, index=key_index, columns=index - ) - else: - # GH5788 instead of stacking; concat gets the - # dtypes correct - from pandas.core.reshape.concat import concat - - result = concat( - values, - keys=key_index, - names=key_index.names, - axis=self.axis, - ).unstack() - result.columns = index - elif isinstance(v, ABCSeries): stacked_values = np.vstack([np.asarray(v) for v in values]) + + if self.axis == 0: + index = key_index + columns = v.index.copy() + if columns.name is None: + # GH6124 - propagate name of Series when it's consistent + names = {v.name for v in values} + if len(names) == 1: + columns.name = list(names)[0] + else: + index = v.index + columns = key_index + stacked_values = stacked_values.T + result = self.obj._constructor( - stacked_values.T, index=v.index, columns=key_index + stacked_values, index=index, columns=columns ) + elif not self.as_index: # We add grouping column below, so create a frame here result = DataFrame( diff --git a/pandas/core/indexes/api.py b/pandas/core/indexes/api.py index 30cc8cf480dcf..d352b001f5d2a 100644 --- a/pandas/core/indexes/api.py +++ b/pandas/core/indexes/api.py @@ -297,15 +297,16 @@ def all_indexes_same(indexes): Parameters ---------- - indexes : list of Index objects + indexes : iterable of Index objects Returns ------- bool True if all 
indexes contain the same elements, False otherwise. """ - first = indexes[0] - for index in indexes[1:]: + itr = iter(indexes) + first = next(itr) + for index in itr: if not first.equals(index): return False return True diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index ee38722ffb8ce..a1dcb28a32c6c 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -861,13 +861,14 @@ def test_apply_multi_level_name(category): b = [1, 2] * 5 if category: b = pd.Categorical(b, categories=[1, 2, 3]) + expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B") + else: + expected_index = pd.Index([1, 2], name="B") df = pd.DataFrame( {"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))} ).set_index(["A", "B"]) result = df.groupby("B").apply(lambda x: x.sum()) - expected = pd.DataFrame( - {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B") - ) + expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index) tm.assert_frame_equal(result, expected) assert df.index.names == ["A", "B"] From 3f4045e86fe13411b167c79b9d8bd612c0f4327d Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 18 Aug 2020 18:28:24 -0400 Subject: [PATCH 2/5] Removed key_names variable. 
--- pandas/core/groupby/generic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index cd82c5159bbfc..4a25dd3ab6c3b 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1203,8 +1203,6 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return self.obj._constructor(index=keys) - key_names = self.grouper.names - # GH12824 first_not_none = next(com.not_none(*values), None) From f779e19e4ad408472f98fec7d2044c7e967b2f67 Mon Sep 17 00:00:00 2001 From: Richard Date: Thu, 20 Aug 2020 16:26:23 -0400 Subject: [PATCH 3/5] Added comment --- pandas/core/groupby/generic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 4a25dd3ab6c3b..ea5a84965c00e 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1274,6 +1274,8 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(names) == 1: index.name = list(names)[0] + # Combine values + # Using vstack+constructor is faster than concat, and handles MI-columns stacked_values = np.vstack([np.asarray(v) for v in values]) if self.axis == 0: From d600e1496c043e47b667c9e6d13f96347a84edcb Mon Sep 17 00:00:00 2001 From: Richard Date: Thu, 20 Aug 2020 16:27:44 -0400 Subject: [PATCH 4/5] Fixed comment --- pandas/core/groupby/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ea5a84965c00e..cb3a6308ecad9 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1275,7 +1275,7 @@ def _wrap_applied_output(self, keys, values, not_indexed_same=False): index.name = list(names)[0] # Combine values - # Using vstack+constructor is faster than concat, and handles MI-columns + # vstack+constructor is faster than concat and handles MI-columns stacked_values = 
np.vstack([np.asarray(v) for v in values]) if self.axis == 0: From 6a93da6b8128a90a1648dc84385a3e3f61b8bc10 Mon Sep 17 00:00:00 2001 From: Richard Date: Tue, 25 Aug 2020 16:03:13 -0400 Subject: [PATCH 5/5] Added PR # to whatsnew --- doc/source/whatsnew/v1.2.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d42f8c521acd9..a7fddc886ec98 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -251,7 +251,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.apply` that would some times throw an erroneous ``ValueError`` if the grouping axis had duplicate entries (:issue:`16646`) - Bug when combining methods :meth:`DataFrame.groupby` with :meth:`DataFrame.resample` and :meth:`DataFrame.interpolate` raising an ``TypeError`` (:issue:`35325`) - Bug in :meth:`DataFrameGroupBy.apply` where a non-nuisance grouping column would be dropped from the output columns if another groupby method was called before ``.apply()`` (:issue:`34656`) -- Bug in :meth:`DataFrameGroupby.apply` would drop a :class:`CategoricalIndex` when grouped on. +- Bug in :meth:`DataFrameGroupBy.apply` where a :class:`CategoricalIndex` would be dropped when grouped on (:issue:`35792`) Reshaping ^^^^^^^^^