From c6851590e60183459e09d9d7acb5989805be2734 Mon Sep 17 00:00:00 2001 From: Richard Date: Sun, 14 Jun 2020 08:23:35 -0400 Subject: [PATCH 1/3] BUG: Groupby with as_index=False raises error when type is Category --- doc/source/whatsnew/v1.1.0.rst | 20 ++++++++++++++++++++ pandas/core/frame.py | 2 +- pandas/core/groupby/groupby.py | 17 +++++++++++------ pandas/tests/groupby/test_size.py | 16 ++++++++++++++++ 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 2243790a663df..e998c60d3ce85 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -670,6 +670,25 @@ Using :meth:`DataFrame.groupby` with ``as_index=False`` and the function ``idxma df.groupby("a", as_index=False).nunique() +The method :meth:`~pandas.core.groupby.DataFrameGroupBy.size` would previously ignore ``as_index=False``. Now the grouping columns are returned as columns, making the result a :class:`DataFrame` instead of a :class:`Series`. (:issue:`32599`) + +*Previous behavior*: + +.. code-block:: ipython + + In [3]: df.groupby("a", as_index=False).size() + Out[3]: + a + x 2 + y 2 + dtype: int64 + +*New behavior*: + +.. ipython:: python + + df.groupby("a", as_index=False).size() + .. _whatsnew_110.api_breaking.apply_applymap_first_once: apply and applymap on ``DataFrame`` evaluates first row/column only once @@ -983,6 +1002,7 @@ Groupby/resample/rolling The behaviour now is consistent, independent of internal heuristics. (:issue:`31612`, :issue:`14927`, :issue:`13056`) - Bug in :meth:`SeriesGroupBy.agg` where any column name was accepted in the named aggregation of ``SeriesGroupBy`` previously. The behaviour now allows only ``str`` and callables else would raise ``TypeError``. 
(:issue:`34422`) + Reshaping ^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 5f8ab8966c1f0..560c4acf10d06 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5440,7 +5440,7 @@ def value_counts( if subset is None: subset = self.columns.tolist() - counts = self.groupby(subset).size() + counts = self.groupby(subset).grouper.size() if sort: counts = counts.sort_values(ascending=ascending) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 9838cff9b34f9..32ea9efa9850b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -942,9 +942,9 @@ def _transform_should_cast(self, func_nm: str) -> bool: bool Whether transform should attempt to cast the result of aggregation """ - return (self.size().fillna(0) > 0).any() and ( - func_nm not in base.cython_cast_blacklist - ) + filled_series = self.grouper.size().fillna(0) + assert filled_series is not None + return filled_series.gt(0).any() and func_nm not in base.cython_cast_blacklist def _cython_transform(self, how: str, numeric_only: bool = True, **kwargs): output: Dict[base.OutputKey, np.ndarray] = {} @@ -1507,14 +1507,15 @@ def sem(self, ddof: int = 1): @Substitution(name="groupby") @Appender(_common_see_also) - def size(self): + def size(self) -> FrameOrSeriesUnion: """ Compute group sizes. Returns ------- - Series - Number of rows in each group. + DataFrame or Series + Number of rows in each group as a Series if as_index is True + or a DataFrame if as_index is False. 
""" result = self.grouper.size() @@ -1523,6 +1524,10 @@ def size(self): result = self._obj_1d_constructor(result, name=self.obj.name) else: result = self._obj_1d_constructor(result) + + if not self.as_index: + result = result.reset_index() + return self._reindex_output(result, fill_value=0) @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 42bccc67fe0f8..a59f31f06a076 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -44,3 +44,19 @@ def test_size_period_index(): grp = ser.groupby(level="A") result = grp.size() tm.assert_series_equal(result, ser) + + +@pytest.mark.parametrize("as_index", [True, False]) +def test_size_on_categorical(as_index): + df = DataFrame([[1, 1], [2, 2]], columns=["A", "B"]) + df["A"] = df["A"].astype("category") + result = df.groupby(["A", "B"], as_index=as_index).size() + + expected = DataFrame( + [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", 0], + ) + expected["A"] = expected["A"].astype("category") + if as_index: + expected = expected.set_index(["A", "B"])[0].rename(None) + + tm.assert_equal(result, expected) From 5ba2657ecbc18f025b700ca4fe8e7db230b524c9 Mon Sep 17 00:00:00 2001 From: Richard Date: Sun, 14 Jun 2020 08:48:56 -0400 Subject: [PATCH 2/3] Named column "size" when as_index=False --- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/test_size.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 32ea9efa9850b..5c3dd8ea4fac0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1526,7 +1526,7 @@ def size(self) -> FrameOrSeriesUnion: result = self._obj_1d_constructor(result) if not self.as_index: - result = result.reset_index() + result = result.rename("size").reset_index() return self._reindex_output(result, fill_value=0) diff --git 
a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index a59f31f06a076..9cff8b966dad0 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -53,10 +53,10 @@ def test_size_on_categorical(as_index): result = df.groupby(["A", "B"], as_index=as_index).size() expected = DataFrame( - [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", 0], + [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"], ) expected["A"] = expected["A"].astype("category") if as_index: - expected = expected.set_index(["A", "B"])[0].rename(None) + expected = expected.set_index(["A", "B"])["size"].rename(None) tm.assert_equal(result, expected) From 01312df1f0dfae0ce193a6387b30ae7ca22ab679 Mon Sep 17 00:00:00 2001 From: Richard Date: Sun, 14 Jun 2020 08:57:36 -0400 Subject: [PATCH 3/3] No longer skipping size when as_index=False in test --- pandas/tests/groupby/test_groupby.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 80f34bb91cdfd..664c30e003632 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -668,11 +668,14 @@ def test_ops_not_as_index(reduction_func): if reduction_func in ("corrwith",): pytest.skip("Test not applicable") - if reduction_func in ("nth", "ngroup", "size",): + if reduction_func in ("nth", "ngroup",): pytest.skip("Skip until behavior is determined (GH #5755)") df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"]) - expected = getattr(df.groupby("a"), reduction_func)().reset_index() + expected = getattr(df.groupby("a"), reduction_func)() + if reduction_func == "size": + expected = expected.rename("size") + expected = expected.reset_index() g = df.groupby("a", as_index=False)