From 8d10ab4facf7225862150ca7695f60f8f5402bfe Mon Sep 17 00:00:00 2001 From: Stan West <38358698+stanwest@users.noreply.github.com> Date: Mon, 25 Oct 2021 09:09:48 -0400 Subject: [PATCH 1/5] CLN: Remove check for instance of Series performed by prior clause --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 7577b1e671d60..21aedb7cf9dee 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -938,7 +938,7 @@ def _convert_grouper(axis: Index, grouper): return grouper.reindex(axis)._values elif isinstance(grouper, MultiIndex): return grouper._values - elif isinstance(grouper, (list, tuple, Series, Index, np.ndarray)): + elif isinstance(grouper, (list, tuple, Index, np.ndarray)): if len(grouper) != len(axis): raise ValueError("Grouper and axis must be same length") From 08399e1215f2d95d096f11481e69ee21d749b7e5 Mon Sep 17 00:00:00 2001 From: Stan West <38358698+stanwest@users.noreply.github.com> Date: Mon, 25 Oct 2021 09:33:14 -0400 Subject: [PATCH 2/5] BUG: Group by categorical Series with unequal length GH44179 --- doc/source/whatsnew/v1.4.0.rst | 2 +- pandas/core/groupby/grouper.py | 8 +------- pandas/tests/groupby/test_categorical.py | 9 +++++++-- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 254a004a37c40..9bcb44b72395a 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -593,7 +593,7 @@ Groupby/resample/rolling - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` for centered datetimelike windows with uneven nanosecond (:issue:`43997`) - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`#3944`) - +- Enabled grouping by a :class:`Series` that has a categorical data type and length unequal to the axis of grouping (:issue:`44179`); also, with this fix, the ``ValueError`` raised when grouping by a :class:`Categorical` with unequal length now has the same message as when grouping by other sequences Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 21aedb7cf9dee..539fd410b75be 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -887,12 +887,6 @@ def is_in_obj(gpr) -> bool: else: in_axis = False - if is_categorical_dtype(gpr) and len(gpr) != obj.shape[axis]: - raise ValueError( - f"Length of grouper ({len(gpr)}) and axis ({obj.shape[axis]}) " - "must be same length" - ) - # create the Grouping # allow us to passing the actual Grouping as the gpr ping = ( @@ -938,7 +932,7 @@ def _convert_grouper(axis: Index, grouper): return grouper.reindex(axis)._values elif isinstance(grouper, MultiIndex): return grouper._values - elif isinstance(grouper, (list, tuple, Index, np.ndarray)): + elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)): if len(grouper) != len(axis): raise ValueError("Grouper and axis must be same length") diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 339bb2c30736d..30bad8fa7188e 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -668,11 +668,16 @@ def test_bins_unequal_len(): bins = pd.cut(series.dropna().values, 4) # len(bins) != len(series) here - msg = r"Length of grouper \(8\) and axis \(10\) must be same length" - with pytest.raises(ValueError, match=msg): + with pytest.raises(ValueError, match=r"Grouper and axis must be same length"): series.groupby(bins).mean() +def test_categorical_series_unequal_len(): + # GH44179 + groupby = Series(range(7)).groupby(Series(list("ABBA"), dtype="category")) + tm.assert_dict_equal(groupby.groups, {"A": [0, 3], "B": [1, 2]}) + + def test_as_index(): # GH13204 df = DataFrame( From e8a0c9b9f36e2c9896729824ad57906c66b66a2f Mon Sep 17 00:00:00 2001 From: Stan West <38358698+stanwest@users.noreply.github.com> Date: Thu, 28 Oct 2021 10:43:08 -0400 Subject: [PATCH 3/5] TST: Expand test of grouping by categorical Series Add tests of grouping Series with length equal to that of the grouper and index both equal and unequal to that of the grouper. Operate on the GroupBy and check the result. --- pandas/tests/groupby/test_categorical.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 30bad8fa7188e..66976644b4dcf 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -672,10 +672,26 @@ def test_bins_unequal_len(): series.groupby(bins).mean() -def test_categorical_series_unequal_len(): - # GH44179 - groupby = Series(range(7)).groupby(Series(list("ABBA"), dtype="category")) - tm.assert_dict_equal(groupby.groups, {"A": [0, 3], "B": [1, 2]}) +@pytest.mark.parametrize( + ["series", "data"], + [ + # Group a series with length and index equal to those of the grouper. + (Series(range(4)), {"A": [0, 3], "B": [1, 2]}), + # Group a series with length equal to that of the grouper and index unequal to + # that of the grouper. + (Series(range(4)).rename(lambda idx: idx + 1), {"A": [2], "B": [0, 1]}), + # GH44179: Group a series with length unequal to that of the grouper. + (Series(range(7)), {"A": [0, 3], "B": [1, 2]}), + ], +) +def test_categorical_series(series, data): + # Group the given series by a series with categorical data type such that group A + # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in + # the given data. + groupby = series.groupby(Series(list("ABBA"), dtype="category")) + tm.assert_series_equal( + groupby.aggregate(list), Series(data, index=CategoricalIndex(data.keys())) + ) def test_as_index(): From 74580bf3e5e0787320163bc36a8fe26825cc2ddb Mon Sep 17 00:00:00 2001 From: Stan West <38358698+stanwest@users.noreply.github.com> Date: Mon, 8 Nov 2021 10:57:38 -0500 Subject: [PATCH 4/5] DOC: Edit whatsnew entry for GH44179 --- doc/source/whatsnew/v1.4.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 9bcb44b72395a..ca7900ff7c420 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -593,7 +593,7 @@ Groupby/resample/rolling - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` for centered datetimelike windows with uneven nanosecond (:issue:`43997`) - Bug in :meth:`GroupBy.nth` failing on ``axis=1`` (:issue:`43926`) - Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not respecting right bound on centered datetime-like windows, if the index contain duplicates (:issue:`#3944`) -- Enabled grouping by a :class:`Series` that has a categorical data type and length unequal to the axis of grouping (:issue:`44179`); also, with this fix, the ``ValueError`` raised when grouping by a :class:`Categorical` with unequal length now has the same message as when grouping by other sequences +- Fixed bug where grouping by a :class:`Series` that has a categorical data type and length unequal to the axis of grouping raised ``ValueError`` (:issue:`44179`) Reshaping ^^^^^^^^^ From 65fcf42b541577fa09c999d983576f7bf8e86e69 Mon Sep 17 00:00:00 2001 From: Stan West <38358698+stanwest@users.noreply.github.com> Date: Mon, 8 Nov 2021 11:37:56 -0500 Subject: [PATCH 5/5] CLN: Revise tests affected by fix for GH44179 --- pandas/tests/groupby/test_categorical.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 66976644b4dcf..bfbf7ecf1a0d6 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -668,7 +668,7 @@ def test_bins_unequal_len(): bins = pd.cut(series.dropna().values, 4) # len(bins) != len(series) here - with pytest.raises(ValueError, match=r"Grouper and axis must be same length"): + with pytest.raises(ValueError, match="Grouper and axis must be same length"): series.groupby(bins).mean() @@ -689,9 +689,9 @@ def test_categorical_series(series, data): # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in # the given data. groupby = series.groupby(Series(list("ABBA"), dtype="category")) - tm.assert_series_equal( - groupby.aggregate(list), Series(data, index=CategoricalIndex(data.keys())) - ) + result = groupby.aggregate(list) + expected = Series(data, index=CategoricalIndex(data.keys())) + tm.assert_series_equal(result, expected) def test_as_index():