From b00b17f66b561b3a507b91ffa0e46f609bc28c68 Mon Sep 17 00:00:00 2001 From: Jan van Heys Date: Sun, 25 Sep 2022 15:05:06 +0200 Subject: [PATCH 1/3] BUG: Bug fix for GH48567 Groupby for Series fails if an entry of the index of the Series is equal to the name of the index. Analyzing the bug, I think the following small change would solve the issue: diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 5fc713d84e..9281040a79 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -875,7 +875,7 @@ def get_grouper( exclusions.add(gpr.name) elif is_in_axis(gpr): # df.groupby('name') - if gpr in obj: + if not isinstance(obj, Series) and gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] From my point of view "gpr in obj" does not make sense for a Series at this point. I think the if-statement is written for DataFrames, to check whether gpr is one of the columns. Skipping the if-statement for a Series gives the correct results for the examples above. The test suite for Series and groupby runs without errors after the change. I added a test at the end of pandas/tests/groupby/test_groupby.py and an entry in the bug-fix section of doc/source/whatsnew/v1.6.0.rst. 
--- doc/source/whatsnew/v1.6.0.rst | 1 + pandas/core/groupby/grouper.py | 2 +- pandas/tests/groupby/test_groupby.py | 27 +++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.6.0.rst b/doc/source/whatsnew/v1.6.0.rst index a7c9a7eb88221..7c07d3112be3c 100644 --- a/doc/source/whatsnew/v1.6.0.rst +++ b/doc/source/whatsnew/v1.6.0.rst @@ -196,6 +196,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`DataFrameGroupBy.sample` raises ``ValueError`` when the object is empty (:issue:`48459`) +- Bug in :meth:`Series.groupby` raises ``ValueError`` when an entry of the index is equal to the name of the index (:issue:`48567`) - Reshaping diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 5fc713d84e842..9281040a79e7f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -875,7 +875,7 @@ def is_in_obj(gpr) -> bool: exclusions.add(gpr.name) elif is_in_axis(gpr): # df.groupby('name') - if gpr in obj: + if not isinstance(obj, Series) and gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index ba39f76203623..d37fee289fdcb 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2907,3 +2907,30 @@ def test_groupby_cumsum_mask(any_numeric_ea_dtype, skipna, val): dtype=any_numeric_ea_dtype, ) tm.assert_frame_equal(result, expected) + +def test_groupby_index_name_in_index_content(): + # GH 48567 + series1 = Series(data=[1.0, 2.0, 3.0, 4.0, 5.0], name='values', + index=Index(['foo', 'foo', 'bar', 'baz', 'blah'], + name='blah')) + result1 = series1.groupby('blah').sum() + expected1 = Series(data=[3.0, 4.0, 5.0, 3.0], name='values', + index=Index(['bar', 'baz', 'blah', 'foo'], name='blah')) + tm.assert_series_equal(result1, expected1) + + series2 = 
Series(data=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='values', + index=Index(['foo', 'foo', 'bar', 'baz', 'blah', 'blah'], + name='blah')) + result2 = series2.groupby('blah').sum() + expected2 = Series(data=[3.0, 4.0, 11.0, 3.0], name='values', + index=Index(['bar', 'baz', 'blah', 'foo'], + name='blah')) + tm.assert_series_equal(result2, expected2) + + result3 = series1.to_frame().groupby('blah').sum() + expected3 = expected1.to_frame() + tm.assert_frame_equal(result3, expected3) + + result4 = series2.to_frame().groupby('blah').sum() + expected4 = expected2.to_frame() + tm.assert_frame_equal(result4, expected4) From 85bd0d8c72655f7cddfc4e60c68b8e53f8a1d0c4 Mon Sep 17 00:00:00 2001 From: Jan van Heys Date: Tue, 27 Sep 2022 21:02:50 +0200 Subject: [PATCH 2/3] BUG: Bug fix for GH48567 Used pytest.mark.parametrize to simplify similar tests. Corrected formatting issues. --- pandas/tests/groupby/test_groupby.py | 52 +++++++++++++++------------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index d37fee289fdcb..b6c159f67582c 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2908,29 +2908,33 @@ def test_groupby_cumsum_mask(any_numeric_ea_dtype, skipna, val): ) tm.assert_frame_equal(result, expected) -def test_groupby_index_name_in_index_content(): + +@pytest.mark.parametrize( + "val_in, index, val_out", + [ + ( + [1.0, 2.0, 3.0, 4.0, 5.0], + ["foo", "foo", "bar", "baz", "blah"], + [3.0, 4.0, 5.0, 3.0], + ), + ( + [1.0, 2.0, 3.0, 4.0, 5.0, 6.0], + ["foo", "foo", "bar", "baz", "blah", "blah"], + [3.0, 4.0, 11.0, 3.0], + ), + ], +) +def test_groupby_index_name_in_index_content(val_in, index, val_out): # GH 48567 - series1 = Series(data=[1.0, 2.0, 3.0, 4.0, 5.0], name='values', - index=Index(['foo', 'foo', 'bar', 'baz', 'blah'], - name='blah')) - result1 = series1.groupby('blah').sum() - expected1 = Series(data=[3.0, 4.0, 5.0, 3.0], 
name='values', - index=Index(['bar', 'baz', 'blah', 'foo'], name='blah')) - tm.assert_series_equal(result1, expected1) - - series2 = Series(data=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name='values', - index=Index(['foo', 'foo', 'bar', 'baz', 'blah', 'blah'], - name='blah')) - result2 = series2.groupby('blah').sum() - expected2 = Series(data=[3.0, 4.0, 11.0, 3.0], name='values', - index=Index(['bar', 'baz', 'blah', 'foo'], - name='blah')) - tm.assert_series_equal(result2, expected2) - - result3 = series1.to_frame().groupby('blah').sum() - expected3 = expected1.to_frame() - tm.assert_frame_equal(result3, expected3) + series = Series(data=val_in, name="values", index=Index(index, name="blah")) + result = series.groupby("blah").sum() + expected = Series( + data=val_out, + name="values", + index=Index(["bar", "baz", "blah", "foo"], name="blah"), + ) + tm.assert_series_equal(result, expected) - result4 = series2.to_frame().groupby('blah').sum() - expected4 = expected2.to_frame() - tm.assert_frame_equal(result4, expected4) + result = series.to_frame().groupby("blah").sum() + expected = expected.to_frame() + tm.assert_frame_equal(result, expected) From 41ac0278bd612466a4c4933d29eec05d5c3d1afd Mon Sep 17 00:00:00 2001 From: Jan van Heys Date: Fri, 30 Sep 2022 14:04:07 +0200 Subject: [PATCH 3/3] BUG: Bug fix for GH48567 Changed the isinstance check to obj.ndim != 1 as requested. --- pandas/core/groupby/grouper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 9281040a79e7f..8d52f34b8d825 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -875,7 +875,7 @@ def is_in_obj(gpr) -> bool: exclusions.add(gpr.name) elif is_in_axis(gpr): # df.groupby('name') - if not isinstance(obj, Series) and gpr in obj: + if obj.ndim != 1 and gpr in obj: if validate: obj._check_label_or_level_ambiguity(gpr, axis=axis) in_axis, name, gpr = True, gpr, obj[gpr]