From 6048cdcc1c27778ead999920dd46785a6aec0633 Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Sun, 4 Jun 2023 17:49:46 +0800 Subject: [PATCH 1/6] FIX groupby with column selection not returning tuple when grouping by a singleton list --- pandas/core/groupby/groupby.py | 9 +++++++-- pandas/tests/groupby/test_groupby.py | 11 ++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index caed6c9747d3b..3aaac5e328699 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -901,8 +901,13 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: FutureWarning, stacklevel=find_stack_level(), ) - if isinstance(keys, list) and len(keys) == 1: - # GH#42795 - when keys is a list, return tuples even when length is 1 + if ( + isinstance(keys, list) + and len(keys) == 1 + or isinstance(keys, ops.BaseGrouper) + and len(keys.names) == 1 + ): + # GH#42795, GH#53500 - if groupby by list of one, still return tuples result = (((key,), group) for key, group in result) return result diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 0c6661b49d917..84600cfc37f22 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2723,11 +2723,16 @@ def test_groupby_none_column_name(): def test_single_element_list_grouping(): - # GH 42795 + # GH#42795, GH#53500 df = DataFrame({"a": [1, 2], "b": [np.nan, 5], "c": [np.nan, 2]}, index=["x", "y"]) - result = [key for key, _ in df.groupby(["a"])] + grouped = df.groupby(["a"]) + + result1 = [key for key, _ in grouped] + result2 = [key for key, _ in grouped[["a", "b", "c"]]] expected = [(1,), (2,)] - assert result == expected + + assert result1 == expected + assert result2 == expected def test_groupby_string_dtype(): From 467d51d82797493cf7b54055c0d8375992f032cb Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Sun, 4 Jun 2023 18:02:30 +0800 Subject: [PATCH 2/6] changelog added --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 92124a536fe26..8323da50bc0ce 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -435,6 +435,7 @@ Groupby/resample/rolling grouped :class:`Series` or :class:`DataFrame` was a :class:`DatetimeIndex`, :class:`TimedeltaIndex` or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument, the function operated on the whole index rather than each element of the index. (:issue:`51979`) +- Bug in :meth:`DataFrame.groupby` when grouping by a list of a single element and selecting columns of the resulting groupby object. The group keys are now tuples. (:issue:`53500`) - Bug in :meth:`DataFrameGroupBy.agg` with lists not respecting ``as_index=False`` (:issue:`52849`) - Bug in :meth:`DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) - Bug in :meth:`DataFrameGroupBy.apply` raising a ``TypeError`` when selecting multiple columns and providing a function that returns ``np.ndarray`` results (:issue:`18930`) From 63fbe95dcf0654da3bbb86d635ab10493c6bc3f2 Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Sun, 4 Jun 2023 18:06:04 +0800 Subject: [PATCH 3/6] reworded changelog --- doc/source/whatsnew/v2.1.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 8323da50bc0ce..5de726f8fbc36 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -435,7 +435,7 @@ Groupby/resample/rolling grouped :class:`Series` or :class:`DataFrame` was a :class:`DatetimeIndex`, :class:`TimedeltaIndex` or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument, the function operated on the whole index rather than each element of the index. (:issue:`51979`) -- Bug in :meth:`DataFrame.groupby` when grouping by a list of a single element and selecting columns of the resulting groupby object. The group keys are now tuples. (:issue:`53500`) +- Bug in :meth:`DataFrame.groupby` with column selection on the resulting groupby object not returning tuples when grouping by a list of a single element. (:issue:`53500`) - Bug in :meth:`DataFrameGroupBy.agg` with lists not respecting ``as_index=False`` (:issue:`52849`) - Bug in :meth:`DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) - Bug in :meth:`DataFrameGroupBy.apply` raising a ``TypeError`` when selecting multiple columns and providing a function that returns ``np.ndarray`` results (:issue:`18930`) From 77ef5d994a7f7b7b56723c01b87cf4e4591736c5 Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Sun, 4 Jun 2023 22:20:19 +0800 Subject: [PATCH 4/6] revert and change to keys=self.keys --- pandas/core/groupby/generic.py | 2 +- pandas/core/groupby/groupby.py | 9 ++------- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 80e7be0fd3c91..631aac641fc0a 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1935,7 +1935,7 @@ def _gotitem(self, key, ndim: int, subset=None): subset = self.obj return DataFrameGroupBy( subset, - self.grouper, + self.keys, axis=self.axis, level=self.level, grouper=self.grouper, diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3aaac5e328699..caed6c9747d3b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -901,13 +901,8 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: FutureWarning, stacklevel=find_stack_level(), ) - if ( - isinstance(keys, list) - and len(keys) == 1 - or isinstance(keys, ops.BaseGrouper) - and len(keys.names) == 1 - ): - # GH#42795, GH#53500 - if groupby by list of one, still return tuples + if isinstance(keys, list) and len(keys) == 1: + # GH#42795 - when keys is a list, return tuples even when length is 1 result = (((key,), group) for key, group in result) return result From 27ab1d7e42cab01bd8f733ce71f7ed0f12c088f5 Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Mon, 5 Jun 2023 09:48:39 +0800 Subject: [PATCH 5/6] fix also for SeriesGroupby --- doc/source/whatsnew/v2.1.0.rst | 2 +- pandas/core/groupby/generic.py | 1 + pandas/tests/groupby/test_groupby.py | 6 ++++-- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 5de726f8fbc36..4eff4d8ac2d10 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -435,7 +435,7 @@ Groupby/resample/rolling grouped :class:`Series` or :class:`DataFrame` was a :class:`DatetimeIndex`, :class:`TimedeltaIndex` or :class:`PeriodIndex`, and the ``groupby`` method was given a function as its first argument, the function operated on the whole index rather than each element of the index. (:issue:`51979`) -- Bug in :meth:`DataFrame.groupby` with column selection on the resulting groupby object not returning tuples when grouping by a list of a single element. (:issue:`53500`) +- Bug in :meth:`DataFrame.groupby` with column selection on the resulting groupby object not returning names as tuples when grouping by a list of a single element. (:issue:`53500`) - Bug in :meth:`DataFrameGroupBy.agg` with lists not respecting ``as_index=False`` (:issue:`52849`) - Bug in :meth:`DataFrameGroupBy.apply` causing an error to be raised when the input :class:`DataFrame` was subset as a :class:`DataFrame` after groupby (``[['a']]`` and not ``['a']``) and the given callable returned :class:`Series` that were not all indexed the same. (:issue:`52444`) - Bug in :meth:`DataFrameGroupBy.apply` raising a ``TypeError`` when selecting multiple columns and providing a function that returns ``np.ndarray`` results (:issue:`18930`) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 631aac641fc0a..1d5fb0fb3873d 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1952,6 +1952,7 @@ def _gotitem(self, key, ndim: int, subset=None): subset = self.obj[key] return SeriesGroupBy( subset, + self.keys, level=self.level, grouper=self.grouper, exclusions=self.exclusions, diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 84600cfc37f22..08b8c1312e430 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2727,10 +2727,12 @@ def test_single_element_list_grouping(): df = DataFrame({"a": [1, 2], "b": [np.nan, 5], "c": [np.nan, 2]}, index=["x", "y"]) grouped = df.groupby(["a"]) - result1 = [key for key, _ in grouped] - result2 = [key for key, _ in grouped[["a", "b", "c"]]] + result0 = [key for key, _ in grouped] + result1 = [key for key, _ in grouped["a"]] + result2 = [key for key, _ in grouped[["a"]]] expected = [(1,), (2,)] + assert result0 == expected assert result1 == expected assert result2 == expected From 6d4d6a30df5baaaf121eda2eaace35d2e5ac4b5a Mon Sep 17 00:00:00 2001 From: Charlie-XIAO Date: Tue, 6 Jun 2023 21:12:10 +0800 Subject: [PATCH 6/6] parametrize the test --- pandas/tests/groupby/test_groupby.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 08b8c1312e430..bf0b646847ed6 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2722,19 +2722,15 @@ def test_groupby_none_column_name(): tm.assert_frame_equal(result, expected) -def test_single_element_list_grouping(): +@pytest.mark.parametrize("selection", [None, "a", ["a"]]) +def test_single_element_list_grouping(selection): # GH#42795, GH#53500 df = DataFrame({"a": [1, 2], "b": [np.nan, 5], "c": [np.nan, 2]}, index=["x", "y"]) - grouped = df.groupby(["a"]) + grouped = df.groupby(["a"]) if selection is None else df.groupby(["a"])[selection] + result = [key for key, _ in grouped] - result0 = [key for key, _ in grouped] - result1 = [key for key, _ in grouped["a"]] - result2 = [key for key, _ in grouped[["a"]]] expected = [(1,), (2,)] - - assert result0 == expected - assert result1 == expected - assert result2 == expected + assert result == expected def test_groupby_string_dtype():