From eab37ca366ca371560bc782ebb7d01e54649afe4 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Thu, 6 Jun 2024 10:41:27 +0200 Subject: [PATCH 1/4] correct BaseGroupBy.__iter__ and def groups [skip ci] --- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/ops.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1b58317c08736..5f30ee7cce3a4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1012,7 +1012,7 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: # mypy: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized" if ( (is_list_like(level) and len(level) == 1) # type: ignore[arg-type] - or (isinstance(keys, list) and len(keys) == 1) + or (isinstance(keys, list)) ): # GH#42795 - when keys is a list, return tuples even when length is 1 result = (((key,), group) for key, group in result) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4f40c4f4283f0..f5033e459c157 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -703,7 +703,7 @@ def size(self) -> Series: @cache_readonly def groups(self) -> dict[Hashable, Index]: """dict {group name -> group labels}""" - if len(self.groupings) == 1: + if len(self.groupings) == 1 and not isinstance(self.shape, tuple): return self.groupings[0].groups result_index, ids = self.result_index_and_ids values = result_index._values From 06f5ff3c72cf7062a3eaa43810a1830ad548d196 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Thu, 27 Jun 2024 10:23:06 +0200 Subject: [PATCH 2/4] add to BaseGrouper constructor key_dtype_str parameter, correct def result_index_and_ids and def groups --- pandas/core/groupby/groupby.py | 2 +- pandas/core/groupby/grouper.py | 7 +++++-- pandas/core/groupby/ops.py | 10 ++++++++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 5f30ee7cce3a4..1b58317c08736 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1012,7 +1012,7 @@ def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]: # mypy: Argument 1 to "len" has incompatible type "Hashable"; expected "Sized" if ( (is_list_like(level) and len(level) == 1) # type: ignore[arg-type] - or (isinstance(keys, list)) + or (isinstance(keys, list) and len(keys) == 1) ): # GH#42795 - when keys is a list, return tuples even when length is 1 result = (((key,), group) for key, group in result) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e75a5b9089f5f..64145d89353c4 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -780,7 +780,9 @@ def get_grouper( elif isinstance(key, ops.BaseGrouper): return key, frozenset(), obj + key_dtype_str = False if not isinstance(key, list): + key_dtype_str = True keys = [key] match_axis_length = False else: @@ -895,7 +897,6 @@ def is_in_obj(gpr) -> bool: if not isinstance(gpr, Grouping) else gpr ) - groupings.append(ping) if len(groupings) == 0 and len(obj): @@ -904,7 +905,9 @@ def is_in_obj(gpr) -> bool: groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) + grouper = ops.BaseGrouper( + group_axis, groupings, sort=sort, dropna=dropna, key_dtype_str=key_dtype_str + ) return grouper, frozenset(exclusions), obj diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f5033e459c157..1dfe54f76590d 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -584,6 +584,7 @@ def __init__( groupings: Sequence[grouper.Grouping], sort: bool = True, dropna: bool = True, + key_dtype_str: bool = True, ) -> None: assert isinstance(axis, Index), axis @@ -591,6 +592,7 @@ def __init__( self._groupings: list[grouper.Grouping] = list(groupings) self._sort = sort 
self.dropna = dropna + self.key_dtype_str = key_dtype_str @property def groupings(self) -> list[grouper.Grouping]: @@ -640,6 +642,7 @@ def _get_splitter(self, data: NDFrame) -> DataSplitter: @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" + if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): # This shows unused categories in indices GH#38642 return self.groupings[0].indices @@ -703,7 +706,10 @@ def size(self) -> Series: @cache_readonly def groups(self) -> dict[Hashable, Index]: """dict {group name -> group labels}""" - if len(self.groupings) == 1 and not isinstance(self.shape, tuple): + if len(self.groupings) == 1 and not self.key_dtype_str: + result = self.groupings[0].groups + + if self.key_dtype_str and len(self.groupings) == 1: return self.groupings[0].groups result_index, ids = self.result_index_and_ids values = result_index._values @@ -764,7 +770,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: if ping._passed_categorical: levels[k] = level.set_categories(ping._orig_cats) - if len(self.groupings) == 1: + if self.key_dtype_str and len(self.groupings) == 1: result_index = levels[0] result_index.name = self.names[0] ids = ensure_platform_int(self.codes[0]) From 2b69e46f4ae13d49616c451461b4c25ef09985bc Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 2 Jul 2024 15:25:10 +0200 Subject: [PATCH 3/4] move key_dtype_str from BaseGrouper to Grouping --- pandas/core/groupby/grouper.py | 9 ++++++--- pandas/core/groupby/ops.py | 9 ++------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 64145d89353c4..fd735c81463de 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -440,6 +440,7 @@ def __init__( in_axis: bool = False, dropna: bool = True, uniques: ArrayLike | None = None, + key_dtype_str: bool = False, ) -> None: self.level = 
level self._orig_grouper = grouper @@ -452,6 +453,7 @@ def __init__( self.in_axis = in_axis self._dropna = dropna self._uniques = uniques + self.key_dtype_str = key_dtype_str # we have a single grouper which may be a myriad of things, # some of which are dependent on the passing in level @@ -666,6 +668,8 @@ def groups(self) -> dict[Hashable, Index]: codes, uniques = self._codes_and_uniques uniques = Index._with_infer(uniques, name=self.name) cats = Categorical.from_codes(codes, uniques, validate=False) + if not self.key_dtype_str: + cats = [(i,) for i in cats] return self._index.groupby(cats) @property @@ -893,6 +897,7 @@ def is_in_obj(gpr) -> bool: observed=observed, in_axis=in_axis, dropna=dropna, + key_dtype_str=key_dtype_str, ) if not isinstance(gpr, Grouping) else gpr @@ -905,9 +910,7 @@ def is_in_obj(gpr) -> bool: groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp))) # create the internals grouper - grouper = ops.BaseGrouper( - group_axis, groupings, sort=sort, dropna=dropna, key_dtype_str=key_dtype_str - ) + grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna) return grouper, frozenset(exclusions), obj diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 1dfe54f76590d..708e549066674 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -584,7 +584,6 @@ def __init__( groupings: Sequence[grouper.Grouping], sort: bool = True, dropna: bool = True, - key_dtype_str: bool = True, ) -> None: assert isinstance(axis, Index), axis @@ -592,7 +591,6 @@ def __init__( self._groupings: list[grouper.Grouping] = list(groupings) self._sort = sort self.dropna = dropna - self.key_dtype_str = key_dtype_str @property def groupings(self) -> list[grouper.Grouping]: @@ -706,10 +704,7 @@ def size(self) -> Series: @cache_readonly def groups(self) -> dict[Hashable, Index]: """dict {group name -> group labels}""" - if len(self.groupings) == 1 and not self.key_dtype_str: - result = 
self.groupings[0].groups - - if self.key_dtype_str and len(self.groupings) == 1: + if len(self.groupings) == 1: return self.groupings[0].groups result_index, ids = self.result_index_and_ids values = result_index._values @@ -770,7 +765,7 @@ def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]: if ping._passed_categorical: levels[k] = level.set_categories(ping._orig_cats) - if self.key_dtype_str and len(self.groupings) == 1: + if len(self.groupings) == 1: result_index = levels[0] result_index.name = self.names[0] ids = ensure_platform_int(self.codes[0]) From a29cd9afa315a79712723d06338ec4a49da36f64 Mon Sep 17 00:00:00 2001 From: Natalia Mokeeva Date: Tue, 2 Jul 2024 16:39:23 +0200 Subject: [PATCH 4/4] add a test --- pandas/core/groupby/ops.py | 1 - pandas/tests/groupby/test_groupby.py | 7 +++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 708e549066674..4f40c4f4283f0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -640,7 +640,6 @@ def _get_splitter(self, data: NDFrame) -> DataSplitter: @cache_readonly def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]: """dict {group name -> group indices}""" - if len(self.groupings) == 1 and isinstance(self.result_index, CategoricalIndex): # This shows unused categories in indices GH#38642 return self.groupings[0].indices diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 4c1dc8953580a..f060611e248e5 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2985,3 +2985,10 @@ def test_groupby_agg_namedagg_with_duplicate_columns(): ) tm.assert_frame_equal(result, expected) + + +def test_groupby_keys_1length_list(): + # GH#58945 + df = DataFrame({"x": [10, 20, 30], "y": ["a", "b", "c"]}) + result = df.groupby(["x"]).groups + assert all(isinstance(key, tuple) for key in result.keys())