From 523a8620186b41c3fa90a5bbac4a63dc08600ef6 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 23 Dec 2020 00:54:36 +0100 Subject: [PATCH 1/6] BUG: Fix regression for groupby.indices in case of unused categories --- pandas/core/groupby/grouper.py | 9 ++------- pandas/core/groupby/ops.py | 5 +++++ pandas/tests/groupby/test_categorical.py | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index e3196904fa56f..26fb23087ed55 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -556,13 +556,8 @@ def indices(self): if isinstance(self.grouper, ops.BaseGrouper): return self.grouper.indices - # Return a dictionary of {group label: [indices belonging to the group label]} - # respecting whether sort was specified - codes, uniques = algorithms.factorize(self.grouper, sort=self.sort) - return { - category: np.flatnonzero(codes == i) - for i, category in enumerate(Index(uniques)) - } + values = Categorical(self.grouper) + return values._reverse_indexer() @property def codes(self) -> np.ndarray: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d1a4fc6fc74e5..8e33ae0c30df3 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -53,6 +53,7 @@ is_timedelta64_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.generic import ABCCategoricalIndex from pandas.core.dtypes.missing import isna, maybe_fill import pandas.core.algorithms as algorithms @@ -241,6 +242,10 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0): @cache_readonly def indices(self): """ dict {group name -> group indices} """ + if len(self.groupings) == 1 and isinstance( + self.result_index, ABCCategoricalIndex + ): + return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] keys = [ping.group_index for ping in self.groupings] return get_indexer_dict(codes_list, keys) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8cf77ca6335f4..516d54b6a2924 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1678,3 +1678,23 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( df_grp = df.groupby(["a", "b"], observed=observed) result = getattr(df_grp, func)() tm.assert_frame_equal(result, expected) + + +def test_groupby_categorical_indices_unused_categories(): + # GH#38642 + df = DataFrame( + { + "key": pd.Categorical(["b", "b", "a"], categories=["a", "b", "c"]), + "col": range(3), + } + ) + grouped = df.groupby("key", sort=False) + result = grouped.indices + expected = { + "b": np.array([0, 1]), + "a": np.array([2]), + "c": np.array([], dtype="int64"), + } + assert result.keys() == expected.keys() + for key in result.keys(): + tm.assert_numpy_array_equal(result[key], expected[key]) From 80952f85edfd1aea2091d10320b0dbec62f7adb3 Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 23 Dec 2020 00:58:26 +0100 Subject: [PATCH 2/6] Add comment --- pandas/core/groupby/ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 8e33ae0c30df3..d7a3fff5c3898 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -245,6 +245,7 @@ def indices(self): if len(self.groupings) == 1 and isinstance( self.result_index, ABCCategoricalIndex ): + # This shows unused categories in indices GH#38642 return self.groupings[0].indices codes_list = [ping.codes for ping in self.groupings] keys = [ping.group_index for ping in self.groupings] From 530361f893f0ff13d80128c4ebbec5d15177d6ca Mon Sep 17 00:00:00 2001 From: phofl Date: Wed, 23 Dec 2020 01:01:43 +0100 Subject: [PATCH 3/6] Remove pd --- pandas/tests/groupby/test_categorical.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 516d54b6a2924..c1a5a7a86b922 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1684,7 +1684,7 @@ def test_groupby_categorical_indices_unused_categories(): # GH#38642 df = DataFrame( { - "key": pd.Categorical(["b", "b", "a"], categories=["a", "b", "c"]), + "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]), "col": range(3), } ) From c689f63e77c8ceef35456059baa7676de2e4ea1b Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 29 Dec 2020 21:55:43 +0100 Subject: [PATCH 4/6] Change test --- pandas/tests/groupby/test_categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index c1a5a7a86b922..f0bc58cbf07bf 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1691,8 +1691,8 @@ def test_groupby_categorical_indices_unused_categories(): grouped = df.groupby("key", sort=False) result = grouped.indices expected = { - "b": np.array([0, 1]), - "a": np.array([2]), + "b": np.array([0, 1], dtype="int64"), + "a": np.array([2], dtype="int64"), "c": np.array([], dtype="int64"), } assert result.keys() == expected.keys() From fac89853375e0d6bb130ca884c0c66d7e95eef77 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 29 Dec 2020 21:57:03 +0100 Subject: [PATCH 5/6] Add whatsnew --- doc/source/whatsnew/v1.2.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index a756239ee6798..5ad357a44a6ca 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -18,7 +18,7 @@ Fixed regressions - :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`) - Fixed a regression in ``groupby().rolling()`` where :class:`MultiIndex` levels were dropped (:issue:`38523`) - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) -- +- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`36842`) .. --------------------------------------------------------------------------- From 86df6408703686ea035b77fe57d05571ec2ce162 Mon Sep 17 00:00:00 2001 From: phofl Date: Tue, 29 Dec 2020 21:57:28 +0100 Subject: [PATCH 6/6] Change gh reference --- doc/source/whatsnew/v1.2.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 5ad357a44a6ca..804886fb987ad 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -18,7 +18,7 @@ Fixed regressions - :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (issue:`38714`) - Fixed a regression in ``groupby().rolling()`` where :class:`MultiIndex` levels were dropped (:issue:`38523`) - Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`) -- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`36842`) +- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`) .. ---------------------------------------------------------------------------