Skip to content

Commit 7550eed

Browse files
Backport PR pandas-dev#38649: BUG: Fix regression for groupby.indices in case of unused categories (pandas-dev#38790)
Co-authored-by: patrick <[email protected]>
1 parent 5bdee11 commit 7550eed

File tree

4 files changed

+29
-8
lines changed

4 files changed

+29
-8
lines changed

doc/source/whatsnew/v1.2.1.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ Fixed regressions
1717
- The deprecated attributes ``_AXIS_NAMES`` and ``_AXIS_NUMBERS`` of :class:`DataFrame` and :class:`Series` will no longer show up in ``dir`` or ``inspect.getmembers`` calls (:issue:`38740`)
1818
- :meth:`to_csv` created corrupted zip files when there were more rows than ``chunksize`` (:issue:`38714`)
1919
- Bug in repr of float-like strings of an ``object`` dtype having trailing 0's truncated after the decimal (:issue:`38708`)
20-
-
20+
- Fixed regression in :meth:`DataFrame.groupby()` with :class:`Categorical` grouping column not showing unused categories for ``grouped.indices`` (:issue:`38642`)
2121

2222
.. ---------------------------------------------------------------------------
2323

pandas/core/groupby/grouper.py

+2-7
Original file line numberDiff line numberDiff line change
@@ -582,13 +582,8 @@ def indices(self):
582582
if isinstance(self.grouper, ops.BaseGrouper):
583583
return self.grouper.indices
584584

585-
# Return a dictionary of {group label: [indices belonging to the group label]}
586-
# respecting whether sort was specified
587-
codes, uniques = algorithms.factorize(self.grouper, sort=self.sort)
588-
return {
589-
category: np.flatnonzero(codes == i)
590-
for i, category in enumerate(Index(uniques))
591-
}
585+
values = Categorical(self.grouper)
586+
return values._reverse_indexer()
592587

593588
@property
594589
def codes(self) -> np.ndarray:

pandas/core/groupby/ops.py

+6
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@
5353
is_timedelta64_dtype,
5454
needs_i8_conversion,
5555
)
56+
from pandas.core.dtypes.generic import ABCCategoricalIndex
5657
from pandas.core.dtypes.missing import isna, maybe_fill
5758

5859
import pandas.core.algorithms as algorithms
@@ -244,6 +245,11 @@ def apply(self, f: F, data: FrameOrSeries, axis: int = 0):
244245
@cache_readonly
245246
def indices(self):
246247
""" dict {group name -> group indices} """
248+
if len(self.groupings) == 1 and isinstance(
249+
self.result_index, ABCCategoricalIndex
250+
):
251+
# This shows unused categories in indices GH#38642
252+
return self.groupings[0].indices
247253
codes_list = [ping.codes for ping in self.groupings]
248254
keys = [ping.group_index for ping in self.groupings]
249255
return get_indexer_dict(codes_list, keys)

pandas/tests/groupby/test_categorical.py

+20
Original file line numberDiff line numberDiff line change
@@ -1678,3 +1678,23 @@ def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
16781678
df_grp = df.groupby(["a", "b"], observed=observed)
16791679
result = getattr(df_grp, func)()
16801680
tm.assert_frame_equal(result, expected)
1681+
1682+
1683+
def test_groupby_categorical_indices_unused_categories():
1684+
# GH#38642
1685+
df = DataFrame(
1686+
{
1687+
"key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]),
1688+
"col": range(3),
1689+
}
1690+
)
1691+
grouped = df.groupby("key", sort=False)
1692+
result = grouped.indices
1693+
expected = {
1694+
"b": np.array([0, 1], dtype="int64"),
1695+
"a": np.array([2], dtype="int64"),
1696+
"c": np.array([], dtype="int64"),
1697+
}
1698+
assert result.keys() == expected.keys()
1699+
for key in result.keys():
1700+
tm.assert_numpy_array_equal(result[key], expected[key])

0 commit comments

Comments
 (0)