Skip to content

Commit 8f8d75d

Browse files
pijucha authored and jreback committed
BUG: Fix groupby with "as_index" for categorical multi #13204
Closes #13204. Fixes a bug that returns all NaNs for groupby(as_index=False) with multiple column groupers containing a categorical one (#13204). Also fixes an internal bug in the string representation of `Grouping`. Author: Piotr Jucha <[email protected]>. Closes #13394 from pijucha/groupbycat13204 and squashes the following commits: 374402c [Piotr Jucha] BUG: Fix groupby with as_index for categorical multi groupers #13204
1 parent 449e824 commit 8f8d75d

File tree

3 files changed: +86 -4 lines changed

doc/source/whatsnew/v0.18.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -527,3 +527,4 @@ Bug Fixes
527527

528528

529529
- Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`)
530+
- Bug in ``groupby`` where ``as_index=False`` returned all NaNs when grouping on multiple columns including a categorical one (:issue:`13204`)

pandas/core/groupby.py

+34-4
Original file line numberDiff line numberDiff line change
@@ -2250,7 +2250,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
22502250
self.grouper = to_timedelta(self.grouper)
22512251

22522252
def __repr__(self):
2253-
return 'Grouping(%s)' % self.name
2253+
return 'Grouping({0})'.format(self.name)
22542254

22552255
def __iter__(self):
22562256
return iter(self.indices)
@@ -3741,9 +3741,39 @@ def _reindex_output(self, result):
37413741
return result
37423742

37433743
levels_list = [ping.group_index for ping in groupings]
3744-
index = MultiIndex.from_product(levels_list, names=self.grouper.names)
3745-
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
3746-
return result.reindex(**d).sortlevel(axis=self.axis)
3744+
index, _ = MultiIndex.from_product(
3745+
levels_list, names=self.grouper.names).sortlevel()
3746+
3747+
if self.as_index:
3748+
d = {self.obj._get_axis_name(self.axis): index, 'copy': False}
3749+
return result.reindex(**d)
3750+
3751+
# GH 13204
3752+
# Here, the categorical in-axis groupers, which need to be fully
3753+
# expanded, are columns in `result`. An idea is to do:
3754+
# result = result.set_index(self.grouper.names)
3755+
# .reindex(index).reset_index()
3756+
# but special care has to be taken because of possible not-in-axis
3757+
# groupers.
3758+
# So, we manually select and drop the in-axis grouper columns,
3759+
# reindex `result`, and then reset the in-axis grouper columns.
3760+
3761+
# Select in-axis groupers
3762+
in_axis_grps = [(i, ping.name) for (i, ping)
3763+
in enumerate(groupings) if ping.in_axis]
3764+
g_nums, g_names = zip(*in_axis_grps)
3765+
3766+
result = result.drop(labels=list(g_names), axis=1)
3767+
3768+
# Set a temp index and reindex (possibly expanding)
3769+
result = result.set_index(self.grouper.result_index
3770+
).reindex(index, copy=False)
3771+
3772+
# Reset in-axis grouper columns
3773+
# (using level numbers `g_nums` because level names may not be unique)
3774+
result = result.reset_index(level=g_nums)
3775+
3776+
return result.reset_index(drop=True)
37473777

37483778
def _iterate_column_groupbys(self):
37493779
for i, colname in enumerate(self._selected_obj.columns):

pandas/tests/test_groupby.py

+51
Original file line numberDiff line numberDiff line change
@@ -6304,6 +6304,47 @@ def test_groupby_categorical_two_columns(self):
63046304
nan, nan, nan, nan, 200, 34]}, index=idx)
63056305
tm.assert_frame_equal(res, exp)
63066306

6307+
def test_groupby_multi_categorical_as_index(self):
6308+
# GH13204
6309+
df = DataFrame({'cat': Categorical([1, 2, 2], [1, 2, 3]),
6310+
'A': [10, 11, 11],
6311+
'B': [101, 102, 103]})
6312+
result = df.groupby(['cat', 'A'], as_index=False).sum()
6313+
expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
6314+
'A': [10, 11, 10, 11, 10, 11],
6315+
'B': [101.0, nan, nan, 205.0, nan, nan]},
6316+
columns=['cat', 'A', 'B'])
6317+
tm.assert_frame_equal(result, expected)
6318+
6319+
# function grouper
6320+
f = lambda r: df.loc[r, 'A']
6321+
result = df.groupby(['cat', f], as_index=False).sum()
6322+
expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
6323+
'A': [10.0, nan, nan, 22.0, nan, nan],
6324+
'B': [101.0, nan, nan, 205.0, nan, nan]},
6325+
columns=['cat', 'A', 'B'])
6326+
tm.assert_frame_equal(result, expected)
6327+
6328+
# another not in-axis grouper (conflicting names in index)
6329+
s = Series(['a', 'b', 'b'], name='cat')
6330+
result = df.groupby(['cat', s], as_index=False).sum()
6331+
expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
6332+
'A': [10.0, nan, nan, 22.0, nan, nan],
6333+
'B': [101.0, nan, nan, 205.0, nan, nan]},
6334+
columns=['cat', 'A', 'B'])
6335+
tm.assert_frame_equal(result, expected)
6336+
6337+
# is original index dropped?
6338+
expected = DataFrame({'cat': [1, 1, 2, 2, 3, 3],
6339+
'A': [10, 11, 10, 11, 10, 11],
6340+
'B': [101.0, nan, nan, 205.0, nan, nan]},
6341+
columns=['cat', 'A', 'B'])
6342+
6343+
for name in [None, 'X', 'B', 'cat']:
6344+
df.index = Index(list("abc"), name=name)
6345+
result = df.groupby(['cat', 'A'], as_index=False).sum()
6346+
tm.assert_frame_equal(result, expected, check_index_type=True)
6347+
63076348
def test_groupby_apply_all_none(self):
63086349
# Tests to make sure no errors if apply function returns all None
63096350
# values. Issue 9684.
@@ -6431,6 +6472,16 @@ def test_numpy_compat(self):
64316472
tm.assertRaisesRegexp(UnsupportedFunctionCall, msg,
64326473
getattr(g, func), foo=1)
64336474

6475+
def test_grouping_string_repr(self):
6476+
# GH 13394
6477+
mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
6478+
df = DataFrame([[1, 2, 3]], columns=mi)
6479+
gr = df.groupby(df[('A', 'a')])
6480+
6481+
result = gr.grouper.groupings[0].__repr__()
6482+
expected = "Grouping(('A', 'a'))"
6483+
tm.assert_equal(result, expected)
6484+
64346485

64356486
def assert_fp_equal(a, b):
64366487
assert (np.abs(a - b) < 1e-12).all()

0 commit comments

Comments
 (0)