Skip to content

Commit b53ef23

Browse files
author
Junya Hayashi
committed
BUG: Do not reindex groupby output to the full product of levels for non-Categorical groupings (GH9049, GH9344)
1 parent 107cb10 commit b53ef23

File tree

4 files changed

+47
-7
lines changed

4 files changed

+47
-7
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -267,3 +267,4 @@ Bug Fixes
267267
- ``SparseSeries`` and ``SparsePanel`` now accept zero argument constructors (same as their non-sparse counterparts) (:issue:`9272`).
268268

269269
- Bug in ``read_csv`` with buffer overflows with certain malformed input files (:issue:`9205`)
270+
- Bug in ``groupby`` on multiple ``MultiIndex`` levels where unobserved combinations of level values were added to the result (:issue:`9049`, :issue:`9344`)

pandas/core/groupby.py

+9-7
Original file line numberDiff line numberDiff line change
@@ -1862,7 +1862,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
18621862
self.grouper = grouper.values
18631863

18641864
# pre-computed
1865-
self._was_factor = False
1865+
self._grouping_type = None
18661866
self._should_compress = True
18671867

18681868
# we have a single grouper which may be a myriad of things, some of which are
@@ -1887,7 +1887,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
18871887
level_values = index.levels[level].take(inds)
18881888
self.grouper = level_values.map(self.grouper)
18891889
else:
1890-
self._was_factor = True
1890+
self._grouping_type = "level"
18911891

18921892
# all levels may not be observed
18931893
labels, uniques = algos.factorize(inds, sort=True)
@@ -1915,7 +1915,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
19151915
elif isinstance(self.grouper, Categorical):
19161916

19171917
factor = self.grouper
1918-
self._was_factor = True
1918+
self._grouping_type = "categorical"
19191919

19201920
# Is there any way to avoid this?
19211921
self.grouper = np.asarray(factor)
@@ -1988,8 +1988,9 @@ def group_index(self):
19881988
return self._group_index
19891989

19901990
def _make_labels(self):
1991-
if self._was_factor: # pragma: no cover
1992-
raise Exception('Should not call this method grouping by level')
1991+
if self._grouping_type in ("level", "categorical"): # pragma: no cover
1992+
raise Exception(
1993+
'Should not call this method grouping by level or categorical')
19931994
else:
19941995
labels, uniques = algos.factorize(self.grouper, sort=self.sort)
19951996
uniques = Index(uniques, name=self.name)
@@ -3238,10 +3239,11 @@ def _reindex_output(self, result):
32383239
return result
32393240
elif len(groupings) == 1:
32403241
return result
3241-
elif not any([ping._was_factor for ping in groupings]):
3242+
elif not any([ping._grouping_type == "categorical"
3243+
for ping in groupings]):
32423244
return result
32433245

3244-
levels_list = [ ping._group_index for ping in groupings ]
3246+
levels_list = [ ping.group_index for ping in groupings ]
32453247
index = MultiIndex.from_product(levels_list, names=self.grouper.names)
32463248
d = { self.obj._get_axis_name(self.axis) : index, 'copy' : False }
32473249
return result.reindex(**d).sortlevel(axis=self.axis)

pandas/tests/test_groupby.py

+25
Original file line numberDiff line numberDiff line change
@@ -3484,6 +3484,31 @@ def test_groupby_categorical_unequal_len(self):
34843484
# len(bins) != len(series) here
34853485
self.assertRaises(ValueError,lambda : series.groupby(bins).mean())
34863486

3487+
def test_groupby_multiindex_missing_pair(self):
3488+
# GH9049
3489+
df = DataFrame({'group1': ['a','a','a','b'],
3490+
'group2': ['c','c','d','c'],
3491+
'value': [1,1,1,5]})
3492+
df = df.set_index(['group1', 'group2'])
3493+
df_grouped = df.groupby(level=['group1','group2'], sort=True)
3494+
3495+
res = df_grouped.agg('sum')
3496+
idx = MultiIndex.from_tuples([('a','c'), ('a','d'), ('b','c')], names=['group1', 'group2'])
3497+
exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])
3498+
3499+
tm.assert_frame_equal(res, exp)
3500+
3501+
def test_groupby_levels_and_columns(self):
3502+
# GH9344, GH9049
3503+
idx_names = ['x', 'y']
3504+
idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
3505+
df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)
3506+
3507+
by_levels = df.groupby(level=idx_names).mean()
3508+
by_columns = df.reset_index().groupby(idx_names).mean()
3509+
3510+
tm.assert_frame_equal(by_levels, by_columns)
3511+
34873512
def test_gb_apply_list_of_unequal_len_arrays(self):
34883513

34893514
# GH1738

vb_suite/groupby.py

+12
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,18 @@ def f(g):
390390

391391
groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)
392392

393+
394+
#----------------------------------------------------------------------
395+
# multi-indexed group sum #9049
396+
397+
setup = common_setup + """
398+
N = 50
399+
df = DataFrame({'A': range(N) * 2, 'B': range(N*2), 'C': 1}).set_index(["A", "B"])
400+
"""
401+
402+
groupby_sum_multiindex = Benchmark("df.groupby(level=[0, 1]).sum()", setup)
403+
404+
393405
#----------------------------------------------------------------------
394406
# Transform testing
395407

0 commit comments

Comments
 (0)