Skip to content

Commit 0efd4b3

Browse files
committed
Merge pull request #9177 from ledmonster/fix/9049
BUG: Do not reindex on non-Categorical groups (GH9049)
2 parents 09a8437 + 3ce07ea commit 0efd4b3

File tree

4 files changed

+77
-26
lines changed

4 files changed

+77
-26
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -267,3 +267,4 @@ Bug Fixes
267267
- ``SparseSeries`` and ``SparsePanel`` now accept zero argument constructors (same as their non-sparse counterparts) (:issue:`9272`).
268268

269269
- Bug in ``read_csv`` with buffer overflows with certain malformed input files (:issue:`9205`)
270+
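- Bug in ``groupby`` on multiple ``MultiIndex`` levels where the result was reindexed to include level combinations (pairs) missing from the data (:issue:`9049`, :issue:`9344`)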

pandas/core/groupby.py

+11-26
Original file line numberDiff line numberDiff line change
@@ -1862,7 +1862,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
18621862
self.grouper = grouper.values
18631863

18641864
# pre-computed
1865-
self._was_factor = False
18661865
self._should_compress = True
18671866

18681867
# we have a single grouper which may be a myriad of things, some of which are
@@ -1887,8 +1886,6 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
18871886
level_values = index.levels[level].take(inds)
18881887
self.grouper = level_values.map(self.grouper)
18891888
else:
1890-
self._was_factor = True
1891-
18921889
# all levels may not be observed
18931890
labels, uniques = algos.factorize(inds, sort=True)
18941891

@@ -1913,17 +1910,10 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
19131910

19141911
# a passed Categorical
19151912
elif isinstance(self.grouper, Categorical):
1916-
1917-
factor = self.grouper
1918-
self._was_factor = True
1919-
1920-
# Is there any way to avoid this?
1921-
self.grouper = np.asarray(factor)
1922-
1923-
self._labels = factor.codes
1924-
self._group_index = factor.categories
1913+
self._labels = self.grouper.codes
1914+
self._group_index = self.grouper.categories
19251915
if self.name is None:
1926-
self.name = factor.name
1916+
self.name = self.grouper.name
19271917

19281918
# a passed Grouper like
19291919
elif isinstance(self.grouper, Grouper):
@@ -1936,8 +1926,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
19361926
self.name = grouper.name
19371927

19381928
# no level passed
1939-
if not isinstance(self.grouper, (Series, Index, np.ndarray)):
1940-
if getattr(self.grouper,'ndim', 1) != 1:
1929+
if not isinstance(self.grouper, (Series, Index, Categorical, np.ndarray)):
1930+
if getattr(self.grouper, 'ndim', 1) != 1:
19411931
t = self.name or str(type(self.grouper))
19421932
raise ValueError("Grouper for '%s' not 1-dimensional" % t)
19431933
self.grouper = self.index.map(self.grouper)
@@ -1988,21 +1978,15 @@ def group_index(self):
19881978
return self._group_index
19891979

19901980
def _make_labels(self):
1991-
if self._was_factor: # pragma: no cover
1992-
raise Exception('Should not call this method grouping by level')
1993-
else:
1981+
if self._labels is None or self._group_index is None:
19941982
labels, uniques = algos.factorize(self.grouper, sort=self.sort)
19951983
uniques = Index(uniques, name=self.name)
19961984
self._labels = labels
19971985
self._group_index = uniques
19981986

1999-
_groups = None
2000-
2001-
@property
1987+
@cache_readonly
20021988
def groups(self):
2003-
if self._groups is None:
2004-
self._groups = self.index.groupby(self.grouper)
2005-
return self._groups
1989+
return self.index.groupby(self.grouper)
20061990

20071991
def _get_grouper(obj, key=None, axis=0, level=None, sort=True):
20081992
"""
@@ -3238,10 +3222,11 @@ def _reindex_output(self, result):
32383222
return result
32393223
elif len(groupings) == 1:
32403224
return result
3241-
elif not any([ping._was_factor for ping in groupings]):
3225+
elif not any([isinstance(ping.grouper, Categorical)
3226+
for ping in groupings]):
32423227
return result
32433228

3244-
levels_list = [ ping._group_index for ping in groupings ]
3229+
levels_list = [ ping.group_index for ping in groupings ]
32453230
index = MultiIndex.from_product(levels_list, names=self.grouper.names)
32463231
d = { self.obj._get_axis_name(self.axis) : index, 'copy' : False }
32473232
return result.reindex(**d).sortlevel(axis=self.axis)

pandas/tests/test_groupby.py

+53
Original file line numberDiff line numberDiff line change
@@ -3297,6 +3297,34 @@ def test_groupby_categorical(self):
32973297
expected.index.names = ['myfactor', None]
32983298
assert_frame_equal(desc_result, expected)
32993299

3300+
def test_groupby_datetime_categorical(self):
3301+
# GH9049: ensure backward compatibility
3302+
levels = pd.date_range('2014-01-01', periods=4)
3303+
codes = np.random.randint(0, 4, size=100)
3304+
3305+
cats = Categorical.from_codes(codes, levels, name='myfactor')
3306+
3307+
data = DataFrame(np.random.randn(100, 4))
3308+
3309+
result = data.groupby(cats).mean()
3310+
3311+
expected = data.groupby(np.asarray(cats)).mean()
3312+
expected = expected.reindex(levels)
3313+
expected.index.name = 'myfactor'
3314+
3315+
assert_frame_equal(result, expected)
3316+
self.assertEqual(result.index.name, cats.name)
3317+
3318+
grouped = data.groupby(cats)
3319+
desc_result = grouped.describe()
3320+
3321+
idx = cats.codes.argsort()
3322+
ord_labels = np.asarray(cats).take(idx)
3323+
ord_data = data.take(idx)
3324+
expected = ord_data.groupby(ord_labels, sort=False).describe()
3325+
expected.index.names = ['myfactor', None]
3326+
assert_frame_equal(desc_result, expected)
3327+
33003328
def test_groupby_groups_datetimeindex(self):
33013329
# #1430
33023330
from pandas.tseries.api import DatetimeIndex
@@ -3484,6 +3512,31 @@ def test_groupby_categorical_unequal_len(self):
34843512
# len(bins) != len(series) here
34853513
self.assertRaises(ValueError,lambda : series.groupby(bins).mean())
34863514

3515+
def test_groupby_multiindex_missing_pair(self):
3516+
# GH9049
3517+
df = DataFrame({'group1': ['a','a','a','b'],
3518+
'group2': ['c','c','d','c'],
3519+
'value': [1,1,1,5]})
3520+
df = df.set_index(['group1', 'group2'])
3521+
df_grouped = df.groupby(level=['group1','group2'], sort=True)
3522+
3523+
res = df_grouped.agg('sum')
3524+
idx = MultiIndex.from_tuples([('a','c'), ('a','d'), ('b','c')], names=['group1', 'group2'])
3525+
exp = DataFrame([[2], [1], [5]], index=idx, columns=['value'])
3526+
3527+
tm.assert_frame_equal(res, exp)
3528+
3529+
def test_groupby_levels_and_columns(self):
3530+
# GH9344, GH9049
3531+
idx_names = ['x', 'y']
3532+
idx = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
3533+
df = pd.DataFrame(np.arange(12).reshape(-1, 3), index=idx)
3534+
3535+
by_levels = df.groupby(level=idx_names).mean()
3536+
by_columns = df.reset_index().groupby(idx_names).mean()
3537+
3538+
tm.assert_frame_equal(by_levels, by_columns)
3539+
34873540
def test_gb_apply_list_of_unequal_len_arrays(self):
34883541

34893542
# GH1738

vb_suite/groupby.py

+12
Original file line numberDiff line numberDiff line change
@@ -390,6 +390,18 @@ def f(g):
390390

391391
groupby_sum_booleans = Benchmark("df.groupby('ii').sum()", setup)
392392

393+
394+
#----------------------------------------------------------------------
395+
# multi-indexed group sum #9049
396+
397+
setup = common_setup + """
398+
N = 50
399+
df = DataFrame({'A': range(N) * 2, 'B': range(N*2), 'C': 1}).set_index(["A", "B"])
400+
"""
401+
402+
groupby_sum_multiindex = Benchmark("df.groupby(level=[0, 1]).sum()", setup)
403+
404+
393405
#----------------------------------------------------------------------
394406
# Transform testing
395407

0 commit comments

Comments
 (0)