Skip to content

Commit e70252b

Browse files
jonmmease authored and jorisvandenbossche committed
Bug: Grouping by index and column fails on DataFrame with single index (GH14327) (#14428)
Existing logic under "if level is not None:" assumed that index was a MultiIndex. Now we check and also handle the case where an Index is passed in with a None grouper. This resolves GH 14327
1 parent 286b9b9 commit e70252b

File tree

5 files changed

+97
-26
lines changed

5 files changed

+97
-26
lines changed

doc/source/whatsnew/v0.19.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -46,3 +46,4 @@ Bug Fixes
4646
- Bug in ``pd.concat`` where names of the ``keys`` were not propagated to the resulting ``MultiIndex`` (:issue:`14252`)
4747
- Bug in ``MultiIndex.set_levels`` where illegal level values were still set after raising an error (:issue:`13754`)
4848
- Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`)
49+
- Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single-index frame by a column and the index level (:issue:`14327`)

pandas/core/groupby.py

+2-26
Original file line numberDiff line numberDiff line change
@@ -2201,36 +2201,12 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
22012201
raise AssertionError('Level %s not in index' % str(level))
22022202
level = index.names.index(level)
22032203

2204-
inds = index.labels[level]
2205-
level_index = index.levels[level]
2206-
22072204
if self.name is None:
22082205
self.name = index.names[level]
22092206

2210-
# XXX complete hack
2211-
2212-
if grouper is not None:
2213-
level_values = index.levels[level].take(inds)
2214-
self.grouper = level_values.map(self.grouper)
2215-
else:
2216-
# all levels may not be observed
2217-
labels, uniques = algos.factorize(inds, sort=True)
2218-
2219-
if len(uniques) > 0 and uniques[0] == -1:
2220-
# handle NAs
2221-
mask = inds != -1
2222-
ok_labels, uniques = algos.factorize(inds[mask], sort=True)
2223-
2224-
labels = np.empty(len(inds), dtype=inds.dtype)
2225-
labels[mask] = ok_labels
2226-
labels[~mask] = -1
2227-
2228-
if len(uniques) < len(level_index):
2229-
level_index = level_index.take(uniques)
2207+
self.grouper, self._labels, self._group_index = \
2208+
index._get_grouper_for_level(self.grouper, level)
22302209

2231-
self._labels = labels
2232-
self._group_index = level_index
2233-
self.grouper = level_index.take(labels)
22342210
else:
22352211
if isinstance(self.grouper, (list, tuple)):
22362212
self.grouper = com._asarray_tuplesafe(self.grouper)

pandas/indexes/base.py

+30
Original file line numberDiff line numberDiff line change
@@ -432,6 +432,36 @@ def _update_inplace(self, result, **kwargs):
432432
# guard when called from IndexOpsMixin
433433
raise TypeError("Index can't be updated inplace")
434434

435+
_index_shared_docs['_get_grouper_for_level'] = """
436+
Get index grouper corresponding to an index level
437+
438+
Parameters
439+
----------
440+
mapper : Group mapping function or None
441+
Function mapping index values to groups
442+
level : int or None
443+
Index level
444+
445+
Returns
446+
-------
447+
grouper : Index
448+
Index of values to group on
449+
labels : ndarray of int or None
450+
Array of locations in level_index
451+
uniques : Index or None
452+
Index of unique values for level
453+
"""
454+
455+
@Appender(_index_shared_docs['_get_grouper_for_level'])
456+
def _get_grouper_for_level(self, mapper, level=None):
457+
assert level is None or level == 0
458+
if mapper is None:
459+
grouper = self
460+
else:
461+
grouper = self.map(mapper)
462+
463+
return grouper, None, None
464+
435465
def is_(self, other):
436466
"""
437467
More flexible, faster check like ``is`` but that works through views

pandas/indexes/multi.py

+31
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,37 @@ def _format_native_types(self, na_rep='nan', **kwargs):
539539

540540
return mi.values
541541

542+
@Appender(_index_shared_docs['_get_grouper_for_level'])
543+
def _get_grouper_for_level(self, mapper, level):
544+
indexer = self.labels[level]
545+
level_index = self.levels[level]
546+
547+
if mapper is not None:
548+
# Handle group mapping function and return
549+
level_values = self.levels[level].take(indexer)
550+
grouper = level_values.map(mapper)
551+
return grouper, None, None
552+
553+
labels, uniques = algos.factorize(indexer, sort=True)
554+
555+
if len(uniques) > 0 and uniques[0] == -1:
556+
# Handle NAs
557+
mask = indexer != -1
558+
ok_labels, uniques = algos.factorize(indexer[mask],
559+
sort=True)
560+
561+
labels = np.empty(len(indexer), dtype=indexer.dtype)
562+
labels[mask] = ok_labels
563+
labels[~mask] = -1
564+
565+
if len(uniques) < len(level_index):
566+
# Remove unobserved levels from level_index
567+
level_index = level_index.take(uniques)
568+
569+
grouper = level_index.take(labels)
570+
571+
return grouper, labels, level_index
572+
542573
@property
543574
def _constructor(self):
544575
return MultiIndex.from_tuples

pandas/tests/test_groupby.py

+33
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,39 @@ def test_grouper_creation_bug(self):
458458
expected = s.groupby(level='one').sum()
459459
assert_series_equal(result, expected)
460460

461+
def test_grouper_column_and_index(self):
462+
# GH 14327
463+
464+
# Grouping a multi-index frame by a column and an index level should
465+
# be equivalent to resetting the index and grouping by two columns
466+
idx = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('a', 3),
467+
('b', 1), ('b', 2), ('b', 3)])
468+
idx.names = ['outer', 'inner']
469+
df_multi = pd.DataFrame({"A": np.arange(6),
470+
'B': ['one', 'one', 'two',
471+
'two', 'one', 'one']},
472+
index=idx)
473+
result = df_multi.groupby(['B', pd.Grouper(level='inner')]).mean()
474+
expected = df_multi.reset_index().groupby(['B', 'inner']).mean()
475+
assert_frame_equal(result, expected)
476+
477+
# Test the reverse grouping order
478+
result = df_multi.groupby([pd.Grouper(level='inner'), 'B']).mean()
479+
expected = df_multi.reset_index().groupby(['inner', 'B']).mean()
480+
assert_frame_equal(result, expected)
481+
482+
# Grouping a single-index frame by a column and the index should
483+
# be equivalent to resetting the index and grouping by two columns
484+
df_single = df_multi.reset_index('outer')
485+
result = df_single.groupby(['B', pd.Grouper(level='inner')]).mean()
486+
expected = df_single.reset_index().groupby(['B', 'inner']).mean()
487+
assert_frame_equal(result, expected)
488+
489+
# Test the reverse grouping order
490+
result = df_single.groupby([pd.Grouper(level='inner'), 'B']).mean()
491+
expected = df_single.reset_index().groupby(['inner', 'B']).mean()
492+
assert_frame_equal(result, expected)
493+
461494
def test_grouper_getting_correct_binner(self):
462495

463496
# GH 10063

0 commit comments

Comments
 (0)