Skip to content

Commit c4d438f

Browse files
toobaz authored and
No-Stream committed
Groupby tuples
xref pandas-dev#17996 Author: Pietro Battiston <[email protected]> Closes pandas-dev#18249 from toobaz/groupby_tuples and squashes the following commits: dafc838 [Pietro Battiston] DOC: Clarification of groupby(by=) argument e0bdfa7 [Pietro Battiston] TST: Test for tuples in columns, fixes to previous tests 74f91e0 [Pietro Battiston] TST: Fix tests which used tuples to pass multiple keys 201a4fe [Pietro Battiston] BUG: Never interpret a tuple as a list of keys
1 parent b678d80 commit c4d438f

File tree

7 files changed

+19
-15
lines changed

7 files changed

+19
-15
lines changed

doc/source/whatsnew/v0.22.0.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ Indexing
101101
^^^^^^^^
102102

103103
- Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`)
104-
- Bug in :func:`DataFrame.groupby` where key as tuple in a ``MultiIndex`` were interpreted as a list of keys (:issue:`17979`)
104+
- Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`)
105105
-
106106
-
107107

pandas/core/generic.py

+4-3
Original file line numberDiff line numberDiff line change
@@ -5092,14 +5092,15 @@ def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
50925092
50935093
Parameters
50945094
----------
5095-
by : mapping, function, str, or iterable
5095+
by : mapping, function, label, or list of labels
50965096
Used to determine the groups for the groupby.
50975097
If ``by`` is a function, it's called on each value of the object's
50985098
index. If a dict or Series is passed, the Series or dict VALUES
50995099
will be used to determine the groups (the Series' values are first
51005100
aligned; see ``.align()`` method). If an ndarray is passed, the
5101-
values are used as-is determine the groups. A str or list of strs
5102-
may be passed to group by the columns in ``self``
5101+
values are used as-is to determine the groups. A label or list of
5102+
labels may be passed to group by the columns in ``self``. Notice
5103+
that a tuple is interpreted as a (single) key.
51035104
axis : int, default 0
51045105
level : int, level name, or sequence of such, default None
51055106
If the axis is a MultiIndex (hierarchical), group by a particular

pandas/core/groupby.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -2756,7 +2756,6 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
27562756
27572757
"""
27582758
group_axis = obj._get_axis(axis)
2759-
is_axis_multiindex = isinstance(obj._info_axis, MultiIndex)
27602759

27612760
# validate that the passed single level is compatible with the passed
27622761
# axis of the object
@@ -2817,9 +2816,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
28172816
elif isinstance(key, BaseGrouper):
28182817
return key, [], obj
28192818

2820-
# when MultiIndex, allow tuple to be a key
2821-
if not isinstance(key, (tuple, list)) or \
2822-
(isinstance(key, tuple) and is_axis_multiindex):
2819+
# Everything which is not a list is a key (including tuples):
2820+
if not isinstance(key, list):
28232821
keys = [key]
28242822
match_axis_length = False
28252823
else:

pandas/tests/groupby/test_groupby.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -264,7 +264,7 @@ def test_len(self):
264264
df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
265265
assert len(df.groupby(('a'))) == 0
266266
assert len(df.groupby(('b'))) == 3
267-
assert len(df.groupby(('a', 'b'))) == 3
267+
assert len(df.groupby(['a', 'b'])) == 3
268268

269269
def test_basic_regression(self):
270270
# regression

pandas/tests/groupby/test_grouping.py

+9-4
Original file line numberDiff line numberDiff line change
@@ -366,13 +366,18 @@ def test_groupby_multiindex_tuple(self):
366366
result = df.groupby(('b', 1)).groups
367367
tm.assert_dict_equal(expected, result)
368368

369-
df2 = pd.DataFrame([[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
369+
df2 = pd.DataFrame(df.values,
370370
columns=pd.MultiIndex.from_arrays(
371371
[['a', 'b', 'b', 'c'],
372372
['d', 'd', 'e', 'e']]))
373-
df2.groupby([('b', 'd')]).groups
374-
expected = df.groupby([('b', 'd')]).groups
375-
result = df.groupby(('b', 'd')).groups
373+
expected = df2.groupby([('b', 'd')]).groups
374+
result = df.groupby(('b', 1)).groups
375+
tm.assert_dict_equal(expected, result)
376+
377+
df3 = pd.DataFrame(df.values,
378+
columns=[('a', 'd'), ('b', 'd'), ('b', 'e'), 'c'])
379+
expected = df3.groupby([('b', 'd')]).groups
380+
result = df.groupby(('b', 1)).groups
376381
tm.assert_dict_equal(expected, result)
377382

378383
@pytest.mark.parametrize('sort', [True, False])

pandas/tests/groupby/test_nth.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,7 @@ def test_nth(self):
202202
freq='B')
203203
df = DataFrame(1, index=business_dates, columns=['a', 'b'])
204204
# get the first, fourth and last two business days for each month
205-
key = (df.index.year, df.index.month)
205+
key = [df.index.year, df.index.month]
206206
result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
207207
expected_dates = pd.to_datetime(
208208
['2014/4/1', '2014/4/4', '2014/4/29', '2014/4/30', '2014/5/1',

pandas/tests/groupby/test_value_counts.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def seed_df(seed_nans, n, m):
4343

4444
df = seed_df(seed_nans, n, m)
4545
bins = None, np.arange(0, max(5, df['3rd'].max()) + 1, 2)
46-
keys = '1st', '2nd', ('1st', '2nd')
46+
keys = '1st', '2nd', ['1st', '2nd']
4747
for k, b in product(keys, bins):
4848
binned.append((df, k, b, n, m))
4949
ids.append("{}-{}-{}".format(k, n, m))

0 commit comments

Comments
 (0)