Skip to content

Commit b6a7cc9

Browse files
COMPAT: Emit warning when groupby by a tuple (pandas-dev#18731)
* COMPAT: Emit warning when groupby by a tuple (closes pandas-dev#18314)
* DOC: avoid future warning
* Cleanup, test unhashable
* PEP8
* Correct KeyError
* update
* xfail
* remove old comments
* pep8
* Fixups
1 parent 7a0ee19 commit b6a7cc9

File tree

4 files changed

+59
-2
lines changed

4 files changed

+59
-2
lines changed

doc/source/groupby.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1091,7 +1091,7 @@ You can also select multiple rows from each group by specifying multiple nth val
10911091
business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')
10921092
df = pd.DataFrame(1, index=business_dates, columns=['a', 'b'])
10931093
# get the first, 4th, and last date index for each month
1094-
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
1094+
df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])
10951095
10961096
Enumerate group items
10971097
~~~~~~~~~~~~~~~~~~~~~

doc/source/whatsnew/v0.22.0.txt

+4
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,10 @@ Deprecations
203203
- ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`).
204204
- ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`).
205205
- ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimedeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`).
206+
- Grouping by a tuple of keys now emits a ``FutureWarning`` and is deprecated.
207+
In the future, a tuple passed to ``'by'`` will always refer to a single key
208+
that is the actual tuple, instead of treating the tuple as multiple keys. To
209+
retain the previous behavior, use a list instead of a tuple (:issue:`18314`).
206210
- ``Series.valid`` is deprecated. Use :meth:`Series.dropna` instead (:issue:`18800`).
207211

208212
.. _whatsnew_0220.prior_deprecations:

pandas/core/groupby.py

+22-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
is_bool_dtype,
2929
is_scalar,
3030
is_list_like,
31+
is_hashable,
3132
needs_i8_conversion,
3233
_ensure_float64,
3334
_ensure_platform_int,
@@ -2850,7 +2851,27 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
28502851
elif isinstance(key, BaseGrouper):
28512852
return key, [], obj
28522853

2853-
# Everything which is not a list is a key (including tuples):
2854+
# In the future, a tuple key will always mean an actual key,
2855+
# not an iterable of keys. In the meantime, we attempt to provide
2856+
# a warning. We can assume that the user wanted a list of keys when
2857+
# the key is not in the index. We just have to be careful with
2858+
# unhashable elements of `key`. Any unhashable element implies that
2859+
# they wanted a list of keys.
2860+
# https://github.com/pandas-dev/pandas/issues/18314
2861+
is_tuple = isinstance(key, tuple)
2862+
all_hashable = is_tuple and is_hashable(key)
2863+
2864+
if is_tuple:
2865+
if ((all_hashable and key not in obj and set(key).issubset(obj))
2866+
or not all_hashable):
2867+
# column names ('a', 'b') -> ['a', 'b']
2868+
# arrays like (a, b) -> [a, b]
2869+
msg = ("Interpreting tuple 'by' as a list of keys, rather than "
2870+
"a single key. Use 'by=[...]' instead of 'by=(...)'. In "
2871+
"the future, a tuple will always mean a single key.")
2872+
warnings.warn(msg, FutureWarning, stacklevel=5)
2873+
key = list(key)
2874+
28542875
if not isinstance(key, list):
28552876
keys = [key]
28562877
match_axis_length = False

pandas/tests/groupby/test_groupby.py

+32
Original file line numberDiff line numberDiff line change
@@ -2727,6 +2727,38 @@ def test_empty_dataframe_groupby(self):
27272727

27282728
assert_frame_equal(result, expected)
27292729

2730+
def test_tuple_warns(self):
2731+
# https://github.com/pandas-dev/pandas/issues/18314
2732+
df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
2733+
'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
2734+
with tm.assert_produces_warning(FutureWarning) as w:
2735+
df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()
2736+
2737+
assert "Interpreting tuple 'by' as a list" in str(w[0].message)
2738+
2739+
with tm.assert_produces_warning(None):
2740+
df.groupby(('a', 'b')).c.mean()
2741+
2742+
def test_tuple_warns_unhashable(self):
2743+
# https://github.com/pandas-dev/pandas/issues/18314
2744+
business_dates = date_range(start='4/1/2014', end='6/30/2014',
2745+
freq='B')
2746+
df = DataFrame(1, index=business_dates, columns=['a', 'b'])
2747+
2748+
with tm.assert_produces_warning(FutureWarning) as w:
2749+
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
2750+
2751+
assert "Interpreting tuple 'by' as a list" in str(w[0].message)
2752+
2753+
@pytest.mark.xfail(reason="GH-18798")
2754+
def test_tuple_correct_keyerror(self):
2755+
# https://github.com/pandas-dev/pandas/issues/18798
2756+
df = pd.DataFrame(1, index=range(3),
2757+
columns=pd.MultiIndex.from_product([[1, 2],
2758+
[3, 4]]))
2759+
with tm.assert_raises_regex(KeyError, "(7, 8)"):
2760+
df.groupby((7, 8)).mean()
2761+
27302762

27312763
def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
27322764
tups = lmap(tuple, df[keys].values)

0 commit comments

Comments
 (0)