diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 316244b583aa2..552ddabb7359a 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1091,7 +1091,7 @@ You can also select multiple rows from each group by specifying multiple nth val business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B') df = pd.DataFrame(1, index=business_dates, columns=['a', 'b']) # get the first, 4th, and last date index for each month - df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) + df.groupby([df.index.year, df.index.month]).nth([0, 3, -1]) Enumerate group items ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index dfd222f10d235..ae6d0816abc41 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -203,6 +203,10 @@ Deprecations - ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`). - ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`). - ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimeDeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`) +- Grouping by a tuple of keys now emits a ``FutureWarning`` and is deprecated. + In the future, a tuple passed to ``'by'`` will always refer to a single key + that is the actual tuple, instead of treating the tuple as multiple keys. To + retain the previous behavior, use a list instead of a tuple (:issue:`18314`) - ``Series.valid`` is deprecated. Use :meth:`Series.dropna` instead (:issue:`18800`). .. 
_whatsnew_0220.prior_deprecations: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a5d8cc254cd93..b4223ac0a177a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -28,6 +28,7 @@ is_bool_dtype, is_scalar, is_list_like, + is_hashable, needs_i8_conversion, _ensure_float64, _ensure_platform_int, @@ -2850,7 +2851,27 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, elif isinstance(key, BaseGrouper): return key, [], obj - # Everything which is not a list is a key (including tuples): + # In the future, a tuple key will always mean an actual key, + # not an iterable of keys. In the meantime, we attempt to provide + # a warning. We can assume that the user wanted a list of keys when + # the key is not in the index. We just have to be careful with + # unhashable elements of `key`. Any unhashable element implies that + # they wanted a list of keys. + # https://github.com/pandas-dev/pandas/issues/18314 + is_tuple = isinstance(key, tuple) + all_hashable = is_tuple and is_hashable(key) + + if is_tuple: + if ((all_hashable and key not in obj and set(key).issubset(obj)) + or not all_hashable): + # column names ('a', 'b') -> ['a', 'b'] + # arrays like (a, b) -> [a, b] + msg = ("Interpreting tuple 'by' as a list of keys, rather than " + "a single key. Use 'by=[...]' instead of 'by=(...)'. 
In " + "the future, a tuple will always mean a single key.") + warnings.warn(msg, FutureWarning, stacklevel=5) + key = list(key) + if not isinstance(key, list): keys = [key] match_axis_length = False diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3436dd9169081..3327612b016f4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2727,6 +2727,38 @@ def test_empty_dataframe_groupby(self): assert_frame_equal(result, expected) + def test_tuple_warns(self): + # https://github.com/pandas-dev/pandas/issues/18314 + df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2], + 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}) + with tm.assert_produces_warning(FutureWarning) as w: + df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() + + assert "Interpreting tuple 'by' as a list" in str(w[0].message) + + with tm.assert_produces_warning(None): + df.groupby(('a', 'b')).c.mean() + + def test_tuple_warns_unhashable(self): + # https://github.com/pandas-dev/pandas/issues/18314 + business_dates = date_range(start='4/1/2014', end='6/30/2014', + freq='B') + df = DataFrame(1, index=business_dates, columns=['a', 'b']) + + with tm.assert_produces_warning(FutureWarning) as w: + df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) + + assert "Interpreting tuple 'by' as a list" in str(w[0].message) + + @pytest.mark.xfail(reason="GH-18798") + def test_tuple_correct_keyerror(self): + # https://github.com/pandas-dev/pandas/issues/18798 + df = pd.DataFrame(1, index=range(3), + columns=pd.MultiIndex.from_product([[1, 2], + [3, 4]])) + with tm.assert_raises_regex(KeyError, "(7, 8)"): + df.groupby((7, 8)).mean() + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values)