From 209ffcefabf41886ea25afd7a9ff38a65b129b6c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 11 Dec 2017 13:04:50 -0600 Subject: [PATCH 01/10] COMPAT: Emit warning when groupby by a tuple Closes https://github.com/pandas-dev/pandas/issues/18314 --- doc/source/whatsnew/v0.22.0.txt | 3 +++ pandas/core/groupby.py | 10 +++++++++- pandas/tests/groupby/test_groupby.py | 17 +++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 2ea44722d343d..2eef25c0a8c5b 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -202,6 +202,9 @@ Deprecations - ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`). - ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`). - ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimeDeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`) +- Grouping by a tuple of keys now emits a ``FutureWarning`` and is deprecated. + In the future, a tuple passed to ``'by'`` will always refer to a single key + that is the actual tuple, instead of treating the tuple as multiple keys (:issue:`18314`) .. _whatsnew_0220.prior_deprecations: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a5d8cc254cd93..664db6b66a3ed 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2850,7 +2850,15 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, elif isinstance(key, BaseGrouper): return key, [], obj - # Everything which is not a list is a key (including tuples): + tuple_as_list = isinstance(key, tuple) and key not in obj + if tuple_as_list: + msg = ("Interpreting tuple 'by' as a list of keys, rather than " + "a single key. Use 'by={!r}' instead of 'by={!r}'. In the " + "future, a tuple will always mean a single key.".format( + list(key), key)) + warnings.warn(msg, FutureWarning, stacklevel=5) + key = list(key) + if not isinstance(key, list): keys = [key] match_axis_length = False diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 3436dd9169081..e74f73768e376 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2727,6 +2727,23 @@ def test_empty_dataframe_groupby(self): assert_frame_equal(result, expected) + def test_tuple_warns(self): + # https://github.com/pandas-dev/pandas/issues/18314 + df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2], + 'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]}) + with tm.assert_produces_warning(FutureWarning) as w: + df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() + + assert "Interpreting tuple 'by' as a list" in str(w[0].message) + + with tm.assert_produces_warning(FutureWarning) as w: + df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() + + assert "Interpreting tuple 'by' as a list" in str(w[0].message) + + with tm.assert_produces_warning(None): + df.groupby(('a', 'b')).c.mean() + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) From ad09ade25426edf47dfb1e92e39b1b5a3fef694a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 11 Dec 2017 13:34:07 -0600 Subject: [PATCH 02/10] DOC: avoid future warning --- doc/source/groupby.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/groupby.rst b/doc/source/groupby.rst index 316244b583aa2..552ddabb7359a 100644 --- a/doc/source/groupby.rst +++ b/doc/source/groupby.rst @@ -1091,7 +1091,7 @@ You can also select multiple rows from each group by specifying multiple nth val business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B') df = pd.DataFrame(1, index=business_dates, columns=['a', 'b']) # get the first, 4th, and last date index for each month - df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) + df.groupby([df.index.year, df.index.month]).nth([0, 3, -1]) Enumerate group items ~~~~~~~~~~~~~~~~~~~~~ From ade2b2b488dffad717bfbcff91031c7a7feab548 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Dec 2017 14:10:06 -0600 Subject: [PATCH 03/10] Cleanup, test unhashable --- doc/source/whatsnew/v0.22.0.txt | 3 ++- pandas/core/groupby.py | 27 +++++++++++++++++++-------- pandas/tests/groupby/test_groupby.py | 9 +++++++++ 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index a004e24899cbf..8c222352a4c58 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -205,7 +205,8 @@ Deprecations - ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimeDeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`) - Grouping by a tuple of keys now emits a ``FutureWarning`` and is deprecated. In the future, a tuple passed to ``'by'`` will always refer to a single key - that is the actual tuple, instead of treating the tuple as multiple keys (:issue:`18314`) + that is the actual tuple, instead of treating the tuple as multiple keys. To + retain the previous behavior, use a list instead of a tuple (:issue:`18314`) .. _whatsnew_0220.prior_deprecations: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 664db6b66a3ed..41b8e46d8940f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -28,6 +28,7 @@ is_bool_dtype, is_scalar, is_list_like, + is_hashable, needs_i8_conversion, _ensure_float64, _ensure_platform_int, @@ -2850,14 +2851,24 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, elif isinstance(key, BaseGrouper): return key, [], obj - tuple_as_list = isinstance(key, tuple) and key not in obj - if tuple_as_list: - msg = ("Interpreting tuple 'by' as a list of keys, rather than " - "a single key. Use 'by={!r}' instead of 'by={!r}'. In the " - "future, a tuple will always mean a single key.".format( - list(key), key)) - warnings.warn(msg, FutureWarning, stacklevel=5) - key = list(key) + # In the future, a tuple key will always mean an actual key, + # not an iterable of keys. In the meantime, we attempt to provide + # a warning. We can assume that the user wanted a list of keys when + # the key is not in the index. We just have to be careful with + # unhashble elements of `key`. Any unhashable elements implies that + # they wanted a list of keys. + # https://github.com/pandas-dev/pandas/issues/18314 + is_tuple = isinstance(key, tuple) + all_hashable = is_tuple and all(is_hashable(x) for x in key) + + if is_tuple: + if not all_hashable or key not in obj: + msg = ("Interpreting tuple 'by' as a list of keys, rather than " + "a single key. Use 'by={!r}' instead of 'by={!r}'. In the " + "future, a tuple will always mean a single key.".format( + list(key), key)) + warnings.warn(msg, FutureWarning, stacklevel=5) + key = list(key) if not isinstance(key, list): keys = [key] diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e74f73768e376..8152c269e0ba0 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2743,7 +2743,16 @@ def test_tuple_warns(self): with tm.assert_produces_warning(None): df.groupby(('a', 'b')).c.mean() + + def test_tuple_warns_unhashable(self): + # https://github.com/pandas-dev/pandas/issues/18314 + business_dates = date_range(start='4/1/2014', end='6/30/2014', freq='B') + df = DataFrame(1, index=business_dates, columns=['a', 'b']) + + with tm.assert_produces_warning(FutureWarning) as w: + df.groupby((df.index.year, df.index.month)).nth([0, 3, -1]) + assert "Interpreting tuple 'by' as a list" in str(w[0].message) def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) From 60502263bf344463cba6d1cd02fa8e2d17534963 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Dec 2017 14:16:03 -0600 Subject: [PATCH 04/10] PEP8 --- pandas/core/groupby.py | 2 +- pandas/tests/groupby/test_groupby.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 41b8e46d8940f..92b92b0ee3947 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2866,7 +2866,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, msg = ("Interpreting tuple 'by' as a list of keys, rather than " "a single key. Use 'by={!r}' instead of 'by={!r}'. In the " "future, a tuple will always mean a single key.".format( - list(key), key)) + list(key), key)) warnings.warn(msg, FutureWarning, stacklevel=5) key = list(key) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 8152c269e0ba0..53612a0534ae8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2743,10 +2743,11 @@ def test_tuple_warns(self): with tm.assert_produces_warning(None): df.groupby(('a', 'b')).c.mean() - + def test_tuple_warns_unhashable(self): # https://github.com/pandas-dev/pandas/issues/18314 - business_dates = date_range(start='4/1/2014', end='6/30/2014', freq='B') + business_dates = date_range(start='4/1/2014', end='6/30/2014', + freq='B') df = DataFrame(1, index=business_dates, columns=['a', 'b']) with tm.assert_produces_warning(FutureWarning) as w: @@ -2754,6 +2755,7 @@ def test_tuple_warns_unhashable(self): assert "Interpreting tuple 'by' as a list" in str(w[0].message) + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) tups = com._asarray_tuplesafe(tups) From d8c20e85ea9363bbdbe58c16b51fe9b76eb4e24d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Dec 2017 15:01:17 -0600 Subject: [PATCH 05/10] Correct KeyError --- pandas/core/groupby.py | 12 +++++++++--- pandas/tests/groupby/test_groupby.py | 8 ++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 92b92b0ee3947..abce57d69d1c0 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2861,13 +2861,14 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, is_tuple = isinstance(key, tuple) all_hashable = is_tuple and all(is_hashable(x) for x in key) + original_grouper = None if is_tuple: if not all_hashable or key not in obj: msg = ("Interpreting tuple 'by' as a list of keys, rather than " - "a single key. Use 'by={!r}' instead of 'by={!r}'. In the " - "future, a tuple will always mean a single key.".format( - list(key), key)) + "a single key. Use 'by=[...]' instead of 'by=(...)'. In " + "the future, a tuple will always mean a single key.") warnings.warn(msg, FutureWarning, stacklevel=5) + original_grouper = key key = list(key) if not isinstance(key, list): @@ -2939,6 +2940,11 @@ def is_in_obj(gpr): elif obj._is_level_reference(gpr): in_axis, name, level, gpr = False, None, gpr, None else: + # Want to raise with the correct KeyError here + # The deprecation in #18731 means we may have + # the wrong error message here. + if original_grouper: + gpr = original_grouper raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 53612a0534ae8..b014fc31d8582 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2755,6 +2755,14 @@ def test_tuple_warns_unhashable(self): assert "Interpreting tuple 'by' as a list" in str(w[0].message) + def test_tuple_correct_keyerror(self): + df = pd.DataFrame(1, index=range(3), + columns=pd.MultiIndex.from_product([[1, 2], + [3, 4]])) + with tm.assert_produces_warning(FutureWarning): # just silence + with tm.assert_raises_regex(KeyError, "(7, 8)"): + df.groupby((7, 8)).mean() + def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): tups = lmap(tuple, df[keys].values) From d2a2372bf85a22a26ce422becfd108650fe67c6d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Dec 2017 15:08:58 -0600 Subject: [PATCH 06/10] update --- pandas/core/groupby.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index abce57d69d1c0..dcfe0764654d7 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2861,14 +2861,14 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, is_tuple = isinstance(key, tuple) all_hashable = is_tuple and all(is_hashable(x) for x in key) - original_grouper = None if is_tuple: - if not all_hashable or key not in obj: + if (all_hashable and key not in obj and set(key).issubset(obj)) or not all_hashable: + # column names ('a', 'b') -> ['a', 'b'] + # arrays like (a, b) -> [a, b] msg = ("Interpreting tuple 'by' as a list of keys, rather than " "a single key. Use 'by=[...]' instead of 'by=(...)'. In " "the future, a tuple will always mean a single key.") warnings.warn(msg, FutureWarning, stacklevel=5) - original_grouper = key key = list(key) if not isinstance(key, list): @@ -2943,8 +2943,6 @@ def is_in_obj(gpr): # Want to raise with the correct KeyError here # The deprecation in #18731 means we may have # the wrong error message here. - if original_grouper: - gpr = original_grouper raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions From 38ef818bc446049fb748cdef64dde222bfd70078 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Dec 2017 15:17:07 -0600 Subject: [PATCH 07/10] xfail --- pandas/tests/groupby/test_groupby.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b014fc31d8582..9ce61e2ffb10d 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2755,13 +2755,14 @@ def test_tuple_warns_unhashable(self): assert "Interpreting tuple 'by' as a list" in str(w[0].message) + @pytest.mark.xfail(reason="GH-18798") def test_tuple_correct_keyerror(self): + # https://github.com/pandas-dev/pandas/issues/18798 df = pd.DataFrame(1, index=range(3), columns=pd.MultiIndex.from_product([[1, 2], [3, 4]])) - with tm.assert_produces_warning(FutureWarning): # just silence - with tm.assert_raises_regex(KeyError, "(7, 8)"): - df.groupby((7, 8)).mean() + with tm.assert_raises_regex(KeyError, "(7, 8)"): + df.groupby((7, 8)).mean() def _check_groupby(df, result, keys, field, f=lambda x: x.sum()): From a27f4498b987c4b4e02f870ce76d498c37f2bb83 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Dec 2017 15:17:49 -0600 Subject: [PATCH 08/10] remove old comments --- pandas/core/groupby.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index dcfe0764654d7..5e9bb01c77e3f 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2940,9 +2940,6 @@ def is_in_obj(gpr): elif obj._is_level_reference(gpr): in_axis, name, level, gpr = False, None, gpr, None else: - # Want to raise with the correct KeyError here - # The deprecation in #18731 means we may have - # the wrong error message here. raise KeyError(gpr) elif isinstance(gpr, Grouper) and gpr.key is not None: # Add key to exclusions From 4e5ae9fff513828d69275a7b49826346a375273a Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Dec 2017 15:18:46 -0600 Subject: [PATCH 09/10] pep8 --- pandas/core/groupby.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 5e9bb01c77e3f..974f039eb01a7 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2862,7 +2862,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, all_hashable = is_tuple and all(is_hashable(x) for x in key) if is_tuple: - if (all_hashable and key not in obj and set(key).issubset(obj)) or not all_hashable: + if ((all_hashable and key not in obj and set(key).issubset(obj)) + or not all_hashable): # column names ('a', 'b') -> ['a', 'b'] # arrays like (a, b) -> [a, b] msg = ("Interpreting tuple 'by' as a list of keys, rather than " From a8b438323a4e0dc320bea1e1dfdbd72934de70a6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 15 Dec 2017 17:09:00 -0600 Subject: [PATCH 10/10] Fixups --- pandas/core/groupby.py | 2 +- pandas/tests/groupby/test_groupby.py | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 974f039eb01a7..b4223ac0a177a 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -2859,7 +2859,7 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, # they wanted a list of keys. # https://github.com/pandas-dev/pandas/issues/18314 is_tuple = isinstance(key, tuple) - all_hashable = is_tuple and all(is_hashable(x) for x in key) + all_hashable = is_tuple and is_hashable(key) if is_tuple: if ((all_hashable and key not in obj and set(key).issubset(obj)) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 9ce61e2ffb10d..3327612b016f4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2736,11 +2736,6 @@ def test_tuple_warns(self): assert "Interpreting tuple 'by' as a list" in str(w[0].message) - with tm.assert_produces_warning(FutureWarning) as w: - df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean() - - assert "Interpreting tuple 'by' as a list" in str(w[0].message) - with tm.assert_produces_warning(None): df.groupby(('a', 'b')).c.mean()