From 085abb20392f1ce175d24dba6606d4ef6ef5dc79 Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 24 Sep 2020 20:12:10 +0300 Subject: [PATCH 01/15] Tests added for PR #36605 Bug in issue #36158 appeared both for indices and get_group(). --- pandas/tests/groupby/test_grouping.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 6c74e1521eeeb..dd5cecbd3dded 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -124,6 +124,15 @@ def test_getitem_single_column(self): tm.assert_series_equal(result, expected) + def test_indices_grouped_by_tuple_with_lambda(self): + df = pd.DataFrame({'Tuples': ((x, y) + for x in [0, 1] + for y in np.random.randint(3, 5, 5))}) + gb = df.groupby('Tuples') + gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) + expected = gb.indices + result = gb_lambda.indices + tm.assert_dict_equal(result, expected) # grouping # -------------------------------- @@ -775,6 +784,16 @@ def test_get_group_grouped_by_tuple(self): expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2]) tm.assert_frame_equal(result, expected) + def test_get_group_grouped_by_tuple_with_lambda(self): + df = pd.DataFrame({'Tuples': ((x, y) + for x in [0, 1] + for y in np.random.randint(3, 5, 5))}) + gb = df.groupby('Tuples') + gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) + expected = gb.get_group(list(gb.groups.keys())[0]) + result = gb_lambda.get_group(list(gb_lambda.groups.keys())[0]) + tm.assert_frame_equal(result, expected) + def test_groupby_with_empty(self): index = pd.DatetimeIndex(()) data = () From 84981d2d873429925ba8f82baff29904a58006c2 Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 24 Sep 2020 20:22:39 +0300 Subject: [PATCH 02/15] Aligned with [pre-commit] test --- pandas/tests/groupby/test_grouping.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index dd5cecbd3dded..373cb53e34d68 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -125,9 +125,9 @@ def test_getitem_single_column(self): tm.assert_series_equal(result, expected) def test_indices_grouped_by_tuple_with_lambda(self): - df = pd.DataFrame({'Tuples': ((x, y) - for x in [0, 1] - for y in np.random.randint(3, 5, 5))}) + df = pd.DataFrame( + {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} + ) gb = df.groupby('Tuples') gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) expected = gb.indices @@ -785,9 +785,9 @@ def test_get_group_grouped_by_tuple(self): tm.assert_frame_equal(result, expected) def test_get_group_grouped_by_tuple_with_lambda(self): - df = pd.DataFrame({'Tuples': ((x, y) - for x in [0, 1] - for y in np.random.randint(3, 5, 5))}) + df = pd.DataFrame( + {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} + ) gb = df.groupby('Tuples') gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) expected = gb.get_group(list(gb.groups.keys())[0]) From 3853e5f970af51062c0259d5016844b9cf5c92cb Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 24 Sep 2020 20:27:45 +0300 Subject: [PATCH 03/15] Added issue reference to tests --- pandas/tests/groupby/test_grouping.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 373cb53e34d68..981a58939e51b 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -125,6 +125,7 @@ def test_getitem_single_column(self): tm.assert_series_equal(result, expected) def test_indices_grouped_by_tuple_with_lambda(self): + # GH 36158 df = pd.DataFrame( {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} ) @@ -785,6 +786,7 @@ def test_get_group_grouped_by_tuple(self): tm.assert_frame_equal(result, expected) def test_get_group_grouped_by_tuple_with_lambda(self): + # GH 36158 df = pd.DataFrame( {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} ) From fa82e079d7488e9f1251c2ba6435006728458f19 Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 24 Sep 2020 20:33:20 +0300 Subject: [PATCH 04/15] Aligned with [pre-commit] test --- pandas/tests/groupby/test_grouping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 981a58939e51b..74eabb2c1550a 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -129,7 +129,7 @@ def test_indices_grouped_by_tuple_with_lambda(self): df = pd.DataFrame( {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} ) - gb = df.groupby('Tuples') + gb = df.groupby("Tuples") gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) expected = gb.indices result = gb_lambda.indices @@ -790,7 +790,7 @@ def test_get_group_grouped_by_tuple_with_lambda(self): df = pd.DataFrame( {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} ) - gb = df.groupby('Tuples') + gb = df.groupby("Tuples") gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) expected = gb.get_group(list(gb.groups.keys())[0]) result = gb_lambda.get_group(list(gb_lambda.groups.keys())[0]) From 7dde2fd1ec464903e0c1036b31463af3a9493e8b Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 24 Sep 2020 21:04:11 +0300 Subject: [PATCH 05/15] Comply with black --- pandas/tests/groupby/test_grouping.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 74eabb2c1550a..4cce75ae3aecf 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -129,12 +129,16 @@ def test_indices_grouped_by_tuple_with_lambda(self): df = pd.DataFrame( {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} ) + gb = df.groupby("Tuples") gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) + expected = gb.indices result = gb_lambda.indices + tm.assert_dict_equal(result, expected) + # grouping # -------------------------------- @@ -790,10 +794,13 @@ def test_get_group_grouped_by_tuple_with_lambda(self): df = pd.DataFrame( {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} ) + gb = df.groupby("Tuples") gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) + expected = gb.get_group(list(gb.groups.keys())[0]) result = gb_lambda.get_group(list(gb_lambda.groups.keys())[0]) + tm.assert_frame_equal(result, expected) def test_groupby_with_empty(self): From da360d262d086396facc01c1d81287b1939dee1c Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 24 Sep 2020 21:38:17 +0300 Subject: [PATCH 06/15] Missed `self.` Missed `self.` during making PR, tests failed. Now corrected! --- pandas/core/groupby/grouper.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 353e447a31d43..6a49f2add17a2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -524,8 +524,8 @@ def __init__( raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) - if isinstance(grouper, MultiIndex): - self.grouper = grouper._values + if isinstance(self.grouper, MultiIndex): + self.grouper = self.grouper._values if not ( hasattr(self.grouper, "__len__") From 59d662457e6a0519afded7e1841ddf0b4bcec2ed Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Sat, 26 Sep 2020 19:56:00 +0300 Subject: [PATCH 07/15] Referencing changes in comment Changes [from comment](https://github.com/pandas-dev/pandas/pull/36605#pullrequestreview-497010611) by @rhshadrach --- pandas/core/groupby/grouper.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 6a49f2add17a2..d0f6443dbc329 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -522,10 +522,7 @@ def __init__( if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") - self.grouper = self.index.map(self.grouper) - - if isinstance(self.grouper, MultiIndex): - self.grouper = self.grouper._values + self.grouper = self.index._transform_index(self.grouper) if not ( hasattr(self.grouper, "__len__") From 7332b24438d68f53d80008b26cb24402eff32f26 Mon Sep 17 00:00:00 2001 From: Dmitriy Perepelkin <32206956+dimithras@users.noreply.github.com> Date: Thu, 24 Sep 2020 18:57:55 +0300 Subject: [PATCH 08/15] Resolve issue #36158 As described in issue #36158 multi-dimensional groups prepared with lambda raise error on indices as isnan() is not defined for MultiIndex. Grouping already has the check for MultiIndex on line 440, but it's not triggered on __init__ when grouper is represented by lambda function and gets casted to MultiIndex later where MT stays as is. --- pandas/core/groupby/grouper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 9f0d953a2cc71..353e447a31d43 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -523,6 +523,10 @@ def __init__( t = self.name or str(type(self.grouper)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) + + if isinstance(grouper, MultiIndex): + self.grouper = grouper._values + if not ( hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index) From f255870904718b91042e906390031341bb992236 Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Wed, 28 Oct 2020 22:27:33 +0300 Subject: [PATCH 09/15] Failing test edited Previous assertion replaced with a new check. Added description to linking issue. Co-Authored-By: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/tests/groupby/test_grouping.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 4cce75ae3aecf..e93bfa690f2d2 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -408,8 +408,14 @@ def test_groupby_grouper_f_sanity_checked(self): # when the elements are Timestamp. # the result is Index[0:6], very confusing. - msg = r"Grouper result violates len\(labels\) == len\(data\)" - with pytest.raises(AssertionError, match=msg): + # GH36158 + # issue references errors for groupby + # created with functions (lambda or named) + # after changes applied under that issue this test fails with a + # more sensible error than the previous assertion. + + msg = r"'Timestamp' object is not subscriptable" + with pytest.raises(TypeError, match=msg): ts.groupby(lambda key: key[0:6]) def test_grouping_error_on_multidim_input(self, df): From 52bdf4a5be5b9a3557f5ab44a97fe1ed5fdf88df Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 24 Sep 2020 20:12:10 +0300 Subject: [PATCH 10/15] Tests added for PR #36605 Bug in issue #36158 for indices and get_group(). --- pandas/core/groupby/grouper.py | 4 ++-- pandas/tests/groupby/test_grouping.py | 28 +++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 353e447a31d43..6a49f2add17a2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -524,8 +524,8 @@ def __init__( raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) - if isinstance(grouper, MultiIndex): - self.grouper = grouper._values + if isinstance(self.grouper, MultiIndex): + self.grouper = self.grouper._values if not ( hasattr(self.grouper, "__len__") diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 6c74e1521eeeb..4cce75ae3aecf 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -124,6 +124,20 @@ def test_getitem_single_column(self): tm.assert_series_equal(result, expected) + def test_indices_grouped_by_tuple_with_lambda(self): + # GH 36158 + df = pd.DataFrame( + {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} + ) + + gb = df.groupby("Tuples") + gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) + + expected = gb.indices + result = gb_lambda.indices + + tm.assert_dict_equal(result, expected) + # grouping # -------------------------------- @@ -775,6 +789,20 @@ def test_get_group_grouped_by_tuple(self): expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2]) tm.assert_frame_equal(result, expected) + def test_get_group_grouped_by_tuple_with_lambda(self): + # GH 36158 + df = pd.DataFrame( + {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} + ) + + gb = df.groupby("Tuples") + gb_lambda = df.groupby(lambda x: df.iloc[x, 0]) + + expected = gb.get_group(list(gb.groups.keys())[0]) + result = gb_lambda.get_group(list(gb_lambda.groups.keys())[0]) + + tm.assert_frame_equal(result, expected) + def test_groupby_with_empty(self): index = pd.DatetimeIndex(()) data = () From ebcc041c3bce8692be77acc77ad1b31b3ecbe1d6 Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Sat, 26 Sep 2020 19:56:00 +0300 Subject: [PATCH 11/15] Referencing changes in comment Changes [from comment](https://github.com/pandas-dev/pandas/pull/36605#pullrequestreview-497010611) by @rhshadrach --- pandas/core/groupby/grouper.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 6a49f2add17a2..d0f6443dbc329 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -522,10 +522,7 @@ def __init__( if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") - self.grouper = self.index.map(self.grouper) - - if isinstance(self.grouper, MultiIndex): - self.grouper = self.grouper._values + self.grouper = self.index._transform_index(self.grouper) if not ( hasattr(self.grouper, "__len__") From 98c17c786586e44bd0d0a405fad3971f81b2c040 Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Wed, 28 Oct 2020 22:27:33 +0300 Subject: [PATCH 12/15] Failing test edited Previous assertion replaced with a new check. Added description to linking issue. Co-Authored-By: Richard Shadrach <45562402+rhshadrach@users.noreply.github.com> --- pandas/tests/groupby/test_grouping.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 4cce75ae3aecf..e93bfa690f2d2 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -408,8 +408,14 @@ def test_groupby_grouper_f_sanity_checked(self): # when the elements are Timestamp. # the result is Index[0:6], very confusing. - msg = r"Grouper result violates len\(labels\) == len\(data\)" - with pytest.raises(AssertionError, match=msg): + # GH36158 + # issue references errors for groupby + # created with functions (lambda or named) + # after changes applied under that issue this test fails with a + # more sensible error than the previous assertion. + + msg = r"'Timestamp' object is not subscriptable" + with pytest.raises(TypeError, match=msg): ts.groupby(lambda key: key[0:6]) def test_grouping_error_on_multidim_input(self, df): From 0921626626cf50e8404f4587118a8664ea92908d Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 29 Oct 2020 16:09:56 +0300 Subject: [PATCH 13/15] Issue gone with master After merging latest master issue is gone without applying the suggested fixes both by me and @rhshadrach . Removing all changes yet leaving newly added tests referring to original issue. --- pandas/core/groupby/grouper.py | 2 +- pandas/tests/groupby/test_grouping.py | 10 ++-------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index d0f6443dbc329..1626b6df66fbc 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -522,7 +522,7 @@ def __init__( if getattr(self.grouper, "ndim", 1) != 1: t = self.name or str(type(self.grouper)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") - self.grouper = self.index._transform_index(self.grouper) + self.grouper = self.index.map(self.grouper) if not ( hasattr(self.grouper, "__len__") diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index e93bfa690f2d2..4cce75ae3aecf 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -408,14 +408,8 @@ def test_groupby_grouper_f_sanity_checked(self): # when the elements are Timestamp. # the result is Index[0:6], very confusing. - # GH36158 - # issue references errors for groupby - # created with functions (lambda or named) - # after changes applied under that issue this test fails with a - # more sensible error than the previous assertion. - - msg = r"'Timestamp' object is not subscriptable" - with pytest.raises(TypeError, match=msg): + msg = r"Grouper result violates len\(labels\) == len\(data\)" + with pytest.raises(AssertionError, match=msg): ts.groupby(lambda key: key[0:6]) def test_grouping_error_on_multidim_input(self, df): From ecc541d23de557cd4be5693f8854b74291f63956 Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 29 Oct 2020 16:36:47 +0300 Subject: [PATCH 14/15] Removed inconsistent use of pandas namespace --- pandas/tests/groupby/test_grouping.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py index 4cce75ae3aecf..39edc2b8d2527 100644 --- a/pandas/tests/groupby/test_grouping.py +++ b/pandas/tests/groupby/test_grouping.py @@ -126,7 +126,7 @@ def test_getitem_single_column(self): def test_indices_grouped_by_tuple_with_lambda(self): # GH 36158 - df = pd.DataFrame( + df = DataFrame( {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} ) @@ -791,7 +791,7 @@ def test_get_group_grouped_by_tuple(self): def test_get_group_grouped_by_tuple_with_lambda(self): # GH 36158 - df = pd.DataFrame( + df = DataFrame( {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))} ) From a8f4417e067535987f6cc3f0fe551d4639974abf Mon Sep 17 00:00:00 2001 From: Dmitry Perepelkin Date: Thu, 29 Oct 2020 16:43:11 +0300 Subject: [PATCH 15/15] Update grouper.py --- pandas/core/groupby/grouper.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 1626b6df66fbc..9f0d953a2cc71 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -523,7 +523,6 @@ def __init__( t = self.name or str(type(self.grouper)) raise ValueError(f"Grouper for '{t}' not 1-dimensional") self.grouper = self.index.map(self.grouper) - if not ( hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)