From ea88b2f1c631dc3bae6d1cdb30409b1eaec85e72 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 4 Jan 2020 22:44:49 +0800 Subject: [PATCH 1/7] BUG: groupby apply raises ValueError when groupby axis has duplicates (GH30667) --- doc/source/whatsnew/v1.0.0.rst | 1 + pandas/core/groupby/groupby.py | 24 +++++++++--------------- pandas/tests/groupby/test_apply.py | 23 +++++++++++++++++++++++ 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b9cc1dad53674..b593bd6b003c7 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -973,6 +973,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) - Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`) +- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates (:issue:`30667`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1ba4938d45fc9..d16c755ac86e0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -969,22 +969,16 @@ def reset_identity(values): result = concat(values, axis=self.axis) ax = self._selected_obj._get_axis(self.axis) - if isinstance(result, Series): - result = result.reindex(ax) + # this is a very unfortunate situation + # we can't use reindex to restore the original order + # when the ax has duplicates + # so we resort to this + # GH 14776, 30667 + if ax.has_duplicates: + indexer = algorithms.unique1d(result.index.get_indexer_for(ax.values)) + result = result.take(indexer, axis=self.axis) else: - - # this is a very unfortunate situation - # we have a multi-index that is NOT lexsorted - # and we have a result which is duplicated - # we can't reindex, so we resort to this - # GH 14776 - if isinstance(ax, MultiIndex) and not ax.is_unique: - indexer = algorithms.unique1d( - result.index.get_indexer_for(ax.values) - ) - result = result.take(indexer, axis=self.axis) - else: - result = result.reindex(ax, axis=self.axis) + result = result.reindex(ax, axis=self.axis) elif self.group_keys: diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 17e5d3efe850f..e7eec7895e4cf 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -752,3 +752,26 @@ def most_common_values(df): ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) tm.assert_series_equal(result, expected) + + +@pytest.mark.parametrize("test_series", [True, False]) +def test_apply_with_duplicated_non_sorted_axis(test_series): + # GH 30667 + df = pd.DataFrame( + [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2] + ) + if test_series: + ser = df.set_index("Y")["X"] + result = ser.groupby(level=0).apply(lambda x: x) + + # not expecting the order to remain the same for duplicated axis + result = result.sort_index() + expected = ser.sort_index() + tm.assert_series_equal(result, expected) + else: + result = df.groupby("Y").apply(lambda x: x) + + # not expecting the order to remain the same for duplicated axis + result = result.sort_values("Y") + expected = df.sort_values("Y") + tm.assert_frame_equal(result, expected) From a7ebc613512282b6ade6565248a64fd5706e3963 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sun, 5 Jan 2020 15:18:10 +0800 Subject: [PATCH 2/7] CLN: cleaned up code (GH30667) --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d16c755ac86e0..09e59b1f603e7 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -975,7 +975,7 @@ def reset_identity(values): # so we resort to this # GH 14776, 30667 if ax.has_duplicates: - indexer = algorithms.unique1d(result.index.get_indexer_for(ax.values)) + indexer = algorithms.unique1d(result.index.get_indexer_non_unique(ax.values)) result = result.take(indexer, axis=self.axis) else: result = result.reindex(ax, axis=self.axis) From 94be16be2157a28eb5874c323e2101c701b64304 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sun, 5 Jan 2020 15:19:25 +0800 Subject: [PATCH 3/7] CLN: reformatted code (GH30667) --- pandas/core/groupby/groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 09e59b1f603e7..dc7135b0339ce 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -975,7 +975,9 @@ def reset_identity(values): # so we resort to this # GH 14776, 30667 if ax.has_duplicates: - indexer = algorithms.unique1d(result.index.get_indexer_non_unique(ax.values)) + indexer = algorithms.unique1d( + result.index.get_indexer_non_unique(ax.values) + ) result = result.take(indexer, axis=self.axis) else: result = result.reindex(ax, axis=self.axis) From c1e8038a7f0c673b843e01880ba867b92942704f Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sun, 5 Jan 2020 15:46:49 +0800 Subject: [PATCH 4/7] TST: moved test case next to other similar tests (GH30667) --- pandas/tests/groupby/test_apply.py | 46 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index e7eec7895e4cf..220281d29c9e3 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -467,6 +467,29 @@ def filt2(x): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("test_series", [True, False]) +def test_apply_with_duplicated_non_sorted_axis(test_series): + # GH 30667 + df = pd.DataFrame( + [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2] + ) + if test_series: + ser = df.set_index("Y")["X"] + result = ser.groupby(level=0).apply(lambda x: x) + + # not expecting the order to remain the same for duplicated axis + result = result.sort_index() + expected = ser.sort_index() + tm.assert_series_equal(result, expected) + else: + result = df.groupby("Y").apply(lambda x: x) + + # not expecting the order to remain the same for duplicated axis + result = result.sort_values("Y") + expected = df.sort_values("Y") + tm.assert_frame_equal(result, expected) + + def test_apply_corner_cases(): # #535, can't use sliding iterator @@ -752,26 +775,3 @@ def most_common_values(df): ["17661101"], index=pd.DatetimeIndex(["2015-02-24"], name="day"), name="userId" ) tm.assert_series_equal(result, expected) - - -@pytest.mark.parametrize("test_series", [True, False]) -def test_apply_with_duplicated_non_sorted_axis(test_series): - # GH 30667 - df = pd.DataFrame( - [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2] - ) - if test_series: - ser = df.set_index("Y")["X"] - result = ser.groupby(level=0).apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_index() - expected = ser.sort_index() - tm.assert_series_equal(result, expected) - else: - result = df.groupby("Y").apply(lambda x: x) - - # not expecting the order to remain the same for duplicated axis - result = result.sort_values("Y") - expected = df.sort_values("Y") - tm.assert_frame_equal(result, expected) From cbbeb09caead820079d0f2695c7c7cc88f79fe82 Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sun, 5 Jan 2020 15:55:17 +0800 Subject: [PATCH 5/7] BUG: correction to previous logic(GH30667) --- pandas/core/groupby/groupby.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dc7135b0339ce..ef56e4d89dcb2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -975,9 +975,8 @@ def reset_identity(values): # so we resort to this # GH 14776, 30667 if ax.has_duplicates: - indexer = algorithms.unique1d( - result.index.get_indexer_non_unique(ax.values) - ) + indexer, _ = result.index.get_indexer_non_unique(ax.values) + indexer = algorithms.unique1d(indexer) result = result.take(indexer, axis=self.axis) else: result = result.reindex(ax, axis=self.axis) From 4502015cf2332742db10329dbc070d308c5d99ac Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Fri, 17 Jan 2020 09:17:13 +0800 Subject: [PATCH 6/7] DOC: Further qualified the bug fix entry in whatsnew (GH30667) --- doc/source/whatsnew/v1.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 2714b4347c2bb..737c4d6a8e07e 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1112,7 +1112,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) - Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`) -- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates (:issue:`30667`) +- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) Reshaping ^^^^^^^^^ From 358e673e4af45bc0fe71394296057333a0f7a48b Mon Sep 17 00:00:00 2001 From: fujiaxiang Date: Sat, 18 Jan 2020 23:46:51 +0800 Subject: [PATCH 7/7] DOC: moved whatsnew to V1.1.0 (GH30667) --- doc/source/whatsnew/v1.0.0.rst | 1 - doc/source/whatsnew/v1.1.0.rst | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 737c4d6a8e07e..fa562838c8f7c 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -1112,7 +1112,6 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` when using nunique on axis=1 (:issue:`30253`) - Bug in :meth:`GroupBy.quantile` with multiple list-like q value and integer column names (:issue:`30289`) - Bug in :meth:`GroupBy.pct_change` and :meth:`core.groupby.SeriesGroupBy.pct_change` causes ``TypeError`` when ``fill_method`` is ``None`` (:issue:`30463`) -- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) Reshaping ^^^^^^^^^ diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b5a7b19f160a4..daf78ee851341 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -133,9 +133,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- -- - +- Bug in :meth:`GroupBy.apply` raises ``ValueError`` when the ``by`` axis is not sorted and has duplicates and the applied ``func`` does not mutate passed in objects (:issue:`30667`) Reshaping ^^^^^^^^^