diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 1cd325dad9f07..40c02eb495f67 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -133,9 +133,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- -- - +- Bug in :meth:`GroupBy.apply` raising ``ValueError`` when the ``by`` axis is not sorted and has duplicates, and the applied ``func`` does not mutate passed-in objects (:issue:`30667`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 233bdd11b372b..a8c96840ff17b 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -969,22 +969,17 @@ def reset_identity(values): result = concat(values, axis=self.axis) ax = self._selected_obj._get_axis(self.axis) - if isinstance(result, Series): - result = result.reindex(ax) + # this is a very unfortunate situation + # we can't use reindex to restore the original order + # when the ax has duplicates + # so we resort to this + # GH 14776, 30667 + if ax.has_duplicates: + indexer, _ = result.index.get_indexer_non_unique(ax.values) + indexer = algorithms.unique1d(indexer) + result = result.take(indexer, axis=self.axis) else: - - # this is a very unfortunate situation - # we have a multi-index that is NOT lexsorted - # and we have a result which is duplicated - # we can't reindex, so we resort to this - # GH 14776 - if isinstance(ax, MultiIndex) and not ax.is_unique: - indexer = algorithms.unique1d( - result.index.get_indexer_for(ax.values) - ) - result = result.take(indexer, axis=self.axis) - else: - result = result.reindex(ax, axis=self.axis) + result = result.reindex(ax, axis=self.axis) elif self.group_keys: diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 2f2f97f2cd993..e81ff37510dc0 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -467,6 +467,29 @@ def filt2(x): tm.assert_frame_equal(result, 
expected)


+@pytest.mark.parametrize("test_series", [True, False])
+def test_apply_with_duplicated_non_sorted_axis(test_series):
+    # GH 30667: apply raised ValueError on a non-sorted axis with duplicates
+    df = pd.DataFrame(
+        [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2]
+    )
+    if test_series:
+        ser = df.set_index("Y")["X"]
+        result = ser.groupby(level=0).apply(lambda x: x)
+
+        # not expecting the order to remain the same for duplicated axis
+        result = result.sort_index()
+        expected = ser.sort_index()
+        tm.assert_series_equal(result, expected)
+    else:
+        result = df.groupby("Y").apply(lambda x: x)
+
+        # order may change on a duplicated axis; stable sort so tied rows match
+        result = result.sort_values("Y", kind="mergesort")
+        expected = df.sort_values("Y", kind="mergesort")
+        tm.assert_frame_equal(result, expected)
+
+
 def test_apply_corner_cases():
     # #535, can't use sliding iterator