Skip to content

Commit be6a3bc

Browse files
fujiaxiang authored and jreback committed
BUG: groupby apply raises ValueError when groupby axis has duplicates and applied identity function (#30679)
1 parent 6a2f95b commit be6a3bc

File tree

3 files changed

+34
-18
lines changed

3 files changed

+34
-18
lines changed

doc/source/whatsnew/v1.1.0.rst

+1-3
Original file line numberDiff line numberDiff line change
@@ -133,9 +133,7 @@ Plotting
133133
Groupby/resample/rolling
134134
^^^^^^^^^^^^^^^^^^^^^^^^
135135

136-
-
137-
-
138-
136+
- Bug in :meth:`GroupBy.apply` where a ``ValueError`` was raised when the ``by`` axis is not sorted, has duplicates, and the applied ``func`` does not mutate the passed-in objects (:issue:`30667`)
139137

140138
Reshaping
141139
^^^^^^^^^

pandas/core/groupby/groupby.py

+10-15
Original file line numberDiff line numberDiff line change
@@ -969,22 +969,17 @@ def reset_identity(values):
969969
result = concat(values, axis=self.axis)
970970
ax = self._selected_obj._get_axis(self.axis)
971971

972-
if isinstance(result, Series):
973-
result = result.reindex(ax)
972+
# this is a very unfortunate situation
973+
# we can't use reindex to restore the original order
974+
# when the ax has duplicates
975+
# so we resort to this
976+
# GH 14776, 30667
977+
if ax.has_duplicates:
978+
indexer, _ = result.index.get_indexer_non_unique(ax.values)
979+
indexer = algorithms.unique1d(indexer)
980+
result = result.take(indexer, axis=self.axis)
974981
else:
975-
976-
# this is a very unfortunate situation
977-
# we have a multi-index that is NOT lexsorted
978-
# and we have a result which is duplicated
979-
# we can't reindex, so we resort to this
980-
# GH 14776
981-
if isinstance(ax, MultiIndex) and not ax.is_unique:
982-
indexer = algorithms.unique1d(
983-
result.index.get_indexer_for(ax.values)
984-
)
985-
result = result.take(indexer, axis=self.axis)
986-
else:
987-
result = result.reindex(ax, axis=self.axis)
982+
result = result.reindex(ax, axis=self.axis)
988983

989984
elif self.group_keys:
990985

pandas/tests/groupby/test_apply.py

+23
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,29 @@ def filt2(x):
467467
tm.assert_frame_equal(result, expected)
468468

469469

470+
@pytest.mark.parametrize("test_series", [True, False])
471+
def test_apply_with_duplicated_non_sorted_axis(test_series):
472+
# GH 30667
473+
df = pd.DataFrame(
474+
[["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2]
475+
)
476+
if test_series:
477+
ser = df.set_index("Y")["X"]
478+
result = ser.groupby(level=0).apply(lambda x: x)
479+
480+
# not expecting the order to remain the same for duplicated axis
481+
result = result.sort_index()
482+
expected = ser.sort_index()
483+
tm.assert_series_equal(result, expected)
484+
else:
485+
result = df.groupby("Y").apply(lambda x: x)
486+
487+
# not expecting the order to remain the same for duplicated axis
488+
result = result.sort_values("Y")
489+
expected = df.sort_values("Y")
490+
tm.assert_frame_equal(result, expected)
491+
492+
470493
def test_apply_corner_cases():
471494
# #535, can't use sliding iterator
472495

0 commit comments

Comments
 (0)