Skip to content

Backport PR #31238: REGR: Prevent indexes that aren't directly backed by numpy from entering libreduction code paths #31378

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
is_list_like,
is_sequence,
)
from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries
from pandas.core.dtypes.generic import ABCSeries

from pandas.core.construction import create_series_with_explicit_dtype

Expand Down Expand Up @@ -277,9 +277,8 @@ def apply_standard(self):
if (
self.result_type in ["reduce", None]
and not self.dtypes.apply(is_extension_array_dtype).any()
# Disallow complex_internals since libreduction shortcut
# cannot handle MultiIndex
and not isinstance(self.agg_axis, ABCMultiIndex)
# Disallow complex_internals since libreduction shortcut raises a TypeError
and not self.agg_axis._has_complex_internals
):

values = self.values
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0):
com.get_callable_name(f) not in base.plotting_methods
and isinstance(splitter, FrameSplitter)
and axis == 0
# apply_frame_axis0 doesn't allow MultiIndex
and not isinstance(sdata.index, MultiIndex)
# fast_apply/libreduction doesn't allow non-numpy backed indexes
and not sdata.index._has_complex_internals
):
try:
result_values, mutated = splitter.fast_apply(f, group_keys)
Expand Down Expand Up @@ -616,8 +616,8 @@ def agg_series(self, obj: Series, func):
# TODO: can we get a performant workaround for EAs backed by ndarray?
return self._aggregate_series_pure_python(obj, func)

elif isinstance(obj.index, MultiIndex):
# MultiIndex; Pre-empt TypeError in _aggregate_series_fast
elif obj.index._has_complex_internals:
# Pre-empt TypeError in _aggregate_series_fast
return self._aggregate_series_pure_python(obj, func)

try:
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3825,6 +3825,14 @@ def _assert_can_do_op(self, value):
if not is_scalar(value):
raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")

@property
def _has_complex_internals(self):
    """
    Whether the index is backed by something other than a plain numpy array.

    This flag is checked to keep such indexes out of libreduction code
    paths, which raise or require conversion for them.

    Returns
    -------
    bool
        Always False for this implementation.
    """
    return False

def _is_memory_usage_qualified(self) -> bool:
"""
Return a boolean if we need a qualified .info display.
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,11 @@ def values(self):
""" return the underlying data, which is a Categorical """
return self._data

@property
def _has_complex_internals(self):
    """
    Report True so this index is kept out of libreduction code paths.

    Those code paths raise or require conversion; the underlying data
    here is a Categorical rather than a plain numpy array.
    """
    return True

def _wrap_setop_result(self, other, result):
name = get_op_result_name(self, other)
# We use _shallow_copy rather than the Index implementation
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,11 @@ def values(self):
def _values(self):
return self._data

@property
def _has_complex_internals(self):
    """
    Report True so this index is kept out of libreduction code paths.

    Those code paths raise or require conversion for this kind of index.
    """
    return True

def __array_wrap__(self, result, context=None):
# we don't want the superclass implementation
return result
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1346,6 +1346,11 @@ def values(self):
self._tuples = lib.fast_zip(values)
return self._tuples

@property
def _has_complex_internals(self):
    """
    Report True so this index is kept out of libreduction code paths.

    Those code paths raise (a TypeError, per the call sites that check
    this flag) or require conversion for this kind of index.
    """
    return True

@cache_readonly
def is_monotonic_increasing(self) -> bool:
"""
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,11 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs):
def values(self):
return np.asarray(self)

@property
def _has_complex_internals(self):
    """
    Report True so this index is kept out of libreduction code paths.

    Those code paths raise or require conversion for this kind of index.
    """
    return True

def _shallow_copy(self, values=None, **kwargs):
# TODO: simplify, figure out type of values
if values is None:
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,23 @@ def test_func_duplicates_raises():
df.groupby("A").agg(["min", "min"])


@pytest.mark.parametrize(
    "index",
    [
        pd.CategoricalIndex(list("abc")),
        pd.interval_range(0, 3),
        pd.period_range("2020", periods=3, freq="D"),
        pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
    ],
)
def test_agg_index_has_complex_internals(index):
    # GH 31223
    # Aggregating a frame whose index is not directly numpy-backed must
    # still produce the per-group result.
    frame = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
    grouped = frame.groupby("group")
    result = grouped.agg({"value": Series.nunique})

    expected = DataFrame({"group": [1, 2], "value": [2, 1]})
    expected = expected.set_index("group")
    tm.assert_frame_equal(result, expected)


class TestNamedAggregationSeries:
def test_series_named_agg(self):
df = pd.Series([1, 2, 3, 4])
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -769,3 +769,19 @@ def test_apply_multi_level_name(category):
)
tm.assert_frame_equal(result, expected)
assert df.index.names == ["A", "B"]


@pytest.mark.parametrize(
    "index",
    [
        pd.CategoricalIndex(list("abc")),
        pd.interval_range(0, 3),
        pd.period_range("2020", periods=3, freq="D"),
        pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
    ],
)
def test_apply_index_has_complex_internals(index):
    # GH 31248
    # An identity apply over groups must hand back the original frame even
    # when the index is not directly numpy-backed.
    frame = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
    by_group = frame.groupby("group")
    result = by_group.apply(lambda grp: grp)
    tm.assert_frame_equal(result, frame)