Skip libreduction fast paths for indexes with complex internals.

The MultiIndex-only isinstance guards in pandas/core/apply.py (apply_standard)
and pandas/core/groupby/ops.py (apply, agg_series) are replaced by a check of a
new private property, _has_complex_internals.  The base implementation in
pandas/core/indexes/base.py returns False; the overrides added in
indexes/category.py, indexes/interval.py, indexes/multi.py and
indexes/period.py return True.

Tests added for groupby agg (GH 31223) and groupby apply (GH 31248), each
parametrized over a CategoricalIndex, an interval_range, a period_range and a
MultiIndex.

NOTE(review): the line breaks and indentation below were reconstructed from the
hunk line counts and Python syntax; confirm the context whitespace against the
target tree before applying.

diff --git a/pandas/core/apply.py b/pandas/core/apply.py
index 14a3c3c008e92..a496afeee83af 100644
--- a/pandas/core/apply.py
+++ b/pandas/core/apply.py
@@ -13,7 +13,7 @@
     is_list_like,
     is_sequence,
 )
-from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries
+from pandas.core.dtypes.generic import ABCSeries
 
 from pandas.core.construction import create_series_with_explicit_dtype
 
@@ -277,9 +277,8 @@ def apply_standard(self):
         if (
             self.result_type in ["reduce", None]
             and not self.dtypes.apply(is_extension_array_dtype).any()
-            # Disallow complex_internals since libreduction shortcut
-            # cannot handle MultiIndex
-            and not isinstance(self.agg_axis, ABCMultiIndex)
+            # Disallow complex_internals since libreduction shortcut raises a TypeError
+            and not self.agg_axis._has_complex_internals
         ):
 
             values = self.values
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 37067a1897a52..679d3668523c2 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -164,8 +164,8 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0):
             com.get_callable_name(f) not in base.plotting_methods
             and isinstance(splitter, FrameSplitter)
             and axis == 0
-            # apply_frame_axis0 doesn't allow MultiIndex
-            and not isinstance(sdata.index, MultiIndex)
+            # fast_apply/libreduction doesn't allow non-numpy backed indexes
+            and not sdata.index._has_complex_internals
         ):
             try:
                 result_values, mutated = splitter.fast_apply(f, group_keys)
@@ -616,8 +616,8 @@ def agg_series(self, obj: Series, func):
             # TODO: can we get a performant workaround for EAs backed by ndarray?
             return self._aggregate_series_pure_python(obj, func)
 
-        elif isinstance(obj.index, MultiIndex):
-            # MultiIndex; Pre-empt TypeError in _aggregate_series_fast
+        elif obj.index._has_complex_internals:
+            # Pre-empt TypeError in _aggregate_series_fast
             return self._aggregate_series_pure_python(obj, func)
 
         try:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index 875bbbd355ad6..6f9c865db673b 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -3825,6 +3825,14 @@ def _assert_can_do_op(self, value):
         if not is_scalar(value):
             raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")
 
+    @property
+    def _has_complex_internals(self):
+        """
+        Indicates if an index is not directly backed by a numpy array
+        """
+        # used to avoid libreduction code paths, which raise or require conversion
+        return False
+
     def _is_memory_usage_qualified(self) -> bool:
         """
         Return a boolean if we need a qualified .info display.
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
index 94dee95ab4154..512013678593e 100644
--- a/pandas/core/indexes/category.py
+++ b/pandas/core/indexes/category.py
@@ -380,6 +380,11 @@ def values(self):
         """ return the underlying data, which is a Categorical """
         return self._data
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     def _wrap_setop_result(self, other, result):
         name = get_op_result_name(self, other)
         # We use _shallow_copy rather than the Index implementation
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
index 6d3205bcf87a0..a5ab7cbacea93 100644
--- a/pandas/core/indexes/interval.py
+++ b/pandas/core/indexes/interval.py
@@ -402,6 +402,11 @@ def values(self):
     def _values(self):
         return self._data
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     def __array_wrap__(self, result, context=None):
         # we don't want the superclass implementation
         return result
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
index b89ead2fe7b47..4e176df752867 100644
--- a/pandas/core/indexes/multi.py
+++ b/pandas/core/indexes/multi.py
@@ -1346,6 +1346,11 @@ def values(self):
         self._tuples = lib.fast_zip(values)
         return self._tuples
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     @cache_readonly
     def is_monotonic_increasing(self) -> bool:
         """
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
index 6ab2e66e05d6e..6877cf029ed0c 100644
--- a/pandas/core/indexes/period.py
+++ b/pandas/core/indexes/period.py
@@ -266,6 +266,11 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs):
     def values(self):
         return np.asarray(self)
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     def _shallow_copy(self, values=None, **kwargs):
         # TODO: simplify, figure out type of values
         if values is None:
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index 0b72a61ed84de..056d457e76d81 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -361,6 +361,23 @@ def test_func_duplicates_raises():
         df.groupby("A").agg(["min", "min"])
 
 
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.CategoricalIndex(list("abc")),
+        pd.interval_range(0, 3),
+        pd.period_range("2020", periods=3, freq="D"),
+        pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
+    ],
+)
+def test_agg_index_has_complex_internals(index):
+    # GH 31223
+    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
+    result = df.groupby("group").agg({"value": Series.nunique})
+    expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group")
+    tm.assert_frame_equal(result, expected)
+
+
 class TestNamedAggregationSeries:
     def test_series_named_agg(self):
         df = pd.Series([1, 2, 3, 4])
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 10403bb148c94..9c3a832121c7f 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -769,3 +769,19 @@ def test_apply_multi_level_name(category):
     )
     tm.assert_frame_equal(result, expected)
     assert df.index.names == ["A", "B"]
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.CategoricalIndex(list("abc")),
+        pd.interval_range(0, 3),
+        pd.period_range("2020", periods=3, freq="D"),
+        pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
+    ],
+)
+def test_apply_index_has_complex_internals(index):
+    # GH 31248
+    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
+    result = df.groupby("group").apply(lambda x: x)
+    tm.assert_frame_equal(result, df)