Backport PR pandas-dev#31238: REGR: Prevent indexes that aren't directly backed by numpy from entering libreduction code paths (pandas-dev#31378)

jschendel · jreback · commit b905f2b03385 · 2020-01-28T07:17:46.000-05:00
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -13,7 +13,7 @@
     is_list_like,
     is_sequence,
 )
-from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries
+from pandas.core.dtypes.generic import ABCSeries
 
 from pandas.core.construction import create_series_with_explicit_dtype
 
@@ -277,9 +277,8 @@ def apply_standard(self):
         if (
             self.result_type in ["reduce", None]
             and not self.dtypes.apply(is_extension_array_dtype).any()
-            # Disallow complex_internals since libreduction shortcut
-            #  cannot handle MultiIndex
-            and not isinstance(self.agg_axis, ABCMultiIndex)
+            # Disallow complex_internals since libreduction shortcut raises a TypeError
+            and not self.agg_axis._has_complex_internals
         ):
 
             values = self.values
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -164,8 +164,8 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0):
             com.get_callable_name(f) not in base.plotting_methods
             and isinstance(splitter, FrameSplitter)
             and axis == 0
-            # apply_frame_axis0 doesn't allow MultiIndex
-            and not isinstance(sdata.index, MultiIndex)
+            # fast_apply/libreduction don't allow indexes not directly backed by numpy
+            and not sdata.index._has_complex_internals
         ):
             try:
                 result_values, mutated = splitter.fast_apply(f, group_keys)
@@ -616,8 +616,8 @@ def agg_series(self, obj: Series, func):
             # TODO: can we get a performant workaround for EAs backed by ndarray?
             return self._aggregate_series_pure_python(obj, func)
 
-        elif isinstance(obj.index, MultiIndex):
-            # MultiIndex; Pre-empt TypeError in _aggregate_series_fast
+        elif obj.index._has_complex_internals:
+            # Pre-empt TypeError in _aggregate_series_fast
             return self._aggregate_series_pure_python(obj, func)
 
         try:
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
@@ -3825,6 +3825,14 @@ def _assert_can_do_op(self, value):
         if not is_scalar(value):
             raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")
 
+    @property
+    def _has_complex_internals(self):
+        """
+        Indicates whether the index is not directly backed by a numpy array.
+        """
+        # used to avoid libreduction code paths, which raise or require conversion
+        return False
+
     def _is_memory_usage_qualified(self) -> bool:
         """
         Return a boolean if we need a qualified .info display.
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -380,6 +380,11 @@ def values(self):
         """ return the underlying data, which is a Categorical """
         return self._data
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     def _wrap_setop_result(self, other, result):
         name = get_op_result_name(self, other)
         # We use _shallow_copy rather than the Index implementation
diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py
@@ -402,6 +402,11 @@ def values(self):
     def _values(self):
         return self._data
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     def __array_wrap__(self, result, context=None):
         # we don't want the superclass implementation
         return result
diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py
@@ -1346,6 +1346,11 @@ def values(self):
         self._tuples = lib.fast_zip(values)
         return self._tuples
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     @cache_readonly
     def is_monotonic_increasing(self) -> bool:
         """
diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py
@@ -266,6 +266,11 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs):
     def values(self):
         return np.asarray(self)
 
+    @property
+    def _has_complex_internals(self):
+        # used to avoid libreduction code paths, which raise or require conversion
+        return True
+
     def _shallow_copy(self, values=None, **kwargs):
         # TODO: simplify, figure out type of values
         if values is None:
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -361,6 +361,23 @@ def test_func_duplicates_raises():
         df.groupby("A").agg(["min", "min"])
 
 
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.CategoricalIndex(list("abc")),
+        pd.interval_range(0, 3),
+        pd.period_range("2020", periods=3, freq="D"),
+        pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
+    ],
+)
+def test_agg_index_has_complex_internals(index):
+    # GH 31223
+    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
+    result = df.groupby("group").agg({"value": Series.nunique})
+    expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group")
+    tm.assert_frame_equal(result, expected)
+
+
 class TestNamedAggregationSeries:
     def test_series_named_agg(self):
         df = pd.Series([1, 2, 3, 4])
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -769,3 +769,19 @@ def test_apply_multi_level_name(category):
     )
     tm.assert_frame_equal(result, expected)
     assert df.index.names == ["A", "B"]
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        pd.CategoricalIndex(list("abc")),
+        pd.interval_range(0, 3),
+        pd.period_range("2020", periods=3, freq="D"),
+        pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
+    ],
+)
+def test_apply_index_has_complex_internals(index):
+    # GH 31248
+    df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
+    result = df.groupby("group").apply(lambda x: x)
+    tm.assert_frame_equal(result, df)