Skip to content

REGR: Prevent indexes that aren't directly backed by numpy from entering libreduction code paths #31238

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits on Jan 28, 2020
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
is_list_like,
is_sequence,
)
from pandas.core.dtypes.generic import ABCMultiIndex, ABCSeries
from pandas.core.dtypes.generic import ABCSeries

from pandas.core.construction import create_series_with_explicit_dtype

Expand Down Expand Up @@ -278,9 +278,8 @@ def apply_standard(self):
if (
self.result_type in ["reduce", None]
and not self.dtypes.apply(is_extension_array_dtype).any()
# Disallow complex_internals since libreduction shortcut
# cannot handle MultiIndex
and not isinstance(self.agg_axis, ABCMultiIndex)
# Disallow complex_internals since libreduction shortcut raises a TypeError
and not self.agg_axis._has_complex_internals
):

values = self.values
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,8 +164,8 @@ def apply(self, f, data: FrameOrSeries, axis: int = 0):
com.get_callable_name(f) not in base.plotting_methods
and isinstance(splitter, FrameSplitter)
and axis == 0
# apply_frame_axis0 doesn't allow MultiIndex
and not isinstance(sdata.index, MultiIndex)
# fast_apply/libreduction doesn't allow non-numpy backed indexes
and not sdata.index._has_complex_internals
):
try:
result_values, mutated = splitter.fast_apply(f, group_keys)
Expand Down Expand Up @@ -616,8 +616,8 @@ def agg_series(self, obj: Series, func):
# TODO: can we get a performant workaround for EAs backed by ndarray?
return self._aggregate_series_pure_python(obj, func)

elif isinstance(obj.index, MultiIndex):
# MultiIndex; Pre-empt TypeError in _aggregate_series_fast
elif obj.index._has_complex_internals:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This now excludes PeriodIndex, which previously worked fine since .values converted to a numpy array. It looks more performant to exclude PeriodIndex though, since we avoid the conversion to numpy:

In [1]: import numpy as np 
   ...: import pandas as pd 
   ...: from string import ascii_letters 
   ...:  
   ...: np.random.seed(123) 
   ...: group = np.random.choice(list(ascii_letters), 10**5) 
   ...: value = np.random.randint(12345, size=10**5) 
   ...: index = pd.period_range("2000", freq="D", periods=10**5) 
   ...: df = pd.DataFrame({"group": group, "value": value}, index=index)

In [2]: %timeit df.groupby("group").agg({"value": pd.Series.nunique})
17.8 ms ± 48.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)  # on this branch
95.9 ms ± 183 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)  # on master

# Pre-empt TypeError in _aggregate_series_fast
return self._aggregate_series_pure_python(obj, func)

try:
Expand Down
8 changes: 8 additions & 0 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4109,6 +4109,14 @@ def _assert_can_do_op(self, value):
if not is_scalar(value):
raise TypeError(f"'value' must be a scalar, passed: {type(value).__name__}")

@property
def _has_complex_internals(self):
"""
Indicates if an index is not directly backed by a numpy array
"""
# used to disable groupby tricks
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"tricks" -> "going through libreduction fastpath which would ..."?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copied that from the original implementation but agree it's not very helpful. Updated to something that's more informative.

return False

def _is_memory_usage_qualified(self) -> bool:
"""
Return a boolean if we need a qualified .info display.
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/category.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,6 +378,11 @@ def values(self):
""" return the underlying data, which is a Categorical """
return self._data

@property
def _has_complex_internals(self) -> bool:
# The underlying data is a Categorical rather than a plain numpy array;
# returning True keeps this index out of the libreduction fast paths
# used by groupby/apply.
return True

def _wrap_setop_result(self, other, result):
name = get_op_result_name(self, other)
# We use _shallow_copy rather than the Index implementation
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/interval.py
Original file line number Diff line number Diff line change
Expand Up @@ -411,6 +411,11 @@ def values(self):
"""
return self._data

@property
def _has_complex_internals(self) -> bool:
# This index is not directly backed by a numpy array; returning True
# keeps it out of the libreduction fast paths used by groupby/apply.
return True

def __array_wrap__(self, result, context=None):
# we don't want the superclass implementation
return result
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1346,6 +1346,11 @@ def values(self):
self._tuples = lib.fast_zip(values)
return self._tuples

@property
def _has_complex_internals(self) -> bool:
# The libreduction fast paths cannot handle a MultiIndex; returning True
# makes groupby/apply fall back to the pure-python code paths.
return True

@cache_readonly
def is_monotonic_increasing(self) -> bool:
"""
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/indexes/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,11 @@ def _simple_new(cls, values, name=None, freq=None, **kwargs):
def values(self):
return np.asarray(self)

@property
def _has_complex_internals(self) -> bool:
# .values has to convert to a numpy array, so this index is treated as
# not directly numpy-backed; returning True keeps it out of the
# libreduction fast paths used by groupby/apply.
return True

def _shallow_copy(self, values=None, **kwargs):
# TODO: simplify, figure out type of values
if values is None:
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,23 @@ def test_func_duplicates_raises():
df.groupby("A").agg(["min", "min"])


@pytest.mark.parametrize(
"index",
[
pd.CategoricalIndex(list("abc")),
pd.interval_range(0, 3),
pd.period_range("2020", periods=3, freq="D"),
pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
],
)
def test_agg_index_has_complex_internals(index):
# GH 31223
# Each parametrized index is one that is not directly backed by a numpy
# array; aggregating with Series.nunique must still return the per-group
# counts (group 1 -> 2 distinct values, group 2 -> 1).
df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
result = df.groupby("group").agg({"value": Series.nunique})
expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group")
tm.assert_frame_equal(result, expected)


class TestNamedAggregationSeries:
def test_series_named_agg(self):
df = pd.Series([1, 2, 3, 4])
Expand Down
16 changes: 16 additions & 0 deletions pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,3 +811,19 @@ def test_groupby_apply_datetime_result_dtypes():
index=["observation", "color", "mood", "intensity", "score"],
)
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"index",
[
pd.CategoricalIndex(list("abc")),
pd.interval_range(0, 3),
pd.period_range("2020", periods=3, freq="D"),
pd.MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
],
)
def test_apply_index_has_complex_internals(index):
# GH 31248
# Each parametrized index is one that is not directly backed by a numpy
# array; an identity groupby.apply must hand back the original frame.
df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
result = df.groupby("group").apply(lambda x: x)
tm.assert_frame_equal(result, df)