Skip to content

PERF: groupby returns a RangeIndex from groups when possible #58117

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,7 @@ Removal of prior version deprecations/changes
Performance improvements
~~~~~~~~~~~~~~~~~~~~~~~~
- :attr:`Categorical.categories` returns a :class:`RangeIndex` columns instead of an :class:`Index` if the constructed ``values`` was a ``range``. (:issue:`57787`)
- :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` return a :class:`RangeIndex` index when possible. (:issue:`58117`)
- :class:`DataFrame` returns a :class:`RangeIndex` columns when possible when ``data`` is a ``dict`` (:issue:`57943`)
- :func:`concat` returns a :class:`RangeIndex` level in the :class:`MultiIndex` result when ``keys`` is a ``range`` or :class:`RangeIndex` (:issue:`57542`)
- :meth:`RangeIndex.append` returns a :class:`RangeIndex` instead of a :class:`Index` when appending values that could continue the :class:`RangeIndex` (:issue:`57467`)
Expand Down
6 changes: 5 additions & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
Index,
MultiIndex,
ensure_index,
maybe_sequence_to_range,
)
from pandas.core.series import Series
from pandas.core.sorting import (
Expand Down Expand Up @@ -754,7 +755,10 @@ def ids(self) -> npt.NDArray[np.intp]:

@cache_readonly
def result_index_and_ids(self) -> tuple[Index, npt.NDArray[np.intp]]:
levels = [Index._with_infer(ping.uniques) for ping in self.groupings]
levels = [
Index._with_infer(maybe_sequence_to_range(ping.uniques))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this introducing values-dependent behavior? When grouping is [1, 2, 3] I'd get a RangeIndex but when it's [1, 2, 4] it's now an Index.

If they behave the same and so we can call this an implementation detail, then I'm good not calling this values-dependent behavior.

for ping in self.groupings
]
obs = [
ping._observed or not ping._passed_categorical for ping in self.groupings
]
Expand Down
5 changes: 4 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7144,7 +7144,10 @@ def maybe_sequence_to_range(sequence) -> Any | range:
return sequence
if len(sequence) == 0:
return range(0)
np_sequence = np.asarray(sequence, dtype=np.int64)
try:
np_sequence = np.asarray(sequence, dtype=np.int64)
except OverflowError:
return sequence
diff = np_sequence[1] - np_sequence[0]
if diff == 0:
return sequence
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
PeriodIndex,
default_index,
ensure_index,
maybe_sequence_to_range,
)
import pandas.core.indexes.base as ibase
from pandas.core.indexes.multi import maybe_droplevels
Expand Down Expand Up @@ -538,16 +539,14 @@ def _init_dict(
_data : BlockManager for the new Series
index : index for the new Series
"""
keys: Index | tuple

# Looking for NaN in dict doesn't work ({np.nan : 1}[float('nan')]
# raises KeyError), so we iterate the entire dict, and align
if data:
# GH:34717, issue was using zip to extract key and values from data.
# using generators affects the performance.
# Below is the new way of extracting the keys and values

keys = tuple(data.keys())
keys = maybe_sequence_to_range(tuple(data.keys()))
values = list(data.values()) # Generating list of values- faster way
elif index is not None:
# fastpath for Series(data=None). Just use broadcasting a scalar
Expand Down
14 changes: 12 additions & 2 deletions pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
DataFrame,
Index,
MultiIndex,
RangeIndex,
Series,
concat,
to_datetime,
Expand Down Expand Up @@ -517,7 +518,7 @@ def test_callable_result_dtype_frame(
df["c"] = df["c"].astype(input_dtype)
op = getattr(df.groupby(keys)[["c"]], method)
result = op(lambda x: x.astype(result_dtype).iloc[0])
expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
expected_index = RangeIndex(0, 1) if method == "transform" else agg_index
expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype(
result_dtype
)
Expand All @@ -541,7 +542,7 @@ def test_callable_result_dtype_series(keys, agg_index, input, dtype, method):
df = DataFrame({"a": [1], "b": [2], "c": [input]})
op = getattr(df.groupby(keys)["c"], method)
result = op(lambda x: x.astype(dtype).iloc[0])
expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
expected_index = RangeIndex(0, 1) if method == "transform" else agg_index
expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype)
tm.assert_series_equal(result, expected)

Expand Down Expand Up @@ -1663,3 +1664,12 @@ def func(x):
msg = "length must not be 0"
with pytest.raises(ValueError, match=msg):
df.groupby("A", observed=False).agg(func)


def test_agg_groups_returns_rangeindex():
df = DataFrame({"group": [1, 1, 2], "value": [1, 2, 3]})
result = df.groupby("group").agg(max)
expected = DataFrame(
[2, 3], index=RangeIndex(1, 3, name="group"), columns=["value"]
)
tm.assert_frame_equal(result, expected, check_index_type=True)
13 changes: 10 additions & 3 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ def test_groupby_nonobject_dtype(multiindex_dataframe_random_data):
result = grouped.sum()

expected = multiindex_dataframe_random_data.groupby(key.astype("O")).sum()
assert result.index.dtype == np.int8
assert expected.index.dtype == np.int64
tm.assert_frame_equal(result, expected, check_index_type=False)
tm.assert_frame_equal(result, expected, check_index_type=True)


def test_groupby_nonobject_dtype_mixed():
Expand Down Expand Up @@ -2955,3 +2953,12 @@ def test_groupby_dropna_with_nunique_unique():
)

tm.assert_frame_equal(result, expected)


def test_groupby_groups_returns_rangeindex():
df = DataFrame({"group": [1, 1, 2], "value": [1, 2, 3]})
result = df.groupby("group").max()
expected = DataFrame(
[2, 3], index=RangeIndex(1, 3, name="group"), columns=["value"]
)
tm.assert_frame_equal(result, expected, check_index_type=True)
7 changes: 5 additions & 2 deletions pandas/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,9 +258,12 @@ def test_idxmin_idxmax_extremes(how, any_real_numpy_dtype):
)
gb = df.groupby("a")
result = getattr(gb, how)()
expected = DataFrame(
{"b": [1, 0]}, index=pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype)
exp_idx = (
pd.Index([1, 2], name="a", dtype=any_real_numpy_dtype)
if "float" in any_real_numpy_dtype
else pd.RangeIndex(range(1, 3), name="a")
)
expected = DataFrame({"b": [1, 0]}, index=exp_idx)
tm.assert_frame_equal(result, expected)


Expand Down
10 changes: 9 additions & 1 deletion pandas/tests/groupby/transform/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DataFrame,
Index,
MultiIndex,
RangeIndex,
Series,
Timestamp,
concat,
Expand Down Expand Up @@ -290,7 +291,7 @@ def test_transform_casting():
),
"DATETIME": pd.to_datetime([f"2014-10-08 {time}" for time in times]),
},
index=pd.RangeIndex(11, name="idx"),
index=RangeIndex(11, name="idx"),
)

result = df.groupby("ID3")["DATETIME"].transform(lambda x: x.diff())
Expand Down Expand Up @@ -1535,3 +1536,10 @@ def test_transform_sum_one_column_with_matching_labels_and_missing_labels():
result = df.groupby(series, as_index=False).transform("sum")
expected = DataFrame({"X": [-93203.0, -93203.0, np.nan]})
tm.assert_frame_equal(result, expected)


def test_transform_groups_returns_rangeindex():
df = DataFrame({"group": [1, 1, 2], "value": [1, 2, 3]})
result = df.groupby("group").transform(lambda x: x + 1)
expected = DataFrame([2, 3, 4], index=RangeIndex(0, 3), columns=["value"])
tm.assert_frame_equal(result, expected, check_index_type=True)
8 changes: 2 additions & 6 deletions pandas/tests/resample/test_resampler_grouper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas.compat import is_platform_windows

import pandas as pd
from pandas import (
DataFrame,
Expand Down Expand Up @@ -587,14 +585,12 @@ def test_resample_no_columns():
)
expected = DataFrame(
index=pd.MultiIndex(
levels=[np.array([0, 1], dtype=np.intp), index],
levels=[range(2), index],
codes=[[0, 0, 0, 1], [0, 1, 2, 3]],
names=[None, "date"],
)
)

# GH#52710 - Index comes out as 32-bit on 64-bit Windows
tm.assert_frame_equal(result, expected, check_index_type=not is_platform_windows())
tm.assert_frame_equal(result, expected)


def test_groupby_resample_size_all_index_same():
Expand Down
34 changes: 15 additions & 19 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
Grouper,
Index,
MultiIndex,
RangeIndex,
Series,
concat,
date_range,
Expand Down Expand Up @@ -424,12 +425,10 @@ def test_pivot_no_values(self):
res = df.pivot_table(index=df.index.month, columns=df.index.day)

exp_columns = MultiIndex.from_tuples([("A", 1), ("A", 2)])
exp_columns = exp_columns.set_levels(
exp_columns.levels[1].astype(np.int32), level=1
)
exp_columns = exp_columns.set_levels(exp_columns.levels[1], level=1)
exp = DataFrame(
[[2.5, 4.0], [2.0, np.nan]],
index=Index([1, 2], dtype=np.int32),
index=range(1, 3),
columns=exp_columns,
)
tm.assert_frame_equal(res, exp)
Expand All @@ -446,9 +445,7 @@ def test_pivot_no_values(self):
[["A"], pd.DatetimeIndex(["2011-01-31"], dtype="M8[ns]")],
names=[None, "dt"],
)
exp = DataFrame(
[3.25, 2.0], index=Index([1, 2], dtype=np.int32), columns=exp_columns
)
exp = DataFrame([3.25, 2.0], index=range(1, 3), columns=exp_columns)
tm.assert_frame_equal(res, exp)

res = df.pivot_table(
Expand Down Expand Up @@ -1671,7 +1668,7 @@ def test_pivot_dtaccessor(self):
expected = DataFrame(
{7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]},
index=exp_idx,
columns=Index([7, 8, 9], dtype=np.int32, name="dt1"),
columns=RangeIndex(range(7, 10), name="dt1"),
)
tm.assert_frame_equal(result, expected)

Expand All @@ -1681,8 +1678,8 @@ def test_pivot_dtaccessor(self):

expected = DataFrame(
{7: [0.0, 3.0], 8: [1.0, 4.0], 9: [2.0, 5.0]},
index=Index([1, 2], dtype=np.int32, name="dt2"),
columns=Index([7, 8, 9], dtype=np.int32, name="dt1"),
index=RangeIndex(range(1, 3), name="dt2"),
columns=RangeIndex(range(7, 10), name="dt1"),
)
tm.assert_frame_equal(result, expected)

Expand All @@ -1693,11 +1690,12 @@ def test_pivot_dtaccessor(self):
values="value1",
)

exp_col = MultiIndex.from_arrays(
[
np.array([7, 7, 8, 8, 9, 9], dtype=np.int32),
np.array([1, 2] * 3, dtype=np.int32),
],
exp_col = MultiIndex(
levels=(
RangeIndex(start=7, stop=10, step=1, name="dt1"),
RangeIndex(start=1, stop=3, step=1, name="dt2"),
),
codes=([0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]),
names=["dt1", "dt2"],
)
expected = DataFrame(
Expand Down Expand Up @@ -1737,8 +1735,7 @@ def test_daily(self):
for y in ts.index.year.unique().values:
mask = ts.index.year == y
expected[y] = Series(ts.values[mask], index=doy[mask])
expected = DataFrame(expected, dtype=float).T
expected.index = expected.index.astype(np.int32)
expected = DataFrame(expected, dtype=float, index=range(1, 367)).T
tm.assert_frame_equal(result, expected)

def test_monthly(self):
Expand All @@ -1753,8 +1750,7 @@ def test_monthly(self):
for y in ts.index.year.unique().values:
mask = ts.index.year == y
expected[y] = Series(ts.values[mask], index=month[mask])
expected = DataFrame(expected, dtype=float).T
expected.index = expected.index.astype(np.int32)
expected = DataFrame(expected, dtype=float, index=range(1, 13)).T
tm.assert_frame_equal(result, expected)

def test_pivot_table_with_iterator_values(self, data):
Expand Down
Loading