Skip to content

Backport PR #27827 on branch 0.25.x (BUG: Fixed groupby quantile for listlike q) #28085

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ Plotting
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^

- Fixed regression in :meth:`pands.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`)
- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`)
- Bug in :meth:`pandas.core.groupby.GroupBy.nth` where ``observed=False`` was being ignored for Categorical groupers (:issue:`26385`)
- Bug in windowing over read-only arrays (:issue:`27766`)
Expand Down
64 changes: 52 additions & 12 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1872,6 +1872,7 @@ def quantile(self, q=0.5, interpolation="linear"):
a 2.0
b 3.0
"""
from pandas import concat

def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]:
if is_object_dtype(vals):
Expand Down Expand Up @@ -1899,18 +1900,57 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:

return vals

return self._get_cythonized_result(
"group_quantile",
self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=q,
interpolation=interpolation,
)
if is_scalar(q):
return self._get_cythonized_result(
"group_quantile",
self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=q,
interpolation=interpolation,
)
else:
results = [
self._get_cythonized_result(
"group_quantile",
self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=qi,
interpolation=interpolation,
)
for qi in q
]
result = concat(results, axis=0, keys=q)
# fix levels to place quantiles on the inside
# TODO(GH-10710): Ideally, we could write this as
# >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :]
# but this hits https://github.com/pandas-dev/pandas/issues/10710
# which doesn't reorder the list-like `q` on the inner level.
order = np.roll(list(range(result.index.nlevels)), -1)
result = result.reorder_levels(order)
result = result.reindex(q, level=-1)

# fix order.
hi = len(q) * self.ngroups
arr = np.arange(0, hi, self.ngroups)
arrays = []

for i in range(self.ngroups):
arr = arr + i
arrays.append(arr)

indices = np.concatenate(arrays)
assert len(indices) == len(result)
return result.take(indices)

@Substitution(name="groupby")
def ngroup(self, ascending=True):
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -1238,6 +1238,57 @@ def test_quantile(interpolation, a_vals, b_vals, q):
tm.assert_frame_equal(result, expected)


def test_quantile_array():
# https://github.com/pandas-dev/pandas/issues/27526
df = pd.DataFrame({"A": [0, 1, 2, 3, 4]})
result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25])

index = pd.MultiIndex.from_product([[0, 1], [0.25]])
expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index)
tm.assert_frame_equal(result, expected)

df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])

result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75])
expected = pd.DataFrame(
{"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
)
tm.assert_frame_equal(result, expected)


def test_quantile_array_no_sort():
df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75])
expected = pd.DataFrame(
{"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
)
tm.assert_frame_equal(result, expected)

result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25])
expected = pd.DataFrame(
{"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
)
tm.assert_frame_equal(result, expected)


def test_quantile_array_multiple_levels():
df = pd.DataFrame(
{"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
)
result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
index = pd.MultiIndex.from_tuples(
[("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
names=["c", "d", None],
)
expected = pd.DataFrame(
{"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
)
tm.assert_frame_equal(result, expected)


def test_quantile_raises():
df = pd.DataFrame(
[["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]
Expand Down