Skip to content

BUG: Fixed groupby quantile for listlike q #27827

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.25.1.rst
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ Plotting
Groupby/resample/rolling
^^^^^^^^^^^^^^^^^^^^^^^^

- Fixed regression in :meth:`pands.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Typo: pandas

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multiple -> list-like

- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`)
- Bug in windowing over read-only arrays (:issue:`27766`)
- Fixed segfault in `pandas.core.groupby.DataFrameGroupBy.quantile` when an invalid quantile was passed (:issue:`27470`)
Expand Down
64 changes: 52 additions & 12 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1870,6 +1870,7 @@ def quantile(self, q=0.5, interpolation="linear"):
a 2.0
b 3.0
"""
from pandas import concat

def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]:
if is_object_dtype(vals):
Expand Down Expand Up @@ -1897,18 +1898,57 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray:

return vals

return self._get_cythonized_result(
"group_quantile",
self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=q,
interpolation=interpolation,
)
if is_scalar(q):
return self._get_cythonized_result(
"group_quantile",
self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=q,
interpolation=interpolation,
)
else:
results = [
self._get_cythonized_result(
"group_quantile",
self.grouper,
aggregate=True,
needs_values=True,
needs_mask=True,
cython_dtype=np.float64,
pre_processing=pre_processor,
post_processing=post_processor,
q=qi,
interpolation=interpolation,
)
for qi in q
]
result = concat(results, axis=0, keys=q)
# fix levels to place quantiles on the inside
# TODO(GH-10710): Ideally, we could write this as
# >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :]
# but this hits https://github.com/pandas-dev/pandas/issues/10710
# which doesn't reorder the list-like `q` on the inner level.
order = np.roll(list(range(result.index.nlevels)), -1)
result = result.reorder_levels(order)
result = result.reindex(q, level=-1)

# fix order.
hi = len(q) * self.ngroups
arr = np.arange(0, hi, self.ngroups)
arrays = []

for i in range(self.ngroups):
arr = arr + i
arrays.append(arr)

indices = np.concatenate(arrays)
assert len(indices) == len(result)
return result.take(indices)

@Substitution(name="groupby")
def ngroup(self, ascending=True):
Expand Down
51 changes: 51 additions & 0 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -1238,6 +1238,57 @@ def test_quantile(interpolation, a_vals, b_vals, q):
tm.assert_frame_equal(result, expected)


def test_quantile_array():
# https://github.com/pandas-dev/pandas/issues/27526
df = pd.DataFrame({"A": [0, 1, 2, 3, 4]})
result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25])

index = pd.MultiIndex.from_product([[0, 1], [0.25]])
expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index)
tm.assert_frame_equal(result, expected)

df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])

result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75])
expected = pd.DataFrame(
{"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
)
tm.assert_frame_equal(result, expected)


def test_quantile_array_no_sort():
df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75])
expected = pd.DataFrame(
{"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
)
tm.assert_frame_equal(result, expected)

result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25])
expected = pd.DataFrame(
{"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
)
tm.assert_frame_equal(result, expected)


def test_quantile_array_multiple_levels():
df = pd.DataFrame(
{"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
)
result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
index = pd.MultiIndex.from_tuples(
[("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
names=["c", "d", None],
)
expected = pd.DataFrame(
{"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
)
tm.assert_frame_equal(result, expected)


def test_quantile_raises():
df = pd.DataFrame(
[["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"]
Expand Down