From 02360469e85153672faf0bb7e778e4b77fadc102 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 8 Aug 2019 15:35:12 -0500 Subject: [PATCH 1/5] BUG: Fixed groupby quantile for listlike q Closes #27526 --- doc/source/whatsnew/v0.25.1.rst | 1 + pandas/core/groupby/groupby.py | 46 ++++++++++++++++++++------- pandas/tests/groupby/test_function.py | 19 +++++++++++ 3 files changed, 54 insertions(+), 12 deletions(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index f1d3f152e503d..185d7bdc52b1e 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -117,6 +117,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ +- Fixed regression in :meth:`pandas.core.groupby.DataFrameGroupBy.quantile` raising when multiple quantiles are given (:issue:`27526`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` where applying a timezone conversion lambda function would drop timezone information (:issue:`27496`) - Bug in windowing over read-only arrays (:issue:`27766`) - diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c5e81e21e9fd5..1b7f605ca7a57 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1870,6 +1870,7 @@ def quantile(self, q=0.5, interpolation="linear"): a 2.0 b 3.0 """ + from pandas import concat def pre_processor(vals: np.ndarray) -> Tuple[np.ndarray, Optional[Type]]: if is_object_dtype(vals): @@ -1897,18 +1898,39 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: return vals - return self._get_cythonized_result( - "group_quantile", - self.grouper, - aggregate=True, - needs_values=True, - needs_mask=True, - cython_dtype=np.float64, - pre_processing=pre_processor, - post_processing=post_processor, - q=q, - interpolation=interpolation, - ) + if is_scalar(q): + return self._get_cythonized_result( + "group_quantile", + self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, 
+ cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=q, + interpolation=interpolation, + ) + else: + results = [ + self._get_cythonized_result( + "group_quantile", + self.grouper, + aggregate=True, + needs_values=True, + needs_mask=True, + cython_dtype=np.float64, + pre_processing=pre_processor, + post_processing=post_processor, + q=qi, + interpolation=interpolation, + ) + for qi in q + ] + result = concat(results, axis=0, keys=q).swaplevel() + if self.sort: + result = result.sort_index(level=list(range(result.index.nlevels - 1))) + return result @Substitution(name="groupby") def ngroup(self, ascending=True): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index efc3142b25b82..b324f1d14c4fe 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1238,6 +1238,25 @@ def test_quantile(interpolation, a_vals, b_vals, q): tm.assert_frame_equal(result, expected) +def test_quantile_multi(): + # https://github.com/pandas-dev/pandas/issues/27526 + df = pd.DataFrame({"A": [0, 1, 2, 3, 4]}) + result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25]) + + index = pd.MultiIndex.from_product([[0, 1], [0.25]]) + expected = pd.DataFrame({"A": [0.25, 2.50]}, index=index) + tm.assert_frame_equal(result, expected) + + df = pd.DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]}) + index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]]) + + result = df.groupby([0, 0, 1, 1]).quantile([0.25, 0.75]) + expected = pd.DataFrame( + {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_raises(): df = pd.DataFrame( [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] From ca2411a32c9886f2028fdf57680d9ce5765a87b2 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 9 Aug 2019 13:51:52 -0500 Subject: [PATCH 2/5] bad --- pandas/core/groupby/groupby.py | 22 
++++++++++++++--- pandas/tests/groupby/test_function.py | 34 ++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1b7f605ca7a57..d1519950de211 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1927,10 +1927,26 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: ) for qi in q ] - result = concat(results, axis=0, keys=q).swaplevel() - if self.sort: - result = result.sort_index(level=list(range(result.index.nlevels - 1))) + result = concat(results, axis=0, keys=q) + order = np.roll(list(range(result.index.nlevels)), -1) + result = result.reorder_levels(order) + result = result.reindex(q, level=-1) return result + # + # ngroups = self.ngroups + # nquantiles = len(q) + # arrays = [] + # + # for i in range(ngroups): + # lo = i + # hi = ngroups * nquantiles + i + # print(lo, hi, ngroups) + # arrays.append(np.arange(lo, hi, ngroups)) + # + # idx = np.concatenate(arrays) + # import pdb; pdb.set_trace() + # result = result.iloc[idx] + # return result @Substitution(name="groupby") def ngroup(self, ascending=True): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index b324f1d14c4fe..e654098cefeea 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1238,7 +1238,7 @@ def test_quantile(interpolation, a_vals, b_vals, q): tm.assert_frame_equal(result, expected) -def test_quantile_multi(): +def test_quantile_array(): # https://github.com/pandas-dev/pandas/issues/27526 df = pd.DataFrame({"A": [0, 1, 2, 3, 4]}) result = df.groupby([0, 0, 1, 1, 1]).quantile([0.25]) @@ -1257,6 +1257,38 @@ def test_quantile_multi(): tm.assert_frame_equal(result, expected) +def test_quantile_array_no_sort(): + df = pd.DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]}) + result = df.groupby([1, 0, 1], sort=False).quantile([0.25, 0.5, 0.75]) + expected = 
pd.DataFrame( + {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]), + ) + tm.assert_frame_equal(result, expected) + + result = df.groupby([1, 0, 1], sort=False).quantile([0.75, 0.25]) + expected = pd.DataFrame( + {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]}, + index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]), + ) + tm.assert_frame_equal(result, expected) + + +def test_quantile_array_multiple_levels(): + df = pd.DataFrame( + {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]} + ) + result = df.groupby(["c", "d"]).quantile([0.25, 0.75]) + index = pd.MultiIndex.from_tuples( + [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)], + names=["c", "d", None], + ) + expected = pd.DataFrame( + {"A": [0.5, 1.5, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index + ) + tm.assert_frame_equal(result, expected) + + def test_quantile_raises(): df = pd.DataFrame( [["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"] From f66a67b1e3b9f6d7102938940b9e826aa880d6d1 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 12 Aug 2019 21:20:44 -0500 Subject: [PATCH 3/5] wip --- pandas/core/groupby/groupby.py | 31 +++++++++++++-------------- pandas/tests/groupby/test_function.py | 2 +- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d1519950de211..855d0b940f121 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1898,6 +1898,7 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: return vals + self.exclusions |= set(self.obj.select_dtypes(include="object").columns) if is_scalar(q): return self._get_cythonized_result( "group_quantile", @@ -1927,26 +1928,24 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: ) for qi in q ] + # fix levels to place 
quantiles on the inside result = concat(results, axis=0, keys=q) order = np.roll(list(range(result.index.nlevels)), -1) result = result.reorder_levels(order) result = result.reindex(q, level=-1) - return result - # - # ngroups = self.ngroups - # nquantiles = len(q) - # arrays = [] - # - # for i in range(ngroups): - # lo = i - # hi = ngroups * nquantiles + i - # print(lo, hi, ngroups) - # arrays.append(np.arange(lo, hi, ngroups)) - # - # idx = np.concatenate(arrays) - # import pdb; pdb.set_trace() - # result = result.iloc[idx] - # return result + + # fix order. + hi = len(q) * self.ngroups + arr = np.arange(0, hi, self.ngroups) + arrays = [] + + for i in range(self.ngroups): + arr = arr + i + arrays.append(arr) + + indices = np.concatenate(arrays) + assert len(indices) == len(result) + return result.take(indices) @Substitution(name="groupby") def ngroup(self, ascending=True): diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 376de032d3f6f..509d7c33b643b 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1284,7 +1284,7 @@ def test_quantile_array_multiple_levels(): names=["c", "d", None], ) expected = pd.DataFrame( - {"A": [0.5, 1.5, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index + {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index ) tm.assert_frame_equal(result, expected) From 624d33b691009a00547ba9dea949de35e954a071 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 13 Aug 2019 10:04:48 -0500 Subject: [PATCH 4/5] Remove dtype selection --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 855d0b940f121..c00bd83982c8f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1898,7 +1898,6 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: return vals - self.exclusions |= 
set(self.obj.select_dtypes(include="object").columns) if is_scalar(q): return self._get_cythonized_result( "group_quantile", From dc5147a394ba3276b3b47f04d21b9c12be2e76bf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 20 Aug 2019 09:24:39 -0500 Subject: [PATCH 5/5] add future implementation --- pandas/core/groupby/groupby.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c00bd83982c8f..35c2dac3901a2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1927,8 +1927,12 @@ def post_processor(vals: np.ndarray, inference: Optional[Type]) -> np.ndarray: ) for qi in q ] - # fix levels to place quantiles on the inside result = concat(results, axis=0, keys=q) + # fix levels to place quantiles on the inside + # TODO(GH-10710): Ideally, we could write this as + # >>> result.stack(0).loc[pd.IndexSlice[:, ..., q], :] + # but this hits https://github.com/pandas-dev/pandas/issues/10710 + # which doesn't reorder the list-like `q` on the inner level. order = np.roll(list(range(result.index.nlevels)), -1) result = result.reorder_levels(order) result = result.reindex(q, level=-1)