From b94ee25dccf032ebb65cb41b0e4135bf3c97c72e Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 21 Aug 2021 13:17:31 +0530 Subject: [PATCH 1/5] BUG: GroupBy.quantile fails with pd.NA --- doc/source/whatsnew/v1.3.3.rst | 2 +- pandas/core/groupby/groupby.py | 4 ++++ pandas/tests/groupby/test_quantile.py | 10 ++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 1340188c3d609..93ed59c7bdb0a 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,7 +17,7 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) -- +- Fixed regression in :meth:`.GroupBy.quantile` which was failing with ``pandas.NA`` (:issue:`42849`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ab29dea3190c8..6a87463e25fd3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -63,6 +63,7 @@ class providing the base-class of operations. from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, + is_float_dtype, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -2448,6 +2449,9 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: elif is_timedelta64_dtype(vals.dtype): inference = np.dtype("timedelta64[ns]") out = np.asarray(vals).astype(float) + elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): + out = vals.to_numpy(dtype=float, na_value=np.nan) + inference = np.dtype(np.float64) else: out = np.asarray(vals) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 90437b9139594..f31a1bc5f42e4 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -287,3 +287,13 @@ def test_columns_groupby_quantile(): ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dtype", [float, "Float64", "Float32"]) +def test_groupby_quantile_NA(dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([0.2], dtype=float, index=[1.0], name="y") + expected.index.name = "x" + tm.assert_series_equal(expected, result) From 5712bf89e5f70f52cf89771fded838dc94a9ab5d Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 21 Aug 2021 19:46:11 +0530 Subject: [PATCH 2/5] suggested change --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 6a87463e25fd3..32375a6bc2cb4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2450,8 +2450,8 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: inference = np.dtype("timedelta64[ns]") out = np.asarray(vals).astype(float) elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): - out = vals.to_numpy(dtype=float, na_value=np.nan) inference = np.dtype(np.float64) + out = vals.to_numpy(dtype=float, na_value=np.nan) else: out = np.asarray(vals) From 105abf488501a2d6d223ef1b09861dbf37cdf717 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 21 Aug 2021 20:14:49 +0530 Subject: [PATCH 3/5] added fixture; colocated near dtype tests --- pandas/tests/groupby/test_quantile.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index f31a1bc5f42e4..4b303d1426930 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -248,6 +248,15 @@ def test_groupby_quantile_skips_invalid_dtype(q): tm.assert_frame_equal(result, expected) +def test_groupby_quantile_NA(any_float_dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([0.2], dtype=float, index=[1.0], name="y") + expected.index.name = "x" + tm.assert_series_equal(expected, result) + + def test_groupby_timedelta_quantile(): # GH: 29485 df = DataFrame( @@ -287,13 +296,3 @@ def test_columns_groupby_quantile(): ) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("dtype", [float, "Float64", "Float32"]) -def test_groupby_quantile_NA(dtype): - # GH#42849 - df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=dtype) - result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([0.2], dtype=float, index=[1.0], name="y") - expected.index.name = "x" - tm.assert_series_equal(expected, result) From 485402c7a9bd4370baceee153bed0e6f3bf3024e Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Sat, 21 Aug 2021 22:48:16 +0530 Subject: [PATCH 4/5] added tests for int_EA & allNA --- pandas/tests/groupby/test_quantile.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 4b303d1426930..ab91df6cb7594 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -248,7 +248,7 @@ def test_groupby_quantile_skips_invalid_dtype(q): tm.assert_frame_equal(result, expected) -def test_groupby_quantile_NA(any_float_dtype): +def test_groupby_quantile_NA_float(any_float_dtype): # GH#42849 df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype) result = df.groupby("x")["y"].quantile(0.5) @@ -257,6 +257,24 @@ def test_groupby_quantile_NA(any_float_dtype): tm.assert_series_equal(expected, result) +def test_groupby_quantile_NA_int(any_int_ea_dtype): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([3.5], dtype=float, index=[1], name="y") + expected.index.name = "x" + tm.assert_series_equal(expected, result) + + +def test_groupby_quantile_allNA_column(): + # GH#42849 + df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype="Float64") + result = df.groupby("x")["y"].quantile(0.5) + expected = pd.Series([np.nan], dtype=float, index=[1.0], name="y") + expected.index.name = "x" + tm.assert_series_equal(expected, result) + + def test_groupby_timedelta_quantile(): # GH: 29485 df = DataFrame( From 8a57188b5eacfff48f12f77be4affc8e22d83508 Mon Sep 17 00:00:00 2001 From: debnathshoham Date: Wed, 1 Sep 2021 12:36:14 +0530 Subject: [PATCH 5/5] added suggested tests --- pandas/tests/groupby/test_quantile.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index ab91df6cb7594..83d6c20bcac24 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -256,19 +256,31 @@ def test_groupby_quantile_NA_float(any_float_dtype): expected.index.name = "x" tm.assert_series_equal(expected, result) + result = df.groupby("x")["y"].quantile([0.5, 0.75]) + expected = pd.Series( + [0.2] * 2, + index=pd.MultiIndex.from_product(([1.0], [0.5, 0.75]), names=["x", None]), + name="y", + ) + tm.assert_series_equal(result, expected) + def test_groupby_quantile_NA_int(any_int_ea_dtype): # GH#42849 df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype) result = df.groupby("x")["y"].quantile(0.5) - expected = pd.Series([3.5], dtype=float, index=[1], name="y") - expected.index.name = "x" + expected = pd.Series([3.5], dtype=float, index=Index([1], name="x"), name="y") tm.assert_series_equal(expected, result) + result = df.groupby("x").quantile(0.5) + expected = DataFrame({"y": 3.5}, index=Index([1], name="x")) + tm.assert_frame_equal(result, expected) + -def test_groupby_quantile_allNA_column(): +@pytest.mark.parametrize("dtype", ["Float64", "Float32"]) +def test_groupby_quantile_allNA_column(dtype): # GH#42849 - df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype="Float64") + df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype) result = df.groupby("x")["y"].quantile(0.5) expected = pd.Series([np.nan], dtype=float, index=[1.0], name="y") expected.index.name = "x"