From 99a5137bdb1f760a3bdd16266e09c6d657dd7dae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 18 Apr 2023 11:56:55 -0700 Subject: [PATCH] BUG: ArrowExtensionArray returning approximate median --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/arrays/arrow/array.py | 8 +++++++- pandas/tests/extension/test_arrow.py | 6 ++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 9c867544a324b..89d15115197f7 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -26,6 +26,7 @@ Bug fixes ~~~~~~~~~ - Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`) - Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`) +- Bug in :func:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`) - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) - Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with a :class:`ArrowDtype` (:issue:`52425`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b60d29aff6991..ef6f4601ed074 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -1270,7 +1270,7 @@ def pyarrow_meth(data, skip_nulls, **kwargs): else: pyarrow_name = { - "median": "approximate_median", + "median": "quantile", "prod": "product", "std": "stddev", "var": "variance", @@ -1286,6 +1286,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs): # GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0 if name in ["any", "all"] and "min_count" not in kwargs: kwargs["min_count"] = 0 + elif name == "median": + # GH 52679: Use quantile instead of approximate_median + kwargs["q"] = 0.5 try: result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs) @@ -1297,6 +1300,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs): f"upgrading pyarrow." ) raise TypeError(msg) from err + if name == "median": + # GH 52679: Use quantile instead of approximate_median; returns array + result = result[0] if pc.is_null(result).as_py(): return self.dtype.na_value diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7e4532b1ee326..45b973a44e56d 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -504,6 +504,12 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request): request.node.add_marker(xfail_mark) super().test_reduce_series(data, all_numeric_reductions, skipna) + @pytest.mark.parametrize("typ", ["int64", "uint64", "float64"]) + def test_median_not_approximate(self, typ): + # GH 52679 + result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median() + assert result == 1.5 + class TestBaseBooleanReduce(base.BaseBooleanReduceTests): @pytest.mark.parametrize("skipna", [True, False])