Skip to content

Commit dd8533e

Browse files
Backport PR #52765 on branch 2.0.x (BUG: ArrowExtensionArray returning approximate median) (#52785)
Backport PR #52765: BUG: ArrowExtensionArray returning approximate median Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 5a95983 commit dd8533e

File tree

3 files changed

+14
-1
lines changed

3 files changed

+14
-1
lines changed

doc/source/whatsnew/v2.0.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Bug fixes
2626
~~~~~~~~~
2727
- Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`)
2828
- Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`)
29+
- Bug in :meth:`Series.median` with :class:`ArrowDtype` returning an approximate median instead of the exact median (:issue:`52679`)
2930
- Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
3031
- Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set, causing a performance bottleneck (:issue:`52546`)
3132
- Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with an :class:`ArrowDtype` (:issue:`52425`)

pandas/core/arrays/arrow/array.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1259,7 +1259,7 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
12591259

12601260
else:
12611261
pyarrow_name = {
1262-
"median": "approximate_median",
1262+
"median": "quantile",
12631263
"prod": "product",
12641264
"std": "stddev",
12651265
"var": "variance",
@@ -1275,6 +1275,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
12751275
# GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0
12761276
if name in ["any", "all"] and "min_count" not in kwargs:
12771277
kwargs["min_count"] = 0
1278+
elif name == "median":
1279+
# GH 52679: Use quantile instead of approximate_median
1280+
kwargs["q"] = 0.5
12781281

12791282
try:
12801283
result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
@@ -1286,6 +1289,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
12861289
f"upgrading pyarrow."
12871290
)
12881291
raise TypeError(msg) from err
1292+
if name == "median":
1293+
# GH 52679: Use quantile instead of approximate_median; returns array
1294+
result = result[0]
12891295
if pc.is_null(result).as_py():
12901296
return self.dtype.na_value
12911297

pandas/tests/extension/test_arrow.py

+6
Original file line numberDiff line numberDiff line change
@@ -506,6 +506,12 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request):
506506
request.node.add_marker(xfail_mark)
507507
super().test_reduce_series(data, all_numeric_reductions, skipna)
508508

509+
@pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])
510+
def test_median_not_approximate(self, typ):
511+
# GH 52679
512+
result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median()
513+
assert result == 1.5
514+
509515

510516
class TestBaseBooleanReduce(base.BaseBooleanReduceTests):
511517
@pytest.mark.parametrize("skipna", [True, False])

0 commit comments

Comments
 (0)