Skip to content

Commit 68b2fa6

Browse files
authored
BUG: ArrowExtensionArray returning approximate median (#52765)
1 parent 52e2289 commit 68b2fa6

File tree

3 files changed

+14
-1
lines changed

3 files changed

+14
-1
lines changed

doc/source/whatsnew/v2.0.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ Bug fixes
2626
~~~~~~~~~
2727
- Bug in :attr:`Series.dt.days` that would overflow ``int32`` number of days (:issue:`52391`)
2828
- Bug in :class:`arrays.DatetimeArray` constructor returning an incorrect unit when passed a non-nanosecond numpy datetime array (:issue:`52555`)
29+
- Bug in :meth:`Series.median` with :class:`ArrowDtype` returning an approximate median (:issue:`52679`)
2930
- Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`)
3031
- Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`)
3132
- Bug in :func:`to_datetime` and :func:`to_timedelta` when trying to convert numeric data with an :class:`ArrowDtype` (:issue:`52425`)

pandas/core/arrays/arrow/array.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1277,7 +1277,7 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
12771277

12781278
else:
12791279
pyarrow_name = {
1280-
"median": "approximate_median",
1280+
"median": "quantile",
12811281
"prod": "product",
12821282
"std": "stddev",
12831283
"var": "variance",
@@ -1293,6 +1293,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
12931293
# GH51624: pyarrow defaults to min_count=1, pandas behavior is min_count=0
12941294
if name in ["any", "all"] and "min_count" not in kwargs:
12951295
kwargs["min_count"] = 0
1296+
elif name == "median":
1297+
# GH 52679: Use quantile instead of approximate_median
1298+
kwargs["q"] = 0.5
12961299

12971300
try:
12981301
result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
@@ -1304,6 +1307,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
13041307
f"upgrading pyarrow."
13051308
)
13061309
raise TypeError(msg) from err
1310+
if name == "median":
1311+
# GH 52679: Use quantile instead of approximate_median; quantile returns an array, so take its first element
1312+
result = result[0]
13071313
if pc.is_null(result).as_py():
13081314
return self.dtype.na_value
13091315

pandas/tests/extension/test_arrow.py

+6
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,12 @@ def test_reduce_series(self, data, all_numeric_reductions, skipna, request):
505505
request.node.add_marker(xfail_mark)
506506
super().test_reduce_series(data, all_numeric_reductions, skipna)
507507

508+
@pytest.mark.parametrize("typ", ["int64", "uint64", "float64"])
509+
def test_median_not_approximate(self, typ):
510+
# GH 52679
511+
result = pd.Series([1, 2], dtype=f"{typ}[pyarrow]").median()
512+
assert result == 1.5
513+
508514

509515
class TestBaseBooleanReduce(base.BaseBooleanReduceTests):
510516
@pytest.mark.parametrize("skipna", [True, False])

0 commit comments

Comments
 (0)