Skip to content

Commit 3702492

Browse files
rohanp authored and jreback committed
PERF: optimized median func when bottleneck not present (#16509)
1 parent b6acf5e commit 3702492

File tree

5 files changed

+14
-26
lines changed

5 files changed

+14
-26
lines changed

doc/source/whatsnew/v0.23.0.txt

+2
Original file line number | Diff line number | Diff line change
@@ -379,6 +379,8 @@ Performance Improvements
379379
- Improved performance of :func:`IntervalIndex.symmetric_difference()` (:issue:`18475`)
380380
- Improved performance of ``DatetimeIndex`` and ``Series`` arithmetic operations with Business-Month and Business-Quarter frequencies (:issue:`18489`)
381381
- :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`)
382+
- Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`)
383+
382384

383385
.. _whatsnew_0230.docs:
384386

pandas/_libs/algos.pyx

-18
Original file line number | Diff line number | Diff line change
@@ -196,24 +196,6 @@ cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k) nogil:
196196
return a[k]
197197

198198

199-
cpdef numeric median(numeric[:] arr):
200-
"""
201-
A faster median
202-
"""
203-
cdef Py_ssize_t n = arr.size
204-
205-
if n == 0:
206-
return np.NaN
207-
208-
arr = arr.copy()
209-
210-
if n % 2:
211-
return kth_smallest(arr, n // 2)
212-
else:
213-
return (kth_smallest(arr, n // 2) +
214-
kth_smallest(arr, n // 2 - 1)) / 2
215-
216-
217199
# ----------------------------------------------------------------------
218200
# Pairwise correlation/covariance
219201

pandas/_libs/groupby.pyx

+1-1
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,7 @@ def group_last_object(ndarray[object, ndim=2] out,
118118
out[i, j] = resx[i, j]
119119

120120

121-
cdef inline float64_t _median_linear(float64_t* a, int n) nogil:
121+
cdef inline float64_t median_linear(float64_t* a, int n) nogil:
122122
cdef int i, j, na_count = 0
123123
cdef float64_t result
124124
cdef float64_t* tmp

pandas/_libs/groupby_helper.pxi.in

+1-1
Original file line number | Diff line number | Diff line change
@@ -740,7 +740,7 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
740740
ptr += _counts[0]
741741
for j in range(ngroups):
742742
size = _counts[j + 1]
743-
out[j, i] = _median_linear(ptr, size)
743+
out[j, i] = median_linear(ptr, size)
744744
ptr += size
745745

746746

pandas/core/nanops.py

+10-6
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77
import numpy as np
88
from pandas import compat
9-
from pandas._libs import tslib, algos, lib
9+
from pandas._libs import tslib, lib
1010
from pandas.core.dtypes.common import (
1111
_get_dtype,
1212
is_float, is_scalar,
@@ -370,14 +370,13 @@ def nanmean(values, axis=None, skipna=True):
370370
@bottleneck_switch()
371371
def nanmedian(values, axis=None, skipna=True):
372372

373-
values, mask, dtype, dtype_max = _get_values(values, skipna)
374-
375373
def get_median(x):
376374
mask = notna(x)
377375
if not skipna and not mask.all():
378376
return np.nan
379-
return algos.median(com._values_from_object(x[mask]))
377+
return np.nanmedian(x[mask])
380378

379+
values, mask, dtype, dtype_max = _get_values(values, skipna)
381380
if not is_float_dtype(values):
382381
values = values.astype('f8')
383382
values[mask] = np.nan
@@ -389,10 +388,15 @@ def get_median(x):
389388

390389
# an array from a frame
391390
if values.ndim > 1:
391+
392392
# there's a non-empty array to apply over otherwise numpy raises
393393
if notempty:
394-
return _wrap_results(
395-
np.apply_along_axis(get_median, axis, values), dtype)
394+
if not skipna:
395+
return _wrap_results(
396+
np.apply_along_axis(get_median, axis, values), dtype)
397+
398+
# fastpath for the skipna case
399+
return _wrap_results(np.nanmedian(values, axis), dtype)
396400

397401
# must return the correct shape, but median is not defined for the
398402
# empty set so return nans of shape "everything but the passed axis"

0 commit comments

Comments
 (0)