Skip to content

Commit f30278e

Browse files
committed
Merge pull request #6810 from gdraps/replace-scoreatpercentile
CLN: replace pandas.compat.scipy.scoreatpercentile with numpy.percentile
2 parents d2e1abf + 9d89f51 commit f30278e

File tree

8 files changed

+38
-104
lines changed

8 files changed

+38
-104
lines changed

doc/source/release.rst

+4
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,10 @@ API Changes
172172
(and numpy defaults)
173173
- add ``inplace`` keyword to ``Series.order/sort`` to make them inverses (:issue:`6859`)
174174

175+
- Replace ``pandas.compat.scipy.scoreatpercentile`` with ``numpy.percentile`` (:issue:`6810`)
176+
- ``.quantile`` on a ``datetime64[ns]`` series now returns ``Timestamp`` instead
177+
of ``np.datetime64`` objects (:issue:`6810`)
178+
175179
Deprecations
176180
~~~~~~~~~~~~
177181

pandas/compat/scipy.py

-82
Original file line numberDiff line numberDiff line change
@@ -6,88 +6,6 @@
66
import numpy as np
77

88

9-
def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'):
10-
"""Calculate the score at the given `per` percentile of the sequence `a`.
11-
12-
For example, the score at `per=50` is the median. If the desired quantile
13-
lies between two data points, we interpolate between them, according to
14-
the value of `interpolation_method`. If the parameter `limit` is provided, it
15-
should be a tuple (lower, upper) of two values. Values of `a` outside
16-
this (closed) interval will be ignored.
17-
18-
The `interpolation_method` parameter supports three values, namely
19-
`fraction` (default), `lower` and `higher`. Interpolation is done only
20-
if the desired quantile lies between two data points `i` and `j`. For
21-
`fraction`, the result is an interpolated value between `i` and `j`;
22-
for `lower`, the result is `i`, for `higher` the result is `j`.
23-
24-
Parameters
25-
----------
26-
a : ndarray
27-
Values from which to extract score.
28-
per : scalar
29-
Percentile at which to extract score.
30-
limit : tuple, optional
31-
Tuple of two scalars, the lower and upper limits within which to
32-
compute the percentile.
33-
interpolation_method : {'fraction', 'lower', 'higher'}, optional
34-
This optional parameter specifies the interpolation method to use,
35-
when the desired quantile lies between two data points `i` and `j`:
36-
37-
- fraction: `i + (j - i)*fraction`, where `fraction` is the
38-
fractional part of the index surrounded by `i` and `j`.
39-
- lower: `i`.
40-
- higher: `j`.
41-
42-
Returns
43-
-------
44-
score : float
45-
Score at percentile.
46-
47-
See Also
48-
--------
49-
percentileofscore
50-
51-
Examples
52-
--------
53-
>>> from scipy import stats
54-
>>> a = np.arange(100)
55-
>>> stats.scoreatpercentile(a, 50)
56-
49.5
57-
58-
"""
59-
# TODO: this should be a simple wrapper around a well-written quantile
60-
# function. GNU R provides 9 quantile algorithms (!), with differing
61-
# behaviour at, for example, discontinuities.
62-
values = np.sort(a, axis=0)
63-
if limit:
64-
values = values[(limit[0] <= values) & (values <= limit[1])]
65-
66-
idx = per / 100. * (values.shape[0] - 1)
67-
if idx % 1 == 0:
68-
score = values[idx]
69-
else:
70-
if interpolation_method == 'fraction':
71-
score = _interpolate(values[int(idx)], values[int(idx) + 1],
72-
idx % 1)
73-
elif interpolation_method == 'lower':
74-
score = values[np.floor(idx)]
75-
elif interpolation_method == 'higher':
76-
score = values[np.ceil(idx)]
77-
else:
78-
raise ValueError("interpolation_method can only be 'fraction', "
79-
"'lower' or 'higher'")
80-
81-
return score
82-
83-
84-
def _interpolate(a, b, fraction):
85-
"""Returns the point at the given fraction between a and b, where
86-
'fraction' must be between 0 and 1.
87-
"""
88-
return a + (b - a) * fraction
89-
90-
919
def rankdata(a):
9210
"""
9311
Ranks the data, dealing with ties appropriately.

pandas/core/frame.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
import pandas.computation.expressions as expressions
3939
from pandas.computation.eval import eval as _eval
4040
from pandas.computation.scope import _ensure_scope
41-
from pandas.compat.scipy import scoreatpercentile as _quantile
41+
from numpy import percentile as _quantile
4242
from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u,
4343
OrderedDict, raise_with_traceback)
4444
from pandas import compat

pandas/core/series.py

+6-5
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
import pandas.tslib as tslib
5353
import pandas.index as _index
5454

55-
from pandas.compat.scipy import scoreatpercentile as _quantile
55+
from numpy import percentile as _quantile
5656
from pandas.core.config import get_option
5757

5858
__all__ = ['Series']
@@ -1235,10 +1235,11 @@ def quantile(self, q=0.5):
12351235
valid_values = self.dropna().values
12361236
if len(valid_values) == 0:
12371237
return pa.NA
1238-
result = _quantile(valid_values, q * 100)
1239-
if not np.isscalar and com.is_timedelta64_dtype(result):
1240-
from pandas.tseries.timedeltas import to_timedelta
1241-
return to_timedelta(result)
1238+
if com.is_datetime64_dtype(self):
1239+
values = _values_from_object(self).view('i8')
1240+
result = lib.Timestamp(_quantile(values, q * 100))
1241+
else:
1242+
result = _quantile(valid_values, q * 100)
12421243

12431244
return result
12441245

pandas/tests/test_frame.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -10915,13 +10915,13 @@ def wrapper(x):
1091510915
check_dtype=False, check_dates=True)
1091610916

1091710917
def test_quantile(self):
10918-
from pandas.compat.scipy import scoreatpercentile
10918+
from numpy import percentile
1091910919

1092010920
q = self.tsframe.quantile(0.1, axis=0)
10921-
self.assertEqual(q['A'], scoreatpercentile(self.tsframe['A'], 10))
10921+
self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
1092210922
q = self.tsframe.quantile(0.9, axis=1)
1092310923
q = self.intframe.quantile(0.1)
10924-
self.assertEqual(q['A'], scoreatpercentile(self.intframe['A'], 10))
10924+
self.assertEqual(q['A'], percentile(self.intframe['A'], 10))
1092510925

1092610926
# test degenerate case
1092710927
q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)

pandas/tests/test_groupby.py

+8-8
Original file line numberDiff line numberDiff line change
@@ -1907,17 +1907,17 @@ def test_groupby_with_hier_columns(self):
19071907
self.assert_(result.columns.equals(df.columns[:-1]))
19081908

19091909
def test_pass_args_kwargs(self):
1910-
from pandas.compat.scipy import scoreatpercentile
1910+
from numpy import percentile
19111911

1912-
def f(x, q=None):
1913-
return scoreatpercentile(x, q)
1914-
g = lambda x: scoreatpercentile(x, 80)
1912+
def f(x, q=None, axis=0):
1913+
return percentile(x, q, axis=axis)
1914+
g = lambda x: percentile(x, 80, axis=0)
19151915

19161916
# Series
19171917
ts_grouped = self.ts.groupby(lambda x: x.month)
1918-
agg_result = ts_grouped.agg(scoreatpercentile, 80)
1919-
apply_result = ts_grouped.apply(scoreatpercentile, 80)
1920-
trans_result = ts_grouped.transform(scoreatpercentile, 80)
1918+
agg_result = ts_grouped.agg(percentile, 80, axis=0)
1919+
apply_result = ts_grouped.apply(percentile, 80, axis=0)
1920+
trans_result = ts_grouped.transform(percentile, 80, axis=0)
19211921

19221922
agg_expected = ts_grouped.quantile(.8)
19231923
trans_expected = ts_grouped.transform(g)
@@ -1935,7 +1935,7 @@ def f(x, q=None):
19351935

19361936
# DataFrame
19371937
df_grouped = self.tsframe.groupby(lambda x: x.month)
1938-
agg_result = df_grouped.agg(scoreatpercentile, 80)
1938+
agg_result = df_grouped.agg(percentile, 80, axis=0)
19391939
apply_result = df_grouped.apply(DataFrame.quantile, .8)
19401940
expected = df_grouped.quantile(.8)
19411941
assert_frame_equal(apply_result, expected)

pandas/tests/test_series.py

+15-4
Original file line numberDiff line numberDiff line change
@@ -2137,17 +2137,28 @@ def test_prod_numpy16_bug(self):
21372137
self.assertNotIsInstance(result, Series)
21382138

21392139
def test_quantile(self):
2140-
from pandas.compat.scipy import scoreatpercentile
2140+
from numpy import percentile
21412141

21422142
q = self.ts.quantile(0.1)
2143-
self.assertEqual(q, scoreatpercentile(self.ts.valid(), 10))
2143+
self.assertEqual(q, percentile(self.ts.valid(), 10))
21442144

21452145
q = self.ts.quantile(0.9)
2146-
self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90))
2146+
self.assertEqual(q, percentile(self.ts.valid(), 90))
21472147

21482148
# object dtype
21492149
q = Series(self.ts,dtype=object).quantile(0.9)
2150-
self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90))
2150+
self.assertEqual(q, percentile(self.ts.valid(), 90))
2151+
2152+
# datetime64[ns] dtype
2153+
dts = self.ts.index.to_series()
2154+
q = dts.quantile(.2)
2155+
self.assertEqual(q, Timestamp('2000-01-10 19:12:00'))
2156+
2157+
if not _np_version_under1p7:
2158+
# timedelta64[ns] dtype
2159+
tds = dts.diff()
2160+
q = tds.quantile(.25)
2161+
self.assertEqual(q, pd.to_timedelta('24:00:00'))
21512162

21522163
def test_describe(self):
21532164
_ = self.series.describe()

pandas/tseries/tests/test_timedeltas.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ def test_timedelta_ops(self):
240240

241241
result = td.quantile(.1)
242242
# This properly returned a scalar.
243-
expected = to_timedelta('00:00:02.6')
243+
expected = np.timedelta64(2599999999,'ns')
244244
tm.assert_almost_equal(result, expected)
245245

246246
result = td.median()[0]

0 commit comments

Comments
 (0)