Skip to content

Commit 77cbbc6

Browse files
committed
ENH: ship parts of scipy.stats, close #1092
1 parent 4cb44fb commit 77cbbc6

File tree

9 files changed

+270
-33
lines changed

9 files changed

+270
-33
lines changed

RELEASE.rst

+5
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,11 @@ pandas 0.8.0
3030
- Add GroupBy.prod optimized aggregation function and 'prod' fast time series
3131
conversion method (#1018)
3232

33+
**Improvements to existing features**
34+
35+
- Ship some functions from scipy.stats with pandas to reduce the SciPy dependency,
36+
e.g. for Series.describe and DataFrame.describe (GH #1092)
37+
3338
**API Changes**
3439

3540
- Change BDay (business day) to not normalize dates by default

pandas/compat/scipy.py

+242
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,242 @@
1+
"""
2+
Shipping functions from SciPy to reduce dependency on having SciPy installed
3+
"""
4+
5+
import numpy as np
6+
7+
8+
def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'):
    """
    Calculate the score at the given `per` percentile of the sequence `a`.

    For example, the score at `per=50` is the median. If the desired quantile
    lies between two data points, the result depends on the value of
    `interpolation_method`. If the parameter `limit` is provided, it should
    be a tuple (lower, upper) of two values. Values of `a` outside this
    (closed) interval will be ignored.

    Parameters
    ----------
    a : ndarray
        Values from which to extract score.
    per : scalar
        Percentile at which to extract score.
    limit : tuple, optional
        Tuple of two scalars, the lower and upper limits within which to
        compute the percentile.
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        Specifies the method to use when the desired quantile lies between
        two data points `i` and `j`:

        - fraction: `i + (j - i)*fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : float
        Score at percentile.

    Raises
    ------
    ValueError
        If the quantile falls between two data points and
        `interpolation_method` is not one of the supported values.

    See Also
    --------
    percentileofscore

    Examples
    --------
    >>> a = np.arange(100)
    >>> scoreatpercentile(a, 50)
    49.5

    """
    # TODO: this should be a simple wrapper around a well-written quantile
    # function. GNU R provides 9 quantile algorithms (!), with differing
    # behaviour at, for example, discontinuities.
    values = np.sort(a, axis=0)
    if limit:
        values = values[(limit[0] <= values) & (values <= limit[1])]

    # Fractional position of the requested percentile in the sorted data.
    idx = per / 100. * (values.shape[0] - 1)

    # NumPy requires integer indices, so convert explicitly instead of
    # indexing with the float position (or with np.floor/np.ceil results).
    lower_pos = int(np.floor(idx))

    if idx % 1 == 0:
        # The percentile falls exactly on a data point.
        return values[lower_pos]

    if interpolation_method == 'fraction':
        return _interpolate(values[lower_pos], values[lower_pos + 1],
                            idx % 1)
    elif interpolation_method == 'lower':
        return values[lower_pos]
    elif interpolation_method == 'higher':
        return values[int(np.ceil(idx))]

    raise ValueError("interpolation_method can only be 'fraction', "
                     "'lower' or 'higher'")


def _interpolate(a, b, fraction):
    """Returns the point at the given fraction between a and b, where
    'fraction' must be between 0 and 1.
    """
    return a + (b - a) * fraction
89+
90+
91+
def rankdata(a):
    """
    Ranks the data, dealing with ties appropriately.

    Equal values are assigned a rank that is the average of the ranks that
    would have been otherwise assigned to all of the values within that set.
    Ranks begin at 1, not 0.

    Parameters
    ----------
    a : array_like
        This array is first flattened.

    Returns
    -------
    rankdata : ndarray
        An array of length equal to the size of `a`, containing rank scores.

    Examples
    --------
    >>> rankdata([0, 2, 2, 3])
    array([ 1. ,  2.5,  2.5,  4. ])

    """
    a = np.ravel(a)
    n = len(a)
    svec, ivec = fastsort(a)
    sumranks = 0
    dupcount = 0
    newarray = np.zeros(n, float)
    # `range` (rather than `xrange`) keeps this working on Python 2 and 3.
    for i in range(n):
        # Accumulate the 0-based positions of the current run of equal values.
        sumranks += i
        dupcount += 1
        if i == n - 1 or svec[i] != svec[i + 1]:
            # End of a run: every member gets the average 1-based rank.
            averank = sumranks / float(dupcount) + 1
            newarray[ivec[i - dupcount + 1:i + 1]] = averank
            sumranks = 0
            dupcount = 0
    return newarray
131+
132+
133+
def fastsort(a):
    """
    Sort an array and provide the argsort.

    Parameters
    ----------
    a : array_like
        Input array.

    Returns
    -------
    sorted_values : ndarray
        The values of `a` reordered by the argsort indices.
    indices : ndarray of type int
        Indices into the original array that produce `sorted_values`,
        as returned by ``np.argsort``.

    """
    # Convert first so that plain sequences (lists, tuples) can be indexed
    # with the integer index array below.
    a = np.asarray(a)
    it = np.argsort(a)
    as_ = a[it]
    return as_, it
152+
153+
154+
def percentileofscore(a, score, kind='rank'):
    """
    The percentile rank of a score relative to a list of scores.

    A `percentileofscore` of, for example, 80% means that 80% of the
    scores in `a` are below the given score. In the case of gaps or
    ties, the exact definition depends on the optional keyword, `kind`.

    Parameters
    ----------
    a : array like
        Array of scores to which `score` is compared.
    score : int or float
        Score that is compared to the elements in `a`.
    kind : {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score:

        - "rank": Average percentage ranking of score. In case of
                  multiple matches, average the percentage rankings of
                  all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
                  distribution function. A percentileofscore of 80%
                  means that 80% of values are less than or equal
                  to the provided score.
        - "strict": Similar to "weak", except that only values that are
                    strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often used
                  in testing. See

                  http://en.wikipedia.org/wiki/Percentile_rank

    Returns
    -------
    pcos : float
        Percentile-position of score (0-100) relative to `a`.

    Raises
    ------
    ValueError
        If `kind` is not one of 'rank', 'strict', 'weak' or 'mean'.

    Examples
    --------
    Three-quarters of the given values lie below a given score:

    >>> percentileofscore([1, 2, 3, 4], 3)
    75.0

    With multiple matches, note how the scores of the two matches, 0.6
    and 0.8 respectively, are averaged:

    >>> percentileofscore([1, 2, 3, 3, 4], 3)
    70.0

    Only 2/5 values are strictly less than 3:

    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
    40.0

    But 4/5 values are less than or equal to 3:

    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
    80.0

    The average between the weak and the strict scores is

    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
    60.0

    """
    a = np.asarray(a)
    # The denominator is the original sample size, even when `score` is
    # temporarily appended below.
    n = len(a)

    if kind == 'rank':
        if not np.any(a == score):
            # `score` is absent: insert it and use 0-based positions, so its
            # position equals the number of values below it.
            a = np.append(a, score)
            a_len = np.arange(len(a))
        else:
            # `score` is present: use 1-based ranks of the matching entries.
            a_len = np.arange(len(a)) + 1.0

        a = np.sort(a)
        # Index with the boolean mask itself; wrapping it in a list adds a
        # dimension and is rejected by current NumPy.
        idx = a == score
        pct = (np.mean(a_len[idx]) / n) * 100.0
        return pct

    elif kind == 'strict':
        return np.count_nonzero(a < score) / float(n) * 100
    elif kind == 'weak':
        return np.count_nonzero(a <= score) / float(n) * 100
    elif kind == 'mean':
        below = np.count_nonzero(a < score)
        at_or_below = np.count_nonzero(a <= score)
        return (below + at_or_below) * 50 / float(n)
    else:
        raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'")

pandas/core/frame.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
from pandas.core.indexing import _NDFrameIndexer, _maybe_droplevels
3232
from pandas.core.internals import BlockManager, make_block, form_blocks
3333
from pandas.core.series import Series, _radd_compat
34+
from pandas.compat.scipy import scoreatpercentile as _quantile
3435
from pandas.util import py3compat
3536
from pandas.util.terminal import get_terminal_size
3637
from pandas.util.decorators import deprecate, Appender, Substitution
@@ -3810,7 +3811,6 @@ def quantile(self, q=0.5, axis=0):
38103811
-------
38113812
quantiles : Series
38123813
"""
3813-
from scipy.stats import scoreatpercentile
38143814
per = q * 100
38153815

38163816
def f(arr):
@@ -3821,7 +3821,7 @@ def f(arr):
38213821
if len(arr) == 0:
38223822
return nan
38233823
else:
3824-
return scoreatpercentile(arr, per)
3824+
return _quantile(arr, per)
38253825

38263826
return self.apply(f, axis=axis)
38273827

pandas/core/series.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
import pandas._tseries as lib
3131
from pandas.util.decorators import Appender, Substitution
3232

33+
from pandas.compat.scipy import scoreatpercentile as _quantile
34+
3335
__all__ = ['Series', 'TimeSeries']
3436

3537
_np_version = np.version.short_version
@@ -1249,11 +1251,10 @@ def quantile(self, q=0.5):
12491251
-------
12501252
quantile : float
12511253
"""
1252-
from scipy.stats import scoreatpercentile
12531254
valid_values = self.dropna().values
12541255
if len(valid_values) == 0:
12551256
return np.nan
1256-
return scoreatpercentile(valid_values, q * 100)
1257+
return _quantile(valid_values, q * 100)
12571258

12581259
def describe(self, percentile_width=50):
12591260
"""

pandas/stats/misc.py

+3-11
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33

44
from pandas.core.api import Series, DataFrame, isnull, notnull
55
from pandas.core.series import remove_na
6+
from pandas.compat.scipy import scoreatpercentile
7+
68

79
__all__ = ['bucket', 'bucketpanel']
810

@@ -293,16 +295,10 @@ def quantileTS(frame, percentile):
293295
percentile: int
294296
nth percentile
295297
296-
See also
297-
--------
298-
scipy.stats.scoreatpercentile
299-
300298
Returns
301299
-------
302300
Series (or TimeSeries)
303301
"""
304-
from scipy.stats import scoreatpercentile
305-
306302
def func(x):
307303
x = np.asarray(x.valid())
308304
if x.any():
@@ -340,15 +336,11 @@ def percentileRank(frame, column=None, kind='mean'):
340336
341337
http://en.wikipedia.org/wiki/Percentile_rank
342338
343-
See also
344-
--------
345-
scipy.stats.percentileofscore
346-
347339
Returns
348340
-------
349341
TimeSeries or DataFrame, depending on input
350342
"""
351-
from scipy.stats import percentileofscore
343+
from pandas.compat.scipy import percentileofscore
352344
fun = lambda xs, score: percentileofscore(remove_na(xs),
353345
score, kind=kind)
354346

pandas/tests/test_frame.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -4552,10 +4552,7 @@ def wrapper(x):
45524552
self._check_stat_op('median', wrapper, frame=self.intframe)
45534553

45544554
def test_quantile(self):
4555-
try:
4556-
from scipy.stats import scoreatpercentile
4557-
except ImportError:
4558-
return
4555+
from pandas.compat.scipy import scoreatpercentile
45594556

45604557
q = self.tsframe.quantile(0.1, axis=0)
45614558
self.assertEqual(q['A'], scoreatpercentile(self.tsframe['A'], 10))
@@ -4615,14 +4612,13 @@ def test_cumprod(self):
46154612
df.cumprod(1)
46164613

46174614
def test_rank(self):
4618-
from scipy.stats import rankdata
4615+
from pandas.compat.scipy import rankdata
46194616

46204617
self.frame['A'][::2] = np.nan
46214618
self.frame['B'][::3] = np.nan
46224619
self.frame['C'][::4] = np.nan
46234620
self.frame['D'][::5] = np.nan
46244621

4625-
46264622
ranks0 = self.frame.rank()
46274623
ranks1 = self.frame.rank(1)
46284624
mask = np.isnan(self.frame.values)

pandas/tests/test_groupby.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1214,7 +1214,7 @@ def test_groupby_with_hier_columns(self):
12141214
self.assert_(result.columns.equals(df.columns[:-1]))
12151215

12161216
def test_pass_args_kwargs(self):
1217-
from scipy.stats import scoreatpercentile
1217+
from pandas.compat.scipy import scoreatpercentile
12181218

12191219
def f(x, q=None):
12201220
return scoreatpercentile(x, q)

pandas/tests/test_series.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1047,7 +1047,7 @@ def test_prod_numpy16_bug(self):
10471047
self.assert_(not isinstance(result, Series))
10481048

10491049
def test_quantile(self):
1050-
from scipy.stats import scoreatpercentile
1050+
from pandas.compat.scipy import scoreatpercentile
10511051

10521052
q = self.ts.quantile(0.1)
10531053
self.assertEqual(q, scoreatpercentile(self.ts.valid(), 10))
@@ -1698,7 +1698,7 @@ def test_order(self):
16981698
assert_almost_equal(expected, ordered.valid().values)
16991699

17001700
def test_rank(self):
1701-
from scipy.stats import rankdata
1701+
from pandas.compat.scipy import rankdata
17021702

17031703
self.ts[::2] = np.nan
17041704
self.ts[:10][::3] = 4.

0 commit comments

Comments
 (0)