Skip to content

Commit 3e904fd

Browse files
committed
ENH: implement qcut for quantile cuts, fix 32-bit build close #1378
1 parent 6e46099 commit 3e904fd

File tree

7 files changed

+151
-38
lines changed

7 files changed

+151
-38
lines changed

RELEASE.rst

+1
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ pandas 0.8.0
8080
- Add Panel.transpose method for rearranging axes (#695)
8181
- Add new ``cut`` function (patterned after R) for discretizing data into
8282
equal range-length bins or arbitrary breaks of your choosing (#415)
83+
- Add new ``qcut`` for cutting with quantiles (#1378)
8384
- Added Andrews curves plot type (#1325)
8485
- Add support for tox and Travis CI (#1382)
8586

pandas/core/algorithms.py

+72
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
import pandas.core.common as com
99
import pandas.lib as lib
10+
import pandas._algos as _algos
1011

1112
def match(to_match, values, na_sentinel=-1):
1213
"""
@@ -179,6 +180,77 @@ def rank(values, axis=0, method='average', na_option='keep',
179180
ascending=ascending)
180181
return ranks
181182

183+
def quantile(x, q, interpolation_method='fraction'):
    """
    Compute sample quantile or quantiles of the input array. For example,
    q=0.5 computes the median.

    Interpolation is done only if the desired quantile lies between two
    data points `i` and `j` of the sorted input.

    Parameters
    ----------
    x : ndarray
        Values from which to extract score. A sorted copy is taken; the
        input itself is not modified.
    q : scalar or array
        Quantile(s) at which to extract score, expressed as a fraction of
        the sorted data (0 is the first element, 1 is the last).
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        Method to use when the desired quantile lies between two data
        points `i` and `j`:

        - fraction: `i + (j - i)*fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : scalar or array
        Score at the requested quantile for scalar `q`; otherwise the
        result of mapping the scalar computation over `q`.

    Raises
    ------
    ValueError
        If interpolation is required and `interpolation_method` is not one
        of the supported values.

    Examples
    --------
    >>> a = np.arange(100)
    >>> quantile(a, 0.5)
    49.5
    """
    values = np.sort(x)

    def _get_score(at):
        # Position of the requested quantile on the sorted data; it is a
        # float and may fall between two integer positions.
        idx = at * (len(values) - 1)
        if idx % 1 == 0:
            # Exact hit. Indices must be real integers: NumPy does not
            # accept float indices.
            return values[int(idx)]

        if interpolation_method == 'fraction':
            lower = int(idx)
            return _interpolate(values[lower], values[lower + 1], idx % 1)
        elif interpolation_method == 'lower':
            return values[int(np.floor(idx))]
        elif interpolation_method == 'higher':
            return values[int(np.ceil(idx))]
        else:
            raise ValueError("interpolation_method can only be 'fraction', "
                             "'lower' or 'higher'")

    if np.isscalar(q):
        return _get_score(q)
    else:
        q = np.asarray(q, np.float64)
        return _algos.arrmap_float64(q, _get_score)


def _interpolate(a, b, fraction):
    """Returns the point at the given fraction between a and b, where
    'fraction' must be between 0 and 1.
    """
    return a + (b - a) * fraction
253+
182254

183255
def _get_data_algo(values, func_map):
184256
if com.is_float_dtype(values):

pandas/core/series.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1083,8 +1083,8 @@ def value_counts(self):
10831083
-------
10841084
counts : Series
10851085
"""
1086-
import pandas.core.algorithms as algos
1087-
return algos.value_counts(self.values, sort=True, ascending=False)
1086+
from pandas.core.algorithms import value_counts
1087+
return value_counts(self.values, sort=True, ascending=False)
10881088

10891089
def unique(self):
10901090
"""

pandas/src/datetime.pyx

+14-4
Original file line numberDiff line numberDiff line change
@@ -887,8 +887,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
887887
result_b.fill(NPY_NAT)
888888

889889
# left side
890-
idx_shifted = np.maximum(0, trans.searchsorted(vals - DAY_NS,
891-
side='right') - 1)
890+
idx_shifted = _ensure_int64(
891+
np.maximum(0, trans.searchsorted(vals - DAY_NS, side='right') - 1))
892892

893893
for i in range(n):
894894
v = vals[i] - deltas[idx_shifted[i]]
@@ -899,8 +899,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
899899
result_a[i] = v
900900

901901
# right side
902-
idx_shifted = np.maximum(0, trans.searchsorted(vals + DAY_NS,
903-
side='right') - 1)
902+
idx_shifted = _ensure_int64(
903+
np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1))
904904

905905
for i in range(n):
906906
v = vals[i] - deltas[idx_shifted[i]]
@@ -929,6 +929,16 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
929929

930930
return result
931931

932+
cdef _ensure_int64(object arr):
    # Return `arr` as an int64 ndarray. An ndarray that already has the
    # int64 type number is returned as-is (no copy); any other ndarray is
    # cast with astype, and non-array input is built into a new int64 array.
    # NOTE(review): presumably needed because searchsorted yields a
    # platform-sized integer on 32-bit builds -- confirm against callers.
    if not util.is_array(arr):
        return np.array(arr, dtype=np.int64)

    if (<ndarray> arr).descr.type_num == NPY_INT64:
        return arr

    return arr.astype(np.int64)
940+
941+
932942
cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n):
933943
cdef Py_ssize_t pivot, left = 0, right = n
934944

pandas/tools/tests/test_tile.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
import pandas.util.testing as tm
88
import pandas.core.common as com
99

10-
from pandas.tools.tile import cut
10+
from pandas.core.algorithms import quantile
11+
from pandas.tools.tile import cut, qcut
1112

1213
from numpy.testing import assert_equal, assert_almost_equal
1314

@@ -84,6 +85,20 @@ def test_na_handling(self):
8485
ex_labels = np.where(com.isnull(arr), np.nan, labels)
8586
tm.assert_almost_equal(labels, ex_labels)
8687

88+
def test_qcut(self):
    # qcut with an integer q must produce the same bin edges as the
    # evenly spaced sample quantiles, and the same labels as cutting on
    # those edges directly.
    arr = np.random.randn(1000)

    labels, bins = qcut(arr, 4, retbins=True)

    ex_bins = quantile(arr, [0, .25, .5, .75, 1.])

    assert_almost_equal(bins, ex_bins)

    ex_labels = cut(arr, ex_bins)

    # assertTrue replaces the deprecated unittest alias `assert_`.
    self.assertTrue(np.array_equal(labels, ex_labels))
100+
101+
87102
if __name__ == '__main__':
88103
nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
89104
exit=False)

pandas/tools/tile.py

+45-30
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from pandas.core.api import DataFrame, Series
6+
import pandas.core.algorithms as algos
67
import pandas.core.common as com
78
import pandas.core.nanops as nanops
89

@@ -92,13 +93,56 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
9293
if (np.diff(bins) < 0).any():
9394
raise ValueError('bins must increase monotonically.')
9495

96+
return _bins_to_cuts(x, bins, right=right, labels=labels,
97+
retbins=retbins, precision=precision)
98+
99+
100+
101+
def qcut(x, q=4, labels=None, retbins=False, precision=3):
    """
    Quantile-based discretization function. Discretize variable into
    buckets whose edges are sample quantiles of the data, so that an
    integer `q` splits the quantile range [0, 1] into `q` equal steps.

    Parameters
    ----------
    x : ndarray or Series
    q : integer or array of quantiles
        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
    labels : array or boolean, default None
        Labels to use for bin edges, or False to return integer bin labels
    retbins : bool, optional
        Whether to return the bins or not. Can be useful if q is given
        as an integer, since the bin edges are then computed from the data.
    precision : int, default 3
        Passed through to the bin-labelling step.

    Returns
    -------
    Same result as cutting `x` on the computed quantile edges; when
    `retbins` is True the bin edges are returned alongside the labels.

    Examples
    --------
    >>> labels, bins = qcut(np.random.randn(1000), 4, retbins=True)
    """
    if com.is_integer(q):
        # q evenly spaced steps need q + 1 edges, from 0 to 1.
        quantiles = np.linspace(0, 1, q + 1)
    else:
        # Explicit quantiles were supplied; use them as the bin edges.
        quantiles = q
    bins = algos.quantile(x, quantiles)
    return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
                         precision=precision)
136+
137+
138+
def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
139+
precision=3):
95140
side = 'left' if right else 'right'
96141
ids = bins.searchsorted(x, side=side)
97142

98143
mask = com.isnull(x)
99144
has_nas = mask.any()
100145

101-
102146
if labels is not False:
103147
if labels is None:
104148
labels = bins
@@ -132,35 +176,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
132176
return labels, bins
133177

134178

135-
def qcut(x, n, ties_method='average'):
136-
"""
137-
Quantile-based discretization function. Discretize variable into
138-
equal-sized buckets based on rank. For example 1000 values for 10 quantiles
139-
would produce 1000 integers from 0 to 9 indicating the
140-
141-
Parameters
142-
----------
143-
x : ndarray or Series
144-
n : integer
145-
Number of quantiles. 10 for deciles, 4 for quartiles, etc.
146-
ties_method : {'average', 'min', 'max', 'first'}, default 'average'
147-
average: average rank of group
148-
min: lowest rank in group
149-
max: highest rank in group
150-
first: ranks assigned in order they appear in the array
151-
152-
Returns
153-
-------
154-
155-
Notes
156-
-----
157-
158-
Examples
159-
--------
160-
"""
161-
pass
162-
163-
164179
def _format_label(x, precision=3):
165180
fmt_str = '%%.%dg' % precision
166181
if com.is_float(x):

scripts/count_code.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c|generated.c"
1+
cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c|generated.c|plib.c"

0 commit comments

Comments
 (0)