ENH: implement qcut for quantile cuts, fix 32-bit build close #1378

wesm · wesm · commit 3e904fddda17 · 2012-06-04T22:27:40.000-04:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -80,6 +80,7 @@ pandas 0.8.0
   - Add Panel.transpose method for rearranging axes (#695)
   - Add new ``cut`` function (patterned after R) for discretizing data into
     equal range-length bins or arbitrary breaks of your choosing (#415)
+  - Add new ``qcut`` for cutting with quantiles (#1378)
   - Added Andrews curves plot tupe (#1325)
   - Add support for tox and Travis CI (#1382)
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -7,6 +7,7 @@
 
 import pandas.core.common as com
 import pandas.lib as lib
+import pandas._algos as _algos
 
 def match(to_match, values, na_sentinel=-1):
     """
@@ -179,6 +180,77 @@ def rank(values, axis=0, method='average', na_option='keep',
                   ascending=ascending)
     return ranks
 
+def quantile(x, q, interpolation_method='fraction'):
+    """
+    Compute sample quantile or quantiles of the input array. For example, q=0.5
+    computes the median.
+
+    The `interpolation_method` parameter supports three values, namely
+    `fraction` (default), `lower` and `higher`. Interpolation is done only
+    if the desired quantile lies between two data points `i` and `j`. For
+    `fraction`, the result is an interpolated value between `i` and `j`;
+    for `lower`, the result is `i`, for `higher` the result is `j`.
+
+    Parameters
+    ----------
+    x : ndarray
+        Values from which to extract score.
+    q : scalar or array
+        Quantile(s) at which to extract score, as fractions in [0, 1].
+    interpolation_method : {'fraction', 'lower', 'higher'}, optional
+        This optional parameter specifies the interpolation method to use,
+        when the desired quantile lies between two data points `i` and `j`:
+
+        - fraction: `i + (j - i)*fraction`, where `fraction` is the
+                    fractional part of the index surrounded by `i` and `j`.
+        - lower: `i`.
+        - higher: `j`.
+
+    Returns
+    -------
+    score : float
+        Score at the quantile (one per entry when `q` is an array).
+
+    Examples
+    --------
+    >>> a = np.arange(100)
+    >>> quantile(a, 0.5)
+    49.5
+
+    """
+    values = np.sort(x)
+
+    def _get_score(at):
+        idx = at * (len(values) - 1)
+        if (idx % 1 == 0):
+            # cast to int: NumPy rejects float positions when indexing
+            score = values[int(idx)]
+        else:
+            if interpolation_method == 'fraction':
+                score = _interpolate(values[int(idx)], values[int(idx) + 1],
+                                     idx % 1)
+            elif interpolation_method == 'lower':
+                score = values[int(np.floor(idx))]
+            elif interpolation_method == 'higher':
+                score = values[int(np.ceil(idx))]
+            else:
+                raise ValueError("interpolation_method can only be 'fraction', " \
+                                 "'lower' or 'higher'")
+
+        return score
+
+    if np.isscalar(q):
+        return _get_score(q)
+    else:
+        q = np.asarray(q, np.float64)
+        return _algos.arrmap_float64(q, _get_score)
+
+def _interpolate(a, b, fraction):
+    """Linear interpolation: yields `a` when fraction is 0 and `b` when it
+    is 1; 'fraction' is expected to lie between 0 and 1."""
+    span = b - a
+    return a + span * fraction
+
 
 def _get_data_algo(values, func_map):
     if com.is_float_dtype(values):
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1083,8 +1083,8 @@ def value_counts(self):
         -------
         counts : Series
         """
-        import pandas.core.algorithms as algos
-        return algos.value_counts(self.values, sort=True, ascending=False)
+        from pandas.core.algorithms import value_counts
+        return value_counts(self.values, sort=True, ascending=False)
 
     def unique(self):
         """
diff --git a/pandas/src/datetime.pyx b/pandas/src/datetime.pyx
@@ -887,8 +887,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
     result_b.fill(NPY_NAT)
 
     # left side
-    idx_shifted = np.maximum(0, trans.searchsorted(vals - DAY_NS,
-                                                   side='right') - 1)
+    idx_shifted = _ensure_int64(
+        np.maximum(0, trans.searchsorted(vals - DAY_NS, side='right') - 1))
 
     for i in range(n):
         v = vals[i] - deltas[idx_shifted[i]]
@@ -899,8 +899,8 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
             result_a[i] = v
 
     # right side
-    idx_shifted = np.maximum(0, trans.searchsorted(vals + DAY_NS,
-                                                   side='right') - 1)
+    idx_shifted = _ensure_int64(
+        np.maximum(0, trans.searchsorted(vals + DAY_NS, side='right') - 1))
 
     for i in range(n):
         v = vals[i] - deltas[idx_shifted[i]]
@@ -929,6 +929,16 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz):
 
     return result
 
+cdef _ensure_int64(object arr):
+    # Return `arr` as an int64 ndarray; non-arrays are built via np.array.
+    if not util.is_array(arr):
+        return np.array(arr, dtype=np.int64)
+    # An ndarray that is already int64 is handed back as the same object.
+    if (<ndarray> arr).descr.type_num == NPY_INT64:
+        return arr
+    return arr.astype(np.int64)
+
+
 cdef inline bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n):
     cdef Py_ssize_t pivot, left = 0, right = n
 
diff --git a/pandas/tools/tests/test_tile.py b/pandas/tools/tests/test_tile.py
@@ -7,7 +7,8 @@
 import pandas.util.testing as tm
 import pandas.core.common as com
 
-from pandas.tools.tile import cut
+from pandas.core.algorithms import quantile
+from pandas.tools.tile import cut, qcut
 
 from numpy.testing import assert_equal, assert_almost_equal
 
@@ -84,6 +85,20 @@ def test_na_handling(self):
         ex_labels = np.where(com.isnull(arr), np.nan, labels)
         tm.assert_almost_equal(labels, ex_labels)
 
+    def test_qcut(self):
+        # qcut into 4 quantiles must agree with cut() on explicit quartile edges
+        data = np.random.randn(1000)
+
+        result_labels, result_bins = qcut(data, 4, retbins=True)
+
+        expected_bins = quantile(data, [0, .25, .5, .75, 1.])
+        assert_almost_equal(result_bins, expected_bins)
+
+        expected_labels = cut(data, expected_bins)
+
+        self.assert_(np.array_equal(result_labels, expected_labels))
+
+
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'],
                    exit=False)
diff --git a/pandas/tools/tile.py b/pandas/tools/tile.py
@@ -3,6 +3,7 @@
 """
 
 from pandas.core.api import DataFrame, Series
+import pandas.core.algorithms as algos
 import pandas.core.common as com
 import pandas.core.nanops as nanops
 
@@ -92,13 +93,56 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
         if (np.diff(bins) < 0).any():
             raise ValueError('bins must increase monotonically.')
 
+    return _bins_to_cuts(x, bins, right=right, labels=labels,
+                         retbins=retbins, precision=precision)
+
+
+
+def qcut(x, q=4, labels=None, retbins=False, precision=3):
+    """
+    Quantile-based discretization function. Discretize variable into
+    equal-sized buckets based on sample quantiles. For example 1000 values
+    cut at 10 quantiles are split into 10 bins at the sample deciles.
+
+    Parameters
+    ----------
+    x : ndarray or Series
+    q : integer or array of quantiles
+        Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
+        array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles
+    labels : array or boolean, default None
+        Labels to use for bin edges, or False to return integer bin labels
+    retbins : bool, optional
+        Whether to return the bins or not. Can be useful if q is given
+        as an integer.
+    precision : int, default 3
+        Precision forwarded to the binning helper
+
+    Returns
+    -------
+    Result of cutting `x` at the quantile edges; (labels, bins) if retbins
+
+    Examples
+    --------
+    >>> labels, bins = qcut(np.random.randn(1000), 4, retbins=True)
+    """
+    if com.is_integer(q):
+        quantiles = np.linspace(0, 1, q + 1)
+    else:
+        quantiles = q
+    bins = algos.quantile(x, quantiles)
+    return _bins_to_cuts(x, bins, labels=labels, retbins=retbins,
+                         precision=precision)
+
+
+def _bins_to_cuts(x, bins, right=True, labels=None, retbins=False,
+                  precision=3):
     side = 'left' if right else 'right'
     ids = bins.searchsorted(x, side=side)
 
     mask = com.isnull(x)
     has_nas = mask.any()
 
-
     if labels is not False:
         if labels is None:
             labels = bins
@@ -132,35 +176,6 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3):
     return labels, bins
 
 
-def qcut(x, n, ties_method='average'):
-    """
-    Quantile-based discretization function. Discretize variable into
-    equal-sized buckets based on rank. For example 1000 values for 10 quantiles
-    would produce 1000 integers from 0 to 9 indicating the
-
-    Parameters
-    ----------
-    x : ndarray or Series
-    n : integer
-        Number of quantiles. 10 for deciles, 4 for quartiles, etc.
-    ties_method : {'average', 'min', 'max', 'first'}, default 'average'
-        average: average rank of group
-        min: lowest rank in group
-        max: highest rank in group
-        first: ranks assigned in order they appear in the array
-
-    Returns
-    -------
-
-    Notes
-    -----
-
-    Examples
-    --------
-    """
-    pass
-
-
 def _format_label(x, precision=3):
     fmt_str = '%%.%dg' % precision
     if com.is_float(x):
diff --git a/scripts/count_code.sh b/scripts/count_code.sh
@@ -1 +1 @@
-cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c|generated.c"
+cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c|generated.c|plib.c"

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c\|sandbox.c\|engines.c\|sparse.c\|generated.c"`
	`1`	`+cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c\|sandbox.c\|engines.c\|sparse.c\|generated.c\|plib.c"`