ENH: stats.scoreatpercentile

jjhelmus · jjhelmus · commit 1cdd08b62f52 · 2012-12-05T12:17:21.000-05:00
stats.scoreatpercentile now accepts a sequence of percentiles and an `axis`
parameter. The implementation is adapted from NumPy's percentile function.
diff --git a/scipy/stats/stats.py b/scipy/stats/stats.py
@@ -1404,13 +1404,8 @@ def itemfreq(a):
     return array(_support.abut(scores, freq))
 
 
-def _interpolate(a, b, fraction):
-    """Returns the point at the given fraction between a and b, where
-    'fraction' must be between 0 and 1.
-    """
-    return a + (b - a)*fraction;
-
-def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'):
+def scoreatpercentile(a, per, limit=(), interpolation_method='fraction',
+        axis=None):
     """
     Calculate the score at the given `per` percentile of the sequence `a`.
 
@@ -1428,9 +1423,9 @@ def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'):
 
     Parameters
     ----------
-    a : ndarray
+    a : array_like
         Values from which to extract score.
-    per : scalar
+    per : scalar in range [0,100] (or sequence of floats)
         Percentile at which to extract score.
     limit : tuple, optional
         Tuple of two scalars, the lower and upper limits within which to
@@ -1443,10 +1438,13 @@ def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'):
                     fractional part of the index surrounded by `i` and `j`.
         -lower: `i`.
         - higher: `j`.
+    axis : int, optional
+        Axis along which the percentiles are computed. The default (None)
+        is to compute the percentile(s) over a flattened version of the array.
 
     Returns
     -------
-    score : float
+    score : float (or sequence of floats)
         Score at percentile.
 
     See Also
@@ -1461,29 +1459,64 @@ def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'):
     49.5
 
     """
-    # TODO: this should be a simple wrapper around a well-written quantile
-    # function.  GNU R provides 9 quantile algorithms (!), with differing
-    # behaviour at, for example, discontinuities.
-    values = np.sort(a, axis=0)
+    # adapted from NumPy's percentile function
+    a = np.asarray(a)
+
     if limit:
-        values = values[(limit[0] <= values) & (values <= limit[1])]
+        a = a[(limit[0] <= a) & (a <= limit[1])]
 
-    idx = per /100. * (values.shape[0] - 1)
-    if (idx % 1 == 0):
-        score = values[idx]
-    else:
-        if interpolation_method == 'fraction':
-            score = _interpolate(values[int(idx)], values[int(idx) + 1],
-                                 idx % 1)
-        elif interpolation_method == 'lower':
-            score = values[np.floor(idx)]
+    if per == 0:
+        return a.min(axis=axis)
+    elif per == 100:
+        return a.max(axis=axis)
+
+    sorted = np.sort(a, axis=axis)
+    if axis is None:
+        axis = 0
+
+    return _compute_qth_percentile(sorted, per, interpolation_method, axis)
+
+
+# Handles a sequence of percentiles without sorting the data more than once.
+def _compute_qth_percentile(sorted, per, interpolation_method, axis):
+    if not np.isscalar(per):
+        return [_compute_qth_percentile(sorted, i, interpolation_method, axis)
+             for i in per]
+
+    if (per < 0) or (per > 100):
+        raise ValueError("percentile must be in the range [0, 100]")
+
+    indexer = [slice(None)] * sorted.ndim
+    idx = per / 100. * (sorted.shape[axis] - 1)
+
+    if int(idx) != idx:
+        # round fractional indices according to interpolation method
+        if interpolation_method == 'lower':
+            idx = np.floor(idx)
         elif interpolation_method == 'higher':
-            score = values[np.ceil(idx)]
+            idx = np.ceil(idx)
+        elif interpolation_method == 'fraction':
+            pass  # keep idx as fraction and interpolate
         else:
             raise ValueError("interpolation_method can only be 'fraction', " \
                              "'lower' or 'higher'")
 
-    return score
+    i = int(idx)
+    if i == idx:
+        indexer[axis] = slice(i, i + 1)
+        weights = array(1)
+        sumval = 1.0
+    else:
+        indexer[axis] = slice(i, i + 2)
+        j = i + 1
+        weights = array([(j - idx), (idx - i)], float)
+        wshape = [1] * sorted.ndim
+        wshape[axis] = 2
+        weights.shape = wshape
+        sumval = weights.sum()
+
+    # Use np.add.reduce to coerce data type
+    return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
 
 
 def percentileofscore(a, score, kind='rank'):
diff --git a/scipy/stats/tests/test_stats.py b/scipy/stats/tests/test_stats.py
@@ -993,9 +993,9 @@ def test_2D_array_dim1(self):
 
 class TestScoreatpercentile(TestCase):
     def setUp(self):
-        self.a1 = [3,4,5,10,-3,-5,6]
-        self.a2 = [3,-6,-2,8,7,4,2,1]
-        self.a3 = [3.,4,5,10,-3,-5,-6,7.0]
+        self.a1 = [3, 4, 5, 10, -3, -5, 6]
+        self.a2 = [3, -6, -2, 8, 7, 4, 2, 1]
+        self.a3 = [3., 4, 5, 10, -3, -5, -6, 7.0]
 
     def test_basic(self):
         x = arange(8) * 0.5
@@ -1009,33 +1009,29 @@ def test_2D(self):
                    [4, 4, 3],
                    [1, 1, 1],
                    [1, 1, 1]])
-        assert_array_equal(stats.scoreatpercentile(x, 50), [1,1,1])
+        assert_array_equal(stats.scoreatpercentile(x, 50), [1, 1, 1])
 
     def test_fraction(self):
         scoreatperc = stats.scoreatpercentile
 
         # Test defaults
         assert_equal(scoreatperc(range(10), 50), 4.5)
-        assert_equal(scoreatperc(range(10), 50, (2,7)), 4.5)
+        assert_equal(scoreatperc(range(10), 50, (2, 7)), 4.5)
         assert_equal(scoreatperc(range(100), 50, limit=(1, 8)), 4.5)
-        assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (10,100)), 55)
-        assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (1,10)), 5.5)
+        assert_equal(scoreatperc(np.array([1, 10, 100]), 50, (10, 100)), 55)
+        assert_equal(scoreatperc(np.array([1, 10, 100]), 50, (1, 10)), 5.5)
 
         # explicitly specify interpolation_method 'fraction' (the default)
-        assert_equal(scoreatperc(range(10), 50, interpolation_method='fraction'),
-                     4.5)
+        assert_equal(scoreatperc(range(10), 50,
+                                interpolation_method='fraction'), 4.5)
         assert_equal(scoreatperc(range(10), 50, limit=(2, 7),
-                                 interpolation_method='fraction'),
-                     4.5)
+                                 interpolation_method='fraction'), 4.5)
         assert_equal(scoreatperc(range(100), 50, limit=(1, 8),
-                                 interpolation_method='fraction'),
-                     4.5)
-        assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (10, 100),
-                                 interpolation_method='fraction'),
-                     55)
-        assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (1,10),
-                                 interpolation_method='fraction'),
-                     5.5)
+                                 interpolation_method='fraction'), 4.5)
+        assert_equal(scoreatperc(np.array([1, 10, 100]), 50, (10, 100),
+                                 interpolation_method='fraction'), 55)
+        assert_equal(scoreatperc(np.array([1, 10, 100]), 50, (1, 10),
+                                 interpolation_method='fraction'), 5.5)
 
     def test_lower_higher(self):
         scoreatperc = stats.scoreatpercentile
@@ -1045,23 +1041,45 @@ def test_lower_higher(self):
                                  interpolation_method='lower'), 4)
         assert_equal(scoreatperc(range(10), 50,
                                  interpolation_method='higher'), 5)
-        assert_equal(scoreatperc(range(10), 50, (2,7),
+        assert_equal(scoreatperc(range(10), 50, (2, 7),
                                  interpolation_method='lower'), 4)
-        assert_equal(scoreatperc(range(10), 50, limit=(2,7),
+        assert_equal(scoreatperc(range(10), 50, limit=(2, 7),
                                  interpolation_method='higher'), 5)
-        assert_equal(scoreatperc(range(100), 50, (1,8),
+        assert_equal(scoreatperc(range(100), 50, (1, 8),
                                  interpolation_method='lower'), 4)
-        assert_equal(scoreatperc(range(100), 50, (1,8),
+        assert_equal(scoreatperc(range(100), 50, (1, 8),
                                  interpolation_method='higher'), 5)
-        assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (10,100),
+        assert_equal(scoreatperc(np.array([1, 10, 100]), 50, (10, 100),
                                  interpolation_method='lower'), 10)
-        assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, limit=(10, 100),
+        assert_equal(scoreatperc(np.array([1, 10, 100]), 50, limit=(10, 100),
                                  interpolation_method='higher'), 100)
-        assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, (1,10),
+        assert_equal(scoreatperc(np.array([1, 10, 100]), 50, (1, 10),
                                  interpolation_method='lower'), 1)
-        assert_equal(scoreatperc(np.array([1, 10 ,100]), 50, limit=(1,10),
+        assert_equal(scoreatperc(np.array([1, 10, 100]), 50, limit=(1, 10),
                                  interpolation_method='higher'), 10)
 
+    def test_sequence(self):
+        x = arange(8) * 0.5
+        assert_equal(stats.scoreatpercentile(x, [0, 100, 50]), [0, 3.5, 1.75])
+
+    def test_axis(self):
+        scoreatperc = stats.scoreatpercentile
+        x = arange(12).reshape(3, 4)
+
+        assert_equal(scoreatperc(x, (25, 50, 100)), [2.75, 5.5, 11.0])
+
+        r0 = [[2, 3, 4, 5], [4, 5, 6, 7], [8, 9, 10, 11]]
+        assert_equal(scoreatperc(x, (25, 50, 100), axis=0), r0)
+
+        r1 = [[0.75, 4.75, 8.75], [1.5, 5.5, 9.5], [3, 7, 11]]
+        assert_equal(scoreatperc(x, (25, 50, 100), axis=1), r1)
+
+    def test_exception(self):
+        assert_raises(ValueError, stats.scoreatpercentile, [1, 2], 56,
+            interpolation_method='foobar')
+        assert_raises(ValueError, stats.scoreatpercentile, [1], 101)
+        assert_raises(ValueError, stats.scoreatpercentile, [1], -1)
+
 
 class TestCMedian(TestCase):
     def test_basic(self):