|
| 1 | +""" |
| 2 | +Shipping functions from SciPy to reduce dependency on having SciPy installed |
| 3 | +""" |
| 4 | + |
| 5 | +import numpy as np |
| 6 | + |
| 7 | + |
def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'):
    """
    Calculate the score at the given `per` percentile of the sequence `a`.

    For example, the score at `per=50` is the median. If the desired quantile
    lies between two data points, the result is chosen according to
    `interpolation_method`. If `limit` is provided, it should be a tuple
    (lower, upper) of two values; values of `a` outside this (closed)
    interval are ignored.

    Parameters
    ----------
    a : array_like
        Values from which to extract score. They are sorted along axis 0.
    per : scalar
        Percentile at which to extract score.
    limit : tuple, optional
        Tuple of two scalars, the lower and upper limits within which to
        compute the percentile.
    interpolation_method : {'fraction', 'lower', 'higher'}, optional
        Specifies what to return when the desired quantile lies between
        two data points `i` and `j`:

        - fraction: `i + (j - i)*fraction`, where `fraction` is the
          fractional part of the index surrounded by `i` and `j`.
        - lower: `i`.
        - higher: `j`.

    Returns
    -------
    score : float
        Score at percentile.

    Raises
    ------
    ValueError
        If the quantile falls between two data points and
        `interpolation_method` is not one of the supported values.

    See Also
    --------
    percentileofscore

    Examples
    --------
    >>> import numpy as np
    >>> a = np.arange(100)
    >>> scoreatpercentile(a, 50)
    49.5

    """
    # TODO: this should be a simple wrapper around a well-written quantile
    # function. GNU R provides 9 quantile algorithms (!), with differing
    # behaviour at, for example, discontinuities.
    values = np.sort(a, axis=0)
    if limit:
        values = values[(limit[0] <= values) & (values <= limit[1])]

    # Fractional position of the requested percentile in the sorted data.
    idx = per / 100. * (values.shape[0] - 1)

    # NumPy only accepts integer indices, so every lookup below converts
    # the (float) position explicitly before indexing.
    if idx % 1 == 0:
        return values[int(idx)]

    if interpolation_method == 'fraction':
        below = values[int(idx)]
        above = values[int(idx) + 1]
        # Linear interpolation between the two neighbouring data points.
        return below + (above - below) * (idx % 1)
    elif interpolation_method == 'lower':
        return values[int(np.floor(idx))]
    elif interpolation_method == 'higher':
        return values[int(np.ceil(idx))]

    raise ValueError("interpolation_method can only be 'fraction', "
                     "'lower' or 'higher'")
| 82 | + |
| 83 | + |
def _interpolate(a, b, fraction):
    """Linearly interpolate between `a` and `b`.

    `fraction` is expected to lie between 0 and 1: 0 yields `a`,
    1 yields `b`.
    """
    span = b - a
    return a + span*fraction
| 89 | + |
| 90 | + |
def rankdata(a):
    """
    Ranks the data, dealing with ties appropriately.

    Equal values are assigned a rank that is the average of the ranks that
    would have been otherwise assigned to all of the values within that set.
    Ranks begin at 1, not 0.

    Parameters
    ----------
    a : array_like
        This array is first flattened.

    Returns
    -------
    rankdata : ndarray
        An array of length equal to the size of `a`, containing rank scores.

    Examples
    --------
    >>> rankdata([0, 2, 2, 3])
    array([ 1. ,  2.5,  2.5,  4. ])

    """
    a = np.ravel(a)
    n = len(a)
    # `order[k]` is the position in `a` of the k-th smallest value.
    order = np.argsort(a)
    sorted_vals = a[order]
    ranks = np.zeros(n, float)

    sumranks = 0   # sum of the 0-based sorted positions in the current tie run
    dupcount = 0   # number of elements in the current tie run
    for i in range(n):
        sumranks += i
        dupcount += 1
        # A tie run ends at the last element or when the next value differs.
        if i == n - 1 or sorted_vals[i] != sorted_vals[i + 1]:
            # Average 0-based position of the run, shifted to 1-based ranks.
            averank = sumranks / float(dupcount) + 1
            ranks[order[i - dupcount + 1:i + 1]] = averank
            sumranks = 0
            dupcount = 0
    return ranks
| 131 | + |
| 132 | + |
def fastsort(a):
    """
    Sort an array and provide the argsort.

    Parameters
    ----------
    a : array_like
        Input array. It is converted with `np.asarray`, so plain sequences
        such as lists are accepted.

    Returns
    -------
    sorted_a : ndarray
        The values of `a` reordered by the indices in `indices`.
    indices : ndarray of type int
        The result of `np.argsort(a)`, i.e. indices into `a` such that
        `a[indices]` equals `sorted_a`.

    """
    # Coerce first: indexing a plain list with an index array fails.
    a = np.asarray(a)
    indices = np.argsort(a)
    return a[indices], indices
| 152 | + |
| 153 | + |
def percentileofscore(a, score, kind='rank'):
    """
    The percentile rank of a score relative to a list of scores.

    A `percentileofscore` of, for example, 80% means that 80% of the
    scores in `a` are below the given score. In the case of gaps or
    ties, the exact definition depends on the optional keyword, `kind`.

    Parameters
    ----------
    a : array_like
        Array of scores to which `score` is compared.
    score : int or float
        Score that is compared to the elements in `a`.
    kind : {'rank', 'weak', 'strict', 'mean'}, optional
        This optional parameter specifies the interpretation of the
        resulting score:

        - "rank": Average percentage ranking of score. In case of
                  multiple matches, average the percentage rankings of
                  all matching scores.
        - "weak": This kind corresponds to the definition of a cumulative
                  distribution function. A percentileofscore of 80%
                  means that 80% of values are less than or equal
                  to the provided score.
        - "strict": Similar to "weak", except that only values that are
                    strictly less than the given score are counted.
        - "mean": The average of the "weak" and "strict" scores, often
                  used in testing. See

                  http://en.wikipedia.org/wiki/Percentile_rank

    Returns
    -------
    pcos : float
        Percentile-position of score (0-100) relative to `a`.

    Raises
    ------
    ValueError
        If `kind` is not one of 'rank', 'strict', 'weak' or 'mean'.

    Examples
    --------
    Three-quarters of the given values lie below a given score:

    >>> percentileofscore([1, 2, 3, 4], 3)
    75.0

    With multiple matches, note how the scores of the two matches, 0.6
    and 0.8 respectively, are averaged:

    >>> percentileofscore([1, 2, 3, 3, 4], 3)
    70.0

    Only 2/5 values are strictly less than 3:

    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='strict')
    40.0

    But 4/5 values are less than or equal to 3:

    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='weak')
    80.0

    The average between the weak and the strict scores is

    >>> percentileofscore([1, 2, 3, 3, 4], 3, kind='mean')
    60.0

    """
    a = np.array(a)
    # Number of scores supplied by the caller; this stays the denominator
    # even when `score` is temporarily appended below.
    n = len(a)

    if kind == 'rank':
        if not np.any(a == score):
            # `score` is absent: insert it and use 0-based positions, so the
            # result is the share of original values below it.
            a = np.append(a, score)
            positions = np.arange(len(a))
        else:
            # `score` is present: use 1-based positions (as floats).
            positions = np.arange(len(a)) + 1.0

        a = np.sort(a)
        # Plain boolean mask; wrapping it in a list would make NumPy treat
        # it as a 2-D index and fail on the 1-D `positions` array.
        matches = a == score
        return (np.mean(positions[matches]) / n) * 100.0

    elif kind == 'strict':
        return np.count_nonzero(a < score) / float(n) * 100
    elif kind == 'weak':
        return np.count_nonzero(a <= score) / float(n) * 100
    elif kind == 'mean':
        below = np.count_nonzero(a < score)
        below_or_equal = np.count_nonzero(a <= score)
        return (below + below_or_equal) * 50 / float(n)
    else:
        raise ValueError("kind can only be 'rank', 'strict', 'weak' or 'mean'")
0 commit comments