From 6be8784b0e2e27757c6e99d8211b98c38c7dc17f Mon Sep 17 00:00:00 2001 From: Garrett Drapala Date: Sat, 5 Apr 2014 09:48:43 -0400 Subject: [PATCH 1/2] CLN: replace pandas.compat.scipy.scoreatpercentile with numpy.percentile --- pandas/compat/scipy.py | 82 ------------------------- pandas/core/frame.py | 2 +- pandas/core/series.py | 2 +- pandas/tests/test_frame.py | 6 +- pandas/tests/test_groupby.py | 16 ++--- pandas/tests/test_series.py | 8 +-- pandas/tseries/tests/test_timedeltas.py | 2 +- 7 files changed, 18 insertions(+), 100 deletions(-) diff --git a/pandas/compat/scipy.py b/pandas/compat/scipy.py index 81601ffe25609..06da8799d0c96 100644 --- a/pandas/compat/scipy.py +++ b/pandas/compat/scipy.py @@ -6,88 +6,6 @@ import numpy as np -def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'): - """Calculate the score at the given `per` percentile of the sequence `a`. - - For example, the score at `per=50` is the median. If the desired quantile - lies between two data points, we interpolate between them, according to - the value of `interpolation`. If the parameter `limit` is provided, it - should be a tuple (lower, upper) of two values. Values of `a` outside - this (closed) interval will be ignored. - - The `interpolation_method` parameter supports three values, namely - `fraction` (default), `lower` and `higher`. Interpolation is done only, - if the desired quantile lies between two data points `i` and `j`. For - `fraction`, the result is an interpolated value between `i` and `j`; - for `lower`, the result is `i`, for `higher` the result is `j`. - - Parameters - ---------- - a : ndarray - Values from which to extract score. - per : scalar - Percentile at which to extract score. - limit : tuple, optional - Tuple of two scalars, the lower and upper limits within which to - compute the percentile. 
- interpolation_method : {'fraction', 'lower', 'higher'}, optional - This optional parameter specifies the interpolation method to use, - when the desired quantile lies between two data points `i` and `j`: - - - fraction: `i + (j - i)*fraction`, where `fraction` is the - fractional part of the index surrounded by `i` and `j`. - - lower: `i`. - - higher: `j`. - - Returns - ------- - score : float - Score at percentile. - - See Also - -------- - percentileofscore - - Examples - -------- - >>> from scipy import stats - >>> a = np.arange(100) - >>> stats.scoreatpercentile(a, 50) - 49.5 - - """ - # TODO: this should be a simple wrapper around a well-written quantile - # function. GNU R provides 9 quantile algorithms (!), with differing - # behaviour at, for example, discontinuities. - values = np.sort(a, axis=0) - if limit: - values = values[(limit[0] <= values) & (values <= limit[1])] - - idx = per / 100. * (values.shape[0] - 1) - if idx % 1 == 0: - score = values[idx] - else: - if interpolation_method == 'fraction': - score = _interpolate(values[int(idx)], values[int(idx) + 1], - idx % 1) - elif interpolation_method == 'lower': - score = values[np.floor(idx)] - elif interpolation_method == 'higher': - score = values[np.ceil(idx)] - else: - raise ValueError("interpolation_method can only be 'fraction', " - "'lower' or 'higher'") - - return score - - -def _interpolate(a, b, fraction): - """Returns the point at the given fraction between a and b, where - 'fraction' must be between 0 and 1. - """ - return a + (b - a) * fraction - - def rankdata(a): """ Ranks the data, dealing with ties appropriately. 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a00b729f1735a..2f8c70024a1e7 100755 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -38,7 +38,7 @@ import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval from pandas.computation.scope import _ensure_scope -from pandas.compat.scipy import scoreatpercentile as _quantile +from numpy import percentile as _quantile from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u, OrderedDict, raise_with_traceback) from pandas import compat diff --git a/pandas/core/series.py b/pandas/core/series.py index 4d32481e30e55..ccbd15978548e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -52,7 +52,7 @@ import pandas.tslib as tslib import pandas.index as _index -from pandas.compat.scipy import scoreatpercentile as _quantile +from numpy import percentile as _quantile from pandas.core.config import get_option __all__ = ['Series'] diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index f273c794a7f05..01b42457e72f5 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -10915,13 +10915,13 @@ def wrapper(x): check_dtype=False, check_dates=True) def test_quantile(self): - from pandas.compat.scipy import scoreatpercentile + from numpy import percentile q = self.tsframe.quantile(0.1, axis=0) - self.assertEqual(q['A'], scoreatpercentile(self.tsframe['A'], 10)) + self.assertEqual(q['A'], percentile(self.tsframe['A'], 10)) q = self.tsframe.quantile(0.9, axis=1) q = self.intframe.quantile(0.1) - self.assertEqual(q['A'], scoreatpercentile(self.intframe['A'], 10)) + self.assertEqual(q['A'], percentile(self.intframe['A'], 10)) # test degenerate case q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index a7f7223172848..c0b7425485cba 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -1907,17 +1907,17 @@ def 
test_groupby_with_hier_columns(self): self.assert_(result.columns.equals(df.columns[:-1])) def test_pass_args_kwargs(self): - from pandas.compat.scipy import scoreatpercentile + from numpy import percentile - def f(x, q=None): - return scoreatpercentile(x, q) - g = lambda x: scoreatpercentile(x, 80) + def f(x, q=None, axis=0): + return percentile(x, q, axis=axis) + g = lambda x: percentile(x, 80, axis=0) # Series ts_grouped = self.ts.groupby(lambda x: x.month) - agg_result = ts_grouped.agg(scoreatpercentile, 80) - apply_result = ts_grouped.apply(scoreatpercentile, 80) - trans_result = ts_grouped.transform(scoreatpercentile, 80) + agg_result = ts_grouped.agg(percentile, 80, axis=0) + apply_result = ts_grouped.apply(percentile, 80, axis=0) + trans_result = ts_grouped.transform(percentile, 80, axis=0) agg_expected = ts_grouped.quantile(.8) trans_expected = ts_grouped.transform(g) @@ -1935,7 +1935,7 @@ def f(x, q=None): # DataFrame df_grouped = self.tsframe.groupby(lambda x: x.month) - agg_result = df_grouped.agg(scoreatpercentile, 80) + agg_result = df_grouped.agg(percentile, 80, axis=0) apply_result = df_grouped.apply(DataFrame.quantile, .8) expected = df_grouped.quantile(.8) assert_frame_equal(apply_result, expected) diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 744a020347af9..44a3301931439 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2137,17 +2137,17 @@ def test_prod_numpy16_bug(self): self.assertNotIsInstance(result, Series) def test_quantile(self): - from pandas.compat.scipy import scoreatpercentile + from numpy import percentile q = self.ts.quantile(0.1) - self.assertEqual(q, scoreatpercentile(self.ts.valid(), 10)) + self.assertEqual(q, percentile(self.ts.valid(), 10)) q = self.ts.quantile(0.9) - self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90)) + self.assertEqual(q, percentile(self.ts.valid(), 90)) # object dtype q = Series(self.ts,dtype=object).quantile(0.9) - self.assertEqual(q, 
scoreatpercentile(self.ts.valid(), 90)) + self.assertEqual(q, percentile(self.ts.valid(), 90)) def test_describe(self): _ = self.series.describe() diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py index 57d8bf5623a78..215e6e62c685e 100644 --- a/pandas/tseries/tests/test_timedeltas.py +++ b/pandas/tseries/tests/test_timedeltas.py @@ -240,7 +240,7 @@ def test_timedelta_ops(self): result = td.quantile(.1) # This properly returned a scalar. - expected = to_timedelta('00:00:02.6') + expected = np.timedelta64(2599999999,'ns') tm.assert_almost_equal(result, expected) result = td.median()[0] From 9d89f513b5d6eb61941a162040f4694b8f03ffca Mon Sep 17 00:00:00 2001 From: Garrett Drapala Date: Sat, 5 Apr 2014 10:44:35 -0400 Subject: [PATCH 2/2] CLN/TST: return Timestamp for .quantile on datetime64[ns] series --- doc/source/release.rst | 4 ++++ pandas/core/series.py | 9 +++++---- pandas/tests/test_series.py | 11 +++++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index fb4f06ac03ff9..cc8e271d62183 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -172,6 +172,10 @@ API Changes (and numpy defaults) - add ``inplace`` keyword to ``Series.order/sort`` to make them inverses (:issue:`6859`) +- Replace ``pandas.compat.scipy.scoreatpercentile`` with ``numpy.percentile`` (:issue:`6810`) +- ``.quantile`` on a ``datetime64[ns]`` series now returns ``Timestamp`` instead + of ``np.datetime64`` objects (:issue:`6810`) + Deprecations ~~~~~~~~~~~~ diff --git a/pandas/core/series.py b/pandas/core/series.py index ccbd15978548e..6172f87ead246 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1235,10 +1235,11 @@ def quantile(self, q=0.5): valid_values = self.dropna().values if len(valid_values) == 0: return pa.NA - result = _quantile(valid_values, q * 100) - if not np.isscalar and com.is_timedelta64_dtype(result): - from pandas.tseries.timedeltas import
to_timedelta - return to_timedelta(result) + if com.is_datetime64_dtype(self): + values = _values_from_object(self).view('i8') + result = lib.Timestamp(_quantile(values, q * 100)) + else: + result = _quantile(valid_values, q * 100) return result diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 44a3301931439..d1775177d3c1d 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -2149,6 +2149,17 @@ def test_quantile(self): q = Series(self.ts,dtype=object).quantile(0.9) self.assertEqual(q, percentile(self.ts.valid(), 90)) + # datetime64[ns] dtype + dts = self.ts.index.to_series() + q = dts.quantile(.2) + self.assertEqual(q, Timestamp('2000-01-10 19:12:00')) + + if not _np_version_under1p7: + # timedelta64[ns] dtype + tds = dts.diff() + q = tds.quantile(.25) + self.assertEqual(q, pd.to_timedelta('24:00:00')) + def test_describe(self): _ = self.series.describe() _ = self.ts.describe()