Merge pull request #6810 from gdraps/replace-scoreatpercentile

jreback · jreback · commit f30278e76e7f · 2014-04-16T09:04:41.000-04:00
CLN: replace pandas.compat.scipy.scoreatpercentile with numpy.percentile
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -172,6 +172,10 @@ API Changes
   (and numpy defaults)
 - add ``inplace`` keyword to ``Series.order/sort`` to make them inverses (:issue:`6859`)
 
+- Replace ``pandas.compat.scipy.scoreatpercentile`` with ``numpy.percentile`` (:issue:`6810`)
+- ``.quantile`` on a ``datetime64[ns]`` series now returns ``Timestamp`` instead
+  of ``np.datetime64`` objects (:issue:`6810`)
+
 Deprecations
 ~~~~~~~~~~~~
 
diff --git a/pandas/compat/scipy.py b/pandas/compat/scipy.py
@@ -6,88 +6,6 @@
 import numpy as np
 
 
-def scoreatpercentile(a, per, limit=(), interpolation_method='fraction'):
-    """Calculate the score at the given `per` percentile of the sequence `a`.
-
-    For example, the score at `per=50` is the median. If the desired quantile
-    lies between two data points, we interpolate between them, according to
-    the value of `interpolation`. If the parameter `limit` is provided, it
-    should be a tuple (lower, upper) of two values. Values of `a` outside
-    this (closed) interval will be ignored.
-
-    The `interpolation_method` parameter supports three values, namely
-    `fraction` (default), `lower` and `higher`. Interpolation is done only,
-    if the desired quantile lies between two data points `i` and `j`. For
-    `fraction`, the result is an interpolated value between `i` and `j`;
-    for `lower`, the result is `i`, for `higher` the result is `j`.
-
-    Parameters
-    ----------
-    a : ndarray
-        Values from which to extract score.
-    per : scalar
-        Percentile at which to extract score.
-    limit : tuple, optional
-        Tuple of two scalars, the lower and upper limits within which to
-        compute the percentile.
-    interpolation_method : {'fraction', 'lower', 'higher'}, optional
-        This optional parameter specifies the interpolation method to use,
-        when the desired quantile lies between two data points `i` and `j`:
-
-        - fraction: `i + (j - i)*fraction`, where `fraction` is the
-                    fractional part of the index surrounded by `i` and `j`.
-        - lower: `i`.
-        - higher: `j`.
-
-    Returns
-    -------
-    score : float
-        Score at percentile.
-
-    See Also
-    --------
-    percentileofscore
-
-    Examples
-    --------
-    >>> from scipy import stats
-    >>> a = np.arange(100)
-    >>> stats.scoreatpercentile(a, 50)
-    49.5
-
-    """
-    # TODO: this should be a simple wrapper around a well-written quantile
-    # function.  GNU R provides 9 quantile algorithms (!), with differing
-    # behaviour at, for example, discontinuities.
-    values = np.sort(a, axis=0)
-    if limit:
-        values = values[(limit[0] <= values) & (values <= limit[1])]
-
-    idx = per / 100. * (values.shape[0] - 1)
-    if idx % 1 == 0:
-        score = values[idx]
-    else:
-        if interpolation_method == 'fraction':
-            score = _interpolate(values[int(idx)], values[int(idx) + 1],
-                                 idx % 1)
-        elif interpolation_method == 'lower':
-            score = values[np.floor(idx)]
-        elif interpolation_method == 'higher':
-            score = values[np.ceil(idx)]
-        else:
-            raise ValueError("interpolation_method can only be 'fraction', "
-                             "'lower' or 'higher'")
-
-    return score
-
-
-def _interpolate(a, b, fraction):
-    """Returns the point at the given fraction between a and b, where
-    'fraction' must be between 0 and 1.
-    """
-    return a + (b - a) * fraction
-
-
 def rankdata(a):
     """
     Ranks the data, dealing with ties appropriately.
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -38,7 +38,7 @@
 import pandas.computation.expressions as expressions
 from pandas.computation.eval import eval as _eval
 from pandas.computation.scope import _ensure_scope
-from pandas.compat.scipy import scoreatpercentile as _quantile
+from numpy import percentile as _quantile
 from pandas.compat import(range, zip, lrange, lmap, lzip, StringIO, u,
                           OrderedDict, raise_with_traceback)
 from pandas import compat
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -52,7 +52,7 @@
 import pandas.tslib as tslib
 import pandas.index as _index
 
-from pandas.compat.scipy import scoreatpercentile as _quantile
+from numpy import percentile as _quantile
 from pandas.core.config import get_option
 
 __all__ = ['Series']
@@ -1235,10 +1235,11 @@ def quantile(self, q=0.5):
         valid_values = self.dropna().values
         if len(valid_values) == 0:
             return pa.NA
-        result = _quantile(valid_values, q * 100)
-        if not np.isscalar and com.is_timedelta64_dtype(result):
-            from pandas.tseries.timedeltas import to_timedelta
-            return to_timedelta(result)
+        if com.is_datetime64_dtype(self):
+            values = _values_from_object(self).view('i8')
+            result = lib.Timestamp(_quantile(values, q * 100))
+        else:
+            result = _quantile(valid_values, q * 100)
 
         return result
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -10915,13 +10915,13 @@ def wrapper(x):
                             check_dtype=False, check_dates=True)
 
     def test_quantile(self):
-        from pandas.compat.scipy import scoreatpercentile
+        from numpy import percentile
 
         q = self.tsframe.quantile(0.1, axis=0)
-        self.assertEqual(q['A'], scoreatpercentile(self.tsframe['A'], 10))
+        self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
         q = self.tsframe.quantile(0.9, axis=1)
         q = self.intframe.quantile(0.1)
-        self.assertEqual(q['A'], scoreatpercentile(self.intframe['A'], 10))
+        self.assertEqual(q['A'], percentile(self.intframe['A'], 10))
 
         # test degenerate case
         q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
@@ -1907,17 +1907,17 @@ def test_groupby_with_hier_columns(self):
         self.assert_(result.columns.equals(df.columns[:-1]))
 
     def test_pass_args_kwargs(self):
-        from pandas.compat.scipy import scoreatpercentile
+        from numpy import percentile
 
-        def f(x, q=None):
-            return scoreatpercentile(x, q)
-        g = lambda x: scoreatpercentile(x, 80)
+        def f(x, q=None, axis=0):
+            return percentile(x, q, axis=axis)
+        g = lambda x: percentile(x, 80, axis=0)
 
         # Series
         ts_grouped = self.ts.groupby(lambda x: x.month)
-        agg_result = ts_grouped.agg(scoreatpercentile, 80)
-        apply_result = ts_grouped.apply(scoreatpercentile, 80)
-        trans_result = ts_grouped.transform(scoreatpercentile, 80)
+        agg_result = ts_grouped.agg(percentile, 80, axis=0)
+        apply_result = ts_grouped.apply(percentile, 80, axis=0)
+        trans_result = ts_grouped.transform(percentile, 80, axis=0)
 
         agg_expected = ts_grouped.quantile(.8)
         trans_expected = ts_grouped.transform(g)
@@ -1935,7 +1935,7 @@ def f(x, q=None):
 
         # DataFrame
         df_grouped = self.tsframe.groupby(lambda x: x.month)
-        agg_result = df_grouped.agg(scoreatpercentile, 80)
+        agg_result = df_grouped.agg(percentile, 80, axis=0)
         apply_result = df_grouped.apply(DataFrame.quantile, .8)
         expected = df_grouped.quantile(.8)
         assert_frame_equal(apply_result, expected)
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -2137,17 +2137,28 @@ def test_prod_numpy16_bug(self):
         self.assertNotIsInstance(result, Series)
 
     def test_quantile(self):
-        from pandas.compat.scipy import scoreatpercentile
+        from numpy import percentile
 
         q = self.ts.quantile(0.1)
-        self.assertEqual(q, scoreatpercentile(self.ts.valid(), 10))
+        self.assertEqual(q, percentile(self.ts.valid(), 10))
 
         q = self.ts.quantile(0.9)
-        self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90))
+        self.assertEqual(q, percentile(self.ts.valid(), 90))
 
         # object dtype
         q = Series(self.ts,dtype=object).quantile(0.9)
-        self.assertEqual(q, scoreatpercentile(self.ts.valid(), 90))
+        self.assertEqual(q, percentile(self.ts.valid(), 90))
+
+        # datetime64[ns] dtype
+        dts = self.ts.index.to_series()
+        q = dts.quantile(.2)
+        self.assertEqual(q, Timestamp('2000-01-10 19:12:00'))
+
+        if not _np_version_under1p7:
+            # timedelta64[ns] dtype
+            tds = dts.diff()
+            q = tds.quantile(.25)
+            self.assertEqual(q, pd.to_timedelta('24:00:00'))
 
     def test_describe(self):
         _ = self.series.describe()
diff --git a/pandas/tseries/tests/test_timedeltas.py b/pandas/tseries/tests/test_timedeltas.py
@@ -240,7 +240,7 @@ def test_timedelta_ops(self):
 
         result = td.quantile(.1)
         # This properly returned a scalar.
-        expected = to_timedelta('00:00:02.6')
+        expected = np.timedelta64(2599999999,'ns')
         tm.assert_almost_equal(result, expected)
 
         result = td.median()[0]