Closes issue #10174. Added an 'interpolation' keyword to DataFrame.quantile and Series.quantile

mayankasthana · jreback · commit e05f66a1bd79 · 2016-01-06T19:18:41.000-05:00
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -111,6 +111,7 @@ Other enhancements
 - ``sys.getsizeof(obj)`` returns the memory usage of a pandas object, including the
   values it contains (:issue:`11597`)
 - ``Series`` gained an ``is_unique`` attribute (:issue:`11946`)
+- ``DataFrame.quantile`` and ``Series.quantile`` now accept an ``interpolation`` keyword (:issue:`10174`).
 
 .. _whatsnew_0180.enhancements.rounding:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -64,8 +64,9 @@
 import pandas.algos as _algos
 
 from pandas.core.config import get_option
+from pandas import _np_version_under1p9
 
-#----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 # Docstring templates
 
 _shared_doc_kwargs = dict(axes='index, columns', klass='DataFrame',
@@ -1578,7 +1579,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None,
             longtable = get_option("display.latex.longtable")
         if escape is None:
             escape = get_option("display.latex.escape")
-            
+
         formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns,
                                            col_space=col_space, na_rep=na_rep,
                                            header=header, index=index,
@@ -4430,7 +4431,7 @@ def round(self, decimals=0, out=None):
         Returns
         -------
         DataFrame object
-        
+
         See Also
         --------
         numpy.around
@@ -4874,7 +4875,8 @@ def mode(self, axis=0, numeric_only=False):
         f = lambda s: s.mode()
         return data.apply(f, axis=axis)
 
-    def quantile(self, q=0.5, axis=0, numeric_only=True):
+    def quantile(self, q=0.5, axis=0, numeric_only=True,
+                 interpolation='linear'):
         """
         Return values at the given quantile over requested axis, a la
         numpy.percentile.
@@ -4885,7 +4887,16 @@ def quantile(self, q=0.5, axis=0, numeric_only=True):
             0 <= q <= 1, the quantile(s) to compute
         axis : {0, 1, 'index', 'columns'} (default 0)
             0 or 'index' for row-wise, 1 or 'columns' for column-wise
-
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            .. versionadded:: 0.18.0
+            This optional parameter specifies the interpolation method to use,
+            when the desired quantile lies between two data points `i` and `j`:
+                * linear: `i + (j - i) * fraction`, where `fraction` is the
+                  fractional part of the index surrounded by `i` and `j`.
+                * lower: `i`.
+                * higher: `j`.
+                * nearest: `i` or `j` whichever is nearest.
+                * midpoint: (`i` + `j`) / 2.
 
         Returns
         -------
@@ -4920,7 +4931,12 @@ def quantile(self, q=0.5, axis=0, numeric_only=True):
         else:
             squeeze = False
 
-        def f(arr, per):
+        if _np_version_under1p9:
+            if interpolation != 'linear':
+                raise ValueError("Interpolation methods other than linear "
+                                 "are not supported in numpy < 1.9")
+
+        def f(arr, per, interpolation):
             if arr._is_datelike_mixed_type:
                 values = _values_from_object(arr).view('i8')
             else:
@@ -4929,7 +4945,10 @@ def f(arr, per):
             if len(values) == 0:
                 return NA
             else:
-                return _quantile(values, per)
+                if _np_version_under1p9:
+                    return _quantile(values, per)
+                else:
+                    return _quantile(values, per, interpolation=interpolation)
 
         data = self._get_numeric_data() if numeric_only else self
 
@@ -4943,7 +4962,7 @@ def f(arr, per):
         is_dt_col = data.dtypes.map(com.is_datetime64_dtype)
         is_dt_col = is_dt_col[is_dt_col].index
 
-        quantiles = [[f(vals, x) for x in per]
+        quantiles = [[f(vals, x, interpolation) for x in per]
                      for (_, vals) in data.iteritems()]
 
         result = self._constructor(quantiles, index=data._info_axis,
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -58,6 +58,8 @@
 from numpy import percentile as _quantile
 from pandas.core.config import get_option
 
+from pandas import _np_version_under1p9
+
 __all__ = ['Series']
 
 
@@ -1238,18 +1240,18 @@ def idxmax(self, axis=None, out=None, skipna=True):
     def round(self, decimals=0):
         """
         Round each value in a Series to the given number of decimals.
-        
+
         Parameters
         ----------
         decimals : int
-            Number of decimal places to round to (default: 0). 
-            If decimals is negative, it specifies the number of 
+            Number of decimal places to round to (default: 0).
+            If decimals is negative, it specifies the number of
             positions to the left of the decimal point.
-        
+
         Returns
         -------
         Series object
-        
+
         See Also
         --------
         numpy.around
@@ -1261,14 +1263,24 @@ def round(self, decimals=0):
 
         return result
 
-    def quantile(self, q=0.5):
+    def quantile(self, q=0.5, interpolation='linear'):
         """
         Return value at the given quantile, a la numpy.percentile.
 
         Parameters
         ----------
         q : float or array-like, default 0.5 (50% quantile)
             0 <= q <= 1, the quantile(s) to compute
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            .. versionadded:: 0.18.0
+            This optional parameter specifies the interpolation method to use,
+            when the desired quantile lies between two data points `i` and `j`:
+                * linear: `i + (j - i) * fraction`, where `fraction` is the
+                  fractional part of the index surrounded by `i` and `j`.
+                * lower: `i`.
+                * higher: `j`.
+                * nearest: `i` or `j` whichever is nearest.
+                * midpoint: (`i` + `j`) / 2.
 
         Returns
         -------
@@ -1288,19 +1300,29 @@ def quantile(self, q=0.5):
         0.75    3.25
         dtype: float64
         """
-        valid = self.dropna()
+
         self._check_percentile(q)
 
-        def multi(values, qs):
+        if _np_version_under1p9:
+            if interpolation != 'linear':
+                raise ValueError("Interpolation methods other than linear "
+                                 "are not supported in numpy < 1.9.")
+
+        def multi(values, qs, **kwargs):
             if com.is_list_like(qs):
-                values = [_quantile(values, x*100) for x in qs]
+                values = [_quantile(values, x * 100, **kwargs) for x in qs]
                 # let empty result to be Float64Index
                 qs = Float64Index(qs)
                 return self._constructor(values, index=qs, name=self.name)
             else:
-                return _quantile(values, qs*100)
+                return _quantile(values, qs * 100, **kwargs)
+
+        kwargs = dict()
+        if not _np_version_under1p9:
+            kwargs.update({'interpolation': interpolation})
 
-        return self._maybe_box(lambda values: multi(values, q), dropna=True)
+        return self._maybe_box(lambda values: multi(values, q, **kwargs),
+                               dropna=True)
 
     def corr(self, other, method='pearson',
              min_periods=None):
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -55,8 +55,9 @@
 import pandas.lib as lib
 
 from numpy.testing.decorators import slow
+from pandas import _np_version_under1p9
 
-#---------------------------------------------------------------------
+# ---------------------------------------------------------------------
 # DataFrame test cases
 
 JOIN_TYPES = ['inner', 'outer', 'left', 'right']
@@ -5457,10 +5458,10 @@ def test_repr_column_name_unicode_truncation_bug(self):
     def test_head_tail(self):
         assert_frame_equal(self.frame.head(), self.frame[:5])
         assert_frame_equal(self.frame.tail(), self.frame[-5:])
-         
+
         assert_frame_equal(self.frame.head(0), self.frame[0:0])
         assert_frame_equal(self.frame.tail(0), self.frame[0:0])
-        
+
         assert_frame_equal(self.frame.head(-1), self.frame[:-1])
         assert_frame_equal(self.frame.tail(-1), self.frame[1:])
         assert_frame_equal(self.frame.head(1), self.frame[:1])
@@ -13564,10 +13565,11 @@ def test_round_issue(self):
 
         decimals = pd.Series([1, 0, 2], index=['A', 'B', 'A'])
         self.assertRaises(ValueError, df.round, decimals)
-        
+
     def test_built_in_round(self):
         if not compat.PY3:
-            raise nose.SkipTest('build in round cannot be overriden prior to Python 3')
+            raise nose.SkipTest("build in round cannot be overriden "
+                                "prior to Python 3")
 
         # GH11763
         # Here's the test frame we'll be working with
@@ -13578,7 +13580,7 @@ def test_built_in_round(self):
         expected_rounded = DataFrame(
             {'col1': [1., 2., 3.], 'col2': [1., 2., 3.]})
         tm.assert_frame_equal(round(df), expected_rounded)
-        
+
     def test_quantile(self):
         from numpy import percentile
 
@@ -13642,6 +13644,88 @@ def test_quantile_axis_parameter(self):
         self.assertRaises(ValueError, df.quantile, 0.1, axis=-1)
         self.assertRaises(ValueError, df.quantile, 0.1, axis="column")
 
+    def test_quantile_interpolation(self):
+        # GH #10174: this test is skipped on numpy < 1.9
+        if _np_version_under1p9:
+            raise nose.SkipTest("Numpy version under 1.9")
+
+        from numpy import percentile
+
+        # interpolation = linear (default case), passed explicitly
+        q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
+        self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
+        q = self.intframe.quantile(0.1, interpolation='linear')
+        self.assertEqual(q['A'], percentile(self.intframe['A'], 10))
+
+        # omitting the keyword must give the same result as 'linear'
+        q1 = self.intframe.quantile(0.1)
+        self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10))
+        assert_series_equal(q, q1)
+
+        # interpolation method other than default linear
+        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
+        result = df.quantile(.5, axis=1, interpolation='nearest')
+        expected = Series([1., 2., 3.], index=[1, 2, 3])
+        assert_series_equal(result, expected)
+
+        # axis=1 combined with a list of quantiles
+        result = df.quantile([.5, .75], axis=1, interpolation='lower')
+        expected = DataFrame({1: [1., 1.], 2: [2., 2.],
+                              3: [3., 3.]}, index=[0.5, 0.75])
+        assert_frame_equal(result, expected)
+
+        # degenerate case: empty columns give NaN rather than raising
+        df = DataFrame({'x': [], 'y': []})
+        q = df.quantile(0.1, axis=0, interpolation='higher')
+        self.assertTrue(np.isnan(q['x']) and np.isnan(q['y']))
+
+        # multi; NOTE(review): confirm q=.5 midpoint is 2.5 and not 2.0
+        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
+                       columns=['a', 'b', 'c'])
+        result = df.quantile([.25, .5], interpolation='midpoint')
+        expected = DataFrame([[1.5, 1.5, 1.5], [2.5, 2.5, 2.5]],
+                             index=[.25, .5], columns=['a', 'b', 'c'])
+        assert_frame_equal(result, expected)
+
+    def test_quantile_interpolation_np_lt_1p9(self):
+        # GH #10174: on numpy < 1.9 only interpolation='linear' is accepted
+        if not _np_version_under1p9:
+            raise nose.SkipTest("Numpy version is 1.9 or greater")
+
+        from numpy import percentile
+
+        # interpolation = linear (default case), passed explicitly
+        q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
+        self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
+        q = self.intframe.quantile(0.1, interpolation='linear')
+        self.assertEqual(q['A'], percentile(self.intframe['A'], 10))
+
+        # omitting the keyword must give the same result as 'linear'
+        q1 = self.intframe.quantile(0.1)
+        self.assertEqual(q1['A'], np.percentile(self.intframe['A'], 10))
+        assert_series_equal(q, q1)
+
+        # other methods must raise; regex mirrors DataFrame.quantile's message
+        expErrMsg = ("Interpolation methods other than linear"
+                     " are not supported in numpy < 1.9")
+        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
+        with assertRaisesRegexp(ValueError, expErrMsg):
+            df.quantile(.5, axis=1, interpolation='nearest')
+
+        with assertRaisesRegexp(ValueError, expErrMsg):
+            df.quantile([.5, .75], axis=1, interpolation='lower')
+
+        # degenerate case: the keyword is rejected even for empty columns
+        df = DataFrame({'x': [], 'y': []})
+        with assertRaisesRegexp(ValueError, expErrMsg):
+            df.quantile(0.1, axis=0, interpolation='higher')
+
+        # multi
+        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
+                       columns=['a', 'b', 'c'])
+        with assertRaisesRegexp(ValueError, expErrMsg):
+            df.quantile([.25, .5], interpolation='midpoint')
+
     def test_quantile_multi(self):
         df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                        columns=['a', 'b', 'c'])
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -19,8 +19,9 @@
 import numpy.ma as ma
 import pandas as pd
 
-from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range, NaT,
-                    date_range, period_range, timedelta_range, _np_version_under1p8)
+from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range,
+                    NaT, date_range, period_range, timedelta_range,
+                    _np_version_under1p8, _np_version_under1p9)
 from pandas.core.index import MultiIndex
 from pandas.core.indexing import IndexingError
 from pandas.tseries.period import PeriodIndex
@@ -3080,9 +3081,50 @@ def test_quantile_multi(self):
         assert_series_equal(result, expected)
 
         result = self.ts.quantile([])
-        expected = pd.Series([], name=self.ts.name, index=Index([], dtype=float))
+        expected = pd.Series([], name=self.ts.name,
+                             index=Index([], dtype=float))
         assert_series_equal(result, expected)
 
+    def test_quantile_interpolation(self):
+        # GH #10174: this test is skipped on numpy < 1.9
+        if _np_version_under1p9:
+            raise nose.SkipTest("Numpy version is under 1.9")
+
+        from numpy import percentile
+
+        # explicit interpolation='linear' and the default both match numpy
+        q = self.ts.quantile(0.1, interpolation='linear')
+        self.assertEqual(q, percentile(self.ts.valid(), 10))
+        q1 = self.ts.quantile(0.1)
+        self.assertEqual(q1, percentile(self.ts.valid(), 10))
+
+        # passing the keyword explicitly must not change the result
+        self.assertEqual(q, q1)
+
+    def test_quantile_interpolation_np_lt_1p9(self):
+        # GH #10174: on numpy < 1.9 only interpolation='linear' is accepted
+        if not _np_version_under1p9:
+            raise nose.SkipTest("Numpy version is 1.9 or greater")
+
+        from numpy import percentile
+
+        # interpolation = linear (default case), with and without the keyword
+        q = self.ts.quantile(0.1, interpolation='linear')
+        self.assertEqual(q, percentile(self.ts.valid(), 10))
+        q1 = self.ts.quantile(0.1)
+        self.assertEqual(q1, percentile(self.ts.valid(), 10))
+
+        # other methods must raise; regex mirrors Series.quantile's message
+        expErrMsg = "Interpolation methods other than " \
+                    "linear are not supported in numpy < 1.9"
+        with tm.assertRaisesRegexp(ValueError, expErrMsg):
+            self.ts.quantile(0.9, interpolation='nearest')
+
+        # object dtype
+        with tm.assertRaisesRegexp(ValueError, expErrMsg):
+            Series(self.ts, dtype=object).quantile(0.7,
+                                                   interpolation='higher')
+
     def test_append(self):
         appendedSeries = self.series.append(self.objSeries)
         for idx, value in compat.iteritems(appendedSeries):