Closes issue pandas-dev#10174. Added 'interpolation' keyword to DataFrame.quantile and Series.quantile

mayankasthana · mkasthana-cs · commit 55836e6e7fa7 · 2016-01-07T02:21:13.000+05:30
diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
@@ -111,6 +111,7 @@ Other enhancements
 - ``sys.getsizeof(obj)`` returns the memory usage of a pandas object, including the
   values it contains (:issue:`11597`)
 - ``Series`` gained an ``is_unique`` attribute (:issue:`11946`)
+- ``DataFrame.quantile`` and ``Series.quantile`` now accept an ``interpolation`` keyword (:issue:`10174`)
 
 .. _whatsnew_0180.enhancements.rounding:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -64,6 +64,7 @@
 import pandas.algos as _algos
 
 from pandas.core.config import get_option
+from pandas import _np_version_under1p9
 
 #----------------------------------------------------------------------
 # Docstring templates
@@ -4874,7 +4875,7 @@ def mode(self, axis=0, numeric_only=False):
         f = lambda s: s.mode()
         return data.apply(f, axis=axis)
 
-    def quantile(self, q=0.5, axis=0, numeric_only=True):
+    def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation='linear'):
         """
         Return values at the given quantile over requested axis, a la
         numpy.percentile.
@@ -4885,7 +4886,16 @@ def quantile(self, q=0.5, axis=0, numeric_only=True):
             0 <= q <= 1, the quantile(s) to compute
         axis : {0, 1, 'index', 'columns'} (default 0)
             0 or 'index' for row-wise, 1 or 'columns' for column-wise
-
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            .. versionadded:: 0.18.0
+            This optional parameter specifies the interpolation method to use,
+            when the desired quantile lies between two data points `i` and `j`:
+                * linear: `i + (j - i) * fraction`, where `fraction` is the
+                  fractional part of the index surrounded by `i` and `j`.
+                * lower: `i`.
+                * higher: `j`.
+                * nearest: `i` or `j` whichever is nearest.
+                * midpoint: (`i` + `j`) / 2.
 
         Returns
         -------
@@ -4920,7 +4930,12 @@ def quantile(self, q=0.5, axis=0, numeric_only=True):
         else:
             squeeze = False
 
-        def f(arr, per):
+        if _np_version_under1p9:
+            if interpolation != 'linear':
+                raise ValueError("Interpolation methods"
+                    " other than linear not supported in numpy < 1.9")
+
+        def f(arr, per,interpolation):
             if arr._is_datelike_mixed_type:
                 values = _values_from_object(arr).view('i8')
             else:
@@ -4929,7 +4944,10 @@ def f(arr, per):
             if len(values) == 0:
                 return NA
             else:
-                return _quantile(values, per)
+                if _np_version_under1p9:
+                    return _quantile(values, per)
+                else:
+                    return _quantile(values, per, interpolation=interpolation)
 
         data = self._get_numeric_data() if numeric_only else self
 
@@ -4943,7 +4961,7 @@ def f(arr, per):
         is_dt_col = data.dtypes.map(com.is_datetime64_dtype)
         is_dt_col = is_dt_col[is_dt_col].index
 
-        quantiles = [[f(vals, x) for x in per]
+        quantiles = [[f(vals, x, interpolation) for x in per]
                      for (_, vals) in data.iteritems()]
 
         result = self._constructor(quantiles, index=data._info_axis,
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -58,6 +58,8 @@
 from numpy import percentile as _quantile
 from pandas.core.config import get_option
 
+from pandas import _np_version_under1p9
+
 __all__ = ['Series']
 
 
@@ -1261,14 +1263,24 @@ def round(self, decimals=0):
 
         return result
 
-    def quantile(self, q=0.5):
+    def quantile(self, q=0.5, interpolation='linear'):
         """
         Return value at the given quantile, a la numpy.percentile.
 
         Parameters
         ----------
         q : float or array-like, default 0.5 (50% quantile)
             0 <= q <= 1, the quantile(s) to compute
+        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+            .. versionadded:: 0.18.0
+            This optional parameter specifies the interpolation method to use,
+            when the desired quantile lies between two data points `i` and `j`:
+                * linear: `i + (j - i) * fraction`, where `fraction` is the
+                  fractional part of the index surrounded by `i` and `j`.
+                * lower: `i`.
+                * higher: `j`.
+                * nearest: `i` or `j` whichever is nearest.
+                * midpoint: (`i` + `j`) / 2.
 
         Returns
         -------
@@ -1291,17 +1303,26 @@ def quantile(self, q=0.5):
         valid = self.dropna()
         self._check_percentile(q)
 
-        def multi(values, qs):
+        if _np_version_under1p9:
+            if interpolation != 'linear':
+                raise ValueError("Interpolation methods"
+                    " other than linear not supported in numpy < 1.9.")
+
+        def multi(values,qs,**kwargs):
             if com.is_list_like(qs):
-                values = [_quantile(values, x*100) for x in qs]
+                values = [_quantile(values, x*100, **kwargs) for x in qs]
                 # let empty result to be Float64Index
                 qs = Float64Index(qs)
                 return self._constructor(values, index=qs, name=self.name)
             else:
-                return _quantile(values, qs*100)
-
-        return self._maybe_box(lambda values: multi(values, q), dropna=True)
+                return _quantile(values, qs*100, **kwargs)
+        
+        kwargs = dict()
+        if not _np_version_under1p9:
+            kwargs.update({'interpolation':interpolation})
 
+        return self._maybe_box(lambda values: multi(values,q,**kwargs), dropna=True)
+        
     def corr(self, other, method='pearson',
              min_periods=None):
         """
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -55,6 +55,7 @@
 import pandas.lib as lib
 
 from numpy.testing.decorators import slow
+from pandas import _np_version_under1p9
 
 #---------------------------------------------------------------------
 # DataFrame test cases
@@ -13642,6 +13643,93 @@ def test_quantile_axis_parameter(self):
         self.assertRaises(ValueError, df.quantile, 0.1, axis=-1)
         self.assertRaises(ValueError, df.quantile, 0.1, axis="column")
 
+    def test_quantile_interpolation(self):
+        # GH #10174 -- runs only on numpy >= 1.9
+        if _np_version_under1p9:
+            raise nose.SkipTest("Numpy version under 1.9")
+
+        from numpy import percentile
+
+        # interpolation = linear (default case)
+        q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
+        self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
+        q = self.intframe.quantile(0.1, interpolation='linear')
+        self.assertEqual(q['A'], percentile(self.intframe['A'], 10))
+
+        q1 = self.intframe.quantile(0.1)
+        self.assertEqual(q1['A'], percentile(self.intframe['A'], 10))
+        # explicit 'linear' must give the same result as omitting the keyword
+        assert_series_equal(q, q1)
+
+        # interpolation methods other than the default 'linear'
+
+        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
+        result = df.quantile(.5, axis=1, interpolation='nearest')
+        expected = Series([1., 2., 3.], index=[1, 2, 3])
+        assert_series_equal(result, expected)
+
+        # axis
+        result = df.quantile([.5, .75], axis=1, interpolation='lower')
+        expected = DataFrame({1: [1., 1.], 2: [2., 2.],
+                              3: [3., 3.]}, index=[0.5, 0.75])
+        assert_frame_equal(result, expected)
+
+        # degenerate case: empty columns yield NaN
+        df = DataFrame({'x': [], 'y': []})
+        q = df.quantile(0.1, axis=0, interpolation='higher')
+        self.assertTrue(np.isnan(q['x']) and np.isnan(q['y']))
+
+        # multi
+        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
+                       columns=['a', 'b', 'c'])
+        result = df.quantile([.25, .5], interpolation='midpoint')
+        expected = DataFrame([[1.5, 1.5, 1.5], [2.5, 2.5, 2.5]],
+                             index=[.25, .5], columns=['a', 'b', 'c'])
+        assert_frame_equal(result, expected)
+    
+
+    def test_quantile_interpolation_np_lt_1p9(self):
+        # GH #10174 -- runs only on numpy < 1.9
+        if not _np_version_under1p9:
+            raise nose.SkipTest("Numpy version is 1.9 or greater")
+
+        from numpy import percentile
+
+        # interpolation = linear (default case) is still accepted
+        q = self.tsframe.quantile(0.1, axis=0, interpolation='linear')
+        self.assertEqual(q['A'], percentile(self.tsframe['A'], 10))
+        q = self.intframe.quantile(0.1, interpolation='linear')
+        self.assertEqual(q['A'], percentile(self.intframe['A'], 10))
+
+        q1 = self.intframe.quantile(0.1)
+        self.assertEqual(q1['A'], percentile(self.intframe['A'], 10))
+        # explicit 'linear' must give the same result as omitting the keyword
+        assert_series_equal(q, q1)
+
+        # every interpolation method other than 'linear' must raise
+
+        expErrMsg = ("Interpolation methods other than linear"
+                     " not supported in numpy < 1.9")
+        df = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]}, index=[1, 2, 3])
+        with assertRaisesRegexp(ValueError, expErrMsg):
+            df.quantile(.5, axis=1, interpolation='nearest')
+
+        with assertRaisesRegexp(ValueError, expErrMsg):
+            df.quantile([.5, .75], axis=1, interpolation='lower')
+
+        # degenerate case: the check fires even for empty columns
+        df = DataFrame({'x': [], 'y': []})
+        with assertRaisesRegexp(ValueError, expErrMsg):
+            df.quantile(0.1, axis=0, interpolation='higher')
+
+        # multi
+        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
+                       columns=['a', 'b', 'c'])
+        with assertRaisesRegexp(ValueError, expErrMsg):
+            df.quantile([.25, .5], interpolation='midpoint')
+        
+
+
     def test_quantile_multi(self):
         df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                        columns=['a', 'b', 'c'])
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -20,7 +20,8 @@
 import pandas as pd
 
 from pandas import (Index, Series, DataFrame, isnull, notnull, bdate_range, NaT,
-                    date_range, period_range, timedelta_range, _np_version_under1p8)
+                    date_range, period_range, timedelta_range, _np_version_under1p8,
+                    _np_version_under1p9)
 from pandas.core.index import MultiIndex
 from pandas.core.indexing import IndexingError
 from pandas.tseries.period import PeriodIndex
@@ -3083,6 +3084,44 @@ def test_quantile_multi(self):
         expected = pd.Series([], name=self.ts.name, index=Index([], dtype=float))
         assert_series_equal(result, expected)
 
+    def test_quantile_interpolation(self):
+        # GH #10174 -- runs only on numpy >= 1.9
+        if _np_version_under1p9:
+            raise nose.SkipTest("Numpy version is under 1.9")
+
+        from numpy import percentile
+
+        # reference value computed by numpy on the non-NA observations
+        expected = percentile(self.ts.valid(), 10)
+        explicit = self.ts.quantile(0.1, interpolation='linear')
+        self.assertEqual(explicit, expected)
+        implicit = self.ts.quantile(0.1)
+        self.assertEqual(implicit, expected)
+        # passing the default explicitly must not change the result
+        self.assertEqual(explicit, implicit)
+
+    def test_quantile_interpolation_np_lt_1p9(self):
+        # GH #10174 -- runs only on numpy < 1.9
+        if not _np_version_under1p9:
+            raise nose.SkipTest("Numpy version is 1.9 or greater")
+
+        from numpy import percentile
+
+        # interpolation = linear (default case) is still accepted
+        q = self.ts.quantile(0.1, interpolation='linear')
+        self.assertEqual(q, percentile(self.ts.valid(), 10))
+        q1 = self.ts.quantile(0.1)
+        self.assertEqual(q1, percentile(self.ts.valid(), 10))
+
+        # any interpolation other than linear must raise
+        expErrMsg = ("Interpolation methods other than linear"
+                     " not supported in numpy < 1.9")
+        with tm.assertRaisesRegexp(ValueError, expErrMsg):
+            self.ts.quantile(0.9, interpolation='nearest')
+        # object dtype
+        with tm.assertRaisesRegexp(ValueError, expErrMsg):
+            Series(self.ts, dtype=object).quantile(0.7, interpolation='higher')
+
     def test_append(self):
         appendedSeries = self.series.append(self.objSeries)
         for idx, value in compat.iteritems(appendedSeries):