Merge pull request #10691 from lmjohns3/master

jreback · jreback · commit b7374cac4d3f · 2015-09-01T08:04:25.000-04:00
Allow interpolate() to fill backwards as well as forwards
diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst
@@ -329,6 +329,10 @@ Interpolation
   :meth:`~pandas.DataFrame.interpolate`, and :meth:`~pandas.Series.interpolate` have
   revamped interpolation methods and functionality.
 
+.. versionadded:: 0.17.0
+
+  The ``limit_direction`` keyword argument was added.
+
 Both Series and Dataframe objects have an ``interpolate`` method that, by default,
 performs linear interpolation at missing datapoints.
 
@@ -448,17 +452,33 @@ at the new values.
 .. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation
 .. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
 
+Interpolation Limits
+^^^^^^^^^^^^^^^^^^^^
 
 Like other pandas fill methods, ``interpolate`` accepts a ``limit`` keyword
-argument.  Use this to limit the number of consecutive interpolations, keeping
-``NaN`` values for interpolations that are too far from the last valid
+argument. Use this argument to limit the number of consecutive interpolations,
+keeping ``NaN`` values for interpolations that are too far from the last valid
 observation:
 
 .. ipython:: python
 
-   ser = pd.Series([1, 3, np.nan, np.nan, np.nan, 11])
+   ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13])
    ser.interpolate(limit=2)
 
+By default, ``limit`` applies in a forward direction, so that only ``NaN``
+values after a non-``NaN`` value can be filled. If you provide ``'backward'`` or
+``'both'`` for the ``limit_direction`` keyword argument, you can fill ``NaN``
+values before non-``NaN`` values, or both before and after non-``NaN`` values,
+respectively:
+
+.. ipython:: python
+
+   ser.interpolate(limit=1)  # limit_direction == 'forward'
+
+   ser.interpolate(limit=1, limit_direction='backward')
+
+   ser.interpolate(limit=1, limit_direction='both')
+
 .. _missing_data.replace:
 
 Replacing Generic Values
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -55,6 +55,12 @@ New features
 - SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
 - Enable writing complex values to HDF stores when using table format (:issue:`10447`)
 - Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`)
+- Add a ``limit_direction`` keyword argument that works with ``limit`` to enable ``interpolate`` to fill ``NaN`` values forward, backward, or both (:issue:`9218` and :issue:`10420`)
+
+  .. ipython:: python
+
+     ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13])
+     ser.interpolate(limit=1, limit_direction='both')
 
 .. _whatsnew_0170.gil:
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -1589,6 +1589,7 @@ def _clean_interp_method(method, **kwargs):
 
 
 def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
+                   limit_direction='forward',
                    fill_value=None, bounds_error=False, order=None, **kwargs):
     """
     Logic for the 1-d interpolation.  The result should be 1-d, inputs
@@ -1602,9 +1603,15 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
     invalid = isnull(yvalues)
     valid = ~invalid
 
-    valid_y = yvalues[valid]
-    valid_x = xvalues[valid]
-    new_x = xvalues[invalid]
+    if not valid.any():
+        # have to call np.asarray(xvalues) since xvalues could be an Index,
+        # which can't be mutated
+        result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
+        result.fill(np.nan)
+        return result
+
+    if valid.all():
+        return yvalues
 
     if method == 'time':
         if not getattr(xvalues, 'is_all_dates', None):
@@ -1614,66 +1621,82 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
                              'DatetimeIndex')
         method = 'values'
 
-    def _interp_limit(invalid, limit):
-        """mask off values that won't be filled since they exceed the limit"""
+    def _interp_limit(invalid, fw_limit, bw_limit):
+        "Get indexes of values that won't be filled because they exceed the forward/backward limits."
         all_nans = np.where(invalid)[0]
         if all_nans.size == 0: # no nans anyway
             return []
-        violate = [invalid[x:x + limit + 1] for x in all_nans]
-        violate = np.array([x.all() & (x.size > limit) for x in violate])
-        return all_nans[violate] + limit
+        violate = [invalid[max(0, x - bw_limit):x + fw_limit + 1] for x in all_nans]
+        violate = np.array([x.all() & (x.size > bw_limit + fw_limit) for x in violate])
+        return all_nans[violate] + fw_limit - bw_limit
+
+    valid_limit_directions = ['forward', 'backward', 'both']
+    limit_direction = limit_direction.lower()
+    if limit_direction not in valid_limit_directions:
+        msg = 'Invalid limit_direction: expecting one of %r, got %r.' % (
+            valid_limit_directions, limit_direction)
+        raise ValueError(msg)
 
-    xvalues = getattr(xvalues, 'values', xvalues)
-    yvalues = getattr(yvalues, 'values', yvalues)
+    from pandas import Series
+    ys = Series(yvalues)
+    start_nans = set(range(ys.first_valid_index()))
+    end_nans = set(range(1 + ys.last_valid_index(), len(valid)))
+
+    # This is a list of the indexes in the series whose yvalue is currently NaN,
+    # but whose interpolated yvalue will be overwritten with NaN after computing
+    # the interpolation. For each index in this list, one of these conditions is
+    # true of the corresponding NaN in the yvalues:
+    #
+    # a) It is one of a chain of NaNs at the beginning of the series, and either
+    #    limit is not specified or limit_direction is 'forward'.
+    # b) It is one of a chain of NaNs at the end of the series, and limit is
+    #    specified and limit_direction is 'backward'.
+    # c) Limit is nonzero and it is further than limit from the nearest non-NaN
+    #    value (with respect to the limit_direction setting).
+    #
+    # The default behavior is to fill forward with no limit, ignoring NaNs at
+    # the beginning (see issues #9218 and #10420)
+    violate_limit = sorted(start_nans)
 
     if limit:
-        violate_limit = _interp_limit(invalid, limit)
-    if valid.any():
-        firstIndex = valid.argmax()
-        valid = valid[firstIndex:]
-        invalid = invalid[firstIndex:]
-        result = yvalues.copy()
-        if valid.all():
-            return yvalues
-    else:
-        # have to call np.array(xvalues) since xvalues could be an Index
-        # which cant be mutated
-        result = np.empty_like(np.array(xvalues), dtype=np.float64)
-        result.fill(np.nan)
-        return result
+        if limit_direction == 'forward':
+            violate_limit = sorted(start_nans | set(_interp_limit(invalid, limit, 0)))
+        if limit_direction == 'backward':
+            violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, limit)))
+        if limit_direction == 'both':
+            violate_limit = _interp_limit(invalid, limit, limit)
+
+    xvalues = getattr(xvalues, 'values', xvalues)
+    yvalues = getattr(yvalues, 'values', yvalues)
+    result = yvalues.copy()
 
     if method in ['linear', 'time', 'index', 'values']:
         if method in ('values', 'index'):
             inds = np.asarray(xvalues)
             # hack for DatetimeIndex, #1646
             if issubclass(inds.dtype.type, np.datetime64):
                 inds = inds.view(np.int64)
-
             if inds.dtype == np.object_:
                 inds = lib.maybe_convert_objects(inds)
         else:
             inds = xvalues
-
-        inds = inds[firstIndex:]
-
-        result[firstIndex:][invalid] = np.interp(inds[invalid], inds[valid],
-                                                 yvalues[firstIndex:][valid])
-
-        if limit:
-            result[violate_limit] = np.nan
+        result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
+        result[violate_limit] = np.nan
         return result
 
     sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
                   'barycentric', 'krogh', 'spline', 'polynomial',
                   'piecewise_polynomial', 'pchip']
     if method in sp_methods:
-        new_x = new_x[firstIndex:]
-
-        result[firstIndex:][invalid] = _interpolate_scipy_wrapper(
-            valid_x, valid_y, new_x, method=method, fill_value=fill_value,
+        inds = np.asarray(xvalues)
+        # hack for DatetimeIndex, #1646
+        if issubclass(inds.dtype.type, np.datetime64):
+            inds = inds.view(np.int64)
+        result[invalid] = _interpolate_scipy_wrapper(
+            inds[valid], yvalues[valid], inds[invalid], method=method,
+            fill_value=fill_value,
             bounds_error=bounds_error, order=order, **kwargs)
-        if limit:
-            result[violate_limit] = np.nan
+        result[violate_limit] = np.nan
         return result
 
 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -2964,7 +2964,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
             return self._constructor(new_data).__finalize__(self)
 
     def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
-                    downcast=None, **kwargs):
+                    limit_direction='forward', downcast=None, **kwargs):
         """
         Interpolate values according to different methods.
 
@@ -3001,6 +3001,12 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
             * 1: fill row-by-row
         limit : int, default None.
             Maximum number of consecutive NaNs to fill.
+        limit_direction : {'forward', 'backward', 'both'}, defaults to 'forward'
+            If limit is specified, consecutive NaNs will be filled in this
+            direction.
+
+            .. versionadded:: 0.17.0
+
         inplace : bool, default False
             Update the NDFrame in place if possible.
         downcast : optional, 'infer' or None, defaults to None
@@ -3071,6 +3077,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
             index=index,
             values=_maybe_transposed_self,
             limit=limit,
+            limit_direction=limit_direction,
             inplace=inplace,
             downcast=downcast,
             **kwargs
diff --git a/pandas/core/internals.py b/pandas/core/internals.py
@@ -747,6 +747,7 @@ def putmask(self, mask, new, align=True, inplace=False,
 
     def interpolate(self, method='pad', axis=0, index=None,
                     values=None, inplace=False, limit=None,
+                    limit_direction='forward',
                     fill_value=None, coerce=False, downcast=None, **kwargs):
 
         def check_int_bool(self, inplace):
@@ -790,6 +791,7 @@ def check_int_bool(self, inplace):
                                      values=values,
                                      axis=axis,
                                      limit=limit,
+                                     limit_direction=limit_direction,
                                      fill_value=fill_value,
                                      inplace=inplace,
                                      downcast=downcast,
@@ -829,6 +831,7 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
 
     def _interpolate(self, method=None, index=None, values=None,
                      fill_value=None, axis=0, limit=None,
+                     limit_direction='forward',
                      inplace=False, downcast=None, **kwargs):
         """ interpolate using scipy wrappers """
 
@@ -855,6 +858,7 @@ def func(x):
             # should the axis argument be handled below in apply_along_axis?
             # i.e. not an arg to com.interpolate_1d
             return com.interpolate_1d(index, x, method=method, limit=limit,
+                                      limit_direction=limit_direction,
                                       fill_value=fill_value,
                                       bounds_error=False, **kwargs)
 
diff --git a/pandas/tests/test_generic.py b/pandas/tests/test_generic.py
@@ -857,10 +857,79 @@ def test_interp_scipy_basic(self):
 
     def test_interp_limit(self):
         s = Series([1, 3, np.nan, np.nan, np.nan, 11])
+
         expected = Series([1., 3., 5., 7., np.nan, 11.])
         result = s.interpolate(method='linear', limit=2)
         assert_series_equal(result, expected)
 
+    def test_interp_limit_forward(self):
+        s = Series([1, 3, np.nan, np.nan, np.nan, 11])
+
+        # Provide 'forward' (the default) explicitly here.
+        expected = Series([1., 3., 5., 7., np.nan, 11.])
+
+        result = s.interpolate(
+            method='linear', limit=2, limit_direction='forward')
+        assert_series_equal(result, expected)
+
+        result = s.interpolate(
+            method='linear', limit=2, limit_direction='FORWARD')
+        assert_series_equal(result, expected)
+
+    def test_interp_limit_bad_direction(self):
+        s = Series([1, 3, np.nan, np.nan, np.nan, 11])
+        expected = Series([1., 3., 5., 7., 9., 11.])
+
+        self.assertRaises(ValueError, s.interpolate,
+                          method='linear', limit=2,
+                          limit_direction='abc')
+
+        # raises an error even if no limit is specified.
+        self.assertRaises(ValueError, s.interpolate,
+                          method='linear',
+                          limit_direction='abc')
+
+    def test_interp_limit_direction(self):
+        # These tests are for issue #9218 -- fill NaNs in both directions.
+        s = Series([1, 3, np.nan, np.nan, np.nan, 11])
+
+        expected = Series([1., 3., np.nan, 7., 9., 11.])
+        result = s.interpolate(
+            method='linear', limit=2, limit_direction='backward')
+        assert_series_equal(result, expected)
+
+        expected = Series([1., 3., 5., np.nan, 9., 11.])
+        result = s.interpolate(
+            method='linear', limit=1, limit_direction='both')
+        assert_series_equal(result, expected)
+
+        # Check that this works on a longer series of nans.
+        s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12, np.nan])
+
+        expected = Series([1., 3., 4., 5., 6., 7., 9., 10., 11., 12., 12.])
+        result = s.interpolate(
+            method='linear', limit=2, limit_direction='both')
+        assert_series_equal(result, expected)
+
+        expected = Series([1., 3., 4., np.nan, 6., 7., 9., 10., 11., 12., 12.])
+        result = s.interpolate(
+            method='linear', limit=1, limit_direction='both')
+        assert_series_equal(result, expected)
+
+    def test_interp_limit_to_ends(self):
+        # These tests are for issue #10420 -- flow back to beginning.
+        s = Series([np.nan, np.nan, 5, 7, 9, np.nan])
+
+        expected = Series([5., 5., 5., 7., 9., np.nan])
+        result = s.interpolate(
+            method='linear', limit=2, limit_direction='backward')
+        assert_series_equal(result, expected)
+
+        expected = Series([5., 5., 5., 7., 9., 9.])
+        result = s.interpolate(
+            method='linear', limit=2, limit_direction='both')
+        assert_series_equal(result, expected)
+
     def test_interp_all_good(self):
         # scipy
         tm._skip_if_no_scipy()