Skip to content

Commit b7374ca

Browse files
committed
Merge pull request #10691 from lmjohns3/master
Allow interpolate() to fill backwards as well as forwards
2 parents 5803e4c + df4d52c commit b7374ca

File tree

6 files changed

+172
-43
lines changed

6 files changed

+172
-43
lines changed

doc/source/missing_data.rst

+23-3
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,10 @@ Interpolation
329329
:meth:`~pandas.DataFrame.interpolate`, and :meth:`~pandas.Series.interpolate` have
330330
revamped interpolation methods and functionality.
331331

332+
.. versionadded:: 0.17.0
333+
334+
The ``limit_direction`` keyword argument was added.
335+
332336
Both Series and DataFrame objects have an ``interpolate`` method that, by default,
333337
performs linear interpolation at missing datapoints.
334338

@@ -448,17 +452,33 @@ at the new values.
448452
.. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation
449453
.. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
450454

455+
Interpolation Limits
456+
^^^^^^^^^^^^^^^^^^^^
451457

452458
Like other pandas fill methods, ``interpolate`` accepts a ``limit`` keyword
453-
argument. Use this to limit the number of consecutive interpolations, keeping
454-
``NaN`` values for interpolations that are too far from the last valid
459+
argument. Use this argument to limit the number of consecutive interpolations,
460+
keeping ``NaN`` values for interpolations that are too far from the last valid
455461
observation:
456462

457463
.. ipython:: python
458464
459-
ser = pd.Series([1, 3, np.nan, np.nan, np.nan, 11])
465+
ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13])
460466
ser.interpolate(limit=2)
461467
468+
By default, ``limit`` applies in a forward direction, so that only ``NaN``
469+
values after a non-``NaN`` value can be filled. If you provide ``'backward'`` or
470+
``'both'`` for the ``limit_direction`` keyword argument, you can fill ``NaN``
471+
values before non-``NaN`` values, or both before and after non-``NaN`` values,
472+
respectively:
473+
474+
.. ipython:: python
475+
476+
ser.interpolate(limit=1) # limit_direction == 'forward'
477+
478+
ser.interpolate(limit=1, limit_direction='backward')
479+
480+
ser.interpolate(limit=1, limit_direction='both')
481+
462482
.. _missing_data.replace:
463483

464484
Replacing Generic Values

doc/source/whatsnew/v0.17.0.txt

+6
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,12 @@ New features
5555
- SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
5656
- Enable writing complex values to HDF stores when using table format (:issue:`10447`)
5757
- Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`)
58+
- Add a ``limit_direction`` keyword argument that works with ``limit`` to enable ``interpolate`` to fill ``NaN`` values forward, backward, or both (:issue:`9218` and :issue:`10420`)
59+
60+
.. ipython:: python
61+
62+
ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13])
63+
ser.interpolate(limit=1, limit_direction='both')
5864

5965
.. _whatsnew_0170.gil:
6066

pandas/core/common.py

+62-39
Original file line numberDiff line numberDiff line change
@@ -1589,6 +1589,7 @@ def _clean_interp_method(method, **kwargs):
15891589

15901590

15911591
def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
1592+
limit_direction='forward',
15921593
fill_value=None, bounds_error=False, order=None, **kwargs):
15931594
"""
15941595
Logic for the 1-d interpolation. The result should be 1-d, inputs
@@ -1602,9 +1603,15 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
16021603
invalid = isnull(yvalues)
16031604
valid = ~invalid
16041605

1605-
valid_y = yvalues[valid]
1606-
valid_x = xvalues[valid]
1607-
new_x = xvalues[invalid]
1606+
if not valid.any():
1607+
# have to call np.asarray(xvalues) since xvalues could be an Index
1608+
# which can't be mutated
1609+
result = np.empty_like(np.asarray(xvalues), dtype=np.float64)
1610+
result.fill(np.nan)
1611+
return result
1612+
1613+
if valid.all():
1614+
return yvalues
16081615

16091616
if method == 'time':
16101617
if not getattr(xvalues, 'is_all_dates', None):
@@ -1614,66 +1621,82 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
16141621
'DatetimeIndex')
16151622
method = 'values'
16161623

1617-
def _interp_limit(invalid, limit):
1618-
"""mask off values that won't be filled since they exceed the limit"""
1624+
def _interp_limit(invalid, fw_limit, bw_limit):
1625+
"Get idx of values that won't be filled b/c they exceed the forward/backward limits."
16191626
all_nans = np.where(invalid)[0]
16201627
if all_nans.size == 0: # no nans anyway
16211628
return []
1622-
violate = [invalid[x:x + limit + 1] for x in all_nans]
1623-
violate = np.array([x.all() & (x.size > limit) for x in violate])
1624-
return all_nans[violate] + limit
1629+
violate = [invalid[max(0, x - bw_limit):x + fw_limit + 1] for x in all_nans]
1630+
violate = np.array([x.all() & (x.size > bw_limit + fw_limit) for x in violate])
1631+
return all_nans[violate] + fw_limit - bw_limit
1632+
1633+
valid_limit_directions = ['forward', 'backward', 'both']
1634+
limit_direction = limit_direction.lower()
1635+
if limit_direction not in valid_limit_directions:
1636+
msg = 'Invalid limit_direction: expecting one of %r, got %r.' % (
1637+
valid_limit_directions, limit_direction)
1638+
raise ValueError(msg)
16251639

1626-
xvalues = getattr(xvalues, 'values', xvalues)
1627-
yvalues = getattr(yvalues, 'values', yvalues)
1640+
from pandas import Series
1641+
ys = Series(yvalues)
1642+
start_nans = set(range(ys.first_valid_index()))
1643+
end_nans = set(range(1 + ys.last_valid_index(), len(valid)))
1644+
1645+
# This is a list of the indexes in the series whose yvalue is currently NaN,
1646+
# but whose interpolated yvalue will be overwritten with NaN after computing
1647+
# the interpolation. For each index in this list, one of these conditions is
1648+
# true of the corresponding NaN in the yvalues:
1649+
#
1650+
# a) It is one of a chain of NaNs at the beginning of the series, and either
1651+
# limit is not specified or limit_direction is 'forward'.
1652+
# b) It is one of a chain of NaNs at the end of the series, and limit is
1653+
# specified and limit_direction is 'backward' or 'both'.
1654+
# c) Limit is nonzero and it is further than limit from the nearest non-NaN
1655+
# value (with respect to the limit_direction setting).
1656+
#
1657+
# The default behavior is to fill forward with no limit, ignoring NaNs at
1658+
# the beginning (see issues #9218 and #10420)
1659+
violate_limit = sorted(start_nans)
16281660

16291661
if limit:
1630-
violate_limit = _interp_limit(invalid, limit)
1631-
if valid.any():
1632-
firstIndex = valid.argmax()
1633-
valid = valid[firstIndex:]
1634-
invalid = invalid[firstIndex:]
1635-
result = yvalues.copy()
1636-
if valid.all():
1637-
return yvalues
1638-
else:
1639-
# have to call np.array(xvalues) since xvalues could be an Index
1640-
# which cant be mutated
1641-
result = np.empty_like(np.array(xvalues), dtype=np.float64)
1642-
result.fill(np.nan)
1643-
return result
1662+
if limit_direction == 'forward':
1663+
violate_limit = sorted(start_nans | set(_interp_limit(invalid, limit, 0)))
1664+
if limit_direction == 'backward':
1665+
violate_limit = sorted(end_nans | set(_interp_limit(invalid, 0, limit)))
1666+
if limit_direction == 'both':
1667+
violate_limit = _interp_limit(invalid, limit, limit)
1668+
1669+
xvalues = getattr(xvalues, 'values', xvalues)
1670+
yvalues = getattr(yvalues, 'values', yvalues)
1671+
result = yvalues.copy()
16441672

16451673
if method in ['linear', 'time', 'index', 'values']:
16461674
if method in ('values', 'index'):
16471675
inds = np.asarray(xvalues)
16481676
# hack for DatetimeIndex, #1646
16491677
if issubclass(inds.dtype.type, np.datetime64):
16501678
inds = inds.view(np.int64)
1651-
16521679
if inds.dtype == np.object_:
16531680
inds = lib.maybe_convert_objects(inds)
16541681
else:
16551682
inds = xvalues
1656-
1657-
inds = inds[firstIndex:]
1658-
1659-
result[firstIndex:][invalid] = np.interp(inds[invalid], inds[valid],
1660-
yvalues[firstIndex:][valid])
1661-
1662-
if limit:
1663-
result[violate_limit] = np.nan
1683+
result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid])
1684+
result[violate_limit] = np.nan
16641685
return result
16651686

16661687
sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic',
16671688
'barycentric', 'krogh', 'spline', 'polynomial',
16681689
'piecewise_polynomial', 'pchip']
16691690
if method in sp_methods:
1670-
new_x = new_x[firstIndex:]
1671-
1672-
result[firstIndex:][invalid] = _interpolate_scipy_wrapper(
1673-
valid_x, valid_y, new_x, method=method, fill_value=fill_value,
1691+
inds = np.asarray(xvalues)
1692+
# hack for DatetimeIndex, #1646
1693+
if issubclass(inds.dtype.type, np.datetime64):
1694+
inds = inds.view(np.int64)
1695+
result[invalid] = _interpolate_scipy_wrapper(
1696+
inds[valid], yvalues[valid], inds[invalid], method=method,
1697+
fill_value=fill_value,
16741698
bounds_error=bounds_error, order=order, **kwargs)
1675-
if limit:
1676-
result[violate_limit] = np.nan
1699+
result[violate_limit] = np.nan
16771700
return result
16781701

16791702

pandas/core/generic.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -2964,7 +2964,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
29642964
return self._constructor(new_data).__finalize__(self)
29652965

29662966
def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
2967-
downcast=None, **kwargs):
2967+
limit_direction='forward', downcast=None, **kwargs):
29682968
"""
29692969
Interpolate values according to different methods.
29702970
@@ -3001,6 +3001,12 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
30013001
* 1: fill row-by-row
30023002
limit : int, default None.
30033003
Maximum number of consecutive NaNs to fill.
3004+
limit_direction : {'forward', 'backward', 'both'}, defaults to 'forward'
3005+
If limit is specified, consecutive NaNs will be filled in this
3006+
direction.
3007+
3008+
.. versionadded:: 0.17.0
3009+
30043010
inplace : bool, default False
30053011
Update the NDFrame in place if possible.
30063012
downcast : optional, 'infer' or None, defaults to None
@@ -3071,6 +3077,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
30713077
index=index,
30723078
values=_maybe_transposed_self,
30733079
limit=limit,
3080+
limit_direction=limit_direction,
30743081
inplace=inplace,
30753082
downcast=downcast,
30763083
**kwargs

pandas/core/internals.py

+4
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,7 @@ def putmask(self, mask, new, align=True, inplace=False,
747747

748748
def interpolate(self, method='pad', axis=0, index=None,
749749
values=None, inplace=False, limit=None,
750+
limit_direction='forward',
750751
fill_value=None, coerce=False, downcast=None, **kwargs):
751752

752753
def check_int_bool(self, inplace):
@@ -790,6 +791,7 @@ def check_int_bool(self, inplace):
790791
values=values,
791792
axis=axis,
792793
limit=limit,
794+
limit_direction=limit_direction,
793795
fill_value=fill_value,
794796
inplace=inplace,
795797
downcast=downcast,
@@ -829,6 +831,7 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False,
829831

830832
def _interpolate(self, method=None, index=None, values=None,
831833
fill_value=None, axis=0, limit=None,
834+
limit_direction='forward',
832835
inplace=False, downcast=None, **kwargs):
833836
""" interpolate using scipy wrappers """
834837

@@ -855,6 +858,7 @@ def func(x):
855858
# should the axis argument be handled below in apply_along_axis?
856859
# i.e. not an arg to com.interpolate_1d
857860
return com.interpolate_1d(index, x, method=method, limit=limit,
861+
limit_direction=limit_direction,
858862
fill_value=fill_value,
859863
bounds_error=False, **kwargs)
860864

pandas/tests/test_generic.py

+69
Original file line numberDiff line numberDiff line change
@@ -857,10 +857,79 @@ def test_interp_scipy_basic(self):
857857

858858
def test_interp_limit(self):
859859
s = Series([1, 3, np.nan, np.nan, np.nan, 11])
860+
860861
expected = Series([1., 3., 5., 7., np.nan, 11.])
861862
result = s.interpolate(method='linear', limit=2)
862863
assert_series_equal(result, expected)
863864

865+
def test_interp_limit_forward(self):
866+
s = Series([1, 3, np.nan, np.nan, np.nan, 11])
867+
868+
# Provide 'forward' (the default) explicitly here.
869+
expected = Series([1., 3., 5., 7., np.nan, 11.])
870+
871+
result = s.interpolate(
872+
method='linear', limit=2, limit_direction='forward')
873+
assert_series_equal(result, expected)
874+
875+
result = s.interpolate(
876+
method='linear', limit=2, limit_direction='FORWARD')
877+
assert_series_equal(result, expected)
878+
879+
def test_interp_limit_bad_direction(self):
880+
s = Series([1, 3, np.nan, np.nan, np.nan, 11])
881+
expected = Series([1., 3., 5., 7., 9., 11.])
882+
883+
self.assertRaises(ValueError, s.interpolate,
884+
method='linear', limit=2,
885+
limit_direction='abc')
886+
887+
# raises an error even if no limit is specified.
888+
self.assertRaises(ValueError, s.interpolate,
889+
method='linear',
890+
limit_direction='abc')
891+
892+
def test_interp_limit_direction(self):
893+
# These tests are for issue #9218 -- fill NaNs in both directions.
894+
s = Series([1, 3, np.nan, np.nan, np.nan, 11])
895+
896+
expected = Series([1., 3., np.nan, 7., 9., 11.])
897+
result = s.interpolate(
898+
method='linear', limit=2, limit_direction='backward')
899+
assert_series_equal(result, expected)
900+
901+
expected = Series([1., 3., 5., np.nan, 9., 11.])
902+
result = s.interpolate(
903+
method='linear', limit=1, limit_direction='both')
904+
assert_series_equal(result, expected)
905+
906+
# Check that this works on a longer series of nans.
907+
s = Series([1, 3, np.nan, np.nan, np.nan, 7, 9, np.nan, np.nan, 12, np.nan])
908+
909+
expected = Series([1., 3., 4., 5., 6., 7., 9., 10., 11., 12., 12.])
910+
result = s.interpolate(
911+
method='linear', limit=2, limit_direction='both')
912+
assert_series_equal(result, expected)
913+
914+
expected = Series([1., 3., 4., np.nan, 6., 7., 9., 10., 11., 12., 12.])
915+
result = s.interpolate(
916+
method='linear', limit=1, limit_direction='both')
917+
assert_series_equal(result, expected)
918+
919+
def test_interp_limit_to_ends(self):
920+
# These tests are for issue #10420 -- flow back to beginning.
921+
s = Series([np.nan, np.nan, 5, 7, 9, np.nan])
922+
923+
expected = Series([5., 5., 5., 7., 9., np.nan])
924+
result = s.interpolate(
925+
method='linear', limit=2, limit_direction='backward')
926+
assert_series_equal(result, expected)
927+
928+
expected = Series([5., 5., 5., 7., 9., 9.])
929+
result = s.interpolate(
930+
method='linear', limit=2, limit_direction='both')
931+
assert_series_equal(result, expected)
932+
864933
def test_interp_all_good(self):
865934
# scipy
866935
tm._skip_if_no_scipy()

0 commit comments

Comments
 (0)