Skip to content

ENH: Added max_gap keyword for series.interpolate #25141

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 36 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
5e4b2ee
Added maxgap keyword for series.interpolate
cchwala Feb 4, 2019
b752602
minor pep8 fixes
cchwala Feb 4, 2019
839b11a
fixed parameter order
cchwala Feb 4, 2019
fcdc4e4
Merge remote-tracking branch 'upstream/master' into interpolate_maxgap
cchwala Mar 26, 2019
20b70b7
Merge remote-tracking branch 'upstream/master' into interpolate_maxgap
cchwala Jun 11, 2019
3cb371e
Changed parameter name from `maxgap` to `max_gap`
cchwala Jun 11, 2019
8c6ff7a
Moved code to derive indices of "NaNs to preserve" in separate function
cchwala Jun 11, 2019
4aaf8dc
Tests for errors extended and moved to own function
cchwala Jun 11, 2019
1f0406f
added blank lines in docstring as requested
cchwala Jun 11, 2019
eaacefd
Added test which fails for method='pad'
cchwala Jun 11, 2019
f274d16
Merge remote-tracking branch 'upstream/master' into interpolate_maxgap
cchwala Aug 30, 2019
c72acdb
manually add black code formating
cchwala Aug 30, 2019
e0aee3a
First WIP but working version to fix issue with `pad` and `limit_area`
cchwala Sep 5, 2019
af15eaf
fix: do not decide based on dimension but on crucial kwargs which int…
cchwala Sep 5, 2019
12d2e5b
some clean up
cchwala Sep 5, 2019
c25d1f8
Make it work with NaT and test for that
cchwala Sep 17, 2019
4d40722
Added comment on why two interpolate fill functions are needed
cchwala Sep 17, 2019
255518e
fix typo
cchwala Sep 17, 2019
2015e84
Added tests for DataFrames
cchwala Sep 17, 2019
4d7b0f1
Added failing test for https://github.com/pandas-dev/pandas/issues/12918
cchwala Sep 17, 2019
cbf7388
Now using 1D pad and backfill functions in `interpolate_1d_fill()`
cchwala Sep 17, 2019
5128b9d
Merge remote-tracking branch 'upstream/master' into interpolate_maxgap
cchwala Nov 11, 2019
3c55e1e
Additional required adjustments after merge with upstream/master
cchwala Nov 19, 2019
f9e4044
Merge remote-tracking branch 'upstream/master' into interpolate_maxgap
cchwala Nov 19, 2019
d1bbcd6
Removed test for bug with pad which should be solved in a separate PR
cchwala Nov 19, 2019
21b3091
removed trailing whitespaces
cchwala Nov 19, 2019
c96c604
fixed formating for black and flake8
cchwala Nov 19, 2019
bd84fc9
updated docstring for interpolat with max_gap
cchwala Nov 19, 2019
908ffe5
added max_gap info and example to documentation
cchwala Nov 20, 2019
380ef7c
added info to whatsnew file
cchwala Nov 20, 2019
5a1718a
flake8
cchwala Nov 20, 2019
16755bd
update docs with info on limit_direction and method pad
cchwala Nov 20, 2019
b58d721
better test for https://github.com/pandas-dev/pandas/issues/26796
cchwala Nov 20, 2019
aa58ffa
typo, black, flake8
cchwala Nov 20, 2019
ae16124
update to doc
cchwala Nov 20, 2019
28b442c
fix wrong behavior when combining max_gap and limit_direction
cchwala Nov 20, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -6648,7 +6648,12 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
* 'outside': Only fill NaNs outside valid values (extrapolate).

.. versionadded:: 0.21.0
maxgap : int, optional
Maximum number of consecutive NaN values up to which a NaN-gap
will be interpolated. For all NaN-gaps wider than that no
interpolation is carried out. Must be greater than 0. Cannot be
used together with `limit`.

.. versionadded:: 0.25.0
downcast : optional, 'infer' or None, defaults to None
Downcast dtypes if possible.
**kwargs
Expand Down Expand Up @@ -6783,7 +6788,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,

@Appender(_shared_docs['interpolate'] % _shared_doc_kwargs)
def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
limit_direction='forward', limit_area=None,
limit_direction='forward', limit_area=None, maxgap=None,
downcast=None, **kwargs):
"""
Interpolate values according to different methods.
Expand Down Expand Up @@ -6836,6 +6841,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
values=_maybe_transposed_self, limit=limit,
limit_direction=limit_direction,
limit_area=limit_area,
maxgap=maxgap,
inplace=inplace, downcast=downcast,
**kwargs)

Expand Down
50 changes: 43 additions & 7 deletions pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def clean_interp_method(method, **kwargs):
return method


def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
def interpolate_1d(xvalues, yvalues, method='linear', limit=None, maxgap=None,
limit_direction='forward', limit_area=None, fill_value=None,
bounds_error=False, order=None, **kwargs):
"""
Expand Down Expand Up @@ -165,6 +165,16 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
elif limit < 1:
raise ValueError('Limit must be greater than 0')

if (maxgap is not None) and (limit is not None):
raise ValueError('maxgap cannot be used together with limit')

if maxgap is None:
pass
elif not is_integer(maxgap):
raise ValueError('maxgap must be an integer')
elif maxgap < 1:
raise ValueError('maxgap must be greater than 0')

from pandas import Series
ys = Series(yvalues)

Expand All @@ -182,14 +192,40 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None,
# contain indices of NaNs at the beginning of the series, and NaNs that
# are more than 'limit' away from the prior non-NaN.

# In case that maxgap is provided, preserve_nans is derived so that
# gaps with continuous NaN values of width > maxgap will be preserved.

# set preserve_nans based on direction using _interp_limit
if limit_direction == 'forward':
preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
elif limit_direction == 'backward':
preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
if maxgap is None:
if limit_direction == 'forward':
preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
elif limit_direction == 'backward':
preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
else:
# both directions... just use _interp_limit
preserve_nans = set(_interp_limit(invalid, limit, limit))
else:
# both directions... just use _interp_limit
preserve_nans = set(_interp_limit(invalid, limit, limit))
def bfill_nan(arr):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the benefit to making this a separate closure?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no real reason. Maybe at some point I thought the function definition would make things clearer.

Should I just put the content of the function in-line starting at L350?

""" Backward-fill NaNs """
mask = np.isnan(arr)
idx = np.where(~mask, np.arange(mask.shape[0]), mask.shape[0] - 1)
idx = np.minimum.accumulate(idx[::-1], axis=0)[::-1]
out = arr[idx]
return out

# Generate array where the NaN-gap-width is filled in as value
# at each NaN location.
cumsum = np.cumsum(invalid).astype('float')
diff = np.zeros_like(yvalues)
diff[~invalid] = np.pad(np.diff(cumsum[~invalid]),
(1, 0), mode='constant')
diff[invalid] = np.nan
diff = bfill_nan(diff)
# hack to avoid having trailing NaNs in `diff`. Fill these
# with `maxgap`. Everything not larger than `maxgap` won't matter
# in the following.
diff[np.isnan(diff)] = maxgap
preserve_nans = set(np.flatnonzero((diff > maxgap) & invalid))

# if limit_area is set, add either mid or outside indices
# to preserve_nans GH #16284
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/series/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1204,6 +1204,41 @@ def test_interp_limit_to_ends(self):
limit_direction='both')
assert_series_equal(result, expected)

def test_interp_maxgap(self):
    """Check the ``maxgap`` keyword of ``Series.interpolate``.

    The fixture contains NaN runs of width 1, 2 and 3 between valid
    values, plus one leading NaN and two trailing NaNs, so that a single
    ``maxgap=2`` call exercises "gap is filled" and "gap is preserved"
    at the same time. The error cases mirror the three ``ValueError``
    checks performed on ``maxgap`` in ``interpolate_1d``.
    """
    s = Series([
        np.nan,
        1., np.nan,
        2., np.nan, np.nan,
        5., np.nan, np.nan, np.nan,
        -1., np.nan, np.nan
    ])

    # Default limit_area: the width-3 gap exceeds maxgap and stays NaN;
    # every other NaN (including the leading/trailing ones) is filled.
    expected = Series([
        1.,
        1., 1.5,
        2., 3., 4.,
        5., np.nan, np.nan, np.nan,
        -1., -1, -1
    ])

    result = s.interpolate(method='linear', maxgap=2)
    assert_series_equal(result, expected)

    # limit_area='inside': additionally keep the leading and trailing
    # NaNs, since they are not surrounded by valid values.
    expected = Series([
        np.nan,
        1., 1.5,
        2., 3., 4.,
        5., np.nan, np.nan, np.nan,
        -1., np.nan, np.nan
    ])

    result = s.interpolate(method='linear', maxgap=2, limit_area='inside')
    assert_series_equal(result, expected)

    # maxgap and limit are mutually exclusive.
    with pytest.raises(ValueError,
                       match='maxgap cannot be used together with limit'):
        s.interpolate(method='linear', maxgap=2, limit=3)

    # maxgap has to be an integer ...
    with pytest.raises(ValueError,
                       match='maxgap must be an integer'):
        s.interpolate(method='linear', maxgap=1.5)

    # ... and strictly positive.
    with pytest.raises(ValueError,
                       match='maxgap must be greater than 0'):
        s.interpolate(method='linear', maxgap=0)

def test_interp_limit_before_ends(self):
# These tests are for issue #11115 -- limit ends properly.
s = Series([np.nan, np.nan, 5, 7, np.nan, np.nan])
Expand Down