diff --git a/doc/source/user_guide/missing_data.rst b/doc/source/user_guide/missing_data.rst index 6c36a6470f841..37da46c1ddc17 100644 --- a/doc/source/user_guide/missing_data.rst +++ b/doc/source/user_guide/missing_data.rst @@ -339,6 +339,10 @@ Interpolation The ``limit_area`` keyword argument was added. +.. versionadded:: 1.0.0 + + The ``max_gap`` keyword argument was added. + Both Series and DataFrame objects have :meth:`~DataFrame.interpolate` that, by default, performs linear interpolation at missing data points. @@ -481,8 +485,9 @@ filled since the last valid observation: .. ipython:: python - ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, - np.nan, 13, np.nan, np.nan]) + ser = pd.Series([np.nan, np.nan, 2, np.nan, np.nan, + 3, np.nan, np.nan, np.nan, + 13, np.nan, np.nan]) ser # fill all consecutive values in a forward direction @@ -491,8 +496,24 @@ filled since the last valid observation: # fill one consecutive value in a forward direction ser.interpolate(limit=1) +If an interpolation should only be carried out for consecutive ``NaN`` values +of a certain maximum length, the ``max_gap`` keyword, introduced in v1.0.0, +can be used. Any ``NaN`` gap longer than ``max_gap`` will not be modified. +This can be useful, e.g. if an interpolation using the ``scipy`` methods +should be restricted to short NaN-gaps because the expected variation over +longer NaN-gaps forbids using interpolated values. + +.. ipython:: python + + ser + # interpolate in forward direction but only NaN-gaps with a maximum of 2 consecutive NaN values + ser.interpolate(max_gap=2) + By default, ``NaN`` values are filled in a ``forward`` direction. Use ``limit_direction`` parameter to fill ``backward`` or from ``both`` directions. +Note that for methods ``pad``, ``ffill``, ``backfill`` and ``bfill``, +``limit_direction`` must either be left unset or match the method's own +direction, as these fill methods only work in one direction. .. 
ipython:: python diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 98d861d999ea9..ad89327de2a4d 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -125,6 +125,7 @@ Other enhancements - Roundtripping DataFrames with nullable integer or string data types to parquet (:meth:`~DataFrame.to_parquet` / :func:`read_parquet`) using the `'pyarrow'` engine now preserve those data types with pyarrow >= 1.0.0 (:issue:`20612`). +- :meth:`Series.interpolate` added the ``max_gap`` keyword to limit interpolation to NaN-gaps of a certain length (:issue:`25141`) Build Changes ^^^^^^^^^^^^^ @@ -300,6 +301,7 @@ Performance improvements Bug fixes ~~~~~~~~~ +- ``limit_area`` and ``limit_direction`` now work in :meth:`Series.interpolate` if ``method`` is ``pad`` (:issue:`25141`) Categorical ^^^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 982a57a6f725e..50e3659b0858b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6894,7 +6894,9 @@ def replace( Update the data in place if possible. limit_direction : {'forward', 'backward', 'both'}, default 'forward' If limit is specified, consecutive NaNs will be filled in this - direction. + direction. If the methods 'pad' or 'ffill' are used it must be + None or 'forward'. If 'backfill' or 'bfill' are used it must be + None or 'backward'. limit_area : {`None`, 'inside', 'outside'}, default None If limit is specified, consecutive NaNs will be filled with this restriction. @@ -6906,6 +6908,13 @@ def replace( .. versionadded:: 0.23.0 + max_gap : int, optional + Maximum number of consecutive NaN values up to which a NaN-gap + will be interpolated. All longer NaN-gaps will be left unchanged. + Must be greater than 0. + + .. versionadded:: 1.0.0 + downcast : optional, 'infer' or None, defaults to None Downcast dtypes if possible. **kwargs @@ -6990,6 +6999,36 @@ def replace( 8 4.71 dtype: object + Similar to the examples above. 
Filling in ``NaN`` in a Series + by padding, but here filling only NaN-gaps smaller than a specific + gap width using the kwarg `max_gap`. + + >>> s = pd.Series([np.nan, "single_one", np.nan, + ... "fill_two_more", np.nan, np.nan, np.nan, + ... 4.71, np.nan]) + >>> s + 0 NaN + 1 single_one + 2 NaN + 3 fill_two_more + 4 NaN + 5 NaN + 6 NaN + 7 4.71 + 8 NaN + dtype: object + >>> s.interpolate(method='pad', max_gap=2) + 0 NaN + 1 single_one + 2 single_one + 3 fill_two_more + 4 NaN + 5 NaN + 6 NaN + 7 4.71 + 8 4.71 + dtype: object + Filling in ``NaN`` in a Series via polynomial interpolation or splines: Both 'polynomial' and 'spline' methods require that you also specify an ``order`` (int). @@ -7045,8 +7084,9 @@ def interpolate( axis=0, limit=None, inplace=False, - limit_direction="forward", + limit_direction=None, limit_area=None, + max_gap=None, downcast=None, **kwargs, ): @@ -7085,6 +7125,28 @@ def interpolate( "column to a numeric dtype." ) + # Set `limit_direction` depending on `method` + if (method == "pad") or (method == "ffill"): + if (limit_direction == "backward") or (limit_direction == "both"): + raise ValueError( + "`limit_direction` must not be `%s` for method `%s`" + % (limit_direction, method) + ) + else: + limit_direction = "forward" + elif (method == "backfill") or (method == "bfill"): + if (limit_direction == "forward") or (limit_direction == "both"): + raise ValueError( + "`limit_direction` must not be `%s` for method `%s`" + % (limit_direction, method) + ) + else: + limit_direction = "backward" + else: + # Set default + if limit_direction is None: + limit_direction = "forward" + # create/use the index if method == "linear": # prior default @@ -7120,6 +7182,7 @@ def interpolate( limit=limit, limit_direction=limit_direction, limit_area=limit_area, + max_gap=max_gap, inplace=inplace, downcast=downcast, **kwargs, diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 7ace80415c846..a12040f590653 100644 --- 
a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1083,6 +1083,7 @@ def interpolate( values=None, inplace=False, limit=None, + max_gap=None, limit_direction="forward", limit_area=None, fill_value=None, @@ -1117,6 +1118,8 @@ def check_int_bool(self, inplace): axis=axis, inplace=inplace, limit=limit, + max_gap=max_gap, + limit_area=limit_area, fill_value=fill_value, coerce=coerce, downcast=downcast, @@ -1133,6 +1136,7 @@ def check_int_bool(self, inplace): values=values, axis=axis, limit=limit, + max_gap=max_gap, limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, @@ -1147,6 +1151,8 @@ def _interpolate_with_fill( axis=0, inplace=False, limit=None, + max_gap=None, + limit_area=None, fill_value=None, coerce=False, downcast=None, @@ -1169,16 +1175,38 @@ def _interpolate_with_fill( # We only get here for non-ExtensionBlock fill_value = convert_scalar(self.values, fill_value) - values = missing.interpolate_2d( - values, - method=method, - axis=axis, - limit=limit, - fill_value=fill_value, - dtype=self.dtype, - ) + # We have to distinguish two cases: + # 1. When kwargs `max_gap` or `limit_area` are used: They are not + # supported by `missing.interpolate_2d()`. Using these kwargs only + # works by applying the fill along a certain axis. + # 2. All other cases: Then, `missing.interpolate_2d()` can be used. 
+ if (max_gap is not None) or (limit_area is not None): + + def func(x): + return missing.interpolate_1d_fill( + x, + method=method, + axis=axis, + limit=limit, + max_gap=max_gap, + limit_area=limit_area, + fill_value=fill_value, + dtype=self.dtype, + ) + + interp_values = np.apply_along_axis(func, axis, values) + + else: + interp_values = missing.interpolate_2d( + values, + method=method, + axis=axis, + limit=limit, + fill_value=fill_value, + dtype=self.dtype, + ) - blocks = [self.make_block_same_class(values, ndim=self.ndim)] + blocks = [self.make_block_same_class(interp_values, ndim=self.ndim)] return self._maybe_downcast(blocks, downcast) def _interpolate( @@ -1189,6 +1217,7 @@ def _interpolate( fill_value=None, axis=0, limit=None, + max_gap=None, limit_direction="forward", limit_area=None, inplace=False, @@ -1227,6 +1256,7 @@ def func(x): x, method=method, limit=limit, + max_gap=max_gap, limit_direction=limit_direction, limit_area=limit_area, fill_value=fill_value, diff --git a/pandas/core/missing.py b/pandas/core/missing.py index fc54c03c042b7..7f886b2891905 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -11,6 +11,7 @@ ensure_float64, is_datetime64_dtype, is_datetime64tz_dtype, + is_integer, is_integer_dtype, is_scalar, is_timedelta64_dtype, @@ -159,6 +160,7 @@ def interpolate_1d( yvalues, method="linear", limit=None, + max_gap=None, limit_direction="forward", limit_area=None, fill_value=None, @@ -218,40 +220,25 @@ def interpolate_1d( # default limit is unlimited GH #16282 limit = algos._validate_limit(nobs=None, limit=limit) - # These are sets of index pointers to invalid values... i.e. {0, 1, etc... 
- all_nans = set(np.flatnonzero(invalid)) - start_nans = set(range(find_valid_index(yvalues, "first"))) - end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) - mid_nans = all_nans - start_nans - end_nans - - # Like the sets above, preserve_nans contains indices of invalid values, - # but in this case, it is the final set of indices that need to be - # preserved as NaN after the interpolation. - - # For example if limit_direction='forward' then preserve_nans will - # contain indices of NaNs at the beginning of the series, and NaNs that - # are more than'limit' away from the prior non-NaN. - - # set preserve_nans based on direction using _interp_limit - if limit_direction == "forward": - preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) - elif limit_direction == "backward": - preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) - else: - # both directions... just use _interp_limit - preserve_nans = set(_interp_limit(invalid, limit, limit)) + if (max_gap is not None) and (limit is not None): + raise ValueError("max_gap cannot be used together with limit") - # if limit_area is set, add either mid or outside indices - # to preserve_nans GH #16284 - if limit_area == "inside": - # preserve NaNs on the outside - preserve_nans |= start_nans | end_nans - elif limit_area == "outside": - # preserve NaNs on the inside - preserve_nans |= mid_nans - - # sort preserve_nans and covert to list - preserve_nans = sorted(preserve_nans) + if max_gap is None: + pass + elif not is_integer(max_gap): + raise ValueError("max_gap must be an integer") + elif max_gap < 1: + raise ValueError("max_gap must be greater than 0") + + preserve_nans = _derive_indices_of_nans_to_preserve( + yvalues=yvalues, + valid=valid, + invalid=invalid, + limit=limit, + limit_area=limit_area, + limit_direction=limit_direction, + max_gap=max_gap, + ) xvalues = getattr(xvalues, "values", xvalues) yvalues = getattr(yvalues, "values", yvalues) @@ -306,6 +293,82 @@ def 
interpolate_1d( return result +def _derive_indices_of_nans_to_preserve( + yvalues, invalid, valid, limit, limit_area, limit_direction, max_gap +): + """ Derive the indices of NaNs that shall be preserved after interpolation + + This function is called by `interpolate_1d` and takes the arguments with + the same name from there. In `interpolate_1d`, after performing the + interpolation the list of indices of NaNs to preserve is used to put + NaNs in the desired locations. + + """ + + # These are sets of index pointers to invalid values... i.e. {0, 1, etc... + all_nans = set(np.flatnonzero(invalid)) + start_nans = set(range(find_valid_index(yvalues, "first"))) + end_nans = set(range(1 + find_valid_index(yvalues, "last"), len(valid))) + mid_nans = all_nans - start_nans - end_nans + + # Like the sets above, preserve_nans contains indices of invalid values, + # but in this case, it is the final set of indices that need to be + # preserved as NaN after the interpolation. + + # For example if limit_direction='forward' then preserve_nans will + # contain indices of NaNs at the beginning of the series, and NaNs that + # are more than'limit' away from the prior non-NaN. + + # In case that max_gap is provided, preserve_nans is derived so that + # gaps with continuous NaN values of width > max_gap will be preserved. + + # set preserve_nans based on direction using _interp_limit + if limit_direction == "forward": + preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + elif limit_direction == "backward": + preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) + else: + # both directions... 
just use _interp_limit + preserve_nans = set(_interp_limit(invalid, limit, limit)) + + if max_gap is not None: + + def bfill_nan(arr): + """ Backward-fill NaNs """ + mask = np.isnan(arr) + idx = np.where(~mask, np.arange(mask.shape[0]), mask.shape[0] - 1) + idx = np.minimum.accumulate(idx[::-1], axis=0)[::-1] + out = arr[idx] + return out + + # Generate array where the NaN-gap-width is filled in as value + # at each NaN location. + cumsum = np.cumsum(invalid).astype("float") + diff = np.zeros_like(yvalues, dtype="float") + diff[~invalid] = np.pad(np.diff(cumsum[~invalid]), (1, 0), mode="constant") + diff[invalid] = np.nan + diff = bfill_nan(diff) + # hack to avoid having trailing NaNs in `diff`. Fill these + # with `max_gap`. Everything smaller than `max_gap` won't matter + # in the following. + diff[np.isnan(diff)] = max_gap + preserve_nans |= set(np.flatnonzero((diff > max_gap) & invalid)) + + # if limit_area is set, add either mid or outside indices + # to preserve_nans GH #16284 + if limit_area == "inside": + # preserve NaNs on the outside + preserve_nans |= start_nans | end_nans + elif limit_area == "outside": + # preserve NaNs on the inside + preserve_nans |= mid_nans + + # sort preserve_nans and convert to list + preserve_nans = sorted(preserve_nans) + + return preserve_nans + + def _interpolate_scipy_wrapper( x, y, new_x, method, fill_value=None, bounds_error=False, order=None, **kwargs ): @@ -471,6 +534,74 @@ def _akima_interpolate(xi, yi, x, der=0, axis=0): return [P(x, nu) for nu in der] +def interpolate_1d_fill( + values, + method="pad", + axis=0, + limit=None, + max_gap=None, + limit_area=None, + fill_value=None, + dtype=None, +): + """ + This is a 1D-version of `interpolate_2d`, which is used for methods `pad` + and `backfill` when interpolating. This 1D-version is necessary to be + able to handle kwargs `max_gap` and `limit_area` via the function + `_derive_indices_of_nans_to_preserve`. 
It is used the same way as the + 1D-interpolation functions which are based on scipy-interpolation, i.e. + via np.apply_along_axis. + """ + if method == "pad": + limit_direction = "forward" + elif method == "backfill": + limit_direction = "backward" + else: + raise ValueError("`method` must be either 'pad' or 'backfill'.") + + orig_values = values + + yvalues = values + invalid = isna(yvalues) + valid = ~invalid + + if values.ndim > 1: + raise AssertionError("This only works with 1D data.") + + if fill_value is None: + mask = None + else: # todo create faster fill func without masking + mask = mask_missing(values, fill_value) + + preserve_nans = _derive_indices_of_nans_to_preserve( + yvalues=yvalues, + valid=valid, + invalid=invalid, + limit=limit, + limit_area=limit_area, + limit_direction=limit_direction, + max_gap=max_gap, + ) + + method = clean_fill_method(method) + if method == "pad": + values = pad_1d(values, limit=limit, mask=mask, dtype=dtype) + else: + values = backfill_1d(values, limit=limit, mask=mask, dtype=dtype) + + if orig_values.dtype.kind == "M": + # convert float back to datetime64 + values = values.astype(orig_values.dtype) + + # if np.issubdtype(values.dtype, np.datetime64): + # values[preserve_nans] = np.datetime64('NaT') + # else: + # values[preserve_nans] = np.nan + values[preserve_nans] = fill_value + + return values + + def interpolate_2d( values, method="pad", axis=0, limit=None, fill_value=None, dtype=None ): diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 24510ff9338ca..e8fdd51e09833 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -550,6 +550,41 @@ def test_frame_fillna_limit(self): expected.values[:3] = np.nan tm.assert_frame_equal(result, expected) + def test_frame_interp_max_gap(self): + nan = np.nan + s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) + df = pd.concat([s, s], axis=1) + + expected_s = Series( + [nan, 1.0, 
1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] + ) + expected_df = pd.concat([expected_s, expected_s], axis=1) + + result = df.interpolate(method="linear", max_gap=2) + tm.assert_frame_equal(result, expected_df) + + expected_s = Series( + [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + expected_df = pd.concat([expected_s, expected_s], axis=1) + + result = df.interpolate(method="linear", max_gap=2, limit_direction="backward") + tm.assert_frame_equal(result, expected_df) + + expected_s = Series( + [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + expected_df = pd.concat([expected_s, expected_s], axis=1) + result = df.interpolate(method="linear", max_gap=2, limit_area="inside") + tm.assert_frame_equal(result, expected_df) + + expected_s = Series( + [nan, 1.0, 1.0, 2.0, 2.0, 2.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + expected_df = pd.concat([expected_s, expected_s], axis=1) + result = df.interpolate(method="pad", max_gap=2, limit_area="inside") + tm.assert_frame_equal(result, expected_df) + def test_fillna_skip_certain_blocks(self): # don't try to fill boolean, int blocks diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 0751e1fb8b906..3b5c3b9dbc555 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1397,6 +1397,17 @@ def test_interp_limit_area(self): with pytest.raises(ValueError, match=msg): s.interpolate(method="linear", limit_area="abc") + def test_interp_limit_area_with_pad(self): + # Test for issue #26796 -- using `limit_area` with `method=pad` + s = Series([np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, np.nan, np.nan]) + expected = Series([np.nan, np.nan, 3, 3, 3, 3, 7, np.nan, np.nan]) + result = s.interpolate(method="pad", limit_area="inside") + tm.assert_series_equal(result, expected) + + expected = Series([np.nan, np.nan, 3, np.nan, np.nan, np.nan, 7, 7, 7]) + result = s.interpolate(method="pad", 
limit_area="outside") + tm.assert_series_equal(result, expected) + def test_interp_limit_direction(self): # These tests are for issue #9218 -- fill NaNs in both directions. s = Series([1, 3, np.nan, np.nan, np.nan, 11]) @@ -1422,6 +1433,40 @@ def test_interp_limit_direction(self): result = s.interpolate(method="linear", limit=1, limit_direction="both") tm.assert_series_equal(result, expected) + def test_interp_limit_direction_with_pad_error(self): + # Since `pad` forces a forward fill and `bfill` forces a backward fill + # they should not be used together with `limit_direction` + s = Series([1, 3, np.nan, np.nan, np.nan, 11]) + + with pytest.raises( + ValueError, + match="`limit_direction` must not be `backward` for method `pad`", + ): + s.interpolate(method="pad", limit=1, limit_direction="backward") + + with pytest.raises( + ValueError, + match="`limit_direction` must not be `backward` for method `ffill`", + ): + s.interpolate(method="ffill", limit=1, limit_direction="backward") + + with pytest.raises( + ValueError, match="`limit_direction` must not be `both` for method `ffill`" + ): + s.interpolate(method="ffill", limit=1, limit_direction="both") + + with pytest.raises( + ValueError, + match="`limit_direction` must not be `forward` for method `backfill`", + ): + s.interpolate(method="backfill", limit=1, limit_direction="forward") + + with pytest.raises( + ValueError, + match="`limit_direction` must not be `forward` for method `bfill`", + ): + s.interpolate(method="bfill", limit=1, limit_direction="forward") + def test_interp_limit_to_ends(self): # These test are for issue #10420 -- flow back to beginning. 
s = Series([np.nan, np.nan, 5, 7, 9, np.nan]) @@ -1434,6 +1479,57 @@ def test_interp_limit_to_ends(self): result = s.interpolate(method="linear", limit=2, limit_direction="both") tm.assert_series_equal(result, expected) + def test_interp_max_gap(self): + nan = np.nan + s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) + + expected = Series( + [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, -1, -1] + ) + result = s.interpolate(method="linear", max_gap=2) + tm.assert_series_equal(result, expected) + + expected = Series( + [1.0, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + result = s.interpolate(method="linear", max_gap=2, limit_direction="backward") + tm.assert_series_equal(result, expected) + + expected = Series( + [nan, 1.0, 1.5, 2.0, 3.0, 4.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + result = s.interpolate(method="linear", max_gap=2, limit_area="inside") + tm.assert_series_equal(result, expected) + + expected = Series( + [nan, 1.0, 1, 2.0, 2.0, 2.0, 5.0, nan, nan, nan, -1.0, nan, nan] + ) + result = s.interpolate(method="pad", max_gap=2, limit_area="inside") + tm.assert_series_equal(result, expected) + + def test_interp_max_gap_nat(self): + series = Series([0, 1, 2, iNaT], dtype="M8[ns]") + + result = series.interpolate(method="pad", max_gap=2) + expected = Series([0, 1, 2, 2], dtype="M8[ns]") + + tm.assert_series_equal(result, expected) + + def test_interp_max_gap_errors(self): + nan = np.nan + s = Series([nan, 1.0, nan, 2.0, nan, nan, 5.0, nan, nan, nan, -1.0, nan, nan]) + + with pytest.raises( + ValueError, match="max_gap cannot be used together with limit" + ): + s.interpolate(method="linear", max_gap=2, limit=3) + + with pytest.raises(ValueError, match="max_gap must be an integer"): + s.interpolate(method="linear", max_gap="foo") + + with pytest.raises(ValueError, match="max_gap must be greater than 0"): + s.interpolate(method="linear", max_gap=0) + def test_interp_limit_before_ends(self): # These 
test are for issue #11115 -- limit ends properly. s = Series([np.nan, np.nan, 5, 7, np.nan, np.nan])