diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 59cc709359a8d..4f55bd0d5e7ad 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -437,6 +437,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`) - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`) - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) +- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` where non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. 
(:issue:`57775`) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 9fef78d9f8c3d..039d868bccd16 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -314,7 +314,16 @@ def get_interp_index(method, index: Index) -> Index: # prior default from pandas import Index - index = Index(np.arange(len(index))) + if isinstance(index.dtype, DatetimeTZDtype) or lib.is_np_dtype( + index.dtype, "mM" + ): + # Convert datetime-like indexes to int64 + index = Index(index.view("i8")) + + elif not is_numeric_dtype(index.dtype): + # We keep behavior consistent with prior versions of pandas for + # non-numeric, non-datetime indexes + index = Index(range(len(index))) else: methods = {"index", "values", "nearest", "time"} is_numeric_or_datetime = ( @@ -616,6 +625,9 @@ def _interpolate_scipy_wrapper( terp = alt_methods.get(method, None) if terp is None: raise ValueError(f"Can not interpolate with method={method}.") + + # Make sure downcast is not in kwargs for alt methods + kwargs.pop("downcast", None) new_y = terp(x, y, new_x, **kwargs) return new_y diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 86d1f55f38c05..ccbe25fdae841 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -80,6 +80,7 @@ TimedeltaIndex, timedelta_range, ) +from pandas.core.reshape.concat import concat from pandas.tseries.frequencies import ( is_subperiod, @@ -885,30 +886,59 @@ def interpolate( Freq: 500ms, dtype: float64 Internal reindexing with ``asfreq()`` prior to interpolation leads to - an interpolated timeseries on the basis the reindexed timestamps (anchors). - Since not all datapoints from original series become anchors, - it can lead to misleading interpolation results as in the following example: + an interpolated timeseries on the basis of the reindexed timestamps + (anchors). 
It is ensured that all available datapoints from the original + series become anchors, so it also works for resampling cases that lead + to non-aligned timestamps, as in the following example: >>> series.resample("400ms").interpolate("linear") 2023-03-01 07:00:00.000 1.0 - 2023-03-01 07:00:00.400 1.2 - 2023-03-01 07:00:00.800 1.4 - 2023-03-01 07:00:01.200 1.6 - 2023-03-01 07:00:01.600 1.8 + 2023-03-01 07:00:00.400 0.2 + 2023-03-01 07:00:00.800 -0.6 + 2023-03-01 07:00:01.200 -0.4 + 2023-03-01 07:00:01.600 0.8 2023-03-01 07:00:02.000 2.0 - 2023-03-01 07:00:02.400 2.2 - 2023-03-01 07:00:02.800 2.4 - 2023-03-01 07:00:03.200 2.6 - 2023-03-01 07:00:03.600 2.8 + 2023-03-01 07:00:02.400 1.6 + 2023-03-01 07:00:02.800 1.2 + 2023-03-01 07:00:03.200 1.4 + 2023-03-01 07:00:03.600 2.2 2023-03-01 07:00:04.000 3.0 Freq: 400ms, dtype: float64 - Note that the series erroneously increases between two anchors + Note that the series correctly decreases between two anchors ``07:00:00`` and ``07:00:02``. """ assert downcast is lib.no_default # just checking coverage result = self._upsample("asfreq") - return result.interpolate( + + # If the original data has timestamps which are not aligned with the + # target timestamps, we need to add those points back to the data frame + # that is supposed to be interpolated. This does not work with + # PeriodIndex, so we skip this case. GH#21351 + obj = self._selected_obj + is_period_index = isinstance(obj.index, PeriodIndex) + + # Skip this step for PeriodIndex + if not is_period_index: + final_index = result.index + if isinstance(final_index, MultiIndex): + raise NotImplementedError( + "Direct interpolation of MultiIndex data frames is not " + "supported. If you tried to resample and interpolate on a " + "grouped data frame, please use:\n" + "`df.groupby(...).apply(lambda x: x.resample(...)." + "interpolate(...), include_groups=False)`" + "\ninstead, as resampling and interpolation has to be " + "performed for each group independently." 
+ ) + + missing_data_points_index = obj.index.difference(final_index) + if len(missing_data_points_index) > 0: + result = concat( + [result, obj.loc[missing_data_points_index]] + ).sort_index() + + result_interpolated = result.interpolate( method=method, axis=axis, limit=limit, @@ -919,6 +949,18 @@ def interpolate( **kwargs, ) + # No further steps if the original data has a PeriodIndex + if is_period_index: + return result_interpolated + + # Make sure that original data points which do not align with the + # resampled index are removed + result_interpolated = result_interpolated.loc[final_index] + + # Make sure frequency indexes are preserved + result_interpolated.index = final_index + return result_interpolated + @final def asfreq(self, fill_value=None): """ diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 0a9d059736e6f..cdb9ff8a67b6b 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -109,7 +109,7 @@ def test_interp_basic_with_non_range_index(self, using_infer_string): else: result = df.set_index("C").interpolate() expected = df.set_index("C") - expected.loc[3, "A"] = 3 + expected.loc[3, "A"] = 2.66667 expected.loc[5, "B"] = 9 tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py index 9cd51b95d6efd..3428abacd509e 100644 --- a/pandas/tests/resample/test_base.py +++ b/pandas/tests/resample/test_base.py @@ -25,6 +25,29 @@ from pandas.core.resample import _asfreq_compat +@pytest.fixture( + params=[ + "linear", + "time", + "index", + "values", + "nearest", + "zero", + "slinear", + "quadratic", + "cubic", + "barycentric", + "krogh", + "from_derivatives", + "piecewise_polynomial", + "pchip", + "akima", + ], +) +def all_1d_no_arg_interpolation_methods(request): + return request.param + + @pytest.mark.parametrize("freq", ["2D", "1h"]) @pytest.mark.parametrize( "index", @@ 
-91,6 +114,56 @@ def test_resample_interpolate(index): tm.assert_frame_equal(result, expected) +def test_resample_interpolate_regular_sampling_off_grid( + all_1d_no_arg_interpolation_methods, +): + pytest.importorskip("scipy") + # GH#21351 + index = date_range("2000-01-01 00:01:00", periods=5, freq="2h") + ser = Series(np.arange(5.0), index) + + method = all_1d_no_arg_interpolation_methods + # Resample to 1 hour sampling and interpolate with the given method + ser_resampled = ser.resample("1h").interpolate(method) + + # Check that none of the resampled values are NaN, except the first one + # which lies 1 minute before the first actual data point + assert np.isnan(ser_resampled.iloc[0]) + assert not ser_resampled.iloc[1:].isna().any() + + if method not in ["nearest", "zero"]: + # Check that the resampled values are close to the expected values + # except for methods with known inaccuracies + assert np.all( + np.isclose(ser_resampled.values[1:], np.arange(0.5, 4.5, 0.5), rtol=1.0e-1) + ) + + +def test_resample_interpolate_irregular_sampling(all_1d_no_arg_interpolation_methods): + pytest.importorskip("scipy") + # GH#21351 + ser = Series( + np.linspace(0.0, 1.0, 5), + index=DatetimeIndex( + [ + "2000-01-01 00:00:03", + "2000-01-01 00:00:22", + "2000-01-01 00:00:24", + "2000-01-01 00:00:31", + "2000-01-01 00:00:39", + ] + ), + ) + + # Resample to 5 second sampling and interpolate with the given method + ser_resampled = ser.resample("5s").interpolate(all_1d_no_arg_interpolation_methods) + + # Check that none of the resampled values are NaN, except the first one + # which lies 3 seconds before the first actual data point + assert np.isnan(ser_resampled.iloc[0]) + assert not ser_resampled.iloc[1:].isna().any() + + def test_raises_on_non_datetimelike_index(): # this is a non datetimelike index xp = DataFrame() diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py index 11ad9240527d5..5f5a54c4d92a3 100644 --- 
a/pandas/tests/resample/test_time_grouper.py +++ b/pandas/tests/resample/test_time_grouper.py @@ -333,26 +333,98 @@ def test_upsample_sum(method, method_args, expected_values): tm.assert_series_equal(result, expected) -def test_groupby_resample_interpolate(): +@pytest.fixture +def groupby_test_df(): + return DataFrame( + {"price": [10, 11, 9], "volume": [50, 60, 50]}, + index=date_range("01/01/2018", periods=3, freq="W", name="week_starting"), + ) + + +def test_groupby_resample_interpolate_raises(groupby_test_df): + # GH 35325 + + # Make a copy of the test data frame that has index.name=None + groupby_test_df_without_index_name = groupby_test_df.copy() + groupby_test_df_without_index_name.index.name = None + + dfs = [groupby_test_df, groupby_test_df_without_index_name] + + for df in dfs: + msg = "DataFrameGroupBy.resample operated on the grouping columns" + with tm.assert_produces_warning(DeprecationWarning, match=msg): + with pytest.raises( + NotImplementedError, + match="Direct interpolation of MultiIndex data frames is " + "not supported", + ): + df.groupby("volume").resample("1D").interpolate(method="linear") + + +def test_groupby_resample_interpolate_with_apply_syntax(groupby_test_df): # GH 35325 - d = {"price": [10, 11, 9], "volume": [50, 60, 50]} - df = DataFrame(d) + # Make a copy of the test data frame that has index.name=None + groupby_test_df_without_index_name = groupby_test_df.copy() + groupby_test_df_without_index_name.index.name = None - df["week_starting"] = date_range("01/01/2018", periods=3, freq="W") + dfs = [groupby_test_df, groupby_test_df_without_index_name] - msg = "DataFrameGroupBy.resample operated on the grouping columns" - with tm.assert_produces_warning(DeprecationWarning, match=msg): - result = ( - df.set_index("week_starting") - .groupby("volume") - .resample("1D") - .interpolate(method="linear") + for df in dfs: + result = df.groupby("volume").apply( + lambda x: x.resample("1d").interpolate(method="linear"), + include_groups=False, ) - volume = [50] * 15 + [60] - 
week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ - Timestamp("2018-01-14") + volume = [50] * 15 + [60] + week_starting = list(date_range("2018-01-07", "2018-01-21")) + [ + Timestamp("2018-01-14") + ] + expected_ind = pd.MultiIndex.from_arrays( + [volume, week_starting], + names=["volume", df.index.name], + ) + + expected = DataFrame( + data={ + "price": [ + 10.0, + 9.928571428571429, + 9.857142857142858, + 9.785714285714286, + 9.714285714285714, + 9.642857142857142, + 9.571428571428571, + 9.5, + 9.428571428571429, + 9.357142857142858, + 9.285714285714286, + 9.214285714285714, + 9.142857142857142, + 9.071428571428571, + 9.0, + 11.0, + ] + }, + index=expected_ind, + ) + tm.assert_frame_equal(result, expected) + + +def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupby_test_df): + """Similar test as test_groupby_resample_interpolate_with_apply_syntax but + with resampling that results in missing anchor points when interpolating. + See GH#21351.""" + # GH#21351 + result = groupby_test_df.groupby("volume").apply( + lambda x: x.resample("265h").interpolate(method="linear"), include_groups=False + ) + + volume = [50, 50, 60] + week_starting = [ + Timestamp("2018-01-07"), + Timestamp("2018-01-18 01:00:00"), + Timestamp("2018-01-14"), ] expected_ind = pd.MultiIndex.from_arrays( [volume, week_starting], @@ -363,24 +435,10 @@ def test_groupby_resample_interpolate(): data={ "price": [ 10.0, - 9.928571428571429, - 9.857142857142858, - 9.785714285714286, - 9.714285714285714, - 9.642857142857142, - 9.571428571428571, - 9.5, - 9.428571428571429, - 9.357142857142858, - 9.285714285714286, - 9.214285714285714, - 9.142857142857142, - 9.071428571428571, - 9.0, + 9.21131, 11.0, - ], - "volume": [50.0] * 15 + [60], + ] }, index=expected_ind, ) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected, check_names=False) diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py 
index 1008c2c87dc9e..ff7f8d0b7fa72 100644 --- a/pandas/tests/series/methods/test_interpolate.py +++ b/pandas/tests/series/methods/test_interpolate.py @@ -94,7 +94,12 @@ def test_interpolate(self, datetime_series): ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index) ts_copy = ts.copy() - ts_copy[5:10] = np.nan + + # Set data between Tuesday and Thursday to NaN for 2 consecutive weeks. + # Linear interpolation should fill in the missing values correctly, + # as the index is equally-spaced within each week. + ts_copy[1:4] = np.nan + ts_copy[6:9] = np.nan linear_interp = ts_copy.interpolate(method="linear") tm.assert_series_equal(linear_interp, ts) @@ -265,7 +270,7 @@ def test_nan_interpolate(self, kwargs): def test_nan_irregular_index(self): s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9]) result = s.interpolate() - expected = Series([1.0, 2.0, 3.0, 4.0], index=[1, 3, 5, 9]) + expected = Series([1.0, 2.0, 2.6666666666666665, 4.0], index=[1, 3, 5, 9]) tm.assert_series_equal(result, expected) def test_nan_str_index(self):