Fix/time series interpolation is wrong 21351 (#56515)

cbpygit · MarcoGorelli · mroeschke · web-flow · commit 4f7cb743533d · 2024-04-24T12:41:58.000-07:00
* fix: Fixes wrong doctest output in `pandas.core.resample.Resampler.interpolate` and the related explanation about consideration of anchor points when interpolating downsampled series with non-aligned result index.

* Resolved merge conflicts

* fix: Fixes wrong test case assumption for interpolation

Fixes assumption in `test_interp_basic_with_non_range_index`. If the index is [1, 2, 3, 5] and values are [1, 2, np.nan, 4], it is wrong to expect that interpolation will result in 3 for the missing value in case of linear interpolation. It will rather be 2.666...

* fix: Make sure frequency indexes are preserved with new interpolation approach

* fix: Fixes new-style up-sampling interpolation for MultiIndexes resulting from groupby-operations

* fix: Fixes wrong test case assumption when using linear interpolation on series with datetime index using business days only (test case `pandas.tests.series.methods.test_interpolate.TestSeriesInterpolateData.test_interpolate`).

* fix: Fixes wrong test case assumption when using linear interpolation on irregular index (test case `pandas.tests.series.methods.test_interpolate.TestSeriesInterpolateData.test_nan_irregular_index`).

* fix: Adds test skips for interpolation methods that require scipy if scipy is not installed

* fix: Makes sure the keyword argument "downcast" is not passed to scipy interpolation methods that do not use `interp1d` or spline.

* fix: Adjusted expected warning type in `test_groupby_resample_interpolate_off_grid`.

* fix: Fixes failing interpolation on groupby if the index has `name`=None. Adds this check to an existing test case.

* Trigger Actions

* feat: Raise error on attempt to interpolate a MultiIndex data frame, providing a useful error message that describes a working alternative syntax. Fixed related test cases and added test that makes sure the error is raised.

* Apply suggestions from code review

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

* refactor: Adjusted error type assertion in test case

* refactor: Removed unused parametrization definitions and switched to direct parametrization for interpolation methods in tests.

* fix: Adds forgotten "@" before pytest.mark.parametrize

* refactor: Apply suggestions from code review

* refactor: Switched to fixture params syntax for test case parametrization

* Update pandas/tests/resample/test_time_grouper.py

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

* Update pandas/tests/resample/test_base.py

Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>

* refactor: Fixes too long line

* tests: Fixes test that fails due to unimportant index name comparison

* docs: Added entry in whatsnew

* Empty-Commit

* Empty-Commit

* Empty-Commit

* docs: Sorted whatsnew

* docs: Adjusted bug fix note and moved it to the right section

---------

Co-authored-by: Marco Edward Gorelli <marcogorelli@protonmail.com>
Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -438,6 +438,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.DataFrameGroupBy.groups` and :meth:`.SeriesGroupby.groups` that would not respect groupby argument ``dropna`` (:issue:`55919`)
 - Bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
+- Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
 - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
 - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
 
diff --git a/pandas/core/missing.py b/pandas/core/missing.py
@@ -314,7 +314,16 @@ def get_interp_index(method, index: Index) -> Index:
         # prior default
         from pandas import Index
 
-        index = Index(np.arange(len(index)))
+        if isinstance(index.dtype, DatetimeTZDtype) or lib.is_np_dtype(
+            index.dtype, "mM"
+        ):
+            # Convert datetime-like indexes to int64
+            index = Index(index.view("i8"))
+
+        elif not is_numeric_dtype(index.dtype):
+            # We keep behavior consistent with prior versions of pandas for
+            # non-numeric, non-datetime indexes
+            index = Index(range(len(index)))
     else:
         methods = {"index", "values", "nearest", "time"}
         is_numeric_or_datetime = (
@@ -616,6 +625,9 @@ def _interpolate_scipy_wrapper(
         terp = alt_methods.get(method, None)
         if terp is None:
             raise ValueError(f"Can not interpolate with method={method}.")
+
+        # Make sure downcast is not in kwargs for alt methods
+        kwargs.pop("downcast", None)
         new_y = terp(x, y, new_x, **kwargs)
     return new_y
 
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -80,6 +80,7 @@
     TimedeltaIndex,
     timedelta_range,
 )
+from pandas.core.reshape.concat import concat
 
 from pandas.tseries.frequencies import (
     is_subperiod,
@@ -885,30 +886,59 @@ def interpolate(
         Freq: 500ms, dtype: float64
 
         Internal reindexing with ``asfreq()`` prior to interpolation leads to
-        an interpolated timeseries on the basis the reindexed timestamps (anchors).
-        Since not all datapoints from original series become anchors,
-        it can lead to misleading interpolation results as in the following example:
+        an interpolated timeseries on the basis of the reindexed timestamps
+        (anchors). All available datapoints from the original series are
+        guaranteed to become anchors, so this also works for resampling cases
+        that lead to non-aligned timestamps, as in the following example:
 
         >>> series.resample("400ms").interpolate("linear")
         2023-03-01 07:00:00.000    1.0
-        2023-03-01 07:00:00.400    1.2
-        2023-03-01 07:00:00.800    1.4
-        2023-03-01 07:00:01.200    1.6
-        2023-03-01 07:00:01.600    1.8
+        2023-03-01 07:00:00.400    0.2
+        2023-03-01 07:00:00.800   -0.6
+        2023-03-01 07:00:01.200   -0.4
+        2023-03-01 07:00:01.600    0.8
         2023-03-01 07:00:02.000    2.0
-        2023-03-01 07:00:02.400    2.2
-        2023-03-01 07:00:02.800    2.4
-        2023-03-01 07:00:03.200    2.6
-        2023-03-01 07:00:03.600    2.8
+        2023-03-01 07:00:02.400    1.6
+        2023-03-01 07:00:02.800    1.2
+        2023-03-01 07:00:03.200    1.4
+        2023-03-01 07:00:03.600    2.2
         2023-03-01 07:00:04.000    3.0
         Freq: 400ms, dtype: float64
 
-        Note that the series erroneously increases between two anchors
+        Note that the series correctly decreases between two anchors
         ``07:00:00`` and ``07:00:02``.
         """
         assert downcast is lib.no_default  # just checking coverage
         result = self._upsample("asfreq")
-        return result.interpolate(
+
+        # If the original data has timestamps which are not aligned with the
+        # target timestamps, we need to add those points back to the data frame
+        # that is supposed to be interpolated. This does not work with
+        # PeriodIndex, so we skip this case. GH#21351
+        obj = self._selected_obj
+        is_period_index = isinstance(obj.index, PeriodIndex)
+
+        # Skip this step for PeriodIndex
+        if not is_period_index:
+            final_index = result.index
+            if isinstance(final_index, MultiIndex):
+                raise NotImplementedError(
+                    "Direct interpolation of MultiIndex data frames is not "
+                    "supported. If you tried to resample and interpolate on a "
+                    "grouped data frame, please use:\n"
+                    "`df.groupby(...).apply(lambda x: x.resample(...)."
+                    "interpolate(...), include_groups=False)`"
+                    "\ninstead, as resampling and interpolation has to be "
+                    "performed for each group independently."
+                )
+
+            missing_data_points_index = obj.index.difference(final_index)
+            if len(missing_data_points_index) > 0:
+                result = concat(
+                    [result, obj.loc[missing_data_points_index]]
+                ).sort_index()
+
+        result_interpolated = result.interpolate(
             method=method,
             axis=axis,
             limit=limit,
@@ -919,6 +949,18 @@ def interpolate(
             **kwargs,
         )
 
+        # No further steps if the original data has a PeriodIndex
+        if is_period_index:
+            return result_interpolated
+
+        # Make sure that original data points which do not align with the
+        # resampled index are removed
+        result_interpolated = result_interpolated.loc[final_index]
+
+        # Make sure frequency indexes are preserved
+        result_interpolated.index = final_index
+        return result_interpolated
+
     @final
     def asfreq(self, fill_value=None):
         """
diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py
@@ -109,7 +109,7 @@ def test_interp_basic_with_non_range_index(self, using_infer_string):
         else:
             result = df.set_index("C").interpolate()
             expected = df.set_index("C")
-            expected.loc[3, "A"] = 3
+            expected.loc[3, "A"] = 2.66667
             expected.loc[5, "B"] = 9
             tm.assert_frame_equal(result, expected)
 
diff --git a/pandas/tests/resample/test_base.py b/pandas/tests/resample/test_base.py
@@ -25,6 +25,29 @@
 from pandas.core.resample import _asfreq_compat
 
 
+@pytest.fixture(
+    params=[
+        "linear",
+        "time",
+        "index",
+        "values",
+        "nearest",
+        "zero",
+        "slinear",
+        "quadratic",
+        "cubic",
+        "barycentric",
+        "krogh",
+        "from_derivatives",
+        "piecewise_polynomial",
+        "pchip",
+        "akima",
+    ],
+)
+def all_1d_no_arg_interpolation_methods(request):
+    return request.param
+
+
 @pytest.mark.parametrize("freq", ["2D", "1h"])
 @pytest.mark.parametrize(
     "index",
@@ -91,6 +114,56 @@ def test_resample_interpolate(index):
     tm.assert_frame_equal(result, expected)
 
 
+def test_resample_interpolate_regular_sampling_off_grid(
+    all_1d_no_arg_interpolation_methods,
+):
+    pytest.importorskip("scipy")
+    # GH#21351
+    index = date_range("2000-01-01 00:01:00", periods=5, freq="2h")
+    ser = Series(np.arange(5.0), index)
+
+    method = all_1d_no_arg_interpolation_methods
+    # Resample to 1 hour sampling and interpolate with the given method
+    ser_resampled = ser.resample("1h").interpolate(method)
+
+    # Check that none of the resampled values are NaN, except the first one
+    # which lies 1 minute before the first actual data point
+    assert np.isnan(ser_resampled.iloc[0])
+    assert not ser_resampled.iloc[1:].isna().any()
+
+    if method not in ["nearest", "zero"]:
+        # Check that the resampled values are close to the expected values
+        # except for methods with known inaccuracies
+        assert np.all(
+            np.isclose(ser_resampled.values[1:], np.arange(0.5, 4.5, 0.5), rtol=1.0e-1)
+        )
+
+
+def test_resample_interpolate_irregular_sampling(all_1d_no_arg_interpolation_methods):
+    pytest.importorskip("scipy")
+    # GH#21351
+    ser = Series(
+        np.linspace(0.0, 1.0, 5),
+        index=DatetimeIndex(
+            [
+                "2000-01-01 00:00:03",
+                "2000-01-01 00:00:22",
+                "2000-01-01 00:00:24",
+                "2000-01-01 00:00:31",
+                "2000-01-01 00:00:39",
+            ]
+        ),
+    )
+
+    # Resample to 5 second sampling and interpolate with the given method
+    ser_resampled = ser.resample("5s").interpolate(all_1d_no_arg_interpolation_methods)
+
+    # Check that none of the resampled values are NaN, except the first one
+    # which lies 3 seconds before the first actual data point
+    assert np.isnan(ser_resampled.iloc[0])
+    assert not ser_resampled.iloc[1:].isna().any()
+
+
 def test_raises_on_non_datetimelike_index():
     # this is a non datetimelike index
     xp = DataFrame()
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
@@ -333,26 +333,98 @@ def test_upsample_sum(method, method_args, expected_values):
     tm.assert_series_equal(result, expected)
 
 
-def test_groupby_resample_interpolate():
+@pytest.fixture
+def groupy_test_df():
+    return DataFrame(
+        {"price": [10, 11, 9], "volume": [50, 60, 50]},
+        index=date_range("01/01/2018", periods=3, freq="W"),
+    )
+
+
+def test_groupby_resample_interpolate_raises(groupy_test_df):
+    # GH 35325
+
+    # Make a copy of the test data frame that has index.name=None
+    groupy_test_df_without_index_name = groupy_test_df.copy()
+    groupy_test_df_without_index_name.index.name = None
+
+    dfs = [groupy_test_df, groupy_test_df_without_index_name]
+
+    for df in dfs:
+        msg = "DataFrameGroupBy.resample operated on the grouping columns"
+        with tm.assert_produces_warning(DeprecationWarning, match=msg):
+            with pytest.raises(
+                NotImplementedError,
+                match="Direct interpolation of MultiIndex data frames is "
+                "not supported",
+            ):
+                df.groupby("volume").resample("1D").interpolate(method="linear")
+
+
+def test_groupby_resample_interpolate_with_apply_syntax(groupy_test_df):
     # GH 35325
-    d = {"price": [10, 11, 9], "volume": [50, 60, 50]}
 
-    df = DataFrame(d)
+    # Make a copy of the test data frame that has index.name=None
+    groupy_test_df_without_index_name = groupy_test_df.copy()
+    groupy_test_df_without_index_name.index.name = None
 
-    df["week_starting"] = date_range("01/01/2018", periods=3, freq="W")
+    dfs = [groupy_test_df, groupy_test_df_without_index_name]
 
-    msg = "DataFrameGroupBy.resample operated on the grouping columns"
-    with tm.assert_produces_warning(DeprecationWarning, match=msg):
-        result = (
-            df.set_index("week_starting")
-            .groupby("volume")
-            .resample("1D")
-            .interpolate(method="linear")
+    for df in dfs:
+        result = df.groupby("volume").apply(
+            lambda x: x.resample("1d").interpolate(method="linear"),
+            include_groups=False,
         )
 
-    volume = [50] * 15 + [60]
-    week_starting = list(date_range("2018-01-07", "2018-01-21")) + [
-        Timestamp("2018-01-14")
+        volume = [50] * 15 + [60]
+        week_starting = list(date_range("2018-01-07", "2018-01-21")) + [
+            Timestamp("2018-01-14")
+        ]
+        expected_ind = pd.MultiIndex.from_arrays(
+            [volume, week_starting],
+            names=["volume", df.index.name],
+        )
+
+        expected = DataFrame(
+            data={
+                "price": [
+                    10.0,
+                    9.928571428571429,
+                    9.857142857142858,
+                    9.785714285714286,
+                    9.714285714285714,
+                    9.642857142857142,
+                    9.571428571428571,
+                    9.5,
+                    9.428571428571429,
+                    9.357142857142858,
+                    9.285714285714286,
+                    9.214285714285714,
+                    9.142857142857142,
+                    9.071428571428571,
+                    9.0,
+                    11.0,
+                ]
+            },
+            index=expected_ind,
+        )
+        tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_interpolate_with_apply_syntax_off_grid(groupy_test_df):
+    """Similar test as test_groupby_resample_interpolate_with_apply_syntax but
+    with resampling that results in missing anchor points when interpolating.
+    See GH#21351."""
+    # GH#21351
+    result = groupy_test_df.groupby("volume").apply(
+        lambda x: x.resample("265h").interpolate(method="linear"), include_groups=False
+    )
+
+    volume = [50, 50, 60]
+    week_starting = [
+        Timestamp("2018-01-07"),
+        Timestamp("2018-01-18 01:00:00"),
+        Timestamp("2018-01-14"),
     ]
     expected_ind = pd.MultiIndex.from_arrays(
         [volume, week_starting],
@@ -363,24 +435,10 @@ def test_groupby_resample_interpolate():
         data={
             "price": [
                 10.0,
-                9.928571428571429,
-                9.857142857142858,
-                9.785714285714286,
-                9.714285714285714,
-                9.642857142857142,
-                9.571428571428571,
-                9.5,
-                9.428571428571429,
-                9.357142857142858,
-                9.285714285714286,
-                9.214285714285714,
-                9.142857142857142,
-                9.071428571428571,
-                9.0,
+                9.21131,
                 11.0,
-            ],
-            "volume": [50.0] * 15 + [60],
+            ]
         },
         index=expected_ind,
     )
-    tm.assert_frame_equal(result, expected)
+    tm.assert_frame_equal(result, expected, check_names=False)
diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py
@@ -94,7 +94,12 @@ def test_interpolate(self, datetime_series):
         ts = Series(np.arange(len(datetime_series), dtype=float), datetime_series.index)
 
         ts_copy = ts.copy()
-        ts_copy[5:10] = np.nan
+
+        # Set data between Tuesday and Thursday to NaN for 2 consecutive weeks.
+        # Linear interpolation should fill in the missing values correctly,
+        # as the index is equally-spaced within each week.
+        ts_copy[1:4] = np.nan
+        ts_copy[6:9] = np.nan
 
         linear_interp = ts_copy.interpolate(method="linear")
         tm.assert_series_equal(linear_interp, ts)
@@ -265,7 +270,7 @@ def test_nan_interpolate(self, kwargs):
     def test_nan_irregular_index(self):
         s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9])
         result = s.interpolate()
-        expected = Series([1.0, 2.0, 3.0, 4.0], index=[1, 3, 5, 9])
+        expected = Series([1.0, 2.0, 2.6666666666666665, 4.0], index=[1, 3, 5, 9])
         tm.assert_series_equal(result, expected)
 
     def test_nan_str_index(self):