fix: handle MultiIndexes resulting from groupby operations in new-style up-sampling interpolation

cbpygit · cbpygit · commit a04a3a251ba6 · 2024-01-02T13:17:42.000+01:00
diff --git a/pandas/core/resample.py b/pandas/core/resample.py
@@ -1091,12 +1091,35 @@ def interpolate(
         # If the original data has timestamps which are not aligned with the
         # target timestamps, we need to add those points back to the data frame
         # that is supposed to be interpolated. This does not work with
-        # PeriodIndex, so we skip this case.
+        # PeriodIndex, so we skip this case. GH#21351
         obj = self._selected_obj
         is_period_index = isinstance(obj.index, PeriodIndex)
 
+        # Skip this step for PeriodIndex
         if not is_period_index:
             final_index = result.index
+            if isinstance(final_index, MultiIndex):
+                # MultiIndex case: `self._selected_obj` is the object from
+                # before the groupby that produced this MultiIndex, so its
+                # index does not contain the group levels. We recover the
+                # groupby columns as the level names of the final index,
+                # excluding the name of the datetime index...
+                group_columns = list(
+                    set(final_index.names).difference({obj.index.name})
+                )
+
+                # ... To obtain a DataFrame with the groupby columns and the
+                # datetime index, we need to reset the index and groupby again,
+                # then apply the (cheap) first-aggregator.
+                obj = (
+                    obj.reset_index().groupby(group_columns + [obj.index.name]).first()
+                )
+
+                # The value columns that became index levels have to be added
+                # back manually. This is not ideal performance-wise.
+                for column in group_columns:
+                    obj[column] = obj.index.get_level_values(column)
+
             missing_data_points_index = obj.index.difference(final_index)
             if len(missing_data_points_index) > 0:
                 result = concat(
@@ -1120,7 +1143,7 @@ def interpolate(
             return result_interpolated
 
         result_interpolated = result_interpolated.loc[final_index]
-        # This is to make sure that frequency indexes are preserved
+        # We make sure frequency indexes are preserved
         result_interpolated.index = final_index
         return result_interpolated
 
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
@@ -337,21 +337,19 @@ def test_upsample_sum(method, method_args, expected_values):
     tm.assert_series_equal(result, expected)
 
 
-def test_groupby_resample_interpolate():
-    # GH 35325
-    d = {"price": [10, 11, 9], "volume": [50, 60, 50]}
-
-    df = DataFrame(d)
-
+@pytest.fixture
+def groupy_test_df():
+    df = DataFrame({"price": [10, 11, 9], "volume": [50, 60, 50]})
     df["week_starting"] = date_range("01/01/2018", periods=3, freq="W")
+    return df.set_index("week_starting")
 
+
+def test_groupby_resample_interpolate(groupy_test_df):
+    # GH 35325
     msg = "DataFrameGroupBy.resample operated on the grouping columns"
     with tm.assert_produces_warning(FutureWarning, match=msg):
         result = (
-            df.set_index("week_starting")
-            .groupby("volume")
-            .resample("1D")
-            .interpolate(method="linear")
+            groupy_test_df.groupby("volume").resample("1D").interpolate(method="linear")
         )
 
     volume = [50] * 15 + [60]
@@ -388,3 +386,40 @@ def test_groupby_resample_interpolate():
         index=expected_ind,
     )
     tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_resample_interpolate_off_grid(groupy_test_df):
+    """Like test_groupby_resample_interpolate, but the resampled grid misses
+    original data points that interpolation must still use. See GH#21351."""
+    # GH#21351
+    msg = "DataFrameGroupBy.resample operated on the grouping columns"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        result = (
+            groupy_test_df.groupby("volume")
+            .resample("265H")
+            .interpolate(method="linear")
+        )
+
+    volume = [50, 50, 60]
+    week_starting = [
+        Timestamp("2018-01-07"),
+        Timestamp("2018-01-18 01:00:00"),
+        Timestamp("2018-01-14"),
+    ]
+    expected_ind = pd.MultiIndex.from_arrays(
+        [volume, week_starting],
+        names=["volume", "week_starting"],
+    )
+
+    expected = DataFrame(
+        data={
+            "price": [
+                10.0,
+                9.5,
+                11.0,
+            ],
+            "volume": np.array(volume).astype(float),
+        },
+        index=expected_ind,
+    )
+    tm.assert_frame_equal(result, expected)