Skip to content

Commit a04a3a2

Browse files
committed
fix: new-style up-sampling interpolation for MultiIndexes resulting from groupby operations
1 parent dd8b8d3 commit a04a3a2

File tree

2 files changed

+70
-12
lines changed

2 files changed

+70
-12
lines changed

pandas/core/resample.py

+25-2
Original file line numberDiff line numberDiff line change
@@ -1091,12 +1091,35 @@ def interpolate(
10911091
# If the original data has timestamps which are not aligned with the
10921092
# target timestamps, we need to add those points back to the data frame
10931093
# that is supposed to be interpolated. This does not work with
1094-
# PeriodIndex, so we skip this case.
1094+
# PeriodIndex, so we skip this case. GH#21351
10951095
obj = self._selected_obj
10961096
is_period_index = isinstance(obj.index, PeriodIndex)
10971097

1098+
# Skip this step for PeriodIndex
10981099
if not is_period_index:
10991100
final_index = result.index
1101+
if isinstance(final_index, MultiIndex):
1102+
# MultiIndex case: `self._selected_obj` is the object as it was before
1102+
# the groupby that produced this MultiIndex, so the final index is not
1103+
# directly available from it. We reconstruct it by taking as groupby
1104+
# columns all level names of the final index except the name of the
1105+
# datetime index...
1107+
group_columns = list(
1108+
set(final_index.names).difference({obj.index.name})
1109+
)
1110+
1111+
# ... To obtain a DataFrame indexed by the groupby columns and the
1112+
# datetime index, we reset the index, group by those columns again,
1113+
# and then apply the (cheap) `first` aggregator.
1114+
obj = (
1115+
obj.reset_index().groupby(group_columns + [obj.index.name]).first()
1116+
)
1117+
1118+
# The value columns that became index levels have to be added
1119+
# back manually. This is not ideal performance-wise.
1120+
for column in group_columns:
1121+
obj[column] = obj.index.get_level_values(column)
1122+
11001123
missing_data_points_index = obj.index.difference(final_index)
11011124
if len(missing_data_points_index) > 0:
11021125
result = concat(
@@ -1120,7 +1143,7 @@ def interpolate(
11201143
return result_interpolated
11211144

11221145
result_interpolated = result_interpolated.loc[final_index]
1123-
# This is to make sure that frequency indexes are preserved
1146+
# We make sure frequency indexes are preserved
11241147
result_interpolated.index = final_index
11251148
return result_interpolated
11261149

pandas/tests/resample/test_time_grouper.py

+45-10
Original file line numberDiff line numberDiff line change
@@ -337,21 +337,19 @@ def test_upsample_sum(method, method_args, expected_values):
337337
tm.assert_series_equal(result, expected)
338338

339339

340-
def test_groupby_resample_interpolate():
341-
# GH 35325
342-
d = {"price": [10, 11, 9], "volume": [50, 60, 50]}
343-
344-
df = DataFrame(d)
345-
340+
@pytest.fixture
341+
def groupy_test_df():
342+
df = DataFrame({"price": [10, 11, 9], "volume": [50, 60, 50]})
346343
df["week_starting"] = date_range("01/01/2018", periods=3, freq="W")
344+
return df.set_index("week_starting")
347345

346+
347+
def test_groupby_resample_interpolate(groupy_test_df):
348+
# GH 35325
348349
msg = "DataFrameGroupBy.resample operated on the grouping columns"
349350
with tm.assert_produces_warning(FutureWarning, match=msg):
350351
result = (
351-
df.set_index("week_starting")
352-
.groupby("volume")
353-
.resample("1D")
354-
.interpolate(method="linear")
352+
groupy_test_df.groupby("volume").resample("1D").interpolate(method="linear")
355353
)
356354

357355
volume = [50] * 15 + [60]
@@ -388,3 +386,40 @@ def test_groupby_resample_interpolate():
388386
index=expected_ind,
389387
)
390388
tm.assert_frame_equal(result, expected)
389+
390+
391+
def test_groupby_resample_interpolate_off_grid(groupy_test_df):
392+
"""Similar test as test_groupby_resample_interpolate but with resampling
393+
that results in missing anchor points when interpolating. See GH#21351."""
394+
# GH#21351
395+
msg = "DataFrameGroupBy.resample operated on the grouping columns"
396+
with tm.assert_produces_warning(FutureWarning, match=msg):
397+
result = (
398+
groupy_test_df.groupby("volume")
399+
.resample("265H")
400+
.interpolate(method="linear")
401+
)
402+
403+
volume = [50, 50, 60]
404+
week_starting = [
405+
Timestamp("2018-01-07"),
406+
Timestamp("2018-01-18 01:00:00"),
407+
Timestamp("2018-01-14"),
408+
]
409+
expected_ind = pd.MultiIndex.from_arrays(
410+
[volume, week_starting],
411+
names=["volume", "week_starting"],
412+
)
413+
414+
expected = DataFrame(
415+
data={
416+
"price": [
417+
10.0,
418+
9.5,
419+
11.0,
420+
],
421+
"volume": np.array(volume).astype(float),
422+
},
423+
index=expected_ind,
424+
)
425+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)