Skip to content

Commit e1c5ba8

Browse files
committed
fix: Fixes new-style up-sampling interpolation for MultiIndexes resulting from groupby-operations
1 parent df86708 commit e1c5ba8

File tree

2 files changed

+70
-12
lines changed

2 files changed

+70
-12
lines changed

pandas/core/resample.py

+25-2
Original file line numberDiff line numberDiff line change
@@ -1090,12 +1090,35 @@ def interpolate(
10901090
# If the original data has timestamps which are not aligned with the
10911091
# target timestamps, we need to add those points back to the data frame
10921092
# that is supposed to be interpolated. This does not work with
1093-
# PeriodIndex, so we skip this case.
1093+
# PeriodIndex, so we skip this case. GH#21351
10941094
obj = self._selected_obj
10951095
is_period_index = isinstance(obj.index, PeriodIndex)
10961096

1097+
# Skip this step for PeriodIndex
10971098
if not is_period_index:
10981099
final_index = result.index
1100+
if isinstance(final_index, MultiIndex):
1101+
# MultiIndex case: `self._selected_obj` is the object from before
1102+
# the groupby that produced this MultiIndex, so a matching index
1103+
# is not directly available. We reconstruct it by taking the
1104+
# groupby columns from the final index names, excluding the
1105+
# name of the datetime index...
1106+
group_columns = list(
1107+
set(final_index.names).difference({obj.index.name})
1108+
)
1109+
1110+
# ... To obtain a DataFrame with the groupby columns and the
1111+
# datetime index, we need to reset the index and groupby again,
1112+
# then apply the (cheap) first-aggregator.
1113+
obj = (
1114+
obj.reset_index().groupby(group_columns + [obj.index.name]).first()
1115+
)
1116+
1117+
# The groupby columns that became index levels have to be added
1118+
# back as regular columns. This is not ideal performance-wise.
1119+
for column in group_columns:
1120+
obj[column] = obj.index.get_level_values(column)
1121+
10991122
missing_data_points_index = obj.index.difference(final_index)
11001123
if len(missing_data_points_index) > 0:
11011124
result = concat(
@@ -1119,7 +1142,7 @@ def interpolate(
11191142
return result_interpolated
11201143

11211144
result_interpolated = result_interpolated.loc[final_index]
1122-
# This is to make sure that frequency indexes are preserved
1145+
# We make sure frequency indexes are preserved
11231146
result_interpolated.index = final_index
11241147
return result_interpolated
11251148

pandas/tests/resample/test_time_grouper.py

+45-10
Original file line numberDiff line numberDiff line change
@@ -337,21 +337,19 @@ def test_upsample_sum(method, method_args, expected_values):
337337
tm.assert_series_equal(result, expected)
338338

339339

340-
def test_groupby_resample_interpolate():
341-
# GH 35325
342-
d = {"price": [10, 11, 9], "volume": [50, 60, 50]}
343-
344-
df = DataFrame(d)
345-
340+
@pytest.fixture
341+
def groupy_test_df():
342+
df = DataFrame({"price": [10, 11, 9], "volume": [50, 60, 50]})
346343
df["week_starting"] = date_range("01/01/2018", periods=3, freq="W")
344+
return df.set_index("week_starting")
347345

346+
347+
def test_groupby_resample_interpolate(groupy_test_df):
348+
# GH 35325
348349
msg = "DataFrameGroupBy.resample operated on the grouping columns"
349350
with tm.assert_produces_warning(FutureWarning, match=msg):
350351
result = (
351-
df.set_index("week_starting")
352-
.groupby("volume")
353-
.resample("1D")
354-
.interpolate(method="linear")
352+
groupy_test_df.groupby("volume").resample("1D").interpolate(method="linear")
355353
)
356354

357355
volume = [50] * 15 + [60]
@@ -388,3 +386,40 @@ def test_groupby_resample_interpolate():
388386
index=expected_ind,
389387
)
390388
tm.assert_frame_equal(result, expected)
389+
390+
391+
def test_groupby_resample_interpolate_off_grid(groupy_test_df):
392+
"""Similar test as test_groupby_resample_interpolate but with resampling
393+
that results in missing anchor points when interpolating. See GH#21351."""
394+
# GH#21351
395+
msg = "DataFrameGroupBy.resample operated on the grouping columns"
396+
with tm.assert_produces_warning(FutureWarning, match=msg):
397+
result = (
398+
groupy_test_df.groupby("volume")
399+
.resample("265H")
400+
.interpolate(method="linear")
401+
)
402+
403+
volume = [50, 50, 60]
404+
week_starting = [
405+
Timestamp("2018-01-07"),
406+
Timestamp("2018-01-18 01:00:00"),
407+
Timestamp("2018-01-14"),
408+
]
409+
expected_ind = pd.MultiIndex.from_arrays(
410+
[volume, week_starting],
411+
names=["volume", "week_starting"],
412+
)
413+
414+
expected = DataFrame(
415+
data={
416+
"price": [
417+
10.0,
418+
9.5,
419+
11.0,
420+
],
421+
"volume": np.array(volume).astype(float),
422+
},
423+
index=expected_ind,
424+
)
425+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)