Skip to content

Commit 3c0b59d

Browse files
committed
fix: Fix time series interpolation on resampled data for off-grid data points
Fixes pandas-dev#21351 (Time Series Interpolation is wrong). Interpolation on resampled data only worked correctly when the original data points were aligned with the resampled index, because any original data points that were not aligned with it were discarded in the resampling process. In this implementation, the missing data points are added back to the dataset before interpolation and discarded afterwards. This fix does not handle PeriodIndex.
1 parent ee4ceec commit 3c0b59d

File tree

3 files changed

+102
-2
lines changed

3 files changed

+102
-2
lines changed

pandas/core/missing.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -304,7 +304,16 @@ def get_interp_index(method, index: Index) -> Index:
304304
# prior default
305305
from pandas import Index
306306

307-
index = Index(np.arange(len(index)))
307+
if isinstance(index.dtype, DatetimeTZDtype) or lib.is_np_dtype(
308+
index.dtype, "mM"
309+
):
310+
# Convert datetime-like indexes to int64
311+
index = Index(index.view("i8"))
312+
313+
elif not is_numeric_dtype(index.dtype):
314+
# We keep behavior consistent with prior versions of pandas for
315+
# non-numeric, non-datetime indexes
316+
index = Index(np.arange(len(index)))
308317
else:
309318
methods = {"index", "values", "nearest", "time"}
310319
is_numeric_or_datetime = (

pandas/core/resample.py

+24-1
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@
8383
TimedeltaIndex,
8484
timedelta_range,
8585
)
86+
from pandas.core.reshape.concat import concat
8687

8788
from pandas.tseries.frequencies import (
8889
is_subperiod,
@@ -1085,7 +1086,23 @@ def interpolate(
10851086
"""
10861087
assert downcast is lib.no_default # just checking coverage
10871088
result = self._upsample("asfreq")
1088-
return result.interpolate(
1089+
1090+
# If the original data has timestamps which are not aligned with the
1091+
# target timestamps, we need to add those points back to the data frame
1092+
# that is supposed to be interpolated. This does not work with
1093+
# PeriodIndex, so we skip this case.
1094+
obj = self._selected_obj
1095+
is_period_index = isinstance(obj.index, PeriodIndex)
1096+
1097+
if not is_period_index:
1098+
final_index = result.index
1099+
missing_data_points_index = obj.index.difference(final_index)
1100+
if len(missing_data_points_index) > 0:
1101+
result = concat(
1102+
[result, obj.loc[missing_data_points_index]]
1103+
).sort_index()
1104+
1105+
result_interpolated = result.interpolate(
10891106
method=method,
10901107
axis=axis,
10911108
limit=limit,
@@ -1096,6 +1113,12 @@ def interpolate(
10961113
**kwargs,
10971114
)
10981115

1116+
# We make sure that original data points which do not align with the
1117+
# resampled index are removed
1118+
if is_period_index:
1119+
return result_interpolated
1120+
return result_interpolated.loc[final_index]
1121+
10991122
@final
11001123
def asfreq(self, fill_value=None):
11011124
"""

pandas/tests/resample/test_base.py

+68
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,27 @@
3636
[DATE_RANGE, PERIOD_RANGE, TIMEDELTA_RANGE],
3737
)
3838

39+
all_1d_no_arg_interpolation_methods = pytest.mark.parametrize(
40+
"method",
41+
[
42+
"linear",
43+
"time",
44+
"index",
45+
"values",
46+
"nearest",
47+
"zero",
48+
"slinear",
49+
"quadratic",
50+
"cubic",
51+
"barycentric",
52+
"krogh",
53+
"from_derivatives",
54+
"piecewise_polynomial",
55+
"pchip",
56+
"akima",
57+
],
58+
)
59+
3960

4061
@pytest.fixture
4162
def create_index(_index_factory):
@@ -90,6 +111,53 @@ def test_resample_interpolate(frame):
90111
tm.assert_frame_equal(result, expected)
91112

92113

114+
@all_1d_no_arg_interpolation_methods
115+
def test_resample_interpolate_regular_sampling_off_grid(method):
116+
# GH#21351
117+
index = date_range("2000-01-01 00:01:00", periods=5, freq="2h")
118+
ser = Series(np.arange(5.0), index)
119+
120+
# Resample to 1 hour sampling and interpolate with the given method
121+
ser_resampled = ser.resample("1h").interpolate(method)
122+
123+
# Check that none of the resampled values are NaN, except the first one
124+
# which lies 1 minute before the first actual data point
125+
assert np.isnan(ser_resampled.iloc[0])
126+
assert not ser_resampled.iloc[1:].isna().any()
127+
128+
if method not in ["nearest", "zero"]:
129+
# Check that the resampled values are close to the expected values
130+
# except for methods with known inaccuracies
131+
assert np.all(
132+
np.isclose(ser_resampled.values[1:], np.arange(0.5, 4.5, 0.5), rtol=1.0e-1)
133+
)
134+
135+
136+
@all_1d_no_arg_interpolation_methods
137+
def test_resample_interpolate_irregular_sampling(method):
138+
# GH#21351
139+
ser = Series(
140+
np.linspace(0.0, 1.0, 5),
141+
index=DatetimeIndex(
142+
[
143+
"2000-01-01 00:00:03",
144+
"2000-01-01 00:00:22",
145+
"2000-01-01 00:00:24",
146+
"2000-01-01 00:00:31",
147+
"2000-01-01 00:00:39",
148+
]
149+
),
150+
)
151+
152+
# Resample to 5 second sampling and interpolate with the given method
153+
ser_resampled = ser.resample("5s").interpolate(method)
154+
155+
# Check that none of the resampled values are NaN, except the first one
156+
# which lies 3 seconds before the first actual data point
157+
assert np.isnan(ser_resampled.iloc[0])
158+
assert not ser_resampled.iloc[1:].isna().any()
159+
160+
93161
def test_raises_on_non_datetimelike_index():
94162
# this is a non datetimelike index
95163
xp = DataFrame()

0 commit comments

Comments
 (0)