diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index a17bf7c8bd6e9..9c24e31de4fd5 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -371,6 +371,7 @@ Datetimelike - Fixed bug where two :class:`DateOffset` objects with different ``normalize`` attributes could evaluate as equal (:issue:`21404`) - Fixed bug where :meth:`Timestamp.resolution` incorrectly returned 1-microsecond ``timedelta`` instead of 1-nanosecond :class:`Timedelta` (:issue:`21336`,:issue:`21365`) +- Fixed bug in :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` where null values were not filled for dtypes of ``datetime64[ns]``, ``datetime64[ns, tz]``, ``timedelta64[ns]`` (:issue:`21915`) Timedelta ^^^^^^^^^ diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 8da678e0adec0..8488584d21f65 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6097,8 +6097,11 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, raise ValueError("Only `method=linear` interpolation is supported " "on MultiIndexes.") - if _maybe_transposed_self._data.get_dtype_counts().get( - 'object') == len(_maybe_transposed_self.T): + dtype_counts = _maybe_transposed_self._data.get_dtype_counts() + if ('object' in dtype_counts and + dtype_counts.get('object') == len(_maybe_transposed_self.T)): + # Try to short-circuit transposing to avoid superfluous dimension + # errors GH#13287, GH#17539, GH#19197 raise TypeError("Cannot interpolate with all NaNs.") # create/use the index diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 208d7b8bcf8a7..159e31ab55425 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -15,6 +15,7 @@ from pandas.core.base import PandasObject +import pandas.core.dtypes.common as ct from pandas.core.dtypes.dtypes import ( ExtensionDtype, DatetimeTZDtype, PandasExtensionDtype, @@ -1158,20 +1159,19 @@ def check_int_bool(self, inplace): try: m = 
missing.clean_interp_method(method, **kwargs) except: - m = None + raise ValueError("invalid method '{0}' to interpolate." + .format(method)) - if m is not None: - r = check_int_bool(self, inplace) - if r is not None: - return r - return self._interpolate(method=m, index=index, values=values, - axis=axis, limit=limit, - limit_direction=limit_direction, - limit_area=limit_area, - fill_value=fill_value, inplace=inplace, - downcast=downcast, mgr=mgr, **kwargs) + r = check_int_bool(self, inplace) + if r is not None: + return r + return self._interpolate(method=m, index=index, values=values, + axis=axis, limit=limit, + limit_direction=limit_direction, + limit_area=limit_area, + fill_value=fill_value, inplace=inplace, + downcast=downcast, mgr=mgr, **kwargs) - raise ValueError("invalid method '{0}' to interpolate.".format(method)) def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, limit=None, fill_value=None, coerce=False, @@ -1199,6 +1199,7 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, blocks = [self.make_block_same_class(values, ndim=self.ndim)] return self._maybe_downcast(blocks, downcast) + # TODO: ignoring `values`? def _interpolate(self, method=None, index=None, values=None, fill_value=None, axis=0, limit=None, limit_direction='forward', limit_area=None, @@ -1206,13 +1207,27 @@ def _interpolate(self, method=None, index=None, values=None, """ interpolate using scipy wrappers """ inplace = validate_bool_kwarg(inplace, 'inplace') - data = self.values if inplace else self.values.copy() # only deal with floats - if not self.is_float: + if ct.needs_i8_conversion(self.dtype): + if ct.is_period_dtype(self.dtype): + raise NotImplementedError("PeriodDtype columns/Series don't " + "exist yet, but will soon. 
" + "When they do, test them!") + mask = isna(self.values) + values = self.values + + # DatetimeTZBlock.values is DatetimeIndex, need to cast/shape + values = getattr(values, 'values', values).reshape(self.shape) + data = values.astype(np.float64) + data[mask.reshape(self.shape)] = np.nan + elif not self.is_float: if not self.is_integer: return self - data = data.astype(np.float64) + data = self.values.astype(np.float64) + else: + # avoid making a copy if possible + data = self.values if inplace else self.values.copy() if fill_value is None: fill_value = self.fill_value @@ -1224,7 +1239,6 @@ def _interpolate(self, method=None, index=None, values=None, # process 1-d slices in the axis direction def func(x): - # process a 1-d slice, returning it # should the axis argument be handled below in apply_along_axis? # i.e. not an arg to missing.interpolate_1d @@ -1236,6 +1250,20 @@ def func(x): # interp each column independently interp_values = np.apply_along_axis(func, axis, data) + if ct.needs_i8_conversion(self.dtype): + # convert remaining NaNs back to NaT and cast back to own dtype + mask = isna(interp_values) + interp_values[mask] = fill_value # TODO: or self.fill_value? 
+ + # Note: we need to get to a numpy dtype (M8[ns] or m8[ns]) and + # not a pandas tz-aware dtype (for now) + dtype = self.dtype.base + assert isinstance(dtype, np.dtype) + interp_values = interp_values.astype(dtype) + if is_datetimetz(self): + # squeeze() since we expanded dimension above + held = self._holder(interp_values.squeeze(), tz='UTC') + interp_values = held.tz_convert(self.dtype.tz) blocks = [self.make_block_same_class(interp_values)] return self._maybe_downcast(blocks, downcast) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 9567c08781856..b4416432764cf 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -360,7 +360,10 @@ def test_fillna_categorical_nan(self): cat = Categorical([np.nan, 2, np.nan]) val = Categorical([np.nan, np.nan, np.nan]) df = DataFrame({"cats": cat, "vals": val}) - res = df.fillna(df.median()) + with tm.assert_produces_warning(RuntimeWarning): + # RuntimeWarning: All-NaN slice encountered + res = df.fillna(df.median()) + v_exp = [np.nan, np.nan, np.nan] df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype='category') @@ -855,3 +858,71 @@ def test_interp_ignore_all_good(self): # all good result = df[['B', 'D']].interpolate(downcast=None) assert_frame_equal(result, df[['B', 'D']]) + + @pytest.mark.parametrize('use_idx', [True, False]) + @pytest.mark.parametrize('tz', [None, 'US/Central']) + def test_interpolate_dt64_values(self, tz, use_idx): + # GH#21915 + dti = pd.date_range('2016-01-01', periods=10, tz=tz) + index = dti if use_idx else None + + # Copy to avoid corrupting dti, see GH#21907 + ser = pd.Series(dti, index=index).copy() + ser[::3] = pd.NaT + + expected = pd.Series(dti, index=index) + expected.iloc[0] = pd.NaT + expected.iloc[-1] = expected.iloc[-2] + + df = ser.to_frame() + expected = expected.to_frame() + + result = df.interpolate(method='linear') + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize('use_idx', 
[True, False]) + def test_interpolate_td64_values(self, use_idx): + # GH#21915 + tdi = pd.timedelta_range('1D', periods=10) + index = tdi if use_idx else None + + ser = pd.Series(tdi, index=index) + ser[::3] = pd.NaT + + expected = pd.Series(tdi, index=index) + expected.iloc[0] = pd.NaT + expected.iloc[-1] = expected.iloc[-2] + + df = ser.to_frame() + expected = expected.to_frame() + + result = df.interpolate(method='linear') + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize('use_idx', [True, False]) + def test_interpolate_datetimelike_and_object(self, use_idx): + # GH#21915 + # Check that dt64/td64 with more than one column doesn't get + # screwed up by .transpose() with an object column present. + dti_tz = pd.date_range('2016-01-01', periods=10, tz='US/Central') + dti_naive = pd.date_range('2016-01-01', periods=10, tz=None) + tdi = pd.timedelta_range('1D', periods=10) + objcol = list('ABCDEFGHIJ') + + index = tdi if use_idx else None + + df = pd.DataFrame({'aware': dti_tz, + 'naive': dti_naive, + 'tdi': tdi, + 'obj': objcol}, + columns=['naive', 'aware', 'tdi', 'obj'], + index=index) + + expected = df.copy() + expected.iloc[0, :-1] = pd.NaT + expected.iloc[-1, :-1] = df.iloc[-2, :-1] + + df.iloc[::3, :-1] = pd.NaT + + result = df.interpolate(method='linear') + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 2bc44cb1c683f..96f4a60a8c53f 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1317,3 +1317,38 @@ def test_series_interpolate_intraday(self): result = ts.reindex(new_index).interpolate(method='time') tm.assert_numpy_array_equal(result.values, exp.values) + + # TODO: De-duplicate with similar tests in test.frame.test_missing? 
+ @pytest.mark.parametrize('use_idx', [True, False]) + @pytest.mark.parametrize('tz', [None, 'US/Central']) + def test_interpolate_dt64_values(self, tz, use_idx): + # GH#21915 + dti = pd.date_range('2016-01-01', periods=10, tz=tz) + index = dti if use_idx else None + + # Copy to avoid corrupting dti, see GH#21907 + ser = pd.Series(dti, index=index).copy() + ser[::3] = pd.NaT + + expected = pd.Series(dti, index=index) + expected.iloc[0] = pd.NaT + expected.iloc[-1] = expected.iloc[-2] + + result = ser.interpolate(method='linear') + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize('use_idx', [True, False]) + def test_interpolate_td64_values(self, use_idx): + # GH#21915 + tdi = pd.timedelta_range('1D', periods=10) + index = tdi if use_idx else None + + ser = pd.Series(tdi, index=index) + ser[::3] = pd.NaT + + expected = pd.Series(tdi, index=index) + expected.iloc[0] = pd.NaT + expected.iloc[-1] = expected.iloc[-2] + + result = ser.interpolate(method='linear') + tm.assert_series_equal(result, expected)