diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 0ab95dd260a9c..32e65fbe8d54c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -360,6 +360,7 @@ Numeric - Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`) - Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`) - Bug in :class:`IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`) +- Bug in :meth:`DataFrame.diff` with ``datetime64`` dtypes including ``NaT`` values failing to fill ``NaT`` results correctly (:issue:`32441`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index be105f0035447..f4b9714dd8ba1 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2079,7 +2079,7 @@ def _can_hold_element(self, element: Any) -> bool: return is_integer(element) or (is_float(element) and element.is_integer()) -class DatetimeLikeBlockMixin: +class DatetimeLikeBlockMixin(Block): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" @property @@ -2111,6 +2111,32 @@ def iget(self, key): # TODO(EA2D): this can be removed if we ever have 2D EA return self.array_values().reshape(self.shape)[key] + def diff(self, n: int, axis: int = 0) -> List["Block"]: + """ + 1st discrete difference. + + Parameters + ---------- + n : int + Number of periods to diff. + axis : int, default 0 + Axis to diff upon. + + Returns + ------- + A list with a new TimeDeltaBlock. + + Notes + ----- + The arguments here are mimicking shift so they are called correctly + by apply. + """ + # TODO(EA2D): reshape not necessary with 2D EAs + values = self.array_values().reshape(self.shape) + + new_values = values - values.shift(n, axis=axis) + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] + def shift(self, periods, axis=0, fill_value=None): # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs values = self.array_values() @@ -2118,7 +2144,7 @@ def shift(self, periods, axis=0, fill_value=None): return self.make_block_same_class(new_values) -class DatetimeBlock(DatetimeLikeBlockMixin, Block): +class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () is_datetime = True @@ -2220,6 +2246,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): internal_values = Block.internal_values _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types + diff = DatetimeBlock.diff fill_value = np.datetime64("NaT", "ns") array_values = ExtensionBlock.array_values @@ -2291,43 +2318,6 @@ def external_values(self): # return an object-dtype ndarray of Timestamps. return np.asarray(self.values.astype("datetime64[ns]", copy=False)) - def diff(self, n: int, axis: int = 0) -> List["Block"]: - """ - 1st discrete difference. - - Parameters - ---------- - n : int - Number of periods to diff. - axis : int, default 0 - Axis to diff upon. - - Returns - ------- - A list with a new TimeDeltaBlock. - - Notes - ----- - The arguments here are mimicking shift so they are called correctly - by apply. - """ - if axis == 0: - # TODO(EA2D): special case not needed with 2D EAs - # Cannot currently calculate diff across multiple blocks since this - # function is invoked via apply - raise NotImplementedError - - if n == 0: - # Fastpath avoids making a copy in `shift` - new_values = np.zeros(self.values.shape, dtype=np.int64) - else: - new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 - - # Reshape the new_values like how algos.diff does for timedelta data - new_values = new_values.reshape(1, len(new_values)) - new_values = new_values.astype("timedelta64[ns]") - return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] - def fillna(self, value, limit=None, inplace=False, downcast=None): # We support filling a DatetimeTZ with a `value` whose timezone # is different by coercing to object. diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 42586c14092f2..e160d5d24d40a 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -39,6 +39,60 @@ def test_diff(self, datetime_frame): expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}) tm.assert_frame_equal(result, expected) + def test_diff_timedelta64_with_nat(self): + # GH#32441 + arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]") + arr[:, 0] = np.timedelta64("NaT", "ns") + + df = pd.DataFrame(arr) + result = df.diff(1, axis=0) + + expected = pd.DataFrame( + {0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]} + ) + tm.assert_equal(result, expected) + + result = df.diff(0) + expected = df - df + assert expected[0].isna().all() + tm.assert_equal(result, expected) + + result = df.diff(-1, axis=1) + expected = df * np.nan + tm.assert_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis0_with_nat(self, tz): + # GH#32441 + dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz) + ser = pd.Series(dti) + + df = ser.to_frame() + + result = df.diff() + ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]) + expected = pd.Series(ex_index).to_frame() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_with_nat_zero_periods(self, tz): + # diff on NaT values should give NaT, not timedelta64(0) + dti = pd.date_range("2016-01-01", periods=4, tz=tz) + ser = pd.Series(dti) + df = ser.to_frame() + + df[1] = ser.copy() + df.iloc[:, 0] = pd.NaT + + expected = df - df + assert expected[0].isna().all() + + result = df.diff(0, axis=0) + tm.assert_frame_equal(result, expected) + + result = df.diff(0, axis=1) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_diff_datetime_axis0(self, tz): # GH#18578