From beabe38b272043114f0af8fee43dcda34a17ae99 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 8 Oct 2020 18:53:39 -0700 Subject: [PATCH 1/3] BUG: DataFrame.diff with dt64 and NaTs --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/internals/blocks.py | 68 +++++++++++-------------- pandas/tests/frame/methods/test_diff.py | 32 ++++++++++++ 3 files changed, 62 insertions(+), 39 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ae4d5ea692066..e31a164ae908c 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -360,6 +360,7 @@ Numeric - Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`) - Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`) - Bug in :class:`IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`) +- Bug in :meth:`DataFrame.diff` with ``datetime64`` dtypes including ``NaT`` values failing to fill ``NaT`` results correctly (:issue:`32441`) Conversion ^^^^^^^^^^ diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index fcc923c97cf83..e48aae231246d 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -2066,7 +2066,7 @@ def _can_hold_element(self, element: Any) -> bool: return is_integer(element) or (is_float(element) and element.is_integer()) -class DatetimeLikeBlockMixin: +class DatetimeLikeBlockMixin(Block): """Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock.""" @property @@ -2098,6 +2098,32 @@ def iget(self, key): # TODO(EA2D): this can be removed if we ever have 2D EA return self.array_values().reshape(self.shape)[key] + def diff(self, n: int, axis: int = 0) -> List["Block"]: + """ + 1st discrete difference. + + Parameters + ---------- + n : int + Number of periods to diff. + axis : int, default 0 + Axis to diff upon. + + Returns + ------- + A list with a new TimeDeltaBlock. + + Notes + ----- + The arguments here are mimicking shift so they are called correctly + by apply. + """ + # TODO(EA2D): reshape not necessary with 2D EAs + values = self.array_values().reshape(self.shape) + + new_values = values - values.shift(n, axis=axis) + return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] + def shift(self, periods, axis=0, fill_value=None): # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs values = self.array_values() @@ -2105,7 +2131,7 @@ def shift(self, periods, axis=0, fill_value=None): return self.make_block_same_class(new_values) -class DatetimeBlock(DatetimeLikeBlockMixin, Block): +class DatetimeBlock(DatetimeLikeBlockMixin): __slots__ = () is_datetime = True @@ -2211,6 +2237,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock): internal_values = Block.internal_values _can_hold_element = DatetimeBlock._can_hold_element to_native_types = DatetimeBlock.to_native_types + diff = DatetimeBlock.diff fill_value = np.datetime64("NaT", "ns") array_values = ExtensionBlock.array_values @@ -2282,43 +2309,6 @@ def external_values(self): # return an object-dtype ndarray of Timestamps. return np.asarray(self.values.astype("datetime64[ns]", copy=False)) - def diff(self, n: int, axis: int = 0) -> List["Block"]: - """ - 1st discrete difference. - - Parameters - ---------- - n : int - Number of periods to diff. - axis : int, default 0 - Axis to diff upon. - - Returns - ------- - A list with a new TimeDeltaBlock. - - Notes - ----- - The arguments here are mimicking shift so they are called correctly - by apply. - """ - if axis == 0: - # TODO(EA2D): special case not needed with 2D EAs - # Cannot currently calculate diff across multiple blocks since this - # function is invoked via apply - raise NotImplementedError - - if n == 0: - # Fastpath avoids making a copy in `shift` - new_values = np.zeros(self.values.shape, dtype=np.int64) - else: - new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8 - - # Reshape the new_values like how algos.diff does for timedelta data - new_values = new_values.reshape(1, len(new_values)) - new_values = new_values.astype("timedelta64[ns]") - return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)] - def fillna(self, value, limit=None, inplace=False, downcast=None): # We support filling a DatetimeTZ with a `value` whose timezone # is different by coercing to object. diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 42586c14092f2..9810499346929 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -39,6 +39,38 @@ def test_diff(self, datetime_frame): expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_axis0_with_nat(self, tz): + # GH#32441 + dti = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz) + ser = pd.Series(dti) + + df = ser.to_frame() + + result = df.diff() + ex_index = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)]) + expected = pd.Series(ex_index).to_frame() + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("tz", [None, "UTC"]) + def test_diff_datetime_with_nat_zero_periods(self, tz): + # diff on NaT values should give NaT, not timedelta64(0) + dti = pd.date_range("2016-01-01", periods=4, tz=tz) + ser = pd.Series(dti) + df = ser.to_frame() + + df[1] = ser.copy() + df.iloc[:, 0] = pd.NaT + + expected = df - df + assert expected[0].isna().all() + + result = df.diff(0, axis=0) + tm.assert_frame_equal(result, expected) + + result = df.diff(0, axis=1) + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_diff_datetime_axis0(self, tz): # GH#18578 From 2dccd37c6290f74f9144ab57ae9aae7bb173727d Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 9 Oct 2020 16:01:19 -0700 Subject: [PATCH 2/3] tests for td64 case --- pandas/tests/frame/methods/test_diff.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 9810499346929..4ab189f0511c8 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -39,6 +39,28 @@ def test_diff(self, datetime_frame): expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)}) tm.assert_frame_equal(result, expected) + def test_diff_timedelta64_with_nat(self): + # GH#32441 + arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]") + arr[:, 0] = "NaT" + + df = pd.DataFrame(arr) + result = df.diff(1, axis=0) + + expected = pd.DataFrame( + {0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]} + ) + tm.assert_equal(result, expected) + + result = df.diff(0) + expected = df - df + assert expected[0].isna().all() + tm.assert_equal(result, expected) + + result = df.diff(-1, axis=1) + expected = df * np.nan + tm.assert_equal(result, expected) + @pytest.mark.parametrize("tz", [None, "UTC"]) def test_diff_datetime_axis0_with_nat(self, tz): # GH#32441 From 3a56c91033b584a568bca06ae4978f9a603bcead Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 9 Oct 2020 16:26:08 -0700 Subject: [PATCH 3/3] compat --- pandas/tests/frame/methods/test_diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py index 4ab189f0511c8..e160d5d24d40a 100644 --- a/pandas/tests/frame/methods/test_diff.py +++ b/pandas/tests/frame/methods/test_diff.py @@ -42,7 +42,7 @@ def test_diff(self, datetime_frame): def test_diff_timedelta64_with_nat(self): # GH#32441 arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]") - arr[:, 0] = "NaT" + arr[:, 0] = np.timedelta64("NaT", "ns") df = pd.DataFrame(arr) result = df.diff(1, axis=0)