Skip to content

Commit 6803e7d

Browse files
authored
BUG: DataFrame.diff with dt64 and NaTs (#36998)
1 parent cf5d6a2 commit 6803e7d

File tree

3 files changed

+82
-37
lines changed

3 files changed

+82
-37
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,7 @@ Numeric
361361
- Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`)
362362
- Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`)
363363
- Bug in :class:`IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`)
364+
- Bug in :meth:`DataFrame.diff` with ``datetime64`` dtypes where ``NaT`` values were not propagated as ``NaT`` in the result (:issue:`32441`)
364365

365366
Conversion
366367
^^^^^^^^^^

pandas/core/internals/blocks.py

+27-37
Original file line numberDiff line numberDiff line change
@@ -2117,6 +2117,32 @@ def iget(self, key):
21172117
# TODO(EA2D): this can be removed if we ever have 2D EA
21182118
return self.array_values().reshape(self.shape)[key]
21192119

2120+
def diff(self, n: int, axis: int = 0) -> List["Block"]:
    """
    Difference between this block's values and the same values shifted.

    Parameters
    ----------
    n : int
        Number of periods passed to ``shift`` before subtracting.
    axis : int, default 0
        Axis passed to ``shift``.

    Returns
    -------
    A single-element list holding a new TimeDeltaBlock placed at this
    block's ``mgr_locs.indexer``.

    Notes
    -----
    The signature mirrors ``shift`` so that ``apply`` can invoke it the
    same way.
    """
    # TODO(EA2D): reshape not necessary with 2D EAs
    arr = self.array_values().reshape(self.shape)
    shifted = arr.shift(n, axis=axis)

    delta = arr - shifted
    return [TimeDeltaBlock(delta, placement=self.mgr_locs.indexer)]
2145+
21202146
def shift(self, periods, axis=0, fill_value=None):
21212147
# TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs
21222148
values = self.array_values()
@@ -2230,6 +2256,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
22302256
internal_values = Block.internal_values
22312257
_can_hold_element = DatetimeBlock._can_hold_element
22322258
to_native_types = DatetimeBlock.to_native_types
2259+
diff = DatetimeBlock.diff
22332260
fill_value = np.datetime64("NaT", "ns")
22342261
array_values = ExtensionBlock.array_values
22352262

@@ -2301,43 +2328,6 @@ def external_values(self):
23012328
# return an object-dtype ndarray of Timestamps.
23022329
return np.asarray(self.values.astype("datetime64[ns]", copy=False))
23032330

2304-
def diff(self, n: int, axis: int = 0) -> List["Block"]:
    """
    1st discrete difference.

    Parameters
    ----------
    n : int
        Number of periods to diff.
    axis : int, default 0
        Axis to diff upon.

    Returns
    -------
    A list with a new TimeDeltaBlock.

    Raises
    ------
    NotImplementedError
        If ``axis == 0``.

    Notes
    -----
    The arguments here are mimicking shift so they are called correctly
    by apply.
    """
    if axis == 0:
        # TODO(EA2D): special case not needed with 2D EAs
        # Cannot currently calculate diff across multiple blocks since this
        # function is invoked via apply
        raise NotImplementedError

    # Deliberately no zero-filled fastpath for n == 0: that shortcut reported
    # timedelta64(0) for NaT entries, whereas diff on NaT values should give
    # NaT. Always going through the subtraction lets missing values carry
    # over into the result.
    delta = self.values - self.shift(n, axis=axis)[0].values
    new_values = delta.asi8

    # Reshape the new_values like how algos.diff does for timedelta data
    new_values = new_values.reshape(1, len(new_values))
    new_values = new_values.astype("timedelta64[ns]")
    return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]
2340-
23412331
def fillna(self, value, limit=None, inplace=False, downcast=None):
23422332
# We support filling a DatetimeTZ with a `value` whose timezone
23432333
# is different by coercing to object.

pandas/tests/frame/methods/test_diff.py

+54
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,60 @@ def test_diff(self, datetime_frame):
3939
expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)})
4040
tm.assert_frame_equal(result, expected)
4141

42+
def test_diff_timedelta64_with_nat(self):
    # GH#32441
    # 3x2 timedelta64[ns] values with the first column overwritten by NaT
    td_values = np.arange(6).reshape(3, 2).astype("timedelta64[ns]")
    td_values[:, 0] = np.timedelta64("NaT", "ns")
    frame = pd.DataFrame(td_values)

    # one period along axis 0: column 0 is compared against itself as-is
    by_rows = frame.diff(1, axis=0)
    step = pd.Timedelta(2)
    tm.assert_equal(
        by_rows, pd.DataFrame({0: frame[0], 1: [pd.NaT, step, step]})
    )

    # zero periods must match subtracting the frame from itself, and the
    # NaT column has to remain missing
    zero_periods = frame.diff(0)
    self_difference = frame - frame
    assert self_difference[0].isna().all()
    tm.assert_equal(zero_periods, self_difference)

    # negative period along axis 1 is compared against frame * NaN
    by_columns = frame.diff(-1, axis=1)
    tm.assert_equal(by_columns, frame * np.nan)
63+
64+
@pytest.mark.parametrize("tz", [None, "UTC"])
def test_diff_datetime_axis0_with_nat(self, tz):
    # GH#32441
    # single-column datetime frame (naive or tz-aware) starting with NaT
    stamps = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz)
    frame = pd.Series(stamps).to_frame()

    result = frame.diff()

    # only the last row has two valid neighbours to subtract
    deltas = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)])
    expected = pd.Series(deltas).to_frame()
    tm.assert_frame_equal(result, expected)
76+
77+
@pytest.mark.parametrize("tz", [None, "UTC"])
def test_diff_datetime_with_nat_zero_periods(self, tz):
    # diff on NaT values should give NaT, not timedelta64(0)
    dates = pd.Series(pd.date_range("2016-01-01", periods=4, tz=tz))

    # two datetime columns; the first is then blanked out with NaT
    frame = dates.to_frame()
    frame[1] = dates.copy()
    frame.iloc[:, 0] = pd.NaT

    expected = frame - frame
    assert expected[0].isna().all()

    # a zero-period diff must agree with frame - frame on either axis
    for axis in (0, 1):
        result = frame.diff(0, axis=axis)
        tm.assert_frame_equal(result, expected)
95+
4296
@pytest.mark.parametrize("tz", [None, "UTC"])
4397
def test_diff_datetime_axis0(self, tz):
4498
# GH#18578

0 commit comments

Comments
 (0)