Skip to content

BUG: DataFrame.diff with dt64 and NaTs #36998

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Oct 10, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ Numeric
- Bug in :meth:`DataFrame.__rmatmul__` error handling reporting transposed shapes (:issue:`21581`)
- Bug in :class:`Series` flex arithmetic methods where the result when operating with a ``list``, ``tuple`` or ``np.ndarray`` would have an incorrect name (:issue:`36760`)
- Bug in :class:`IntegerArray` multiplication with ``timedelta`` and ``np.timedelta64`` objects (:issue:`36870`)
- Bug in :meth:`DataFrame.diff` with ``datetime64`` dtypes containing ``NaT`` values, where results that should be ``NaT`` were not filled correctly (:issue:`32441`)

Conversion
^^^^^^^^^^
Expand Down
68 changes: 29 additions & 39 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -2079,7 +2079,7 @@ def _can_hold_element(self, element: Any) -> bool:
return is_integer(element) or (is_float(element) and element.is_integer())


class DatetimeLikeBlockMixin:
class DatetimeLikeBlockMixin(Block):
"""Mixin class for DatetimeBlock, DatetimeTZBlock, and TimedeltaBlock."""

@property
Expand Down Expand Up @@ -2111,14 +2111,40 @@ def iget(self, key):
# TODO(EA2D): this can be removed if we ever have 2D EA
return self.array_values().reshape(self.shape)[key]

def diff(self, n: int, axis: int = 0) -> List["Block"]:
    """
    Compute the discrete difference of this block over ``n`` periods.

    Parameters
    ----------
    n : int
        Number of periods to diff.
    axis : int, default 0
        Axis to diff upon.

    Returns
    -------
    A list holding a single new TimeDeltaBlock.

    Notes
    -----
    The signature mirrors ``shift`` so that ``apply`` can call this
    method with the same arguments.
    """
    # TODO(EA2D): reshape not necessary with 2D EAs
    current = self.array_values().reshape(self.shape)
    shifted = current.shift(n, axis=axis)

    # Subtracting the shifted array from the unshifted one gives the diff.
    delta = current - shifted
    return [TimeDeltaBlock(delta, placement=self.mgr_locs.indexer)]

def shift(self, periods, axis=0, fill_value=None):
    """
    Shift the block's values by ``periods`` along ``axis``.

    ``fill_value`` and ``axis`` are forwarded to the ``shift`` method of
    the object returned by ``array_values()``; the shifted result is
    handed to ``make_block_same_class``.
    """
    # TODO(EA2D) this is unnecessary if these blocks are backed by 2D EAs
    arr = self.array_values()
    return self.make_block_same_class(
        arr.shift(periods, fill_value=fill_value, axis=axis)
    )


class DatetimeBlock(DatetimeLikeBlockMixin, Block):
class DatetimeBlock(DatetimeLikeBlockMixin):
__slots__ = ()
is_datetime = True

Expand Down Expand Up @@ -2220,6 +2246,7 @@ class DatetimeTZBlock(ExtensionBlock, DatetimeBlock):
internal_values = Block.internal_values
_can_hold_element = DatetimeBlock._can_hold_element
to_native_types = DatetimeBlock.to_native_types
diff = DatetimeBlock.diff
fill_value = np.datetime64("NaT", "ns")
array_values = ExtensionBlock.array_values

Expand Down Expand Up @@ -2291,43 +2318,6 @@ def external_values(self):
# return an object-dtype ndarray of Timestamps.
return np.asarray(self.values.astype("datetime64[ns]", copy=False))

def diff(self, n: int, axis: int = 0) -> List["Block"]:
    """
    1st discrete difference.

    Parameters
    ----------
    n : int
        Number of periods to diff.
    axis : int, default 0
        Axis to diff upon.

    Returns
    -------
    A list with a new TimeDeltaBlock.

    Raises
    ------
    NotImplementedError
        If ``axis == 0``.

    Notes
    -----
    The arguments here are mimicking shift so they are called correctly
    by apply.

    There is deliberately no ``n == 0`` fastpath: pre-filling the result
    with integer zeros reports ``timedelta64(0)`` for entries whose input
    is NaT, whereas going through the subtraction lets missing values
    propagate as NaT.
    """
    if axis == 0:
        # TODO(EA2D): special case not needed with 2D EAs
        # Cannot currently calculate diff across multiple blocks since this
        # function is invoked via apply
        raise NotImplementedError

    # Always subtract (even for n == 0) so that NaT entries are not
    # silently turned into a zero timedelta.
    new_values = (self.values - self.shift(n, axis=axis)[0].values).asi8

    # Reshape the new_values like how algos.diff does for timedelta data
    new_values = new_values.reshape(1, len(new_values))
    new_values = new_values.astype("timedelta64[ns]")
    return [TimeDeltaBlock(new_values, placement=self.mgr_locs.indexer)]

def fillna(self, value, limit=None, inplace=False, downcast=None):
# We support filling a DatetimeTZ with a `value` whose timezone
# is different by coercing to object.
Expand Down
54 changes: 54 additions & 0 deletions pandas/tests/frame/methods/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,60 @@ def test_diff(self, datetime_frame):
expected = pd.DataFrame({"x": np.nan, "y": pd.Series(1), "z": pd.Series(1)})
tm.assert_frame_equal(result, expected)

def test_diff_timedelta64_with_nat(self):
# GH#32441
arr = np.arange(6).reshape(3, 2).astype("timedelta64[ns]")
arr[:, 0] = np.timedelta64("NaT", "ns")

df = pd.DataFrame(arr)
result = df.diff(1, axis=0)

expected = pd.DataFrame(
{0: df[0], 1: [pd.NaT, pd.Timedelta(2), pd.Timedelta(2)]}
)
tm.assert_equal(result, expected)

result = df.diff(0)
expected = df - df
assert expected[0].isna().all()
tm.assert_equal(result, expected)

result = df.diff(-1, axis=1)
expected = df * np.nan
tm.assert_equal(result, expected)

@pytest.mark.parametrize("tz", [None, "UTC"])
def test_diff_datetime_axis0_with_nat(self, tz):
    """Check the default DataFrame.diff on a datetime column that starts with NaT."""
    # GH#32441
    stamps = pd.DatetimeIndex(["NaT", "2019-01-01", "2019-01-02"], tz=tz)
    frame = pd.Series(stamps).to_frame()

    result = frame.diff()

    # The first two expected entries are NaT; only the last is a one-day delta.
    deltas = pd.TimedeltaIndex([pd.NaT, pd.NaT, pd.Timedelta(days=1)])
    expected = pd.Series(deltas).to_frame()
    tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("tz", [None, "UTC"])
def test_diff_datetime_with_nat_zero_periods(self, tz):
    """Check that a zero-period diff matches ``df - df`` along both axes."""
    # diff on NaT values should give NaT, not timedelta64(0)
    ser = pd.Series(pd.date_range("2016-01-01", periods=4, tz=tz))
    frame = ser.to_frame()

    # Column 1 keeps the real timestamps; column 0 is blanked out to NaT.
    frame[1] = ser.copy()
    frame.iloc[:, 0] = pd.NaT

    expected = frame - frame
    assert expected[0].isna().all()

    for axis in (0, 1):
        result = frame.diff(0, axis=axis)
        tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("tz", [None, "UTC"])
def test_diff_datetime_axis0(self, tz):
# GH#18578
Expand Down