From 5ecf33195b93e0b039f1542be17b75c547dbfe9a Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 15 Oct 2020 08:49:15 -0700 Subject: [PATCH 1/3] BUG: algos.diff with datetimelike and NaT --- pandas/_libs/algos.pyx | 46 ++++++++++++++++++++++++++++++++++---- pandas/core/algorithms.py | 18 ++++++++++++--- pandas/tests/test_algos.py | 17 ++++++++++++++ 3 files changed, 74 insertions(+), 7 deletions(-) diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index c4723a5f064c7..da5ae97bb067b 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -1195,6 +1195,7 @@ ctypedef fused diff_t: ctypedef fused out_t: float32_t float64_t + int64_t @cython.boundscheck(False) @@ -1204,11 +1205,13 @@ def diff_2d( ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, + bint datetimelike=False, ): cdef: Py_ssize_t i, j, sx, sy, start, stop bint f_contig = arr.flags.f_contiguous # bint f_contig = arr.is_f_contig() # TODO(cython 3) + diff_t left, right # Disable for unsupported dtype combinations, # see https://github.com/cython/cython/issues/2646 @@ -1218,6 +1221,9 @@ def diff_2d( elif (out_t is float64_t and (diff_t is float32_t or diff_t is int8_t or diff_t is int16_t)): raise NotImplementedError + elif out_t is int64_t and diff_t is not int64_t: + # We only have out_t of int64_t if we have datetimelike + raise NotImplementedError else: # We put this inside an indented else block to avoid cython build # warnings about unreachable code @@ -1231,7 +1237,15 @@ def diff_2d( start, stop = 0, sx + periods for j in range(sy): for i in range(start, stop): - out[i, j] = arr[i, j] - arr[i - periods, j] + left = arr[i, j] + right = arr[i - periods, j] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right else: if periods >= 0: start, stop = periods, sy @@ -1239,7 +1253,15 @@ def diff_2d( start, stop = 0, sy + periods for j in range(start, stop): for i in range(sx): - out[i, j] = arr[i, j] - arr[i, j - periods] + left = arr[i, j] + right = arr[i, j - periods] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right else: if axis == 0: if periods >= 0: @@ -1248,7 +1270,15 @@ def diff_2d( start, stop = 0, sx + periods for i in range(start, stop): for j in range(sy): - out[i, j] = arr[i, j] - arr[i - periods, j] + left = arr[i, j] + right = arr[i - periods, j] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right else: if periods >= 0: start, stop = periods, sy @@ -1256,7 +1286,15 @@ def diff_2d( start, stop = 0, sy + periods for i in range(sx): for j in range(start, stop): - out[i, j] = arr[i, j] - arr[i, j - periods] + left = arr[i, j] + right = arr[i, j - periods] + if out_t is int64_t and datetimelike: + if left == NPY_NAT or right == NPY_NAT: + out[i, j] = NPY_NAT + else: + out[i, j] = left - right + else: + out[i, j] = left - right # generated from template diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d2005d46bbbf1..d003fd2d03e7c 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1908,6 +1908,8 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): if is_extension_array_dtype(dtype): if hasattr(arr, f"__{op.__name__}__"): + if axis != 0: + raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}") return op(arr, arr.shift(n)) else: warn( @@ -1922,18 +1924,26 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): is_timedelta = False is_bool = False if needs_i8_conversion(arr.dtype): - dtype = np.float64 + dtype = np.int64 arr = arr.view("i8") na = iNaT is_timedelta = True elif is_bool_dtype(dtype): + # We have to cast in order to be able to hold np.nan dtype = np.object_ is_bool = True elif is_integer_dtype(dtype): + # We have to cast in order to be able to hold np.nan dtype = np.float64 + orig_ndim = arr.ndim + if orig_ndim == 1: + # reshape so we can always use algos.diff_2d + arr = arr.reshape(-1, 1) + # TODO: require axis == 0 + dtype = np.dtype(dtype) out_arr = np.empty(arr.shape, dtype=dtype) @@ -1944,7 +1954,7 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): if arr.ndim == 2 and arr.dtype.name in _diff_special: # TODO: can diff_2d dtype specialization troubles be fixed by defining # out_arr inside diff_2d? - algos.diff_2d(arr, out_arr, n, axis) + algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta) else: # To keep mypy happy, _res_indexer is a list while res_indexer is # a tuple, ditto for lag_indexer. @@ -1978,8 +1988,10 @@ def diff(arr, n: int, axis: int = 0, stacklevel=3): out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer] if is_timedelta: - out_arr = out_arr.astype("int64").view("timedelta64[ns]") + out_arr = out_arr.view("timedelta64[ns]") + if orig_ndim == 1: + out_arr = out_arr[:, 0] return out_arr diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 28ceaa61c558f..ac59518a36753 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2405,3 +2405,20 @@ def test_index(self): dtype="timedelta64[ns]", ) tm.assert_series_equal(algos.mode(idx), exp) + + +class TestDiff: + def test_diff_datetimelike_nat(self): + # NaT - NaT is NaT, not 0 + arr = np.arange(12).astype(np.int64).view("datetime64[ns]").reshape(3, 4) + arr[:, 2] = "NaT" + result = algos.diff(arr, 1, axis=0) + + expected = np.ones(arr.shape, dtype="timedelta64[ns]") * 4 + expected[:, 2] = "NaT" + expected[0, :] = "NaT" + + tm.assert_numpy_array_equal(result, expected) + + result = algos.diff(arr.T, 1, axis=1) + tm.assert_numpy_array_equal(result, expected.T) From a5e6db4052f49bac8d605de705d31a78a02ad53d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 15 Oct 2020 15:43:06 -0700 Subject: [PATCH 2/3] older numpy compat --- pandas/tests/test_algos.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index ac59518a36753..dff2eb8dce99a 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2411,12 +2411,12 @@ class TestDiff: def test_diff_datetimelike_nat(self): # NaT - NaT is NaT, not 0 arr = np.arange(12).astype(np.int64).view("datetime64[ns]").reshape(3, 4) - arr[:, 2] = "NaT" + arr[:, 2] = np.datetime64("NaT", "ns") result = algos.diff(arr, 1, axis=0) expected = np.ones(arr.shape, dtype="timedelta64[ns]") * 4 - expected[:, 2] = "NaT" - expected[0, :] = "NaT" + expected[:, 2] = np.timedelta64("NaT", "ns") + expected[0, :] = np.timedelta64("NaT", "ns") tm.assert_numpy_array_equal(result, expected) From 2429660a99a94b71514a114caf5957d813b12df6 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 16 Oct 2020 13:43:17 -0700 Subject: [PATCH 3/3] tests --- pandas/tests/test_algos.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index dff2eb8dce99a..37d92e220e4cd 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -2408,10 +2408,11 @@ def test_index(self): class TestDiff: - def test_diff_datetimelike_nat(self): + @pytest.mark.parametrize("dtype", ["M8[ns]", "m8[ns]"]) + def test_diff_datetimelike_nat(self, dtype): # NaT - NaT is NaT, not 0 - arr = np.arange(12).astype(np.int64).view("datetime64[ns]").reshape(3, 4) - arr[:, 2] = np.datetime64("NaT", "ns") + arr = np.arange(12).astype(np.int64).view(dtype).reshape(3, 4) + arr[:, 2] = arr.dtype.type("NaT", "ns") result = algos.diff(arr, 1, axis=0) expected = np.ones(arr.shape, dtype="timedelta64[ns]") * 4 @@ -2422,3 +2423,10 @@ def test_diff_datetimelike_nat(self): result = algos.diff(arr.T, 1, axis=1) tm.assert_numpy_array_equal(result, expected.T) + + def test_diff_ea_axis(self): + dta = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")._data + + msg = "cannot diff DatetimeArray on axis=1" + with pytest.raises(ValueError, match=msg): + algos.diff(dta, 1, axis=1)