BUG: PeriodIndex + TimedeltaArray-with-NaT (pandas-dev#47783)

jbrockmendel · web-flow · commit 9f5c8b916189 · 2022-07-19T17:10:24.000-07:00
* BUG: PeriodIndex-with-NaT + TimedeltaArray

* mypy fixup

* de-kludge
diff --git a/pandas/_libs/tslibs/dtypes.pxd b/pandas/_libs/tslibs/dtypes.pxd
@@ -3,7 +3,7 @@ from numpy cimport int64_t
 from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
 
 
-cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
+cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
 cdef NPY_DATETIMEUNIT abbrev_to_npy_unit(str abbrev)
 cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil
 cpdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*) except? -1
diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi
@@ -8,6 +8,7 @@ _period_code_map: dict[str, int]
 def periods_per_day(reso: int) -> int: ...
 def periods_per_second(reso: int) -> int: ...
 def is_supported_unit(reso: int) -> bool: ...
+def npy_unit_to_abbrev(reso: int) -> str: ...
 
 class PeriodDtypeBase:
     _dtype_code: int  # PeriodDtypeCode
diff --git a/pandas/_libs/tslibs/dtypes.pyx b/pandas/_libs/tslibs/dtypes.pyx
@@ -289,7 +289,7 @@ def is_supported_unit(NPY_DATETIMEUNIT reso):
     )
 
 
-cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
+cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):
     if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
         # generic -> default to nanoseconds
         return "ns"
diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd
@@ -101,6 +101,7 @@ cpdef cnp.ndarray astype_overflowsafe(
     cnp.ndarray values,  # ndarray[datetime64[anyunit]]
     cnp.dtype dtype,  # ndarray[datetime64[anyunit]]
     bint copy=*,
+    bint round_ok=*,
 )
 cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1
 
diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi
@@ -9,7 +9,10 @@ class OutOfBoundsTimedelta(ValueError): ...
 def py_get_unit_from_dtype(dtype: np.dtype): ...
 def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ...
 def astype_overflowsafe(
-    arr: np.ndarray, dtype: np.dtype, copy: bool = ...
+    arr: np.ndarray,
+    dtype: np.dtype,
+    copy: bool = ...,
+    round_ok: bool = ...,
 ) -> np.ndarray: ...
 def is_unitless(dtype: np.dtype) -> bool: ...
 def compare_mismatched_resolutions(
diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx
@@ -282,6 +282,7 @@ cpdef ndarray astype_overflowsafe(
     ndarray values,
     cnp.dtype dtype,
     bint copy=True,
+    bint round_ok=True,
 ):
     """
     Convert an ndarray with datetime64[X] to datetime64[Y]
@@ -314,20 +315,24 @@ cpdef ndarray astype_overflowsafe(
             "datetime64/timedelta64 values and dtype must have a unit specified"
         )
 
-    if (<object>values).dtype.byteorder == ">":
-        # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap
-        values = values.astype(values.dtype.newbyteorder("<"))
-
     if from_unit == to_unit:
         # Check this before allocating result for perf, might save some memory
         if copy:
             return values.copy()
         return values
 
     elif from_unit > to_unit:
-        # e.g. ns -> us, so there is no risk of overflow, so we can use
-        #  numpy's astype safely. Note there _is_ risk of truncation.
-        return values.astype(dtype)
+        if round_ok:
+            # e.g. ns -> us, so there is no risk of overflow, so we can use
+            #  numpy's astype safely. Note there _is_ risk of truncation.
+            return values.astype(dtype)
+        else:
+            iresult2 = astype_round_check(values.view("i8"), from_unit, to_unit)
+            return iresult2.view(dtype)
+
+    if (<object>values).dtype.byteorder == ">":
+        # GH#29684 we incorrectly get OutOfBoundsDatetime if we dont swap
+        values = values.astype(values.dtype.newbyteorder("<"))
 
     cdef:
         ndarray i8values = values.view("i8")
@@ -356,10 +361,11 @@ cpdef ndarray astype_overflowsafe(
                 check_dts_bounds(&dts, to_unit)
             except OutOfBoundsDatetime as err:
                 if is_td:
-                    tdval = np.timedelta64(value).view(values.dtype)
+                    from_abbrev = np.datetime_data(values.dtype)[0]
+                    np_val = np.timedelta64(value, from_abbrev)
                     msg = (
-                        "Cannot convert {tdval} to {dtype} without overflow"
-                        .format(tdval=str(tdval), dtype=str(dtype))
+                        "Cannot convert {np_val} to {dtype} without overflow"
+                        .format(np_val=str(np_val), dtype=str(dtype))
                     )
                     raise OutOfBoundsTimedelta(msg) from err
                 else:
@@ -453,6 +459,52 @@ cdef int op_to_op_code(op):
         return Py_GT
 
 
+cdef ndarray astype_round_check(
+    ndarray i8values,
+    NPY_DATETIMEUNIT from_unit,
+    NPY_DATETIMEUNIT to_unit
+):
+    # cases with from_unit > to_unit, e.g. ns->us, raise if the conversion
+    #  involves truncation, e.g. 1500ns->1us
+    cdef:
+        Py_ssize_t i, N = i8values.size
+
+        # equiv: iresult = np.empty((<object>i8values).shape, dtype="i8")
+        ndarray iresult = cnp.PyArray_EMPTY(
+            i8values.ndim, i8values.shape, cnp.NPY_INT64, 0
+        )
+        cnp.broadcast mi = cnp.PyArray_MultiIterNew2(iresult, i8values)
+
+        # Note the arguments to_unit, from unit are swapped vs how they
+        #  are passed when going to a higher-frequency reso.
+        int64_t mult = get_conversion_factor(to_unit, from_unit)
+        int64_t value, mod
+
+    for i in range(N):
+        # Analogous to: item = i8values[i]
+        value = (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 1))[0]
+
+        if value == NPY_DATETIME_NAT:
+            new_value = NPY_DATETIME_NAT
+        else:
+            new_value, mod = divmod(value, mult)
+            if mod != 0:
+                # TODO: avoid runtime import
+                from pandas._libs.tslibs.dtypes import npy_unit_to_abbrev
+                from_abbrev = npy_unit_to_abbrev(from_unit)
+                to_abbrev = npy_unit_to_abbrev(to_unit)
+                raise ValueError(
+                    f"Cannot losslessly cast '{value} {from_abbrev}' to {to_abbrev}"
+                )
+
+        # Analogous to: iresult[i] = new_value
+        (<int64_t*>cnp.PyArray_MultiIter_DATA(mi, 0))[0] = new_value
+
+        cnp.PyArray_MultiIter_NEXT(mi)
+
+    return iresult
+
+
 @cython.overflowcheck(True)
 cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1:
     """
diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi
@@ -111,6 +111,8 @@ def to_offset(freq: timedelta | str) -> BaseOffset: ...
 
 class Tick(SingleConstructorOffset):
     _reso: int
+    _prefix: str
+    _td64_unit: str
     def __init__(self, n: int = ..., normalize: bool = ...) -> None: ...
     @property
     def delta(self) -> Timedelta: ...
diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx
@@ -796,6 +796,7 @@ cdef class SingleConstructorOffset(BaseOffset):
 cdef class Tick(SingleConstructorOffset):
     _adjust_dst = False
     _prefix = "undefined"
+    _td64_unit = "undefined"
     _attributes = tuple(["n", "normalize"])
 
     def __init__(self, n=1, normalize=False):
@@ -968,48 +969,55 @@ cdef class Tick(SingleConstructorOffset):
 cdef class Day(Tick):
     _nanos_inc = 24 * 3600 * 1_000_000_000
     _prefix = "D"
+    _td64_unit = "D"
     _period_dtype_code = PeriodDtypeCode.D
     _reso = NPY_DATETIMEUNIT.NPY_FR_D
 
 
 cdef class Hour(Tick):
     _nanos_inc = 3600 * 1_000_000_000
     _prefix = "H"
+    _td64_unit = "h"
     _period_dtype_code = PeriodDtypeCode.H
     _reso = NPY_DATETIMEUNIT.NPY_FR_h
 
 
 cdef class Minute(Tick):
     _nanos_inc = 60 * 1_000_000_000
     _prefix = "T"
+    _td64_unit = "m"
     _period_dtype_code = PeriodDtypeCode.T
     _reso = NPY_DATETIMEUNIT.NPY_FR_m
 
 
 cdef class Second(Tick):
     _nanos_inc = 1_000_000_000
     _prefix = "S"
+    _td64_unit = "s"
     _period_dtype_code = PeriodDtypeCode.S
     _reso = NPY_DATETIMEUNIT.NPY_FR_s
 
 
 cdef class Milli(Tick):
     _nanos_inc = 1_000_000
     _prefix = "L"
+    _td64_unit = "ms"
     _period_dtype_code = PeriodDtypeCode.L
     _reso = NPY_DATETIMEUNIT.NPY_FR_ms
 
 
 cdef class Micro(Tick):
     _nanos_inc = 1000
     _prefix = "U"
+    _td64_unit = "us"
     _period_dtype_code = PeriodDtypeCode.U
     _reso = NPY_DATETIMEUNIT.NPY_FR_us
 
 
 cdef class Nano(Tick):
     _nanos_inc = 1
     _prefix = "N"
+    _td64_unit = "ns"
     _period_dtype_code = PeriodDtypeCode.N
     _reso = NPY_DATETIMEUNIT.NPY_FR_ns
 
diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py
@@ -72,10 +72,7 @@
     ABCSeries,
     ABCTimedeltaArray,
 )
-from pandas.core.dtypes.missing import (
-    isna,
-    notna,
-)
+from pandas.core.dtypes.missing import notna
 
 import pandas.core.algorithms as algos
 from pandas.core.arrays import datetimelike as dtl
@@ -792,20 +789,30 @@ def _add_timedelta_arraylike(
         -------
         result : ndarray[int64]
         """
-        if not isinstance(self.freq, Tick):
+        freq = self.freq
+        if not isinstance(freq, Tick):
             # We cannot add timedelta-like to non-tick PeriodArray
             raise TypeError(
                 f"Cannot add or subtract timedelta64[ns] dtype from {self.dtype}"
             )
 
-        if not np.all(isna(other)):
-            delta = self._check_timedeltalike_freq_compat(other)
-        else:
-            # all-NaT TimedeltaIndex is equivalent to a single scalar td64 NaT
-            return self + np.timedelta64("NaT")
+        dtype = np.dtype(f"m8[{freq._td64_unit}]")
+
+        try:
+            delta = astype_overflowsafe(
+                np.asarray(other), dtype=dtype, copy=False, round_ok=False
+            )
+        except ValueError as err:
+            # TODO: not actually a great exception message in this case
+            raise raise_on_incompatible(self, other) from err
+
+        b_mask = np.isnat(delta)
 
-        ordinals = self._addsub_int_array_or_scalar(delta, operator.add).asi8
-        return type(self)(ordinals, dtype=self.dtype)
+        res_values = algos.checked_add_with_arr(
+            self.asi8, delta.view("i8"), arr_mask=self._isnan, b_mask=b_mask
+        )
+        np.putmask(res_values, self._isnan | b_mask, iNaT)
+        return type(self)(res_values, freq=self.freq)
 
     def _check_timedeltalike_freq_compat(self, other):
         """
diff --git a/pandas/tests/arithmetic/test_period.py b/pandas/tests/arithmetic/test_period.py
@@ -1243,6 +1243,21 @@ def test_parr_add_sub_tdt64_nat_array(self, box_with_array, other):
         with pytest.raises(TypeError, match=msg):
             other - obj
 
+        # some but not *all* NaT
+        other = other.copy()
+        other[0] = np.timedelta64(0, "ns")
+        expected = PeriodIndex([pi[0]] + ["NaT"] * 8, freq="19D")
+        expected = tm.box_expected(expected, box_with_array)
+
+        result = obj + other
+        tm.assert_equal(result, expected)
+        result = other + obj
+        tm.assert_equal(result, expected)
+        result = obj - other
+        tm.assert_equal(result, expected)
+        with pytest.raises(TypeError, match=msg):
+            other - obj
+
     # ---------------------------------------------------------------
     # Unsorted
 
diff --git a/pandas/tests/tslibs/test_np_datetime.py b/pandas/tests/tslibs/test_np_datetime.py
@@ -208,3 +208,15 @@ def test_astype_overflowsafe_td64(self):
         result = astype_overflowsafe(arr, dtype2)
         expected = arr.astype(dtype2)
         tm.assert_numpy_array_equal(result, expected)
+
+    def test_astype_overflowsafe_disallow_rounding(self):
+        arr = np.array([-1500, 1500], dtype="M8[ns]")
+        dtype = np.dtype("M8[us]")
+
+        msg = "Cannot losslessly cast '-1500 ns' to us"
+        with pytest.raises(ValueError, match=msg):
+            astype_overflowsafe(arr, dtype, round_ok=False)
+
+        result = astype_overflowsafe(arr, dtype, round_ok=True)
+        expected = arr.astype(dtype)
+        tm.assert_numpy_array_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -289,7 +289,7 @@ def is_supported_unit(NPY_DATETIMEUNIT reso):`
`289`	`289`	`)`
`290`	`290`
`291`	`291`
`292`		`-cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):`
	`292`	`+cpdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit):`
`293`	`293`	`if unit == NPY_DATETIMEUNIT.NPY_FR_ns or unit == NPY_DATETIMEUNIT.NPY_FR_GENERIC:`
`294`	`294`	`# generic -> default to nanoseconds`
`295`	`295`	`return "ns"`
Original file line number	Diff line number	Diff line change
`@@ -101,6 +101,7 @@ cpdef cnp.ndarray astype_overflowsafe(`
`101`	`101`	`cnp.ndarray values, # ndarray[datetime64[anyunit]]`
`102`	`102`	`cnp.dtype dtype, # ndarray[datetime64[anyunit]]`
`103`	`103`	`bint copy=*,`
	`104`	`+ bint round_ok=*,`
`104`	`105`	`)`
`105`	`106`	`cdef int64_t get_conversion_factor(NPY_DATETIMEUNIT from_unit, NPY_DATETIMEUNIT to_unit) except? -1`
`106`	`107`