From c39abff87e30e334918143b28a13e2d4859fc79d Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 9 Jun 2022 16:35:07 -0700 Subject: [PATCH 1/2] ENH: support reso in DTA._box_func --- pandas/_libs/tslibs/conversion.pxd | 7 ++- pandas/_libs/tslibs/conversion.pyx | 17 +++++--- pandas/_libs/tslibs/timestamps.pyx | 23 ++++++++-- pandas/_libs/tslibs/tzconversion.pyx | 13 +++--- pandas/_libs/tslibs/vectorized.pyx | 4 +- pandas/core/arrays/datetimes.py | 4 +- pandas/tests/arrays/test_datetimes.py | 43 ++++++++++++++----- .../tests/scalar/timestamp/test_timestamp.py | 6 ++- 8 files changed, 84 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index ba03de6f0b81f..fb0c7d71ad58f 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -8,7 +8,10 @@ from numpy cimport ( ndarray, ) -from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct +from pandas._libs.tslibs.np_datetime cimport ( + NPY_DATETIMEUNIT, + npy_datetimestruct, +) cdef class _TSObject: @@ -31,3 +34,5 @@ cdef int64_t get_datetime64_nanos(object val) except? -1 cpdef datetime localize_pydatetime(datetime dt, tzinfo tz) cdef int64_t cast_from_unit(object ts, str unit) except? -1 cpdef (int64_t, int) precision_from_unit(str unit) + +cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso) diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 57bccd662e1a0..4e1fcbbcdcc61 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -296,14 +296,18 @@ cdef _TSObject convert_to_tsobject(object ts, tzinfo tz, str unit, raise TypeError(f'Cannot convert input [{ts}] of type {type(ts)} to ' f'Timestamp') + maybe_localize_tso(obj, tz, NPY_FR_ns) + return obj + + +cdef maybe_localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso): if tz is not None: - _localize_tso(obj, tz) + _localize_tso(obj, tz, reso) if obj.value != NPY_NAT: # check_overflows needs to run after _localize_tso - check_dts_bounds(&obj.dts) + check_dts_bounds(&obj.dts, reso) check_overflows(obj) - return obj cdef _TSObject convert_datetime_to_tsobject(datetime ts, tzinfo tz, @@ -548,7 +552,7 @@ cdef inline check_overflows(_TSObject obj): # ---------------------------------------------------------------------- # Localization -cdef inline void _localize_tso(_TSObject obj, tzinfo tz): +cdef inline void _localize_tso(_TSObject obj, tzinfo tz, NPY_DATETIMEUNIT reso): """ Given the UTC nanosecond timestamp in obj.value, find the wall-clock representation of that timestamp in the given timezone. @@ -557,6 +561,7 @@ cdef inline void _localize_tso(_TSObject obj, tzinfo tz): ---------- obj : _TSObject tz : tzinfo + reso : NPY_DATETIMEUNIT Returns ------- @@ -569,7 +574,7 @@ cdef inline void _localize_tso(_TSObject obj, tzinfo tz): cdef: int64_t local_val Py_ssize_t outpos = -1 - Localizer info = Localizer(tz, NPY_FR_ns) + Localizer info = Localizer(tz, reso) assert obj.tzinfo is None @@ -584,7 +589,7 @@ cdef inline void _localize_tso(_TSObject obj, tzinfo tz): # infer we went through a pytz path, will have outpos!=-1 tz = tz._tzinfos[tz._transition_info[outpos]] - dt64_to_dtstruct(local_val, &obj.dts) + pandas_datetime_to_datetimestruct(local_val, reso, &obj.dts) obj.tzinfo = tz diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 251211ea61651..c6bae70d04a98 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -51,6 +51,7 @@ from pandas._libs.tslibs.conversion cimport ( _TSObject, convert_datetime_to_tsobject, convert_to_tsobject, + maybe_localize_tso, ) from pandas._libs.tslibs.dtypes cimport ( npy_unit_to_abbrev, @@ -210,6 +211,23 @@ cdef class _Timestamp(ABCTimestamp): # ----------------------------------------------------------------- # Constructors + @classmethod + def _from_value_and_reso(cls, int64_t value, NPY_DATETIMEUNIT reso, tzinfo tz): + cdef: + npy_datetimestruct dts + _TSObject obj = _TSObject() + + if value == NPY_NAT: + return NaT + + obj.value = value + pandas_datetime_to_datetimestruct(value, reso, &obj.dts) + maybe_localize_tso(obj, tz, reso) + + return create_timestamp_from_ts( + value, obj.dts, tz=obj.tzinfo, freq=None, fold=obj.fold, reso=reso + ) + @classmethod def _from_dt64(cls, dt64: np.datetime64): # construct a Timestamp from a np.datetime64 object, keeping the @@ -223,10 +241,7 @@ cdef class _Timestamp(ABCTimestamp): reso = get_datetime64_unit(dt64) value = get_datetime64_value(dt64) - pandas_datetime_to_datetimestruct(value, reso, &dts) - return create_timestamp_from_ts( - value, dts, tz=None, freq=None, fold=0, reso=reso - ) + return cls._from_value_and_reso(value, reso, None) # ----------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 86cda289c80e6..7657633c7215a 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -88,15 +88,16 @@ cdef class Localizer: # NB: using floordiv here is implicitly assuming we will # never see trans or deltas that are not an integer number # of seconds. + # TODO: avoid these np.array calls if reso == NPY_DATETIMEUNIT.NPY_FR_us: - trans = trans // 1_000 - deltas = deltas // 1_000 + trans = np.array(trans) // 1_000 + deltas = np.array(deltas) // 1_000 elif reso == NPY_DATETIMEUNIT.NPY_FR_ms: - trans = trans // 1_000_000 - deltas = deltas // 1_000_000 + trans = np.array(trans) // 1_000_000 + deltas = np.array(deltas) // 1_000_000 elif reso == NPY_DATETIMEUNIT.NPY_FR_s: - trans = trans // 1_000_000_000 - deltas = deltas // 1_000_000_000 + trans = np.array(trans) // 1_000_000_000 + deltas = np.array(deltas) // 1_000_000_000 else: raise NotImplementedError(reso) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index a52823681def6..2cab55e607f15 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -43,7 +43,7 @@ from .tzconversion cimport Localizer @cython.boundscheck(False) @cython.wraparound(False) -def tz_convert_from_utc(ndarray stamps, tzinfo tz): +def tz_convert_from_utc(ndarray stamps, tzinfo tz, NPY_DATETIMEUNIT reso=NPY_FR_ns): # stamps is int64_t, arbitrary ndim """ Convert the values (in i8) from UTC to tz @@ -58,7 +58,7 @@ def tz_convert_from_utc(ndarray stamps, tzinfo tz): ndarray[int64] """ cdef: - Localizer info = Localizer(tz, reso=NPY_FR_ns) + Localizer info = Localizer(tz, reso=reso) int64_t utc_val, local_val Py_ssize_t pos, i, n = stamps.size diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index da5542feaea56..400958449d3ff 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -546,7 +546,7 @@ def _check_compatible_with(self, other, setitem: bool = False): def _box_func(self, x: np.datetime64) -> Timestamp | NaTType: # GH#42228 value = x.view("i8") - ts = Timestamp(value, tz=self.tz) + ts = Timestamp._from_value_and_reso(value, reso=self._reso, tz=self.tz) # Non-overlapping identity check (left operand type: "Timestamp", # right operand type: "NaTType") if ts is not NaT: # type: ignore[comparison-overlap] @@ -775,7 +775,7 @@ def _local_timestamps(self) -> npt.NDArray[np.int64]: if self.tz is None or timezones.is_utc(self.tz): # Avoid the copy that would be made in tzconversion return self.asi8 - return tz_convert_from_utc(self.asi8, self.tz) + return tz_convert_from_utc(self.asi8, self.tz, reso=self._reso) def tz_convert(self, tz) -> DatetimeArray: """ diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py index 14d33ee52aae2..03703dfd2eb28 100644 --- a/pandas/tests/arrays/test_datetimes.py +++ b/pandas/tests/arrays/test_datetimes.py @@ -4,6 +4,9 @@ import numpy as np import pytest +from pandas._libs.tslibs import tz_compare +from pandas._libs.tslibs.dtypes import NpyDatetimeUnit + from pandas.core.dtypes.dtypes import DatetimeTZDtype import pandas as pd @@ -20,16 +23,28 @@ def unit(self, request): @pytest.fixture def reso(self, unit): """Fixture returning datetime resolution for a given time unit""" - # TODO: avoid hard-coding - return {"s": 7, "ms": 8, "us": 9}[unit] + return { + "s": NpyDatetimeUnit.NPY_FR_s.value, + "ms": NpyDatetimeUnit.NPY_FR_ms.value, + "us": NpyDatetimeUnit.NPY_FR_us.value, + }[unit] + + @pytest.fixture + def dtype(self, unit, tz_naive_fixture): + tz = tz_naive_fixture + if tz is None: + return np.dtype(f"datetime64[{unit}]") + else: + return DatetimeTZDtype(unit=unit, tz=tz) - @pytest.mark.xfail(reason="_box_func is not yet patched to get reso right") - def test_non_nano(self, unit, reso): + def test_non_nano(self, unit, reso, dtype): arr = np.arange(5, dtype=np.int64).view(f"M8[{unit}]") - dta = DatetimeArray._simple_new(arr, dtype=arr.dtype) + dta = DatetimeArray._simple_new(arr, dtype=dtype) - assert dta.dtype == arr.dtype + assert dta.dtype == dtype assert dta[0]._reso == reso + assert tz_compare(dta.tz, dta[0].tz) + assert (dta[0] == dta[:1]).all() @pytest.mark.filterwarnings( "ignore:weekofyear and week have been deprecated:FutureWarning" @@ -37,11 +52,19 @@ def test_non_nano(self, unit, reso): @pytest.mark.parametrize( "field", DatetimeArray._field_ops + DatetimeArray._bool_ops ) - def test_fields(self, unit, reso, field): - dti = pd.date_range("2016-01-01", periods=55, freq="D") - arr = np.asarray(dti).astype(f"M8[{unit}]") + def test_fields(self, unit, reso, field, dtype): + tz = getattr(dtype, "tz", None) + dti = pd.date_range("2016-01-01", periods=55, freq="D", tz=tz) + if tz is None: + arr = np.asarray(dti).astype(f"M8[{unit}]") + else: + arr = np.asarray(dti.tz_convert("UTC").tz_localize(None)).astype( + f"M8[{unit}]" + ) - dta = DatetimeArray._simple_new(arr, dtype=arr.dtype) + dta = DatetimeArray._simple_new(arr, dtype=dtype) + + # FIXME: assert (dti == dta).all() res = getattr(dta, field) expected = getattr(dti._data, field) diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 108d58bcc251d..89e5ce2241e42 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -802,10 +802,12 @@ def test_comparison(self, dt64, ts): def test_cmp_cross_reso(self): # numpy gets this wrong because of silent overflow - dt64 = np.datetime64(106752, "D") # won't fit in M8[ns] + dt64 = np.datetime64(9223372800, "s") # won't fit in M8[ns] ts = Timestamp._from_dt64(dt64) - other = Timestamp(dt64 - 1) + # subtracting 3600*24 gives a datetime64 that _can_ fit inside the + # nanosecond implementation bounds. + other = Timestamp(dt64 - 3600 * 24) assert other < ts assert other.asm8 > ts.asm8 # <- numpy gets this wrong assert ts > other From 9e3a7c2bb60301a8496f36ecdc748b6b6c65c8d3 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 9 Jun 2022 18:11:29 -0700 Subject: [PATCH 2/2] mypy fixup --- pandas/_libs/tslibs/timestamps.pyi | 4 ++++ pandas/_libs/tslibs/vectorized.pyi | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 4be9621a594dc..fd593ae453ef7 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -59,6 +59,10 @@ class Timestamp(datetime): # While Timestamp can return pd.NaT, having the constructor return # a Union with NaTType makes things awkward for users of pandas def _set_freq(self, freq: BaseOffset | None) -> None: ... + @classmethod + def _from_value_and_reso( + cls, value: int, reso: int, tz: _tzinfo | None + ) -> Timestamp: ... @property def year(self) -> int: ... @property diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index 919457724606d..8820a17ce5996 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -37,5 +37,7 @@ def ints_to_pydatetime( box: str = ..., ) -> npt.NDArray[np.object_]: ... def tz_convert_from_utc( - stamps: npt.NDArray[np.int64], tz: tzinfo | None + stamps: npt.NDArray[np.int64], + tz: tzinfo | None, + reso: int = ..., ) -> npt.NDArray[np.int64]: ...