From 4c971b5281fd0ce1c2d20c7b5c494eda418f039f Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 22 Apr 2022 10:46:00 -0700 Subject: [PATCH 1/2] ENH: initial support for non-nano Timestamp --- pandas/_libs/tslibs/timestamps.pxd | 14 +- pandas/_libs/tslibs/timestamps.pyx | 134 ++++++++++++++++-- .../tests/scalar/timedelta/test_timedelta.py | 1 + .../tests/scalar/timestamp/test_timestamp.py | 119 ++++++++++++++++ 4 files changed, 249 insertions(+), 19 deletions(-) diff --git a/pandas/_libs/tslibs/timestamps.pxd b/pandas/_libs/tslibs/timestamps.pxd index ce4c5d07ecc53..bde7cf9328712 100644 --- a/pandas/_libs/tslibs/timestamps.pxd +++ b/pandas/_libs/tslibs/timestamps.pxd @@ -5,19 +5,26 @@ from cpython.datetime cimport ( from numpy cimport int64_t from pandas._libs.tslibs.base cimport ABCTimestamp -from pandas._libs.tslibs.np_datetime cimport npy_datetimestruct +from pandas._libs.tslibs.np_datetime cimport ( + NPY_DATETIMEUNIT, + npy_datetimestruct, +) from pandas._libs.tslibs.offsets cimport BaseOffset cdef _Timestamp create_timestamp_from_ts(int64_t value, npy_datetimestruct dts, - tzinfo tz, BaseOffset freq, bint fold) + tzinfo tz, + BaseOffset freq, + bint fold, + NPY_DATETIMEUNIT reso=*) cdef class _Timestamp(ABCTimestamp): cdef readonly: int64_t value, nanosecond BaseOffset _freq + NPY_DATETIMEUNIT _reso cdef bint _get_start_end_field(self, str field, freq) cdef _get_date_name_field(self, str field, object locale) @@ -29,5 +36,4 @@ cdef class _Timestamp(ABCTimestamp): int op) except -1 cpdef void _set_freq(self, freq) cdef _warn_on_field_deprecation(_Timestamp self, freq, str field) - -cdef int64_t normalize_i8_stamp(int64_t local_val) nogil + cdef bint _compare_mismatched_resos(_Timestamp self, _Timestamp other, int op) diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 89fe1feaef3d9..89d142fe3e5d4 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -52,6 +52,7 @@ from 
pandas._libs.tslibs.conversion cimport ( convert_datetime_to_tsobject, convert_to_tsobject, ) +from pandas._libs.tslibs.dtypes cimport npy_unit_to_abbrev from pandas._libs.tslibs.util cimport ( is_array, is_datetime64_object, @@ -72,10 +73,16 @@ from pandas._libs.tslibs.nattype cimport ( c_NaT as NaT, ) from pandas._libs.tslibs.np_datetime cimport ( + NPY_DATETIMEUNIT, + NPY_FR_ns, check_dts_bounds, + cmp_dtstructs, cmp_scalar, dt64_to_dtstruct, + get_datetime64_unit, + get_datetime64_value, npy_datetimestruct, + pandas_datetime_to_datetimestruct, pydatetime_to_dt64, ) @@ -114,24 +121,39 @@ _no_input = object() # ---------------------------------------------------------------------- -cdef inline _Timestamp create_timestamp_from_ts(int64_t value, - npy_datetimestruct dts, - tzinfo tz, BaseOffset freq, bint fold): +cdef inline _Timestamp create_timestamp_from_ts( + int64_t value, + npy_datetimestruct dts, + tzinfo tz, + BaseOffset freq, + bint fold, + NPY_DATETIMEUNIT reso=NPY_FR_ns, +): """ convenience routine to construct a Timestamp from its parts """ - cdef _Timestamp ts_base + cdef: + _Timestamp ts_base + ts_base = _Timestamp.__new__(Timestamp, dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold) ts_base.value = value ts_base._freq = freq ts_base.nanosecond = dts.ps // 1000 + ts_base._reso = reso return ts_base -def _unpickle_timestamp(value, freq, tz): +def _unpickle_timestamp(value, freq, tz, reso): # GH#41949 dont warn on unpickle if we have a freq - ts = Timestamp(value, tz=tz) + if reso == NPY_FR_ns: + ts = Timestamp(value, tz=tz) + else: + if tz is not None: + raise NotImplementedError + abbrev = npy_unit_to_abbrev(reso) + dt64 = np.datetime64(value, abbrev) + ts = Timestamp._from_dt64(dt64) ts._set_freq(freq) return ts @@ -177,12 +199,36 @@ cdef class _Timestamp(ABCTimestamp): ) return self._freq + # ----------------------------------------------------------------- + # Constructors + + @classmethod + def _from_dt64(cls, dt64: 
np.datetime64): + # construct a Timestamp from a np.datetime64 object, keeping the + # resolution of the input. + # This is here mainly so we can incrementally implement non-nano + # (e.g. only tznaive at first) + cdef: + npy_datetimestruct dts + int64_t value + NPY_DATETIMEUNIT reso + + reso = get_datetime64_unit(dt64) + value = get_datetime64_value(dt64) + pandas_datetime_to_datetimestruct(value, reso, &dts) + return create_timestamp_from_ts( + value, dts, tz=None, freq=None, fold=0, reso=reso + ) + + # ----------------------------------------------------------------- + def __hash__(_Timestamp self): if self.nanosecond: return hash(self.value) if self.fold: return datetime.__hash__(self.replace(fold=0)) return datetime.__hash__(self) + # TODO(non-nano): what if we are out of bounds for pydatetime? def __richcmp__(_Timestamp self, object other, int op): cdef: @@ -193,17 +239,16 @@ cdef class _Timestamp(ABCTimestamp): ots = other elif other is NaT: return op == Py_NE - elif PyDateTime_Check(other) or is_datetime64_object(other): - if self.nanosecond == 0 and PyDateTime_Check(other): + elif is_datetime64_object(other): + ots = _Timestamp._from_dt64(other) + elif PyDateTime_Check(other): + if self.nanosecond == 0: val = self.to_pydatetime() return PyObject_RichCompareBool(val, other, op) try: ots = type(self)(other) except ValueError: - if is_datetime64_object(other): - # cast non-nano dt64 to pydatetime - other = other.astype(object) return self._compare_outside_nanorange(other, op) elif is_array(other): @@ -253,7 +298,21 @@ cdef class _Timestamp(ABCTimestamp): raise TypeError( "Cannot compare tz-naive and tz-aware timestamps" ) - return cmp_scalar(self.value, ots.value, op) + if self._reso == ots._reso: + return cmp_scalar(self.value, ots.value, op) + return self._compare_mismatched_resos(ots, op) + + # TODO: copied from Timedelta; try to de-duplicate + cdef inline bint _compare_mismatched_resos(self, _Timestamp other, int op): + # Can't just dispatch to numpy as 
they silently overflow and get it wrong + cdef: + npy_datetimestruct dts_self + npy_datetimestruct dts_other + + # dispatch to the datetimestruct utils instead of writing new ones! + pandas_datetime_to_datetimestruct(self.value, self._reso, &dts_self) + pandas_datetime_to_datetimestruct(other.value, other._reso, &dts_other) + return cmp_dtstructs(&dts_self, &dts_other, op) cdef bint _compare_outside_nanorange(_Timestamp self, datetime other, int op) except -1: @@ -286,6 +345,9 @@ cdef class _Timestamp(ABCTimestamp): cdef: int64_t nanos = 0 + if isinstance(self, _Timestamp) and self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + if is_any_td_scalar(other): nanos = delta_to_nanoseconds(other) try: @@ -325,6 +387,8 @@ cdef class _Timestamp(ABCTimestamp): return NotImplemented def __sub__(self, other): + if isinstance(self, _Timestamp) and self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) if is_any_td_scalar(other) or is_integer_object(other): neg_other = -other @@ -387,6 +451,9 @@ cdef class _Timestamp(ABCTimestamp): return NotImplemented def __rsub__(self, other): + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + if PyDateTime_Check(other): try: return type(self)(other) - self @@ -420,6 +487,9 @@ cdef class _Timestamp(ABCTimestamp): ndarray[uint8_t, cast=True] out int month_kw + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + if freq: kwds = freq.kwds month_kw = kwds.get('startingMonth', kwds.get('month', 12)) @@ -589,6 +659,9 @@ cdef class _Timestamp(ABCTimestamp): int64_t val object[:] out + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + val = self._maybe_convert_value_to_local() out = get_date_name_field(np.array([val], dtype=np.int64), field, locale=locale) @@ -741,6 +814,9 @@ cdef class _Timestamp(ABCTimestamp): local_val = self._maybe_convert_value_to_local() int64_t normalized + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + normalized = 
normalize_i8_stamp(local_val) return Timestamp(normalized).tz_localize(self.tzinfo) @@ -758,8 +834,16 @@ cdef class _Timestamp(ABCTimestamp): self._freq = state[1] self.tzinfo = state[2] + if len(state) == 3: + # pre-non-nano pickle + reso = NPY_FR_ns + assert False # checking for coverage + else: + reso = state[4] + self._reso = reso + def __reduce__(self): - object_state = self.value, self._freq, self.tzinfo + object_state = self.value, self._freq, self.tzinfo, self._reso return (_unpickle_timestamp, object_state) # ----------------------------------------------------------------- @@ -886,7 +970,7 @@ cdef class _Timestamp(ABCTimestamp): >>> ts.asm8 numpy.datetime64('2020-03-14T15:00:00.000000000') """ - return np.datetime64(self.value, 'ns') + return self.to_datetime64() def timestamp(self): """ @@ -900,6 +984,9 @@ cdef class _Timestamp(ABCTimestamp): """ # GH 17329 # Note: Naive timestamps will not match datetime.stdlib + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + return round(self.value / 1e9, 6) cpdef datetime to_pydatetime(_Timestamp self, bint warn=True): @@ -931,7 +1018,9 @@ cdef class _Timestamp(ABCTimestamp): """ Return a numpy.datetime64 object with 'ns' precision. 
""" - return np.datetime64(self.value, "ns") + # TODO: find a way to construct dt64 directly from _reso + abbrev = npy_unit_to_abbrev(self._reso) + return np.datetime64(self.value, abbrev) def to_numpy(self, dtype=None, copy=False) -> np.datetime64: """ @@ -993,6 +1082,9 @@ cdef class _Timestamp(ABCTimestamp): """ from pandas import Period + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + if self.tz is not None: # GH#21333 warnings.warn( @@ -1468,6 +1560,9 @@ class Timestamp(_Timestamp): cdef: int64_t nanos = to_offset(freq).nanos + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + if self.tz is not None: value = self.tz_localize(None).value else: @@ -1863,6 +1958,9 @@ default 'raise' >>> pd.NaT.tz_localize() NaT """ + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + if ambiguous == 'infer': raise ValueError('Cannot infer offset with only one time.') @@ -1940,6 +2038,9 @@ default 'raise' >>> pd.NaT.tz_convert(tz='Asia/Tokyo') NaT """ + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + if self.tzinfo is None: # tz naive, use tz_localize raise TypeError( @@ -2019,6 +2120,9 @@ default 'raise' datetime ts_input tzinfo_type tzobj + if self._reso != NPY_FR_ns: + raise NotImplementedError(self._reso) + # set to naive if needed tzobj = self.tzinfo value = self.value diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 17a8ec5f86fc8..cf7211e82b799 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -72,6 +72,7 @@ def test_mul_preserves_reso(self, td, unit): assert res._reso == unit def test_cmp_cross_reso(self, td): + # numpy gets this wrong because of silent overflow other = Timedelta(days=106751, unit="ns") assert other < td assert td > other diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index 
5f7cca99f75c6..ab7bc4c7cb412 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -692,3 +692,122 @@ def test_dt_subclass_add_timedelta(lh, rh): result = lh + rh expected = SubDatetime(2000, 1, 1, 1) assert result == expected + + +class TestNonNano: + @pytest.fixture(params=["s", "ms", "us"]) + def reso(self, request): + return request.param + + @pytest.fixture + def dt64(self, reso): + # cases that are in-bounds for nanosecond, so we can compare against + # the existing implementation. + return np.datetime64("2016-01-01", reso) + + @pytest.fixture + def ts(self, dt64): + return Timestamp._from_dt64(dt64) + + def test_non_nano_construction(self, dt64, ts, reso): + assert ts.value == dt64.view("i8") + + if reso == "s": + assert ts._reso == 7 + elif reso == "ms": + assert ts._reso == 8 + elif reso == "us": + assert ts._reso == 9 + + def test_non_nano_fields(self, dt64, ts): + alt = Timestamp(dt64) + + assert ts.year == alt.year + assert ts.month == alt.month + assert ts.day == alt.day + assert ts.hour == ts.minute == ts.second == ts.microsecond == 0 + assert ts.nanosecond == 0 + + assert ts.to_julian_date() == alt.to_julian_date() + assert ts.weekday() == alt.weekday() + assert ts.isoweekday() == alt.isoweekday() + + def test_repr(self, dt64, ts): + alt = Timestamp(dt64) + + assert str(ts) == str(alt) + assert repr(ts) == repr(alt) + + def test_comparison(self, dt64, ts): + alt = Timestamp(dt64) + + assert ts == dt64 + assert dt64 == ts + assert ts == alt + assert alt == ts + + assert not ts != dt64 + assert not dt64 != ts + assert not ts != alt + assert not alt != ts + + assert not ts < dt64 + assert not dt64 < ts + assert not ts < alt + assert not alt < ts + + assert not ts > dt64 + assert not dt64 > ts + assert not ts > alt + assert not alt > ts + + assert ts >= dt64 + assert dt64 >= ts + assert ts >= alt + assert alt >= ts + + assert ts <= dt64 + assert dt64 <= ts + assert ts <= alt + assert alt <= 
ts + + def test_cmp_cross_reso(self): + # numpy gets this wrong because of silent overflow + dt64 = np.datetime64(106752, "D") # won't fit in M8[ns] + ts = Timestamp._from_dt64(dt64) + + other = Timestamp(dt64 - 1) + assert other < ts + assert other.asm8 > ts.asm8 # <- numpy gets this wrong + assert ts > other + assert ts.asm8 < other.asm8 # <- numpy gets this wrong + assert not other == ts + assert ts != other + + @pytest.mark.xfail(reason="Dispatches to np.datetime64 which is wrong") + def test_cmp_cross_reso_reversed_dt64(self): + dt64 = np.datetime64(106752, "D") # won't fit in M8[ns] + ts = Timestamp._from_dt64(dt64) + other = Timestamp(dt64 - 1) + + assert other.asm8 < ts + + def test_pickle(self, ts): + rt = tm.round_trip_pickle(ts) + assert rt._reso == ts._reso + assert rt == ts + + def test_asm8(self, dt64, ts): + rt = ts.asm8 + assert rt == dt64 + assert rt.dtype == dt64.dtype + + def test_to_numpy(self, dt64, ts): + res = ts.to_numpy() + assert res == dt64 + assert res.dtype == dt64.dtype + + def test_to_datetime64(self, dt64, ts): + res = ts.to_datetime64() + assert res == dt64 + assert res.dtype == dt64.dtype From ca4d689c3099168df43a1324e092505f7a8d44fb Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 22 Apr 2022 13:52:47 -0700 Subject: [PATCH 2/2] update setup.py --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 384c1a267afe3..67b91c55dd397 100755 --- a/setup.py +++ b/setup.py @@ -543,6 +543,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "_libs.tslibs.timestamps": { "pyxfile": "_libs/tslibs/timestamps", "depends": tseries_depends, + "sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"], }, "_libs.tslibs.timezones": {"pyxfile": "_libs/tslibs/timezones"}, "_libs.tslibs.tzconversion": {