diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 132d742b78e9c..f7a389d6ff38d 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -71,8 +71,7 @@ from pandas._libs.tslibs.nattype cimport ( checknull_with_nat, ) from pandas._libs.tslibs.tzconversion cimport ( - bisect_right_i8, - localize_tzinfo_api, + Localizer, tz_localize_to_utc_single, ) @@ -503,12 +502,10 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, obj : _TSObject """ cdef: + Localizer info = Localizer(tz) _TSObject obj = _TSObject() int64_t value # numpy dt64 datetime dt - ndarray[int64_t] trans - int64_t* tdata - int64_t[::1] deltas value = dtstruct_to_dt64(&dts) obj.dts = dts @@ -522,15 +519,8 @@ cdef _TSObject _create_tsobject_tz_using_offset(npy_datetimestruct dts, # see PEP 495 https://www.python.org/dev/peps/pep-0495/#the-fold-attribute if is_utc(tz): pass - elif is_tzlocal(tz): - localize_tzinfo_api(obj.value, tz, &obj.fold) else: - trans, deltas, typ = get_dst_info(tz) - - if typ == 'dateutil': - tdata = cnp.PyArray_DATA(trans) - pos = bisect_right_i8(tdata, obj.value, trans.shape[0]) - 1 - obj.fold = _infer_tsobject_fold(obj, trans, deltas, pos) + info.utc_val_to_local_val(obj.value, fold=&obj.fold) # Keep the converter same as PyDateTime's dt = datetime(obj.dts.year, obj.dts.month, obj.dts.day, @@ -678,12 +668,8 @@ cdef inline void _localize_tso(_TSObject obj, tzinfo tz): Sets obj.tzinfo inplace, alters obj.dts inplace. """ cdef: - ndarray[int64_t] trans - int64_t[::1] deltas + Localizer info = Localizer(tz) int64_t local_val - int64_t* tdata - Py_ssize_t pos, ntrans - str typ assert obj.tzinfo is None @@ -691,84 +677,18 @@ cdef inline void _localize_tso(_TSObject obj, tzinfo tz): pass elif obj.value == NPY_NAT: pass - elif is_tzlocal(tz): - local_val = obj.value + localize_tzinfo_api(obj.value, tz, &obj.fold) - dt64_to_dtstruct(local_val, &obj.dts) else: - # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = get_dst_info(tz) - ntrans = trans.shape[0] - - if typ == "pytz": - # i.e. treat_tz_as_pytz(tz) - tdata = cnp.PyArray_DATA(trans) - pos = bisect_right_i8(tdata, obj.value, ntrans) - 1 - local_val = obj.value + deltas[pos] + local_val = info.utc_val_to_local_val(obj.value, &obj.fold) + if info.use_pytz: # find right representation of dst etc in pytz timezone - tz = tz._tzinfos[tz._transition_info[pos]] - elif typ == "dateutil": - # i.e. treat_tz_as_dateutil(tz) - tdata = cnp.PyArray_DATA(trans) - pos = bisect_right_i8(tdata, obj.value, ntrans) - 1 - local_val = obj.value + deltas[pos] - - # dateutil supports fold, so we infer fold from value - obj.fold = _infer_tsobject_fold(obj, trans, deltas, pos) - else: - # All other cases have len(deltas) == 1. As of 2018-07-17 - # (and 2022-03-07), all test cases that get here have - # is_fixed_offset(tz). - local_val = obj.value + deltas[0] + tz = info.adjust_pytz_tzinfo(obj.value) dt64_to_dtstruct(local_val, &obj.dts) obj.tzinfo = tz -cdef inline bint _infer_tsobject_fold( - _TSObject obj, - const int64_t[:] trans, - const int64_t[:] deltas, - intp_t pos, -): - """ - Infer _TSObject fold property from value by assuming 0 and then setting - to 1 if necessary. - - Parameters - ---------- - obj : _TSObject - trans : ndarray[int64_t] - ndarray of offset transition points in nanoseconds since epoch. - deltas : int64_t[:] - array of offsets corresponding to transition points in trans. - pos : intp_t - Position of the last transition point before taking fold into account. - - Returns - ------- - bint - Due to daylight saving time, one wall clock time can occur twice - when shifting from summer to winter time; fold describes whether the - datetime-like corresponds to the first (0) or the second time (1) - the wall clock hits the ambiguous time - - References - ---------- - .. [1] "PEP 495 - Local Time Disambiguation" - https://www.python.org/dev/peps/pep-0495/#the-fold-attribute - """ - cdef: - bint fold = 0 - - if pos > 0: - fold_delta = deltas[pos - 1] - deltas[pos] - if obj.value - fold_delta < trans[pos]: - fold = 1 - - return fold - cdef inline datetime _localize_pydatetime(datetime dt, tzinfo tz): """ Take a datetime/Timestamp in UTC and localizes to timezone tz. diff --git a/pandas/_libs/tslibs/tzconversion.pxd b/pandas/_libs/tslibs/tzconversion.pxd index 136e62985995e..8b8d9eca547c6 100644 --- a/pandas/_libs/tslibs/tzconversion.pxd +++ b/pandas/_libs/tslibs/tzconversion.pxd @@ -1,5 +1,8 @@ from cpython.datetime cimport tzinfo -from numpy cimport int64_t +from numpy cimport ( + int64_t, + ndarray, +) cdef int64_t localize_tzinfo_api( @@ -11,3 +14,21 @@ cdef int64_t tz_localize_to_utc_single( ) except? -1 cdef Py_ssize_t bisect_right_i8(int64_t *data, int64_t val, Py_ssize_t n) + + +cdef class Localizer: + cdef readonly: + tzinfo tz + bint use_utc, use_fixed, use_tzlocal, use_dst, use_pytz, use_dateutil + ndarray trans + Py_ssize_t ntrans + const int64_t[::1] deltas + int64_t delta + + cdef: + int64_t* tdata + + cdef inline int64_t utc_val_to_local_val( + self, int64_t utc_val, bint* fold=* + ) + cdef tzinfo adjust_pytz_tzinfo(self, int64_t utc_val) diff --git a/pandas/_libs/tslibs/tzconversion.pyx b/pandas/_libs/tslibs/tzconversion.pyx index 9190585b2882d..23a89f6343d60 100644 --- a/pandas/_libs/tslibs/tzconversion.pyx +++ b/pandas/_libs/tslibs/tzconversion.pyx @@ -459,29 +459,7 @@ cpdef int64_t tz_convert_from_utc_single(int64_t utc_val, tzinfo tz): ------- converted: int64 """ - cdef: - int64_t delta - int64_t[::1] deltas - ndarray[int64_t, ndim=1] trans - int64_t* tdata - intp_t pos - - if utc_val == NPY_NAT: - return utc_val - - if is_utc(tz): - return utc_val - elif is_tzlocal(tz): - return utc_val + _tz_localize_using_tzinfo_api(utc_val, tz, to_utc=False) - elif is_fixed_offset(tz): - _, deltas, _ = get_dst_info(tz) - delta = deltas[0] - return utc_val + delta - else: - trans, deltas, _ = get_dst_info(tz) - tdata = cnp.PyArray_DATA(trans) - pos = bisect_right_i8(tdata, utc_val, trans.shape[0]) - 1 - return utc_val + deltas[pos] + return _tz_convert_from_utc(&utc_val, tz)[0] def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): @@ -503,6 +481,10 @@ def tz_convert_from_utc(const int64_t[:] vals, tzinfo tz): if vals.shape[0] == 0: return np.array([], dtype=np.int64) + if is_utc(tz) or tz is None: + # in some asvs up to 60x faster than going through _tz_convert_from_utc + return vals.base.copy() + converted = _tz_convert_from_utc(vals, tz) return np.asarray(converted, dtype=np.int64) @@ -523,35 +505,14 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] stamps, tzinfo tz): converted : ndarray[int64_t] """ cdef: - Py_ssize_t i, ntrans = -1, n = stamps.shape[0] - ndarray[int64_t] trans - int64_t[::1] deltas - int64_t* tdata = NULL - intp_t pos - int64_t utc_val, local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False - str typ - + Localizer info = Localizer(tz) + Py_ssize_t i, n = stamps.shape[0] int64_t[::1] result if is_utc(tz) or tz is None: # Much faster than going through the "standard" pattern below return stamps.copy() - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - ntrans = trans.shape[0] - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - tdata = cnp.PyArray_DATA(trans) - result = np.empty(n, dtype=np.int64) for i in range(n): @@ -560,17 +521,7 @@ cdef const int64_t[:] _tz_convert_from_utc(const int64_t[:] stamps, tzinfo tz): result[i] = NPY_NAT continue - # The pattern used in vectorized.pyx checks for use_utc here, - # but we handle that case above. - if use_tzlocal: - local_val = utc_val + _tz_localize_using_tzinfo_api(utc_val, tz, to_utc=False) - elif use_fixed: - local_val = utc_val + delta - else: - pos = bisect_right_i8(tdata, utc_val, ntrans) - 1 - local_val = utc_val + deltas[pos] - - result[i] = local_val + result[i] = info.utc_val_to_local_val(utc_val) return result @@ -632,3 +583,137 @@ cdef int64_t _tz_localize_using_tzinfo_api( td = tz.utcoffset(dt) delta = int(td.total_seconds() * 1_000_000_000) return delta + + +cdef const int64_t[::1] _deltas_placeholder = np.array([], dtype=np.int64) + + +@cython.freelist(16) +@cython.final +cdef class Localizer: + # cdef readonly: + # tzinfo tz + # bint use_utc, use_fixed, use_tzlocal, use_dst, use_pytz + # ndarray trans + # Py_ssize_t ntrans + # const int64_t[::1] deltas + # int64_t delta + + @cython.initializedcheck(False) + @cython.boundscheck(False) + def __cinit__(self, tzinfo tz): + self.tz = tz + self.use_utc = self.use_tzlocal = self.use_fixed = False + self.use_dst = self.use_pytz = self.use_dateutil = False + self.ntrans = -1 # placeholder + self.delta = -1 # placeholder + self.deltas = _deltas_placeholder + self.tdata = NULL + + if is_utc(tz) or tz is None: + self.use_utc = True + + elif is_tzlocal(tz): + self.use_tzlocal = True + + else: + trans, deltas, typ = get_dst_info(tz) + self.trans = trans + self.tdata = cnp.PyArray_DATA(trans) + self.ntrans = trans.shape[0] + self.deltas = deltas + + if typ != "pytz" and typ != "dateutil": + # static/fixed; in this case we know that len(delta) == 1 + self.use_fixed = True + self.delta = deltas[0] + else: + self.use_dst = True + if typ == "pytz": + self.use_pytz = True + else: + self.use_dateutil = True + + cdef inline int64_t utc_val_to_local_val(self, int64_t utc_val, bint* fold=NULL): + cdef: + int64_t local_val + Py_ssize_t pos + + if self.use_utc: + local_val = utc_val + elif self.use_tzlocal: + local_val = utc_val + localize_tzinfo_api(utc_val, self.tz, fold=fold) + elif self.use_fixed: + local_val = utc_val + self.delta + else: + pos = bisect_right_i8(self.tdata, utc_val, self.ntrans) - 1 + local_val = utc_val + self.deltas[pos] + + if fold is not NULL: + if self.use_dateutil: + fold[0] = _infer_tsobject_fold(utc_val, self.trans, self.deltas, pos) + + # Very best case we'd be able to set this by pointer the same way + # we do with `fold` + # if self.use_pytz: + # # find right representation of dst etc in pytz timezone + # new_tz[0] = self.tz._tzinfos[self.tz._transition_info[pos]] + return local_val + + # See commented-out code at the end of utc_val_to_local_val + cdef tzinfo adjust_pytz_tzinfo(self, int64_t utc_val): + # Caller is responsible for checking self.use_pytz + cdef: + Py_ssize_t pos + tzinfo tz + + pos = bisect_right_i8(self.tdata, utc_val, self.ntrans) - 1 + + # find right representation of dst etc in pytz timezone + tz = self.tz + tz = tz._tzinfos[tz._transition_info[pos]] + return tz + + +cdef inline bint _infer_tsobject_fold( + int64_t value, + const int64_t[::1] trans, + const int64_t[::1] deltas, + intp_t pos, +): + """ + Infer _TSObject fold property from value by assuming 0 and then setting + to 1 if necessary. + + Parameters + ---------- + value : int64_t + trans : ndarray[int64_t] + ndarray of offset transition points in nanoseconds since epoch. + deltas : int64_t[:] + array of offsets corresponding to transition points in trans. + pos : intp_t + Position of the last transition point before taking fold into account. + + Returns + ------- + bint + Due to daylight saving time, one wall clock time can occur twice + when shifting from summer to winter time; fold describes whether the + datetime-like corresponds to the first (0) or the second time (1) + the wall clock hits the ambiguous time + + References + ---------- + .. [1] "PEP 495 - Local Time Disambiguation" + https://www.python.org/dev/peps/pep-0495/#the-fold-attribute + """ + cdef: + bint fold = 0 + + if pos > 0: + fold_delta = deltas[pos - 1] - deltas[pos] + if value - fold_delta < trans[pos]: + fold = 1 + + return fold diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 07121396df4a2..3e7cca2026f20 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -35,15 +35,7 @@ from .np_datetime cimport ( from .offsets cimport BaseOffset from .period cimport get_period_ordinal from .timestamps cimport create_timestamp_from_ts -from .timezones cimport ( - get_dst_info, - is_tzlocal, - is_utc, -) -from .tzconversion cimport ( - bisect_right_i8, - localize_tzinfo_api, -) +from .tzconversion cimport Localizer # ------------------------------------------------------------------------- @@ -85,19 +77,13 @@ def ints_to_pydatetime( ndarray[object] of type specified by box """ cdef: - Py_ssize_t i, ntrans = -1, n = stamps.shape[0] - ndarray[int64_t] trans - int64_t[::1] deltas - int64_t* tdata = NULL - intp_t pos - int64_t utc_val, local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False - str typ + Localizer info = Localizer(tz) + Py_ssize_t i, n = stamps.shape[0] + int64_t utc_val, local_val npy_datetimestruct dts tzinfo new_tz ndarray[object] result = np.empty(n, dtype=object) - bint use_pytz = False bint use_date = False, use_time = False, use_ts = False, use_pydt = False if box == "date": @@ -114,21 +100,6 @@ def ints_to_pydatetime( "box must be one of 'datetime', 'date', 'time' or 'timestamp'" ) - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - ntrans = trans.shape[0] - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - tdata = cnp.PyArray_DATA(trans) - use_pytz = typ == "pytz" - for i in range(n): utc_val = stamps[i] new_tz = tz @@ -137,19 +108,10 @@ def ints_to_pydatetime( result[i] = NaT continue - if use_utc: - local_val = utc_val - elif use_tzlocal: - local_val = utc_val + localize_tzinfo_api(utc_val, tz) - elif use_fixed: - local_val = utc_val + delta - else: - pos = bisect_right_i8(tdata, utc_val, ntrans) - 1 - local_val = utc_val + deltas[pos] + local_val = info.utc_val_to_local_val(utc_val) - if use_pytz: - # find right representation of dst etc in pytz timezone - new_tz = tz._tzinfos[tz._transition_info[pos]] + if info.use_pytz: + new_tz = info.adjust_pytz_tzinfo(utc_val) dt64_to_dtstruct(local_val, &dts) @@ -189,46 +151,19 @@ cdef inline c_Resolution _reso_stamp(npy_datetimestruct *dts): @cython.boundscheck(False) def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: - Py_ssize_t i, ntrans = -1, n = stamps.shape[0] - ndarray[int64_t] trans - int64_t[::1] deltas - int64_t* tdata = NULL - intp_t pos - int64_t utc_val, local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False - str typ + Localizer info = Localizer(tz) + Py_ssize_t i, n = stamps.shape[0] + int64_t utc_val, local_val npy_datetimestruct dts c_Resolution reso = c_Resolution.RESO_DAY, curr_reso - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - ntrans = trans.shape[0] - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - tdata = cnp.PyArray_DATA(trans) - for i in range(n): utc_val = stamps[i] if utc_val == NPY_NAT: continue - if use_utc: - local_val = utc_val - elif use_tzlocal: - local_val = utc_val + localize_tzinfo_api(utc_val, tz) - elif use_fixed: - local_val = utc_val + delta - else: - pos = bisect_right_i8(tdata, utc_val, ntrans) - 1 - local_val = utc_val + deltas[pos] + local_val = info.utc_val_to_local_val(utc_val) dt64_to_dtstruct(local_val, &dts) curr_reso = _reso_stamp(&dts) @@ -258,46 +193,19 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result : int64 ndarray of converted of normalized nanosecond timestamps """ cdef: - Py_ssize_t i, ntrans = -1, n = stamps.shape[0] - ndarray[int64_t] trans - int64_t[::1] deltas - int64_t* tdata = NULL - intp_t pos - int64_t utc_val, local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False - str typ + Localizer info = Localizer(tz) + Py_ssize_t i, n = stamps.shape[0] + int64_t utc_val, local_val int64_t[::1] result = np.empty(n, dtype=np.int64) - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - ntrans = trans.shape[0] - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - tdata = cnp.PyArray_DATA(trans) - for i in range(n): utc_val = stamps[i] if utc_val == NPY_NAT: result[i] = NPY_NAT continue - if use_utc: - local_val = utc_val - elif use_tzlocal: - local_val = utc_val + localize_tzinfo_api(utc_val, tz) - elif use_fixed: - local_val = utc_val + delta - else: - pos = bisect_right_i8(tdata, utc_val, ntrans) - 1 - local_val = utc_val + deltas[pos] + local_val = info.utc_val_to_local_val(utc_val) result[i] = normalize_i8_stamp(local_val) @@ -322,40 +230,13 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: is_normalized : bool True if all stamps are normalized """ cdef: - Py_ssize_t i, ntrans = -1, n = stamps.shape[0] - ndarray[int64_t] trans - int64_t[::1] deltas - int64_t* tdata = NULL - intp_t pos - int64_t utc_val, local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False - str typ - - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - ntrans = trans.shape[0] - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - tdata = cnp.PyArray_DATA(trans) + Localizer info = Localizer(tz) + Py_ssize_t i, n = stamps.shape[0] + int64_t utc_val, local_val for i in range(n): utc_val = stamps[i] - if use_utc: - local_val = utc_val - elif use_tzlocal: - local_val = utc_val + localize_tzinfo_api(utc_val, tz) - elif use_fixed: - local_val = utc_val + delta - else: - pos = bisect_right_i8(tdata, utc_val, ntrans) - 1 - local_val = utc_val + deltas[pos] + local_val = info.utc_val_to_local_val(utc_val) if local_val % DAY_NANOS != 0: return False @@ -370,31 +251,13 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: @cython.boundscheck(False) def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: - Py_ssize_t i, ntrans = -1, n = stamps.shape[0] - ndarray[int64_t] trans - int64_t[::1] deltas - int64_t* tdata = NULL - intp_t pos - int64_t utc_val, local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False - str typ + Localizer info = Localizer(tz) + Py_ssize_t i, n = stamps.shape[0] + int64_t utc_val, local_val npy_datetimestruct dts int64_t[::1] result = np.empty(n, dtype=np.int64) - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - ntrans = trans.shape[0] - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - tdata = cnp.PyArray_DATA(trans) for i in range(n): utc_val = stamps[i] @@ -402,15 +265,7 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): result[i] = NPY_NAT continue - if use_utc: - local_val = utc_val - elif use_tzlocal: - local_val = utc_val + localize_tzinfo_api(utc_val, tz) - elif use_fixed: - local_val = utc_val + delta - else: - pos = bisect_right_i8(tdata, utc_val, ntrans) - 1 - local_val = utc_val + deltas[pos] + local_val = info.utc_val_to_local_val(utc_val) dt64_to_dtstruct(local_val, &dts) result[i] = get_period_ordinal(&dts, freq)