From 40dcda5be6918fded4215df1c8cb4a6ffd5d95bc Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Mar 2022 11:27:07 -0800 Subject: [PATCH 01/25] Implement Localizer - perf neutral --- pandas/_libs/tslibs/vectorized.pyx | 181 ++++++++++++++--------------- 1 file changed, 90 insertions(+), 91 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 17720de33ab33..f0b03952cc62b 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -119,18 +119,19 @@ def ints_to_pydatetime( ndarray[object] of type specified by box """ cdef: - Py_ssize_t i, n = len(stamps) - ndarray[int64_t] trans + Localizer info = Localizer(tz) int64_t[:] deltas - intp_t[:] pos + int64_t delta = info.delta + bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed + Py_ssize_t[:] pos + bint use_pytz = info.use_pytz + + Py_ssize_t i, n = len(stamps) npy_datetimestruct dts object dt, new_tz - str typ - int64_t value, local_val, delta = NPY_NAT # dummy for delta + int64_t value, local_val ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) - bint use_utc = False, use_tzlocal = False, use_fixed = False - bint use_pytz = False if box == "date": assert (tz is None), "tz should be None when converting to date" @@ -147,19 +148,9 @@ def ints_to_pydatetime( "box must be one of 'datetime', 'date', 'time' or 'timestamp'" ) - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 - use_pytz = typ == "pytz" + if info.use_dst: + deltas = info.deltas + pos = info.trans.searchsorted(stamps, side="right") - 1 for i in range(n): new_tz = tz @@ -222,27 +213,20 @@ cdef inline int _reso_stamp(npy_datetimestruct *dts): def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: + Localizer info = Localizer(tz) + int64_t[:] deltas + int64_t delta = info.delta + bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed + + Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts int reso = RESO_DAY, curr_reso - ndarray[int64_t] trans - int64_t[:] deltas - intp_t[:] pos - int64_t local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False - - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 + int64_t local_val + + if info.use_dst: + deltas = info.deltas + pos = info.trans.searchsorted(stamps, side="right") - 1 for i in range(n): if stamps[i] == NPY_NAT: @@ -285,27 +269,19 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result : int64 ndarray of converted of normalized nanosecond timestamps """ cdef: - Py_ssize_t i, n = len(stamps) - int64_t[:] result = np.empty(n, dtype=np.int64) - ndarray[int64_t] trans + Localizer info = Localizer(tz) int64_t[:] deltas - str typ + int64_t delta = info.delta + bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed + Py_ssize_t[:] pos - int64_t local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False + Py_ssize_t i, n = len(stamps) + int64_t[:] result = np.empty(n, dtype=np.int64) + int64_t local_val - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 + if info.use_dst: + deltas = info.deltas + pos = info.trans.searchsorted(stamps, side="right") - 1 for i in range(n): if stamps[i] == NPY_NAT: @@ -344,27 +320,19 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: is_normalized : bool True if all stamps are normalized """ cdef: - Py_ssize_t i, n = len(stamps) - ndarray[int64_t] trans + Localizer info = Localizer(tz) int64_t[:] deltas - intp_t[:] pos - int64_t local_val, delta = NPY_NAT - str typ + int64_t delta = info.delta + bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed + + Py_ssize_t[:] pos + Py_ssize_t i, n = len(stamps) + int64_t local_val int64_t day_nanos = 24 * 3600 * 1_000_000_000 - bint use_utc = False, use_tzlocal = False, use_fixed = False - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 + if info.use_dst: + deltas = info.deltas + pos = info.trans.searchsorted(stamps, side="right") - 1 for i in range(n): if use_utc: @@ -389,27 +357,20 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: @cython.boundscheck(False) def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: - Py_ssize_t i, n = len(stamps) - int64_t[:] result = np.empty(n, dtype=np.int64) - ndarray[int64_t] trans + Localizer info = Localizer(tz) int64_t[:] deltas + int64_t delta = info.delta + bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed + Py_ssize_t[:] pos + Py_ssize_t i, n = len(stamps) + int64_t[:] result = np.empty(n, dtype=np.int64) npy_datetimestruct dts - int64_t local_val, delta = NPY_NAT - bint use_utc = False, use_tzlocal = False, use_fixed = False + int64_t local_val - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(stamps, side="right") - 1 + if info.use_dst: + deltas = info.deltas + pos = info.trans.searchsorted(stamps, side="right") - 1 for i in range(n): if stamps[i] == NPY_NAT: @@ -429,3 +390,41 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): result[i] = get_period_ordinal(&dts, freq) return result.base # .base to get underlying ndarray + + + +@cython.freelist(16) +cdef class Localizer: + cdef: + tzinfo tz + bint use_utc + bint use_fixed + bint use_tzlocal + bint use_pytz + bint use_dst + ndarray trans + int64_t[:] deltas + int64_t delta + str typ + + @cython.boundscheck(False) + def __cinit__(self, tzinfo tz): + self.tz = tz + if is_utc(tz) or tz is None: + self.use_utc = True + elif is_tzlocal(tz): + self.use_tzlocal = True + else: + trans, deltas, typ = get_dst_info(tz) + self.trans = trans + self.deltas = deltas + self.typ = typ + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + self.use_fixed = True + self.delta = deltas[0] + else: + self.use_dst = True + if typ == "pytz": + self.use_pytz = True From db075f91f6af6183096e4ecd1acf7a3d4a114b16 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Mar 2022 13:25:38 -0800 Subject: [PATCH 02/25] perf-neutral-ish --- pandas/_libs/tslibs/vectorized.pyx | 55 ++++++++++++------------------ 1 file changed, 22 insertions(+), 33 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index f0b03952cc62b..57f2f990f415e 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -121,10 +121,7 @@ def ints_to_pydatetime( cdef: Localizer info = Localizer(tz) int64_t[:] deltas - int64_t delta = info.delta - bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed Py_ssize_t[:] pos - bint use_pytz = info.use_pytz Py_ssize_t i, n = len(stamps) npy_datetimestruct dts @@ -159,17 +156,17 @@ def ints_to_pydatetime( if value == NPY_NAT: result[i] = NaT else: - if use_utc: + if info.use_utc: local_val = value - elif use_tzlocal: + elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(value, tz) - elif use_fixed: - local_val = value + delta - elif not use_pytz: + elif info.use_fixed: + local_val = value + info.delta + elif not info.use_pytz: # i.e. dateutil # no zone-name change for dateutil tzs - dst etc # represented in single object. - local_val = value + deltas[pos[i]] + local_val = value + info.deltas[pos[i]] else: # pytz # find right representation of dst etc in pytz timezone @@ -215,8 +212,6 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: Localizer info = Localizer(tz) int64_t[:] deltas - int64_t delta = info.delta - bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) @@ -232,12 +227,12 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: if stamps[i] == NPY_NAT: continue - if use_utc: + if info.use_utc: local_val = stamps[i] - elif use_tzlocal: + elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - elif use_fixed: - local_val = stamps[i] + delta + elif info.use_fixed: + local_val = stamps[i] + info.delta else: local_val = stamps[i] + deltas[pos[i]] @@ -271,8 +266,6 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t cdef: Localizer info = Localizer(tz) int64_t[:] deltas - int64_t delta = info.delta - bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) @@ -288,12 +281,12 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result[i] = NPY_NAT continue - if use_utc: + if info.use_utc: local_val = stamps[i] - elif use_tzlocal: + elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - elif use_fixed: - local_val = stamps[i] + delta + elif info.use_fixed: + local_val = stamps[i] + info.delta else: local_val = stamps[i] + deltas[pos[i]] @@ -322,8 +315,6 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: cdef: Localizer info = Localizer(tz) int64_t[:] deltas - int64_t delta = info.delta - bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) @@ -335,12 +326,12 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: pos = info.trans.searchsorted(stamps, side="right") - 1 for i in range(n): - if use_utc: + if info.use_utc: local_val = stamps[i] - elif use_tzlocal: + elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - elif use_fixed: - local_val = stamps[i] + delta + elif info.use_fixed: + local_val = stamps[i] + info.delta else: local_val = stamps[i] + deltas[pos[i]] @@ -359,8 +350,6 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Localizer info = Localizer(tz) int64_t[:] deltas - int64_t delta = info.delta - bint use_utc = info.use_utc, use_tzlocal = info.use_tzlocal, use_fixed = info.use_fixed Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) @@ -377,12 +366,12 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): result[i] = NPY_NAT continue - if use_utc: + if info.use_utc: local_val = stamps[i] - elif use_tzlocal: + elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - elif use_fixed: - local_val = stamps[i] + delta + elif info.use_fixed: + local_val = stamps[i] + info.delta else: local_val = stamps[i] + deltas[pos[i]] From dde49a67556c8c76e83903e9c399d1aa699ca514 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Mar 2022 14:10:25 -0800 Subject: [PATCH 03/25] move Localizer to top --- pandas/_libs/tslibs/vectorized.pyx | 76 +++++++++++++++--------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 57f2f990f415e..c3b22e4b9779e 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -37,6 +37,44 @@ from .timezones cimport ( ) from .tzconversion cimport tz_convert_utc_to_tzlocal + +@cython.freelist(16) +cdef class Localizer: + cdef readonly: + tzinfo tz + bint use_utc + bint use_fixed + bint use_tzlocal + bint use_pytz + bint use_dst + ndarray trans + int64_t[:] deltas + int64_t delta + str typ + + @cython.boundscheck(False) + def __cinit__(self, tzinfo tz): + self.tz = tz + if is_utc(tz) or tz is None: + self.use_utc = True + elif is_tzlocal(tz): + self.use_tzlocal = True + else: + trans, deltas, typ = get_dst_info(tz) + self.trans = trans + self.deltas = deltas + self.typ = typ + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + self.use_fixed = True + self.delta = deltas[0] + else: + self.use_dst = True + if typ == "pytz": + self.use_pytz = True + + # ------------------------------------------------------------------------- cdef inline object create_datetime_from_ts( @@ -379,41 +417,3 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): result[i] = get_period_ordinal(&dts, freq) return result.base # .base to get underlying ndarray - - - -@cython.freelist(16) -cdef class Localizer: - cdef: - tzinfo tz - bint use_utc - bint use_fixed - bint use_tzlocal - bint use_pytz - bint use_dst - ndarray trans - int64_t[:] deltas - int64_t delta - str typ - - @cython.boundscheck(False) - def __cinit__(self, tzinfo tz): - self.tz = tz - if is_utc(tz) or tz is None: - self.use_utc = True - elif is_tzlocal(tz): - self.use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - self.trans = trans - self.deltas = deltas - self.typ = typ - - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - self.use_fixed = True - self.delta = deltas[0] - else: - self.use_dst = True - if typ == "pytz": - self.use_pytz = True From b1ba93397afbbd062ecadd4a83f3945224694672 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 8 Mar 2022 12:22:11 -0800 Subject: [PATCH 04/25] move delta back inline --- pandas/_libs/tslibs/vectorized.pyx | 44 ++++++++++++++++++------------ 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index c3b22e4b9779e..1b13cf008418b 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -48,7 +48,7 @@ cdef class Localizer: bint use_pytz bint use_dst ndarray trans - int64_t[:] deltas + const int64_t[:] deltas int64_t delta str typ @@ -158,13 +158,13 @@ def ints_to_pydatetime( """ cdef: Localizer info = Localizer(tz) - int64_t[:] deltas + const int64_t[:] deltas Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts object dt, new_tz - int64_t value, local_val + int64_t value, local_val, delta ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) @@ -186,6 +186,8 @@ def ints_to_pydatetime( if info.use_dst: deltas = info.deltas pos = info.trans.searchsorted(stamps, side="right") - 1 + elif info.use_fixed: + delta = info.delta for i in range(n): new_tz = tz @@ -199,12 +201,12 @@ def ints_to_pydatetime( elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(value, tz) elif info.use_fixed: - local_val = value + info.delta + local_val = value + delta elif not info.use_pytz: # i.e. dateutil # no zone-name change for dateutil tzs - dst etc # represented in single object. - local_val = value + info.deltas[pos[i]] + local_val = value + deltas[pos[i]] else: # pytz # find right representation of dst etc in pytz timezone @@ -249,17 +251,19 @@ cdef inline int _reso_stamp(npy_datetimestruct *dts): def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: Localizer info = Localizer(tz) - int64_t[:] deltas + const int64_t[:] deltas Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts int reso = RESO_DAY, curr_reso - int64_t local_val + int64_t local_val, delta if info.use_dst: deltas = info.deltas pos = info.trans.searchsorted(stamps, side="right") - 1 + elif info.use_fixed: + delta = info.delta for i in range(n): if stamps[i] == NPY_NAT: @@ -270,7 +274,7 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) elif info.use_fixed: - local_val = stamps[i] + info.delta + local_val = stamps[i] + delta else: local_val = stamps[i] + deltas[pos[i]] @@ -303,16 +307,18 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t """ cdef: Localizer info = Localizer(tz) - int64_t[:] deltas + const int64_t[:] deltas Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) - int64_t local_val + int64_t local_val, delta if info.use_dst: deltas = info.deltas pos = info.trans.searchsorted(stamps, side="right") - 1 + elif info.use_fixed: + delta = info.delta for i in range(n): if stamps[i] == NPY_NAT: @@ -324,7 +330,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) elif info.use_fixed: - local_val = stamps[i] + info.delta + local_val = stamps[i] + delta else: local_val = stamps[i] + deltas[pos[i]] @@ -352,16 +358,18 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: """ cdef: Localizer info = Localizer(tz) - int64_t[:] deltas + const int64_t[:] deltas Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) - int64_t local_val + int64_t local_val, delta int64_t day_nanos = 24 * 3600 * 1_000_000_000 if info.use_dst: deltas = info.deltas pos = info.trans.searchsorted(stamps, side="right") - 1 + elif info.use_fixed: + delta = info.delta for i in range(n): if info.use_utc: @@ -369,7 +377,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) elif info.use_fixed: - local_val = stamps[i] + info.delta + local_val = stamps[i] + delta else: local_val = stamps[i] + deltas[pos[i]] @@ -387,17 +395,19 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Localizer info = Localizer(tz) - int64_t[:] deltas + const int64_t[:] deltas Py_ssize_t[:] pos Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) npy_datetimestruct dts - int64_t local_val + int64_t local_val, delta if info.use_dst: deltas = info.deltas pos = info.trans.searchsorted(stamps, side="right") - 1 + elif info.use_fixed: + delta = info.delta for i in range(n): if stamps[i] == NPY_NAT: @@ -409,7 +419,7 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) elif info.use_fixed: - local_val = stamps[i] + info.delta + local_val = stamps[i] + delta else: local_val = stamps[i] + deltas[pos[i]] From f5e0503d62a8f2e698a3e907a0a75ca794899f92 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Mar 2022 15:58:23 -0800 Subject: [PATCH 05/25] troubleshoot perf --- pandas/_libs/tslibs/vectorized.pyx | 111 ++++++++++++++++++----------- 1 file changed, 68 insertions(+), 43 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 1b13cf008418b..4cd909b03dc76 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -7,6 +7,7 @@ from cpython.datetime cimport ( tzinfo, ) +cimport numpy as cnp import numpy as np from numpy cimport ( @@ -14,6 +15,7 @@ from numpy cimport ( intp_t, ndarray, ) +cnp.import_array() from .conversion cimport normalize_i8_stamp @@ -37,8 +39,13 @@ from .timezones cimport ( ) from .tzconversion cimport tz_convert_utc_to_tzlocal +cdef const int64_t[::1] _deltas_placeholder = np.array([], dtype=np.int64) + +ctypedef int64_t (*localizer_func)(Localizer, int64_t, intp_t*, intp_t) + @cython.freelist(16) +@cython.final cdef class Localizer: cdef readonly: tzinfo tz @@ -48,17 +55,32 @@ cdef class Localizer: bint use_pytz bint use_dst ndarray trans - const int64_t[:] deltas + const int64_t[::1] deltas int64_t delta str typ + # TODO: report cython bug; this declaration works if on its own line + # but raises at compile-time if included in the 'cdef readonly' + # declarations above. + # cdef int64_t (*func)(Localizer, int64_t, intp_t*, intp_t) + # cdef localizer_func func + + @cython.initializedcheck(False) @cython.boundscheck(False) def __cinit__(self, tzinfo tz): self.tz = tz + self.use_utc = self.use_tzlocal = self.use_fixed = self.use_dst = self.use_pytz = False + self.delta = -1 # placeholder + self.deltas = _deltas_placeholder + if is_utc(tz) or tz is None: self.use_utc = True + # self.func = self.func_use_utc + elif is_tzlocal(tz): self.use_tzlocal = True + # self.func = self.func_use_tzlocal + else: trans, deltas, typ = get_dst_info(tz) self.trans = trans @@ -69,11 +91,25 @@ cdef class Localizer: # static/fixed; in this case we know that len(delta) == 1 self.use_fixed = True self.delta = deltas[0] + # self.func = self.func_use_fixed else: self.use_dst = True + # self.func = self.func_use_dst if typ == "pytz": self.use_pytz = True + cdef int64_t func_use_utc(self, int64_t utc_val, intp_t* pos, intp_t i): + return utc_val + + cdef int64_t func_use_tzlocal(self, int64_t utc_val, intp_t* pos, intp_t i): + return tz_convert_utc_to_tzlocal(utc_val, self.tz) + + cdef int64_t func_use_fixed(self, int64_t utc_val, intp_t* pos, intp_t i): + return utc_val + self.delta + + cdef int64_t func_use_dst(self, int64_t utc_val, intp_t* pos, intp_t i): + return utc_val + self.deltas[pos[i]] + # ------------------------------------------------------------------------- @@ -120,6 +156,7 @@ cdef inline object create_time_from_ts( return time(dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold) +#@cython.initializedcheck(False) @cython.wraparound(False) @cython.boundscheck(False) def ints_to_pydatetime( @@ -158,13 +195,13 @@ def ints_to_pydatetime( """ cdef: Localizer info = Localizer(tz) - const int64_t[:] deltas - Py_ssize_t[:] pos - + const int64_t[::1] deltas = info.deltas + int64_t value, local_val, delta = info.delta + intp_t* pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts - object dt, new_tz - int64_t value, local_val, delta + + tzinfo new_tz ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) @@ -184,10 +221,7 @@ def ints_to_pydatetime( ) if info.use_dst: - deltas = info.deltas - pos = info.trans.searchsorted(stamps, side="right") - 1 - elif info.use_fixed: - delta = info.delta + pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) for i in range(n): new_tz = tz @@ -251,19 +285,16 @@ cdef inline int _reso_stamp(npy_datetimestruct *dts): def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: Localizer info = Localizer(tz) - const int64_t[:] deltas - - Py_ssize_t[:] pos + const int64_t[::1] deltas = info.deltas + int64_t local_val, delta = info.delta + intp_t* pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts + int reso = RESO_DAY, curr_reso - int64_t local_val, delta if info.use_dst: - deltas = info.deltas - pos = info.trans.searchsorted(stamps, side="right") - 1 - elif info.use_fixed: - delta = info.delta + pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) for i in range(n): if stamps[i] == NPY_NAT: @@ -288,6 +319,7 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: # ------------------------------------------------------------------------- +#@cython.initializedcheck(False) @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz): @@ -307,18 +339,15 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t """ cdef: Localizer info = Localizer(tz) - const int64_t[:] deltas - - Py_ssize_t[:] pos + const int64_t[::1] deltas = info.deltas + int64_t local_val, delta = info.delta + intp_t* pos Py_ssize_t i, n = len(stamps) + int64_t[:] result = np.empty(n, dtype=np.int64) - int64_t local_val, delta if info.use_dst: - deltas = info.deltas - pos = info.trans.searchsorted(stamps, side="right") - 1 - elif info.use_fixed: - delta = info.delta + pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) for i in range(n): if stamps[i] == NPY_NAT: @@ -339,6 +368,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t return result.base # `.base` to access underlying ndarray +#@cython.initializedcheck(False) @cython.wraparound(False) @cython.boundscheck(False) def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: @@ -358,18 +388,15 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: """ cdef: Localizer info = Localizer(tz) - const int64_t[:] deltas - - Py_ssize_t[:] pos + const int64_t[::1] deltas = info.deltas + int64_t local_val, delta = info.delta + intp_t* pos Py_ssize_t i, n = len(stamps) - int64_t local_val, delta + int64_t day_nanos = 24 * 3600 * 1_000_000_000 if info.use_dst: - deltas = info.deltas - pos = info.trans.searchsorted(stamps, side="right") - 1 - elif info.use_fixed: - delta = info.delta + pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) for i in range(n): if info.use_utc: @@ -390,24 +417,22 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: # ------------------------------------------------------------------------- +@cython.initializedcheck(False) @cython.wraparound(False) @cython.boundscheck(False) def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Localizer info = Localizer(tz) - const int64_t[:] deltas - - Py_ssize_t[:] pos + const int64_t[::1] deltas = info.deltas + int64_t local_val, delta = info.delta + intp_t* pos Py_ssize_t i, n = len(stamps) - int64_t[:] result = np.empty(n, dtype=np.int64) npy_datetimestruct dts - int64_t local_val, delta + + int64_t[:] result = np.empty(n, dtype=np.int64) if info.use_dst: - deltas = info.deltas - pos = info.trans.searchsorted(stamps, side="right") - 1 - elif info.use_fixed: - delta = info.delta + pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) for i in range(n): if stamps[i] == NPY_NAT: From 45e568554ea9003ddd32806ca70443676d314b23 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 9 Mar 2022 19:40:55 -0800 Subject: [PATCH 06/25] remove initializedcheck(False) --- pandas/_libs/tslibs/vectorized.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 4cd909b03dc76..4b80680ddc4b0 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -1,5 +1,6 @@ import cython +cimport numpy as cnp from cpython.datetime cimport ( date, datetime, @@ -7,7 +8,6 @@ from cpython.datetime cimport ( tzinfo, ) -cimport numpy as cnp import numpy as np from numpy cimport ( @@ -15,6 +15,7 @@ from numpy cimport ( intp_t, ndarray, ) + cnp.import_array() from .conversion cimport normalize_i8_stamp @@ -39,6 +40,7 @@ from .timezones cimport ( ) from .tzconversion cimport tz_convert_utc_to_tzlocal + cdef const int64_t[::1] _deltas_placeholder = np.array([], dtype=np.int64) ctypedef int64_t (*localizer_func)(Localizer, int64_t, intp_t*, intp_t) @@ -156,7 +158,6 @@ cdef inline object create_time_from_ts( return time(dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold) -#@cython.initializedcheck(False) @cython.wraparound(False) @cython.boundscheck(False) def ints_to_pydatetime( @@ -319,7 +320,6 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: # ------------------------------------------------------------------------- -#@cython.initializedcheck(False) @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz): @@ -368,7 +368,6 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t return result.base # `.base` to access underlying ndarray -#@cython.initializedcheck(False) @cython.wraparound(False) @cython.boundscheck(False) def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: @@ -417,7 +416,6 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: # ------------------------------------------------------------------------- -@cython.initializedcheck(False) @cython.wraparound(False) @cython.boundscheck(False) def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): From 53cd93d2c4790619bcc7bb6949e4b129796350f4 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Mar 2022 21:23:15 -0800 Subject: [PATCH 07/25] fewer lines --- pandas/_libs/tslibs/vectorized.pyx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 17a77ac2b8f33..d5d9e9090417e 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -51,11 +51,7 @@ ctypedef int64_t (*localizer_func)(Localizer, int64_t, intp_t*, intp_t) cdef class Localizer: cdef readonly: tzinfo tz - bint use_utc - bint use_fixed - bint use_tzlocal - bint use_pytz - bint use_dst + bint use_utc, use_fixed, use_tzlocal, use_dst, use_pytz ndarray trans const int64_t[::1] deltas int64_t delta From 22d4aac1fbd8a2e92329b3e57407dd96e025f567 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 10 Mar 2022 21:24:34 -0800 Subject: [PATCH 08/25] troubleshoot perf --- pandas/_libs/tslibs/vectorized.pyx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index d5d9e9090417e..ff45cc519474d 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -235,7 +235,7 @@ def ints_to_pydatetime( elif info.use_fixed: local_val = value + delta else: - local_val = value + deltas[pos[i]] + local_val = value + info.deltas[pos[i]] if info.use_pytz: # find right representation of dst etc in pytz timezone @@ -299,7 +299,7 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) elif info.use_fixed: - local_val = stamps[i] + delta + local_val = stamps[i] + info.delta else: local_val = stamps[i] + deltas[pos[i]] @@ -354,7 +354,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t elif info.use_fixed: local_val = stamps[i] + delta else: - local_val = stamps[i] + deltas[pos[i]] + local_val = stamps[i] + info.deltas[pos[i]] result[i] = normalize_i8_stamp(local_val) @@ -396,7 +396,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) elif info.use_fixed: - local_val = stamps[i] + delta + local_val = stamps[i] + info.delta else: local_val = stamps[i] + deltas[pos[i]] @@ -437,7 +437,7 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): elif info.use_fixed: local_val = stamps[i] + delta else: - local_val = stamps[i] + deltas[pos[i]] + local_val = stamps[i] + info.deltas[pos[i]] dt64_to_dtstruct(local_val, &dts) result[i] = get_period_ordinal(&dts, freq) From 492897e0a0619b42bbca2289ef0d801a714a7039 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 11 Mar 2022 11:14:32 -0800 Subject: [PATCH 09/25] troubleshoot perf --- pandas/_libs/tslibs/vectorized.pyx | 37 +++++------------------------- 1 file changed, 6 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index ff45cc519474d..6fb689868b213 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -1,6 +1,5 @@ import cython -cimport numpy as cnp from cpython.datetime cimport ( date, datetime, @@ -9,7 +8,7 @@ from cpython.datetime cimport ( ) import numpy as np - +cimport numpy as cnp from numpy cimport ( int64_t, intp_t, @@ -43,8 +42,6 @@ from .tzconversion cimport tz_convert_utc_to_tzlocal cdef const int64_t[::1] _deltas_placeholder = np.array([], dtype=np.int64) -ctypedef int64_t (*localizer_func)(Localizer, int64_t, intp_t*, intp_t) - @cython.freelist(16) @cython.final @@ -57,12 +54,6 @@ cdef class Localizer: int64_t delta str typ - # TODO: report cython bug; this declaration works if on its own line - # but raises at compile-time if included in the 'cdef readonly' - # declarations above. - # cdef int64_t (*func)(Localizer, int64_t, intp_t*, intp_t) - # cdef localizer_func func - @cython.initializedcheck(False) @cython.boundscheck(False) def __cinit__(self, tzinfo tz): @@ -73,11 +64,9 @@ cdef class Localizer: if is_utc(tz) or tz is None: self.use_utc = True - # self.func = self.func_use_utc elif is_tzlocal(tz): self.use_tzlocal = True - # self.func = self.func_use_tzlocal else: trans, deltas, typ = get_dst_info(tz) @@ -89,25 +78,11 @@ cdef class Localizer: # static/fixed; in this case we know that len(delta) == 1 self.use_fixed = True self.delta = deltas[0] - # self.func = self.func_use_fixed else: self.use_dst = True - # self.func = self.func_use_dst if typ == "pytz": self.use_pytz = True - cdef int64_t func_use_utc(self, int64_t utc_val, intp_t* pos, intp_t i): - return utc_val - - cdef int64_t func_use_tzlocal(self, int64_t utc_val, intp_t* pos, intp_t i): - return tz_convert_utc_to_tzlocal(utc_val, self.tz) - - cdef int64_t func_use_fixed(self, int64_t utc_val, intp_t* pos, intp_t i): - return utc_val + self.delta - - cdef int64_t func_use_dst(self, int64_t utc_val, intp_t* pos, intp_t i): - return utc_val + self.deltas[pos[i]] - # ------------------------------------------------------------------------- @@ -233,7 +208,7 @@ def ints_to_pydatetime( elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(value, tz) elif info.use_fixed: - local_val = value + delta + local_val = value + info.delta else: local_val = value + info.deltas[pos[i]] @@ -301,7 +276,7 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: elif info.use_fixed: local_val = stamps[i] + info.delta else: - local_val = stamps[i] + deltas[pos[i]] + local_val = stamps[i] + info.deltas[pos[i]] dt64_to_dtstruct(local_val, &dts) curr_reso = _reso_stamp(&dts) @@ -352,7 +327,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) elif info.use_fixed: - local_val = stamps[i] + delta + local_val = stamps[i] + info.delta else: local_val = stamps[i] + info.deltas[pos[i]] @@ -398,7 +373,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: elif info.use_fixed: local_val = stamps[i] + info.delta else: - local_val = stamps[i] + deltas[pos[i]] + local_val = stamps[i] + info.deltas[pos[i]] if local_val % day_nanos != 0: return False @@ -435,7 +410,7 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): elif info.use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) elif info.use_fixed: - local_val = stamps[i] + delta + local_val = stamps[i] + info.delta else: local_val = stamps[i] + info.deltas[pos[i]] From 07b934f77d3e8b5470779b3ad31ce1da75dd206a Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 11 Mar 2022 14:05:07 -0800 Subject: [PATCH 10/25] remove unused declarations --- pandas/_libs/tslibs/vectorized.pyx | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 6fb689868b213..634b9c0063e86 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -167,8 +167,7 @@ def ints_to_pydatetime( """ cdef: Localizer info = Localizer(tz) - const int64_t[::1] deltas = info.deltas - int64_t value, local_val, delta = info.delta + int64_t value, local_val intp_t* pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts @@ -254,8 +253,7 @@ cdef inline int _reso_stamp(npy_datetimestruct *dts): def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: Localizer info = Localizer(tz) - const int64_t[::1] deltas = info.deltas - int64_t local_val, delta = info.delta + int64_t local_val intp_t* pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts @@ -307,8 +305,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t """ cdef: Localizer info = Localizer(tz) - const int64_t[::1] deltas = info.deltas - int64_t local_val, delta = info.delta + int64_t local_val intp_t* pos Py_ssize_t i, n = len(stamps) @@ -355,8 +352,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: """ cdef: Localizer info = Localizer(tz) - const int64_t[::1] deltas = info.deltas - int64_t local_val, delta = info.delta + int64_t local_val intp_t* pos Py_ssize_t i, n = len(stamps) @@ -389,8 +385,7 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Localizer info = Localizer(tz) - const int64_t[::1] deltas = info.deltas - int64_t local_val, delta = info.delta + int64_t local_val intp_t* pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts From 70e54d725e319e0d40cf0a141136056f9c534f8d Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 11 Mar 2022 16:46:45 -0800 Subject: [PATCH 11/25] lint --- pandas/_libs/tslibs/vectorized.pyx | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 634b9c0063e86..67d7cedf5a315 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -58,7 +58,8 @@ cdef class Localizer: @cython.boundscheck(False) def __cinit__(self, tzinfo tz): self.tz = tz - self.use_utc = self.use_tzlocal = self.use_fixed = self.use_dst = self.use_pytz = False + self.use_utc = self.use_tzlocal = self.use_fixed = False + self.use_dst = self.use_pytz = False self.delta = -1 # placeholder self.deltas = _deltas_placeholder @@ -171,7 +172,6 @@ def ints_to_pydatetime( intp_t* pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts - tzinfo new_tz ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) @@ -257,7 +257,6 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: intp_t* pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts - int reso = RESO_DAY, curr_reso if info.use_dst: @@ -308,7 +307,6 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t int64_t local_val intp_t* pos Py_ssize_t i, n = len(stamps) - int64_t[:] result = np.empty(n, dtype=np.int64) if info.use_dst: @@ -355,7 +353,6 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: int64_t local_val intp_t* pos Py_ssize_t i, n = len(stamps) - int64_t day_nanos = 24 * 3600 * 1_000_000_000 if info.use_dst: @@ -389,7 +386,6 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): intp_t* pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts - int64_t[:] result = np.empty(n, dtype=np.int64) if info.use_dst: From 3c064b6898fbf71f975d9bfcac9abf8a7947388d Mon Sep 17 00:00:00 2001 From: Brock Date: Sat, 12 Mar 2022 13:08:53 -0800 Subject: [PATCH 12/25] CLN, avoid build warning --- pandas/_libs/tslibs/vectorized.pyx | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 67d7cedf5a315..6d1681deebb9a 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -169,7 +169,7 @@ def ints_to_pydatetime( cdef: Localizer info = Localizer(tz) int64_t value, local_val - intp_t* pos + intp_t* pos = NULL Py_ssize_t i, n = len(stamps) npy_datetimestruct dts tzinfo new_tz @@ -192,7 +192,9 @@ def ints_to_pydatetime( ) if info.use_dst: - pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) + pos = cnp.PyArray_DATA( + info.trans.searchsorted(stamps, side="right") - 1 + ) for i in range(n): new_tz = tz @@ -254,13 +256,15 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: Localizer info = Localizer(tz) int64_t local_val - intp_t* pos + intp_t* pos = NULL Py_ssize_t i, n = len(stamps) npy_datetimestruct dts int reso = RESO_DAY, curr_reso if info.use_dst: - pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) + pos = cnp.PyArray_DATA( + info.trans.searchsorted(stamps, side="right") - 1 + ) for i in range(n): if stamps[i] == NPY_NAT: @@ -305,12 +309,14 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t cdef: Localizer info = Localizer(tz) int64_t local_val - intp_t* pos + intp_t* pos = NULL Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) if info.use_dst: - pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) + pos = cnp.PyArray_DATA( + info.trans.searchsorted(stamps, side="right") - 1 + ) for i in range(n): if stamps[i] == NPY_NAT: @@ -351,12 +357,14 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: cdef: Localizer info = Localizer(tz) int64_t local_val - intp_t* pos + intp_t* pos = NULL Py_ssize_t i, n = len(stamps) int64_t day_nanos = 24 * 3600 * 1_000_000_000 if info.use_dst: - pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) + pos = cnp.PyArray_DATA( + info.trans.searchsorted(stamps, side="right") - 1 + ) for i in range(n): if info.use_utc: @@ -383,13 +391,15 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Localizer info = Localizer(tz) int64_t local_val - intp_t* pos + intp_t* pos = NULL Py_ssize_t i, n = len(stamps) npy_datetimestruct dts int64_t[:] result = np.empty(n, dtype=np.int64) if info.use_dst: - pos = cnp.PyArray_DATA(info.trans.searchsorted(stamps, side="right") - 1) + pos = cnp.PyArray_DATA( + info.trans.searchsorted(stamps, side="right") - 1 + ) for i in range(n): if stamps[i] == NPY_NAT: From c0b467fcb909c6b4ba515900f6c1053d65dc498a Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 12 Mar 2022 10:11:05 -0500 Subject: [PATCH 13/25] BUG: replace with value also being replaced (#46335) --- doc/source/whatsnew/v1.4.2.rst | 1 + pandas/core/internals/blocks.py | 15 ++++++++++++--- pandas/tests/frame/methods/test_replace.py | 7 +++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.4.2.rst b/doc/source/whatsnew/v1.4.2.rst index 2bdbeb0ab6991..06f1f406c3816 100644 --- a/doc/source/whatsnew/v1.4.2.rst +++ b/doc/source/whatsnew/v1.4.2.rst @@ -18,6 +18,7 @@ Fixed regressions - Fixed regression in :func:`read_csv` killing python process when invalid file input was given for ``engine="c"`` (:issue:`45957`) - Fixed memory performance regression in :meth:`Series.fillna` when called on a :class:`DataFrame` column with ``inplace=True`` (:issue:`46149`) - Provided an alternative solution for passing custom Excel formats in :meth:`.Styler.to_excel`, which was a regression based on stricter CSS validation. Examples available in the documentation for :meth:`.Styler.format` (:issue:`46152`) +- Fixed regression in :meth:`DataFrame.replace` when a replacement value was also a target for replacement (:issue:`46335`) - Fixed regression in :meth:`DataFrame.loc.__setitem__` losing :class:`MultiIndex` names if :class:`DataFrame` was empty before (:issue:`46317`) - diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 3693edbae7d95..69f66973d0954 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -612,9 +612,18 @@ def replace( else: # split so that we only upcast where necessary - return self.split_and_operate( - type(self).replace, to_replace, value, inplace=True - ) + blocks = [] + for i, nb in enumerate(self._split()): + blocks.extend( + type(self).replace( + nb, + to_replace=to_replace, + value=value, + inplace=True, + mask=mask[i : i + 1], + ) + ) + return blocks @final def _replace_regex( diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py index 6b53ef400e53d..2eb300a8905b8 100644 --- a/pandas/tests/frame/methods/test_replace.py +++ b/pandas/tests/frame/methods/test_replace.py @@ -1542,3 +1542,10 @@ def test_replace_regex_dtype_frame(self, regex): expected_df2 = DataFrame({"A": [1], "B": ["1"]}) result_df2 = df2.replace(to_replace="0", value=1, regex=regex) tm.assert_frame_equal(result_df2, expected_df2) + + def test_replace_with_value_also_being_replaced(self): + # GH46306 + df = DataFrame({"A": [0, 1, 2], "B": [1, 0, 2]}) + result = df.replace({0: 1, 1: np.nan}) + expected = DataFrame({"A": [1, np.nan, 2], "B": [np.nan, 1, 2]}) + tm.assert_frame_equal(result, expected) From 63343760cd495f5e9e7d388f67e5103f3aac69bc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 12 Mar 2022 13:07:22 -0800 Subject: [PATCH 14/25] REF: de-duplicate libjoin (#46256) --- pandas/_libs/join.pyx | 238 ++++++++++++++++-------------------------- 1 file changed, 89 insertions(+), 149 deletions(-) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index b908fa2c65e4d..3fc97e3660120 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -93,10 +93,13 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, with nogil: # First pass, determine size of result set, do not use the NA group for i in range(1, max_groups + 1): - if right_count[i] > 0: - count += left_count[i] * right_count[i] + lc = left_count[i] + rc = right_count[i] + + if rc > 0: + count += lc * rc else: - count += left_count[i] + count += lc left_indexer = np.empty(count, dtype=np.intp) right_indexer = np.empty(count, dtype=np.intp) @@ -679,7 +682,8 @@ def asof_join_backward_on_X_by_Y(numeric_t[:] left_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=True, - tolerance=None): + tolerance=None, + bint use_hashtable=True): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos @@ -701,12 +705,13 @@ def asof_join_backward_on_X_by_Y(numeric_t[:] left_values, left_indexer = np.empty(left_size, dtype=np.intp) right_indexer = np.empty(left_size, dtype=np.intp) - if by_t is object: - hash_table = PyObjectHashTable(right_size) - elif by_t is int64_t: - hash_table = Int64HashTable(right_size) - elif by_t is uint64_t: - hash_table = UInt64HashTable(right_size) + if use_hashtable: + if by_t is object: + hash_table = PyObjectHashTable(right_size) + elif by_t is int64_t: + hash_table = Int64HashTable(right_size) + elif by_t is uint64_t: + hash_table = UInt64HashTable(right_size) right_pos = 0 for left_pos in range(left_size): @@ -718,19 +723,25 @@ def asof_join_backward_on_X_by_Y(numeric_t[:] left_values, if allow_exact_matches: while (right_pos < right_size and right_values[right_pos] <= left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) + if use_hashtable: + hash_table.set_item(right_by_values[right_pos], right_pos) right_pos += 1 else: while (right_pos < right_size and right_values[right_pos] < left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) + if use_hashtable: + hash_table.set_item(right_by_values[right_pos], right_pos) right_pos += 1 right_pos -= 1 # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = (hash_table.get_item(by_value) - if by_value in hash_table else -1) + if use_hashtable: + by_value = left_by_values[left_pos] + found_right_pos = (hash_table.get_item(by_value) + if by_value in hash_table else -1) + else: + found_right_pos = right_pos + left_indexer[left_pos] = left_pos right_indexer[left_pos] = found_right_pos @@ -748,7 +759,8 @@ def asof_join_forward_on_X_by_Y(numeric_t[:] left_values, by_t[:] left_by_values, by_t[:] right_by_values, bint allow_exact_matches=1, - tolerance=None): + tolerance=None, + bint use_hashtable=True): cdef: Py_ssize_t left_pos, right_pos, left_size, right_size, found_right_pos @@ -770,12 +782,13 @@ def asof_join_forward_on_X_by_Y(numeric_t[:] left_values, left_indexer = np.empty(left_size, dtype=np.intp) right_indexer = np.empty(left_size, dtype=np.intp) - if by_t is object: - hash_table = PyObjectHashTable(right_size) - elif by_t is int64_t: - hash_table = Int64HashTable(right_size) - elif by_t is uint64_t: - hash_table = UInt64HashTable(right_size) + if use_hashtable: + if by_t is object: + hash_table = PyObjectHashTable(right_size) + elif by_t is int64_t: + hash_table = Int64HashTable(right_size) + elif by_t is uint64_t: + hash_table = UInt64HashTable(right_size) right_pos = right_size - 1 for left_pos in range(left_size - 1, -1, -1): @@ -787,19 +800,26 @@ def asof_join_forward_on_X_by_Y(numeric_t[:] left_values, if allow_exact_matches: while (right_pos >= 0 and right_values[right_pos] >= left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) + if use_hashtable: + hash_table.set_item(right_by_values[right_pos], right_pos) right_pos -= 1 else: while (right_pos >= 0 and right_values[right_pos] > left_values[left_pos]): - hash_table.set_item(right_by_values[right_pos], right_pos) + if use_hashtable: + hash_table.set_item(right_by_values[right_pos], right_pos) right_pos -= 1 right_pos += 1 # save positions as the desired index - by_value = left_by_values[left_pos] - found_right_pos = (hash_table.get_item(by_value) - if by_value in hash_table else -1) + if use_hashtable: + by_value = left_by_values[left_pos] + found_right_pos = (hash_table.get_item(by_value) + if by_value in hash_table else -1) + else: + found_right_pos = (right_pos + if right_pos != right_size else -1) + left_indexer[left_pos] = left_pos right_indexer[left_pos] = found_right_pos @@ -820,15 +840,7 @@ def asof_join_nearest_on_X_by_Y(numeric_t[:] left_values, tolerance=None): cdef: - Py_ssize_t left_size, right_size, i - ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - numeric_t bdiff, fdiff - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.intp) - right_indexer = np.empty(left_size, dtype=np.intp) + ndarray[intp_t] bli, bri, fli, fri # search both forward and backward bli, bri = asof_join_backward_on_X_by_Y( @@ -848,6 +860,27 @@ def asof_join_nearest_on_X_by_Y(numeric_t[:] left_values, tolerance, ) + return _choose_smaller_timestamp(left_values, right_values, bli, bri, fli, fri) + + +cdef _choose_smaller_timestamp( + numeric_t[:] left_values, + numeric_t[:] right_values, + ndarray[intp_t] bli, + ndarray[intp_t] bri, + ndarray[intp_t] fli, + ndarray[intp_t] fri, +): + cdef: + ndarray[intp_t] left_indexer, right_indexer + Py_ssize_t left_size, i + numeric_t bdiff, fdiff + + left_size = len(left_values) + + left_indexer = np.empty(left_size, dtype=np.intp) + right_indexer = np.empty(left_size, dtype=np.intp) + for i in range(len(bri)): # choose timestamp from right with smaller difference if bri[i] != -1 and fri[i] != -1: @@ -870,106 +903,30 @@ def asof_join_backward(numeric_t[:] left_values, bint allow_exact_matches=True, tolerance=None): - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[intp_t] left_indexer, right_indexer - bint has_tolerance = False - numeric_t tolerance_ = 0 - numeric_t diff = 0 - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = True - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.intp) - right_indexer = np.empty(left_size, dtype=np.intp) - - right_pos = 0 - for left_pos in range(left_size): - # restart right_pos if it went negative in a previous iteration - if right_pos < 0: - right_pos = 0 - - # find last position in right whose value is less than left's - if allow_exact_matches: - while (right_pos < right_size and - right_values[right_pos] <= left_values[left_pos]): - right_pos += 1 - else: - while (right_pos < right_size and - right_values[right_pos] < left_values[left_pos]): - right_pos += 1 - right_pos -= 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = right_pos - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != -1: - diff = left_values[left_pos] - right_values[right_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer + return asof_join_backward_on_X_by_Y( + left_values, + right_values, + None, + None, + allow_exact_matches=allow_exact_matches, + tolerance=tolerance, + use_hashtable=False, + ) def asof_join_forward(numeric_t[:] left_values, numeric_t[:] right_values, bint allow_exact_matches=True, tolerance=None): - - cdef: - Py_ssize_t left_pos, right_pos, left_size, right_size - ndarray[intp_t] left_indexer, right_indexer - bint has_tolerance = False - numeric_t tolerance_ = 0 - numeric_t diff = 0 - - # if we are using tolerance, set our objects - if tolerance is not None: - has_tolerance = True - tolerance_ = tolerance - - left_size = len(left_values) - right_size = len(right_values) - - left_indexer = np.empty(left_size, dtype=np.intp) - right_indexer = np.empty(left_size, dtype=np.intp) - - right_pos = right_size - 1 - for left_pos in range(left_size - 1, -1, -1): - # restart right_pos if it went over in a previous iteration - if right_pos == right_size: - right_pos = right_size - 1 - - # find first position in right whose value is greater than left's - if allow_exact_matches: - while (right_pos >= 0 and - right_values[right_pos] >= left_values[left_pos]): - right_pos -= 1 - else: - while (right_pos >= 0 and - right_values[right_pos] > left_values[left_pos]): - right_pos -= 1 - right_pos += 1 - - # save positions as the desired index - left_indexer[left_pos] = left_pos - right_indexer[left_pos] = (right_pos - if right_pos != right_size else -1) - - # if needed, verify that tolerance is met - if has_tolerance and right_pos != right_size: - diff = right_values[right_pos] - left_values[left_pos] - if diff > tolerance_: - right_indexer[left_pos] = -1 - - return left_indexer, right_indexer + return asof_join_forward_on_X_by_Y( + left_values, + right_values, + None, + None, + allow_exact_matches=allow_exact_matches, + tolerance=tolerance, + use_hashtable=False, + ) def asof_join_nearest(numeric_t[:] left_values, @@ -978,14 +935,7 @@ def asof_join_nearest(numeric_t[:] left_values, tolerance=None): cdef: - Py_ssize_t left_size, i - ndarray[intp_t] left_indexer, right_indexer, bli, bri, fli, fri - numeric_t bdiff, fdiff - - left_size = len(left_values) - - left_indexer = np.empty(left_size, dtype=np.intp) - right_indexer = np.empty(left_size, dtype=np.intp) + ndarray[intp_t] bli, bri, fli, fri # search both forward and backward bli, bri = asof_join_backward(left_values, right_values, @@ -993,14 +943,4 @@ def asof_join_nearest(numeric_t[:] left_values, fli, fri = asof_join_forward(left_values, right_values, allow_exact_matches, tolerance) - for i in range(len(bri)): - # choose timestamp from right with smaller difference - if bri[i] != -1 and fri[i] != -1: - bdiff = left_values[bli[i]] - right_values[bri[i]] - fdiff = right_values[fri[i]] - left_values[fli[i]] - right_indexer[i] = bri[i] if bdiff <= fdiff else fri[i] - else: - right_indexer[i] = bri[i] if bri[i] != -1 else fri[i] - left_indexer[i] = bli[i] - - return left_indexer, right_indexer + return _choose_smaller_timestamp(left_values, right_values, bli, bri, fli, fri) From eddacb38d83d20b8608cd0d8843b0b01b8250def Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 13 Mar 2022 09:37:23 -0700 Subject: [PATCH 15/25] use bisect_right_i8 --- pandas/_libs/tslibs/vectorized.pyx | 65 +++++++++++++++++------------- 1 file changed, 36 insertions(+), 29 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 6d1681deebb9a..e5115be1363a0 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -8,6 +8,7 @@ from cpython.datetime cimport ( ) import numpy as np + cimport numpy as cnp from numpy cimport ( int64_t, @@ -37,7 +38,10 @@ from .timezones cimport ( is_tzlocal, is_utc, ) -from .tzconversion cimport tz_convert_utc_to_tzlocal +from .tzconversion cimport ( + bisect_right_i8, + tz_convert_utc_to_tzlocal, +) cdef const int64_t[::1] _deltas_placeholder = np.array([], dtype=np.int64) @@ -50,6 +54,7 @@ cdef class Localizer: tzinfo tz bint use_utc, use_fixed, use_tzlocal, use_dst, use_pytz ndarray trans + Py_ssize_t ntrans const int64_t[::1] deltas int64_t delta str typ @@ -60,6 +65,7 @@ cdef class Localizer: self.tz = tz self.use_utc = self.use_tzlocal = self.use_fixed = False self.use_dst = self.use_pytz = False + self.ntrans = -1 # placeholder self.delta = -1 # placeholder self.deltas = _deltas_placeholder @@ -72,6 +78,7 @@ cdef class Localizer: else: trans, deltas, typ = get_dst_info(tz) self.trans = trans + self.ntrans = trans.shape[0] self.deltas = deltas self.typ = typ @@ -169,7 +176,8 @@ def ints_to_pydatetime( cdef: Localizer info = Localizer(tz) int64_t value, local_val - intp_t* pos = NULL + Py_ssize_t pos + int64_t* tdata Py_ssize_t i, n = len(stamps) npy_datetimestruct dts tzinfo new_tz @@ -192,9 +200,7 @@ def ints_to_pydatetime( ) if info.use_dst: - pos = cnp.PyArray_DATA( - info.trans.searchsorted(stamps, side="right") - 1 - ) + tdata = cnp.PyArray_DATA(info.trans) for i in range(n): new_tz = tz @@ -211,11 +217,12 @@ def ints_to_pydatetime( elif info.use_fixed: local_val = value + info.delta else: - local_val = value + info.deltas[pos[i]] + pos = bisect_right_i8(tdata, value, info.ntrans) - 1 + local_val = value + info.deltas[pos] - if info.use_pytz: - # find right representation of dst etc in pytz timezone - new_tz = tz._tzinfos[tz._transition_info[pos[i]]] + if info.use_pytz: + # find right representation of dst etc in pytz timezone + new_tz = tz._tzinfos[tz._transition_info[pos]] dt64_to_dtstruct(local_val, &dts) result[i] = func_create(value, dts, new_tz, freq, fold) @@ -256,15 +263,14 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: cdef: Localizer info = Localizer(tz) int64_t local_val - intp_t* pos = NULL + Py_ssize_t pos + int64_t* tdata Py_ssize_t i, n = len(stamps) npy_datetimestruct dts int reso = RESO_DAY, curr_reso if info.use_dst: - pos = cnp.PyArray_DATA( - info.trans.searchsorted(stamps, side="right") - 1 - ) + tdata = cnp.PyArray_DATA(info.trans) for i in range(n): if stamps[i] == NPY_NAT: @@ -277,7 +283,8 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: elif info.use_fixed: local_val = stamps[i] + info.delta else: - local_val = stamps[i] + info.deltas[pos[i]] + pos = bisect_right_i8(tdata, stamps[i], info.ntrans) - 1 + local_val = stamps[i] + info.deltas[pos] dt64_to_dtstruct(local_val, &dts) curr_reso = _reso_stamp(&dts) @@ -309,14 +316,13 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t cdef: Localizer info = Localizer(tz) int64_t local_val - intp_t* pos = NULL + Py_ssize_t pos + int64_t* tdata Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) if info.use_dst: - pos = cnp.PyArray_DATA( - info.trans.searchsorted(stamps, side="right") - 1 - ) + tdata = cnp.PyArray_DATA(info.trans) for i in range(n): if stamps[i] == NPY_NAT: @@ -330,7 +336,8 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t elif info.use_fixed: local_val = stamps[i] + info.delta else: - local_val = stamps[i] + info.deltas[pos[i]] + pos = bisect_right_i8(tdata, stamps[i], info.ntrans) - 1 + local_val = stamps[i] + info.deltas[pos] result[i] = normalize_i8_stamp(local_val) @@ -357,14 +364,13 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: cdef: Localizer info = Localizer(tz) int64_t local_val - intp_t* pos = NULL + Py_ssize_t pos Py_ssize_t i, n = len(stamps) int64_t day_nanos = 24 * 3600 * 1_000_000_000 + int64_t* tdata if info.use_dst: - pos = cnp.PyArray_DATA( - info.trans.searchsorted(stamps, side="right") - 1 - ) + tdata = cnp.PyArray_DATA(info.trans) for i in range(n): if info.use_utc: @@ -374,7 +380,8 @@ def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None) -> bool: elif info.use_fixed: local_val = stamps[i] + info.delta else: - local_val = stamps[i] + info.deltas[pos[i]] + pos = bisect_right_i8(tdata, stamps[i], info.ntrans) - 1 + local_val = stamps[i] + info.deltas[pos] if local_val % day_nanos != 0: return False @@ -391,15 +398,14 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): cdef: Localizer info = Localizer(tz) int64_t local_val - intp_t* pos = NULL + Py_ssize_t pos Py_ssize_t i, n = len(stamps) npy_datetimestruct dts + int64_t* tdata int64_t[:] result = np.empty(n, dtype=np.int64) if info.use_dst: - pos = cnp.PyArray_DATA( - info.trans.searchsorted(stamps, side="right") - 1 - ) + tdata = cnp.PyArray_DATA(info.trans) for i in range(n): if stamps[i] == NPY_NAT: @@ -413,7 +419,8 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): elif info.use_fixed: local_val = stamps[i] + info.delta else: - local_val = stamps[i] + info.deltas[pos[i]] + pos = bisect_right_i8(tdata, stamps[i], info.ntrans) - 1 + local_val = stamps[i] + info.deltas[pos] dt64_to_dtstruct(local_val, &dts) result[i] = get_period_ordinal(&dts, freq) From c1085df7b89784a9a561b2ec93f3d84590ceecc7 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 16 Mar 2022 13:26:48 -0700 Subject: [PATCH 16/25] micro-optimize? --- pandas/_libs/tslibs/vectorized.pyx | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 530b9052d32b9..341c3af4bba7b 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -57,7 +57,6 @@ cdef class Localizer: Py_ssize_t ntrans const int64_t[::1] deltas int64_t delta - str typ @cython.initializedcheck(False) @cython.boundscheck(False) @@ -80,9 +79,8 @@ cdef class Localizer: self.trans = trans self.ntrans = trans.shape[0] self.deltas = deltas - self.typ = typ - if typ not in ["pytz", "dateutil"]: + if typ != "pytz" and typ != "dateutil": # static/fixed; in this case we know that len(delta) == 1 self.use_fixed = True self.delta = deltas[0] From ebb53706dc3d73032e5c04e00f40231f3b96c2f4 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 18 Mar 2022 10:01:47 -0700 Subject: [PATCH 17/25] troubleshoot --- pandas/_libs/tslibs/vectorized.pyx | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 341c3af4bba7b..38f06b94e3783 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -361,6 +361,7 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): int64_t local_val npy_datetimestruct dts int64_t[:] result = np.empty(n, dtype=np.int64) + bint use_utc=info.use_utc, use_tzlocal=info.use_tzlocal, use_fixed=info.use_fixed if info.use_dst: tdata = cnp.PyArray_DATA(info.trans) @@ -370,11 +371,11 @@ def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): result[i] = NPY_NAT continue - if info.use_utc: + if use_utc: local_val = stamps[i] - elif info.use_tzlocal: + elif use_tzlocal: local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - elif info.use_fixed: + elif use_fixed: local_val = stamps[i] + info.delta else: pos = bisect_right_i8(tdata, stamps[i], info.ntrans) - 1 From 11e3e701f615db40936bd8ab4b3b4df2c9f8fe07 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 20 Mar 2022 07:52:22 -0700 Subject: [PATCH 18/25] troubleshoot --- pandas/_libs/tslibs/vectorized.pyx | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 91a4539486fdd..b1ac78e5ecaa8 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -261,6 +261,14 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: # ------------------------------------------------------------------------- +''' +When using info.use_utc checks inside the loop... ++ 2.40±0.06ms 3.96±0.2ms 1.65 tslibs.normalize.Normalize.time_normalize_i8_timestamps(1000000, datetime.timezone(datetime.timedelta(seconds=3600))) ++ 30.6±2μs 48.3±1μs 1.58 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, datetime.timezone(datetime.timedelta(seconds=3600))) ++ 3.56±0.08ms 5.24±0.4ms 1.47 tslibs.normalize.Normalize.time_normalize_i8_timestamps(1000000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) ++ 39.5±2μs 57.5±3μs 1.46 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) +''' + @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz): @@ -283,6 +291,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t int64_t utc_val, local_val Py_ssize_t pos, i, n = stamps.shape[0] int64_t* tdata = NULL + bint use_utc=info.use_utc,use_tzlocal=info.use_tzlocal,use_fixed=info.use_fixed int64_t[::1] result = np.empty(n, dtype=np.int64) @@ -295,11 +304,11 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result[i] = NPY_NAT continue - if info.use_utc: + if use_utc: local_val = utc_val - elif info.use_tzlocal: + elif use_tzlocal: local_val = utc_val + localize_tzinfo_api(utc_val, tz) - elif info.use_fixed: + elif use_fixed: local_val = utc_val + info.delta else: pos = bisect_right_i8(tdata, utc_val, info.ntrans) - 1 From a46d12791acc230792909b0b5bbca232e8082030 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 20 Mar 2022 08:06:35 -0700 Subject: [PATCH 19/25] inline info.delta --- pandas/_libs/tslibs/vectorized.pyx | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index b1ac78e5ecaa8..fc45f991f6660 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -267,6 +267,24 @@ When using info.use_utc checks inside the loop... + 30.6±2μs 48.3±1μs 1.58 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, datetime.timezone(datetime.timedelta(seconds=3600))) + 3.56±0.08ms 5.24±0.4ms 1.47 tslibs.normalize.Normalize.time_normalize_i8_timestamps(1000000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) + 39.5±2μs 57.5±3μs 1.46 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) + +With info.use_utc checked just once... ++ 30.3±2μs 38.7±0.8μs 1.27 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, datetime.timezone(datetime.timedelta(seconds=3600))) ++ 447±10ns 552±30ns 1.24 tslibs.normalize.Normalize.time_is_date_array_normalized(1, datetime.timezone.utc) ++ 3.63±0.1ms 4.46±0.2ms 1.23 tslibs.normalize.Normalize.time_normalize_i8_timestamps(1000000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) ++ 2.05±0.02μs 2.49±0.2μs 1.21 tslibs.normalize.Normalize.time_is_date_array_normalized(100, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) ++ 2.55±0.2ms 3.09±0.07ms 1.21 tslibs.normalize.Normalize.time_normalize_i8_timestamps(1000000, datetime.timezone(datetime.timedelta(seconds=3600))) ++ 528±10ns 630±10ns 1.19 tslibs.normalize.Normalize.time_is_date_array_normalized(1, tzlocal()) + +With info.use_utc checked just once... (re-run) ++ 29.7±1μs 37.7±0.7μs 1.27 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, datetime.timezone(datetime.timedelta(seconds=3600))) ++ 518±9ns 640±10ns 1.24 tslibs.normalize.Normalize.time_is_date_array_normalized(0, tzlocal()) ++ 40.1±0.8μs 46.7±1μs 1.17 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) ++ 515±10ns 578±10ns 1.12 tslibs.normalize.Normalize.time_is_date_array_normalized(0, None) ++ 9.15±0.2ms 10.1±0.4ms 1.11 tslibs.normalize.Normalize.time_is_date_array_normalized(1000000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) +- 8.27±0.3ms 7.11±0.2ms 0.86 tslibs.normalize.Normalize.time_is_date_array_normalized(1000000, datetime.timezone.utc) +- 90.2±7μs 70.4±2μs 0.78 tslibs.normalize.Normalize.time_is_date_array_normalized(10000, None) + ''' @cython.wraparound(False) @@ -288,7 +306,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t """ cdef: Localizer info = Localizer(tz) - int64_t utc_val, local_val + int64_t utc_val, local_val, delta=info.delta Py_ssize_t pos, i, n = stamps.shape[0] int64_t* tdata = NULL bint use_utc=info.use_utc,use_tzlocal=info.use_tzlocal,use_fixed=info.use_fixed @@ -309,7 +327,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t elif use_tzlocal: local_val = utc_val + localize_tzinfo_api(utc_val, tz) elif use_fixed: - local_val = utc_val + info.delta + local_val = utc_val + delta else: pos = bisect_right_i8(tdata, utc_val, info.ntrans) - 1 local_val = utc_val + info.deltas[pos] From c4790504ba2a551aebfca7f505848b8b35a23e3d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 20 Mar 2022 08:37:55 -0700 Subject: [PATCH 20/25] revert troubleshooting --- pandas/_libs/tslibs/vectorized.pyx | 36 +++++------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index fc45f991f6660..246eaa64ea263 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -261,31 +261,6 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: # ------------------------------------------------------------------------- -''' -When using info.use_utc checks inside the loop... -+ 2.40±0.06ms 3.96±0.2ms 1.65 tslibs.normalize.Normalize.time_normalize_i8_timestamps(1000000, datetime.timezone(datetime.timedelta(seconds=3600))) -+ 30.6±2μs 48.3±1μs 1.58 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, datetime.timezone(datetime.timedelta(seconds=3600))) -+ 3.56±0.08ms 5.24±0.4ms 1.47 tslibs.normalize.Normalize.time_normalize_i8_timestamps(1000000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) -+ 39.5±2μs 57.5±3μs 1.46 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) - -With info.use_utc checked just once... -+ 30.3±2μs 38.7±0.8μs 1.27 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, datetime.timezone(datetime.timedelta(seconds=3600))) -+ 447±10ns 552±30ns 1.24 tslibs.normalize.Normalize.time_is_date_array_normalized(1, datetime.timezone.utc) -+ 3.63±0.1ms 4.46±0.2ms 1.23 tslibs.normalize.Normalize.time_normalize_i8_timestamps(1000000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) -+ 2.05±0.02μs 2.49±0.2μs 1.21 tslibs.normalize.Normalize.time_is_date_array_normalized(100, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) -+ 2.55±0.2ms 3.09±0.07ms 1.21 tslibs.normalize.Normalize.time_normalize_i8_timestamps(1000000, datetime.timezone(datetime.timedelta(seconds=3600))) -+ 528±10ns 630±10ns 1.19 tslibs.normalize.Normalize.time_is_date_array_normalized(1, tzlocal()) - -With info.use_utc checked just once... (re-run) -+ 29.7±1μs 37.7±0.7μs 1.27 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, datetime.timezone(datetime.timedelta(seconds=3600))) -+ 518±9ns 640±10ns 1.24 tslibs.normalize.Normalize.time_is_date_array_normalized(0, tzlocal()) -+ 40.1±0.8μs 46.7±1μs 1.17 tslibs.normalize.Normalize.time_normalize_i8_timestamps(10000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) -+ 515±10ns 578±10ns 1.12 tslibs.normalize.Normalize.time_is_date_array_normalized(0, None) -+ 9.15±0.2ms 10.1±0.4ms 1.11 tslibs.normalize.Normalize.time_is_date_array_normalized(1000000, tzfile('/usr/share/zoneinfo/Asia/Tokyo')) -- 8.27±0.3ms 7.11±0.2ms 0.86 tslibs.normalize.Normalize.time_is_date_array_normalized(1000000, datetime.timezone.utc) -- 90.2±7μs 70.4±2μs 0.78 tslibs.normalize.Normalize.time_is_date_array_normalized(10000, None) - -''' @cython.wraparound(False) @cython.boundscheck(False) @@ -306,10 +281,9 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t """ cdef: Localizer info = Localizer(tz) - int64_t utc_val, local_val, delta=info.delta + int64_t utc_val, local_val Py_ssize_t pos, i, n = stamps.shape[0] int64_t* tdata = NULL - bint use_utc=info.use_utc,use_tzlocal=info.use_tzlocal,use_fixed=info.use_fixed int64_t[::1] result = np.empty(n, dtype=np.int64) @@ -322,12 +296,12 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t result[i] = NPY_NAT continue - if use_utc: + if info.use_utc: local_val = utc_val - elif use_tzlocal: + elif info.use_tzlocal: local_val = utc_val + localize_tzinfo_api(utc_val, tz) - elif use_fixed: - local_val = utc_val + delta + elif info.use_fixed: + local_val = utc_val + info.delta else: pos = bisect_right_i8(tdata, utc_val, info.ntrans) - 1 local_val = utc_val + info.deltas[pos] From b4427a55a23d6152f42bd173e74c85ba4b7b0361 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 20 Mar 2022 08:45:35 -0700 Subject: [PATCH 21/25] troubleshoot --- pandas/_libs/tslibs/vectorized.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 246eaa64ea263..bdad24585c0ca 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -50,7 +50,7 @@ cdef const int64_t[::1] _deltas_placeholder = np.array([], dtype=np.int64) @cython.freelist(16) @cython.final cdef class Localizer: - cdef readonly: + cdef: tzinfo tz bint use_utc, use_fixed, use_tzlocal, use_dst, use_pytz ndarray trans From 1b620b9a7c7e11e1815305c79f8d37ffeae7a563 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 20 Mar 2022 08:50:50 -0700 Subject: [PATCH 22/25] troubleshoot --- pandas/_libs/tslibs/vectorized.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index bdad24585c0ca..13440e61e5762 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -48,6 +48,7 @@ cdef const int64_t[::1] _deltas_placeholder = np.array([], dtype=np.int64) @cython.freelist(16) +@cython.private @cython.final cdef class Localizer: cdef: From a1b3e00e30418195a1ddd67620d9b627972ea87d Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 20 Mar 2022 08:52:08 -0700 Subject: [PATCH 23/25] troubleshoot --- pandas/_libs/tslibs/vectorized.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 13440e61e5762..392be4ac0d1aa 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -48,7 +48,7 @@ cdef const int64_t[::1] _deltas_placeholder = np.array([], dtype=np.int64) @cython.freelist(16) -@cython.private +@cython.internal @cython.final cdef class Localizer: cdef: From a5c514210d5d68e582fa3afca8e3d77ef1f4ca28 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 28 Mar 2022 20:25:09 -0700 Subject: [PATCH 24/25] try a thing --- pandas/_libs/tslibs/vectorized.pyx | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 33ef57240d398..87711463fbf05 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -32,10 +32,7 @@ from .np_datetime cimport ( ) from .offsets cimport BaseOffset from .period cimport get_period_ordinal -from .timestamps cimport ( - create_timestamp_from_ts, - normalize_i8_stamp, -) +from .timestamps cimport create_timestamp_from_ts from .timezones cimport ( get_dst_info, is_tzlocal, @@ -258,6 +255,7 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: # ------------------------------------------------------------------------- +@cdivision(False) @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz): @@ -302,7 +300,7 @@ cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo t pos = bisect_right_i8(tdata, utc_val, info.ntrans) - 1 local_val = utc_val + info.deltas[pos] - result[i] = normalize_i8_stamp(local_val) + result[i] = local_val - (local_val % DAY_NANOS) return result.base # `.base` to access underlying ndarray From 76132b6dc7bcd687ce998a1494c2b6228f8981e0 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 28 Mar 2022 20:26:50 -0700 Subject: [PATCH 25/25] woops --- pandas/_libs/tslibs/vectorized.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx index 87711463fbf05..fac2f15e4e1c2 100644 --- a/pandas/_libs/tslibs/vectorized.pyx +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -255,7 +255,7 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution: # ------------------------------------------------------------------------- -@cdivision(False) +@cython.cdivision(False) @cython.wraparound(False) @cython.boundscheck(False) cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz):