From fd4339248cad008aba2bb5a9e0283cc6b825eccd Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Jul 2020 13:17:08 -0700 Subject: [PATCH 1/2] REF: collect get_dst_info methods in tslibs.vectorized --- asv_bench/benchmarks/tslibs/resolution.py | 5 +- asv_bench/benchmarks/tslibs/tslib.py | 5 +- pandas/_libs/tslib.pyx | 170 +-------- pandas/_libs/tslibs/__init__.py | 12 + pandas/_libs/tslibs/conversion.pxd | 1 - pandas/_libs/tslibs/conversion.pyx | 127 ------- pandas/_libs/tslibs/offsets.pyx | 3 +- pandas/_libs/tslibs/period.pxd | 5 + pandas/_libs/tslibs/period.pyx | 57 --- pandas/_libs/tslibs/resolution.pyx | 98 +---- pandas/_libs/tslibs/vectorized.pyx | 440 ++++++++++++++++++++++ pandas/core/arrays/datetimes.py | 20 +- pandas/core/arrays/period.py | 5 +- pandas/core/dtypes/cast.py | 5 +- pandas/core/indexes/datetimes.py | 13 +- pandas/tests/tslibs/test_api.py | 6 + setup.py | 2 + 17 files changed, 506 insertions(+), 468 deletions(-) create mode 100644 pandas/_libs/tslibs/vectorized.pyx diff --git a/asv_bench/benchmarks/tslibs/resolution.py b/asv_bench/benchmarks/tslibs/resolution.py index 274aa1ad6d4a9..280be7932d4db 100644 --- a/asv_bench/benchmarks/tslibs/resolution.py +++ b/asv_bench/benchmarks/tslibs/resolution.py @@ -23,7 +23,10 @@ import numpy as np import pytz -from pandas._libs.tslibs.resolution import get_resolution +try: + from pandas._libs.tslibs import get_resolution +except ImportError: + from pandas._libs.tslibs.resolution import get_resolution class TimeResolution: diff --git a/asv_bench/benchmarks/tslibs/tslib.py b/asv_bench/benchmarks/tslibs/tslib.py index eacf5a5731dc2..5952a402bf89a 100644 --- a/asv_bench/benchmarks/tslibs/tslib.py +++ b/asv_bench/benchmarks/tslibs/tslib.py @@ -21,7 +21,10 @@ import numpy as np import pytz -from pandas._libs.tslib import ints_to_pydatetime +try: + from pandas._libs.tslibs import ints_to_pydatetime +except ImportError: + from pandas._libs.tslib import ints_to_pydatetime _tzs = [ None, diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 3472dbf161b8e..d70d0378a2621 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -4,18 +4,14 @@ from cpython.datetime cimport ( PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, - date, datetime, - time, - timedelta, - tzinfo, ) # import datetime C API PyDateTime_IMPORT cimport numpy as cnp -from numpy cimport float64_t, int64_t, ndarray, uint8_t, intp_t +from numpy cimport float64_t, int64_t, ndarray import numpy as np cnp.import_array() @@ -42,11 +38,6 @@ from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime from pandas._libs.tslibs.parsing import parse_datetime_string -from pandas._libs.tslibs.timezones cimport ( - get_dst_info, - is_utc, - is_tzlocal, -) from pandas._libs.tslibs.conversion cimport ( _TSObject, cast_from_unit, @@ -60,13 +51,10 @@ from pandas._libs.tslibs.nattype cimport ( c_nat_strings as nat_strings, ) -from pandas._libs.tslibs.offsets cimport to_offset - -from pandas._libs.tslibs.timestamps cimport create_timestamp_from_ts, _Timestamp +from pandas._libs.tslibs.timestamps cimport _Timestamp from pandas._libs.tslibs.timestamps import Timestamp from pandas._libs.tslibs.tzconversion cimport ( - tz_convert_utc_to_tzlocal, tz_localize_to_utc_single, ) @@ -74,160 +62,6 @@ from pandas._libs.tslibs.tzconversion cimport ( from pandas._libs.missing cimport checknull_with_nat_and_na -cdef inline object create_datetime_from_ts( - int64_t value, - npy_datetimestruct dts, - tzinfo tz, - object freq, - bint fold, -): - """ - Convenience routine to construct a datetime.datetime from its parts. - """ - return datetime( - dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold - ) - - -cdef inline object create_date_from_ts( - int64_t value, - npy_datetimestruct dts, - tzinfo tz, - object freq, - bint fold -): - """ - Convenience routine to construct a datetime.date from its parts. - """ - # GH 25057 add fold argument to match other func_create signatures - return date(dts.year, dts.month, dts.day) - - -cdef inline object create_time_from_ts( - int64_t value, - npy_datetimestruct dts, - tzinfo tz, - object freq, - bint fold -): - """ - Convenience routine to construct a datetime.time from its parts. - """ - return time(dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def ints_to_pydatetime( - const int64_t[:] arr, - tzinfo tz=None, - object freq=None, - bint fold=False, - str box="datetime" -): - """ - Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp. - - Parameters - ---------- - arr : array of i8 - tz : str, optional - convert to this timezone - freq : str/Offset, optional - freq to convert - fold : bint, default is 0 - Due to daylight saving time, one wall clock time can occur twice - when shifting from summer to winter time; fold describes whether the - datetime-like corresponds to the first (0) or the second time (1) - the wall clock hits the ambiguous time - - .. versionadded:: 1.1.0 - box : {'datetime', 'timestamp', 'date', 'time'}, default 'datetime' - * If datetime, convert to datetime.datetime - * If date, convert to datetime.date - * If time, convert to datetime.time - * If Timestamp, convert to pandas.Timestamp - - Returns - ------- - ndarray of dtype specified by box - """ - cdef: - Py_ssize_t i, n = len(arr) - ndarray[int64_t] trans - int64_t[:] deltas - intp_t[:] pos - npy_datetimestruct dts - object dt, new_tz - str typ - int64_t value, local_value, delta = NPY_NAT # dummy for delta - ndarray[object] result = np.empty(n, dtype=object) - object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) - bint use_utc = False, use_tzlocal = False, use_fixed = False - bint use_pytz = False - - if box == "date": - assert (tz is None), "tz should be None when converting to date" - - func_create = create_date_from_ts - elif box == "timestamp": - func_create = create_timestamp_from_ts - - if isinstance(freq, str): - freq = to_offset(freq) - elif box == "time": - func_create = create_time_from_ts - elif box == "datetime": - func_create = create_datetime_from_ts - else: - raise ValueError( - "box must be one of 'datetime', 'date', 'time' or 'timestamp'" - ) - - if is_utc(tz) or tz is None: - use_utc = True - elif is_tzlocal(tz): - use_tzlocal = True - else: - trans, deltas, typ = get_dst_info(tz) - if typ not in ["pytz", "dateutil"]: - # static/fixed; in this case we know that len(delta) == 1 - use_fixed = True - delta = deltas[0] - else: - pos = trans.searchsorted(arr, side="right") - 1 - use_pytz = typ == "pytz" - - for i in range(n): - new_tz = tz - value = arr[i] - - if value == NPY_NAT: - result[i] = NaT - else: - if use_utc: - local_value = value - elif use_tzlocal: - local_value = tz_convert_utc_to_tzlocal(value, tz) - elif use_fixed: - local_value = value + delta - elif not use_pytz: - # i.e. dateutil - # no zone-name change for dateutil tzs - dst etc - # represented in single object. - local_value = value + deltas[pos[i]] - else: - # pytz - # find right representation of dst etc in pytz timezone - new_tz = tz._tzinfos[tz._transition_info[pos[i]]] - local_value = value + deltas[pos[i]] - - dt64_to_dtstruct(local_value, &dts) - result[i] = func_create(value, dts, new_tz, freq, fold) - - return result - - def _test_parse_iso8601(ts: str): """ TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used diff --git a/pandas/_libs/tslibs/__init__.py b/pandas/_libs/tslibs/__init__.py index 76e356370de70..c2f3478a50ab4 100644 --- a/pandas/_libs/tslibs/__init__.py +++ b/pandas/_libs/tslibs/__init__.py @@ -11,8 +11,13 @@ "Period", "Resolution", "Timedelta", + "normalize_i8_timestamps", + "is_date_array_normalized", + "dt64arr_to_periodarr", "delta_to_nanoseconds", + "ints_to_pydatetime", "ints_to_pytimedelta", + "get_resolution", "Timestamp", "tz_convert_single", "to_offset", @@ -30,3 +35,10 @@ from .timedeltas import Timedelta, delta_to_nanoseconds, ints_to_pytimedelta from .timestamps import Timestamp from .tzconversion import tz_convert_single +from .vectorized import ( + dt64arr_to_periodarr, + get_resolution, + ints_to_pydatetime, + is_date_array_normalized, + normalize_i8_timestamps, +) diff --git a/pandas/_libs/tslibs/conversion.pxd b/pandas/_libs/tslibs/conversion.pxd index 0eb94fecf7d6b..73772e5ab4577 100644 --- a/pandas/_libs/tslibs/conversion.pxd +++ b/pandas/_libs/tslibs/conversion.pxd @@ -25,5 +25,4 @@ cdef int64_t get_datetime64_nanos(object val) except? -1 cpdef datetime localize_pydatetime(datetime dt, object tz) cdef int64_t cast_from_unit(object ts, str unit) except? -1 -cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz) cdef int64_t normalize_i8_stamp(int64_t local_val) nogil diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 36a4a1f60d8b9..31d2d0e9572f5 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -765,73 +765,6 @@ cpdef inline datetime localize_pydatetime(datetime dt, object tz): # ---------------------------------------------------------------------- # Normalization - -@cython.wraparound(False) -@cython.boundscheck(False) -cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz): - """ - Normalize each of the (nanosecond) timezone aware timestamps in the given - array by rounding down to the beginning of the day (i.e. midnight). - This is midnight for timezone, `tz`. - - Parameters - ---------- - stamps : int64 ndarray - tz : tzinfo or None - - Returns - ------- - result : int64 ndarray of converted of normalized nanosecond timestamps - """ - cdef: - Py_ssize_t i, n = len(stamps) - int64_t[:] result = np.empty(n, dtype=np.int64) - ndarray[int64_t] trans - int64_t[:] deltas - str typ - Py_ssize_t[:] pos - int64_t delta, local_val - - if tz is None or is_utc(tz): - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] - result[i] = normalize_i8_stamp(local_val) - elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - result[i] = normalize_i8_stamp(local_val) - else: - # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = get_dst_info(tz) - - if typ not in ['pytz', 'dateutil']: - # static/fixed; in this case we know that len(delta) == 1 - delta = deltas[0] - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + delta - result[i] = normalize_i8_stamp(local_val) - else: - pos = trans.searchsorted(stamps, side='right') - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = stamps[i] + deltas[pos[i]] - result[i] = normalize_i8_stamp(local_val) - - return result.base # `.base` to access underlying ndarray - - @cython.cdivision cdef inline int64_t normalize_i8_stamp(int64_t local_val) nogil: """ @@ -848,63 +781,3 @@ cdef inline int64_t normalize_i8_stamp(int64_t local_val) nogil: cdef: int64_t day_nanos = 24 * 3600 * 1_000_000_000 return local_val - (local_val % day_nanos) - - -@cython.wraparound(False) -@cython.boundscheck(False) -def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None): - """ - Check if all of the given (nanosecond) timestamps are normalized to - midnight, i.e. hour == minute == second == 0. If the optional timezone - `tz` is not None, then this is midnight for this timezone. - - Parameters - ---------- - stamps : int64 ndarray - tz : tzinfo or None - - Returns - ------- - is_normalized : bool True if all stamps are normalized - """ - cdef: - Py_ssize_t i, n = len(stamps) - ndarray[int64_t] trans - int64_t[:] deltas - intp_t[:] pos - int64_t local_val, delta - str typ - int64_t day_nanos = 24 * 3600 * 1_000_000_000 - - if tz is None or is_utc(tz): - for i in range(n): - local_val = stamps[i] - if local_val % day_nanos != 0: - return False - - elif is_tzlocal(tz): - for i in range(n): - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - if local_val % day_nanos != 0: - return False - else: - trans, deltas, typ = get_dst_info(tz) - - if typ not in ['pytz', 'dateutil']: - # static/fixed; in this case we know that len(delta) == 1 - delta = deltas[0] - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + delta - if local_val % day_nanos != 0: - return False - - else: - pos = trans.searchsorted(stamps) - 1 - for i in range(n): - # Adjust datetime64 timestamp, recompute datetimestruct - local_val = stamps[i] + deltas[pos[i]] - if local_val % day_nanos != 0: - return False - - return True diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index e4d05e0d70e2f..fb07e3fe7547e 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -38,7 +38,6 @@ from pandas._libs.tslibs.ccalendar cimport DAY_NANOS, get_days_in_month, dayofwe from pandas._libs.tslibs.conversion cimport ( convert_datetime_to_tsobject, localize_pydatetime, - normalize_i8_timestamps, ) from pandas._libs.tslibs.nattype cimport NPY_NAT, c_NaT as NaT from pandas._libs.tslibs.np_datetime cimport ( @@ -92,6 +91,8 @@ def apply_index_wraps(func): result = np.asarray(result) if self.normalize: + # TODO: Avoid circular/runtime import + from .vectorized import normalize_i8_timestamps result = normalize_i8_timestamps(result.view("i8"), None) return result diff --git a/pandas/_libs/tslibs/period.pxd b/pandas/_libs/tslibs/period.pxd index eb11a4a572e85..9c0342e239a89 100644 --- a/pandas/_libs/tslibs/period.pxd +++ b/pandas/_libs/tslibs/period.pxd @@ -1 +1,6 @@ +from numpy cimport int64_t + +from .np_datetime cimport npy_datetimestruct + cdef bint is_period_object(object obj) +cdef int64_t get_period_ordinal(npy_datetimestruct *dts, int freq) nogil diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c0641297c4b8a..e6ba1968797ed 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -14,7 +14,6 @@ import cython from cpython.datetime cimport ( datetime, - tzinfo, PyDate_Check, PyDateTime_Check, PyDateTime_IMPORT, @@ -41,7 +40,6 @@ cdef extern from "src/datetime/np_datetime.h": cimport pandas._libs.tslibs.util as util from pandas._libs.tslibs.timestamps import Timestamp -from pandas._libs.tslibs.timezones cimport is_utc, is_tzlocal, get_dst_info from pandas._libs.tslibs.timedeltas import Timedelta from pandas._libs.tslibs.timedeltas cimport ( delta_to_nanoseconds, @@ -91,7 +89,6 @@ from pandas._libs.tslibs.offsets cimport ( is_offset_object, ) from pandas._libs.tslibs.offsets import INVALID_FREQ_ERR_MSG -from pandas._libs.tslibs.tzconversion cimport tz_convert_utc_to_tzlocal cdef: @@ -1416,60 +1413,6 @@ def extract_freq(ndarray[object] values): # period helpers -@cython.wraparound(False) -@cython.boundscheck(False) -def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): - cdef: - Py_ssize_t n = len(stamps) - int64_t[:] result = np.empty(n, dtype=np.int64) - ndarray[int64_t] trans - int64_t[:] deltas - Py_ssize_t[:] pos - npy_datetimestruct dts - int64_t local_val - - if is_utc(tz) or tz is None: - with nogil: - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i], &dts) - result[i] = get_period_ordinal(&dts, freq) - - elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - result[i] = get_period_ordinal(&dts, freq) - else: - # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = get_dst_info(tz) - - if typ not in ['pytz', 'dateutil']: - # static/fixed; in this case we know that len(delta) == 1 - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[0], &dts) - result[i] = get_period_ordinal(&dts, freq) - else: - pos = trans.searchsorted(stamps, side='right') - 1 - - for i in range(n): - if stamps[i] == NPY_NAT: - result[i] = NPY_NAT - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - result[i] = get_period_ordinal(&dts, freq) - - return result.base # .base to get underlying ndarray - - DIFFERENT_FREQ = ("Input has different freq={other_freq} " "from {cls}(freq={own_freq})") diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index d5f10374d2860..d2861d8e9fe8d 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -1,105 +1,9 @@ -from cpython.datetime cimport tzinfo import numpy as np -from numpy cimport ndarray, int64_t, int32_t - -from pandas._libs.tslibs.util cimport get_nat +from numpy cimport int32_t from pandas._libs.tslibs.dtypes import Resolution -from pandas._libs.tslibs.np_datetime cimport ( - npy_datetimestruct, dt64_to_dtstruct) -from pandas._libs.tslibs.timezones cimport ( - is_utc, is_tzlocal, get_dst_info) from pandas._libs.tslibs.ccalendar cimport get_days_in_month -from pandas._libs.tslibs.tzconversion cimport tz_convert_utc_to_tzlocal - -# ---------------------------------------------------------------------- -# Constants - -cdef: - int64_t NPY_NAT = get_nat() - - int RESO_NS = 0 - int RESO_US = 1 - int RESO_MS = 2 - int RESO_SEC = 3 - int RESO_MIN = 4 - int RESO_HR = 5 - int RESO_DAY = 6 - int RESO_MTH = 7 - int RESO_QTR = 8 - int RESO_YR = 9 - - -# ---------------------------------------------------------------------- - - -def get_resolution(const int64_t[:] stamps, tzinfo tz=None): - cdef: - Py_ssize_t i, n = len(stamps) - npy_datetimestruct dts - int reso = RESO_DAY, curr_reso - ndarray[int64_t] trans - int64_t[:] deltas - Py_ssize_t[:] pos - int64_t local_val, delta - - if is_utc(tz) or tz is None: - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso - elif is_tzlocal(tz): - for i in range(n): - if stamps[i] == NPY_NAT: - continue - local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) - dt64_to_dtstruct(local_val, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso - else: - # Adjust datetime64 timestamp, recompute datetimestruct - trans, deltas, typ = get_dst_info(tz) - - if typ not in ['pytz', 'dateutil']: - # static/fixed; in this case we know that len(delta) == 1 - delta = deltas[0] - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + delta, &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso - else: - pos = trans.searchsorted(stamps, side='right') - 1 - for i in range(n): - if stamps[i] == NPY_NAT: - continue - dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) - curr_reso = _reso_stamp(&dts) - if curr_reso < reso: - reso = curr_reso - - return Resolution(reso) - - -cdef inline int _reso_stamp(npy_datetimestruct *dts): - if dts.us != 0: - if dts.us % 1000 == 0: - return RESO_MS - return RESO_US - elif dts.sec != 0: - return RESO_SEC - elif dts.min != 0: - return RESO_MIN - elif dts.hour != 0: - return RESO_HR - return RESO_DAY # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/vectorized.pyx b/pandas/_libs/tslibs/vectorized.pyx new file mode 100644 index 0000000000000..c8f8daf6724c2 --- /dev/null +++ b/pandas/_libs/tslibs/vectorized.pyx @@ -0,0 +1,440 @@ +import cython + +from cpython.datetime cimport datetime, date, time, tzinfo + +import numpy as np +from numpy cimport int64_t, intp_t, ndarray + +from .conversion cimport normalize_i8_stamp +from .dtypes import Resolution +from .nattype cimport NPY_NAT, c_NaT as NaT +from .np_datetime cimport npy_datetimestruct, dt64_to_dtstruct +from .offsets cimport to_offset +from .period cimport get_period_ordinal +from .timestamps cimport create_timestamp_from_ts +from .timezones cimport is_utc, is_tzlocal, get_dst_info +from .tzconversion cimport tz_convert_utc_to_tzlocal + +# ------------------------------------------------------------------------- + +cdef inline object create_datetime_from_ts( + int64_t value, + npy_datetimestruct dts, + tzinfo tz, + object freq, + bint fold, +): + """ + Convenience routine to construct a datetime.datetime from its parts. + """ + return datetime( + dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us, + tz, fold=fold, + ) + + +cdef inline object create_date_from_ts( + int64_t value, + npy_datetimestruct dts, + tzinfo tz, + object freq, + bint fold +): + """ + Convenience routine to construct a datetime.date from its parts. + """ + # GH#25057 add fold argument to match other func_create signatures + return date(dts.year, dts.month, dts.day) + + +cdef inline object create_time_from_ts( + int64_t value, + npy_datetimestruct dts, + tzinfo tz, + object freq, + bint fold +): + """ + Convenience routine to construct a datetime.time from its parts. + """ + return time(dts.hour, dts.min, dts.sec, dts.us, tz, fold=fold) + + +@cython.wraparound(False) +@cython.boundscheck(False) +def ints_to_pydatetime( + const int64_t[:] arr, + tzinfo tz=None, + object freq=None, + bint fold=False, + str box="datetime" +): + """ + Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp. + + Parameters + ---------- + arr : array of i8 + tz : str, optional + convert to this timezone + freq : str/Offset, optional + freq to convert + fold : bint, default is 0 + Due to daylight saving time, one wall clock time can occur twice + when shifting from summer to winter time; fold describes whether the + datetime-like corresponds to the first (0) or the second time (1) + the wall clock hits the ambiguous time + + .. versionadded:: 1.1.0 + box : {'datetime', 'timestamp', 'date', 'time'}, default 'datetime' + * If datetime, convert to datetime.datetime + * If date, convert to datetime.date + * If time, convert to datetime.time + * If Timestamp, convert to pandas.Timestamp + + Returns + ------- + ndarray of dtype specified by box + """ + cdef: + Py_ssize_t i, n = len(arr) + ndarray[int64_t] trans + int64_t[:] deltas + intp_t[:] pos + npy_datetimestruct dts + object dt, new_tz + str typ + int64_t value, local_value, delta = NPY_NAT # dummy for delta + ndarray[object] result = np.empty(n, dtype=object) + object (*func_create)(int64_t, npy_datetimestruct, tzinfo, object, bint) + bint use_utc = False, use_tzlocal = False, use_fixed = False + bint use_pytz = False + + if box == "date": + assert (tz is None), "tz should be None when converting to date" + + func_create = create_date_from_ts + elif box == "timestamp": + func_create = create_timestamp_from_ts + + if isinstance(freq, str): + freq = to_offset(freq) + elif box == "time": + func_create = create_time_from_ts + elif box == "datetime": + func_create = create_datetime_from_ts + else: + raise ValueError( + "box must be one of 'datetime', 'date', 'time' or 'timestamp'" + ) + + if is_utc(tz) or tz is None: + use_utc = True + elif is_tzlocal(tz): + use_tzlocal = True + else: + trans, deltas, typ = get_dst_info(tz) + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + use_fixed = True + delta = deltas[0] + else: + pos = trans.searchsorted(arr, side="right") - 1 + use_pytz = typ == "pytz" + + for i in range(n): + new_tz = tz + value = arr[i] + + if value == NPY_NAT: + result[i] = NaT + else: + if use_utc: + local_value = value + elif use_tzlocal: + local_value = tz_convert_utc_to_tzlocal(value, tz) + elif use_fixed: + local_value = value + delta + elif not use_pytz: + # i.e. dateutil + # no zone-name change for dateutil tzs - dst etc + # represented in single object. + local_value = value + deltas[pos[i]] + else: + # pytz + # find right representation of dst etc in pytz timezone + new_tz = tz._tzinfos[tz._transition_info[pos[i]]] + local_value = value + deltas[pos[i]] + + dt64_to_dtstruct(local_value, &dts) + result[i] = func_create(value, dts, new_tz, freq, fold) + + return result + + +# ------------------------------------------------------------------------- + +cdef: + int RESO_NS = 0 + int RESO_US = 1 + int RESO_MS = 2 + int RESO_SEC = 3 + int RESO_MIN = 4 + int RESO_HR = 5 + int RESO_DAY = 6 + int RESO_MTH = 7 + int RESO_QTR = 8 + int RESO_YR = 9 + + +cdef inline int _reso_stamp(npy_datetimestruct *dts): + if dts.us != 0: + if dts.us % 1000 == 0: + return RESO_MS + return RESO_US + elif dts.sec != 0: + return RESO_SEC + elif dts.min != 0: + return RESO_MIN + elif dts.hour != 0: + return RESO_HR + return RESO_DAY + + +def get_resolution(const int64_t[:] stamps, tzinfo tz=None): + cdef: + Py_ssize_t i, n = len(stamps) + npy_datetimestruct dts + int reso = RESO_DAY, curr_reso + ndarray[int64_t] trans + int64_t[:] deltas + Py_ssize_t[:] pos + int64_t local_val, delta + + if is_utc(tz) or tz is None: + for i in range(n): + if stamps[i] == NPY_NAT: + continue + dt64_to_dtstruct(stamps[i], &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso + elif is_tzlocal(tz): + for i in range(n): + if stamps[i] == NPY_NAT: + continue + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso + else: + # Adjust datetime64 timestamp, recompute datetimestruct + trans, deltas, typ = get_dst_info(tz) + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + delta = deltas[0] + for i in range(n): + if stamps[i] == NPY_NAT: + continue + dt64_to_dtstruct(stamps[i] + delta, &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso + else: + pos = trans.searchsorted(stamps, side="right") - 1 + for i in range(n): + if stamps[i] == NPY_NAT: + continue + dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) + curr_reso = _reso_stamp(&dts) + if curr_reso < reso: + reso = curr_reso + + return Resolution(reso) + + +# ------------------------------------------------------------------------- + +@cython.wraparound(False) +@cython.boundscheck(False) +cpdef ndarray[int64_t] normalize_i8_timestamps(const int64_t[:] stamps, tzinfo tz): + """ + Normalize each of the (nanosecond) timezone aware timestamps in the given + array by rounding down to the beginning of the day (i.e. midnight). + This is midnight for timezone, `tz`. + + Parameters + ---------- + stamps : int64 ndarray + tz : tzinfo or None + + Returns + ------- + result : int64 ndarray of converted of normalized nanosecond timestamps + """ + cdef: + Py_ssize_t i, n = len(stamps) + int64_t[:] result = np.empty(n, dtype=np.int64) + ndarray[int64_t] trans + int64_t[:] deltas + str typ + Py_ssize_t[:] pos + int64_t delta, local_val + + if tz is None or is_utc(tz): + with nogil: + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = stamps[i] + result[i] = normalize_i8_stamp(local_val) + elif is_tzlocal(tz): + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + result[i] = normalize_i8_stamp(local_val) + else: + # Adjust datetime64 timestamp, recompute datetimestruct + trans, deltas, typ = get_dst_info(tz) + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + delta = deltas[0] + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = stamps[i] + delta + result[i] = normalize_i8_stamp(local_val) + else: + pos = trans.searchsorted(stamps, side="right") - 1 + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = stamps[i] + deltas[pos[i]] + result[i] = normalize_i8_stamp(local_val) + + return result.base # `.base` to access underlying ndarray + + +@cython.wraparound(False) +@cython.boundscheck(False) +def is_date_array_normalized(const int64_t[:] stamps, tzinfo tz=None): + """ + Check if all of the given (nanosecond) timestamps are normalized to + midnight, i.e. hour == minute == second == 0. If the optional timezone + `tz` is not None, then this is midnight for this timezone. + + Parameters + ---------- + stamps : int64 ndarray + tz : tzinfo or None + + Returns + ------- + is_normalized : bool True if all stamps are normalized + """ + cdef: + Py_ssize_t i, n = len(stamps) + ndarray[int64_t] trans + int64_t[:] deltas + intp_t[:] pos + int64_t local_val, delta + str typ + int64_t day_nanos = 24 * 3600 * 1_000_000_000 + + if tz is None or is_utc(tz): + for i in range(n): + local_val = stamps[i] + if local_val % day_nanos != 0: + return False + + elif is_tzlocal(tz): + for i in range(n): + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + if local_val % day_nanos != 0: + return False + else: + trans, deltas, typ = get_dst_info(tz) + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + delta = deltas[0] + for i in range(n): + # Adjust datetime64 timestamp, recompute datetimestruct + local_val = stamps[i] + delta + if local_val % day_nanos != 0: + return False + + else: + pos = trans.searchsorted(stamps) - 1 + for i in range(n): + # Adjust datetime64 timestamp, recompute datetimestruct + local_val = stamps[i] + deltas[pos[i]] + if local_val % day_nanos != 0: + return False + + return True + + +# ------------------------------------------------------------------------- + + +@cython.wraparound(False) +@cython.boundscheck(False) +def dt64arr_to_periodarr(const int64_t[:] stamps, int freq, tzinfo tz): + cdef: + Py_ssize_t n = len(stamps) + int64_t[:] result = np.empty(n, dtype=np.int64) + ndarray[int64_t] trans + int64_t[:] deltas + Py_ssize_t[:] pos + npy_datetimestruct dts + int64_t local_val + + if is_utc(tz) or tz is None: + with nogil: + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + dt64_to_dtstruct(stamps[i], &dts) + result[i] = get_period_ordinal(&dts, freq) + + elif is_tzlocal(tz): + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + local_val = tz_convert_utc_to_tzlocal(stamps[i], tz) + dt64_to_dtstruct(local_val, &dts) + result[i] = get_period_ordinal(&dts, freq) + else: + # Adjust datetime64 timestamp, recompute datetimestruct + trans, deltas, typ = get_dst_info(tz) + + if typ not in ["pytz", "dateutil"]: + # static/fixed; in this case we know that len(delta) == 1 + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + dt64_to_dtstruct(stamps[i] + deltas[0], &dts) + result[i] = get_period_ordinal(&dts, freq) + else: + pos = trans.searchsorted(stamps, side="right") - 1 + + for i in range(n): + if stamps[i] == NPY_NAT: + result[i] = NPY_NAT + continue + dt64_to_dtstruct(stamps[i] + deltas[pos[i]], &dts) + result[i] = get_period_ordinal(&dts, freq) + + return result.base # .base to get underlying ndarray diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index fcfbaa4ac2a1c..8eac45cdedaec 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -10,7 +10,11 @@ Timestamp, conversion, fields, + get_resolution, iNaT, + ints_to_pydatetime, + is_date_array_normalized, + normalize_i8_timestamps, resolution as libresolution, timezones, to_offset, @@ -526,11 +530,11 @@ def is_normalized(self): """ Returns True if all of the dates are at midnight ("no time") """ - return conversion.is_date_array_normalized(self.asi8, self.tz) + return is_date_array_normalized(self.asi8, self.tz) @property # NB: override with cache_readonly in immutable subclasses def _resolution_obj(self) -> libresolution.Resolution: - return libresolution.get_resolution(self.asi8, self.tz) + return get_resolution(self.asi8, self.tz) # ---------------------------------------------------------------- # Array-Like / EA-Interface Methods @@ -559,7 +563,7 @@ def __iter__(self): for i in range(chunks): start_i = i * chunksize end_i = min((i + 1) * chunksize, length) - converted = tslib.ints_to_pydatetime( + converted = ints_to_pydatetime( data[start_i:end_i], tz=self.tz, freq=self.freq, box="timestamp" ) for v in converted: @@ -991,7 +995,7 @@ def to_pydatetime(self) -> np.ndarray: ------- datetimes : ndarray """ - return tslib.ints_to_pydatetime(self.asi8, tz=self.tz) + return ints_to_pydatetime(self.asi8, tz=self.tz) def normalize(self): """ @@ -1031,7 +1035,7 @@ def normalize(self): '2014-08-01 00:00:00+05:30'], dtype='datetime64[ns, Asia/Calcutta]', freq=None) """ - new_values = conversion.normalize_i8_timestamps(self.asi8, self.tz) + new_values = normalize_i8_timestamps(self.asi8, self.tz) return type(self)(new_values)._with_freq("infer").tz_localize(self.tz) def to_period(self, freq=None): @@ -1219,7 +1223,7 @@ def time(self): else: timestamps = self.asi8 - return tslib.ints_to_pydatetime(timestamps, box="time") + return ints_to_pydatetime(timestamps, box="time") @property def timetz(self): @@ -1227,7 +1231,7 @@ def timetz(self): Returns numpy array of datetime.time also containing timezone information. The time part of the Timestamps. """ - return tslib.ints_to_pydatetime(self.asi8, self.tz, box="time") + return ints_to_pydatetime(self.asi8, self.tz, box="time") @property def date(self): @@ -1243,7 +1247,7 @@ def date(self): else: timestamps = self.asi8 - return tslib.ints_to_pydatetime(timestamps, box="date") + return ints_to_pydatetime(timestamps, box="date") def isocalendar(self): """ diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 4b4df3445be4e..feabaf6436f6c 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -10,6 +10,7 @@ NaTType, Timedelta, delta_to_nanoseconds, + dt64arr_to_periodarr as c_dt64arr_to_periodarr, iNaT, period as libperiod, to_offset, @@ -278,7 +279,7 @@ def dtype(self) -> PeriodDtype: return self._dtype # error: Read-only property cannot override read-write property [misc] - @property # type: ignore + @property def freq(self) -> BaseOffset: """ Return the frequency object for this PeriodArray. @@ -951,7 +952,7 @@ def dt64arr_to_periodarr(data, freq, tz=None): data = data._values base = freq._period_dtype_code - return libperiod.dt64arr_to_periodarr(data.view("i8"), base, tz), freq + return c_dt64arr_to_periodarr(data.view("i8"), base, tz), freq def _get_ordinal_range(start, end, periods, freq, mult=1): diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d0417d51da497..6b84f0e81f48b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -15,6 +15,7 @@ Timedelta, Timestamp, iNaT, + ints_to_pydatetime, ) from pandas._libs.tslibs.timezones import tz_compare from pandas._typing import ArrayLike, Dtype, DtypeObj @@ -919,7 +920,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False): elif is_datetime64_dtype(arr): if is_object_dtype(dtype): - return tslib.ints_to_pydatetime(arr.view(np.int64)) + return ints_to_pydatetime(arr.view(np.int64)) elif dtype == np.int64: if isna(arr).any(): raise ValueError("Cannot convert NaT values to integer") @@ -1399,7 +1400,7 @@ def maybe_cast_to_datetime(value, dtype, errors: str = "raise"): if value.dtype != DT64NS_DTYPE: value = value.astype(DT64NS_DTYPE) ints = np.asarray(value).view("i8") - return tslib.ints_to_pydatetime(ints) + return ints_to_pydatetime(ints) # we have a non-castable dtype that was passed raise TypeError(f"Cannot cast datetime64 to {dtype}") diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 86c6cdf5b15c7..64cf6e3d973bd 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -5,8 +5,15 @@ import numpy as np -from pandas._libs import NaT, Period, Timestamp, index as libindex, lib, tslib -from pandas._libs.tslibs import Resolution, fields, parsing, timezones, to_offset +from pandas._libs import NaT, Period, Timestamp, index as libindex, lib +from pandas._libs.tslibs import ( + Resolution, + fields, + ints_to_pydatetime, + parsing, + timezones, + to_offset, +) from pandas._libs.tslibs.offsets import prefix_mapping from pandas._typing import DtypeObj, Label from pandas.errors import InvalidIndexError @@ -340,7 +347,7 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: def _mpl_repr(self): # how to represent ourselves to matplotlib - return tslib.ints_to_pydatetime(self.asi8, self.tz) + return ints_to_pydatetime(self.asi8, self.tz) @property def _formatter_func(self): diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py index 840a8c2fb68b1..957706fcb460e 100644 --- a/pandas/tests/tslibs/test_api.py +++ b/pandas/tests/tslibs/test_api.py @@ -18,6 +18,7 @@ def test_namespace(): "period", "resolution", "strptime", + "vectorized", "timedeltas", "timestamps", "timezones", @@ -37,7 +38,12 @@ def test_namespace(): "Resolution", "Tick", "Timedelta", + "dt64arr_to_periodarr", "Timestamp", + "is_date_array_normalized", + "ints_to_pydatetime", + "normalize_i8_timestamps", + "get_resolution", "delta_to_nanoseconds", "ints_to_pytimedelta", "localize_pydatetime", diff --git a/setup.py b/setup.py index e9d305d831653..1885546e001fe 100755 --- a/setup.py +++ b/setup.py @@ -322,6 +322,7 @@ class CheckSDist(sdist_class): "pandas/_libs/tslibs/resolution.pyx", "pandas/_libs/tslibs/parsing.pyx", "pandas/_libs/tslibs/tzconversion.pyx", + "pandas/_libs/tslibs/vectorized.pyx", "pandas/_libs/window/indexers.pyx", "pandas/_libs/writers.pyx", "pandas/io/sas/sas.pyx", @@ -659,6 +660,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"): "pyxfile": "_libs/tslibs/tzconversion", "depends": tseries_depends, }, + "_libs.tslibs.vectorized": {"pyxfile": "_libs/tslibs/vectorized"}, "_libs.testing": {"pyxfile": "_libs/testing"}, "_libs.window.aggregations": { "pyxfile": "_libs/window/aggregations", From 9a650668db04d64375203a86d5bf9639d28655bb Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Jul 2020 14:10:43 -0700 Subject: [PATCH 2/2] restore type: ignore --- pandas/core/arrays/period.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index feabaf6436f6c..b336371655466 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -279,7 +279,7 @@ def dtype(self) -> PeriodDtype: return self._dtype # error: Read-only property cannot override read-write property [misc] - @property + @property # type: ignore def freq(self) -> BaseOffset: """ Return the frequency object for this PeriodArray.