From 0f3f668b97e15a22380967364c0d008aef46a4cc Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Jul 2020 15:59:48 -0700 Subject: [PATCH 01/11] PERF: periodarr_to_dt64arr --- asv_bench/benchmarks/tslibs/period.py | 38 ++++++++++++++++++++++- pandas/_libs/tslibs/period.pyx | 43 ++++++++++++++++++++------- 2 files changed, 69 insertions(+), 12 deletions(-) diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 9156c4aa90ea0..1734f071220fe 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -2,10 +2,15 @@ Period benchmarks that rely only on tslibs. See benchmarks.period for Period benchmarks that rely on other parts fo pandas. """ -from pandas import Period + +import numpy as np + +from pandas._libs.tslibs.period import Period, periodarr_to_dt64arr from pandas.tseries.frequencies import to_offset +from .tslib import _sizes + class PeriodProperties: @@ -68,3 +73,34 @@ def setup(self, freq, is_offset): def time_period_constructor(self, freq, is_offset): Period("2012-06-01", freq=freq) + + +class TimePeriodArrToDT64Arr: + params = [ + _sizes, + [ + 1000, + 1011, # Annual - November End + 2000, + 2011, # Quarterly - November End + 3000, + 4000, + 4006, # Weekly - Saturday End + 5000, + 6000, + 7000, + 8000, + 9000, + 10000, + 11000, + 12000, + ], + ] + param_names = ["size", "freq"] + + def setup(self, size, freq): + arr = np.arange(size, dtype="i8") + self.i8values = arr + + def time_periodarray_to_dt64arr(self, size, freq): + periodarr_to_dt64arr(self.i8values, freq) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index c0641297c4b8a..2cccc4865a613 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -56,6 +56,7 @@ from pandas._libs.tslibs.ccalendar cimport ( get_days_in_month, ) from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS +from pandas._libs.tslibs.conversion import ensure_datetime64ns from pandas._libs.tslibs.dtypes cimport ( PeriodDtypeBase, @@ -937,7 +938,7 @@ cdef inline int month_to_quarter(int month) nogil: @cython.wraparound(False) @cython.boundscheck(False) -def periodarr_to_dt64arr(const int64_t[:] periodarr, int freq): +def periodarr_to_dt64arr(periodarr: ndarray, freq: int) -> ndarray: """ Convert array to datetime64 values from a set of ordinals corresponding to periods per period convention. @@ -945,13 +946,32 @@ def periodarr_to_dt64arr(const int64_t[:] periodarr, int freq): cdef: int64_t[:] out Py_ssize_t i, l + npy_datetimestruct dts l = len(periodarr) - out = np.empty(l, dtype='i8') + if FR_NS >= freq >= FR_DAY: + if freq == FR_NS: + return periodarr + + if freq == FR_US: + dta = periodarr.view("M8[us]") + elif freq == FR_MS: + dta = periodarr.view("M8[ms]") + elif freq == FR_SEC: + dta = periodarr.view("M8[s]") + elif freq == FR_MIN: + dta = periodarr.view("M8[m]") + elif freq == FR_HR: + dta = periodarr.view("M8[h]") + elif freq == FR_DAY: + dta = periodarr.view("M8[D]") + return ensure_datetime64ns(dta) + + out = np.empty(l, dtype="i8") for i in range(l): - out[i] = period_ordinal_to_dt64(periodarr[i], freq) + out[i] = period_ordinal_to_dt64(periodarr[i], freq, &dts) return out.base # .base to access underlying np.ndarray @@ -1104,17 +1124,15 @@ cpdef int64_t period_ordinal(int y, int m, int d, int h, int min, return get_period_ordinal(&dts, freq) -cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: - cdef: - npy_datetimestruct dts - +cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq, + npy_datetimestruct* dts) except? -1: if ordinal == NPY_NAT: return NPY_NAT - get_date_info(ordinal, freq, &dts) + get_date_info(ordinal, freq, dts) - check_dts_bounds(&dts) - return dtstruct_to_dt64(&dts) + check_dts_bounds(dts) + return dtstruct_to_dt64(dts) cdef str period_format(int64_t value, int freq, object fmt=None): @@ -1735,6 +1753,9 @@ cdef class _Period(PeriodMixin): ------- Timestamp """ + cdef: + npy_datetimestruct dts + if tz is not None: # GH#34522 warnings.warn( @@ -1765,7 +1786,7 @@ cdef class _Period(PeriodMixin): val = self.asfreq(freq, how) - dt64 = period_ordinal_to_dt64(val.ordinal, base) + dt64 = period_ordinal_to_dt64(val.ordinal, base, &dts) return Timestamp(dt64, tz=tz) @property From c35b798d20bc3a952aced7d8e26e1bec90ece3c0 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Jul 2020 17:19:16 -0700 Subject: [PATCH 02/11] Fix benchmark for low-freq cases --- asv_bench/benchmarks/tslibs/period.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 1734f071220fe..1a2c89b48c665 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -99,7 +99,7 @@ class TimePeriodArrToDT64Arr: param_names = ["size", "freq"] def setup(self, size, freq): - arr = np.arange(size, dtype="i8") + arr = np.arange(10, dtype="i8").repeat(size // 10) self.i8values = arr def time_periodarray_to_dt64arr(self, size, freq): From 96df731b88640c32639bd526dc7fe5c9c68b1c4b Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Jul 2020 17:24:04 -0700 Subject: [PATCH 03/11] troubleshoot perf --- pandas/_libs/tslibs/period.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 2cccc4865a613..64dd14017f842 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -948,9 +948,7 @@ def periodarr_to_dt64arr(periodarr: ndarray, freq: int) -> ndarray: Py_ssize_t i, l npy_datetimestruct dts - l = len(periodarr) - - if FR_NS >= freq >= FR_DAY: + if freq >= FR_DAY: if freq == FR_NS: return periodarr @@ -968,6 +966,7 @@ def periodarr_to_dt64arr(periodarr: ndarray, freq: int) -> ndarray: dta = periodarr.view("M8[D]") return ensure_datetime64ns(dta) + l = len(periodarr) out = np.empty(l, dtype="i8") for i in range(l): From a0ece778600e2b1b27327232d881948d51bbcb35 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 7 Jul 2020 18:28:08 -0700 Subject: [PATCH 04/11] troubleshoot perf --- pandas/_libs/tslibs/period.pyx | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 64dd14017f842..ecdac94725230 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -946,7 +946,6 @@ def periodarr_to_dt64arr(periodarr: ndarray, freq: int) -> ndarray: cdef: int64_t[:] out Py_ssize_t i, l - npy_datetimestruct dts if freq >= FR_DAY: if freq == FR_NS: @@ -970,7 +969,7 @@ def periodarr_to_dt64arr(periodarr: ndarray, freq: int) -> ndarray: out = np.empty(l, dtype="i8") for i in range(l): - out[i] = period_ordinal_to_dt64(periodarr[i], freq, &dts) + out[i] = period_ordinal_to_dt64(periodarr[i], freq) return out.base # .base to access underlying np.ndarray @@ -1123,15 +1122,17 @@ cpdef int64_t period_ordinal(int y, int m, int d, int h, int min, return get_period_ordinal(&dts, freq) -cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq, - npy_datetimestruct* dts) except? -1: +cdef int64_t period_ordinal_to_dt64(int64_t ordinal, int freq) except? -1: + cdef: + npy_datetimestruct dts + if ordinal == NPY_NAT: return NPY_NAT - get_date_info(ordinal, freq, dts) + get_date_info(ordinal, freq, &dts) - check_dts_bounds(dts) - return dtstruct_to_dt64(dts) + check_dts_bounds(&dts) + return dtstruct_to_dt64(&dts) cdef str period_format(int64_t value, int freq, object fmt=None): @@ -1752,9 +1753,6 @@ cdef class _Period(PeriodMixin): ------- Timestamp """ - cdef: - npy_datetimestruct dts - if tz is not None: # GH#34522 warnings.warn( @@ -1785,7 +1783,7 @@ cdef class _Period(PeriodMixin): val = self.asfreq(freq, how) - dt64 = period_ordinal_to_dt64(val.ordinal, base, &dts) + dt64 = period_ordinal_to_dt64(val.ordinal, base) return Timestamp(dt64, tz=tz) @property From d7e6baf88889e3befd35963302a9272bb3f43c53 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Jul 2020 09:47:59 -0700 Subject: [PATCH 05/11] PERF: cast --- pandas/_libs/tslibs/period.pyx | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index ecdac94725230..ce2af488f7638 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -947,7 +947,8 @@ def periodarr_to_dt64arr(periodarr: ndarray, freq: int) -> ndarray: int64_t[:] out Py_ssize_t i, l - if freq >= FR_DAY: + if freq >= FR_DAY: + # Short-circuit for performance if freq == FR_NS: return periodarr @@ -968,6 +969,7 @@ def periodarr_to_dt64arr(periodarr: ndarray, freq: int) -> ndarray: l = len(periodarr) out = np.empty(l, dtype="i8") + # We get here with freqs that do not correspond to a datetime64 unit for i in range(l): out[i] = period_ordinal_to_dt64(periodarr[i], freq) From f76eccf72b81076e8df76ce15e8ac10b931f1638 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Jul 2020 09:58:08 -0700 Subject: [PATCH 06/11] troubleshoot perf --- pandas/_libs/tslibs/period.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index ce2af488f7638..abb073ee2a1c2 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -947,7 +947,7 @@ def periodarr_to_dt64arr(periodarr: ndarray, freq: int) -> ndarray: int64_t[:] out Py_ssize_t i, l - if freq >= FR_DAY: + if freq >= 6000: # i.e. FR_DAY, hard-code to avoid need to cast # Short-circuit for performance if freq == FR_NS: return periodarr From e265bcdaafa8ba87715886802c7c38c8ceedb8c3 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Jul 2020 10:09:24 -0700 Subject: [PATCH 07/11] troubleshoot perf --- pandas/_libs/tslibs/period.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index abb073ee2a1c2..e35b0c10cae23 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -938,7 +938,7 @@ cdef inline int month_to_quarter(int month) nogil: @cython.wraparound(False) @cython.boundscheck(False) -def periodarr_to_dt64arr(periodarr: ndarray, freq: int) -> ndarray: +def periodarr_to_dt64arr(ndarray periodarr, int freq): """ Convert array to datetime64 values from a set of ordinals corresponding to periods per period convention. From b625c1b9950cb3b4f0a409b9a6283c2366230032 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Jul 2020 10:21:31 -0700 Subject: [PATCH 08/11] troubleshoot perf --- pandas/_libs/tslibs/period.pyx | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e35b0c10cae23..5ef1550796df8 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -966,14 +966,15 @@ def periodarr_to_dt64arr(ndarray periodarr, int freq): dta = periodarr.view("M8[D]") return ensure_datetime64ns(dta) - l = len(periodarr) - out = np.empty(l, dtype="i8") + else; + l = len(periodarr) + out = np.empty(l, dtype="i8") - # We get here with freqs that do not correspond to a datetime64 unit - for i in range(l): - out[i] = period_ordinal_to_dt64(periodarr[i], freq) + # We get here with freqs that do not correspond to a datetime64 unit + for i in range(l): + out[i] = period_ordinal_to_dt64(periodarr[i], freq) - return out.base # .base to access underlying np.ndarray + return out.base # .base to access underlying np.ndarray cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end): From d81bbb9e48e8993168265baaaa97f3968149028b Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Jul 2020 10:22:13 -0700 Subject: [PATCH 09/11] troubleshoot perf --- pandas/_libs/tslibs/period.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index 5ef1550796df8..e25c5095e4b87 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -966,7 +966,7 @@ def periodarr_to_dt64arr(ndarray periodarr, int freq): dta = periodarr.view("M8[D]") return ensure_datetime64ns(dta) - else; + else: l = len(periodarr) out = np.empty(l, dtype="i8") From de221d655034e0dfd953b84a3fee4b997af3d0a6 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Jul 2020 10:28:32 -0700 Subject: [PATCH 10/11] troubleshoot perf --- pandas/_libs/tslibs/period.pyx | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e25c5095e4b87..b60665c134b86 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -947,7 +947,17 @@ def periodarr_to_dt64arr(ndarray periodarr, int freq): int64_t[:] out Py_ssize_t i, l - if freq >= 6000: # i.e. FR_DAY, hard-code to avoid need to cast + if freq < 6000: # i.e. FR_DAY, hard-code to avoid need to cast + l = len(periodarr) + out = np.empty(l, dtype="i8") + + # We get here with freqs that do not correspond to a datetime64 unit + for i in range(l): + out[i] = period_ordinal_to_dt64(periodarr[i], freq) + + return out.base # .base to access underlying np.ndarray + + else: # Short-circuit for performance if freq == FR_NS: return periodarr @@ -966,16 +976,6 @@ def periodarr_to_dt64arr(ndarray periodarr, int freq): dta = periodarr.view("M8[D]") return ensure_datetime64ns(dta) - else: - l = len(periodarr) - out = np.empty(l, dtype="i8") - - # We get here with freqs that do not correspond to a datetime64 unit - for i in range(l): - out[i] = period_ordinal_to_dt64(periodarr[i], freq) - - return out.base # .base to access underlying np.ndarray - cpdef int64_t period_asfreq(int64_t ordinal, int freq1, int freq2, bint end): """ From 3fe9e107614f5af4f89d89e9bb808cf5f1f3682d Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 8 Jul 2020 10:38:22 -0700 Subject: [PATCH 11/11] troubleshoot perf --- pandas/_libs/tslibs/period.pyx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index b60665c134b86..dbfb26784be63 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -938,7 +938,7 @@ cdef inline int month_to_quarter(int month) nogil: @cython.wraparound(False) @cython.boundscheck(False) -def periodarr_to_dt64arr(ndarray periodarr, int freq): +def periodarr_to_dt64arr(const int64_t[:] periodarr, int freq): """ Convert array to datetime64 values from a set of ordinals corresponding to periods per period convention. @@ -960,20 +960,20 @@ def periodarr_to_dt64arr(ndarray periodarr, int freq): else: # Short-circuit for performance if freq == FR_NS: - return periodarr + return periodarr.base if freq == FR_US: - dta = periodarr.view("M8[us]") + dta = periodarr.base.view("M8[us]") elif freq == FR_MS: - dta = periodarr.view("M8[ms]") + dta = periodarr.base.view("M8[ms]") elif freq == FR_SEC: - dta = periodarr.view("M8[s]") + dta = periodarr.base.view("M8[s]") elif freq == FR_MIN: - dta = periodarr.view("M8[m]") + dta = periodarr.base.view("M8[m]") elif freq == FR_HR: - dta = periodarr.view("M8[h]") + dta = periodarr.base.view("M8[h]") elif freq == FR_DAY: - dta = periodarr.view("M8[D]") + dta = periodarr.base.view("M8[D]") return ensure_datetime64ns(dta)