From c77371ff27d0ed8da49495f0ff7fb183b88de831 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Mon, 11 Jun 2018 22:01:20 -0700 Subject: [PATCH 1/4] cythonize parts of resolution --- pandas/_libs/tslibs/resolution.pyx | 48 ++++++++++++++++-------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index d0a9501afe566..e32687bd83f11 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -5,7 +5,7 @@ from cython cimport Py_ssize_t import numpy as np cimport numpy as cnp -from numpy cimport ndarray, int64_t +from numpy cimport ndarray, int64_t, int32_t cnp.import_array() from util cimport is_string_object, get_nat @@ -44,12 +44,12 @@ cdef int RESO_MIN = 4 cdef int RESO_HR = 5 cdef int RESO_DAY = 6 -_ONE_MICRO = 1000L -_ONE_MILLI = _ONE_MICRO * 1000 -_ONE_SECOND = _ONE_MILLI * 1000 -_ONE_MINUTE = 60 * _ONE_SECOND -_ONE_HOUR = 60 * _ONE_MINUTE -_ONE_DAY = 24 * _ONE_HOUR +_ONE_MICRO = 1000L +_ONE_MILLI = (_ONE_MICRO * 1000) +_ONE_SECOND = (_ONE_MILLI * 1000) +_ONE_MINUTE = (60 * _ONE_SECOND) +_ONE_HOUR = (60 * _ONE_MINUTE) +_ONE_DAY = (24 * _ONE_HOUR) # ---------------------------------------------------------------------- @@ -349,7 +349,7 @@ class Resolution(object): # TODO: this is non performant logic here (and duplicative) and this # simply should call unique_1d directly # plus no reason to depend on khash directly -cdef unique_deltas(ndarray[int64_t] arr): +cdef ndarray[int64_t, ndim=1] unique_deltas(ndarray[int64_t] arr): cdef: Py_ssize_t i, n = len(arr) int64_t val @@ -373,21 +373,22 @@ cdef unique_deltas(ndarray[int64_t] arr): return result -def _is_multiple(us, mult): +cdef inline bint _is_multiple(int64_t us, int64_t mult): return us % mult == 0 -def _maybe_add_count(base, count): +cdef inline str _maybe_add_count(str base, count): if count != 1: return '{count}{base}'.format(count=int(count), base=base) else: return base -class _FrequencyInferer(object): +cdef class _FrequencyInferer(object): """ Not sure if I can avoid the state machine here """ + cdef public index, values, warn, is_monotonic, _cache def __init__(self, index, warn=True): self.index = index @@ -475,12 +476,15 @@ class _FrequencyInferer(object): def rep_stamp(self): return Timestamp(self.values[0]) - def month_position_check(self): + cdef month_position_check(self): # TODO: cythonize this, very slow - calendar_end = True - business_end = True - calendar_start = True - business_start = True + cdef: + int32_t daysinmonth, y, m, d + bint calendar_end = True + bint business_end = True + bint calendar_start = True + bint business_start = True + bint cal years = self.fields['Y'] months = self.fields['M'] @@ -525,7 +529,7 @@ class _FrequencyInferer(object): def ydiffs(self): return unique_deltas(self.fields['Y'].astype('i8')) - def _infer_daily_rule(self): + cdef _infer_daily_rule(self): annual_rule = self._get_annual_rule() if annual_rule: nyears = self.ydiffs[0] @@ -562,7 +566,7 @@ class _FrequencyInferer(object): if wom_rule: return wom_rule - def _get_annual_rule(self): + cdef _get_annual_rule(self): if len(self.ydiffs) > 1: return None @@ -573,7 +577,7 @@ class _FrequencyInferer(object): return {'cs': 'AS', 'bs': 'BAS', 'ce': 'A', 'be': 'BA'}.get(pos_check) - def _get_quarterly_rule(self): + cdef _get_quarterly_rule(self): if len(self.mdiffs) > 1: return None @@ -584,14 +588,14 @@ class _FrequencyInferer(object): return {'cs': 'QS', 'bs': 'BQS', 'ce': 'Q', 'be': 'BQ'}.get(pos_check) - def _get_monthly_rule(self): + cdef _get_monthly_rule(self): if len(self.mdiffs) > 1: return None pos_check = self.month_position_check() return {'cs': 'MS', 'bs': 'BMS', 'ce': 'M', 'be': 'BM'}.get(pos_check) - def _is_business_daily(self): + cdef bint _is_business_daily(self): # quick check: cannot be business daily if self.day_deltas != [1, 3]: return False @@ -604,7 +608,7 @@ class _FrequencyInferer(object): return np.all(((weekdays == 0) & (shifts == 3)) | ((weekdays > 0) & (weekdays <= 4) & (shifts == 1))) - def _get_wom_rule(self): + cdef _get_wom_rule(self): # wdiffs = unique(np.diff(self.index.week)) # We also need -47, -49, -48 to catch index spanning year boundary # if not lib.ismember(wdiffs, set([4, 5, -47, -49, -48])).all(): From 84d911460037585869bc23ae27c04b885239c1da Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 12 Jun 2018 12:27:00 -0700 Subject: [PATCH 2/4] add more types --- pandas/_libs/tslibs/resolution.pyx | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index e32687bd83f11..8a12799b1756a 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -388,7 +388,12 @@ cdef class _FrequencyInferer(object): """ Not sure if I can avoid the state machine here """ - cdef public index, values, warn, is_monotonic, _cache + cdef public: + index + values + bint warn + bint is_monotonic + dict _cache def __init__(self, index, warn=True): self.index = index @@ -485,10 +490,14 @@ cdef class _FrequencyInferer(object): bint calendar_start = True bint business_start = True bint cal - - years = self.fields['Y'] - months = self.fields['M'] - days = self.fields['D'] + int32_t[:] years + int32_t[:] months + int32_t[:] days + + fields = self.fields + years = fields['Y'] + months = fields['M'] + days = fields['D'] weekdays = self.index.dayofweek from calendar import monthrange From d2dc1098e40995beb3ec5f62bf985196fe3fb184 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Tue, 12 Jun 2018 19:41:58 -0700 Subject: [PATCH 3/4] cdef TimedeltaFrequencyInferrer to fix test errors --- pandas/_libs/tslibs/resolution.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 8a12799b1756a..196b19d4266b2 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -377,9 +377,9 @@ cdef inline bint _is_multiple(int64_t us, int64_t mult): return us % mult == 0 -cdef inline str _maybe_add_count(str base, count): +cdef inline str _maybe_add_count(str base, int64_t count): if count != 1: - return '{count}{base}'.format(count=int(count), base=base) + return '{count}{base}'.format(count=count, base=base) else: return base @@ -640,9 +640,9 @@ cdef class _FrequencyInferer(object): return 'WOM-{week}{weekday}'.format(week=week, weekday=wd) -class _TimedeltaFrequencyInferer(_FrequencyInferer): +cdef class _TimedeltaFrequencyInferer(_FrequencyInferer): - def _infer_daily_rule(self): + cdef _infer_daily_rule(self): if self.is_unique: days = self.deltas[0] / _ONE_DAY if days % 7 == 0: From bf67f17d883c89f1f6757a63e38787453989d151 Mon Sep 17 00:00:00 2001 From: Brock Mendel Date: Wed, 13 Jun 2018 16:05:19 -0700 Subject: [PATCH 4/4] type as objects explicitly --- pandas/_libs/tslibs/resolution.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/tslibs/resolution.pyx b/pandas/_libs/tslibs/resolution.pyx index 196b19d4266b2..7d6dcb9ecb831 100644 --- a/pandas/_libs/tslibs/resolution.pyx +++ b/pandas/_libs/tslibs/resolution.pyx @@ -389,8 +389,8 @@ cdef class _FrequencyInferer(object): Not sure if I can avoid the state machine here """ cdef public: - index - values + object index + object values bint warn bint is_monotonic dict _cache