From 68f626872676604f34190580df23d9653a389e03 Mon Sep 17 00:00:00 2001 From: bjonen Date: Tue, 16 Sep 2014 00:37:37 +0200 Subject: [PATCH] PERF: Improve performance of CustomBusinessDay --- doc/source/v0.15.0.txt | 1 + pandas/tseries/offsets.py | 117 ++++++++++++------- pandas/tseries/tests/data/cday-0.14.1.pickle | Bin 0 -> 492 bytes pandas/tseries/tests/test_offsets.py | 44 ++++++- vb_suite/timeseries.py | 35 +++++- 5 files changed, 145 insertions(+), 52 deletions(-) create mode 100644 pandas/tseries/tests/data/cday-0.14.1.pickle diff --git a/doc/source/v0.15.0.txt b/doc/source/v0.15.0.txt index 9d03b7b38bea7..5f8c2e7dcd30f 100644 --- a/doc/source/v0.15.0.txt +++ b/doc/source/v0.15.0.txt @@ -833,6 +833,7 @@ Performance - Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`) - Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`). - Performance benchmarking of groupby for large value of ngroups (:issue:`6787`) +- Performance improvement in ``CustomBusinessDay``, ``CustomBusinessMonth`` (:issue:`8236`) diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index 8bb5584fee7a7..55aad38c10fae 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -225,12 +225,12 @@ def _should_cache(self): return self.isAnchored() and self._cacheable def _params(self): - attrs = [(k, v) for k, v in compat.iteritems(vars(self)) - if (k not in ['kwds', 'name', 'normalize', - 'busdaycalendar']) and (k[0] != '_')] - attrs.extend(list(self.kwds.items())) + all_paras = dict(list(vars(self).items()) + list(self.kwds.items())) + if 'holidays' in all_paras and not all_paras['holidays']: + all_paras.pop('holidays') + exclude = ['kwds', 'name','normalize', 'calendar'] + attrs = [(k, v) for k, v in all_paras.items() if (k not in exclude ) and (k[0] != '_')] attrs = sorted(set(attrs)) - params = tuple([str(self.__class__)] + attrs) return params @@ -547,38 +547,57 @@ class CustomBusinessDay(BusinessDay): holidays : list list/array of dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar`` - calendar : HolidayCalendar instance - instance of AbstractHolidayCalendar that provide the list of holidays + calendar : pd.HolidayCalendar or np.busdaycalendar """ - _cacheable = False _prefix = 'C' - def __init__(self, n=1, normalize=False, **kwds): + def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', + holidays=None, calendar=None, **kwds): self.n = int(n) self.normalize = normalize self.kwds = kwds self.offset = kwds.get('offset', timedelta(0)) - self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri') - - if 'calendar' in kwds: - holidays = kwds['calendar'].holidays() - else: - holidays = kwds.get('holidays', []) + calendar, holidays = self.get_calendar(weekmask=weekmask, + holidays=holidays, + calendar=calendar) + # CustomBusinessDay instances are identified by the + # following two attributes. See DateOffset._params() + # holidays, weekmask + + self.kwds['weekmask'] = self.weekmask = weekmask + self.kwds['holidays'] = self.holidays = holidays + self.kwds['calendar'] = self.calendar = calendar + + def get_calendar(self, weekmask, holidays, calendar): + '''Generate busdaycalendar''' + if isinstance(calendar, np.busdaycalendar): + if not holidays: + holidays = tuple(calendar.holidays) + elif not isinstance(holidays, tuple): + holidays = tuple(holidays) + else: + # trust that calendar.holidays and holidays are + # consistent + pass + return calendar, holidays + + if holidays is None: + holidays = [] + try: + holidays = holidays + calendar.holidays().tolist() + except AttributeError: + pass holidays = [self._to_dt64(dt, dtype='datetime64[D]') for dt in holidays] - self.holidays = tuple(sorted(holidays)) - self.kwds['holidays'] = self.holidays + holidays = tuple(sorted(holidays)) - self._set_busdaycalendar() + kwargs = {'weekmask': weekmask} + if holidays: + kwargs['holidays'] = holidays - def _set_busdaycalendar(self): - if self.holidays: - kwargs = {'weekmask':self.weekmask,'holidays':self.holidays} - else: - kwargs = {'weekmask':self.weekmask} try: - self.busdaycalendar = np.busdaycalendar(**kwargs) + busdaycalendar = np.busdaycalendar(**kwargs) except: # Check we have the required numpy version from distutils.version import LooseVersion @@ -589,17 +608,23 @@ def _set_busdaycalendar(self): np.__version__) else: raise + return busdaycalendar, holidays def __getstate__(self): """Return a pickleable state""" state = self.__dict__.copy() - del state['busdaycalendar'] + del state['calendar'] return state def __setstate__(self, state): """Reconstruct an instance from a pickled state""" self.__dict__ = state - self._set_busdaycalendar() + calendar, holidays = self.get_calendar(weekmask=self.weekmask, + holidays=self.holidays, + calendar=None) + self.kwds['calendar'] = self.calendar = calendar + self.kwds['holidays'] = self.holidays = holidays + self.kwds['weekmask'] = state['weekmask'] @apply_wraps def apply(self, other): @@ -613,7 +638,7 @@ def apply(self, other): np_dt = np.datetime64(date_in.date()) np_incr_dt = np.busday_offset(np_dt, self.n, roll=roll, - busdaycal=self.busdaycalendar) + busdaycal=self.calendar) dt_date = np_incr_dt.astype(datetime) result = datetime.combine(dt_date, date_in.time()) @@ -635,7 +660,6 @@ def _to_dt64(dt, dtype='datetime64'): # > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]') # numpy.datetime64('2013-05-01T02:00:00.000000+0200') # Thus astype is needed to cast datetime to datetime64[D] - if getattr(dt, 'tzinfo', None) is not None: i8 = tslib.pydt_to_i8(dt) dt = tslib.tz_convert_single(i8, 'UTC', dt.tzinfo) @@ -649,7 +673,7 @@ def onOffset(self, dt): if self.normalize and not _is_normalized(dt): return False day64 = self._to_dt64(dt,'datetime64[D]') - return np.is_busday(day64, busdaycal=self.busdaycalendar) + return np.is_busday(day64, busdaycal=self.calendar) class MonthOffset(SingleConstructorOffset): @@ -767,7 +791,6 @@ def onOffset(self, dt): _prefix = 'BMS' - class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): """ **EXPERIMENTAL** DateOffset of one custom business month @@ -788,18 +811,22 @@ class CustomBusinessMonthEnd(BusinessMixin, MonthOffset): holidays : list list/array of dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar`` + calendar : pd.HolidayCalendar or np.busdaycalendar """ _cacheable = False _prefix = 'CBM' - def __init__(self, n=1, normalize=False, **kwds): + def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', + holidays=None, calendar=None, **kwds): self.n = int(n) self.normalize = normalize self.kwds = kwds self.offset = kwds.get('offset', timedelta(0)) - self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri') - self.cbday = CustomBusinessDay(n=self.n, **kwds) - self.m_offset = MonthEnd() + self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, + weekmask=weekmask, holidays=holidays, + calendar=calendar, **kwds) + self.m_offset = MonthEnd(n=1, normalize=normalize, **kwds) + self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar @apply_wraps def apply(self,other): @@ -817,11 +844,11 @@ def apply(self,other): n -= 1 elif other > cur_cmend and n <= -1: n += 1 - - new = cur_mend + n * MonthEnd() + + new = cur_mend + n * self.m_offset result = self.cbday.rollback(new) return result - + class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): """ **EXPERIMENTAL** DateOffset of one custom business month @@ -842,18 +869,22 @@ class CustomBusinessMonthBegin(BusinessMixin, MonthOffset): holidays : list list/array of dates to exclude from the set of valid business days, passed to ``numpy.busdaycalendar`` + calendar : pd.HolidayCalendar or np.busdaycalendar """ _cacheable = False _prefix = 'CBMS' - def __init__(self, n=1, normalize=False, **kwds): + def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri', + holidays=None, calendar=None, **kwds): self.n = int(n) self.normalize = normalize self.kwds = kwds self.offset = kwds.get('offset', timedelta(0)) - self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri') - self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, **kwds) - self.m_offset = MonthBegin(normalize=normalize) + self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, + weekmask=weekmask, holidays=holidays, + calendar=calendar, **kwds) + self.m_offset = MonthBegin(n=1, normalize=normalize, **kwds) + self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar @apply_wraps def apply(self,other): @@ -872,8 +903,8 @@ def apply(self,other): n += 1 elif dt_in < cur_cmbegin and n >= 1: n -= 1 - - new = cur_mbegin + n * MonthBegin() + + new = cur_mbegin + n * self.m_offset result = self.cbday.rollforward(new) return result diff --git a/pandas/tseries/tests/data/cday-0.14.1.pickle b/pandas/tseries/tests/data/cday-0.14.1.pickle new file mode 100644 index 0000000000000000000000000000000000000000..48488099482e4e8823cdc89371c7d567e415419c GIT binary patch literal 492 zcmb8syH5f^5C(7-5svi@KH4kK=4uOVkXU$ug0L~MTW*KkhI=qOyCG+Ci8dJjYCWJa zbbOOcGs&;`=Cc_uWv(QJmcVGBu$a#YEb(VYw#WvKbPhyAS#o6eU)ZehC=lJeFqusC z2Vm<*v=%ExcwAvycuA3JB}>9i5a%vimSJ2N2s4*jEsd!eie)MdM{jrM%9cyu*cHvu zL)3`a_XK$mwFG+{kH@dSb+#*NRDoc_h;7EM?M!&@PmOKX6Ff=0}GsGho|8^Q9332z0k$RsWBR`f?j?oS=~ UZ=n2X;`|q*@h<5kJvR&g0L;>%1poj5 literal 0 HcmV?d00001 diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index b3764b73b15ac..3b2e8f203c313 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -1,3 +1,4 @@ +import os from datetime import date, datetime, timedelta from dateutil.relativedelta import relativedelta from pandas.compat import range @@ -22,6 +23,7 @@ from pandas.tseries.tools import parse_time_string import pandas.tseries.offsets as offsets +from pandas.io.pickle import read_pickle from pandas.tslib import NaT, Timestamp import pandas.tslib as tslib from pandas.util.testing import assertRaisesRegexp @@ -848,6 +850,24 @@ def test_calendar(self): dt = datetime(2014, 1, 17) assertEq(CDay(calendar=calendar), dt, datetime(2014, 1, 21)) + def test_roundtrip_pickle(self): + def _check_roundtrip(obj): + unpickled = self.round_trip_pickle(obj) + self.assertEqual(unpickled, obj) + _check_roundtrip(self.offset) + _check_roundtrip(self.offset2) + _check_roundtrip(self.offset*2) + + def test_pickle_compat_0_14_1(self): + hdays = [datetime(2013,1,1) for ele in range(4)] + + pth = tm.get_data_path() + + cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle')) + cday = CDay(holidays=hdays) + self.assertEqual(cday, cday0_14_1) + + class CustomBusinessMonthBase(object): _multiprocess_can_split_ = True @@ -894,6 +914,15 @@ def test_offsets_compare_equal(self): offset2 = self._object() self.assertFalse(offset1 != offset2) + def test_roundtrip_pickle(self): + def _check_roundtrip(obj): + unpickled = self.round_trip_pickle(obj) + self.assertEqual(unpickled, obj) + _check_roundtrip(self._object()) + _check_roundtrip(self._object(2)) + _check_roundtrip(self._object()*2) + + class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base): _object = CBMonthEnd @@ -1006,8 +1035,12 @@ def test_holidays(self): def test_datetimeindex(self): from pandas.tseries.holiday import USFederalHolidayCalendar - self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthEnd(calendar=USFederalHolidayCalendar())).tolist()[0], - datetime(2012,1,31)) + hcal = USFederalHolidayCalendar() + freq = CBMonthEnd(calendar=hcal) + + self.assertEqual(DatetimeIndex(start='20120101',end='20130101', + freq=freq).tolist()[0], + datetime(2012,1,31)) class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base): _object = CBMonthBegin @@ -1120,8 +1153,11 @@ def test_holidays(self): self.assertEqual(dt + 2*bm_offset,datetime(2012,2,3)) def test_datetimeindex(self): - self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthBegin(calendar=USFederalHolidayCalendar())).tolist()[0], - datetime(2012,1,3)) + hcal = USFederalHolidayCalendar() + cbmb = CBMonthBegin(calendar=hcal) + self.assertEqual(DatetimeIndex(start='20120101', end='20130101', + freq=cbmb).tolist()[0], + datetime(2012,1,3)) def assertOnOffset(offset, date, expected): diff --git a/vb_suite/timeseries.py b/vb_suite/timeseries.py index bb55b88cf1f34..c67cdabdc1a06 100644 --- a/vb_suite/timeseries.py +++ b/vb_suite/timeseries.py @@ -285,15 +285,20 @@ def date_range(start=None, end=None, periods=None, freq=None): setup = common_setup + """ import datetime as dt import pandas as pd +import pandas.tseries.holiday import numpy as np date = dt.datetime(2011,1,1) dt64 = np.datetime64('2011-01-01 09:00Z') +hcal = pd.tseries.holiday.USFederalHolidayCalendar() day = pd.offsets.Day() year = pd.offsets.YearBegin() cday = pd.offsets.CustomBusinessDay() -cme = pd.offsets.CustomBusinessMonthEnd() +cmb = pd.offsets.CustomBusinessMonthBegin(calendar=hcal) +cme = pd.offsets.CustomBusinessMonthEnd(calendar=hcal) + +cdayh = pd.offsets.CustomBusinessDay(calendar=hcal) """ timeseries_day_incr = Benchmark("date + day",setup) @@ -306,15 +311,26 @@ def date_range(start=None, end=None, periods=None, freq=None): timeseries_custom_bday_incr = \ Benchmark("date + cday",setup) +timeseries_custom_bday_decr = \ + Benchmark("date - cday",setup) + timeseries_custom_bday_apply = \ Benchmark("cday.apply(date)",setup) timeseries_custom_bday_apply_dt64 = \ Benchmark("cday.apply(dt64)",setup) -# Increment by n -timeseries_custom_bday_incr_n = \ - Benchmark("date + 10 * cday",setup) +timeseries_custom_bday_cal_incr = \ + Benchmark("date + 1 * cdayh",setup) + +timeseries_custom_bday_cal_decr = \ + Benchmark("date - 1 * cdayh",setup) + +timeseries_custom_bday_cal_incr_n = \ + Benchmark("date + 10 * cdayh",setup) + +timeseries_custom_bday_cal_incr_neg_n = \ + Benchmark("date - 10 * cdayh",setup) # Increment custom business month timeseries_custom_bmonthend_incr = \ @@ -323,6 +339,16 @@ def date_range(start=None, end=None, periods=None, freq=None): timeseries_custom_bmonthend_incr_n = \ Benchmark("date + 10 * cme",setup) +timeseries_custom_bmonthend_decr_n = \ + Benchmark("date - 10 * cme",setup) + +timeseries_custom_bmonthbegin_incr_n = \ + Benchmark("date + 10 * cmb",setup) + +timeseries_custom_bmonthbegin_decr_n = \ + Benchmark("date - 10 * cmb",setup) + + #---------------------------------------------------------------------- # month/quarter/year start/end accessors @@ -357,4 +383,3 @@ def iter_n(iterable, n=None): timeseries_iter_datetimeindex_preexit = Benchmark('iter_n(idx1, M)', setup) timeseries_iter_periodindex_preexit = Benchmark('iter_n(idx2, M)', setup) -