Skip to content

PERF: Improve performance of CustomBusinessDay - 2nd #8293

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 4, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,7 @@ Performance
- Performance improvements in groupby ``.agg`` and ``.apply`` where builtins max/min were not mapped to numpy/cythonized versions (:issue:`7722`)
- Performance improvement in writing to sql (``to_sql``) of up to 50% (:issue:`8208`).
- Performance benchmarking of groupby for large value of ngroups (:issue:`6787`)
- Performance improvement in ``CustomBusinessDay``, ``CustomBusinessMonth`` (:issue:`8236`)



Expand Down
117 changes: 74 additions & 43 deletions pandas/tseries/offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -225,12 +225,12 @@ def _should_cache(self):
return self.isAnchored() and self._cacheable

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can probably be a `cache_readonly` property.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean by this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's a decorator that caches the property (search the codebase for it to see it in action).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@cache_readonly
def _should_cache(self):
         return self.isAnchored() and self._cacheable

Is this what you are looking for or do you want to change something about the class variable _cacheable?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I was referring to the kwargs that are passed in, but that may be moot depending on what else you changed.

def _params(self):
attrs = [(k, v) for k, v in compat.iteritems(vars(self))
if (k not in ['kwds', 'name', 'normalize',
'busdaycalendar']) and (k[0] != '_')]
attrs.extend(list(self.kwds.items()))
all_paras = dict(list(vars(self).items()) + list(self.kwds.items()))
if 'holidays' in all_paras and not all_paras['holidays']:
all_paras.pop('holidays')
exclude = ['kwds', 'name','normalize', 'calendar']
attrs = [(k, v) for k, v in all_paras.items() if (k not in exclude ) and (k[0] != '_')]
attrs = sorted(set(attrs))

params = tuple([str(self.__class__)] + attrs)
return params

Expand Down Expand Up @@ -547,38 +547,57 @@ class CustomBusinessDay(BusinessDay):
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
calendar : HolidayCalendar instance
instance of AbstractHolidayCalendar that provide the list of holidays
calendar : pd.HolidayCalendar or np.busdaycalendar
"""

_cacheable = False
_prefix = 'C'

def __init__(self, n=1, normalize=False, **kwds):
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
holidays=None, calendar=None, **kwds):
self.n = int(n)
self.normalize = normalize
self.kwds = kwds
self.offset = kwds.get('offset', timedelta(0))
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')

if 'calendar' in kwds:
holidays = kwds['calendar'].holidays()
else:
holidays = kwds.get('holidays', [])
calendar, holidays = self.get_calendar(weekmask=weekmask,
holidays=holidays,
calendar=calendar)
# CustomBusinessDay instances are identified by the
# following two attributes. See DateOffset._params()
# holidays, weekmask

self.kwds['weekmask'] = self.weekmask = weekmask
self.kwds['holidays'] = self.holidays = holidays
self.kwds['calendar'] = self.calendar = calendar

def get_calendar(self, weekmask, holidays, calendar):
'''Generate busdaycalendar'''
if isinstance(calendar, np.busdaycalendar):
if not holidays:
holidays = tuple(calendar.holidays)
elif not isinstance(holidays, tuple):
holidays = tuple(holidays)
else:
# trust that calendar.holidays and holidays are
# consistent
pass
return calendar, holidays

if holidays is None:
holidays = []
try:
holidays = holidays + calendar.holidays().tolist()
except AttributeError:
pass
holidays = [self._to_dt64(dt, dtype='datetime64[D]') for dt in
holidays]
self.holidays = tuple(sorted(holidays))
self.kwds['holidays'] = self.holidays
holidays = tuple(sorted(holidays))

self._set_busdaycalendar()
kwargs = {'weekmask': weekmask}
if holidays:
kwargs['holidays'] = holidays

def _set_busdaycalendar(self):
if self.holidays:
kwargs = {'weekmask':self.weekmask,'holidays':self.holidays}
else:
kwargs = {'weekmask':self.weekmask}
try:
self.busdaycalendar = np.busdaycalendar(**kwargs)
busdaycalendar = np.busdaycalendar(**kwargs)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why use kwds here?
Wouldn't it be better to just pass the arguments directly?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You cannot pass an empty list of holidays to np.busdaycalendar; it raises an error. So, to avoid repeating the try statement, the kwargs are built up conditionally.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ok

except:
# Check we have the required numpy version
from distutils.version import LooseVersion
Expand All @@ -589,17 +608,23 @@ def _set_busdaycalendar(self):
np.__version__)
else:
raise
return busdaycalendar, holidays

def __getstate__(self):
"""Return a pickleable state"""
state = self.__dict__.copy()
del state['busdaycalendar']
del state['calendar']
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This needs some pickle tests (and maybe some for compatibility with 0.14.1).
Use self.round_trip_pickle in the test case. Also, is calendar always defined here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`calendar` is always set in `__init__`.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I thought this was something else.

return state

def __setstate__(self, state):
"""Reconstruct an instance from a pickled state"""
self.__dict__ = state
self._set_busdaycalendar()
calendar, holidays = self.get_calendar(weekmask=self.weekmask,
holidays=self.holidays,
calendar=None)
self.kwds['calendar'] = self.calendar = calendar
self.kwds['holidays'] = self.holidays = holidays
self.kwds['weekmask'] = state['weekmask']

@apply_wraps
def apply(self, other):
Expand All @@ -613,7 +638,7 @@ def apply(self, other):
np_dt = np.datetime64(date_in.date())

np_incr_dt = np.busday_offset(np_dt, self.n, roll=roll,
busdaycal=self.busdaycalendar)
busdaycal=self.calendar)

dt_date = np_incr_dt.astype(datetime)
result = datetime.combine(dt_date, date_in.time())
Expand All @@ -635,7 +660,6 @@ def _to_dt64(dt, dtype='datetime64'):
# > np.datetime64(dt.datetime(2013,5,1),dtype='datetime64[D]')
# numpy.datetime64('2013-05-01T02:00:00.000000+0200')
# Thus astype is needed to cast datetime to datetime64[D]

if getattr(dt, 'tzinfo', None) is not None:
i8 = tslib.pydt_to_i8(dt)
dt = tslib.tz_convert_single(i8, 'UTC', dt.tzinfo)
Expand All @@ -649,7 +673,7 @@ def onOffset(self, dt):
if self.normalize and not _is_normalized(dt):
return False
day64 = self._to_dt64(dt,'datetime64[D]')
return np.is_busday(day64, busdaycal=self.busdaycalendar)
return np.is_busday(day64, busdaycal=self.calendar)


class MonthOffset(SingleConstructorOffset):
Expand Down Expand Up @@ -767,7 +791,6 @@ def onOffset(self, dt):
_prefix = 'BMS'



class CustomBusinessMonthEnd(BusinessMixin, MonthOffset):
"""
**EXPERIMENTAL** DateOffset of one custom business month
Expand All @@ -788,18 +811,22 @@ class CustomBusinessMonthEnd(BusinessMixin, MonthOffset):
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
calendar : pd.HolidayCalendar or np.busdaycalendar
"""

_cacheable = False
_prefix = 'CBM'
def __init__(self, n=1, normalize=False, **kwds):
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
holidays=None, calendar=None, **kwds):
self.n = int(n)
self.normalize = normalize
self.kwds = kwds
self.offset = kwds.get('offset', timedelta(0))
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')
self.cbday = CustomBusinessDay(n=self.n, **kwds)
self.m_offset = MonthEnd()
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize,
weekmask=weekmask, holidays=holidays,
calendar=calendar, **kwds)
self.m_offset = MonthEnd(n=1, normalize=normalize, **kwds)
self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar

@apply_wraps
def apply(self,other):
Expand All @@ -817,11 +844,11 @@ def apply(self,other):
n -= 1
elif other > cur_cmend and n <= -1:
n += 1
new = cur_mend + n * MonthEnd()

new = cur_mend + n * self.m_offset
result = self.cbday.rollback(new)
return result

class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
"""
**EXPERIMENTAL** DateOffset of one custom business month
Expand All @@ -842,18 +869,22 @@ class CustomBusinessMonthBegin(BusinessMixin, MonthOffset):
holidays : list
list/array of dates to exclude from the set of valid business days,
passed to ``numpy.busdaycalendar``
calendar : pd.HolidayCalendar or np.busdaycalendar
"""

_cacheable = False
_prefix = 'CBMS'
def __init__(self, n=1, normalize=False, **kwds):
def __init__(self, n=1, normalize=False, weekmask='Mon Tue Wed Thu Fri',
holidays=None, calendar=None, **kwds):
self.n = int(n)
self.normalize = normalize
self.kwds = kwds
self.offset = kwds.get('offset', timedelta(0))
self.weekmask = kwds.get('weekmask', 'Mon Tue Wed Thu Fri')
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize, **kwds)
self.m_offset = MonthBegin(normalize=normalize)
self.cbday = CustomBusinessDay(n=self.n, normalize=normalize,
weekmask=weekmask, holidays=holidays,
calendar=calendar, **kwds)
self.m_offset = MonthBegin(n=1, normalize=normalize, **kwds)
self.kwds['calendar'] = self.cbday.calendar # cache numpy calendar

@apply_wraps
def apply(self,other):
Expand All @@ -872,8 +903,8 @@ def apply(self,other):
n += 1
elif dt_in < cur_cmbegin and n >= 1:
n -= 1
new = cur_mbegin + n * MonthBegin()

new = cur_mbegin + n * self.m_offset
result = self.cbday.rollforward(new)
return result

Expand Down
Binary file added pandas/tseries/tests/data/cday-0.14.1.pickle
Binary file not shown.
44 changes: 40 additions & 4 deletions pandas/tseries/tests/test_offsets.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from datetime import date, datetime, timedelta
from dateutil.relativedelta import relativedelta
from pandas.compat import range
Expand All @@ -22,6 +23,7 @@
from pandas.tseries.tools import parse_time_string
import pandas.tseries.offsets as offsets

from pandas.io.pickle import read_pickle
from pandas.tslib import NaT, Timestamp
import pandas.tslib as tslib
from pandas.util.testing import assertRaisesRegexp
Expand Down Expand Up @@ -848,6 +850,24 @@ def test_calendar(self):
dt = datetime(2014, 1, 17)
assertEq(CDay(calendar=calendar), dt, datetime(2014, 1, 21))

def test_roundtrip_pickle(self):
def _check_roundtrip(obj):
unpickled = self.round_trip_pickle(obj)
self.assertEqual(unpickled, obj)
_check_roundtrip(self.offset)
_check_roundtrip(self.offset2)
_check_roundtrip(self.offset*2)

def test_pickle_compat_0_14_1(self):
hdays = [datetime(2013,1,1) for ele in range(4)]

pth = tm.get_data_path()

cday0_14_1 = read_pickle(os.path.join(pth, 'cday-0.14.1.pickle'))
cday = CDay(holidays=hdays)
self.assertEqual(cday, cday0_14_1)


class CustomBusinessMonthBase(object):
_multiprocess_can_split_ = True

Expand Down Expand Up @@ -894,6 +914,15 @@ def test_offsets_compare_equal(self):
offset2 = self._object()
self.assertFalse(offset1 != offset2)

def test_roundtrip_pickle(self):
def _check_roundtrip(obj):
unpickled = self.round_trip_pickle(obj)
self.assertEqual(unpickled, obj)
_check_roundtrip(self._object())
_check_roundtrip(self._object(2))
_check_roundtrip(self._object()*2)


class TestCustomBusinessMonthEnd(CustomBusinessMonthBase, Base):
_object = CBMonthEnd

Expand Down Expand Up @@ -1006,8 +1035,12 @@ def test_holidays(self):

def test_datetimeindex(self):
from pandas.tseries.holiday import USFederalHolidayCalendar
self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthEnd(calendar=USFederalHolidayCalendar())).tolist()[0],
datetime(2012,1,31))
hcal = USFederalHolidayCalendar()
freq = CBMonthEnd(calendar=hcal)

self.assertEqual(DatetimeIndex(start='20120101',end='20130101',
freq=freq).tolist()[0],
datetime(2012,1,31))

class TestCustomBusinessMonthBegin(CustomBusinessMonthBase, Base):
_object = CBMonthBegin
Expand Down Expand Up @@ -1120,8 +1153,11 @@ def test_holidays(self):
self.assertEqual(dt + 2*bm_offset,datetime(2012,2,3))

def test_datetimeindex(self):
self.assertEqual(DatetimeIndex(start='20120101',end='20130101',freq=CBMonthBegin(calendar=USFederalHolidayCalendar())).tolist()[0],
datetime(2012,1,3))
hcal = USFederalHolidayCalendar()
cbmb = CBMonthBegin(calendar=hcal)
self.assertEqual(DatetimeIndex(start='20120101', end='20130101',
freq=cbmb).tolist()[0],
datetime(2012,1,3))


def assertOnOffset(offset, date, expected):
Expand Down
35 changes: 30 additions & 5 deletions vb_suite/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,15 +285,20 @@ def date_range(start=None, end=None, periods=None, freq=None):
setup = common_setup + """
import datetime as dt
import pandas as pd
import pandas.tseries.holiday
import numpy as np

date = dt.datetime(2011,1,1)
dt64 = np.datetime64('2011-01-01 09:00Z')
hcal = pd.tseries.holiday.USFederalHolidayCalendar()

day = pd.offsets.Day()
year = pd.offsets.YearBegin()
cday = pd.offsets.CustomBusinessDay()
cme = pd.offsets.CustomBusinessMonthEnd()
cmb = pd.offsets.CustomBusinessMonthBegin(calendar=hcal)
cme = pd.offsets.CustomBusinessMonthEnd(calendar=hcal)

cdayh = pd.offsets.CustomBusinessDay(calendar=hcal)
"""
timeseries_day_incr = Benchmark("date + day",setup)

Expand All @@ -306,15 +311,26 @@ def date_range(start=None, end=None, periods=None, freq=None):
timeseries_custom_bday_incr = \
Benchmark("date + cday",setup)

timeseries_custom_bday_decr = \
Benchmark("date - cday",setup)

timeseries_custom_bday_apply = \
Benchmark("cday.apply(date)",setup)

timeseries_custom_bday_apply_dt64 = \
Benchmark("cday.apply(dt64)",setup)

# Increment by n
timeseries_custom_bday_incr_n = \
Benchmark("date + 10 * cday",setup)
timeseries_custom_bday_cal_incr = \
Benchmark("date + 1 * cdayh",setup)

timeseries_custom_bday_cal_decr = \
Benchmark("date - 1 * cdayh",setup)

timeseries_custom_bday_cal_incr_n = \
Benchmark("date + 10 * cdayh",setup)

timeseries_custom_bday_cal_incr_neg_n = \
Benchmark("date - 10 * cdayh",setup)

# Increment custom business month
timeseries_custom_bmonthend_incr = \
Expand All @@ -323,6 +339,16 @@ def date_range(start=None, end=None, periods=None, freq=None):
timeseries_custom_bmonthend_incr_n = \
Benchmark("date + 10 * cme",setup)

timeseries_custom_bmonthend_decr_n = \
Benchmark("date - 10 * cme",setup)

timeseries_custom_bmonthbegin_incr_n = \
Benchmark("date + 10 * cmb",setup)

timeseries_custom_bmonthbegin_decr_n = \
Benchmark("date - 10 * cmb",setup)


#----------------------------------------------------------------------
# month/quarter/year start/end accessors

Expand Down Expand Up @@ -357,4 +383,3 @@ def iter_n(iterable, n=None):
timeseries_iter_datetimeindex_preexit = Benchmark('iter_n(idx1, M)', setup)

timeseries_iter_periodindex_preexit = Benchmark('iter_n(idx2, M)', setup)