Skip to content

PERF: perf improvements in DataFrame construction with a non-daily datelike index (GH6479) #6481

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 27, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ Improvements to existing features
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
- Performance improvements in DataFrame construction with certain offsets, by removing faulty caching
(e.g. ``MonthEnd``, ``BusinessMonthEnd``) (:issue:`6479`)

.. _release.bug_fixes-0.14.0:

Expand Down
3 changes: 3 additions & 0 deletions doc/source/v0.14.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,9 @@ Enhancements
Performance
~~~~~~~~~~~

- Performance improvements in DataFrame construction with certain offsets, by removing faulty caching
(e.g. ``MonthEnd``, ``BusinessMonthEnd``) (:issue:`6479`)

Experimental
~~~~~~~~~~~~

Expand Down
40 changes: 20 additions & 20 deletions pandas/tseries/offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ def _from_name(cls, suffix=None):
return cls()


class BusinessDay(CacheableOffset, SingleConstructorOffset):
class BusinessDay(SingleConstructorOffset):
"""
DateOffset subclass representing possibly n business days
"""
Expand Down Expand Up @@ -399,7 +399,7 @@ def apply(self, other):
n -= 5 * k
if n == 0 and result.weekday() > 4:
n -= 1

while n != 0:
k = n // abs(n)
result = result + timedelta(k)
Expand Down Expand Up @@ -548,7 +548,7 @@ def name(self):
return "%s-%s" % (self.rule_code, _int_to_month[self.n])


class MonthEnd(CacheableOffset, MonthOffset):
class MonthEnd(MonthOffset):
"""DateOffset of one month end"""

def apply(self, other):
Expand All @@ -572,7 +572,7 @@ def onOffset(cls, dt):
_prefix = 'M'


class MonthBegin(CacheableOffset, MonthOffset):
class MonthBegin(MonthOffset):
"""DateOffset of one month at beginning"""

def apply(self, other):
Expand All @@ -591,7 +591,7 @@ def onOffset(cls, dt):
_prefix = 'MS'


class BusinessMonthEnd(CacheableOffset, MonthOffset):
class BusinessMonthEnd(MonthOffset):
"""DateOffset increments between business EOM dates"""

def isAnchored(self):
Expand Down Expand Up @@ -619,7 +619,7 @@ def apply(self, other):
_prefix = 'BM'


class BusinessMonthBegin(CacheableOffset, MonthOffset):
class BusinessMonthBegin(MonthOffset):
"""DateOffset of one business month at beginning"""

def apply(self, other):
Expand Down Expand Up @@ -654,7 +654,7 @@ def onOffset(cls, dt):
_prefix = 'BMS'


class Week(CacheableOffset, DateOffset):
class Week(DateOffset):
"""
Weekly offset

Expand Down Expand Up @@ -744,7 +744,7 @@ class WeekDay(object):
_weekday_to_int = dict((v, k) for k, v in _int_to_weekday.items())


class WeekOfMonth(CacheableOffset, DateOffset):
class WeekOfMonth(DateOffset):
"""
Describes monthly dates like "the Tuesday of the 2nd week of each month"

Expand Down Expand Up @@ -830,7 +830,7 @@ def _from_name(cls, suffix=None):
weekday = _weekday_to_int[suffix[1:]]
return cls(week=week, weekday=weekday)

class LastWeekOfMonth(CacheableOffset, DateOffset):
class LastWeekOfMonth(DateOffset):
"""
Describes monthly dates in last week of month like "the last Tuesday of each month"

Expand Down Expand Up @@ -940,7 +940,7 @@ def rule_code(self):
return '%s-%s' % (self._prefix, _int_to_month[self.startingMonth])


class BQuarterEnd(CacheableOffset, QuarterOffset):
class BQuarterEnd(QuarterOffset):
"""DateOffset increments between business Quarter dates
startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
Expand Down Expand Up @@ -999,7 +999,7 @@ def onOffset(self, dt):


# TODO: This is basically the same as BQuarterEnd
class BQuarterBegin(CacheableOffset, QuarterOffset):
class BQuarterBegin(QuarterOffset):
_outputName = "BusinessQuarterBegin"
# I suspect this is wrong for *all* of them.
_default_startingMonth = 3
Expand Down Expand Up @@ -1036,7 +1036,7 @@ def apply(self, other):
return as_timestamp(result)


class QuarterEnd(CacheableOffset, QuarterOffset):
class QuarterEnd(QuarterOffset):
"""DateOffset increments between business Quarter dates
startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
Expand Down Expand Up @@ -1077,7 +1077,7 @@ def onOffset(self, dt):
return MonthEnd().onOffset(dt) and modMonth == 0


class QuarterBegin(CacheableOffset, QuarterOffset):
class QuarterBegin(QuarterOffset):
_outputName = 'QuarterBegin'
_default_startingMonth = 3
_from_name_startingMonth = 1
Expand Down Expand Up @@ -1129,7 +1129,7 @@ def rule_code(self):
return '%s-%s' % (self._prefix, _int_to_month[self.month])


class BYearEnd(CacheableOffset, YearOffset):
class BYearEnd(YearOffset):
"""DateOffset increments between business EOM dates"""
_outputName = 'BusinessYearEnd'
_default_month = 12
Expand Down Expand Up @@ -1166,7 +1166,7 @@ def apply(self, other):
return result


class BYearBegin(CacheableOffset, YearOffset):
class BYearBegin(YearOffset):
"""DateOffset increments between business year begin dates"""
_outputName = 'BusinessYearBegin'
_default_month = 1
Expand Down Expand Up @@ -1198,7 +1198,7 @@ def apply(self, other):
return as_timestamp(datetime(other.year, self.month, first))


class YearEnd(CacheableOffset, YearOffset):
class YearEnd(YearOffset):
"""DateOffset increments between calendar year ends"""
_default_month = 12
_prefix = 'A'
Expand Down Expand Up @@ -1254,7 +1254,7 @@ def onOffset(self, dt):
return self.month == dt.month and dt.day == days_in_month


class YearBegin(CacheableOffset, YearOffset):
class YearBegin(YearOffset):
"""DateOffset increments between calendar year begin dates"""
_default_month = 1
_prefix = 'AS'
Expand Down Expand Up @@ -1300,7 +1300,7 @@ def onOffset(self, dt):
return dt.month == self.month and dt.day == 1


class FY5253(CacheableOffset, DateOffset):
class FY5253(DateOffset):
"""
Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar.

Expand Down Expand Up @@ -1501,7 +1501,7 @@ def _from_name(cls, *args):
return cls(**cls._parse_suffix(*args))


class FY5253Quarter(CacheableOffset, DateOffset):
class FY5253Quarter(DateOffset):
"""
DateOffset increments between business quarter dates
for 52-53 week fiscal year (also known as a 4-4-5 calendar).
Expand Down Expand Up @@ -1772,7 +1772,7 @@ def _delta_to_nanoseconds(delta):
+ delta.microseconds) * 1000


class Day(CacheableOffset, Tick):
class Day(Tick):
_inc = timedelta(1)
_prefix = 'D'

Expand Down
45 changes: 22 additions & 23 deletions pandas/tseries/tests/test_offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def test_apply_large_n(self):
rs = st + off
xp = datetime(2011, 12, 26)
self.assertEqual(rs, xp)

off = BDay() * 10
rs = datetime(2014, 1, 5) + off # see #5890
xp = datetime(2014, 1, 17)
Expand Down Expand Up @@ -2427,25 +2427,9 @@ def get_all_subclasses(cls):
return ret

class TestCaching(tm.TestCase):
no_simple_ctr = [WeekOfMonth, FY5253,
FY5253Quarter,
LastWeekOfMonth]

def test_should_cache_month_end(self):
self.assertTrue(MonthEnd()._should_cache())

def test_should_cache_bmonth_end(self):
self.assertTrue(BusinessMonthEnd()._should_cache())

def test_should_cache_week_month(self):
self.assertTrue(WeekOfMonth(weekday=1, week=2)._should_cache())

def test_all_cacheableoffsets(self):
for subclass in get_all_subclasses(CacheableOffset):
if subclass.__name__[0] == "_" \
or subclass in TestCaching.no_simple_ctr:
continue
self.run_X_index_creation(subclass)
# As of GH 6479 (in 0.14.0), offset caching is turned off.
# As of v0.12.0, only the BusinessMonth/Quarter offsets were actually being cached.

def setUp(self):
_daterange_cache.clear()
Expand All @@ -2462,20 +2446,35 @@ def run_X_index_creation(self, cls):
DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=inst1, normalize=True)
self.assertTrue(cls() in _daterange_cache, cls)

def test_should_cache_month_end(self):
self.assertFalse(MonthEnd()._should_cache())

def test_should_cache_bmonth_end(self):
self.assertFalse(BusinessMonthEnd()._should_cache())

def test_should_cache_week_month(self):
self.assertFalse(WeekOfMonth(weekday=1, week=2)._should_cache())

def test_all_cacheableoffsets(self):
for subclass in get_all_subclasses(CacheableOffset):
if subclass.__name__[0] == "_" \
or subclass in TestCaching.no_simple_ctr:
continue
self.run_X_index_creation(subclass)

def test_month_end_index_creation(self):
DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=MonthEnd(), normalize=True)
self.assertTrue(MonthEnd() in _daterange_cache)
self.assertFalse(MonthEnd() in _daterange_cache)

def test_bmonth_end_index_creation(self):
DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=BusinessMonthEnd(), normalize=True)
self.assertTrue(BusinessMonthEnd() in _daterange_cache)
self.assertFalse(BusinessMonthEnd() in _daterange_cache)

def test_week_of_month_index_creation(self):
inst1 = WeekOfMonth(weekday=1, week=2)
DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=inst1, normalize=True)
inst2 = WeekOfMonth(weekday=1, week=2)
self.assertTrue(inst2 in _daterange_cache)

self.assertFalse(inst2 in _daterange_cache)

class TestReprNames(tm.TestCase):
def test_str_for_named_is_name(self):
Expand Down
23 changes: 23 additions & 0 deletions vb_suite/frame_ctor.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,15 @@
from vbench.benchmark import Benchmark
from datetime import datetime
try:
import pandas.tseries.offsets as offsets
except:
import pandas.core.datetools as offsets

common_setup = """from pandas_vb_common import *
try:
from pandas.tseries.offsets import *
except:
from pandas.core.datetools import *
"""

#----------------------------------------------------------------------
Expand Down Expand Up @@ -36,6 +44,21 @@
"""
frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup)

# Dynamically generate one DataFrame-construction benchmark for every name
# listed in offsets.__all__, once per step size in n_steps.
dynamic_benchmarks = {}
n_steps = [1, 2]
for offset in offsets.__all__:
    for n in n_steps:
        # Only the string literal below is formatted (the method call binds
        # tighter than `+`), so common_setup is passed through untouched.
        # Explicit field indices are used because auto-numbered "{}" fields
        # require Python >= 2.7; the rendered text is otherwise identical.
        setup = common_setup + """
df = DataFrame(np.random.randn(1000,10),index=date_range('1/1/1900',periods=1000,freq={0}({1})))
d = dict([ (col,df[col]) for col in df.columns ])
""".format(offset, n)
        key = 'frame_ctor_dtindex_{0}({1})'.format(offset, n)
        dynamic_benchmarks[key] = Benchmark("DataFrame(d)", setup, name=key)

# Have to stuff them in globals() so vbench detects them
globals().update(dynamic_benchmarks)

# from a mi-series
setup = common_setup + """
mi = MultiIndex.from_tuples([(x,y) for x in range(100) for y in range(100)])
Expand Down