From 1abb1167e4a5af1b6a903571f583e15bc7b900af Mon Sep 17 00:00:00 2001 From: jreback Date: Tue, 25 Feb 2014 18:22:29 -0500 Subject: [PATCH 1/2] PERF: perf improvements in DataFrame construction with a non-daily datelike index (GH6479) Dynamic vbenches --- vb_suite/frame_ctor.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/vb_suite/frame_ctor.py b/vb_suite/frame_ctor.py index 1d8df95de9fe3..8180b39b116fe 100644 --- a/vb_suite/frame_ctor.py +++ b/vb_suite/frame_ctor.py @@ -1,7 +1,15 @@ from vbench.benchmark import Benchmark from datetime import datetime +try: + import pandas.tseries.offsets as offsets +except: + import pandas.core.datetools as offsets common_setup = """from pandas_vb_common import * +try: + from pandas.tseries.offsets import * +except: + from pandas.core.datetools import * """ #---------------------------------------------------------------------- @@ -36,6 +44,21 @@ """ frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup) +# dynamically generate benchmarks for every offset +dynamic_benchmarks = {} +n_steps = [1, 2] +for offset in offsets.__all__: + for n in n_steps: + setup = common_setup + """ +df = DataFrame(np.random.randn(1000,10),index=date_range('1/1/1900',periods=1000,freq={}({}))) +d = dict([ (col,df[col]) for col in df.columns ]) +""".format(offset, n) + key = 'frame_ctor_dtindex_{}({})'.format(offset, n) + dynamic_benchmarks[key] = Benchmark("DataFrame(d)", setup, name=key) + +# Have to stuff them in globals() so vbench detects them +globals().update(dynamic_benchmarks) + # from a mi-series setup = common_setup + """ mi = MultiIndex.from_tuples([(x,y) for x in range(100) for y in range(100)]) From 872ba0d357f660db9bee10706671869328ee1c47 Mon Sep 17 00:00:00 2001 From: jreback Date: Wed, 26 Feb 2014 08:32:04 -0500 Subject: [PATCH 2/2] BUG: fix non-caching of some frequency offsets for date generation DOC: release notes --- doc/source/release.rst | 2 ++ doc/source/v0.14.0.txt | 3 ++ pandas/tseries/offsets.py | 40 ++++++++++++------------- pandas/tseries/tests/test_offsets.py | 45 ++++++++++++++-------------- 4 files changed, 47 insertions(+), 43 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 51833dd4d2e3b..d9b416136ae7f 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -123,6 +123,8 @@ Improvements to existing features - ``Series.rank()`` now has a percentage rank option (:issue:`5971`) - ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`) +- perf improvements in DataFrame construction with certain offsets, by removing faulty caching + (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) .. _release.bug_fixes-0.14.0: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index e914b2a4693d0..13c0b66056695 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -235,6 +235,9 @@ Enhancements Performance ~~~~~~~~~~~ +- perf improvements in DataFrame construction with certain offsets, by removing faulty caching + (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) + Experimental ~~~~~~~~~~~~ diff --git a/pandas/tseries/offsets.py b/pandas/tseries/offsets.py index ab9f49ddd321e..299d532c20b08 100644 --- a/pandas/tseries/offsets.py +++ b/pandas/tseries/offsets.py @@ -291,7 +291,7 @@ def _from_name(cls, suffix=None): return cls() -class BusinessDay(CacheableOffset, SingleConstructorOffset): +class BusinessDay(SingleConstructorOffset): """ DateOffset subclass representing possibly n business days """ @@ -399,7 +399,7 @@ def apply(self, other): n -= 5 * k if n == 0 and result.weekday() > 4: n -= 1 - + while n != 0: k = n // abs(n) result = result + timedelta(k) @@ -548,7 +548,7 @@ def name(self): return "%s-%s" % (self.rule_code, _int_to_month[self.n]) -class MonthEnd(CacheableOffset, MonthOffset): +class MonthEnd(MonthOffset): """DateOffset of one month end""" def apply(self, other): @@ -572,7 +572,7 @@ def onOffset(cls, dt): _prefix = 'M' -class MonthBegin(CacheableOffset, MonthOffset): +class MonthBegin(MonthOffset): """DateOffset of one month at beginning""" def apply(self, other): @@ -591,7 +591,7 @@ def onOffset(cls, dt): _prefix = 'MS' -class BusinessMonthEnd(CacheableOffset, MonthOffset): +class BusinessMonthEnd(MonthOffset): """DateOffset increments between business EOM dates""" def isAnchored(self): @@ -619,7 +619,7 @@ def apply(self, other): _prefix = 'BM' -class BusinessMonthBegin(CacheableOffset, MonthOffset): +class BusinessMonthBegin(MonthOffset): """DateOffset of one business month at beginning""" def apply(self, other): @@ -654,7 +654,7 @@ def onOffset(cls, dt): _prefix = 'BMS' -class Week(CacheableOffset, DateOffset): +class Week(DateOffset): """ Weekly offset @@ -744,7 +744,7 @@ class WeekDay(object): _weekday_to_int = dict((v, k) for k, v in _int_to_weekday.items()) -class WeekOfMonth(CacheableOffset, DateOffset): +class WeekOfMonth(DateOffset): """ Describes monthly dates like "the Tuesday of the 2nd week of each month" @@ -830,7 +830,7 @@ def _from_name(cls, suffix=None): weekday = _weekday_to_int[suffix[1:]] return cls(week=week, weekday=weekday) -class LastWeekOfMonth(CacheableOffset, DateOffset): +class LastWeekOfMonth(DateOffset): """ Describes monthly dates in last week of month like "the last Tuesday of each month" @@ -940,7 +940,7 @@ def rule_code(self): return '%s-%s' % (self._prefix, _int_to_month[self.startingMonth]) -class BQuarterEnd(CacheableOffset, QuarterOffset): +class BQuarterEnd(QuarterOffset): """DateOffset increments between business Quarter dates startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... @@ -999,7 +999,7 @@ def onOffset(self, dt): # TODO: This is basically the same as BQuarterEnd -class BQuarterBegin(CacheableOffset, QuarterOffset): +class BQuarterBegin(QuarterOffset): _outputName = "BusinessQuarterBegin" # I suspect this is wrong for *all* of them. _default_startingMonth = 3 @@ -1036,7 +1036,7 @@ def apply(self, other): return as_timestamp(result) -class QuarterEnd(CacheableOffset, QuarterOffset): +class QuarterEnd(QuarterOffset): """DateOffset increments between business Quarter dates startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ... startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ... @@ -1077,7 +1077,7 @@ def onOffset(self, dt): return MonthEnd().onOffset(dt) and modMonth == 0 -class QuarterBegin(CacheableOffset, QuarterOffset): +class QuarterBegin(QuarterOffset): _outputName = 'QuarterBegin' _default_startingMonth = 3 _from_name_startingMonth = 1 @@ -1129,7 +1129,7 @@ def rule_code(self): return '%s-%s' % (self._prefix, _int_to_month[self.month]) -class BYearEnd(CacheableOffset, YearOffset): +class BYearEnd(YearOffset): """DateOffset increments between business EOM dates""" _outputName = 'BusinessYearEnd' _default_month = 12 @@ -1166,7 +1166,7 @@ def apply(self, other): return result -class BYearBegin(CacheableOffset, YearOffset): +class BYearBegin(YearOffset): """DateOffset increments between business year begin dates""" _outputName = 'BusinessYearBegin' _default_month = 1 @@ -1198,7 +1198,7 @@ def apply(self, other): return as_timestamp(datetime(other.year, self.month, first)) -class YearEnd(CacheableOffset, YearOffset): +class YearEnd(YearOffset): """DateOffset increments between calendar year ends""" _default_month = 12 _prefix = 'A' @@ -1254,7 +1254,7 @@ def onOffset(self, dt): return self.month == dt.month and dt.day == days_in_month -class YearBegin(CacheableOffset, YearOffset): +class YearBegin(YearOffset): """DateOffset increments between calendar year begin dates""" _default_month = 1 _prefix = 'AS' @@ -1300,7 +1300,7 @@ def onOffset(self, dt): return dt.month == self.month and dt.day == 1 -class FY5253(CacheableOffset, DateOffset): +class FY5253(DateOffset): """ Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar. @@ -1501,7 +1501,7 @@ def _from_name(cls, *args): return cls(**cls._parse_suffix(*args)) -class FY5253Quarter(CacheableOffset, DateOffset): +class FY5253Quarter(DateOffset): """ DateOffset increments between business quarter dates for 52-53 week fiscal year (also known as a 4-4-5 calendar). @@ -1772,7 +1772,7 @@ def _delta_to_nanoseconds(delta): + delta.microseconds) * 1000 -class Day(CacheableOffset, Tick): +class Day(Tick): _inc = timedelta(1) _prefix = 'D' diff --git a/pandas/tseries/tests/test_offsets.py b/pandas/tseries/tests/test_offsets.py index d30a646b1b1d6..50a9558350c5f 100644 --- a/pandas/tseries/tests/test_offsets.py +++ b/pandas/tseries/tests/test_offsets.py @@ -316,7 +316,7 @@ def test_apply_large_n(self): rs = st + off xp = datetime(2011, 12, 26) self.assertEqual(rs, xp) - + off = BDay() * 10 rs = datetime(2014, 1, 5) + off # see #5890 xp = datetime(2014, 1, 17) @@ -2427,25 +2427,9 @@ def get_all_subclasses(cls): return ret class TestCaching(tm.TestCase): - no_simple_ctr = [WeekOfMonth, FY5253, - FY5253Quarter, - LastWeekOfMonth] - - def test_should_cache_month_end(self): - self.assertTrue(MonthEnd()._should_cache()) - - def test_should_cache_bmonth_end(self): - self.assertTrue(BusinessMonthEnd()._should_cache()) - - def test_should_cache_week_month(self): - self.assertTrue(WeekOfMonth(weekday=1, week=2)._should_cache()) - def test_all_cacheableoffsets(self): - for subclass in get_all_subclasses(CacheableOffset): - if subclass.__name__[0] == "_" \ - or subclass in TestCaching.no_simple_ctr: - continue - self.run_X_index_creation(subclass) + # as of GH 6479 (in 0.14.0), offset caching is turned off + # as of v0.12.0 only BusinessMonth/Quarter were actually caching def setUp(self): _daterange_cache.clear() @@ -2462,20 +2446,35 @@ def run_X_index_creation(self, cls): DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=inst1, normalize=True) self.assertTrue(cls() in _daterange_cache, cls) + def test_should_cache_month_end(self): + self.assertFalse(MonthEnd()._should_cache()) + + def test_should_cache_bmonth_end(self): + self.assertFalse(BusinessMonthEnd()._should_cache()) + + def test_should_cache_week_month(self): + self.assertFalse(WeekOfMonth(weekday=1, week=2)._should_cache()) + + def test_all_cacheableoffsets(self): + for subclass in get_all_subclasses(CacheableOffset): + if subclass.__name__[0] == "_" \ + or subclass in TestCaching.no_simple_ctr: + continue + self.run_X_index_creation(subclass) + def test_month_end_index_creation(self): DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=MonthEnd(), normalize=True) - self.assertTrue(MonthEnd() in _daterange_cache) + self.assertFalse(MonthEnd() in _daterange_cache) def test_bmonth_end_index_creation(self): DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=BusinessMonthEnd(), normalize=True) - self.assertTrue(BusinessMonthEnd() in _daterange_cache) + self.assertFalse(BusinessMonthEnd() in _daterange_cache) def test_week_of_month_index_creation(self): inst1 = WeekOfMonth(weekday=1, week=2) DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=inst1, normalize=True) inst2 = WeekOfMonth(weekday=1, week=2) - self.assertTrue(inst2 in _daterange_cache) - + self.assertFalse(inst2 in _daterange_cache) class TestReprNames(tm.TestCase): def test_str_for_named_is_name(self):