Skip to content

Commit 8cd9819

Browse files
committed
Merge pull request #6481 from jreback/di_perf
PERF: perf improvements in DataFrame construction with a non-daily datelike index (GH6479)
2 parents 10ad9dd + 872ba0d commit 8cd9819

File tree

5 files changed

+70
-43
lines changed

5 files changed

+70
-43
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,8 @@ Improvements to existing features
127127
- ``Series.rank()`` now has a percentage rank option (:issue:`5971`)
128128
- ``quotechar``, ``doublequote``, and ``escapechar`` can now be specified when
129129
using ``DataFrame.to_csv`` (:issue:`5414`, :issue:`4528`)
130+
- Performance improvements in ``DataFrame`` construction with certain offsets, by removing faulty caching
131+
(e.g. ``MonthEnd``, ``BusinessMonthEnd``) (:issue:`6479`)
130132

131133
.. _release.bug_fixes-0.14.0:
132134

doc/source/v0.14.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,9 @@ Enhancements
237237
Performance
238238
~~~~~~~~~~~
239239

240+
- Performance improvements in ``DataFrame`` construction with certain offsets, by removing faulty caching
241+
(e.g. ``MonthEnd``, ``BusinessMonthEnd``) (:issue:`6479`)
242+
240243
Experimental
241244
~~~~~~~~~~~~
242245

pandas/tseries/offsets.py

+20-20
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ def _from_name(cls, suffix=None):
291291
return cls()
292292

293293

294-
class BusinessDay(CacheableOffset, SingleConstructorOffset):
294+
class BusinessDay(SingleConstructorOffset):
295295
"""
296296
DateOffset subclass representing possibly n business days
297297
"""
@@ -399,7 +399,7 @@ def apply(self, other):
399399
n -= 5 * k
400400
if n == 0 and result.weekday() > 4:
401401
n -= 1
402-
402+
403403
while n != 0:
404404
k = n // abs(n)
405405
result = result + timedelta(k)
@@ -548,7 +548,7 @@ def name(self):
548548
return "%s-%s" % (self.rule_code, _int_to_month[self.n])
549549

550550

551-
class MonthEnd(CacheableOffset, MonthOffset):
551+
class MonthEnd(MonthOffset):
552552
"""DateOffset of one month end"""
553553

554554
def apply(self, other):
@@ -572,7 +572,7 @@ def onOffset(cls, dt):
572572
_prefix = 'M'
573573

574574

575-
class MonthBegin(CacheableOffset, MonthOffset):
575+
class MonthBegin(MonthOffset):
576576
"""DateOffset of one month at beginning"""
577577

578578
def apply(self, other):
@@ -591,7 +591,7 @@ def onOffset(cls, dt):
591591
_prefix = 'MS'
592592

593593

594-
class BusinessMonthEnd(CacheableOffset, MonthOffset):
594+
class BusinessMonthEnd(MonthOffset):
595595
"""DateOffset increments between business EOM dates"""
596596

597597
def isAnchored(self):
@@ -619,7 +619,7 @@ def apply(self, other):
619619
_prefix = 'BM'
620620

621621

622-
class BusinessMonthBegin(CacheableOffset, MonthOffset):
622+
class BusinessMonthBegin(MonthOffset):
623623
"""DateOffset of one business month at beginning"""
624624

625625
def apply(self, other):
@@ -654,7 +654,7 @@ def onOffset(cls, dt):
654654
_prefix = 'BMS'
655655

656656

657-
class Week(CacheableOffset, DateOffset):
657+
class Week(DateOffset):
658658
"""
659659
Weekly offset
660660
@@ -744,7 +744,7 @@ class WeekDay(object):
744744
_weekday_to_int = dict((v, k) for k, v in _int_to_weekday.items())
745745

746746

747-
class WeekOfMonth(CacheableOffset, DateOffset):
747+
class WeekOfMonth(DateOffset):
748748
"""
749749
Describes monthly dates like "the Tuesday of the 2nd week of each month"
750750
@@ -830,7 +830,7 @@ def _from_name(cls, suffix=None):
830830
weekday = _weekday_to_int[suffix[1:]]
831831
return cls(week=week, weekday=weekday)
832832

833-
class LastWeekOfMonth(CacheableOffset, DateOffset):
833+
class LastWeekOfMonth(DateOffset):
834834
"""
835835
Describes monthly dates in last week of month like "the last Tuesday of each month"
836836
@@ -940,7 +940,7 @@ def rule_code(self):
940940
return '%s-%s' % (self._prefix, _int_to_month[self.startingMonth])
941941

942942

943-
class BQuarterEnd(CacheableOffset, QuarterOffset):
943+
class BQuarterEnd(QuarterOffset):
944944
"""DateOffset increments between business Quarter dates
945945
startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
946946
startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
@@ -999,7 +999,7 @@ def onOffset(self, dt):
999999

10001000

10011001
# TODO: This is basically the same as BQuarterEnd
1002-
class BQuarterBegin(CacheableOffset, QuarterOffset):
1002+
class BQuarterBegin(QuarterOffset):
10031003
_outputName = "BusinessQuarterBegin"
10041004
# I suspect this is wrong for *all* of them.
10051005
_default_startingMonth = 3
@@ -1036,7 +1036,7 @@ def apply(self, other):
10361036
return as_timestamp(result)
10371037

10381038

1039-
class QuarterEnd(CacheableOffset, QuarterOffset):
1039+
class QuarterEnd(QuarterOffset):
10401040
"""DateOffset increments between business Quarter dates
10411041
startingMonth = 1 corresponds to dates like 1/31/2007, 4/30/2007, ...
10421042
startingMonth = 2 corresponds to dates like 2/28/2007, 5/31/2007, ...
@@ -1077,7 +1077,7 @@ def onOffset(self, dt):
10771077
return MonthEnd().onOffset(dt) and modMonth == 0
10781078

10791079

1080-
class QuarterBegin(CacheableOffset, QuarterOffset):
1080+
class QuarterBegin(QuarterOffset):
10811081
_outputName = 'QuarterBegin'
10821082
_default_startingMonth = 3
10831083
_from_name_startingMonth = 1
@@ -1129,7 +1129,7 @@ def rule_code(self):
11291129
return '%s-%s' % (self._prefix, _int_to_month[self.month])
11301130

11311131

1132-
class BYearEnd(CacheableOffset, YearOffset):
1132+
class BYearEnd(YearOffset):
11331133
"""DateOffset increments between business EOM dates"""
11341134
_outputName = 'BusinessYearEnd'
11351135
_default_month = 12
@@ -1166,7 +1166,7 @@ def apply(self, other):
11661166
return result
11671167

11681168

1169-
class BYearBegin(CacheableOffset, YearOffset):
1169+
class BYearBegin(YearOffset):
11701170
"""DateOffset increments between business year begin dates"""
11711171
_outputName = 'BusinessYearBegin'
11721172
_default_month = 1
@@ -1198,7 +1198,7 @@ def apply(self, other):
11981198
return as_timestamp(datetime(other.year, self.month, first))
11991199

12001200

1201-
class YearEnd(CacheableOffset, YearOffset):
1201+
class YearEnd(YearOffset):
12021202
"""DateOffset increments between calendar year ends"""
12031203
_default_month = 12
12041204
_prefix = 'A'
@@ -1254,7 +1254,7 @@ def onOffset(self, dt):
12541254
return self.month == dt.month and dt.day == days_in_month
12551255

12561256

1257-
class YearBegin(CacheableOffset, YearOffset):
1257+
class YearBegin(YearOffset):
12581258
"""DateOffset increments between calendar year begin dates"""
12591259
_default_month = 1
12601260
_prefix = 'AS'
@@ -1300,7 +1300,7 @@ def onOffset(self, dt):
13001300
return dt.month == self.month and dt.day == 1
13011301

13021302

1303-
class FY5253(CacheableOffset, DateOffset):
1303+
class FY5253(DateOffset):
13041304
"""
13051305
Describes 52-53 week fiscal year. This is also known as a 4-4-5 calendar.
13061306
@@ -1501,7 +1501,7 @@ def _from_name(cls, *args):
15011501
return cls(**cls._parse_suffix(*args))
15021502

15031503

1504-
class FY5253Quarter(CacheableOffset, DateOffset):
1504+
class FY5253Quarter(DateOffset):
15051505
"""
15061506
DateOffset increments between business quarter dates
15071507
for 52-53 week fiscal year (also known as a 4-4-5 calendar).
@@ -1772,7 +1772,7 @@ def _delta_to_nanoseconds(delta):
17721772
+ delta.microseconds) * 1000
17731773

17741774

1775-
class Day(CacheableOffset, Tick):
1775+
class Day(Tick):
17761776
_inc = timedelta(1)
17771777
_prefix = 'D'
17781778

pandas/tseries/tests/test_offsets.py

+22-23
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ def test_apply_large_n(self):
316316
rs = st + off
317317
xp = datetime(2011, 12, 26)
318318
self.assertEqual(rs, xp)
319-
319+
320320
off = BDay() * 10
321321
rs = datetime(2014, 1, 5) + off # see #5890
322322
xp = datetime(2014, 1, 17)
@@ -2427,25 +2427,9 @@ def get_all_subclasses(cls):
24272427
return ret
24282428

24292429
class TestCaching(tm.TestCase):
2430-
no_simple_ctr = [WeekOfMonth, FY5253,
2431-
FY5253Quarter,
2432-
LastWeekOfMonth]
2433-
2434-
def test_should_cache_month_end(self):
2435-
self.assertTrue(MonthEnd()._should_cache())
2436-
2437-
def test_should_cache_bmonth_end(self):
2438-
self.assertTrue(BusinessMonthEnd()._should_cache())
2439-
2440-
def test_should_cache_week_month(self):
2441-
self.assertTrue(WeekOfMonth(weekday=1, week=2)._should_cache())
24422430

2443-
def test_all_cacheableoffsets(self):
2444-
for subclass in get_all_subclasses(CacheableOffset):
2445-
if subclass.__name__[0] == "_" \
2446-
or subclass in TestCaching.no_simple_ctr:
2447-
continue
2448-
self.run_X_index_creation(subclass)
2431+
# as of GH 6479 (in 0.14.0), offset caching is turned off
2432+
# as of v0.12.0 only BusinessMonth/Quarter were actually caching
24492433

24502434
def setUp(self):
24512435
_daterange_cache.clear()
@@ -2462,20 +2446,35 @@ def run_X_index_creation(self, cls):
24622446
DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=inst1, normalize=True)
24632447
self.assertTrue(cls() in _daterange_cache, cls)
24642448

2449+
def test_should_cache_month_end(self):
2450+
self.assertFalse(MonthEnd()._should_cache())
2451+
2452+
def test_should_cache_bmonth_end(self):
2453+
self.assertFalse(BusinessMonthEnd()._should_cache())
2454+
2455+
def test_should_cache_week_month(self):
2456+
self.assertFalse(WeekOfMonth(weekday=1, week=2)._should_cache())
2457+
2458+
def test_all_cacheableoffsets(self):
2459+
for subclass in get_all_subclasses(CacheableOffset):
2460+
if subclass.__name__[0] == "_" \
2461+
or subclass in TestCaching.no_simple_ctr:
2462+
continue
2463+
self.run_X_index_creation(subclass)
2464+
24652465
def test_month_end_index_creation(self):
24662466
DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,31), freq=MonthEnd(), normalize=True)
2467-
self.assertTrue(MonthEnd() in _daterange_cache)
2467+
self.assertFalse(MonthEnd() in _daterange_cache)
24682468

24692469
def test_bmonth_end_index_creation(self):
24702470
DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=BusinessMonthEnd(), normalize=True)
2471-
self.assertTrue(BusinessMonthEnd() in _daterange_cache)
2471+
self.assertFalse(BusinessMonthEnd() in _daterange_cache)
24722472

24732473
def test_week_of_month_index_creation(self):
24742474
inst1 = WeekOfMonth(weekday=1, week=2)
24752475
DatetimeIndex(start=datetime(2013,1,31), end=datetime(2013,3,29), freq=inst1, normalize=True)
24762476
inst2 = WeekOfMonth(weekday=1, week=2)
2477-
self.assertTrue(inst2 in _daterange_cache)
2478-
2477+
self.assertFalse(inst2 in _daterange_cache)
24792478

24802479
class TestReprNames(tm.TestCase):
24812480
def test_str_for_named_is_name(self):

vb_suite/frame_ctor.py

+23
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,15 @@
11
from vbench.benchmark import Benchmark
22
from datetime import datetime
3+
try:
4+
import pandas.tseries.offsets as offsets
5+
except:
6+
import pandas.core.datetools as offsets
37

48
common_setup = """from pandas_vb_common import *
9+
try:
10+
from pandas.tseries.offsets import *
11+
except:
12+
from pandas.core.datetools import *
513
"""
614

715
#----------------------------------------------------------------------
@@ -36,6 +44,21 @@
3644
"""
3745
frame_ctor_nested_dict_int64 = Benchmark("DataFrame(data)", setup)
3846

47+
# dynamically generate benchmarks for every offset
48+
dynamic_benchmarks = {}
49+
n_steps = [1, 2]
50+
for offset in offsets.__all__:
51+
for n in n_steps:
52+
setup = common_setup + """
53+
df = DataFrame(np.random.randn(1000,10),index=date_range('1/1/1900',periods=1000,freq={}({})))
54+
d = dict([ (col,df[col]) for col in df.columns ])
55+
""".format(offset, n)
56+
key = 'frame_ctor_dtindex_{}({})'.format(offset, n)
57+
dynamic_benchmarks[key] = Benchmark("DataFrame(d)", setup, name=key)
58+
59+
# Have to stuff them in globals() so vbench detects them
60+
globals().update(dynamic_benchmarks)
61+
3962
# from a mi-series
4063
setup = common_setup + """
4164
mi = MultiIndex.from_tuples([(x,y) for x in range(100) for y in range(100)])

0 commit comments

Comments
 (0)