From fec1b5149d12d2f8a5176e17568235049b32bd24 Mon Sep 17 00:00:00 2001 From: rs2 Date: Sat, 16 Apr 2016 23:03:40 +0100 Subject: [PATCH 1/6] Fix performance issues when creating multiple instances of Period (#12903, #11831) --- pandas/src/period.pyx | 31 +++++++------------------------ pandas/tseries/constants.py | 7 +++++++ pandas/tseries/frequencies.py | 14 +++++++------- 3 files changed, 21 insertions(+), 31 deletions(-) create mode 100644 pandas/tseries/constants.py diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 33c213ac5d8df..1eb154b2473f1 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -36,6 +36,13 @@ from tslib cimport ( _nat_scalar_rules, ) +from pandas.tseries.frequencies import _period_alias_dict +from pandas.tseries.frequencies import _get_freq_str +from pandas.tseries.frequencies import to_offset +from pandas.tseries.frequencies import get_freq_code as _gfc +from pandas.tseries import frequencies +from pandas.tseries.constants import US_RESO, MS_RESO, S_RESO, T_RESO, H_RESO, D_RESO + from sys import version_info cdef bint PY2 = version_info[0] == 2 @@ -476,12 +483,6 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None): reso = curr_reso return reso -US_RESO = 0 -MS_RESO = 1 -S_RESO = 2 -T_RESO = 3 -H_RESO = 4 -D_RESO = 5 cdef inline int _reso_stamp(pandas_datetimestruct *dts): if dts.us != 0: @@ -662,16 +663,12 @@ cdef class Period(object): def _maybe_convert_freq(cls, object freq): if isinstance(freq, compat.string_types): - from pandas.tseries.frequencies import _period_alias_dict freq = freq.upper() freq = _period_alias_dict.get(freq, freq) elif isinstance(freq, (int, tuple)): - from pandas.tseries.frequencies import get_freq_code as _gfc - from pandas.tseries.frequencies import _get_freq_str code, stride = _gfc(freq) freq = _get_freq_str(code, stride) - from pandas.tseries.frequencies import to_offset freq = to_offset(freq) if freq.n <= 0: @@ -691,9 +688,6 @@ cdef class Period(object): def __init__(self, value=None, freq=None, ordinal=None, year=None, month=1, quarter=None, day=1, hour=0, minute=0, second=0): - from pandas.tseries import frequencies - from pandas.tseries.frequencies import get_freq_code as _gfc - # freq points to a tuple (base, mult); base is one of the defined # periods such as A, Q, etc. Every five minutes would be, e.g., # ('T', 5) but may be passed in as a string like '5T' @@ -771,7 +765,6 @@ cdef class Period(object): def __richcmp__(self, other, op): if isinstance(other, Period): - from pandas.tseries.frequencies import get_freq_code as _gfc if other.freq != self.freq: msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr) raise IncompatibleFrequency(msg) @@ -790,7 +783,6 @@ cdef class Period(object): return hash((self.ordinal, self.freq)) def _add_delta(self, other): - from pandas.tseries import frequencies if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)): offset = frequencies.to_offset(self.freq.rule_code) if isinstance(offset, offsets.Tick): @@ -868,7 +860,6 @@ cdef class Period(object): ------- resampled : Period """ - from pandas.tseries.frequencies import get_freq_code as _gfc how = _validate_end_alias(how) base1, mult1 = _gfc(self.freq) base2, mult2 = _gfc(freq) @@ -918,8 +909,6 @@ cdef class Period(object): ------- Timestamp """ - from pandas.tseries import frequencies - from pandas.tseries.frequencies import get_freq_code as _gfc how = _validate_end_alias(how) if freq is None: @@ -933,7 +922,6 @@ cdef class Period(object): return Timestamp(dt64, tz=tz) cdef _field(self, alias): - from pandas.tseries.frequencies import get_freq_code as _gfc base, mult = _gfc(self.freq) return get_period_field(alias, self.ordinal, base) @@ -996,7 +984,6 @@ cdef class Period(object): return self.freq.freqstr def __repr__(self): - from pandas.tseries.frequencies import get_freq_code as _gfc base, mult = _gfc(self.freq) formatted = period_format(self.ordinal, base) return "Period('%s', '%s')" % (formatted, self.freqstr) @@ -1008,7 +995,6 @@ cdef class Period(object): Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. """ - from pandas.tseries.frequencies import get_freq_code as _gfc base, mult = _gfc(self.freq) formatted = period_format(self.ordinal, base) value = ("%s" % formatted) @@ -1159,14 +1145,12 @@ cdef class Period(object): >>> a.strftime('%b. %d, %Y was a %A') 'Jan. 01, 2001 was a Monday' """ - from pandas.tseries.frequencies import get_freq_code as _gfc base, mult = _gfc(self.freq) return period_format(self.ordinal, base, fmt) def _ordinal_from_fields(year, month, quarter, day, hour, minute, second, freq): - from pandas.tseries.frequencies import get_freq_code as _gfc base, mult = _gfc(freq) if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) @@ -1179,7 +1163,6 @@ def _quarter_to_myear(year, quarter, freq): if quarter <= 0 or quarter > 4: raise ValueError('Quarter must be 1 <= q <= 4') - from pandas.tseries import frequencies mnum = frequencies._month_numbers[frequencies._get_rule_month(freq)] + 1 month = (mnum + (quarter - 1) * 3) % 12 + 1 if month > mnum: diff --git a/pandas/tseries/constants.py b/pandas/tseries/constants.py new file mode 100644 index 0000000000000..2998a573dab63 --- /dev/null +++ b/pandas/tseries/constants.py @@ -0,0 +1,7 @@ +US_RESO = 0 +MS_RESO = 1 +S_RESO = 2 +T_RESO = 3 +H_RESO = 4 +D_RESO = 5 + diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 058a8db9ead08..1af388324afb2 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -14,7 +14,7 @@ import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib -import pandas._period as period +from pandas.tseries import constants from pandas.tslib import Timedelta from pytz import AmbiguousTimeError @@ -38,12 +38,12 @@ class Resolution(object): # defined in period.pyx # note that these are different from freq codes - RESO_US = period.US_RESO - RESO_MS = period.MS_RESO - RESO_SEC = period.S_RESO - RESO_MIN = period.T_RESO - RESO_HR = period.H_RESO - RESO_DAY = period.D_RESO + RESO_US = constants.US_RESO + RESO_MS = constants.MS_RESO + RESO_SEC = constants.S_RESO + RESO_MIN = constants.T_RESO + RESO_HR = constants.H_RESO + RESO_DAY = constants.D_RESO _reso_str_map = { RESO_US: 'microsecond', From 5b3e291549a008c3beed7f6803f446645e3c09d5 Mon Sep 17 00:00:00 2001 From: rs2 Date: Tue, 19 Apr 2016 22:45:56 +0100 Subject: [PATCH 2/6] Moved constants to frequencies.py --- pandas/src/period.pyx | 51 ++++++++++++++++------------------- pandas/tseries/constants.py | 7 ----- pandas/tseries/frequencies.py | 21 ++++++++++----- 3 files changed, 37 insertions(+), 42 deletions(-) delete mode 100644 pandas/tseries/constants.py diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index 1eb154b2473f1..b390fe92d9f47 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -36,12 +36,7 @@ from tslib cimport ( _nat_scalar_rules, ) -from pandas.tseries.frequencies import _period_alias_dict -from pandas.tseries.frequencies import _get_freq_str -from pandas.tseries.frequencies import to_offset -from pandas.tseries.frequencies import get_freq_code as _gfc from pandas.tseries import frequencies -from pandas.tseries.constants import US_RESO, MS_RESO, S_RESO, T_RESO, H_RESO, D_RESO from sys import version_info @@ -468,7 +463,7 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef: Py_ssize_t i, n = len(stamps) pandas_datetimestruct dts - int reso = D_RESO, curr_reso + int reso = frequencies.D_RESO, curr_reso if tz is not None: tz = maybe_get_tz(tz) @@ -487,20 +482,20 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef inline int _reso_stamp(pandas_datetimestruct *dts): if dts.us != 0: if dts.us % 1000 == 0: - return MS_RESO - return US_RESO + return frequencies.MS_RESO + return frequencies.US_RESO elif dts.sec != 0: - return S_RESO + return frequencies.S_RESO elif dts.min != 0: - return T_RESO + return frequencies.T_RESO elif dts.hour != 0: - return H_RESO - return D_RESO + return frequencies.H_RESO + return frequencies.D_RESO cdef _reso_local(ndarray[int64_t] stamps, object tz): cdef: Py_ssize_t n = len(stamps) - int reso = D_RESO, curr_reso + int reso = frequencies.D_RESO, curr_reso ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts @@ -664,12 +659,12 @@ cdef class Period(object): if isinstance(freq, compat.string_types): freq = freq.upper() - freq = _period_alias_dict.get(freq, freq) + freq = frequencies._period_alias_dict.get(freq, freq) elif isinstance(freq, (int, tuple)): - code, stride = _gfc(freq) - freq = _get_freq_str(code, stride) + code, stride = frequencies.get_freq_code(freq) + freq = frequencies._get_freq_str(code, stride) - freq = to_offset(freq) + freq = frequencies.to_offset(freq) if freq.n <= 0: raise ValueError('Frequency must be positive, because it' @@ -711,7 +706,7 @@ cdef class Period(object): elif isinstance(value, Period): other = value - if freq is None or _gfc(freq) == _gfc(other.freq): + if freq is None or frequencies.get_freq_code(freq) == frequencies.get_freq_code(other.freq): ordinal = other.ordinal freq = other.freq else: @@ -752,7 +747,7 @@ cdef class Period(object): msg = "Value must be Period, string, integer, or datetime" raise ValueError(msg) - base, mult = _gfc(freq) + base, mult = frequencies.get_freq_code(freq) if ordinal is None: self.ordinal = get_period_ordinal(dt.year, dt.month, dt.day, @@ -861,8 +856,8 @@ cdef class Period(object): resampled : Period """ how = _validate_end_alias(how) - base1, mult1 = _gfc(self.freq) - base2, mult2 = _gfc(freq) + base1, mult1 = frequencies.get_freq_code(self.freq) + base2, mult2 = frequencies.get_freq_code(freq) if self.ordinal == tslib.iNaT: ordinal = self.ordinal @@ -912,17 +907,17 @@ cdef class Period(object): how = _validate_end_alias(how) if freq is None: - base, mult = _gfc(self.freq) + base, mult = frequencies.get_freq_code(self.freq) freq = frequencies.get_to_timestamp_base(base) - base, mult = _gfc(freq) + base, mult = frequencies.get_freq_code(freq) val = self.asfreq(freq, how) dt64 = period_ordinal_to_dt64(val.ordinal, base) return Timestamp(dt64, tz=tz) cdef _field(self, alias): - base, mult = _gfc(self.freq) + base, mult = frequencies.get_freq_code(self.freq) return get_period_field(alias, self.ordinal, base) property year: @@ -984,7 +979,7 @@ cdef class Period(object): return self.freq.freqstr def __repr__(self): - base, mult = _gfc(self.freq) + base, mult = frequencies.get_freq_code(self.freq) formatted = period_format(self.ordinal, base) return "Period('%s', '%s')" % (formatted, self.freqstr) @@ -995,7 +990,7 @@ cdef class Period(object): Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. """ - base, mult = _gfc(self.freq) + base, mult = frequencies.get_freq_code(self.freq) formatted = period_format(self.ordinal, base) value = ("%s" % formatted) return value @@ -1145,13 +1140,13 @@ cdef class Period(object): >>> a.strftime('%b. %d, %Y was a %A') 'Jan. 01, 2001 was a Monday' """ - base, mult = _gfc(self.freq) + base, mult = frequencies.get_freq_code(self.freq) return period_format(self.ordinal, base, fmt) def _ordinal_from_fields(year, month, quarter, day, hour, minute, second, freq): - base, mult = _gfc(freq) + base, mult = frequencies.get_freq_code(freq) if quarter is not None: year, month = _quarter_to_myear(year, quarter, freq) diff --git a/pandas/tseries/constants.py b/pandas/tseries/constants.py deleted file mode 100644 index 2998a573dab63..0000000000000 --- a/pandas/tseries/constants.py +++ /dev/null @@ -1,7 +0,0 @@ -US_RESO = 0 -MS_RESO = 1 -S_RESO = 2 -T_RESO = 3 -H_RESO = 4 -D_RESO = 5 - diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py index 1af388324afb2..b053f455d7f4b 100644 --- a/pandas/tseries/frequencies.py +++ b/pandas/tseries/frequencies.py @@ -14,7 +14,6 @@ import pandas.core.common as com import pandas.lib as lib import pandas.tslib as tslib -from pandas.tseries import constants from pandas.tslib import Timedelta from pytz import AmbiguousTimeError @@ -34,16 +33,24 @@ class FreqGroup(object): FR_NS = 12000 +US_RESO = 0 +MS_RESO = 1 +S_RESO = 2 +T_RESO = 3 +H_RESO = 4 +D_RESO = 5 + + class Resolution(object): # defined in period.pyx # note that these are different from freq codes - RESO_US = constants.US_RESO - RESO_MS = constants.MS_RESO - RESO_SEC = constants.S_RESO - RESO_MIN = constants.T_RESO - RESO_HR = constants.H_RESO - RESO_DAY = constants.D_RESO + RESO_US = US_RESO + RESO_MS = MS_RESO + RESO_SEC = S_RESO + RESO_MIN = T_RESO + RESO_HR = H_RESO + RESO_DAY = D_RESO _reso_str_map = { RESO_US: 'microsecond', From 8f254e3ddca7c38dcac1a22c5ce9fcdbe97729d8 Mon Sep 17 00:00:00 2001 From: rs2 Date: Wed, 20 Apr 2016 05:08:38 +0100 Subject: [PATCH 3/6] Added a whatsnew entry + ensured constants are imported correctly by test_tslib.py --- doc/source/whatsnew/v0.18.1.txt | 2 +- pandas/tseries/tests/test_tslib.py | 15 +++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index cc84347313b42..b739da28c28de 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -243,7 +243,7 @@ Performance Improvements - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`) - +- Improved performance of creating multiple instances of Period. This has regressed from ``O(1)`` to ``O(n)`` in terms on number of calls to ``_find_and_load`` between ``0.15.2`` and ``0.18.0``. (:issue:`12903`, :issue:`11831`) diff --git a/pandas/tseries/tests/test_tslib.py b/pandas/tseries/tests/test_tslib.py index 863bc6f630d06..053c59fcbc7a5 100644 --- a/pandas/tseries/tests/test_tslib.py +++ b/pandas/tseries/tests/test_tslib.py @@ -11,7 +11,10 @@ from pandas.tslib import get_timezone from pandas._period import period_asfreq, period_ordinal from pandas.tseries.index import date_range, DatetimeIndex -from pandas.tseries.frequencies import get_freq +from pandas.tseries.frequencies import ( + get_freq, + US_RESO, MS_RESO, S_RESO, H_RESO, D_RESO, T_RESO +) import pandas.tseries.tools as tools import pandas.tseries.offsets as offsets import pandas.util.testing as tm @@ -1307,11 +1310,11 @@ def test_resolution(self): for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T', 'S', 'L', 'U'], - [period.D_RESO, period.D_RESO, - period.D_RESO, period.D_RESO, - period.H_RESO, period.T_RESO, - period.S_RESO, period.MS_RESO, - period.US_RESO]): + [D_RESO, D_RESO, + D_RESO, D_RESO, + H_RESO, T_RESO, + S_RESO, MS_RESO, + US_RESO]): for tz in [None, 'Asia/Tokyo', 'US/Eastern', 'dateutil/US/Eastern']: idx = date_range(start='2013-04-01', periods=30, freq=freq, From 8bcfd572308eb0852ab52c5e04df604dcb852eb7 Mon Sep 17 00:00:00 2001 From: rs2 Date: Wed, 20 Apr 2016 06:06:30 +0100 Subject: [PATCH 4/6] Reworded whatsnew --- doc/source/whatsnew/v0.18.1.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index b739da28c28de..d79bbfbe2e390 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -243,7 +243,7 @@ Performance Improvements - Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`) -- Improved performance of creating multiple instances of Period. This has regressed from ``O(1)`` to ``O(n)`` in terms on number of calls to ``_find_and_load`` between ``0.15.2`` and ``0.18.0``. (:issue:`12903`, :issue:`11831`) +- Improved performance of ``Period`` construction and plotting of ``Period``s. (:issue:`12903`, :issue:`11831`) From 1c5a2aba82f7d9667a9ceebe8082d47c623a0f8f Mon Sep 17 00:00:00 2001 From: rs2 Date: Wed, 20 Apr 2016 07:42:25 +0100 Subject: [PATCH 5/6] Added asv benchmark for Period, PeriodIndex --- asv_bench/benchmarks/period.py | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 asv_bench/benchmarks/period.py diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py new file mode 100644 index 0000000000000..1e4453af70b58 --- /dev/null +++ b/asv_bench/benchmarks/period.py @@ -0,0 +1,9 @@ +from pandas import Period, PeriodIndex, date_range + + +class create_period_index_from_date_range(object): + goal_time = 0.2 + + def time_period_index(self): + # Simulate irregular PeriodIndex + PeriodIndex(date_range('1985', periods=10000).to_pydatetime(), freq='D') From 0d9712db22ab813650a40c8165eddeaf78a1e8aa Mon Sep 17 00:00:00 2001 From: rs2 Date: Wed, 20 Apr 2016 08:37:06 +0100 Subject: [PATCH 6/6] Make RESO constants global in period.pyx and reduce the number of loops in asv_benchmarks/period.py --- asv_bench/benchmarks/period.py | 4 ++-- pandas/src/period.pyx | 22 ++++++++++++++-------- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/asv_bench/benchmarks/period.py b/asv_bench/benchmarks/period.py index 1e4453af70b58..012030a71ac82 100644 --- a/asv_bench/benchmarks/period.py +++ b/asv_bench/benchmarks/period.py @@ -1,4 +1,4 @@ -from pandas import Period, PeriodIndex, date_range +from pandas import PeriodIndex, date_range class create_period_index_from_date_range(object): @@ -6,4 +6,4 @@ class create_period_index_from_date_range(object): def time_period_index(self): # Simulate irregular PeriodIndex - PeriodIndex(date_range('1985', periods=10000).to_pydatetime(), freq='D') + PeriodIndex(date_range('1985', periods=1000).to_pydatetime(), freq='D') diff --git a/pandas/src/period.pyx b/pandas/src/period.pyx index b390fe92d9f47..e5802ccef7495 100644 --- a/pandas/src/period.pyx +++ b/pandas/src/period.pyx @@ -44,6 +44,12 @@ cdef bint PY2 = version_info[0] == 2 cdef int64_t NPY_NAT = util.get_nat() +cdef int US_RESO = frequencies.US_RESO +cdef int MS_RESO = frequencies.MS_RESO +cdef int S_RESO = frequencies.S_RESO +cdef int T_RESO = frequencies.T_RESO +cdef int H_RESO = frequencies.H_RESO +cdef int D_RESO = frequencies.D_RESO cdef extern from "period_helper.h": ctypedef struct date_info: @@ -463,7 +469,7 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef: Py_ssize_t i, n = len(stamps) pandas_datetimestruct dts - int reso = frequencies.D_RESO, curr_reso + int reso = D_RESO, curr_reso if tz is not None: tz = maybe_get_tz(tz) @@ -482,20 +488,20 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None): cdef inline int _reso_stamp(pandas_datetimestruct *dts): if dts.us != 0: if dts.us % 1000 == 0: - return frequencies.MS_RESO - return frequencies.US_RESO + return MS_RESO + return US_RESO elif dts.sec != 0: - return frequencies.S_RESO + return S_RESO elif dts.min != 0: - return frequencies.T_RESO + return T_RESO elif dts.hour != 0: - return frequencies.H_RESO - return frequencies.D_RESO + return H_RESO + return D_RESO cdef _reso_local(ndarray[int64_t] stamps, object tz): cdef: Py_ssize_t n = len(stamps) - int reso = frequencies.D_RESO, curr_reso + int reso = D_RESO, curr_reso ndarray[int64_t] trans, deltas, pos pandas_datetimestruct dts