Skip to content

Fix performance issues when creating multiple instances of Period (#12903, #11831) #12909

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions asv_bench/benchmarks/period.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from pandas import Period, PeriodIndex, date_range


class create_period_index_from_date_range(object):
goal_time = 0.2

def time_period_index(self):
# Simulate irregular PeriodIndex
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a benchmark for #11831 as well? (You can put it in this file even though it's a plotting benchmark.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's quite possible there is already a benchmark for this in the plotting benchmarks

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, right, there probably is.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was Python 3.5.1 :: Anaconda 2.5.0 (64-bit)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I'm not committing any new benchmarks, but let me run them locally before and after the change...

PeriodIndex(date_range('1985', periods=10000).to_pydatetime(), freq='D')
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just make this 1000 (the ratio will be the same, but the previous versions will take less time to run).

2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.18.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ Performance Improvements


- Improved performance of ``DataFrame.to_sql`` when checking case sensitivity for tables. Now only checks if table has been created correctly when table name is not lower case. (:issue:`12876`)

- Improved performance of ``Period`` construction and plotting of ``Period`` objects. (:issue:`12903`, :issue:`11831`)



Expand Down
72 changes: 25 additions & 47 deletions pandas/src/period.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ from tslib cimport (
_nat_scalar_rules,
)

from pandas.tseries import frequencies

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What you need to do is, at the top here, type and import all of the freqs that you need, e.g.:

cdef int D_RESO = frequencies.D_RESO
...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will do in a moment.

from sys import version_info

cdef bint PY2 = version_info[0] == 2
Expand Down Expand Up @@ -461,7 +463,7 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None):
cdef:
Py_ssize_t i, n = len(stamps)
pandas_datetimestruct dts
int reso = D_RESO, curr_reso
int reso = frequencies.D_RESO, curr_reso
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then you can simply use D_RESO as a global.


if tz is not None:
tz = maybe_get_tz(tz)
Expand All @@ -476,30 +478,24 @@ cpdef resolution(ndarray[int64_t] stamps, tz=None):
reso = curr_reso
return reso

US_RESO = 0
MS_RESO = 1
S_RESO = 2
T_RESO = 3
H_RESO = 4
D_RESO = 5

cdef inline int _reso_stamp(pandas_datetimestruct *dts):
if dts.us != 0:
if dts.us % 1000 == 0:
return MS_RESO
return US_RESO
return frequencies.MS_RESO
return frequencies.US_RESO
elif dts.sec != 0:
return S_RESO
return frequencies.S_RESO
elif dts.min != 0:
return T_RESO
return frequencies.T_RESO
elif dts.hour != 0:
return H_RESO
return D_RESO
return frequencies.H_RESO
return frequencies.D_RESO

cdef _reso_local(ndarray[int64_t] stamps, object tz):
cdef:
Py_ssize_t n = len(stamps)
int reso = D_RESO, curr_reso
int reso = frequencies.D_RESO, curr_reso
ndarray[int64_t] trans, deltas, pos
pandas_datetimestruct dts

Expand Down Expand Up @@ -662,17 +658,13 @@ cdef class Period(object):
def _maybe_convert_freq(cls, object freq):

if isinstance(freq, compat.string_types):
from pandas.tseries.frequencies import _period_alias_dict
freq = freq.upper()
freq = _period_alias_dict.get(freq, freq)
freq = frequencies._period_alias_dict.get(freq, freq)
elif isinstance(freq, (int, tuple)):
from pandas.tseries.frequencies import get_freq_code as _gfc
from pandas.tseries.frequencies import _get_freq_str
code, stride = _gfc(freq)
freq = _get_freq_str(code, stride)
code, stride = frequencies.get_freq_code(freq)
freq = frequencies._get_freq_str(code, stride)

from pandas.tseries.frequencies import to_offset
freq = to_offset(freq)
freq = frequencies.to_offset(freq)

if freq.n <= 0:
raise ValueError('Frequency must be positive, because it'
Expand All @@ -691,9 +683,6 @@ cdef class Period(object):
def __init__(self, value=None, freq=None, ordinal=None,
year=None, month=1, quarter=None, day=1,
hour=0, minute=0, second=0):
from pandas.tseries import frequencies
from pandas.tseries.frequencies import get_freq_code as _gfc

# freq points to a tuple (base, mult); base is one of the defined
# periods such as A, Q, etc. Every five minutes would be, e.g.,
# ('T', 5) but may be passed in as a string like '5T'
Expand All @@ -717,7 +706,7 @@ cdef class Period(object):

elif isinstance(value, Period):
other = value
if freq is None or _gfc(freq) == _gfc(other.freq):
if freq is None or frequencies.get_freq_code(freq) == frequencies.get_freq_code(other.freq):
ordinal = other.ordinal
freq = other.freq
else:
Expand Down Expand Up @@ -758,7 +747,7 @@ cdef class Period(object):
msg = "Value must be Period, string, integer, or datetime"
raise ValueError(msg)

base, mult = _gfc(freq)
base, mult = frequencies.get_freq_code(freq)

if ordinal is None:
self.ordinal = get_period_ordinal(dt.year, dt.month, dt.day,
Expand All @@ -771,7 +760,6 @@ cdef class Period(object):

def __richcmp__(self, other, op):
if isinstance(other, Period):
from pandas.tseries.frequencies import get_freq_code as _gfc
if other.freq != self.freq:
msg = _DIFFERENT_FREQ.format(self.freqstr, other.freqstr)
raise IncompatibleFrequency(msg)
Expand All @@ -790,7 +778,6 @@ cdef class Period(object):
return hash((self.ordinal, self.freq))

def _add_delta(self, other):
from pandas.tseries import frequencies
if isinstance(other, (timedelta, np.timedelta64, offsets.Tick, Timedelta)):
offset = frequencies.to_offset(self.freq.rule_code)
if isinstance(offset, offsets.Tick):
Expand Down Expand Up @@ -868,10 +855,9 @@ cdef class Period(object):
-------
resampled : Period
"""
from pandas.tseries.frequencies import get_freq_code as _gfc
how = _validate_end_alias(how)
base1, mult1 = _gfc(self.freq)
base2, mult2 = _gfc(freq)
base1, mult1 = frequencies.get_freq_code(self.freq)
base2, mult2 = frequencies.get_freq_code(freq)

if self.ordinal == tslib.iNaT:
ordinal = self.ordinal
Expand Down Expand Up @@ -918,23 +904,20 @@ cdef class Period(object):
-------
Timestamp
"""
from pandas.tseries import frequencies
from pandas.tseries.frequencies import get_freq_code as _gfc
how = _validate_end_alias(how)

if freq is None:
base, mult = _gfc(self.freq)
base, mult = frequencies.get_freq_code(self.freq)
freq = frequencies.get_to_timestamp_base(base)

base, mult = _gfc(freq)
base, mult = frequencies.get_freq_code(freq)
val = self.asfreq(freq, how)

dt64 = period_ordinal_to_dt64(val.ordinal, base)
return Timestamp(dt64, tz=tz)

cdef _field(self, alias):
from pandas.tseries.frequencies import get_freq_code as _gfc
base, mult = _gfc(self.freq)
base, mult = frequencies.get_freq_code(self.freq)
return get_period_field(alias, self.ordinal, base)

property year:
Expand Down Expand Up @@ -996,8 +979,7 @@ cdef class Period(object):
return self.freq.freqstr

def __repr__(self):
from pandas.tseries.frequencies import get_freq_code as _gfc
base, mult = _gfc(self.freq)
base, mult = frequencies.get_freq_code(self.freq)
formatted = period_format(self.ordinal, base)
return "Period('%s', '%s')" % (formatted, self.freqstr)

Expand All @@ -1008,8 +990,7 @@ cdef class Period(object):
Invoked by unicode(df) in py2 only. Yields a Unicode String in both
py2/py3.
"""
from pandas.tseries.frequencies import get_freq_code as _gfc
base, mult = _gfc(self.freq)
base, mult = frequencies.get_freq_code(self.freq)
formatted = period_format(self.ordinal, base)
value = ("%s" % formatted)
return value
Expand Down Expand Up @@ -1159,15 +1140,13 @@ cdef class Period(object):
>>> a.strftime('%b. %d, %Y was a %A')
'Jan. 01, 2001 was a Monday'
"""
from pandas.tseries.frequencies import get_freq_code as _gfc
base, mult = _gfc(self.freq)
base, mult = frequencies.get_freq_code(self.freq)
return period_format(self.ordinal, base, fmt)


def _ordinal_from_fields(year, month, quarter, day, hour, minute,
second, freq):
from pandas.tseries.frequencies import get_freq_code as _gfc
base, mult = _gfc(freq)
base, mult = frequencies.get_freq_code(freq)
if quarter is not None:
year, month = _quarter_to_myear(year, quarter, freq)

Expand All @@ -1179,7 +1158,6 @@ def _quarter_to_myear(year, quarter, freq):
if quarter <= 0 or quarter > 4:
raise ValueError('Quarter must be 1 <= q <= 4')

from pandas.tseries import frequencies
mnum = frequencies._month_numbers[frequencies._get_rule_month(freq)] + 1
month = (mnum + (quarter - 1) * 3) % 12 + 1
if month > mnum:
Expand Down
21 changes: 14 additions & 7 deletions pandas/tseries/frequencies.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import pandas.core.common as com
import pandas.lib as lib
import pandas.tslib as tslib
import pandas._period as period
from pandas.tslib import Timedelta
from pytz import AmbiguousTimeError

Expand All @@ -34,16 +33,24 @@ class FreqGroup(object):
FR_NS = 12000


US_RESO = 0
MS_RESO = 1
S_RESO = 2
T_RESO = 3
H_RESO = 4
D_RESO = 5


class Resolution(object):

# aliases of the module-level *_RESO constants above (also read by period.pyx)
# note that these are different from freq codes
RESO_US = period.US_RESO
RESO_MS = period.MS_RESO
RESO_SEC = period.S_RESO
RESO_MIN = period.T_RESO
RESO_HR = period.H_RESO
RESO_DAY = period.D_RESO
RESO_US = US_RESO
RESO_MS = MS_RESO
RESO_SEC = S_RESO
RESO_MIN = T_RESO
RESO_HR = H_RESO
RESO_DAY = D_RESO

_reso_str_map = {
RESO_US: 'microsecond',
Expand Down
15 changes: 9 additions & 6 deletions pandas/tseries/tests/test_tslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
from pandas.tslib import get_timezone
from pandas._period import period_asfreq, period_ordinal
from pandas.tseries.index import date_range, DatetimeIndex
from pandas.tseries.frequencies import get_freq
from pandas.tseries.frequencies import (
get_freq,
US_RESO, MS_RESO, S_RESO, H_RESO, D_RESO, T_RESO
)
import pandas.tseries.tools as tools
import pandas.tseries.offsets as offsets
import pandas.util.testing as tm
Expand Down Expand Up @@ -1307,11 +1310,11 @@ def test_resolution(self):

for freq, expected in zip(['A', 'Q', 'M', 'D', 'H', 'T',
'S', 'L', 'U'],
[period.D_RESO, period.D_RESO,
period.D_RESO, period.D_RESO,
period.H_RESO, period.T_RESO,
period.S_RESO, period.MS_RESO,
period.US_RESO]):
[D_RESO, D_RESO,
D_RESO, D_RESO,
H_RESO, T_RESO,
S_RESO, MS_RESO,
US_RESO]):
for tz in [None, 'Asia/Tokyo', 'US/Eastern',
'dateutil/US/Eastern']:
idx = date_range(start='2013-04-01', periods=30, freq=freq,
Expand Down