Skip to content

PERF: vectorized DateOffset with months #11205

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 2, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1056,6 +1056,7 @@ Performance Improvements
- 2x improvement of ``Series.value_counts`` for float dtype (:issue:`10821`)
- Enable ``infer_datetime_format`` in ``to_datetime`` when date components do not have 0 padding (:issue:`11142`)
- Regression from 0.16.1 in constructing ``DataFrame`` from nested dictionary (:issue:`11084`)
- Performance improvements in addition/subtraction operations for ``DateOffset`` with ``Series`` or ``DatetimeIndex`` (:issue:`10744`, :issue:`11205`)

.. _whatsnew_0170.bug_fixes:

Expand Down
18 changes: 9 additions & 9 deletions pandas/tseries/offsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,16 +261,12 @@ def apply_index(self, i):
# relativedelta/_offset path only valid for base DateOffset
if (self._use_relativedelta and
set(self.kwds).issubset(relativedelta_fast)):

months = ((self.kwds.get('years', 0) * 12
+ self.kwds.get('months', 0)) * self.n)
if months:
base = (i.to_period('M') + months).to_timestamp()
time = i.to_perioddelta('D')
days = i.to_perioddelta('M') - time
# minimum prevents month-end from wrapping
day_offset = np.minimum(days,
to_timedelta(base.days_in_month - 1, unit='D'))
i = base + day_offset + time
shifted = tslib.shift_months(i.asi8, months)
i = i._shallow_copy(shifted)

weeks = (self.kwds.get('weeks', 0)) * self.n
if weeks:
Expand Down Expand Up @@ -1081,7 +1077,9 @@ def apply(self, other):

@apply_index_wraps
def apply_index(self, i):
return self._end_apply_index(i, 'M')
months = self.n - 1 if self.n >= 0 else self.n
shifted = tslib.shift_months(i.asi8, months, 'end')
return i._shallow_copy(shifted)

def onOffset(self, dt):
if self.normalize and not _is_normalized(dt):
Expand All @@ -1106,7 +1104,9 @@ def apply(self, other):

@apply_index_wraps
def apply_index(self, i):
return self._beg_apply_index(i, 'M')
months = self.n + 1 if self.n < 0 else self.n
shifted = tslib.shift_months(i.asi8, months, 'start')
return i._shallow_copy(shifted)

def onOffset(self, dt):
if self.normalize and not _is_normalized(dt):
Expand Down
14 changes: 7 additions & 7 deletions pandas/tseries/tests/test_timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -2565,32 +2565,32 @@ def test_datetime64_with_DateOffset(self):
for klass, assert_func in zip([Series, DatetimeIndex],
[self.assert_series_equal,
tm.assert_index_equal]):
s = klass(date_range('2000-01-01', '2000-01-31'))
s = klass(date_range('2000-01-01', '2000-01-31'), name='a')
result = s + pd.DateOffset(years=1)
result2 = pd.DateOffset(years=1) + s
exp = klass(date_range('2001-01-01', '2001-01-31'))
exp = klass(date_range('2001-01-01', '2001-01-31'), name='a')
assert_func(result, exp)
assert_func(result2, exp)

result = s - pd.DateOffset(years=1)
exp = klass(date_range('1999-01-01', '1999-01-31'))
exp = klass(date_range('1999-01-01', '1999-01-31'), name='a')
assert_func(result, exp)

s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'),
pd.Timestamp('2000-02-15', tz='US/Central')])
pd.Timestamp('2000-02-15', tz='US/Central')], name='a')
result = s + pd.offsets.Day()
result2 = pd.offsets.Day() + s
exp = klass([Timestamp('2000-01-16 00:15:00', tz='US/Central'),
Timestamp('2000-02-16', tz='US/Central')])
Timestamp('2000-02-16', tz='US/Central')], name='a')
assert_func(result, exp)
assert_func(result2, exp)

s = klass([Timestamp('2000-01-15 00:15:00', tz='US/Central'),
pd.Timestamp('2000-02-15', tz='US/Central')])
pd.Timestamp('2000-02-15', tz='US/Central')], name='a')
result = s + pd.offsets.MonthEnd()
result2 = pd.offsets.MonthEnd() + s
exp = klass([Timestamp('2000-01-31 00:15:00', tz='US/Central'),
Timestamp('2000-02-29', tz='US/Central')])
Timestamp('2000-02-29', tz='US/Central')], name='a')
assert_func(result, exp)
assert_func(result2, exp)

Expand Down
11 changes: 11 additions & 0 deletions pandas/tseries/tests/test_tslib.py
Original file line number Diff line number Diff line change
Expand Up @@ -949,6 +949,17 @@ def compare_local_to_utc(tz_didx, utc_didx):
tslib.maybe_get_tz('Asia/Tokyo'))
self.assert_numpy_array_equal(result, np.array([tslib.iNaT], dtype=np.int64))

def test_shift_months(self):
s = DatetimeIndex([Timestamp('2000-01-05 00:15:00'), Timestamp('2000-01-31 00:23:00'),
Timestamp('2000-01-01'), Timestamp('2000-02-29'), Timestamp('2000-12-31')])
for years in [-1, 0, 1]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a name in the test for the dti

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

didn't add here because the tslib method just operates on the ints, but I did add names to some of the existing DateOffset tests

for months in [-2, 0, 2]:
actual = DatetimeIndex(tslib.shift_months(s.asi8, years * 12 + months))
expected = DatetimeIndex([x + offsets.DateOffset(years=years, months=months) for x in s])
tm.assert_index_equal(actual, expected)



class TestTimestampOps(tm.TestCase):
def test_timestamp_and_datetime(self):
self.assertEqual((Timestamp(datetime.datetime(2013, 10, 13)) - datetime.datetime(2013, 10, 12)).days, 1)
Expand Down
70 changes: 70 additions & 0 deletions pandas/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -3847,6 +3847,7 @@ def get_time_micros(ndarray[int64_t] dtindex):

return micros


@cython.wraparound(False)
def get_date_field(ndarray[int64_t] dtindex, object field):
'''
Expand Down Expand Up @@ -4386,6 +4387,75 @@ cpdef normalize_date(object dt):
raise TypeError('Unrecognized type: %s' % type(dt))


cdef inline int _year_add_months(pandas_datetimestruct dts,
int months):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a doc-string to these

'''new year number after shifting pandas_datetimestruct number of months'''
return dts.year + (dts.month + months - 1) / 12

cdef inline int _month_add_months(pandas_datetimestruct dts,
                                  int months):
    '''
    Month number that results from moving ``dts.month`` by ``months``
    months; a zero remainder modulo 12 is reported as 12 (December).
    '''
    cdef int wrapped = (dts.month + months) % 12
    # remainder 0 stands for December, not a "month zero"
    if wrapped == 0:
        return 12
    return wrapped

@cython.wraparound(False)
def shift_months(int64_t[:] dtindex, int months, object day=None):
'''
Given an int64-based datetime index, shift all elements
specified number of months using DateOffset semantics

day: {None, 'start', 'end'}
* None: day of month
* 'start' 1st day of month
* 'end' last day of month
'''
cdef:
Py_ssize_t i
int days_in_month
pandas_datetimestruct dts
int count = len(dtindex)
int64_t[:] out = np.empty(count, dtype='int64')

for i in range(count):
if dtindex[i] == NPY_NAT:
out[i] = NPY_NAT
else:
pandas_datetime_to_datetimestruct(dtindex[i], PANDAS_FR_ns, &dts)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this routine could actually be used in the Dateoffset for adding months (possibly dealing with month end/start as well)

could do in another PR though


if day is None:
dts.year = _year_add_months(dts, months)
dts.month = _month_add_months(dts, months)
#prevent day from wrapping around month end
days_in_month = days_per_month_table[is_leapyear(dts.year)][dts.month-1]
dts.day = min(dts.day, days_in_month)
elif day == 'start':
dts.year = _year_add_months(dts, months)
dts.month = _month_add_months(dts, months)

# offset semantics - when subtracting if at the start anchor
# point, shift back by one more month
if months <= 0 and dts.day == 1:
dts.year = _year_add_months(dts, -1)
dts.month = _month_add_months(dts, -1)
else:
dts.day = 1
elif day == 'end':
days_in_month = days_per_month_table[is_leapyear(dts.year)][dts.month-1]
dts.year = _year_add_months(dts, months)
dts.month = _month_add_months(dts, months)

# similar semantics - when adding shift forward by one
# month if already at an end of month
if months >= 0 and dts.day == days_in_month:
dts.year = _year_add_months(dts, 1)
dts.month = _month_add_months(dts, 1)

days_in_month = days_per_month_table[is_leapyear(dts.year)][dts.month-1]
dts.day = days_in_month

out[i] = pandas_datetimestruct_to_datetime(PANDAS_FR_ns, &dts)
return np.asarray(out)

#----------------------------------------------------------------------
# Don't even ask

Expand Down