Skip to content

PERF: Vectorized Timedelta property access (#18092) #18225

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 12, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions asv_bench/benchmarks/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,46 @@ def setup(self):

def test_add_td_ts(self):
self.td + self.ts


class TimedeltaProperties(object):
    """Benchmarks for reading component properties of a scalar ``Timedelta``."""

    goal_time = 0.2

    def setup(self):
        """Build the single ``Timedelta`` that every benchmark method reads."""
        self.td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35)

    def time_timedelta_days(self):
        """Access ``Timedelta.days``; the value is discarded."""
        self.td.days

    def time_timedelta_seconds(self):
        """Access ``Timedelta.seconds``; the value is discarded."""
        self.td.seconds

    def time_timedelta_microseconds(self):
        """Access ``Timedelta.microseconds``; the value is discarded."""
        self.td.microseconds

    def time_timedelta_nanoseconds(self):
        """Access ``Timedelta.nanoseconds``; the value is discarded."""
        self.td.nanoseconds


class DatetimeAccessor(object):
    """Benchmarks for the ``.dt`` accessor on a timedelta-valued ``Series``."""

    goal_time = 0.2

    def setup(self):
        """Build a Series of ``self.N`` hourly-spaced timedeltas starting at 1 day."""
        self.N = 100000
        self.series = pd.Series(
            pd.timedelta_range('1 days', periods=self.N, freq='h')
        )

    def time_dt_accessor(self):
        """Construct the ``.dt`` accessor only; no field is read."""
        self.series.dt

    def time_timedelta_dt_accessor_days(self):
        """Read ``.dt.days`` for the whole Series; the result is discarded."""
        self.series.dt.days

    def time_timedelta_dt_accessor_seconds(self):
        """Read ``.dt.seconds`` for the whole Series; the result is discarded."""
        self.series.dt.seconds

    def time_timedelta_dt_accessor_microseconds(self):
        """Read ``.dt.microseconds`` for the whole Series; the result is discarded."""
        self.series.dt.microseconds

    def time_timedelta_dt_accessor_nanoseconds(self):
        """Read ``.dt.nanoseconds`` for the whole Series; the result is discarded."""
        self.series.dt.nanoseconds
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ Performance Improvements
- Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`)
- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`)
- :class:`DateOffset` arithmetic performance is improved (:issue:`18218`)
- Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc. is now faster through vectorization of the underlying methods (:issue:`18092`)
-

.. _whatsnew_0220.docs:
Expand Down
115 changes: 115 additions & 0 deletions pandas/_libs/src/datetime/np_datetime.c
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,17 @@ void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr,
convert_datetime_to_datetimestruct(&meta, val, result);
}

/*
 * Break the timedelta `val`, expressed in unit `fr`, into its components.
 *
 * Fills a metadata record with base unit `fr` and num 1, then delegates to
 * convert_timedelta_to_timedeltastruct(), which writes the fields of
 * `result`. This function returns void, so the delegate's status code is
 * not propagated to the caller.
 */
void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
                                         PANDAS_DATETIMEUNIT fr,
                                         pandas_timedeltastruct *result) {
    pandas_datetime_metadata meta;

    meta.base = fr;
    /* Must be an assignment: `meta.num - 1;` left the field uninitialized. */
    meta.num = 1;

    convert_timedelta_to_timedeltastruct(&meta, val, result);
}

/*
 * Return the base unit recorded in the `obmeta` of `obj`.
 *
 * `obj` is reinterpreted as a PyDatetimeScalarObject without any type
 * check, so the caller must pass an object of that layout.
 */
PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) {
    PyDatetimeScalarObject *scalar = (PyDatetimeScalarObject *)obj;

    return (PANDAS_DATETIMEUNIT)scalar->obmeta.base;
}
Expand Down Expand Up @@ -980,3 +991,107 @@ int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta,

return 0;
}

/*
 * Converts a timedelta value to a timedeltastruct based on some metadata.
 * The timedelta is assumed to be valid.
 *
 * Only a base unit of PANDAS_FR_ns is handled. `out` is zeroed first and
 * then filled so that `days` carries the sign (rounded toward negative
 * infinity) while every other component is non-negative; e.g. -1 ns
 * becomes days = -1, 23:59:59, ms = us = ns = 999.
 *
 * Returns 0 on success. For any other base unit a Python RuntimeError is
 * set and -1 is returned (with `out` left zeroed).
 */
int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta,
                                         npy_timedelta td,
                                         pandas_timedeltastruct *out) {
    npy_int64 frac;   /* whole seconds still to be distributed */
    npy_int64 sfrac;  /* hrs/min/sec part converted back to nanoseconds */
    npy_int64 ifrac;  /* sub-second remainder in nanoseconds */
    int sign;
    const npy_int64 DAY_NS = 86400000000000LL;

    /* Initialize the output to all zeros */
    memset(out, 0, sizeof(pandas_timedeltastruct));

    switch (meta->base) {
        case PANDAS_FR_ns:

            // put frac in seconds, rounding toward negative infinity
            // (C integer division truncates toward zero)
            if (td < 0 && td % (1000LL * 1000LL * 1000LL) != 0)
                frac = td / (1000LL * 1000LL * 1000LL) - 1;
            else
                frac = td / (1000LL * 1000LL * 1000LL);

            if (frac < 0) {
                sign = -1;

                // not a whole number of days: borrow one extra day so the
                // seconds left over within the day are non-negative
                if ((-frac % 86400LL) != 0) {
                    out->days = -frac / 86400LL + 1;
                    frac += 86400LL * out->days;
                } else {
                    // whole number of days: work on the magnitude; the
                    // day count is extracted by the block below
                    frac = -frac;
                }
            } else {
                sign = 1;
                out->days = 0;
            }

            if (frac >= 86400) {
                out->days += frac / 86400LL;
                frac -= out->days * 86400LL;
            }

            if (frac >= 3600) {
                out->hrs = frac / 3600LL;
                frac -= out->hrs * 3600LL;
            } else {
                out->hrs = 0;
            }

            if (frac >= 60) {
                out->min = frac / 60LL;
                frac -= out->min * 60LL;
            } else {
                out->min = 0;
            }

            if (frac >= 0) {
                out->sec = frac;
                frac -= out->sec;
            } else {
                out->sec = 0;
            }

            sfrac = (out->hrs * 3600LL + out->min * 60LL
                     + out->sec) * (1000LL * 1000LL * 1000LL);

            // days was accumulated as a magnitude; apply the sign now
            if (sign < 0)
                out->days = -out->days;

            // whatever the day and h/m/s parts do not account for is the
            // sub-second remainder, in nanoseconds
            ifrac = td - (out->days * DAY_NS + sfrac);

            if (ifrac != 0) {
                out->ms = ifrac / (1000LL * 1000LL);
                ifrac -= out->ms * 1000LL * 1000LL;
                out->us = ifrac / 1000LL;
                ifrac -= out->us * 1000LL;
                out->ns = ifrac;
            } else {
                out->ms = 0;
                out->us = 0;
                out->ns = 0;
            }

            // aggregate fields derived from the components above
            out->seconds = out->hrs * 3600 + out->min * 60 + out->sec;
            out->microseconds = out->ms * 1000 + out->us;
            out->nanoseconds = out->ns;
            break;

        default:
            PyErr_SetString(PyExc_RuntimeError,
                            "NumPy datetime metadata is corrupted with invalid "
                            "base unit");
            return -1;
    }

    return 0;
}
16 changes: 16 additions & 0 deletions pandas/_libs/src/datetime/np_datetime.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,18 @@ typedef struct {
npy_int32 month, day, hour, min, sec, us, ps, as;
} pandas_datetimestruct;

/*
 * Broken-down timedelta as filled in by
 * convert_timedelta_to_timedeltastruct().
 */
typedef struct {
    npy_int64 days;
    /* individual components */
    npy_int32 hrs;
    npy_int32 min;
    npy_int32 sec;
    npy_int32 ms;
    npy_int32 us;
    npy_int32 ns;
    /* aggregates of the components above */
    npy_int32 seconds;      /* hrs * 3600 + min * 60 + sec */
    npy_int32 microseconds; /* ms * 1000 + us */
    npy_int32 nanoseconds;  /* same value as ns */
} pandas_timedeltastruct;

/* Unit metadata passed by pointer to the convert_*_to_*struct routines. */
typedef struct {
/* base unit; the timedelta conversion switches on this field */
PANDAS_DATETIMEUNIT base;
/* NOTE(review): presumably a multiplier applied to `base` -- confirm */
int num;
} pandas_datetime_metadata;

/* Timedelta conversions share the datetime metadata layout. */
typedef pandas_datetime_metadata pandas_timedelta_metadata;

extern const pandas_datetimestruct _NS_MIN_DTS;
extern const pandas_datetimestruct _NS_MAX_DTS;

Expand All @@ -71,6 +78,10 @@ npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr,
void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr,
pandas_datetimestruct *result);

/*
 * Break the timedelta `val`, expressed in unit `fr`, into the fields of
 * `result`. Returns nothing; no status code is reported to the caller.
 */
void pandas_timedelta_to_timedeltastruct(npy_timedelta val,
PANDAS_DATETIMEUNIT fr,
pandas_timedeltastruct *result);

int dayofweek(int y, int m, int d);

extern const int days_per_month_table[2][12];
Expand Down Expand Up @@ -131,6 +142,11 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta,
npy_datetime dt,
pandas_datetimestruct *out);

/*
 * Converts the timedelta `td` to a timedeltastruct, using the base unit in
 * `meta`. `out` is zeroed and then filled.
 *
 * Returns 0 on success, -1 on failure.
 */
int
convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta,
npy_timedelta td,
pandas_timedeltastruct *out);


PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj);

Expand Down
120 changes: 119 additions & 1 deletion pandas/_libs/tslibs/fields.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ from numpy cimport ndarray, int64_t, int32_t, int8_t
np.import_array()


from np_datetime cimport pandas_datetimestruct, dt64_to_dtstruct
from np_datetime cimport (pandas_datetimestruct, pandas_timedeltastruct,
dt64_to_dtstruct, td64_to_tdstruct)

from datetime cimport (
days_per_month_table,
Expand Down Expand Up @@ -545,6 +546,123 @@ def get_date_field(ndarray[int64_t] dtindex, object field):
raise ValueError("Field %s not supported" % field)


@cython.wraparound(False)
@cython.boundscheck(False)
def get_timedelta_field(ndarray[int64_t] tdindex, object field):
    """
    Given an int64-based timedelta index, extract one field from every
    element and return the values as an int32 array.

    Parameters
    ----------
    tdindex : ndarray[int64_t]
        Timedelta values; each one is passed to ``td64_to_tdstruct``.
    field : object
        One of 'days', 'h', 's', 'seconds', 'ms', 'microseconds', 'us',
        'ns' or 'nanoseconds'.

    Returns
    -------
    ndarray[int32_t]
        Same length as ``tdindex``. Positions equal to ``NPY_NAT`` hold -1.

    Raises
    ------
    ValueError
        If ``field`` is not one of the supported names.
    """
    cdef:
        Py_ssize_t i, count = 0
        ndarray[int32_t] out
        pandas_timedeltastruct tds

    count = len(tdindex)
    out = np.empty(count, dtype='i4')

    # Each branch runs its own loop so the field comparison happens once
    # and the per-element work can run with the GIL released.
    if field == 'days':
        with nogil:
            for i in range(count):
                if tdindex[i] == NPY_NAT:
                    out[i] = -1
                    continue

                td64_to_tdstruct(tdindex[i], &tds)
                out[i] = tds.days
        return out

    elif field == 'h':
        with nogil:
            for i in range(count):
                if tdindex[i] == NPY_NAT:
                    out[i] = -1
                    continue

                td64_to_tdstruct(tdindex[i], &tds)
                out[i] = tds.hrs
        return out

    elif field == 's':
        with nogil:
            for i in range(count):
                if tdindex[i] == NPY_NAT:
                    out[i] = -1
                    continue

                td64_to_tdstruct(tdindex[i], &tds)
                out[i] = tds.sec
        return out

    elif field == 'seconds':
        with nogil:
            for i in range(count):
                if tdindex[i] == NPY_NAT:
                    out[i] = -1
                    continue

                td64_to_tdstruct(tdindex[i], &tds)
                out[i] = tds.seconds
        return out

    elif field == 'ms':
        with nogil:
            for i in range(count):
                if tdindex[i] == NPY_NAT:
                    out[i] = -1
                    continue

                td64_to_tdstruct(tdindex[i], &tds)
                out[i] = tds.ms
        return out

    elif field == 'microseconds':
        with nogil:
            for i in range(count):
                if tdindex[i] == NPY_NAT:
                    out[i] = -1
                    continue

                td64_to_tdstruct(tdindex[i], &tds)
                out[i] = tds.microseconds
        return out

    elif field == 'us':
        with nogil:
            for i in range(count):
                if tdindex[i] == NPY_NAT:
                    out[i] = -1
                    continue

                td64_to_tdstruct(tdindex[i], &tds)
                out[i] = tds.us
        return out

    elif field == 'ns':
        with nogil:
            for i in range(count):
                if tdindex[i] == NPY_NAT:
                    out[i] = -1
                    continue

                td64_to_tdstruct(tdindex[i], &tds)
                out[i] = tds.ns
        return out

    elif field == 'nanoseconds':
        with nogil:
            for i in range(count):
                if tdindex[i] == NPY_NAT:
                    out[i] = -1
                    continue

                td64_to_tdstruct(tdindex[i], &tds)
                out[i] = tds.nanoseconds
        return out

    raise ValueError("Field %s not supported" % field)


cdef inline int days_in_month(pandas_datetimestruct dts) nogil:
    # Row is chosen by is_leapyear(dts.year); the column is the month
    # shifted to a 0-based index.
    return days_per_month_table[is_leapyear(dts.year)][dts.month - 1]

Expand Down
5 changes: 5 additions & 0 deletions pandas/_libs/tslibs/np_datetime.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ cdef extern from "../src/datetime/np_datetime.h":
int64_t year
int32_t month, day, hour, min, sec, us, ps, as

ctypedef struct pandas_timedeltastruct:
    # Field list matches the C declaration of pandas_timedeltastruct.
    int64_t days
    int32_t hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds

ctypedef enum PANDAS_DATETIMEUNIT:
PANDAS_FR_Y
PANDAS_FR_M
Expand All @@ -54,6 +58,7 @@ cdef check_dts_bounds(pandas_datetimestruct *dts)

cdef int64_t dtstruct_to_dt64(pandas_datetimestruct* dts) nogil
cdef void dt64_to_dtstruct(int64_t dt64, pandas_datetimestruct* out) nogil
cdef void td64_to_tdstruct(int64_t td64, pandas_timedeltastruct* out) nogil

cdef int64_t pydatetime_to_dt64(datetime val, pandas_datetimestruct *dts)
cdef int64_t pydate_to_dt64(date val, pandas_datetimestruct *dts)
Expand Down
Loading