diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index c112d1ef72eb8..0f8c8458628b1 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -40,3 +40,46 @@ def setup(self): def test_add_td_ts(self): self.td + self.ts + + +class TimedeltaProperties(object): + goal_time = 0.2 + + def setup(self): + self.td = Timedelta(days=365, minutes=35, seconds=25, milliseconds=35) + + def time_timedelta_days(self): + self.td.days + + def time_timedelta_seconds(self): + self.td.seconds + + def time_timedelta_microseconds(self): + self.td.microseconds + + def time_timedelta_nanoseconds(self): + self.td.nanoseconds + + +class DatetimeAccessor(object): + goal_time = 0.2 + + def setup(self): + self.N = 100000 + self.series = pd.Series( + pd.timedelta_range('1 days', periods=self.N, freq='h') + ) + def time_dt_accessor(self): + self.series.dt + + def time_timedelta_dt_accessor_days(self): + self.series.dt.days + + def time_timedelta_dt_accessor_seconds(self): + self.series.dt.seconds + + def time_timedelta_dt_accessor_microseconds(self): + self.series.dt.microseconds + + def time_timedelta_dt_accessor_nanoseconds(self): + self.series.dt.nanoseconds diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 50f10efb07484..dd5b849b42a08 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -72,6 +72,7 @@ Performance Improvements - Indexers on ``Series`` or ``DataFrame`` no longer create a reference cycle (:issue:`17956`) - Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`) - :class`DateOffset` arithmetic performance is improved (:issue:`18218`) +- Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`) - .. 
_whatsnew_0220.docs: diff --git a/pandas/_libs/src/datetime/np_datetime.c b/pandas/_libs/src/datetime/np_datetime.c index f8254ed9d8418..7278cbaff86ca 100644 --- a/pandas/_libs/src/datetime/np_datetime.c +++ b/pandas/_libs/src/datetime/np_datetime.c @@ -562,6 +562,17 @@ void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, convert_datetime_to_datetimestruct(&meta, val, result); } +void pandas_timedelta_to_timedeltastruct(npy_timedelta val, + PANDAS_DATETIMEUNIT fr, + pandas_timedeltastruct *result) { + pandas_datetime_metadata meta; + + meta.base = fr; + meta.num = 1; + + convert_timedelta_to_timedeltastruct(&meta, val, result); +} + PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj) { return (PANDAS_DATETIMEUNIT)((PyDatetimeScalarObject *)obj)->obmeta.base; } @@ -980,3 +991,107 @@ int convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, return 0; } + +/* + * Converts a timedelta from a timedeltastruct to a timedelta based + * on some metadata. The timedelta is assumed to be valid. + * + * Returns 0 on success, -1 on failure. 
+ */ +int convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, + npy_timedelta td, + pandas_timedeltastruct *out) { + npy_int64 perday; + npy_int64 frac; + npy_int64 sfrac; + npy_int64 ifrac; + int sign; + npy_int64 DAY_NS = 86400000000000LL; + + /* Initialize the output to all zeros */ + memset(out, 0, sizeof(pandas_timedeltastruct)); + + switch (meta->base) { + case PANDAS_FR_ns: + + // put frac in seconds + if (td < 0 && td % (1000LL * 1000LL * 1000LL) != 0) + frac = td / (1000LL * 1000LL * 1000LL) - 1; + else + frac = td / (1000LL * 1000LL * 1000LL); + + if (frac < 0) { + sign = -1; + + // even fraction + if ((-frac % 86400LL) != 0) { + out->days = -frac / 86400LL + 1; + frac += 86400LL * out->days; + } else { + frac = -frac; + } + } else { + sign = 1; + out->days = 0; + } + + if (frac >= 86400) { + out->days += frac / 86400LL; + frac -= out->days * 86400LL; + } + + if (frac >= 3600) { + out->hrs = frac / 3600LL; + frac -= out->hrs * 3600LL; + } else { + out->hrs = 0; + } + + if (frac >= 60) { + out->min = frac / 60LL; + frac -= out->min * 60LL; + } else { + out->min = 0; + } + + if (frac >= 0) { + out->sec = frac; + frac -= out->sec; + } else { + out->sec = 0; + } + + sfrac = (out->hrs * 3600LL + out->min * 60LL + + out->sec) * (1000LL * 1000LL * 1000LL); + + if (sign < 0) + out->days = -out->days; + + ifrac = td - (out->days * DAY_NS + sfrac); + + if (ifrac != 0) { + out->ms = ifrac / (1000LL * 1000LL); + ifrac -= out->ms * 1000LL * 1000LL; + out->us = ifrac / 1000LL; + ifrac -= out->us * 1000LL; + out->ns = ifrac; + } else { + out->ms = 0; + out->us = 0; + out->ns = 0; + } + + out->seconds = out->hrs * 3600 + out->min * 60 + out->sec; + out->microseconds = out->ms * 1000 + out->us; + out->nanoseconds = out->ns; + break; + + default: + PyErr_SetString(PyExc_RuntimeError, + "NumPy datetime metadata is corrupted with invalid " + "base unit"); + return -1; + } + + return 0; +} diff --git a/pandas/_libs/src/datetime/np_datetime.h 
b/pandas/_libs/src/datetime/np_datetime.h index af3d2e0f01c1b..c51a4bddac82f 100644 --- a/pandas/_libs/src/datetime/np_datetime.h +++ b/pandas/_libs/src/datetime/np_datetime.h @@ -49,11 +49,18 @@ typedef struct { npy_int32 month, day, hour, min, sec, us, ps, as; } pandas_datetimestruct; +typedef struct { + npy_int64 days; + npy_int32 hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds; +} pandas_timedeltastruct; + typedef struct { PANDAS_DATETIMEUNIT base; int num; } pandas_datetime_metadata; +typedef pandas_datetime_metadata pandas_timedelta_metadata; + extern const pandas_datetimestruct _NS_MIN_DTS; extern const pandas_datetimestruct _NS_MAX_DTS; @@ -71,6 +78,10 @@ npy_datetime pandas_datetimestruct_to_datetime(PANDAS_DATETIMEUNIT fr, void pandas_datetime_to_datetimestruct(npy_datetime val, PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *result); +void pandas_timedelta_to_timedeltastruct(npy_timedelta val, + PANDAS_DATETIMEUNIT fr, + pandas_timedeltastruct *result); + int dayofweek(int y, int m, int d); extern const int days_per_month_table[2][12]; @@ -131,6 +142,11 @@ convert_datetime_to_datetimestruct(pandas_datetime_metadata *meta, npy_datetime dt, pandas_datetimestruct *out); +int +convert_timedelta_to_timedeltastruct(pandas_timedelta_metadata *meta, + npy_timedelta td, + pandas_timedeltastruct *out); + PANDAS_DATETIMEUNIT get_datetime64_unit(PyObject *obj); diff --git a/pandas/_libs/tslibs/fields.pyx b/pandas/_libs/tslibs/fields.pyx index b40646295cce5..3ab84853dfc4a 100644 --- a/pandas/_libs/tslibs/fields.pyx +++ b/pandas/_libs/tslibs/fields.pyx @@ -17,7 +17,8 @@ from numpy cimport ndarray, int64_t, int32_t, int8_t np.import_array() -from np_datetime cimport pandas_datetimestruct, dt64_to_dtstruct +from np_datetime cimport (pandas_datetimestruct, pandas_timedeltastruct, + dt64_to_dtstruct, td64_to_tdstruct) from datetime cimport ( days_per_month_table, @@ -545,6 +546,123 @@ def get_date_field(ndarray[int64_t] dtindex, object field): raise 
ValueError("Field %s not supported" % field) +@cython.wraparound(False) +@cython.boundscheck(False) +def get_timedelta_field(ndarray[int64_t] tdindex, object field): + """ + Given a int64-based timedelta index, extract the days, hrs, sec., + field and return an array of these values. + """ + cdef: + Py_ssize_t i, count = 0 + ndarray[int32_t] out + pandas_timedeltastruct tds + + count = len(tdindex) + out = np.empty(count, dtype='i4') + + if field == 'days': + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + td64_to_tdstruct(tdindex[i], &tds) + out[i] = tds.days + return out + + elif field == 'h': + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + td64_to_tdstruct(tdindex[i], &tds) + out[i] = tds.hrs + return out + + elif field == 's': + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + td64_to_tdstruct(tdindex[i], &tds) + out[i] = tds.sec + return out + + elif field == 'seconds': + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + td64_to_tdstruct(tdindex[i], &tds) + out[i] = tds.seconds + return out + + elif field == 'ms': + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + td64_to_tdstruct(tdindex[i], &tds) + out[i] = tds.ms + return out + + elif field == 'microseconds': + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + td64_to_tdstruct(tdindex[i], &tds) + out[i] = tds.microseconds + return out + + elif field == 'us': + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + td64_to_tdstruct(tdindex[i], &tds) + out[i] = tds.us + return out + + elif field == 'ns': + with nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + td64_to_tdstruct(tdindex[i], &tds) + out[i] = tds.ns + return out + + elif field == 'nanoseconds': + with 
nogil: + for i in range(count): + if tdindex[i] == NPY_NAT: + out[i] = -1 + continue + + td64_to_tdstruct(tdindex[i], &tds) + out[i] = tds.nanoseconds + return out + + raise ValueError("Field %s not supported" % field) + + cdef inline int days_in_month(pandas_datetimestruct dts) nogil: return days_per_month_table[is_leapyear(dts.year)][dts.month - 1] diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index 1ae0499f90c0d..3692822ada135 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -30,6 +30,10 @@ cdef extern from "../src/datetime/np_datetime.h": int64_t year int32_t month, day, hour, min, sec, us, ps, as + ctypedef struct pandas_timedeltastruct: + int64_t days + int32_t hrs, min, sec, ms, us, ns, seconds, microseconds, nanoseconds + ctypedef enum PANDAS_DATETIMEUNIT: PANDAS_FR_Y PANDAS_FR_M @@ -54,6 +58,7 @@ cdef check_dts_bounds(pandas_datetimestruct *dts) cdef int64_t dtstruct_to_dt64(pandas_datetimestruct* dts) nogil cdef void dt64_to_dtstruct(int64_t dt64, pandas_datetimestruct* out) nogil +cdef void td64_to_tdstruct(int64_t td64, pandas_timedeltastruct* out) nogil cdef int64_t pydatetime_to_dt64(datetime val, pandas_datetimestruct *dts) cdef int64_t pydate_to_dt64(date val, pandas_datetimestruct *dts) diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index abd6c59ea6244..72c028161a937 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -26,6 +26,11 @@ cdef extern from "../src/datetime/np_datetime.h": PANDAS_DATETIMEUNIT fr, pandas_datetimestruct *result) nogil + void pandas_timedelta_to_timedeltastruct(npy_timedelta val, + PANDAS_DATETIMEUNIT fr, + pandas_timedeltastruct *result + ) nogil + pandas_datetimestruct _NS_MIN_DTS, _NS_MAX_DTS # ---------------------------------------------------------------------- @@ -127,6 +132,13 @@ cdef inline void dt64_to_dtstruct(int64_t dt64, 
pandas_datetime_to_datetimestruct(dt64, PANDAS_FR_ns, out) return +cdef inline void td64_to_tdstruct(int64_t td64, + pandas_timedeltastruct* out) nogil: + """Convenience function to call pandas_timedelta_to_timedeltastruct + with the by-far-most-common frequency PANDAS_FR_ns""" + pandas_timedelta_to_timedeltastruct(td64, PANDAS_FR_ns, out) + return + cdef inline int64_t pydatetime_to_dt64(datetime val, pandas_datetimestruct *dts): diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 869ff5ee77bda..aba213122ea31 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -26,7 +26,8 @@ from util cimport (is_timedelta64_object, is_datetime64_object, is_integer_object, is_float_object, is_string_object) -from np_datetime cimport cmp_scalar, reverse_ops +from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct, + pandas_timedeltastruct) from nattype import nat_strings, NaT from nattype cimport _checknull_with_nat @@ -584,65 +585,26 @@ cdef class _Timedelta(timedelta): """ compute the components """ - cdef int64_t sfrac, ifrac, frac, ivalue = self.value - if self.is_populated: return - # put frac in seconds - frac = ivalue / (1000 * 1000 * 1000) - if frac < 0: - self._sign = -1 + cdef: + pandas_timedeltastruct tds - # even fraction - if (-frac % 86400) != 0: - self._d = -frac / 86400 + 1 - frac += 86400 * self._d - else: - frac = -frac + td64_to_tdstruct(self.value, &tds) + self._d = tds.days + if self._d < 0: + self._sign = -1 else: self._sign = 1 - self._d = 0 - - if frac >= 86400: - self._d += frac / 86400 - frac -= self._d * 86400 - - if frac >= 3600: - self._h = frac / 3600 - frac -= self._h * 3600 - else: - self._h = 0 - - if frac >= 60: - self._m = frac / 60 - frac -= self._m * 60 - else: - self._m = 0 - - if frac >= 0: - self._s = frac - frac -= self._s - else: - self._s = 0 - - sfrac = (self._h * 3600 + self._m * 60 - + self._s) * (1000 * 1000 * 1000) - if self._sign < 0: - ifrac = 
ivalue + self._d * DAY_NS - sfrac - else: - ifrac = ivalue - (self._d * DAY_NS + sfrac) - - if ifrac != 0: - self._ms = ifrac / (1000 * 1000) - ifrac -= self._ms * 1000 * 1000 - self._us = ifrac / 1000 - ifrac -= self._us * 1000 - self._ns = ifrac - else: - self._ms = 0 - self._us = 0 - self._ns = 0 + self._h = tds.hrs + self._m = tds.min + self._s = tds.sec + self._ms = tds.ms + self._us = tds.us + self._ns = tds.ns + self._seconds = tds.seconds + self._microseconds = tds.microseconds self.is_populated = 1 @@ -671,10 +633,6 @@ cdef class _Timedelta(timedelta): def components(self): """ Return a Components NamedTuple-like """ self._ensure_components() - if self._sign < 0: - return Components(-self._d, self._h, self._m, self._s, - self._ms, self._us, self._ns) - # return the named tuple return Components(self._d, self._h, self._m, self._s, self._ms, self._us, self._ns) @@ -717,8 +675,6 @@ cdef class _Timedelta(timedelta): .components will return the shown components """ self._ensure_components() - if self._sign < 0: - return -1 * self._d return self._d @property @@ -729,7 +685,7 @@ cdef class _Timedelta(timedelta): .components will return the shown components """ self._ensure_components() - return self._h * 3600 + self._m * 60 + self._s + return self._seconds @property def microseconds(self): @@ -739,7 +695,7 @@ cdef class _Timedelta(timedelta): .components will return the shown components """ self._ensure_components() - return self._ms * 1000 + self._us + return self._microseconds @property def nanoseconds(self): @@ -778,9 +734,9 @@ cdef class _Timedelta(timedelta): if format == 'all': seconds_pretty = "%02d.%03d%03d%03d" % ( self._s, self._ms, self._us, self._ns) - return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, - sign2_pretty, self._h, - self._m, seconds_pretty) + return "%d days%s%02d:%02d:%s" % (self._d, + sign2_pretty, self._h, + self._m, seconds_pretty) # by default not showing nano if self._ms or self._us or self._ns: @@ -794,7 +750,7 @@ cdef class 
_Timedelta(timedelta): if format == 'even_day': if not subs: - return "%s%d days" % (sign_pretty, self._d) + return "%d days" % (self._d) elif format == 'sub_day': if not self._d: @@ -806,10 +762,10 @@ cdef class _Timedelta(timedelta): self._h, self._m, seconds_pretty) if subs or format=='long': - return "%s%d days%s%02d:%02d:%s" % (sign_pretty, self._d, - sign2_pretty, self._h, - self._m, seconds_pretty) - return "%s%d days" % (sign_pretty, self._d) + return "%d days%s%02d:%02d:%s" % (self._d, + sign2_pretty, self._h, + self._m, seconds_pretty) + return "%d days" % (self._d) def __repr__(self): return "Timedelta('{0}')".format(self._repr_base(format='long')) diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 604af1cfd678a..e4bc46fb7bdbe 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -35,20 +35,15 @@ from pandas._libs import (lib, index as libindex, tslib as libts, join as libjoin, Timedelta, NaT, iNaT) from pandas._libs.tslibs.timedeltas import array_to_timedelta64 +from pandas._libs.tslibs.fields import get_timedelta_field def _field_accessor(name, alias, docstring=None): def f(self): + values = self.asi8 + result = get_timedelta_field(values, alias) if self.hasnans: - result = np.empty(len(self), dtype='float64') - mask = self._isnan - imask = ~mask - result.flat[imask] = np.array([getattr(Timedelta(val), alias) - for val in self.asi8[imask]]) - result[mask] = np.nan - else: - result = np.array([getattr(Timedelta(val), alias) - for val in self.asi8], dtype='int64') + result = self._maybe_mask_results(result, convert='float64') return Index(result, name=self.name)