Skip to content

Commit e477eee

Browse files
authored
Remove undefined behavior from npy_datetimestruct_to_datetime (#55151)
* refactor npy_datetimestruct_to_datetime * Used builtin overflow directives * macro fixups * more macro cleanups * more macro refactor * musllinux c support * macro cleanup * more refactor * fix cython warning * windows fix * Raise Outofboundsdatetime * cleanup GIL warnings * more error handling cleanup * updates * error message update * error fixups * test fixup * clang-format * updates * fixed error message * try nogil * revert offsets changes * simplified error handling * period fixup * fixed test failure * try speedup * updated benchmark * revert noexcepts * shared function for dts formatting * import -> cimport * pass address * typo * remove comment * cdef object -> str
1 parent 55d78ca commit e477eee

File tree

12 files changed

+323
-123
lines changed

12 files changed

+323
-123
lines changed

asv_bench/benchmarks/tslibs/period.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,11 @@ def setup(self, size, freq, tz):
151151
# tzlocal is cumbersomely slow, so skip to keep runtime in check
152152
raise NotImplementedError
153153

154-
arr = np.arange(10, dtype="i8").repeat(size // 10)
154+
# we pick 2**55 because smaller values end up returning
155+
# -1 from npy_datetimestruct_to_datetime with NPY_FR_Y frequency
156+
# this artificially slows down functions since -1 is also the
157+
# error sentinel
158+
arr = np.arange(2**55, 2**55 + 10, dtype="i8").repeat(size // 10)
155159
self.i8values = arr
156160

157161
def time_dt64arr_to_periodarr(self, size, freq, tz):

pandas/_libs/src/vendored/numpy/datetime/np_datetime.c

+230-85
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,58 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
2929
#include <numpy/arrayscalars.h>
3030
#include <numpy/ndarraytypes.h>
3131

32+
#if defined(_WIN32)
33+
#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
34+
#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
35+
#endif
36+
#include <intsafe.h>
37+
#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
38+
#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
39+
#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
40+
#else
41+
#if defined __has_builtin
42+
#if __has_builtin(__builtin_add_overflow)
43+
#if _LP64 || __LP64__ || _ILP64 || __ILP64__
44+
#define checked_int64_add(a, b, res) __builtin_saddl_overflow(a, b, res)
45+
#define checked_int64_sub(a, b, res) __builtin_ssubl_overflow(a, b, res)
46+
#define checked_int64_mul(a, b, res) __builtin_smull_overflow(a, b, res)
47+
#else
48+
#define checked_int64_add(a, b, res) __builtin_saddll_overflow(a, b, res)
49+
#define checked_int64_sub(a, b, res) __builtin_ssubll_overflow(a, b, res)
50+
#define checked_int64_mul(a, b, res) __builtin_smulll_overflow(a, b, res)
51+
#endif
52+
#else
53+
_Static_assert(0,
54+
"Overflow checking not detected; please try a newer compiler");
55+
#endif
56+
// __has_builtin was added in gcc 10, but our musllinux_1_1 build environment
57+
// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
58+
#elif __GNUC__ > 7
59+
#if _LP64 || __LP64__ || _ILP64 || __ILP64__
60+
#define checked_int64_add(a, b, res) __builtin_saddl_overflow(a, b, res)
61+
#define checked_int64_sub(a, b, res) __builtin_ssubl_overflow(a, b, res)
62+
#define checked_int64_mul(a, b, res) __builtin_smull_overflow(a, b, res)
63+
#else
64+
#define checked_int64_add(a, b, res) __builtin_saddll_overflow(a, b, res)
65+
#define checked_int64_sub(a, b, res) __builtin_ssubll_overflow(a, b, res)
66+
#define checked_int64_mul(a, b, res) __builtin_smulll_overflow(a, b, res)
67+
#endif
68+
#else
69+
_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
70+
#endif
71+
#endif
72+
73+
#define PD_CHECK_OVERFLOW(FUNC) \
74+
do { \
75+
if ((FUNC) != 0) { \
76+
PyGILState_STATE gstate = PyGILState_Ensure(); \
77+
PyErr_SetString(PyExc_OverflowError, \
78+
"Overflow occurred in npy_datetimestruct_to_datetime"); \
79+
PyGILState_Release(gstate); \
80+
return -1; \
81+
} \
82+
} while (0)
83+
3284
const int days_per_month_table[2][12] = {
3385
{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
3486
{31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
@@ -299,96 +351,189 @@ PyObject *extract_utc_offset(PyObject *obj) {
299351
return tmp;
300352
}
301353

354+
static inline int scaleYearToEpoch(int64_t year, int64_t *result) {
355+
return checked_int64_sub(year, 1970, result);
356+
}
357+
358+
static inline int scaleYearsToMonths(int64_t years, int64_t *result) {
359+
return checked_int64_mul(years, 12, result);
360+
}
361+
362+
static inline int scaleDaysToWeeks(int64_t days, int64_t *result) {
363+
if (days >= 0) {
364+
*result = days / 7;
365+
return 0;
366+
} else {
367+
int res;
368+
int64_t checked_days;
369+
if ((res = checked_int64_sub(days, 6, &checked_days))) {
370+
return res;
371+
}
372+
373+
*result = checked_days / 7;
374+
return 0;
375+
}
376+
}
377+
378+
static inline int scaleDaysToHours(int64_t days, int64_t *result) {
379+
return checked_int64_mul(days, 24, result);
380+
}
381+
382+
static inline int scaleHoursToMinutes(int64_t hours, int64_t *result) {
383+
return checked_int64_mul(hours, 60, result);
384+
}
385+
386+
static inline int scaleMinutesToSeconds(int64_t minutes, int64_t *result) {
387+
return checked_int64_mul(minutes, 60, result);
388+
}
389+
390+
static inline int scaleSecondsToMilliseconds(int64_t seconds, int64_t *result) {
391+
return checked_int64_mul(seconds, 1000, result);
392+
}
393+
394+
static inline int scaleSecondsToMicroseconds(int64_t seconds, int64_t *result) {
395+
return checked_int64_mul(seconds, 1000000, result);
396+
}
397+
398+
static inline int scaleMicrosecondsToNanoseconds(int64_t microseconds,
399+
int64_t *result) {
400+
return checked_int64_mul(microseconds, 1000, result);
401+
}
402+
403+
static inline int scaleMicrosecondsToPicoseconds(int64_t microseconds,
404+
int64_t *result) {
405+
return checked_int64_mul(microseconds, 1000000, result);
406+
}
407+
408+
static inline int64_t scalePicosecondsToFemtoseconds(int64_t picoseconds,
409+
int64_t *result) {
410+
return checked_int64_mul(picoseconds, 1000, result);
411+
}
412+
413+
static inline int64_t scalePicosecondsToAttoseconds(int64_t picoseconds,
414+
int64_t *result) {
415+
return checked_int64_mul(picoseconds, 1000000, result);
416+
}
417+
302418
/*
303419
* Converts a datetime from a datetimestruct to a datetime based
304-
* on a metadata unit. The date is assumed to be valid.
420+
* on a metadata unit. Returns -1 and sets a Python exception on error.
305421
*/
306422
npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
307423
const npy_datetimestruct *dts) {
308-
npy_datetime ret;
309-
310-
if (base == NPY_FR_Y) {
311-
/* Truncate to the year */
312-
ret = dts->year - 1970;
313-
} else if (base == NPY_FR_M) {
314-
/* Truncate to the month */
315-
ret = 12 * (dts->year - 1970) + (dts->month - 1);
316-
} else {
317-
/* Otherwise calculate the number of days to start */
318-
npy_int64 days = get_datetimestruct_days(dts);
319-
320-
switch (base) {
321-
case NPY_FR_W:
322-
/* Truncate to weeks */
323-
if (days >= 0) {
324-
ret = days / 7;
325-
} else {
326-
ret = (days - 6) / 7;
327-
}
328-
break;
329-
case NPY_FR_D:
330-
ret = days;
331-
break;
332-
case NPY_FR_h:
333-
ret = days * 24 + dts->hour;
334-
break;
335-
case NPY_FR_m:
336-
ret = (days * 24 + dts->hour) * 60 + dts->min;
337-
break;
338-
case NPY_FR_s:
339-
ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec;
340-
break;
341-
case NPY_FR_ms:
342-
ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * 1000 +
343-
dts->us / 1000;
344-
break;
345-
case NPY_FR_us:
346-
ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
347-
1000000 +
348-
dts->us;
349-
break;
350-
case NPY_FR_ns:
351-
ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
352-
1000000 +
353-
dts->us) *
354-
1000 +
355-
dts->ps / 1000;
356-
break;
357-
case NPY_FR_ps:
358-
ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
359-
1000000 +
360-
dts->us) *
361-
1000000 +
362-
dts->ps;
363-
break;
364-
case NPY_FR_fs:
365-
/* only 2.6 hours */
366-
ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
367-
1000000 +
368-
dts->us) *
369-
1000000 +
370-
dts->ps) *
371-
1000 +
372-
dts->as / 1000;
373-
break;
374-
case NPY_FR_as:
375-
/* only 9.2 secs */
376-
ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
377-
1000000 +
378-
dts->us) *
379-
1000000 +
380-
dts->ps) *
381-
1000000 +
382-
dts->as;
383-
break;
384-
default:
385-
/* Something got corrupted */
386-
PyErr_SetString(PyExc_ValueError,
387-
"NumPy datetime metadata with corrupt unit value");
388-
return -1;
389-
}
390-
}
391-
return ret;
424+
if ((base == NPY_FR_Y) || (base == NPY_FR_M)) {
425+
int64_t years;
426+
PD_CHECK_OVERFLOW(scaleYearToEpoch(dts->year, &years));
427+
428+
if (base == NPY_FR_Y) {
429+
return years;
430+
}
431+
432+
int64_t months;
433+
PD_CHECK_OVERFLOW(scaleYearsToMonths(years, &months));
434+
435+
int64_t months_adder;
436+
PD_CHECK_OVERFLOW(checked_int64_sub(dts->month, 1, &months_adder));
437+
PD_CHECK_OVERFLOW(checked_int64_add(months, months_adder, &months));
438+
439+
if (base == NPY_FR_M) {
440+
return months;
441+
}
442+
}
443+
444+
const int64_t days = get_datetimestruct_days(dts);
445+
if (base == NPY_FR_D) {
446+
return days;
447+
}
448+
449+
if (base == NPY_FR_W) {
450+
int64_t weeks;
451+
PD_CHECK_OVERFLOW(scaleDaysToWeeks(days, &weeks));
452+
return weeks;
453+
}
454+
455+
int64_t hours;
456+
PD_CHECK_OVERFLOW(scaleDaysToHours(days, &hours));
457+
PD_CHECK_OVERFLOW(checked_int64_add(hours, dts->hour, &hours));
458+
459+
if (base == NPY_FR_h) {
460+
return hours;
461+
}
462+
463+
int64_t minutes;
464+
PD_CHECK_OVERFLOW(scaleHoursToMinutes(hours, &minutes));
465+
PD_CHECK_OVERFLOW(checked_int64_add(minutes, dts->min, &minutes));
466+
467+
if (base == NPY_FR_m) {
468+
return minutes;
469+
}
470+
471+
int64_t seconds;
472+
PD_CHECK_OVERFLOW(scaleMinutesToSeconds(minutes, &seconds));
473+
PD_CHECK_OVERFLOW(checked_int64_add(seconds, dts->sec, &seconds));
474+
475+
if (base == NPY_FR_s) {
476+
return seconds;
477+
}
478+
479+
if (base == NPY_FR_ms) {
480+
int64_t milliseconds;
481+
PD_CHECK_OVERFLOW(scaleSecondsToMilliseconds(seconds, &milliseconds));
482+
PD_CHECK_OVERFLOW(
483+
checked_int64_add(milliseconds, dts->us / 1000, &milliseconds));
484+
485+
return milliseconds;
486+
}
487+
488+
int64_t microseconds;
489+
PD_CHECK_OVERFLOW(scaleSecondsToMicroseconds(seconds, &microseconds));
490+
PD_CHECK_OVERFLOW(checked_int64_add(microseconds, dts->us, &microseconds));
491+
492+
if (base == NPY_FR_us) {
493+
return microseconds;
494+
}
495+
496+
if (base == NPY_FR_ns) {
497+
int64_t nanoseconds;
498+
PD_CHECK_OVERFLOW(
499+
scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds));
500+
PD_CHECK_OVERFLOW(
501+
checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds));
502+
503+
return nanoseconds;
504+
}
505+
506+
int64_t picoseconds;
507+
PD_CHECK_OVERFLOW(scaleMicrosecondsToPicoseconds(microseconds, &picoseconds));
508+
PD_CHECK_OVERFLOW(checked_int64_add(picoseconds, dts->ps, &picoseconds));
509+
510+
if (base == NPY_FR_ps) {
511+
return picoseconds;
512+
}
513+
514+
if (base == NPY_FR_fs) {
515+
int64_t femtoseconds;
516+
PD_CHECK_OVERFLOW(
517+
scalePicosecondsToFemtoseconds(picoseconds, &femtoseconds));
518+
PD_CHECK_OVERFLOW(
519+
checked_int64_add(femtoseconds, dts->as / 1000, &femtoseconds));
520+
return femtoseconds;
521+
}
522+
523+
if (base == NPY_FR_as) {
524+
int64_t attoseconds;
525+
PD_CHECK_OVERFLOW(scalePicosecondsToAttoseconds(picoseconds, &attoseconds));
526+
PD_CHECK_OVERFLOW(checked_int64_add(attoseconds, dts->as, &attoseconds));
527+
return attoseconds;
528+
}
529+
530+
/* Something got corrupted */
531+
PyGILState_STATE gstate = PyGILState_Ensure();
532+
PyErr_SetString(PyExc_ValueError,
533+
"NumPy datetime metadata with corrupt unit value");
534+
PyGILState_Release(gstate);
535+
536+
return -1;
392537
}
393538

394539
/*

pandas/_libs/tslib.pyx

+4-4
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,6 @@ cnp.import_array()
3434
from pandas._libs.tslibs.np_datetime cimport (
3535
NPY_DATETIMEUNIT,
3636
NPY_FR_ns,
37-
check_dts_bounds,
3837
import_pandas_datetime,
3938
npy_datetimestruct,
4039
npy_datetimestruct_to_datetime,
@@ -99,8 +98,10 @@ def _test_parse_iso8601(ts: str):
9998
obj = _TSObject()
10099

101100
string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True)
102-
obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
103-
check_dts_bounds(&obj.dts)
101+
try:
102+
obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
103+
except OverflowError as err:
104+
raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {ts}") from err
104105
if out_local == 1:
105106
obj.tzinfo = timezone(timedelta(minutes=out_tzoffset))
106107
obj.value = tz_localize_to_utc_single(obj.value, obj.tzinfo)
@@ -488,7 +489,6 @@ cpdef array_to_datetime(
488489

489490
elif PyDate_Check(val):
490491
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
491-
check_dts_bounds(&dts, creso)
492492
state.found_other = True
493493

494494
elif is_datetime64_object(val):

0 commit comments

Comments
 (0)