Skip to content

Remove undefined behavior from npy_datetimestruct_to_datetime #55151

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 46 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from 38 commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
e27a0da
refactor npy_datetimestruct_to_datetime
WillAyd Sep 15, 2023
0dea606
Used builtin overflow directives
WillAyd Oct 17, 2023
d6a24f3
macro fixups
WillAyd Oct 17, 2023
21e919c
more macro cleanups
WillAyd Oct 17, 2023
6302f2f
more macro refactor
WillAyd Oct 17, 2023
e2646a6
musllinux c support
WillAyd Oct 18, 2023
5852d20
Merge remote-tracking branch 'upstream/main' into refactor-np-datetime
WillAyd Oct 19, 2023
f55a58a
macro cleanup
WillAyd Oct 19, 2023
057e74f
more refactor
WillAyd Oct 19, 2023
2d6d5fc
fix cython warning
WillAyd Oct 19, 2023
e65e229
windows fix
WillAyd Oct 19, 2023
af29e7c
Raise Outofboundsdatetime
WillAyd Oct 20, 2023
b69b489
Merge remote-tracking branch 'upstream/main' into refactor-np-datetime
WillAyd Oct 20, 2023
f9e5e35
cleanup GIL warnings
WillAyd Oct 20, 2023
1b89dfe
more error handling cleanup
WillAyd Oct 20, 2023
73a1507
updates
WillAyd Oct 20, 2023
ef24509
error message update
WillAyd Oct 20, 2023
2240b09
error fixups
WillAyd Oct 20, 2023
1640002
test fixup
WillAyd Oct 22, 2023
9cdb9c9
clang-format
WillAyd Oct 23, 2023
9e0cec0
Merge branch 'main' into refactor-np-datetime
WillAyd Oct 23, 2023
25f3edf
Merge branch 'main' into refactor-np-datetime
WillAyd Oct 24, 2023
5c09a13
Merge branch 'main' into refactor-np-datetime
WillAyd Oct 26, 2023
4d8696c
updates
WillAyd Oct 26, 2023
b3d5b7c
fixed error message
WillAyd Oct 26, 2023
2f60947
try nogil
WillAyd Oct 26, 2023
f26c924
Merge remote-tracking branch 'upstream/main' into refactor-np-datetime
WillAyd Nov 1, 2023
03315fc
revert offsets changes
WillAyd Nov 1, 2023
b3a80b4
simplified error handling
WillAyd Nov 1, 2023
038db5f
period fixup
WillAyd Nov 1, 2023
df2a4df
fixed test failure
WillAyd Nov 1, 2023
1afb12d
try speedup
WillAyd Nov 1, 2023
e293642
updated benchmark
WillAyd Nov 2, 2023
2203421
Merge remote-tracking branch 'upstream/main' into refactor-np-datetime
WillAyd Nov 2, 2023
7ee90dd
revert noexcepts
WillAyd Nov 3, 2023
81f32a9
Merge remote-tracking branch 'upstream/main' into refactor-np-datetime
WillAyd Nov 3, 2023
8d918dc
Merge branch 'main' into refactor-np-datetime
WillAyd Nov 6, 2023
7e8571f
Merge remote-tracking branch 'upstream/main' into refactor-np-datetime
WillAyd Nov 8, 2023
07e8106
Merge branch 'main' into refactor-np-datetime
WillAyd Nov 10, 2023
eab64ac
shared function for dts formatting
WillAyd Nov 10, 2023
7fbd191
import -> cimport
WillAyd Nov 10, 2023
4eba919
pass address
WillAyd Nov 10, 2023
9e1914d
typo
WillAyd Nov 10, 2023
188eb10
Merge branch 'main' into refactor-np-datetime
WillAyd Nov 13, 2023
162c858
remove comment
WillAyd Nov 14, 2023
de90a2f
cdef object -> str
WillAyd Nov 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion asv_bench/benchmarks/tslibs/period.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,11 @@ def setup(self, size, freq, tz):
# tzlocal is cumbersomely slow, so skip to keep runtime in check
raise NotImplementedError

arr = np.arange(10, dtype="i8").repeat(size // 10)
# We pick 2**55 because smaller values end up returning
# -1 from npy_datetimestruct_to_datetime with NPY_FR_Y frequency;
# this artificially slows down functions, since -1 is also the
# error sentinel.
arr = np.arange(2**55, 2**55 + 10, dtype="i8").repeat(size // 10)
self.i8values = arr

def time_dt64arr_to_periodarr(self, size, freq, tz):
Expand Down
2 changes: 1 addition & 1 deletion meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ project(
meson_version: '>=1.2.1',
default_options: [
'buildtype=release',
'c_std=c99'
'c_std=c11'
]
)

Expand Down
315 changes: 230 additions & 85 deletions pandas/_libs/src/vendored/numpy/datetime/np_datetime.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,58 @@ This file is derived from NumPy 1.7. See NUMPY_LICENSE.txt
#include <numpy/arrayscalars.h>
#include <numpy/ndarraytypes.h>

#if defined(_WIN32)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI C23 has the <stdckdint.h> header which should replace these macros in a few years when that becomes widely available

#ifndef ENABLE_INTSAFE_SIGNED_FUNCTIONS
#define ENABLE_INTSAFE_SIGNED_FUNCTIONS
#endif
#include <intsafe.h>
#define checked_int64_add(a, b, res) LongLongAdd(a, b, res)
#define checked_int64_sub(a, b, res) LongLongSub(a, b, res)
#define checked_int64_mul(a, b, res) LongLongMult(a, b, res)
#else
#if defined __has_builtin
#if __has_builtin(__builtin_add_overflow)
#if _LP64 || __LP64__ || _ILP64 || __ILP64__
#define checked_int64_add(a, b, res) __builtin_saddl_overflow(a, b, res)
#define checked_int64_sub(a, b, res) __builtin_ssubl_overflow(a, b, res)
#define checked_int64_mul(a, b, res) __builtin_smull_overflow(a, b, res)
#else
#define checked_int64_add(a, b, res) __builtin_saddll_overflow(a, b, res)
#define checked_int64_sub(a, b, res) __builtin_ssubll_overflow(a, b, res)
#define checked_int64_mul(a, b, res) __builtin_smulll_overflow(a, b, res)
#endif
#else
_Static_assert(0,
"Overflow checking not detected; please try a newer compiler");
#endif
// __has_builtin was added in gcc 10, but our musllinux_1_1 build environment
// only has gcc-9.3, so fall back to __GNUC__ macro as long as we have that
#elif __GNUC__ > 7
#if _LP64 || __LP64__ || _ILP64 || __ILP64__
#define checked_int64_add(a, b, res) __builtin_saddl_overflow(a, b, res)
#define checked_int64_sub(a, b, res) __builtin_ssubl_overflow(a, b, res)
#define checked_int64_mul(a, b, res) __builtin_smull_overflow(a, b, res)
#else
#define checked_int64_add(a, b, res) __builtin_saddll_overflow(a, b, res)
#define checked_int64_sub(a, b, res) __builtin_ssubll_overflow(a, b, res)
#define checked_int64_mul(a, b, res) __builtin_smulll_overflow(a, b, res)
#endif
#else
_Static_assert(0, "__has_builtin not detected; please try a newer compiler");
#endif
#endif

/*
 * Evaluates FUNC once and treats any nonzero value as a failure. On failure
 * the macro acquires the GIL, sets a Python OverflowError, releases the GIL
 * and makes the *enclosing* function return -1, so it may only be used
 * inside functions for which -1 is a valid return value. The do/while (0)
 * wrapper lets the macro be used like a single statement.
 */
#define PD_CHECK_OVERFLOW(FUNC) \
do { \
if ((FUNC) != 0) { \
PyGILState_STATE gstate = PyGILState_Ensure(); \
PyErr_SetString(PyExc_OverflowError, \
"Overflow occurred in npy_datetimestruct_to_datetime"); \
PyGILState_Release(gstate); \
return -1; \
} \
} while (0)

/*
 * Number of days in each of the 12 months. The two rows differ only in the
 * second entry (28 vs. 29), i.e. presumably row 0 is a regular year and
 * row 1 a leap year -- confirm against the code that indexes this table.
 */
const int days_per_month_table[2][12] = {
{31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
{31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
Expand Down Expand Up @@ -299,96 +351,189 @@ PyObject *extract_utc_offset(PyObject *obj) {
return tmp;
}

/*
 * Converts a calendar year into an offset from 1970 using overflow-checked
 * subtraction. The difference is written to *result; the return value is the
 * status reported by checked_int64_sub (nonzero signals overflow).
 */
static inline int scaleYearToEpoch(int64_t year, int64_t *result) {
  const int64_t epoch_year = 1970;
  return checked_int64_sub(year, epoch_year, result);
}

/*
 * Multiplies a year count by 12 with overflow checking. The product is
 * written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 */
static inline int scaleYearsToMonths(int64_t years, int64_t *result) {
  const int64_t months_per_year = 12;
  return checked_int64_mul(years, months_per_year, result);
}

/*
 * Converts a day count into whole weeks, rounding toward negative infinity.
 * The week count is written to *result. Returns 0 on success, or the nonzero
 * status of checked_int64_sub if biasing a negative day count overflowed
 * (in which case *result is left untouched).
 */
static inline int scaleDaysToWeeks(int64_t days, int64_t *result) {
  int64_t numerator = days;

  if (days < 0) {
    /* C integer division truncates toward zero, so shift negative inputs
     * down by 6 first to obtain floor semantics. */
    const int status = checked_int64_sub(days, 6, &numerator);
    if (status != 0) {
      return status;
    }
  }

  *result = numerator / 7;
  return 0;
}

/*
 * Multiplies a day count by 24 with overflow checking. The product is
 * written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 */
static inline int scaleDaysToHours(int64_t days, int64_t *result) {
  const int64_t hours_per_day = 24;
  return checked_int64_mul(days, hours_per_day, result);
}

/*
 * Multiplies an hour count by 60 with overflow checking. The product is
 * written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 */
static inline int scaleHoursToMinutes(int64_t hours, int64_t *result) {
  const int64_t minutes_per_hour = 60;
  return checked_int64_mul(hours, minutes_per_hour, result);
}

/*
 * Multiplies a minute count by 60 with overflow checking. The product is
 * written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 */
static inline int scaleMinutesToSeconds(int64_t minutes, int64_t *result) {
  const int64_t seconds_per_minute = 60;
  return checked_int64_mul(minutes, seconds_per_minute, result);
}

/*
 * Multiplies a second count by 1000 with overflow checking. The product is
 * written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 */
static inline int scaleSecondsToMilliseconds(int64_t seconds,
                                             int64_t *result) {
  const int64_t millis_per_second = 1000;
  return checked_int64_mul(seconds, millis_per_second, result);
}

/*
 * Multiplies a second count by 1000000 with overflow checking. The product
 * is written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 */
static inline int scaleSecondsToMicroseconds(int64_t seconds,
                                             int64_t *result) {
  const int64_t micros_per_second = 1000000;
  return checked_int64_mul(seconds, micros_per_second, result);
}

/*
 * Multiplies a microsecond count by 1000 with overflow checking. The product
 * is written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 */
static inline int scaleMicrosecondsToNanoseconds(int64_t microseconds,
                                                 int64_t *result) {
  const int64_t nanos_per_micro = 1000;
  return checked_int64_mul(microseconds, nanos_per_micro, result);
}

/*
 * Multiplies a microsecond count by 1000000 with overflow checking. The
 * product is written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 */
static inline int scaleMicrosecondsToPicoseconds(int64_t microseconds,
                                                 int64_t *result) {
  const int64_t picos_per_micro = 1000000;
  return checked_int64_mul(microseconds, picos_per_micro, result);
}

/*
 * Multiplies a picosecond count by 1000 with overflow checking. The product
 * is written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 *
 * The return type is int, not int64_t: the function returns a status code,
 * not the scaled value, and this matches the other scale* helpers.
 */
static inline int scalePicosecondsToFemtoseconds(int64_t picoseconds,
                                                 int64_t *result) {
  return checked_int64_mul(picoseconds, 1000, result);
}

/*
 * Multiplies a picosecond count by 1000000 with overflow checking. The
 * product is written to *result; the return value is the status reported by
 * checked_int64_mul (nonzero signals overflow).
 *
 * The return type is int, not int64_t: the function returns a status code,
 * not the scaled value, and this matches the other scale* helpers.
 */
static inline int scalePicosecondsToAttoseconds(int64_t picoseconds,
                                                int64_t *result) {
  return checked_int64_mul(picoseconds, 1000000, result);
}

/*
* Converts a datetime from a datetimestruct to a datetime based
* on a metadata unit. The date is assumed to be valid.
* on a metadata unit. Returns -1 and sets PyErr on error.
*/
npy_datetime npy_datetimestruct_to_datetime(NPY_DATETIMEUNIT base,
const npy_datetimestruct *dts) {
npy_datetime ret;

if (base == NPY_FR_Y) {
/* Truncate to the year */
ret = dts->year - 1970;
} else if (base == NPY_FR_M) {
/* Truncate to the month */
ret = 12 * (dts->year - 1970) + (dts->month - 1);
} else {
/* Otherwise calculate the number of days to start */
npy_int64 days = get_datetimestruct_days(dts);

switch (base) {
case NPY_FR_W:
/* Truncate to weeks */
if (days >= 0) {
ret = days / 7;
} else {
ret = (days - 6) / 7;
}
break;
case NPY_FR_D:
ret = days;
break;
case NPY_FR_h:
ret = days * 24 + dts->hour;
break;
case NPY_FR_m:
ret = (days * 24 + dts->hour) * 60 + dts->min;
break;
case NPY_FR_s:
ret = ((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec;
break;
case NPY_FR_ms:
ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) * 1000 +
dts->us / 1000;
break;
case NPY_FR_us:
ret = (((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
1000000 +
dts->us;
break;
case NPY_FR_ns:
ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
1000000 +
dts->us) *
1000 +
dts->ps / 1000;
break;
case NPY_FR_ps:
ret = ((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
1000000 +
dts->us) *
1000000 +
dts->ps;
break;
case NPY_FR_fs:
/* only 2.6 hours */
ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
1000000 +
dts->us) *
1000000 +
dts->ps) *
1000 +
dts->as / 1000;
break;
case NPY_FR_as:
/* only 9.2 secs */
ret = (((((days * 24 + dts->hour) * 60 + dts->min) * 60 + dts->sec) *
1000000 +
dts->us) *
1000000 +
dts->ps) *
1000000 +
dts->as;
break;
default:
/* Something got corrupted */
PyErr_SetString(PyExc_ValueError,
"NumPy datetime metadata with corrupt unit value");
return -1;
}
}
return ret;
if ((base == NPY_FR_Y) || (base == NPY_FR_M)) {
int64_t years;
PD_CHECK_OVERFLOW(scaleYearToEpoch(dts->year, &years));

if (base == NPY_FR_Y) {
return years;
}

int64_t months;
PD_CHECK_OVERFLOW(scaleYearsToMonths(years, &months));

int64_t months_adder;
PD_CHECK_OVERFLOW(checked_int64_sub(dts->month, 1, &months_adder));
PD_CHECK_OVERFLOW(checked_int64_add(months, months_adder, &months));

if (base == NPY_FR_M) {
return months;
}
}

const int64_t days = get_datetimestruct_days(dts);
if (base == NPY_FR_D) {
return days;
}

if (base == NPY_FR_W) {
int64_t weeks;
PD_CHECK_OVERFLOW(scaleDaysToWeeks(days, &weeks));
return weeks;
}

int64_t hours;
PD_CHECK_OVERFLOW(scaleDaysToHours(days, &hours));
PD_CHECK_OVERFLOW(checked_int64_add(hours, dts->hour, &hours));

if (base == NPY_FR_h) {
return hours;
}

int64_t minutes;
PD_CHECK_OVERFLOW(scaleHoursToMinutes(hours, &minutes));
PD_CHECK_OVERFLOW(checked_int64_add(minutes, dts->min, &minutes));

if (base == NPY_FR_m) {
return minutes;
}

int64_t seconds;
PD_CHECK_OVERFLOW(scaleMinutesToSeconds(minutes, &seconds));
PD_CHECK_OVERFLOW(checked_int64_add(seconds, dts->sec, &seconds));

if (base == NPY_FR_s) {
return seconds;
}

if (base == NPY_FR_ms) {
int64_t milliseconds;
PD_CHECK_OVERFLOW(scaleSecondsToMilliseconds(seconds, &milliseconds));
PD_CHECK_OVERFLOW(
checked_int64_add(milliseconds, dts->us / 1000, &milliseconds));

return milliseconds;
}

int64_t microseconds;
PD_CHECK_OVERFLOW(scaleSecondsToMicroseconds(seconds, &microseconds));
PD_CHECK_OVERFLOW(checked_int64_add(microseconds, dts->us, &microseconds));

if (base == NPY_FR_us) {
return microseconds;
}

if (base == NPY_FR_ns) {
int64_t nanoseconds;
PD_CHECK_OVERFLOW(
scaleMicrosecondsToNanoseconds(microseconds, &nanoseconds));
PD_CHECK_OVERFLOW(
checked_int64_add(nanoseconds, dts->ps / 1000, &nanoseconds));

return nanoseconds;
}

int64_t picoseconds;
PD_CHECK_OVERFLOW(scaleMicrosecondsToPicoseconds(microseconds, &picoseconds));
PD_CHECK_OVERFLOW(checked_int64_add(picoseconds, dts->ps, &picoseconds));

if (base == NPY_FR_ps) {
return picoseconds;
}

if (base == NPY_FR_fs) {
int64_t femtoseconds;
PD_CHECK_OVERFLOW(
scalePicosecondsToFemtoseconds(picoseconds, &femtoseconds));
PD_CHECK_OVERFLOW(
checked_int64_add(femtoseconds, dts->as / 1000, &femtoseconds));
return femtoseconds;
}

if (base == NPY_FR_as) {
int64_t attoseconds;
PD_CHECK_OVERFLOW(scalePicosecondsToAttoseconds(picoseconds, &attoseconds));
PD_CHECK_OVERFLOW(checked_int64_add(attoseconds, dts->as, &attoseconds));
return attoseconds;
}

/* Something got corrupted */
PyGILState_STATE gstate = PyGILState_Ensure();
PyErr_SetString(PyExc_ValueError,
"NumPy datetime metadata with corrupt unit value");
PyGILState_Release(gstate);

return -1;
}

/*
Expand Down
8 changes: 4 additions & 4 deletions pandas/_libs/tslib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@ cnp.import_array()
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
NPY_FR_ns,
check_dts_bounds,
import_pandas_datetime,
npy_datetimestruct,
npy_datetimestruct_to_datetime,
Expand Down Expand Up @@ -98,8 +97,10 @@ def _test_parse_iso8601(ts: str):
obj = _TSObject()

string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True)
obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
check_dts_bounds(&obj.dts)
try:
obj.value = npy_datetimestruct_to_datetime(NPY_FR_ns, &obj.dts)
except OverflowError as err:
raise OutOfBoundsDatetime(f"Out of bounds nanosecond timestamp: {ts}") from err
if out_local == 1:
obj.tzinfo = timezone(timedelta(minutes=out_tzoffset))
obj.value = tz_localize_to_utc_single(obj.value, obj.tzinfo)
Expand Down Expand Up @@ -487,7 +488,6 @@ cpdef array_to_datetime(

elif PyDate_Check(val):
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
check_dts_bounds(&dts, creso)

elif is_datetime64_object(val):
iresult[i] = get_datetime64_nanos(val, creso)
Expand Down
Loading