-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Implement C Level Timedelta ISO Function; fix JSON usage #30903
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 28 commits
48d2cf0
aecc34d
9d0384b
fd9675e
33fc37b
e2e9995
a2cbd85
88974b5
65a727f
5d84cc3
5445333
191d219
66a2a43
60b5537
dae5336
88df8bf
4146d9f
24a7910
b1e7da0
0046c3c
77b7bae
3ef4aff
960dce6
6d2c8da
9b431f2
4a94f15
40468bf
0259370
d1c00e5
9efb929
9cbc075
29f497f
35d4a4b
bd2c4db
a8423d0
ebe58c7
d8f7575
8486e37
ef08ad6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,7 +54,6 @@ static PyTypeObject *cls_dataframe; | |
static PyTypeObject *cls_series; | ||
static PyTypeObject *cls_index; | ||
static PyTypeObject *cls_nat; | ||
PyObject *cls_timedelta; | ||
|
||
/*
 * Returns the int64 sentinel (NPY_MIN_INT64) that this module compares
 * datetime-like values against; presumably this is pandas' NaT marker.
 */
npy_int64 get_nat(void) {
    return NPY_MIN_INT64;
}
|
||
|
@@ -165,7 +164,6 @@ void *initObjToJSON(void) { | |
cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index"); | ||
cls_series = | ||
(PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series"); | ||
cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta"); | ||
Py_DECREF(mod_pandas); | ||
} | ||
|
||
|
@@ -399,6 +397,7 @@ static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { | |
PyErr_SetString(PyExc_ValueError, | ||
"Could not convert datetime value to string"); | ||
PyObject_Free(result); | ||
return NULL; | ||
} | ||
|
||
// Note that get_datetime_iso_8601_strlen just gives a generic size | ||
|
@@ -407,6 +406,32 @@ static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) { | |
return result; | ||
} | ||
|
||
/* Converts the int64_t representation of a duration to ISO; mutates len */ | ||
static char *int64ToIsoDuration(int64_t value, size_t *len) { | ||
pandas_timedeltastruct tds; | ||
int ret_code; | ||
|
||
pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds); | ||
|
||
// Max theoretical length of ISO Duration with 64 bit day | ||
// as the largest unit is 70 characters + 1 for a null terminator | ||
char *result = PyObject_Malloc(71); | ||
if (result == NULL) { | ||
PyErr_NoMemory(); | ||
return NULL; | ||
} | ||
|
||
ret_code = make_iso_8601_timedelta(&tds, result, len); | ||
if (ret_code == -1) { | ||
PyErr_SetString(PyExc_ValueError, | ||
"Could not convert timedelta value to string"); | ||
PyObject_Free(result); | ||
return NULL; | ||
} | ||
|
||
return result; | ||
} | ||
|
||
/* JSON callback. returns a char* and mutates the pointer to *len */ | ||
static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), | ||
JSONTypeContext *tc, size_t *len) { | ||
|
@@ -419,6 +444,12 @@ static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) { | |
return dt; | ||
} | ||
|
||
/* JSON callback. returns a char* and mutates the pointer to *len */ | ||
static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused), | ||
JSONTypeContext *tc, size_t *len) { | ||
return int64ToIsoDuration(GET_TC(tc)->longValue, len); | ||
} | ||
|
||
/* Convert PyDatetime To ISO C-string. mutates len */ | ||
static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, | ||
size_t *len) { | ||
|
@@ -1504,6 +1535,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, | |
char **ret; | ||
char *dataptr, *cLabel; | ||
int type_num; | ||
NPY_DATETIMEUNIT base = enc->datetimeUnit; | ||
PRINTMARK(); | ||
|
||
if (!labels) { | ||
|
@@ -1541,60 +1573,64 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, | |
break; | ||
} | ||
|
||
// TODO: vectorized timedelta solution | ||
if (enc->datetimeIso && | ||
(type_num == NPY_TIMEDELTA || PyDelta_Check(item))) { | ||
PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item); | ||
if (td == NULL) { | ||
Py_DECREF(item); | ||
NpyArr_freeLabels(ret, num); | ||
ret = 0; | ||
break; | ||
} | ||
|
||
PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL); | ||
Py_DECREF(td); | ||
if (iso == NULL) { | ||
Py_DECREF(item); | ||
NpyArr_freeLabels(ret, num); | ||
ret = 0; | ||
break; | ||
} | ||
|
||
cLabel = (char *)PyUnicode_AsUTF8(iso); | ||
Py_DECREF(iso); | ||
len = strlen(cLabel); | ||
} else if (PyTypeNum_ISDATETIME(type_num)) { | ||
NPY_DATETIMEUNIT base = enc->datetimeUnit; | ||
npy_int64 longVal; | ||
int is_datetimelike = 0; | ||
npy_int64 nanosecVal; | ||
if (PyTypeNum_ISDATETIME(type_num)) { | ||
is_datetimelike = 1; | ||
PyArray_VectorUnaryFunc *castfunc = | ||
PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64); | ||
if (!castfunc) { | ||
PyErr_Format(PyExc_ValueError, | ||
"Cannot cast numpy dtype %d to long", | ||
enc->npyType); | ||
} | ||
castfunc(dataptr, &longVal, 1, NULL, NULL); | ||
if (enc->datetimeIso) { | ||
cLabel = int64ToIso(longVal, base, &len); | ||
castfunc(dataptr, &nanosecVal, 1, NULL, NULL); | ||
} else if (PyDate_Check(item) || PyDelta_Check(item)) { | ||
is_datetimelike = 1; | ||
if (PyObject_HasAttrString(item, "value")) { | ||
nanosecVal = get_long_attr(item, "value"); | ||
} else { | ||
if (!scaleNanosecToUnit(&longVal, base)) { | ||
// TODO: This gets hit but somehow doesn't cause errors | ||
// need to clean up (elsewhere in module as well) | ||
if (PyDelta_Check(item)) { | ||
nanosecVal = total_seconds(item) * | ||
1000000000LL; // nanoseconds per second | ||
} else { | ||
// datetime.* objects don't follow above rules | ||
nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns); | ||
} | ||
cLabel = PyObject_Malloc(21); // 21 chars for int64 | ||
sprintf(cLabel, "%" NPY_INT64_FMT, longVal); | ||
len = strlen(cLabel); | ||
} | ||
} else if (PyDateTime_Check(item) || PyDate_Check(item)) { | ||
NPY_DATETIMEUNIT base = enc->datetimeUnit; | ||
if (enc->datetimeIso) { | ||
cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len); | ||
} | ||
|
||
if (is_datetimelike) { | ||
// JSON requires a string for the index so write "null" | ||
// is there is a standard for this? | ||
if (nanosecVal == get_nat()) { | ||
len = 5; // TODO: shouldn't require extra space for terminator | ||
cLabel = PyObject_Malloc(len); | ||
strncpy(cLabel, "null", len); | ||
} else { | ||
cLabel = PyObject_Malloc(21); // 21 chars for int64 | ||
sprintf(cLabel, "%" NPY_DATETIME_FMT, | ||
PyDateTimeToEpoch(item, base)); | ||
len = strlen(cLabel); | ||
if (enc->datetimeIso) { | ||
if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) { | ||
cLabel = int64ToIsoDuration(nanosecVal, &len); | ||
} else { | ||
if (type_num == NPY_DATETIME) { | ||
cLabel = int64ToIso(nanosecVal, base, &len); | ||
} else { | ||
cLabel = PyDateTimeToIso((PyDateTime_Date *)item, | ||
base, &len); | ||
} | ||
} | ||
if (cLabel == NULL) { | ||
Py_DECREF(item); | ||
NpyArr_freeLabels(ret, num); | ||
ret = 0; | ||
break; | ||
} | ||
} else { | ||
cLabel = PyObject_Malloc(21); // 21 chars for int64 | ||
sprintf(cLabel, "%" NPY_DATETIME_FMT, | ||
NpyDateTimeToEpoch(nanosecVal, base)); | ||
len = strlen(cLabel); | ||
} | ||
} | ||
} else { // Fallback to string representation | ||
PyObject *str = PyObject_Str(item); | ||
|
@@ -1615,6 +1651,11 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc, | |
ret[i] = PyObject_Malloc(len + 1); | ||
memcpy(ret[i], cLabel, len + 1); | ||
|
||
if (is_datetimelike) { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This fixes a memory leak in the current implementation, where any datetime-like values used in the index would leak their character representations; I will add a whatsnew entry once the file is created. |
||
// these were created with PyObject_Malloc so free accordingly | ||
PyObject_Free(cLabel); | ||
} | ||
|
||
if (PyErr_Occurred()) { | ||
NpyArr_freeLabels(ret, num); | ||
ret = 0; | ||
|
@@ -1703,7 +1744,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { | |
|
||
if (enc->datetimeIso) { | ||
PRINTMARK(); | ||
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; | ||
if (enc->npyType == NPY_TIMEDELTA) { | ||
pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; | ||
} else { | ||
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback; | ||
} | ||
// Currently no way to pass longVal to iso function, so use | ||
// state management | ||
GET_TC(tc)->longValue = longVal; | ||
|
@@ -1823,28 +1868,31 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) { | |
value = total_seconds(obj) * 1000000000LL; // nanoseconds per second | ||
} | ||
|
||
unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; | ||
if (scaleNanosecToUnit(&value, unit) != 0) { | ||
// TODO: Add some kind of error handling here | ||
} | ||
|
||
exc = PyErr_Occurred(); | ||
|
||
if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { | ||
PRINTMARK(); | ||
goto INVALID; | ||
} | ||
GET_TC(tc)->longValue = value; | ||
|
||
PRINTMARK(); | ||
if (value == get_nat()) { | ||
PRINTMARK(); | ||
tc->type = JT_NULL; | ||
return; | ||
} | ||
} else if (enc->datetimeIso) { | ||
pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback; | ||
tc->type = JT_UTF8; | ||
} else { | ||
unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit; | ||
if (scaleNanosecToUnit(&(GET_TC(tc)->longValue), unit) != 0) { | ||
// TODO: Add some kind of error handling here | ||
} | ||
|
||
GET_TC(tc)->longValue = value; | ||
exc = PyErr_Occurred(); | ||
|
||
PRINTMARK(); | ||
tc->type = JT_LONG; | ||
if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) { | ||
PRINTMARK(); | ||
goto INVALID; | ||
} | ||
|
||
tc->type = JT_LONG; | ||
} | ||
return; | ||
} else if (PyArray_IsScalar(obj, Integer)) { | ||
PRINTMARK(); | ||
|
Original file line number | Diff line number | Diff line change | ||
---|---|---|---|---|
|
@@ -905,3 +905,37 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen, | |||
outlen); | ||||
return -1; | ||||
} | ||||
|
||||
|
||||
/*
 * Write the ISO 8601 duration representation of *tds into outstr.
 *
 * The text has the form P<days>DT<hrs>H<min>M<sec>[.<fraction>]S. The
 * fraction is omitted when ms, us and ns are all zero; otherwise it is
 * printed with 3, 6 or 9 digits depending on the smallest non-zero field.
 *
 * tds:    the broken-down timedelta to format.
 * outstr: destination buffer. Up to 60 bytes are used for the leading part
 *         and up to 12 more for the fractional suffix and terminator, so it
 *         must hold at least 71 bytes.
 * outlen: out-parameter; set to the number of characters written (excluding
 *         the terminator) on success and to 0 on failure.
 *
 * Returns 0 on success, -1 if snprintf reports an error or the text would
 * not fit within the limits above.
 *
 * NOTE(review): an isoformat for timedeltas also exists in
 * pandas/_libs/tslibs/timedeltas.pyx. Per the PR discussion, wiring that into
 * the JSON module looked more complicated than a C implementation; a possible
 * follow-up is to reuse this function from the Timedelta class.
 */
int make_iso_8601_timedelta(pandas_timedeltastruct *tds,
                            char *outstr, size_t *outlen) {
    const size_t head_limit = 60;
    size_t tail_limit;
    size_t total;
    int written;

    *outlen = 0;

    written = snprintf(outstr, head_limit,  // NOLINT
                       "P%" NPY_INT64_FMT
                       "DT%" NPY_INT32_FMT
                       "H%" NPY_INT32_FMT
                       "M%" NPY_INT32_FMT,
                       tds->days, tds->hrs, tds->min, tds->sec);
    // snprintf returns the length it *wanted* to write; anything negative or
    // not smaller than the limit means the buffer content is unusable
    if (written < 0 || (size_t)written >= head_limit) {
        return -1;
    }
    total = (size_t)written;
    outstr += written;

    if (tds->ns != 0) {
        tail_limit = 12;
        written = snprintf(outstr, tail_limit,  // NOLINT
                           ".%03" NPY_INT32_FMT
                           "%03" NPY_INT32_FMT
                           "%03" NPY_INT32_FMT
                           "S", tds->ms, tds->us, tds->ns);
    } else if (tds->us != 0) {
        tail_limit = 9;
        written = snprintf(outstr, tail_limit,  // NOLINT
                           ".%03" NPY_INT32_FMT
                           "%03" NPY_INT32_FMT
                           "S", tds->ms, tds->us);
    } else if (tds->ms != 0) {
        tail_limit = 6;
        written = snprintf(outstr, tail_limit,  // NOLINT
                           ".%03" NPY_INT32_FMT "S", tds->ms);
    } else {
        tail_limit = 2;
        written = snprintf(outstr, tail_limit,  // NOLINT
                           "%s", "S");
    }

    // Out-of-range fractional fields would otherwise inflate *outlen beyond
    // what was actually stored in the buffer
    if (written < 0 || (size_t)written >= tail_limit) {
        return -1;
    }

    *outlen = total + (size_t)written;
    return 0;
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think after this PR I'm going to split these conversion functions off into a separate module, as the JSON one is getting rather big.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The other PR is #31057, which should probably be merged first.