
Implement C Level Timedelta ISO Function; fix JSON usage #30903
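Summary of the user-visible change, per the tests added below: with date_format="iso", Timedelta values and index labels are serialized as ISO 8601 duration strings by a new C-level helper instead of routing each label through a Python-level pandas.Timedelta(...).isoformat() call. A rough sketch of the expected behavior (illustrative only; output keys depend on the index used):

    import pandas as pd

    ser = pd.Series(pd.to_timedelta(["1 days", "2 days"]))
    ser.to_json(date_format="iso")
    # per the new tests, values render as "P1DT0H0M0S" and "P2DT0H0M0S"
    ser.to_json(date_format="epoch")
    # epoch output remains milliseconds: 86400000 and 172800000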

Merged · 39 commits · Mar 19, 2020
Changes shown are from 25 commits.

Commits (39)
48d2cf0
Working C impl of timedelta ISO
WillAyd Jan 3, 2020
aecc34d
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Jan 10, 2020
9d0384b
More consistent impl
WillAyd Jan 10, 2020
fd9675e
shared between python / numpy timedelta
WillAyd Jan 10, 2020
33fc37b
Shared td handling code
WillAyd Jan 10, 2020
e2e9995
added tests from cbertinato
WillAyd Jan 10, 2020
a2cbd85
shared code
WillAyd Jan 10, 2020
88974b5
working tests
WillAyd Jan 10, 2020
65a727f
reformat
WillAyd Jan 10, 2020
5d84cc3
Expanded test coverage
WillAyd Jan 10, 2020
5445333
fixed test
WillAyd Jan 10, 2020
191d219
better null handling
WillAyd Jan 10, 2020
66a2a43
expanded test
WillAyd Jan 10, 2020
60b5537
removed print
WillAyd Jan 10, 2020
dae5336
refactor
WillAyd Jan 10, 2020
88df8bf
fix incorrect test
WillAyd Jan 10, 2020
4146d9f
refactor
WillAyd Jan 10, 2020
24a7910
reformat
WillAyd Jan 10, 2020
b1e7da0
more date testing
WillAyd Jan 10, 2020
0046c3c
refactored with bug fix
WillAyd Jan 10, 2020
77b7bae
simplified timedelta test
WillAyd Jan 10, 2020
3ef4aff
Added timedelta coverage
WillAyd Jan 10, 2020
960dce6
stylistic updates
WillAyd Jan 10, 2020
6d2c8da
Removed unneeded timedelta import
WillAyd Jan 10, 2020
9b431f2
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Jan 10, 2020
4a94f15
style updates
WillAyd Jan 10, 2020
40468bf
replace sprintf with snprintf
WillAyd Jan 11, 2020
0259370
ignore lint errors
WillAyd Jan 11, 2020
d1c00e5
Update test_pandas.py
WillAyd Jan 11, 2020
9efb929
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Jan 20, 2020
9cbc075
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Jan 20, 2020
29f497f
moved conversion func
WillAyd Jan 21, 2020
35d4a4b
fix note
WillAyd Jan 21, 2020
bd2c4db
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Feb 7, 2020
a8423d0
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Feb 12, 2020
ebe58c7
Whatsnew
WillAyd Feb 12, 2020
d8f7575
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Feb 20, 2020
8486e37
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Mar 17, 2020
ef08ad6
more comprehensive testing
WillAyd Mar 17, 2020
168 changes: 107 additions & 61 deletions pandas/_libs/src/ujson/python/objToJSON.c
@@ -54,7 +54,6 @@ static PyTypeObject *cls_dataframe;
static PyTypeObject *cls_series;
static PyTypeObject *cls_index;
static PyTypeObject *cls_nat;
PyObject *cls_timedelta;

npy_int64 get_nat(void) { return NPY_MIN_INT64; }

@@ -165,7 +164,6 @@ void *initObjToJSON(void) {
cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index");
cls_series =
(PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series");
cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta");
Py_DECREF(mod_pandas);
}

@@ -399,6 +397,7 @@ static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
PyErr_SetString(PyExc_ValueError,
"Could not convert datetime value to string");
PyObject_Free(result);
return NULL;
}

// Note that get_datetime_iso_8601_strlen just gives a generic size
@@ -407,6 +406,30 @@ static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
return result;
}

/* Converts the int64_t representation of a duration to ISO; mutates len */
static char *int64ToIsoDuration(int64_t value, size_t *len) {
WillAyd (Member, Author) commented: I think after this PR I'm going to split these conversion functions off into a separate module, as the JSON one is getting rather big.

WillAyd (Member, Author) commented: The other PR is #31057; it should probably be merged first.

pandas_timedeltastruct tds;
int ret_code;

pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &tds);

char *result = PyObject_Malloc(100); // TODO: Better bounds
if (result == NULL) {
PyErr_NoMemory();
return NULL;
}

ret_code = make_iso_8601_timedelta(&tds, result, len);
if (ret_code == -1) {
PyErr_SetString(PyExc_ValueError,
"Could not convert timedelta value to string");
PyObject_Free(result);
return NULL;
}

return result;
}
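For orientation, the int64 handed to this helper is the timedelta's nanosecond representation (hence the NPY_FR_ns unit above), and the target string matches what Timedelta.isoformat() produces at the Python level. A quick illustrative check using public pandas API:

    import pandas as pd

    td = pd.Timedelta(days=1)
    td.value        # 86400000000000 -- the int64 nanoseconds the encoder passes in
    td.isoformat()  # 'P1DT0H0M0S'   -- the string the helper is expected to produce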

/* JSON callback. returns a char* and mutates the pointer to *len */
static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
JSONTypeContext *tc, size_t *len) {
Expand All @@ -419,6 +442,12 @@ static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
return dt;
}

/* JSON callback. returns a char* and mutates the pointer to *len */
static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused),
JSONTypeContext *tc, size_t *len) {
return int64ToIsoDuration(GET_TC(tc)->longValue, len);
}

/* Convert PyDatetime To ISO C-string. mutates len */
static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base,
size_t *len) {
@@ -1504,6 +1533,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
char **ret;
char *dataptr, *cLabel;
int type_num;
NPY_DATETIMEUNIT base = enc->datetimeUnit;
PRINTMARK();

if (!labels) {
@@ -1541,60 +1571,64 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
break;
}

// TODO: vectorized timedelta solution
if (enc->datetimeIso &&
(type_num == NPY_TIMEDELTA || PyDelta_Check(item))) {
PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item);
if (td == NULL) {
Py_DECREF(item);
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}

PyObject *iso = PyObject_CallMethod(td, "isoformat", NULL);
Py_DECREF(td);
if (iso == NULL) {
Py_DECREF(item);
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}

cLabel = (char *)PyUnicode_AsUTF8(iso);
Py_DECREF(iso);
len = strlen(cLabel);
} else if (PyTypeNum_ISDATETIME(type_num)) {
NPY_DATETIMEUNIT base = enc->datetimeUnit;
npy_int64 longVal;
int is_datetimelike = 0;
npy_int64 nanosecVal;
if (PyTypeNum_ISDATETIME(type_num)) {
is_datetimelike = 1;
PyArray_VectorUnaryFunc *castfunc =
PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64);
if (!castfunc) {
PyErr_Format(PyExc_ValueError,
"Cannot cast numpy dtype %d to long",
enc->npyType);
}
castfunc(dataptr, &longVal, 1, NULL, NULL);
if (enc->datetimeIso) {
cLabel = int64ToIso(longVal, base, &len);
castfunc(dataptr, &nanosecVal, 1, NULL, NULL);
} else if (PyDate_Check(item) || PyDelta_Check(item)) {
is_datetimelike = 1;
if (PyObject_HasAttrString(item, "value")) {
nanosecVal = get_long_attr(item, "value");
} else {
if (!scaleNanosecToUnit(&longVal, base)) {
// TODO: This gets hit but somehow doesn't cause errors
// need to clean up (elsewhere in module as well)
if (PyDelta_Check(item)) {
nanosecVal = total_seconds(item) *
1000000000LL; // nanoseconds per second
} else {
// datetime.* objects don't follow above rules
nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns);
}
cLabel = PyObject_Malloc(21); // 21 chars for int64
sprintf(cLabel, "%" NPY_INT64_FMT, longVal);
len = strlen(cLabel);
}
} else if (PyDateTime_Check(item) || PyDate_Check(item)) {
NPY_DATETIMEUNIT base = enc->datetimeUnit;
if (enc->datetimeIso) {
cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len);
}

if (is_datetimelike) {
// JSON requires a string for the index so write "null"
// is there a standard for this?
if (nanosecVal == get_nat()) {
len = 5; // TODO: shouldn't require extra space for terminator
cLabel = PyObject_Malloc(len);
strncpy(cLabel, "null", len);
} else {
cLabel = PyObject_Malloc(21); // 21 chars for int64
sprintf(cLabel, "%" NPY_DATETIME_FMT,
PyDateTimeToEpoch(item, base));
len = strlen(cLabel);
if (enc->datetimeIso) {
if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
cLabel = int64ToIsoDuration(nanosecVal, &len);
} else {
if (type_num == NPY_DATETIME) {
cLabel = int64ToIso(nanosecVal, base, &len);
} else {
cLabel = PyDateTimeToIso((PyDateTime_Date *)item,
base, &len);
}
}
if (cLabel == NULL) {
Py_DECREF(item);
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}
} else {
cLabel = PyObject_Malloc(21); // 21 chars for int64
sprintf(cLabel, "%" NPY_DATETIME_FMT,
NpyDateTimeToEpoch(nanosecVal, base));
len = strlen(cLabel);
}
}
} else { // Fallback to string representation
PyObject *str = PyObject_Str(item);
@@ -1615,6 +1649,11 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
ret[i] = PyObject_Malloc(len + 1);
memcpy(ret[i], cLabel, len + 1);

if (is_datetimelike) {
WillAyd (Member, Author) commented: This fixes a memory leak in the current implementation, where any datetimelike values used in the index would leak their character representation; a whatsnew entry will be added once the file is created.

// these were created with PyObject_Malloc so free accordingly
PyObject_Free(cLabel);
}

if (PyErr_Occurred()) {
NpyArr_freeLabels(ret, num);
ret = 0;
@@ -1703,7 +1742,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {

if (enc->datetimeIso) {
PRINTMARK();
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
if (enc->npyType == NPY_TIMEDELTA) {
pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
} else {
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
}
// Currently no way to pass longVal to iso function, so use
// state management
GET_TC(tc)->longValue = longVal;
@@ -1823,28 +1866,31 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
value = total_seconds(obj) * 1000000000LL; // nanoseconds per second
}

unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
if (scaleNanosecToUnit(&value, unit) != 0) {
// TODO: Add some kind of error handling here
}

exc = PyErr_Occurred();

if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
PRINTMARK();
goto INVALID;
}
GET_TC(tc)->longValue = value;

PRINTMARK();
if (value == get_nat()) {
PRINTMARK();
tc->type = JT_NULL;
return;
}
} else if (enc->datetimeIso) {
pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
tc->type = JT_UTF8;
} else {
unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
if (scaleNanosecToUnit(&(GET_TC(tc)->longValue), unit) != 0) {
// TODO: Add some kind of error handling here
}

GET_TC(tc)->longValue = value;
exc = PyErr_Occurred();

PRINTMARK();
tc->type = JT_LONG;
if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
PRINTMARK();
goto INVALID;
}

tc->type = JT_LONG;
}
return;
} else if (PyArray_IsScalar(obj, Integer)) {
PRINTMARK();
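The review note above about the label memory leak concerns the NpyArr_encodeLabels path, which any to_json call with a datetime or timedelta index exercises; a minimal illustrative repro (not part of the PR):

    import pandas as pd

    # Each datetimelike index label is rendered into a PyObject_Malloc'd buffer in
    # NpyArr_encodeLabels; before this change that buffer was never freed.
    ser = pd.Series([1, 2], index=pd.to_datetime(["2020-01-01", "2020-01-02"]))
    ser.to_json(date_format="iso")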
21 changes: 21 additions & 0 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
@@ -905,3 +905,24 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
outlen);
return -1;
}


int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen) {
*outlen = 0;
// sprintf returns the number of characters written, so use that to advance the buffer
*outlen += sprintf(outstr, "P%" NPY_INT64_FMT "DT%" NPY_INT32_FMT "H%" NPY_INT32_FMT "M%" NPY_INT32_FMT,
tds->days, tds->hrs, tds->min, tds->sec);
outstr += *outlen;

if (tds->ns != 0) {
*outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us, tds->ns);
} else if (tds->us != 0) {
*outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "%03" NPY_INT32_FMT "S", tds->ms, tds->us);
} else if (tds->ms != 0) {
*outlen += sprintf(outstr, ".%03" NPY_INT32_FMT "S", tds->ms);
} else {
*outlen += sprintf(outstr, "%s", "S");
}

return 0;
}
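A worked example of the formatting rule above, sketched in Python over the same field layout (days/hrs/min/sec plus ms/us/ns); illustrative only:

    def iso_duration(days, hrs, mins, secs, ms, us, ns):
        # Mirrors make_iso_8601_timedelta: the fractional part is only as wide
        # as the finest nonzero sub-second field
        out = f"P{days}DT{hrs}H{mins}M{secs}"
        if ns:
            out += f".{ms:03d}{us:03d}{ns:03d}S"
        elif us:
            out += f".{ms:03d}{us:03d}S"
        elif ms:
            out += f".{ms:03d}S"
        else:
            out += "S"
        return out

    iso_duration(1, 0, 0, 0, 0, 0, 0)   # 'P1DT0H0M0S'  (matches the tests below)
    iso_duration(0, 0, 0, 10, 0, 0, 0)  # 'P0DT0H0M10S' (the table-schema expectation)
    iso_duration(1, 2, 3, 4, 5, 6, 7)   # 'P1DT2H3M4.005006007S'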
9 changes: 9 additions & 0 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
@@ -79,4 +79,13 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
int
make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
NPY_DATETIMEUNIT base);

/*
 * Converts a pandas_timedeltastruct to an ISO 8601 string.
 *
 * Mutates outlen to provide size of (non-NULL terminated) string.
 *
 * Returns 0 on success, -1 on error.
 */
int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr, size_t *outlen);
#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
3 changes: 1 addition & 2 deletions pandas/tests/io/json/test_json_table_schema.py
@@ -603,8 +603,7 @@ def test_timestamp_in_columns(self):
result = df.to_json(orient="table")
js = json.loads(result)
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
# TODO - below expectation is not correct; see GH 28256
assert js["schema"]["fields"][2]["name"] == 10000
assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"
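For context, the column here is a 10-second Timedelta: the old expectation of 10000 was its epoch-millisecond rendering, and the new expectation is its ISO 8601 duration form; a quick illustrative check:

    import pandas as pd

    pd.Timedelta(10, unit="s").isoformat()  # 'P0DT0H0M10S'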

@pytest.mark.parametrize(
"case",
47 changes: 47 additions & 0 deletions pandas/tests/io/json/test_pandas.py
@@ -1,4 +1,5 @@
from collections import OrderedDict
import datetime
from datetime import timedelta
from io import StringIO
import json
@@ -810,6 +811,29 @@ def test_convert_dates(self):
result = read_json(json, typ="series")
tm.assert_series_equal(result, ts)

@pytest.mark.parametrize("date_format", ["epoch", "iso"])
@pytest.mark.parametrize("as_object", [True, False])
@pytest.mark.parametrize("date_typ", [datetime.datetime, pd.Timestamp])
def test_date_index_and_values(self, date_format, as_object, date_typ):
data = [date_typ(year=2020, month=1, day=1), pd.NaT]
if as_object:
data.append("a")

ser = pd.Series(data, index=data)
result = ser.to_json(date_format=date_format)

if date_format == "epoch":
expected = '{"1577836800000":1577836800000,"null":null}'
else:
expected = (
'{"2020-01-01T00:00:00.000Z":"2020-01-01T00:00:00.000Z"' ',"null":null}'
)

if as_object:
expected = expected.replace("}", ',"a":"a"}')

assert result == expected

@pytest.mark.parametrize(
"infer_word",
[
@@ -1032,6 +1056,29 @@ def test_mixed_timedelta_datetime(self):
result = pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"})
tm.assert_frame_equal(result, expected, check_index_type=False)

@pytest.mark.parametrize("as_object", [True, False])
@pytest.mark.parametrize("date_format", ["iso", "epoch"])
@pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta])
def test_timedelta_to_json(self, as_object, date_format, timedelta_typ):
# GH28156: to_json not correctly formatting Timedelta
data = [timedelta_typ(days=1), timedelta_typ(days=2), pd.NaT]
if as_object:
data.append("a")

ser = pd.Series(data, index=data)
if date_format == "iso":
expected = (
'{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}'
)
else:
expected = '{"86400000":86400000,"172800000":172800000,"null":null}'

if as_object:
expected = expected.replace("}", ',"a":"a"}')

result = ser.to_json(date_format=date_format)
assert result == expected
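A quick arithmetic check of the epoch expectations above (pandas epoch output for timedeltas is in milliseconds); illustrative:

    from datetime import timedelta

    int(timedelta(days=1).total_seconds() * 1000)  # 86400000
    int(timedelta(days=2).total_seconds() * 1000)  # 172800000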

def test_default_handler(self):
value = object()
frame = DataFrame({"a": [7, value]})