Skip to content

Implement C Level Timedelta ISO Function; fix JSON usage #30903

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 39 commits into from
Mar 19, 2020
Merged
Show file tree
Hide file tree
Changes from 32 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
48d2cf0
Working C impl of timedelta ISO
WillAyd Jan 3, 2020
aecc34d
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Jan 10, 2020
9d0384b
More consistent impl
WillAyd Jan 10, 2020
fd9675e
shared between python / numpy timedelta
WillAyd Jan 10, 2020
33fc37b
Shared td handling code
WillAyd Jan 10, 2020
e2e9995
added tests from cbertinato
WillAyd Jan 10, 2020
a2cbd85
shared code
WillAyd Jan 10, 2020
88974b5
working tests
WillAyd Jan 10, 2020
65a727f
reformat
WillAyd Jan 10, 2020
5d84cc3
Expanded test coverage
WillAyd Jan 10, 2020
5445333
fixed test
WillAyd Jan 10, 2020
191d219
better null handling
WillAyd Jan 10, 2020
66a2a43
expanded test
WillAyd Jan 10, 2020
60b5537
removed print
WillAyd Jan 10, 2020
dae5336
refactor
WillAyd Jan 10, 2020
88df8bf
fix incorrect test
WillAyd Jan 10, 2020
4146d9f
refactor
WillAyd Jan 10, 2020
24a7910
reformat
WillAyd Jan 10, 2020
b1e7da0
more date testing
WillAyd Jan 10, 2020
0046c3c
refactored with bug fix
WillAyd Jan 10, 2020
77b7bae
simplified timedelta test
WillAyd Jan 10, 2020
3ef4aff
Added timedelta coverage
WillAyd Jan 10, 2020
960dce6
stylistic updates
WillAyd Jan 10, 2020
6d2c8da
Removed unneeded timedelta import
WillAyd Jan 10, 2020
9b431f2
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Jan 10, 2020
4a94f15
style updates
WillAyd Jan 10, 2020
40468bf
replace sprintf with snprintf
WillAyd Jan 11, 2020
0259370
ignore lint errors
WillAyd Jan 11, 2020
d1c00e5
Update test_pandas.py
WillAyd Jan 11, 2020
9efb929
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Jan 20, 2020
9cbc075
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Jan 20, 2020
29f497f
moved conversion func
WillAyd Jan 21, 2020
35d4a4b
fix note
WillAyd Jan 21, 2020
bd2c4db
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Feb 7, 2020
a8423d0
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Feb 12, 2020
ebe58c7
Whatsnew
WillAyd Feb 12, 2020
d8f7575
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Feb 20, 2020
8486e37
Merge remote-tracking branch 'upstream/master' into json-timedelta
WillAyd Mar 17, 2020
ef08ad6
more comprehensive testing
WillAyd Mar 17, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions pandas/_libs/src/ujson/python/date_conversions.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,3 +116,29 @@ npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base) {
npy_datetime npy_dt = npy_datetimestruct_to_datetime(NPY_FR_ns, &dts);
return NpyDateTimeToEpoch(npy_dt, base);
}

/*
 * Renders a duration held in an int64_t as an ISO duration string.
 *
 * The value is decomposed with NPY_FR_ns and formatted by
 * make_iso_8601_timedelta, which stores the string length in *len.
 *
 * Returns a buffer obtained from PyObject_Malloc on success. On failure a
 * Python exception is set (MemoryError or ValueError) and NULL is returned.
 */
char *int64ToIsoDuration(int64_t value, size_t *len) {
    // Max theoretical length of ISO Duration with 64 bit day
    // as the largest unit is 70 characters + 1 for a null terminator
    enum { ISO_DURATION_BUFSIZE = 71 };

    pandas_timedeltastruct parts;
    char *buf;

    pandas_timedelta_to_timedeltastruct(value, NPY_FR_ns, &parts);

    buf = PyObject_Malloc(ISO_DURATION_BUFSIZE);
    if (!buf) {
        PyErr_NoMemory();
        return NULL;
    }

    if (make_iso_8601_timedelta(&parts, buf, len) != -1) {
        return buf;
    }

    // Formatting failed: raise, then release the buffer we no longer need
    PyErr_SetString(PyExc_ValueError,
                    "Could not convert timedelta value to string");
    PyObject_Free(buf);
    return NULL;
}
2 changes: 2 additions & 0 deletions pandas/_libs/src/ujson/python/date_conversions.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,6 @@ char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base, size_t *len);
// Convert a Python Date/Datetime to Unix epoch with resolution base
npy_datetime PyDateTimeToEpoch(PyDateTime_Date *dt, NPY_DATETIMEUNIT base);

// Convert the int64_t representation of a duration to an ISO duration
// string; mutates len. Returns NULL with a Python exception set on failure.
char *int64ToIsoDuration(int64_t value, size_t *len);

#endif
82 changes: 36 additions & 46 deletions pandas/_libs/src/ujson/python/objToJSON.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ static PyTypeObject *cls_dataframe;
static PyTypeObject *cls_series;
static PyTypeObject *cls_index;
static PyTypeObject *cls_nat;
PyObject *cls_timedelta;

npy_int64 get_nat(void) { return NPY_MIN_INT64; }

Expand Down Expand Up @@ -164,7 +163,6 @@ void *initObjToJSON(void) {
cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index");
cls_series =
(PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series");
cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta");
Py_DECREF(mod_pandas);
}

Expand Down Expand Up @@ -357,6 +355,12 @@ static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused),
return int64ToIso(GET_TC(tc)->longValue, base, len);
}

/* JSON callback. Formats the longValue stored on the type context as an ISO
 * duration; returns the resulting char* and writes its length to *len */
static char *NpyTimeDeltaToIsoCallback(JSOBJ Py_UNUSED(unused),
                                       JSONTypeContext *tc, size_t *len) {
    int64_t duration = GET_TC(tc)->longValue;

    return int64ToIsoDuration(duration, len);
}

/* JSON callback */
static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
size_t *len) {
Expand Down Expand Up @@ -1445,7 +1449,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
1000000000LL; // nanoseconds per second
} else {
// datetime.* objects don't follow above rules
nanosecVal = PyDateTimeToEpoch(item, NPY_FR_ns);
nanosecVal =
PyDateTimeToEpoch((PyDateTime_Date *)item, NPY_FR_ns);
}
}
}
Expand All @@ -1457,31 +1462,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
strncpy(cLabel, "null", len);
} else {
if (enc->datetimeIso) {
// TODO: Vectorized Timedelta function
if ((type_num == NPY_TIMEDELTA) || (PyDelta_Check(item))) {
PyObject *td =
PyObject_CallFunction(cls_timedelta, "(O)", item);
if (td == NULL) {
Py_DECREF(item);
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}

PyObject *iso =
PyObject_CallMethod(td, "isoformat", NULL);
Py_DECREF(td);
if (iso == NULL) {
Py_DECREF(item);
NpyArr_freeLabels(ret, num);
ret = 0;
break;
}

len = strlen(PyUnicode_AsUTF8(iso));
cLabel = PyObject_Malloc(len + 1);
memcpy(cLabel, PyUnicode_AsUTF8(iso), len + 1);
Py_DECREF(iso);
cLabel = int64ToIsoDuration(nanosecVal, &len);
} else {
if (type_num == NPY_DATETIME) {
cLabel = int64ToIso(nanosecVal, base, &len);
Expand Down Expand Up @@ -1614,7 +1596,11 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {

if (enc->datetimeIso) {
PRINTMARK();
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
if (enc->npyType == NPY_TIMEDELTA) {
pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
} else {
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
}
// Currently no way to pass longVal to iso function, so use
// state management
GET_TC(tc)->longValue = longVal;
Expand Down Expand Up @@ -1695,7 +1681,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
PRINTMARK();
NPY_DATETIMEUNIT base =
((PyObjectEncoder *)tc->encoder)->datetimeUnit;
GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base);
GET_TC(tc)->longValue =
PyDateTimeToEpoch((PyDateTime_Date *)obj, base);
tc->type = JT_LONG;
}
return;
Expand All @@ -1721,7 +1708,8 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
PRINTMARK();
NPY_DATETIMEUNIT base =
((PyObjectEncoder *)tc->encoder)->datetimeUnit;
GET_TC(tc)->longValue = PyDateTimeToEpoch((PyDateTime_Date *)obj, base);
GET_TC(tc)->longValue =
PyDateTimeToEpoch((PyDateTime_Date *)obj, base);
tc->type = JT_LONG;
}
return;
Expand All @@ -1734,28 +1722,30 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
value = total_seconds(obj) * 1000000000LL; // nanoseconds per second
}

unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
if (scaleNanosecToUnit(&value, unit) != 0) {
// TODO: Add some kind of error handling here
}

exc = PyErr_Occurred();

if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
PRINTMARK();
goto INVALID;
}

PRINTMARK();
if (value == get_nat()) {
PRINTMARK();
tc->type = JT_NULL;
return;
}
} else if (enc->datetimeIso) {
pc->PyTypeToUTF8 = NpyTimeDeltaToIsoCallback;
tc->type = JT_UTF8;
} else {
unit = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
if (scaleNanosecToUnit(&value, unit) != 0) {
// TODO: Add some kind of error handling here
}

GET_TC(tc)->longValue = value;
exc = PyErr_Occurred();

PRINTMARK();
tc->type = JT_LONG;
if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
PRINTMARK();
goto INVALID;
}

tc->type = JT_LONG;
}
GET_TC(tc)->longValue = value;
return;
} else if (PyArray_IsScalar(obj, Integer)) {
PRINTMARK();
Expand Down
34 changes: 34 additions & 0 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.c
Original file line number Diff line number Diff line change
Expand Up @@ -905,3 +905,37 @@ int make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
outlen);
return -1;
}


/*
 * Formats *tds as an ISO 8601 duration of the form
 * "P<days>DT<hrs>H<min>M<sec>[.<fraction>]S" into outstr.
 *
 * The fraction is printed only down to the finest non-zero unit: three
 * digits when only ms is set, six when us is set, nine when ns is set, and
 * it is omitted entirely when all three are zero.
 *
 * outstr must provide at least 71 bytes: up to 59 characters for the
 * day/hour/minute/second prefix, up to 11 for the fraction and the trailing
 * "S", plus the null terminator.
 *
 * On success *outlen holds the number of characters written, not counting
 * the null terminator, and 0 is returned. Returns -1 if snprintf fails or a
 * piece does not fit in its window; *outlen then only covers the pieces
 * that were written completely.
 *
 * Review note: an isoformat method for timedeltas also exists on the Python
 * side, but getting it to work from the JSON module would be more
 * complicated than implementing the formatting in C, and less consistent
 * with the other datetime handling. A possible follow up is to use this C
 * function from the Timedelta class.
 */
int make_iso_8601_timedelta(pandas_timedeltastruct *tds,
                            char *outstr, size_t *outlen) {
    int written;
    int limit;

    *outlen = 0;

    written = snprintf(outstr, 60,  // NOLINT
                       "P%" NPY_INT64_FMT
                       "DT%" NPY_INT32_FMT
                       "H%" NPY_INT32_FMT
                       "M%" NPY_INT32_FMT,
                       tds->days, tds->hrs, tds->min, tds->sec);
    // snprintf returns a negative value on failure and the untruncated
    // length on overflow; neither may be used to advance outstr or *outlen
    if (written < 0 || written >= 60) {
        return -1;
    }
    *outlen += (size_t)written;
    outstr += written;

    if (tds->ns != 0) {
        limit = 12;
        written = snprintf(outstr, 12,  // NOLINT
                           ".%03" NPY_INT32_FMT
                           "%03" NPY_INT32_FMT
                           "%03" NPY_INT32_FMT
                           "S", tds->ms, tds->us, tds->ns);
    } else if (tds->us != 0) {
        limit = 9;
        written = snprintf(outstr, 9,  // NOLINT
                           ".%03" NPY_INT32_FMT
                           "%03" NPY_INT32_FMT
                           "S", tds->ms, tds->us);
    } else if (tds->ms != 0) {
        limit = 6;
        written = snprintf(outstr, 6,  // NOLINT
                           ".%03" NPY_INT32_FMT "S", tds->ms);
    } else {
        limit = 2;
        written = snprintf(outstr, 2,  // NOLINT
                           "%s", "S");
    }

    if (written < 0 || written >= limit) {
        return -1;
    }
    *outlen += (size_t)written;

    return 0;
}
10 changes: 10 additions & 0 deletions pandas/_libs/tslibs/src/datetime/np_datetime_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,4 +79,14 @@ get_datetime_iso_8601_strlen(int local, NPY_DATETIMEUNIT base);
int
make_iso_8601_datetime(npy_datetimestruct *dts, char *outstr, int outlen,
NPY_DATETIMEUNIT base);

/*
 * Converts a pandas_timedeltastruct to an ISO 8601 duration string.
 *
 * Sets *outlen to the length of the string written to outstr, not counting
 * the null terminator.
 *
 * Returns 0 on success; callers check for -1 to detect an error.
 */
int make_iso_8601_timedelta(pandas_timedeltastruct *tds, char *outstr,
size_t *outlen);
#endif // PANDAS__LIBS_TSLIBS_SRC_DATETIME_NP_DATETIME_STRINGS_H_
3 changes: 1 addition & 2 deletions pandas/tests/io/json/test_json_table_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,8 +603,7 @@ def test_timestamp_in_columns(self):
result = df.to_json(orient="table")
js = json.loads(result)
assert js["schema"]["fields"][1]["name"] == "2016-01-01T00:00:00.000Z"
# TODO - below expectation is not correct; see GH 28256
assert js["schema"]["fields"][2]["name"] == 10000
assert js["schema"]["fields"][2]["name"] == "P0DT0H0M10S"

@pytest.mark.parametrize(
"case",
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/io/json/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -1058,6 +1058,29 @@ def test_mixed_timedelta_datetime(self):
result = pd.read_json(frame.to_json(date_unit="ns"), dtype={"a": "int64"})
tm.assert_frame_equal(result, expected, check_index_type=False)

@pytest.mark.parametrize("as_object", [True, False])
@pytest.mark.parametrize("date_format", ["iso", "epoch"])
@pytest.mark.parametrize("timedelta_typ", [pd.Timedelta, timedelta])
def test_timedelta_to_json(self, as_object, date_format, timedelta_typ):
    """Check to_json output for a Series of timedeltas used as values and index.

    Covers both ``date_format`` settings, both timedelta types, and the case
    where a string element is appended to the data (``as_object``).
    """
    # GH28156: to_json not correctly formatting Timedelta
    values = [timedelta_typ(days=n) for n in (1, 2)]
    values.append(pd.NaT)
    if as_object:
        values.append("a")

    if date_format == "iso":
        expected = (
            '{"P1DT0H0M0S":"P1DT0H0M0S","P2DT0H0M0S":"P2DT0H0M0S","null":null}'
        )
    else:
        expected = '{"86400000":86400000,"172800000":172800000,"null":null}'

    if as_object:
        # The extra string element is serialised after the null entry
        expected = expected.replace("}", ',"a":"a"}')

    ser = pd.Series(values, index=values)
    assert ser.to_json(date_format=date_format) == expected

def test_default_handler(self):
value = object()
frame = DataFrame({"a": [7, value]})
Expand Down