Skip to content

Commit a895ac7

Browse files
WillAyd authored and jreback committed
Improve ISO Date Performance for JSON (#30496)
1 parent 64ddb07 commit a895ac7

File tree

3 files changed

+83
-69
lines changed

3 files changed

+83
-69
lines changed

asv_bench/benchmarks/io/json.py

+24
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,30 @@ def peakmem_to_json_wide(self, orient, frame):
132132
df.to_json(self.fname, orient=orient)
133133

134134

135+
class ToJSONISO(BaseIO):
    """Time ``DataFrame.to_json(date_format="iso")`` once per ``orient``.

    ``setup`` builds a frame of 10 ** 5 rows with an hourly date_range index,
    two timedelta columns and two datetime columns.
    """

    fname = "__test__.json"
    params = [["split", "columns", "index", "values", "records"]]
    param_names = ["orient"]

    def setup(self, orient):
        n_rows = 10 ** 5
        hourly_index = date_range("20000101", periods=n_rows, freq="H")
        td_values = timedelta_range(start=1, periods=n_rows, freq="s")
        ts_values = date_range(start=1, periods=n_rows, freq="s")
        # Each range backs two columns so the frame carries two timedelta
        # and two datetime columns.
        columns = {
            "td_1": td_values,
            "td_2": td_values,
            "ts_1": ts_values,
            "ts_2": ts_values,
        }
        self.df = DataFrame(columns, index=hourly_index)

    def time_iso_format(self, orient):
        # No path is passed, so nothing is written to ``fname`` here.
        self.df.to_json(orient=orient, date_format="iso")
158+
135159
class ToJSONLines(BaseIO):
136160

137161
fname = "__test__.json"

pandas/_libs/src/ujson/python/objToJSON.c

+57-67
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ static PyTypeObject *cls_dataframe;
5454
static PyTypeObject *cls_series;
5555
static PyTypeObject *cls_index;
5656
static PyTypeObject *cls_nat;
57-
PyObject *cls_timestamp;
5857
PyObject *cls_timedelta;
5958

6059
/* Returns NPY_MIN_INT64, the int64 value handed out as the NaT marker
 * (per this function's name). */
npy_int64 get_nat(void) {
    return NPY_MIN_INT64;
}
@@ -166,7 +165,6 @@ void *initObjToJSON(void) {
166165
cls_index = (PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Index");
167166
cls_series =
168167
(PyTypeObject *)PyObject_GetAttrString(mod_pandas, "Series");
169-
cls_timestamp = PyObject_GetAttrString(mod_pandas, "Timestamp");
170168
cls_timedelta = PyObject_GetAttrString(mod_pandas, "Timedelta");
171169
Py_DECREF(mod_pandas);
172170
}
@@ -408,30 +406,25 @@ static char *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *Py_UNUSED(tc),
408406
return (char *)PyUnicode_AsUTF8AndSize(_obj, (Py_ssize_t *)_outLen);
409407
}
410408

411-
/* returns a char* and mutates the pointer to *len */
412-
static char *NpyDateTimeToIso(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
413-
size_t *len) {
409+
/* Converts the int64_t representation of a datetime to ISO; mutates len */
410+
static char *int64ToIso(int64_t value, NPY_DATETIMEUNIT base, size_t *len) {
414411
npy_datetimestruct dts;
415412
int ret_code;
416-
int64_t longVal = GET_TC(tc)->longValue;
417413

418-
pandas_datetime_to_datetimestruct(longVal, NPY_FR_ns, &dts);
414+
pandas_datetime_to_datetimestruct(value, NPY_FR_ns, &dts);
419415

420-
NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
421416
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
422417
char *result = PyObject_Malloc(*len);
423418

424419
if (result == NULL) {
425420
PyErr_NoMemory();
426-
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
427421
return NULL;
428422
}
429423

430424
ret_code = make_iso_8601_datetime(&dts, result, *len, base);
431425
if (ret_code != 0) {
432426
PyErr_SetString(PyExc_ValueError,
433427
"Could not convert datetime value to string");
434-
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
435428
PyObject_Free(result);
436429
}
437430

@@ -441,30 +434,33 @@ static char *NpyDateTimeToIso(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc,
441434
return result;
442435
}
443436

437+
/* JSON callback. returns a char* and mutates the pointer to *len */
438+
static char *NpyDateTimeToIsoCallback(JSOBJ Py_UNUSED(unused), JSONTypeContext *tc,
439+
size_t *len) {
440+
NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
441+
return int64ToIso(GET_TC(tc)->longValue, base, len);
442+
}
443+
444444
/*
 * Scales dt to the resolution given by base via scaleNanosecToUnit and
 * returns the scaled value (dt is presumably in nanoseconds, per the helper's
 * name). The status returned by scaleNanosecToUnit is not inspected here.
 */
static npy_datetime NpyDateTimeToEpoch(npy_datetime dt, NPY_DATETIMEUNIT base) {
    npy_datetime scaled = dt;
    scaleNanosecToUnit(&scaled, base);
    return scaled;
}
448448

449-
static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) {
449+
/* Convert PyDatetime To ISO C-string. mutates len */
450+
static char *PyDateTimeToIso(PyDateTime_Date *obj, NPY_DATETIMEUNIT base,
451+
size_t *len) {
450452
npy_datetimestruct dts;
451453
int ret;
452454

453-
if (!PyDateTime_Check(obj)) {
454-
// TODO: raise TypeError
455-
}
456-
457455
ret = convert_pydatetime_to_datetimestruct(obj, &dts);
458456
if (ret != 0) {
459457
if (!PyErr_Occurred()) {
460458
PyErr_SetString(PyExc_ValueError,
461459
"Could not convert PyDateTime to numpy datetime");
462460
}
463-
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
464461
return NULL;
465462
}
466463

467-
NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
468464
*len = (size_t)get_datetime_iso_8601_strlen(0, base);
469465
char *result = PyObject_Malloc(*len);
470466
ret = make_iso_8601_datetime(&dts, result, *len, base);
@@ -473,7 +469,6 @@ static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) {
473469
PRINTMARK();
474470
PyErr_SetString(PyExc_ValueError,
475471
"Could not convert datetime value to string");
476-
((JSONObjectEncoder *)tc->encoder)->errorMsg = "";
477472
PyObject_Free(result);
478473
return NULL;
479474
}
@@ -484,6 +479,19 @@ static char *PyDateTimeToIso(JSOBJ obj, JSONTypeContext *tc, size_t *len) {
484479
return result;
485480
}
486481

482+
/* JSON callback */
483+
static char *PyDateTimeToIsoCallback(JSOBJ obj, JSONTypeContext *tc,
484+
size_t *len) {
485+
486+
if (!PyDateTime_Check(obj)) {
487+
PyErr_SetString(PyExc_TypeError, "Expected datetime object");
488+
return NULL;
489+
}
490+
491+
NPY_DATETIMEUNIT base = ((PyObjectEncoder *)tc->encoder)->datetimeUnit;
492+
return PyDateTimeToIso(obj, base, len);
493+
}
494+
487495
static npy_datetime PyDateTimeToEpoch(PyObject *obj, NPY_DATETIMEUNIT base) {
488496
npy_datetimestruct dts;
489497
int ret;
@@ -1518,7 +1526,8 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
15181526
npy_intp num) {
15191527
// NOTE this function steals a reference to labels.
15201528
PyObject *item = NULL;
1521-
npy_intp i, stride, len;
1529+
size_t len;
1530+
npy_intp i, stride;
15221531
char **ret;
15231532
char *dataptr, *cLabel;
15241533
int type_num;
@@ -1559,8 +1568,7 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
15591568
break;
15601569
}
15611570

1562-
// TODO: for any matches on type_num (date and timedeltas) should use a
1563-
// vectorized solution to convert to epoch or iso formats
1571+
// TODO: vectorized timedelta solution
15641572
if (enc->datetimeIso &&
15651573
(type_num == NPY_TIMEDELTA || PyDelta_Check(item))) {
15661574
PyObject *td = PyObject_CallFunction(cls_timedelta, "(O)", item);
@@ -1583,54 +1591,36 @@ char **NpyArr_encodeLabels(PyArrayObject *labels, PyObjectEncoder *enc,
15831591
cLabel = (char *)PyUnicode_AsUTF8(iso);
15841592
Py_DECREF(iso);
15851593
len = strlen(cLabel);
1586-
} else if (PyTypeNum_ISDATETIME(type_num) || PyDateTime_Check(item) ||
1587-
PyDate_Check(item)) {
1588-
PyObject *ts = PyObject_CallFunction(cls_timestamp, "(O)", item);
1589-
if (ts == NULL) {
1590-
Py_DECREF(item);
1591-
NpyArr_freeLabels(ret, num);
1592-
ret = 0;
1593-
break;
1594+
} else if (PyTypeNum_ISDATETIME(type_num)) {
1595+
NPY_DATETIMEUNIT base = enc->datetimeUnit;
1596+
npy_int64 longVal;
1597+
PyArray_VectorUnaryFunc *castfunc =
1598+
PyArray_GetCastFunc(PyArray_DescrFromType(type_num), NPY_INT64);
1599+
if (!castfunc) {
1600+
PyErr_Format(PyExc_ValueError,
1601+
"Cannot cast numpy dtype %d to long",
1602+
enc->npyType);
15941603
}
1595-
1604+
castfunc(dataptr, &longVal, 1, NULL, NULL);
15961605
if (enc->datetimeIso) {
1597-
PyObject *iso = PyObject_CallMethod(ts, "isoformat", NULL);
1598-
Py_DECREF(ts);
1599-
if (iso == NULL) {
1600-
Py_DECREF(item);
1601-
NpyArr_freeLabels(ret, num);
1602-
ret = 0;
1603-
break;
1606+
cLabel = int64ToIso(longVal, base, &len);
1607+
} else {
1608+
if (!scaleNanosecToUnit(&longVal, base)) {
1609+
// TODO: This gets hit but somehow doesn't cause errors
1610+
// need to clean up (elsewhere in module as well)
16041611
}
1605-
1606-
cLabel = (char *)PyUnicode_AsUTF8(iso);
1607-
Py_DECREF(iso);
1612+
cLabel = PyObject_Malloc(21); // 21 chars for int64
1613+
sprintf(cLabel, "%" NPY_INT64_FMT, longVal);
16081614
len = strlen(cLabel);
1615+
}
1616+
} else if (PyDateTime_Check(item) || PyDate_Check(item)) {
1617+
NPY_DATETIMEUNIT base = enc->datetimeUnit;
1618+
if (enc->datetimeIso) {
1619+
cLabel = PyDateTimeToIso((PyDateTime_Date *)item, base, &len);
16091620
} else {
1610-
npy_int64 value;
1611-
// TODO: refactor to not duplicate what goes on in
1612-
// beginTypeContext
1613-
if (PyObject_HasAttrString(ts, "value")) {
1614-
PRINTMARK();
1615-
value = get_long_attr(ts, "value");
1616-
} else {
1617-
PRINTMARK();
1618-
value = total_seconds(ts) *
1619-
1000000000LL; // nanoseconds per second
1620-
}
1621-
Py_DECREF(ts);
1622-
1623-
NPY_DATETIMEUNIT unit = enc->datetimeUnit;
1624-
if (scaleNanosecToUnit(&value, unit) != 0) {
1625-
Py_DECREF(item);
1626-
NpyArr_freeLabels(ret, num);
1627-
ret = 0;
1628-
break;
1629-
}
1630-
1631-
char buf[21] = {0}; // 21 chars for 2**63 as string
1632-
cLabel = buf;
1633-
sprintf(buf, "%" NPY_INT64_FMT, value);
1621+
cLabel = PyObject_Malloc(21); // 21 chars for int64
1622+
sprintf(cLabel, "%" NPY_DATETIME_FMT,
1623+
PyDateTimeToEpoch(item, base));
16341624
len = strlen(cLabel);
16351625
}
16361626
} else { // Fallback to string representation
@@ -1740,7 +1730,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
17401730

17411731
if (enc->datetimeIso) {
17421732
PRINTMARK();
1743-
pc->PyTypeToUTF8 = NpyDateTimeToIso;
1733+
pc->PyTypeToUTF8 = NpyDateTimeToIsoCallback;
17441734
// Currently no way to pass longVal to iso function, so use
17451735
// state management
17461736
GET_TC(tc)->longValue = longVal;
@@ -1815,7 +1805,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
18151805
PRINTMARK();
18161806
if (enc->datetimeIso) {
18171807
PRINTMARK();
1818-
pc->PyTypeToUTF8 = PyDateTimeToIso;
1808+
pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
18191809
tc->type = JT_UTF8;
18201810
} else {
18211811
PRINTMARK();
@@ -1841,7 +1831,7 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
18411831
PRINTMARK();
18421832
if (enc->datetimeIso) {
18431833
PRINTMARK();
1844-
pc->PyTypeToUTF8 = PyDateTimeToIso;
1834+
pc->PyTypeToUTF8 = PyDateTimeToIsoCallback;
18451835
tc->type = JT_UTF8;
18461836
} else {
18471837
PRINTMARK();

pandas/tests/io/json/test_pandas.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -854,7 +854,7 @@ def test_date_format_frame(self, date, date_unit):
854854
json = df.to_json(date_format="iso")
855855
result = read_json(json)
856856
expected = df.copy()
857-
# expected.index = expected.index.tz_localize("UTC")
857+
expected.index = expected.index.tz_localize("UTC")
858858
expected["date"] = expected["date"].dt.tz_localize("UTC")
859859
tm.assert_frame_equal(result, expected)
860860

@@ -884,7 +884,7 @@ def test_date_format_series(self, date, date_unit):
884884
json = ts.to_json(date_format="iso")
885885
result = read_json(json, typ="series")
886886
expected = ts.copy()
887-
# expected.index = expected.index.tz_localize("UTC")
887+
expected.index = expected.index.tz_localize("UTC")
888888
expected = expected.dt.tz_localize("UTC")
889889
tm.assert_series_equal(result, expected)
890890

0 commit comments

Comments
 (0)