Skip to content

Commit d85b93d

Browse files
arw2019 and WillAyd authored
fix to_json for numbers larger than sys.maxsize (#34473)
* BUG: overflow on to_json with numbers larger than sys.maxsize * TST: overflow on to_json with numbers larger than sys.maxsize (#34395) * DOC: update with issue #34395 * TST: removed unused import * ENH: added case JT_BIGNUM to encode * ENH: added JT_BIGNUM to JSTYPES * BUG: changed error for ints>sys.maxsize into JT_BIGNUM * ENH: removed debug statements * BUG: removed dumps wrapper * removed bigNum from TypeContext * TST: fixed bug in the test * added pointer to string rep converter for BigNum * TST: removed ujson.loads from the test * added getBigNumStringValue * added code to JT_BIGNUM handler by analogy with JT_UTF8 * TST: update pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd <[email protected]> * added Object_getBigNumStringValue to pyEncoder * added skeletal code for Object_GetBigNumStringValue * completed Object_getBigNumStringValue using PyObject_Repr * BUG: changed Object_getBigNumStringValue * improved Object_getBigNumStringValue some more * update getBigNumStringValue argument * corrected Object_getBigNumStringValue * more fixes to Object_getBigNumStringValue * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <[email protected]> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <[email protected]> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <[email protected]> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <[email protected]> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <[email protected]> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <[email protected]> * Update pandas/_libs/src/ujson/python/objToJSON.c * Update pandas/_libs/src/ujson/python/objToJSON.c * updated pyEncoder for JT_BIGNUM * updated pyEncoder * moved getBigNumStringValue to pyEncoder * fixed declaration of Object_getBigNumStringValue * fixed Object_getBigNumStringValue * catch 
overflow error with PyLong_AsLongLongAndOverflow * remove unnecessary error check * added shortcircuit for error check * simplify int overflow error catching Co-authored-by: William Ayd <[email protected]> * Update long int test in pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd <[email protected]> * removed tests expecting numeric overflow * remove underscore from overflow Co-authored-by: William Ayd <[email protected]> * removed underscores from _overflow everywhere * fixed small typo * fix type of exc * deleted numeric overflow tests * remove extraneous condition in if statement Co-authored-by: William Ayd <[email protected]> * remove extraneous condition in if statement Co-authored-by: William Ayd <[email protected]> * change _Bool into int Co-authored-by: William Ayd <[email protected]> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <[email protected]> * Update pandas/_libs/src/ujson/lib/ultrajsonenc.c Co-authored-by: William Ayd <[email protected]> * allocate an extra byte in Object_getBigNumStringValue Co-authored-by: William Ayd <[email protected]> * allocate an extra byte in Object_getBigNumStringValue Co-authored-by: William Ayd <[email protected]> * reinstate RESERVE_STRING(szlen) in JT_BIGNUM case * replaced (private) with (public) in whatnew * release bytes in Object_endTypeContext * in JT_BIGNUM change if+if into if+else if * added reallocation of bigNum_bytes * removed bigNum_bytes * added to_json test for ints>sys.maxsize * Use python malloc to match PyObject_Free in endTypeContext Co-authored-by: William Ayd <[email protected]> * TST: added manually constructed strs to compare encodings * fixed styling to minimize diff with master * fixed styling * fixed conflicts with master * fix styling to minimize diff * fix styling to minimize diff * fixed styling * added negative nigNum to test_to_json_large_numers * added negative nigNum to test_to_json_large_numers * Update pandas/tests/io/json/test_ujson.py 
Co-authored-by: William Ayd <[email protected]> * fixe test_to_json_for_large_nums for -ve * TST: added xfail for ujson.encode with long int input * TST: fixed variable names in test_to_json_large_numbers * TST: added xfail test for json.decode Series with long int * TST: added xfail test for json.decode DataFrame with long int * BENCH: added benchmarks for long ints Co-authored-by: William Ayd <[email protected]>
1 parent f9e4c8c commit d85b93d

File tree

7 files changed

+123
-19
lines changed

7 files changed

+123
-19
lines changed

asv_bench/benchmarks/io/json.py

+30
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import sys
2+
13
import numpy as np
24

35
from pandas import DataFrame, concat, date_range, read_json, timedelta_range
@@ -82,6 +84,7 @@ def setup(self, orient, frame):
8284
timedeltas = timedelta_range(start=1, periods=N, freq="s")
8385
datetimes = date_range(start=1, periods=N, freq="s")
8486
ints = np.random.randint(100000000, size=N)
87+
longints = sys.maxsize * np.random.randint(100000000, size=N)
8588
floats = np.random.randn(N)
8689
strings = tm.makeStringIndex(N)
8790
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
@@ -120,6 +123,18 @@ def setup(self, orient, frame):
120123
index=index,
121124
)
122125

126+
self.df_longint_float_str = DataFrame(
127+
{
128+
"longint_1": longints,
129+
"longint_2": longints,
130+
"float_1": floats,
131+
"float_2": floats,
132+
"str_1": strings,
133+
"str_2": strings,
134+
},
135+
index=index,
136+
)
137+
123138
def time_to_json(self, orient, frame):
    """Benchmark ``to_json`` on the frame stored under attribute ``frame``.

    The attribute named by ``frame`` is looked up on the benchmark instance
    and written to ``self.fname`` using the requested ``orient``.
    """
    target = getattr(self, frame)
    target.to_json(self.fname, orient=orient)
125140

@@ -172,6 +187,7 @@ def setup(self):
172187
timedeltas = timedelta_range(start=1, periods=N, freq="s")
173188
datetimes = date_range(start=1, periods=N, freq="s")
174189
ints = np.random.randint(100000000, size=N)
190+
longints = sys.maxsize * np.random.randint(100000000, size=N)
175191
floats = np.random.randn(N)
176192
strings = tm.makeStringIndex(N)
177193
self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
@@ -209,6 +225,17 @@ def setup(self):
209225
},
210226
index=index,
211227
)
228+
self.df_longint_float_str = DataFrame(
229+
{
230+
"longint_1": longints,
231+
"longint_2": longints,
232+
"float_1": floats,
233+
"float_2": floats,
234+
"str_1": strings,
235+
"str_2": strings,
236+
},
237+
index=index,
238+
)
212239

213240
def time_floats_with_int_idex_lines(self):
    """Benchmark line-delimited, records-oriented export of ``self.df``."""
    frame = self.df
    frame.to_json(self.fname, orient="records", lines=True)
@@ -225,6 +252,9 @@ def time_float_int_lines(self):
225252
def time_float_int_str_lines(self):
    """Benchmark line-delimited, records-oriented export of ``self.df_int_float_str``."""
    frame = self.df_int_float_str
    frame.to_json(self.fname, orient="records", lines=True)
227254

255+
def time_float_longint_str_lines(self):
    """Benchmark line-delimited, records-oriented export of ``self.df_longint_float_str``."""
    frame = self.df_longint_float_str
    frame.to_json(self.fname, orient="records", lines=True)
257+
228258

229259
class ToJSONMem:
230260
def setup_cache(self):

doc/source/whatsnew/v1.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1026,6 +1026,7 @@ I/O
10261026
- Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`)
10271027
- :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`)
10281028
- Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`)
1029+
- Bug in :meth:`ujson.encode` was raising an ``OverflowError`` with numbers larger than ``sys.maxsize`` (:issue:`34395`)
10291030

10301031
Plotting
10311032
^^^^^^^^

pandas/_libs/src/ujson/lib/ultrajson.h

+3
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ enum JSTYPES {
150150
JT_INT, // (JSINT32 (signed 32-bit))
151151
JT_LONG, // (JSINT64 (signed 64-bit))
152152
JT_DOUBLE, // (double)
153+
JT_BIGNUM, // integer larger than sys.maxsize
153154
JT_UTF8, // (char 8-bit)
154155
JT_ARRAY, // Array structure
155156
JT_OBJECT, // Key/Value structure
@@ -187,6 +188,8 @@ typedef struct __JSONObjectEncoder {
187188
JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc);
188189
JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
189190
double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
191+
const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc,
192+
size_t *_outLen);
190193

191194
/*
192195
Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT)

pandas/_libs/src/ujson/lib/ultrajsonenc.c

+29
Original file line numberDiff line numberDiff line change
@@ -1107,6 +1107,35 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
11071107
Buffer_AppendCharUnchecked(enc, '\"');
11081108
break;
11091109
}
1110+
1111+
case JT_BIGNUM: {
1112+
value = enc->getBigNumStringValue(obj, &tc, &szlen);
1113+
1114+
Buffer_Reserve(enc, RESERVE_STRING(szlen));
1115+
if (enc->errorMsg) {
1116+
enc->endTypeContext(obj, &tc);
1117+
return;
1118+
}
1119+
1120+
if (enc->forceASCII) {
1121+
if (!Buffer_EscapeStringValidated(obj, enc, value,
1122+
value + szlen)) {
1123+
enc->endTypeContext(obj, &tc);
1124+
enc->level--;
1125+
return;
1126+
}
1127+
} else {
1128+
if (!Buffer_EscapeStringUnvalidated(enc, value,
1129+
value + szlen)) {
1130+
enc->endTypeContext(obj, &tc);
1131+
enc->level--;
1132+
return;
1133+
}
1134+
}
1135+
1136+
break;
1137+
1138+
}
11101139
}
11111140

11121141
enc->endTypeContext(obj, &tc);

pandas/_libs/src/ujson/python/objToJSON.c

+24-7
Original file line numberDiff line numberDiff line change
@@ -1629,15 +1629,20 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
16291629
if (PyLong_Check(obj)) {
16301630
PRINTMARK();
16311631
tc->type = JT_LONG;
1632-
GET_TC(tc)->longValue = PyLong_AsLongLong(obj);
1632+
int overflow = 0;
1633+
GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow);
1634+
int err;
1635+
err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred();
16331636

1634-
exc = PyErr_Occurred();
1635-
1636-
if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
1637+
if (overflow){
1638+
PRINTMARK();
1639+
tc->type = JT_BIGNUM;
1640+
}
1641+
else if (err) {
16371642
PRINTMARK();
16381643
goto INVALID;
16391644
}
1640-
1645+
16411646
return;
16421647
} else if (PyFloat_Check(obj)) {
16431648
PRINTMARK();
@@ -2105,7 +2110,6 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
21052110
NpyArr_freeLabels(GET_TC(tc)->columnLabels,
21062111
GET_TC(tc)->columnLabelsLen);
21072112
GET_TC(tc)->columnLabels = NULL;
2108-
21092113
PyObject_Free(GET_TC(tc)->cStr);
21102114
GET_TC(tc)->cStr = NULL;
21112115
PyObject_Free(tc->prv);
@@ -2126,6 +2130,19 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
21262130
return GET_TC(tc)->doubleValue;
21272131
}
21282132

2133+
/*
 * Return the decimal text of a Python integer that does not fit in a
 * long long (the JT_BIGNUM case).
 *
 * obj      the Python object to stringify (converted with PyObject_Str).
 * tc       type context; on success the returned buffer is stored in
 *          GET_TC(tc)->cStr, which Object_endTypeContext releases with
 *          PyObject_Free -- hence the PyObject_Malloc allocation here.
 * _outLen  receives the byte length of the text, excluding the trailing NUL.
 *          It is always written (0 on failure) because the encoder sizes its
 *          output buffer from it before it inspects errorMsg.
 *
 * Returns the NUL-terminated buffer, or NULL on failure. On failure a Python
 * exception is set and the encoder's errorMsg is made non-NULL so the
 * JT_BIGNUM handler bails out instead of dereferencing a NULL string.
 */
const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc,
                                        size_t *_outLen) {
    /* NOTE(review): relies on JSONTypeContext exposing the owning encoder as
     * `tc->encoder`, the convention used elsewhere in objToJSON.c -- confirm. */
    JSONObjectEncoder *enc = (JSONObjectEncoder *)tc->encoder;
    PyObject *repr;
    const char *str;
    char *bytes;
    /* Use a real Py_ssize_t rather than aliasing the caller's size_t. */
    Py_ssize_t len = 0;

    *_outLen = 0;

    repr = PyObject_Str((PyObject *)obj);
    if (repr == NULL) {
        enc->errorMsg = "Could not convert big integer to string";
        return NULL;
    }

    /* `str` points into `repr`; it is only valid until repr is released. */
    str = PyUnicode_AsUTF8AndSize(repr, &len);
    if (str == NULL) {
        Py_DECREF(repr);
        enc->errorMsg = "Could not convert big integer to string";
        return NULL;
    }

    /* +1 so the terminating NUL is copied along with the digits. */
    bytes = PyObject_Malloc((size_t)len + 1);
    if (bytes == NULL) {
        Py_DECREF(repr);
        PyErr_NoMemory();
        enc->errorMsg = "Could not reserve memory block";
        return NULL;
    }
    memcpy(bytes, str, (size_t)len + 1);
    Py_DECREF(repr);

    GET_TC(tc)->cStr = bytes;
    *_outLen = (size_t)len;

    return GET_TC(tc)->cStr;
}
2145+
21292146
/* Release one reference to the Python object behind `_obj`. */
static void Object_releaseObject(JSOBJ _obj) {
    PyObject *held = (PyObject *)_obj;
    Py_DECREF(held);
}
21302147

21312148
void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
@@ -2181,6 +2198,7 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
21812198
Object_getLongValue,
21822199
NULL, // getIntValue is unused
21832200
Object_getDoubleValue,
2201+
Object_getBigNumStringValue,
21842202
Object_iterBegin,
21852203
Object_iterNext,
21862204
Object_iterEnd,
@@ -2294,7 +2312,6 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
22942312
if (ret != buffer) {
22952313
encoder->free(ret);
22962314
}
2297-
22982315
PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg);
22992316
return NULL;
23002317
}

pandas/tests/io/json/test_pandas.py

+24
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from io import StringIO
55
import json
66
import os
7+
import sys
78

89
import numpy as np
910
import pytest
@@ -1242,6 +1243,29 @@ def test_read_jsonl_unicode_chars(self):
12421243
expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
12431244
tm.assert_frame_equal(result, expected)
12441245

1246+
@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
def test_to_json_large_numbers(self, bigNum):
    """Integers beyond ``sys.maxsize`` are written by ``to_json`` as bare numbers.

    Covers both a Series and a DataFrame holding the value with object
    dtype, and pins the current read-side behaviour: ``read_json`` raises
    ``ValueError`` on the resulting document (GH 20599).
    """
    # GH34473
    series = Series(bigNum, dtype=object, index=["articleId"])
    # Named ``*_json`` so the module-level ``json`` import is not shadowed.
    series_json = series.to_json()
    assert series_json == '{"articleId":' + str(bigNum) + "}"
    # GH 20599: only the call that raises belongs inside the context manager;
    # a round-trip comparison placed after it could never execute.
    with pytest.raises(ValueError):
        read_json(StringIO(series_json))

    df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0])
    frame_json = df.to_json()
    assert frame_json == '{"0":{"articleId":' + str(bigNum) + "}}"
    # GH 20599
    with pytest.raises(ValueError):
        read_json(StringIO(frame_json))
1268+
12451269
def test_read_json_large_numbers(self):
12461270
# GH18842
12471271
json = '{"articleId": "1404366058080022500245"}'

pandas/tests/io/json/test_ujson.py

+12-12
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import locale
66
import math
77
import re
8+
import sys
89
import time
910

1011
import dateutil
@@ -559,6 +560,17 @@ def test_encode_long_conversion(self):
559560
assert output == json.dumps(long_input)
560561
assert long_input == ujson.decode(output)
561562

563+
@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
def test_dumps_ints_larger_than_maxsize(self, bigNum):
    """``ujson.encode`` emits the plain decimal text of out-of-range integers.

    The parametrized ``bigNum`` is used as given, so both the positive and
    the negative out-of-range value are exercised.
    """
    # GH34395
    encoding = ujson.encode(bigNum)
    assert str(bigNum) == encoding

    # GH20599: decoding such a value is expected to raise for now.
    with pytest.raises(ValueError):
        ujson.loads(encoding)
573+
562574
@pytest.mark.parametrize(
563575
"int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"]
564576
)
@@ -570,18 +582,6 @@ def test_loads_non_str_bytes_raises(self):
570582
with pytest.raises(TypeError, match=msg):
571583
ujson.loads(None)
572584

573-
def test_encode_numeric_overflow(self):
574-
with pytest.raises(OverflowError):
575-
ujson.encode(12839128391289382193812939)
576-
577-
def test_encode_numeric_overflow_nested(self):
578-
class Nested:
579-
x = 12839128391289382193812939
580-
581-
for _ in range(0, 100):
582-
with pytest.raises(OverflowError):
583-
ujson.encode(Nested())
584-
585585
@pytest.mark.parametrize("val", [3590016419, 2 ** 31, 2 ** 32, (2 ** 32) - 1])
586586
def test_decode_number_with_32bit_sign_bit(self, val):
587587
# Test that numbers that fit within 32 bits but would have the

0 commit comments

Comments
 (0)