diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index a490e250943f5..ed0fb5b8fe342 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -1,3 +1,5 @@
+import sys
+
 import numpy as np
 
 from pandas import DataFrame, concat, date_range, read_json, timedelta_range
@@ -82,6 +84,7 @@ def setup(self, orient, frame):
         timedeltas = timedelta_range(start=1, periods=N, freq="s")
         datetimes = date_range(start=1, periods=N, freq="s")
         ints = np.random.randint(100000000, size=N)
+        longints = sys.maxsize * np.random.randint(100000000, size=N)
         floats = np.random.randn(N)
         strings = tm.makeStringIndex(N)
         self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
@@ -120,6 +123,18 @@ def setup(self, orient, frame):
             index=index,
         )
 
+        self.df_longint_float_str = DataFrame(
+            {
+                "longint_1": longints,
+                "longint_2": longints,
+                "float_1": floats,
+                "float_2": floats,
+                "str_1": strings,
+                "str_2": strings,
+            },
+            index=index,
+        )
+
     def time_to_json(self, orient, frame):
         getattr(self, frame).to_json(self.fname, orient=orient)
 
@@ -172,6 +187,7 @@ def setup(self):
         timedeltas = timedelta_range(start=1, periods=N, freq="s")
         datetimes = date_range(start=1, periods=N, freq="s")
         ints = np.random.randint(100000000, size=N)
+        longints = sys.maxsize * np.random.randint(100000000, size=N)
         floats = np.random.randn(N)
         strings = tm.makeStringIndex(N)
         self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
@@ -209,6 +225,17 @@ def setup(self):
             },
             index=index,
         )
+        self.df_longint_float_str = DataFrame(
+            {
+                "longint_1": longints,
+                "longint_2": longints,
+                "float_1": floats,
+                "float_2": floats,
+                "str_1": strings,
+                "str_2": strings,
+            },
+            index=index,
+        )
 
     def time_floats_with_int_idex_lines(self):
         self.df.to_json(self.fname, orient="records", lines=True)
@@ -225,6 +252,9 @@ def time_float_int_lines(self):
     def time_float_int_str_lines(self):
         self.df_int_float_str.to_json(self.fname, orient="records", lines=True)
 
+    def time_float_longint_str_lines(self):
+        self.df_longint_float_str.to_json(self.fname, orient="records", lines=True)
+
 
 class ToJSONMem:
     def setup_cache(self):
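Note on the new longint columns: NumPy integer arrays are fixed-width, so a product like `sys.maxsize * np.random.randint(...)` generally stays in int64 and wraps on overflow; the encoder's new bignum path is reached by object-dtype columns holding genuine Python ints. A minimal sketch of the distinction (illustrative only, not part of the patch):

    import sys

    import numpy as np
    import pandas as pd

    # Fixed-width: the product is computed in int64 and wraps silently
    wrapped = sys.maxsize * np.random.randint(100000000, size=3)
    print(wrapped.dtype)  # int64

    # Arbitrary precision: values outside the int64/uint64 range force an
    # object-dtype column of Python ints, which is what reaches JT_BIGNUM
    s = pd.Series([sys.maxsize + 1, -(sys.maxsize + 2)])
    print(s.dtype)  # object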
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 7c9fa53568f45..d6f313b5c3b35 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -1020,6 +1020,7 @@ I/O
 - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`)
 - :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`)
 - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`)
+- Bug in :meth:`ujson.encode` was raising an ``OverflowError`` with numbers larger than ``sys.maxsize`` (:issue:`34395`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h
index acb66b668e8dc..69284e1c3f2ab 100644
--- a/pandas/_libs/src/ujson/lib/ultrajson.h
+++ b/pandas/_libs/src/ujson/lib/ultrajson.h
@@ -150,6 +150,7 @@ enum JSTYPES {
     JT_INT,     // (JSINT32 (signed 32-bit))
     JT_LONG,    // (JSINT64 (signed 64-bit))
     JT_DOUBLE,  // (double)
+    JT_BIGNUM,  // integer larger than sys.maxsize
     JT_UTF8,    // (char 8-bit)
     JT_ARRAY,   // Array structure
     JT_OBJECT,  // Key/Value structure
@@ -187,6 +188,8 @@ typedef struct __JSONObjectEncoder {
     JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc);
     JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
     double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
+    const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc,
+                                        size_t *_outLen);
 
     /*
     Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT)
diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c
index 065e3b2c60cf9..51aa39a16920e 100644
--- a/pandas/_libs/src/ujson/lib/ultrajsonenc.c
+++ b/pandas/_libs/src/ujson/lib/ultrajsonenc.c
@@ -1107,6 +1107,34 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
             Buffer_AppendCharUnchecked(enc, '\"');
             break;
         }
+
+        case JT_BIGNUM: {
+            value = enc->getBigNumStringValue(obj, &tc, &szlen);
+
+            Buffer_Reserve(enc, RESERVE_STRING(szlen));
+            if (enc->errorMsg) {
+                enc->endTypeContext(obj, &tc);
+                return;
+            }
+
+            if (enc->forceASCII) {
+                if (!Buffer_EscapeStringValidated(obj, enc, value,
+                                                  value + szlen)) {
+                    enc->endTypeContext(obj, &tc);
+                    enc->level--;
+                    return;
+                }
+            } else {
+                if (!Buffer_EscapeStringUnvalidated(enc, value,
+                                                    value + szlen)) {
+                    enc->endTypeContext(obj, &tc);
+                    enc->level--;
+                    return;
+                }
+            }
+
+            break;
+        }
     }
 
     enc->endTypeContext(obj, &tc);
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
index c71e941f7d6e8..1de9642761961 100644
--- a/pandas/_libs/src/ujson/python/objToJSON.c
+++ b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -1629,15 +1629,20 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
     if (PyLong_Check(obj)) {
         PRINTMARK();
         tc->type = JT_LONG;
-        GET_TC(tc)->longValue = PyLong_AsLongLong(obj);
+        int overflow = 0;
+        GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow);
+        int err;
+        err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred();
 
-        exc = PyErr_Occurred();
-
-        if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
+        if (overflow) {
+            PRINTMARK();
+            tc->type = JT_BIGNUM;
+        }
+        else if (err) {
             PRINTMARK();
             goto INVALID;
         }
 
         return;
     } else if (PyFloat_Check(obj)) {
         PRINTMARK();
@@ -2105,7 +2110,6 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
     NpyArr_freeLabels(GET_TC(tc)->columnLabels, GET_TC(tc)->columnLabelsLen);
     GET_TC(tc)->columnLabels = NULL;
 
-
     PyObject_Free(GET_TC(tc)->cStr);
     GET_TC(tc)->cStr = NULL;
     PyObject_Free(tc->prv);
@@ -2126,6 +2130,19 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
     return GET_TC(tc)->doubleValue;
 }
 
+const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc,
+                                        size_t *_outLen) {
+    PyObject *repr = PyObject_Str(obj);
+    const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *)_outLen);
+    char *bytes = PyObject_Malloc(*_outLen + 1);
+    memcpy(bytes, str, *_outLen + 1);
+    GET_TC(tc)->cStr = bytes;
+
+    Py_DECREF(repr);
+
+    return GET_TC(tc)->cStr;
+}
+
 static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); }
 
 void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
@@ -2181,6 +2198,7 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
         Object_getLongValue,
         NULL,  // getIntValue is unused
         Object_getDoubleValue,
+        Object_getBigNumStringValue,
         Object_iterBegin,
         Object_iterNext,
         Object_iterEnd,
@@ -2294,7 +2312,6 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
         if (ret != buffer) {
             encoder->free(ret);
         }
-
         PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg);
         return NULL;
     }
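Taken together, the C changes chain as follows: PyLong_AsLongLongAndOverflow flags a Python int that cannot fit in JSINT64, Object_beginTypeContext retags it as JT_BIGNUM instead of raising, and encode() fetches str(obj) via getBigNumStringValue, escaping the digits into the buffer without surrounding quotes so the result is a bare JSON number. The observable behavior on a build with this patch (a sketch using the vendored module the tests import):

    import sys

    import pandas._libs.json as ujson  # pandas' bundled ujson

    big = sys.maxsize + 1  # 2**63, one past the JSINT64 maximum
    # Before this patch: OverflowError. After: the decimal digits, unquoted.
    print(ujson.encode(big))  # 9223372036854775808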
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 8578b31fbb81e..10f49b9b81528 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -4,6 +4,7 @@
 from io import StringIO
 import json
 import os
+import sys
 
 import numpy as np
 import pytest
@@ -1242,6 +1243,29 @@ def test_read_jsonl_unicode_chars(self):
         expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
+    def test_to_json_large_numbers(self, bigNum):
+        # GH34473
+        series = Series(bigNum, dtype=object, index=["articleId"])
+        json = series.to_json()
+        expected = '{"articleId":' + str(bigNum) + "}"
+        assert json == expected
+        # GH 20599: read_json cannot yet parse numbers this large
+        with pytest.raises(ValueError):
+            json = StringIO(json)
+            result = read_json(json)
+            tm.assert_series_equal(series, result)
+
+        df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0])
+        json = df.to_json()
+        expected = '{"0":{"articleId":' + str(bigNum) + "}}"
+        assert json == expected
+        # GH 20599: read_json cannot yet parse numbers this large
+        with pytest.raises(ValueError):
+            json = StringIO(json)
+            result = read_json(json)
+            tm.assert_frame_equal(df, result)
+
     def test_read_json_large_numbers(self):
         # GH18842
         json = '{"articleId": "1404366058080022500245"}'
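Note the asymmetry these tests pin down: to_json can now write such values, but read_json still rejects numbers wider than int64 (GH 20599), so the round trip fails by design for now. A sketch of the expected behavior (mirrors the test above, assuming a patched build):

    import sys
    from io import StringIO

    import pandas as pd

    s = pd.Series(sys.maxsize + 1, dtype=object, index=["articleId"])
    payload = s.to_json()  # '{"articleId":9223372036854775808}'

    # Reading it back is still expected to raise ValueError (GH 20599):
    # pd.read_json(StringIO(payload), typ="series")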
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
index 7dc73d5be1538..e1a136e1a3728 100644
--- a/pandas/tests/io/json/test_ujson.py
+++ b/pandas/tests/io/json/test_ujson.py
@@ -5,6 +5,7 @@
 import locale
 import math
 import re
+import sys
 import time
 
 import dateutil
@@ -559,6 +560,16 @@ def test_encode_long_conversion(self):
         assert output == json.dumps(long_input)
         assert long_input == ujson.decode(output)
 
+    @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
+    def test_dumps_ints_larger_than_maxsize(self, bigNum):
+        # GH34395
+        encoding = ujson.encode(bigNum)
+        assert str(bigNum) == encoding
+
+        # GH20599: decoding integers this large is still unsupported
+        with pytest.raises(ValueError):
+            assert ujson.loads(encoding) == bigNum
+
     @pytest.mark.parametrize(
         "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"]
     )
@@ -570,18 +581,6 @@ def test_loads_non_str_bytes_raises(self):
         with pytest.raises(TypeError, match=msg):
             ujson.loads(None)
 
-    def test_encode_numeric_overflow(self):
-        with pytest.raises(OverflowError):
-            ujson.encode(12839128391289382193812939)
-
-    def test_encode_numeric_overflow_nested(self):
-        class Nested:
-            x = 12839128391289382193812939
-
-        for _ in range(0, 100):
-            with pytest.raises(OverflowError):
-                ujson.encode(Nested())
-
     @pytest.mark.parametrize("val", [3590016419, 2 ** 31, 2 ** 32, (2 ** 32) - 1])
     def test_decode_number_with_32bit_sign_bit(self, val):
         # Test that numbers that fit within 32 bits but would have the
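With test_encode_numeric_overflow and its nested variant removed, oversized integers are no longer an encode-time error, which brings the vendored encoder in line with the standard library; only decoding remains bound to int64. For comparison, a stdlib-only sketch that runs on any Python:

    import json
    import sys

    big = sys.maxsize + 1
    assert json.dumps(big) == str(big)  # stdlib json has always encoded big ints
    assert json.loads(str(big)) == big  # and decodes them back losslessly
    # The patched ujson.encode now agrees with json.dumps here, while
    # ujson.loads, per the test above, still raises ValueError past int64.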