diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h index 69284e1c3f2ab..ef034b1ac233a 100644 --- a/pandas/_libs/src/ujson/lib/ultrajson.h +++ b/pandas/_libs/src/ujson/lib/ultrajson.h @@ -304,6 +304,7 @@ typedef struct __JSONObjectDecoder { JSOBJ (*newInt)(void *prv, JSINT32 value); JSOBJ (*newLong)(void *prv, JSINT64 value); JSOBJ (*newDouble)(void *prv, double value); + JSOBJ (*newBigNum)(void *prv, char* cStr); void (*releaseObject)(void *prv, JSOBJ obj, void *decoder); JSPFN_MALLOC malloc; JSPFN_FREE free; diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c index 36eb170f8048f..265576a7f9004 100644 --- a/pandas/_libs/src/ujson/lib/ultrajsondec.c +++ b/pandas/_libs/src/ujson/lib/ultrajsondec.c @@ -43,6 +43,7 @@ Numeric decoder derived from from TCL library #include #include #include +#include #include #include #include @@ -64,6 +65,7 @@ struct DecoderState { int escHeap; int lastType; JSUINT32 objDepth; + char *cStr; // storage for BigNum void *prv; JSONObjectDecoder *dec; }; @@ -118,6 +120,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { int intNeg = 1; int mantSize = 0; JSUINT64 intValue; + JSLONG newDigit; int chr; int decimalCount = 0; double frcValue = 0.0; @@ -125,6 +128,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { double expValue; char *offset = ds->start; + JSUINT64 overflowLimit = LLONG_MAX; if (*(offset) == 'I') { @@ -160,12 +164,35 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { // FIXME: Check for arithmetic overflow here // PERF: Don't do 64-bit arithmetic here unless we know we have // to - intValue = intValue * 10ULL + (JSLONG)(chr - 48); + newDigit = (JSLONG)(chr - 48); + + // TO DO: need to fix overflow catching + if (intValue> (overflowLimit-newDigit)/10) { + + // convert current inValue into string + int length = snprintf( NULL, 0, "%lu", intValue); + char* intValue_asStr = malloc( length + 1 ); + snprintf(intValue_asStr, length + 1, "%lu", intValue); + + if (strlen(ds->cStr)== 0) { // first overflow + ds->cStr = (char*)realloc(ds->cStr, strlen(intValue_asStr)+1); + strcpy(ds->cStr, intValue_asStr); + } else { // has overflown before + char* cStr_prev = malloc(strlen(ds->cStr)); + memcpy(cStr_prev, ds->cStr, strlen(ds->cStr)); + + size_t new_size = strlen(ds->cStr) + strlen(intValue_asStr); + ds->cStr = (char*)realloc(ds->cStr, new_size); + + strcpy(ds->cStr, cStr_prev); + strcat(ds->cStr, intValue_asStr); + } - if (intValue > overflowLimit) { - return SetError(ds, -1, overflowLimit == LLONG_MAX - ? "Value is too big" - : "Value is too small"); + // then reset intValue + intValue = (newDigit==0) ? 10 : newDigit; + } + else { + intValue = intValue * 10ULL + newDigit; } offset++; @@ -196,7 +223,25 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) { ds->lastType = JT_INT; ds->start = offset; - if ((intValue >> 31)) { + // check if ds->cStr has been written to + if (strlen(ds->cStr)>0){ + + // covert intValue to cString + int length = snprintf( NULL, 0, "%lu", intValue); + char* intValue_asStr = malloc( length + 1 ); + snprintf(intValue_asStr, length + 1, "%lu", intValue); + + char* cStr_prev = malloc(strlen(ds->cStr)); + memcpy(cStr_prev, ds->cStr, strlen(ds->cStr)); + + size_t new_size = strlen(ds->cStr) + strlen(intValue_asStr); + ds->cStr = (char*)realloc(ds->cStr, new_size); + strcpy(ds->cStr, cStr_prev); + strcat(ds->cStr, intValue_asStr); + + return ds->dec->newBigNum(ds->prv, ds->cStr); + } + else if ((intValue >> 31)) { return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg)); } else { return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg)); @@ -1170,6 +1215,9 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, ds.dec->errorStr = NULL; ds.dec->errorOffset = NULL; ds.objDepth = 0; + + ds.cStr = malloc(sizeof("\0")); + strcpy(ds.cStr, "\0"); ds.dec = dec; @@ -1198,5 +1246,7 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, return SetError(&ds, -1, "Trailing data"); } + free(ds.cStr); + return ret; } diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c index 3db10237b2688..cae20cb14455d 100644 --- a/pandas/_libs/src/ujson/python/JSONtoObj.c +++ b/pandas/_libs/src/ujson/python/JSONtoObj.c @@ -483,6 +483,10 @@ JSOBJ Object_newDouble(void *prv, double value) { return PyFloat_FromDouble(value); } +JSOBJ Object_newBigNum(void* prv, char* cStr) { + return PyLong_FromString(cStr, NULL, 0); +} + static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) { PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder; if (obj != decoder->npyarr_addr) { @@ -509,8 +513,8 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) { Object_newPosInf, Object_newNegInf, Object_newObject, Object_endObject, Object_newArray, Object_endArray, Object_newInteger, Object_newLong, Object_newDouble, - Object_releaseObject, PyObject_Malloc, PyObject_Free, - PyObject_Realloc}; + Object_newBigNum, Object_releaseObject, PyObject_Malloc, + PyObject_Free, PyObject_Realloc}; dec.preciseFloat = 0; dec.prv = NULL; diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 952c583040360..ca9478033b094 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -560,7 +560,15 @@ def test_encode_long_conversion(self): assert output == json.dumps(long_input) assert long_input == ujson.decode(output) - @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)]) + @pytest.mark.parametrize( + "bigNum", + [ + sys.maxsize + 1, + sys.maxsize * sys.maxsize + 100, + -(sys.maxsize + 2), + -(sys.maxsize * sys.maxsize + 100), + ], + ) def test_dumps_ints_larger_than_maxsize(self, bigNum): # GH34395 bigNum = sys.maxsize + 1 @@ -568,8 +576,7 @@ def test_dumps_ints_larger_than_maxsize(self, bigNum): assert str(bigNum) == encoding # GH20599 - with pytest.raises(ValueError): - assert ujson.loads(encoding) == bigNum + assert ujson.decode(encoding) == bigNum @pytest.mark.parametrize( "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"] @@ -1046,13 +1053,6 @@ def test_decode_array(self, arr): def test_decode_extreme_numbers(self, extreme_num): assert extreme_num == ujson.decode(str(extreme_num)) - @pytest.mark.parametrize( - "too_extreme_num", ["9223372036854775808", "-90223372036854775809"] - ) - def test_decode_too_extreme_numbers(self, too_extreme_num): - with pytest.raises(ValueError): - ujson.decode(too_extreme_num) - def test_decode_with_trailing_whitespaces(self): assert {} == ujson.decode("{}\n\t ") @@ -1061,8 +1061,10 @@ def test_decode_with_trailing_non_whitespaces(self): ujson.decode("{}\n\t a") def test_decode_array_with_big_int(self): - with pytest.raises(ValueError): - ujson.loads("[18446098363113800555]") + # GH20599 + result = ujson.loads("[18446098363113800555]") + expected = [18446098363113800555] + assert result == expected @pytest.mark.parametrize( "float_number",