Skip to content

BUG: json.decode fails for nums larger than sys.maxsize #34984

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/_libs/src/ujson/lib/ultrajson.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ typedef struct __JSONObjectDecoder {
JSOBJ (*newInt)(void *prv, JSINT32 value);
JSOBJ (*newLong)(void *prv, JSINT64 value);
JSOBJ (*newDouble)(void *prv, double value);
JSOBJ (*newBigNum)(void *prv, char* cStr);
void (*releaseObject)(void *prv, JSOBJ obj, void *decoder);
JSPFN_MALLOC malloc;
JSPFN_FREE free;
Expand Down
61 changes: 55 additions & 6 deletions pandas/_libs/src/ujson/lib/ultrajsondec.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Numeric decoder derived from from TCL library
#include <limits.h>
#include <locale.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
Expand All @@ -64,6 +65,7 @@ struct DecoderState {
int escHeap;
int lastType;
JSUINT32 objDepth;
char *cStr; // storage for BigNum
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to change the struct here; just look at the existing implementation of decode_string and mirror that

void *prv;
JSONObjectDecoder *dec;
};
Expand Down Expand Up @@ -118,6 +120,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
int intNeg = 1;
int mantSize = 0;
JSUINT64 intValue;
JSLONG newDigit;
int chr;
int decimalCount = 0;
double frcValue = 0.0;
Expand Down Expand Up @@ -160,12 +163,35 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
// FIXME: Check for arithmetic overflow here
// PERF: Don't do 64-bit arithmetic here unless we know we have
// to
intValue = intValue * 10ULL + (JSLONG)(chr - 48);
newDigit = (JSLONG)(chr - 48);

// TO DO: need to fix overflow catching
if (intValue*10ULL + newDigit > overflowLimit) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this works generally. If you want a reference for overflow detection try looking at the python implementation of PyLong_AsLongLongAndOverflow

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I feel like this might be ok because overflowLimit is adjusted earlier on depending on whether the number is positive or negative (in line 140 overflowLimit gets changed if intValue came with a minus sign):

JSUINT64 overflowLimit = LLONG_MAX;

But I'm down to rewrite this part to follow the PyLong_AsLongLongAndOverflow implementation:

https://github.com/python/cpython/blob/04cdeb7a5617c48102f45b965e683b12cdf934f8/Objects/longobject.c#L380

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is if intValue*10ULL + newDigit overflows than the comparison to overflowLimit is a moot point because you've already hit undefined behavior with the former


// convert current inValue into string
int length = snprintf( NULL, 0, "%lu", intValue);
char* intValue_asStr = malloc( length + 1 );
snprintf(intValue_asStr, length + 1, "%lu", intValue);

if (strlen(ds->cStr)== 0) { // first overflow
ds->cStr = (char*)realloc(ds->cStr, strlen(intValue_asStr)+1);
strcpy(ds->cStr, intValue_asStr);
} else { // has overflown before
char* cStr_prev = malloc(strlen(ds->cStr));
memcpy(cStr_prev, ds->cStr, strlen(ds->cStr));

size_t new_size = strlen(ds->cStr) + strlen(intValue_asStr);
ds->cStr = (char*)realloc(ds->cStr, new_size);

strcpy(ds->cStr, cStr_prev);
strcat(ds->cStr, intValue_asStr);
}

if (intValue > overflowLimit) {
return SetError(ds, -1, overflowLimit == LLONG_MAX
? "Value is too big"
: "Value is too small");
// then reset intValue
intValue = (newDigit==0) ? 10 : newDigit;
}
else {
intValue = intValue * 10ULL + newDigit;
}

offset++;
Expand Down Expand Up @@ -196,7 +222,25 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
ds->lastType = JT_INT;
ds->start = offset;

if ((intValue >> 31)) {
// check if ds->cStr has been written to
if (strlen(ds->cStr)>0){

// covert intValue to cString
int length = snprintf( NULL, 0, "%lu", intValue);
char* intValue_asStr = malloc( length + 1 );
snprintf(intValue_asStr, length + 1, "%lu", intValue);

char* cStr_prev = malloc(strlen(ds->cStr));
memcpy(cStr_prev, ds->cStr, strlen(ds->cStr));

size_t new_size = strlen(ds->cStr) + strlen(intValue_asStr);
ds->cStr = (char*)realloc(ds->cStr, new_size);
strcpy(ds->cStr, cStr_prev);
strcat(ds->cStr, intValue_asStr);

return ds->dec->newBigNum(ds->prv, ds->cStr);
}
else if ((intValue >> 31)) {
return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg));
} else {
return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg));
Expand Down Expand Up @@ -1170,6 +1214,9 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
ds.dec->errorStr = NULL;
ds.dec->errorOffset = NULL;
ds.objDepth = 0;

ds.cStr = malloc(sizeof("\0"));
strcpy(ds.cStr, "\0");

ds.dec = dec;

Expand Down Expand Up @@ -1198,5 +1245,7 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
return SetError(&ds, -1, "Trailing data");
}

free(ds.cStr);

return ret;
}
8 changes: 6 additions & 2 deletions pandas/_libs/src/ujson/python/JSONtoObj.c
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,10 @@ JSOBJ Object_newDouble(void *prv, double value) {
return PyFloat_FromDouble(value);
}

JSOBJ Object_newBigNum(void* prv, char* cStr) {
return PyLong_FromString(cStr, NULL, 0);
}

static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) {
PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
if (obj != decoder->npyarr_addr) {
Expand All @@ -509,8 +513,8 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
Object_newPosInf, Object_newNegInf, Object_newObject,
Object_endObject, Object_newArray, Object_endArray,
Object_newInteger, Object_newLong, Object_newDouble,
Object_releaseObject, PyObject_Malloc, PyObject_Free,
PyObject_Realloc};
Object_newBigNum, Object_releaseObject, PyObject_Malloc,
PyObject_Free, PyObject_Realloc};

dec.preciseFloat = 0;
dec.prv = NULL;
Expand Down
26 changes: 14 additions & 12 deletions pandas/tests/io/json/test_ujson.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,16 +560,23 @@ def test_encode_long_conversion(self):
assert output == json.dumps(long_input)
assert long_input == ujson.decode(output)

@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
@pytest.mark.parametrize(
"bigNum",
[
sys.maxsize + 1,
sys.maxsize * sys.maxsize + 100,
-(sys.maxsize + 2),
-(sys.maxsize * sys.maxsize + 100),
],
)
def test_dumps_ints_larger_than_maxsize(self, bigNum):
# GH34395
bigNum = sys.maxsize + 1
encoding = ujson.encode(bigNum)
assert str(bigNum) == encoding

# GH20599
with pytest.raises(ValueError):
assert ujson.loads(encoding) == bigNum
assert ujson.decode(encoding) == bigNum

@pytest.mark.parametrize(
"int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"]
Expand Down Expand Up @@ -1046,13 +1053,6 @@ def test_decode_array(self, arr):
def test_decode_extreme_numbers(self, extreme_num):
assert extreme_num == ujson.decode(str(extreme_num))

@pytest.mark.parametrize(
"too_extreme_num", ["9223372036854775808", "-90223372036854775809"]
)
def test_decode_too_extreme_numbers(self, too_extreme_num):
with pytest.raises(ValueError):
ujson.decode(too_extreme_num)

def test_decode_with_trailing_whitespaces(self):
assert {} == ujson.decode("{}\n\t ")

Expand All @@ -1061,8 +1061,10 @@ def test_decode_with_trailing_non_whitespaces(self):
ujson.decode("{}\n\t a")

def test_decode_array_with_big_int(self):
with pytest.raises(ValueError):
ujson.loads("[18446098363113800555]")
# GH20599
result = ujson.loads("[18446098363113800555]")
expected = [18446098363113800555]
assert result == expected

@pytest.mark.parametrize(
"float_number",
Expand Down