Skip to content

BUG: json.decode fails for nums larger than sys.maxsize #34984

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pandas/_libs/src/ujson/lib/ultrajson.h
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ typedef struct __JSONObjectDecoder {
JSOBJ (*newInt)(void *prv, JSINT32 value);
JSOBJ (*newLong)(void *prv, JSINT64 value);
JSOBJ (*newDouble)(void *prv, double value);
JSOBJ (*newBigNum)(void *prv, char* cStr);
void (*releaseObject)(void *prv, JSOBJ obj, void *decoder);
JSPFN_MALLOC malloc;
JSPFN_FREE free;
Expand Down
62 changes: 56 additions & 6 deletions pandas/_libs/src/ujson/lib/ultrajsondec.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ Numeric decoder derived from from TCL library
#include <limits.h>
#include <locale.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>
Expand All @@ -64,6 +65,7 @@ struct DecoderState {
int escHeap;
int lastType;
JSUINT32 objDepth;
char *cStr; // storage for BigNum
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need to change the struct here; just look at the existing implementation of decode_string and mirror that

void *prv;
JSONObjectDecoder *dec;
};
Expand Down Expand Up @@ -118,13 +120,15 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
int intNeg = 1;
int mantSize = 0;
JSUINT64 intValue;
JSLONG newDigit;
int chr;
int decimalCount = 0;
double frcValue = 0.0;
double expNeg;
double expValue;
char *offset = ds->start;


JSUINT64 overflowLimit = LLONG_MAX;

if (*(offset) == 'I') {
Expand Down Expand Up @@ -160,12 +164,35 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
// FIXME: Check for arithmetic overflow here
// PERF: Don't do 64-bit arithmetic here unless we know we have
// to
intValue = intValue * 10ULL + (JSLONG)(chr - 48);
newDigit = (JSLONG)(chr - 48);

// TO DO: need to fix overflow catching
if (intValue> (overflowLimit-newDigit)/10) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@WillAyd Will this work? (intValue is always positive so it's always floor division on the RHS)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think so. Assuming a 64 bit platform where the largest integer is 2 ** 63 - 1 this would not branch for 2 ** 63 (which it should)


// convert current inValue into string
int length = snprintf( NULL, 0, "%lu", intValue);
char* intValue_asStr = malloc( length + 1 );
snprintf(intValue_asStr, length + 1, "%lu", intValue);

if (strlen(ds->cStr)== 0) { // first overflow
ds->cStr = (char*)realloc(ds->cStr, strlen(intValue_asStr)+1);
strcpy(ds->cStr, intValue_asStr);
} else { // has overflown before
char* cStr_prev = malloc(strlen(ds->cStr));
memcpy(cStr_prev, ds->cStr, strlen(ds->cStr));

size_t new_size = strlen(ds->cStr) + strlen(intValue_asStr);
ds->cStr = (char*)realloc(ds->cStr, new_size);

strcpy(ds->cStr, cStr_prev);
strcat(ds->cStr, intValue_asStr);
}

if (intValue > overflowLimit) {
return SetError(ds, -1, overflowLimit == LLONG_MAX
? "Value is too big"
: "Value is too small");
// then reset intValue
intValue = (newDigit==0) ? 10 : newDigit;
}
else {
intValue = intValue * 10ULL + newDigit;
}

offset++;
Expand Down Expand Up @@ -196,7 +223,25 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
ds->lastType = JT_INT;
ds->start = offset;

if ((intValue >> 31)) {
// check if ds->cStr has been written to
if (strlen(ds->cStr)>0){

// covert intValue to cString
int length = snprintf( NULL, 0, "%lu", intValue);
char* intValue_asStr = malloc( length + 1 );
snprintf(intValue_asStr, length + 1, "%lu", intValue);

char* cStr_prev = malloc(strlen(ds->cStr));
memcpy(cStr_prev, ds->cStr, strlen(ds->cStr));

size_t new_size = strlen(ds->cStr) + strlen(intValue_asStr);
ds->cStr = (char*)realloc(ds->cStr, new_size);
strcpy(ds->cStr, cStr_prev);
strcat(ds->cStr, intValue_asStr);

return ds->dec->newBigNum(ds->prv, ds->cStr);
}
else if ((intValue >> 31)) {
return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg));
} else {
return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg));
Expand Down Expand Up @@ -1170,6 +1215,9 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
ds.dec->errorStr = NULL;
ds.dec->errorOffset = NULL;
ds.objDepth = 0;

ds.cStr = malloc(sizeof("\0"));
strcpy(ds.cStr, "\0");

ds.dec = dec;

Expand Down Expand Up @@ -1198,5 +1246,7 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
return SetError(&ds, -1, "Trailing data");
}

free(ds.cStr);

return ret;
}
8 changes: 6 additions & 2 deletions pandas/_libs/src/ujson/python/JSONtoObj.c
Original file line number Diff line number Diff line change
Expand Up @@ -483,6 +483,10 @@ JSOBJ Object_newDouble(void *prv, double value) {
return PyFloat_FromDouble(value);
}

JSOBJ Object_newBigNum(void* prv, char* cStr) {
return PyLong_FromString(cStr, NULL, 0);
}

static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) {
PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
if (obj != decoder->npyarr_addr) {
Expand All @@ -509,8 +513,8 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
Object_newPosInf, Object_newNegInf, Object_newObject,
Object_endObject, Object_newArray, Object_endArray,
Object_newInteger, Object_newLong, Object_newDouble,
Object_releaseObject, PyObject_Malloc, PyObject_Free,
PyObject_Realloc};
Object_newBigNum, Object_releaseObject, PyObject_Malloc,
PyObject_Free, PyObject_Realloc};

dec.preciseFloat = 0;
dec.prv = NULL;
Expand Down
26 changes: 14 additions & 12 deletions pandas/tests/io/json/test_ujson.py
Original file line number Diff line number Diff line change
Expand Up @@ -560,16 +560,23 @@ def test_encode_long_conversion(self):
assert output == json.dumps(long_input)
assert long_input == ujson.decode(output)

@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
@pytest.mark.parametrize(
"bigNum",
[
sys.maxsize + 1,
sys.maxsize * sys.maxsize + 100,
-(sys.maxsize + 2),
-(sys.maxsize * sys.maxsize + 100),
],
)
def test_dumps_ints_larger_than_maxsize(self, bigNum):
# GH34395
bigNum = sys.maxsize + 1
encoding = ujson.encode(bigNum)
assert str(bigNum) == encoding

# GH20599
with pytest.raises(ValueError):
assert ujson.loads(encoding) == bigNum
assert ujson.decode(encoding) == bigNum

@pytest.mark.parametrize(
"int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"]
Expand Down Expand Up @@ -1046,13 +1053,6 @@ def test_decode_array(self, arr):
def test_decode_extreme_numbers(self, extreme_num):
assert extreme_num == ujson.decode(str(extreme_num))

@pytest.mark.parametrize(
"too_extreme_num", ["9223372036854775808", "-90223372036854775809"]
)
def test_decode_too_extreme_numbers(self, too_extreme_num):
with pytest.raises(ValueError):
ujson.decode(too_extreme_num)

def test_decode_with_trailing_whitespaces(self):
assert {} == ujson.decode("{}\n\t ")

Expand All @@ -1061,8 +1061,10 @@ def test_decode_with_trailing_non_whitespaces(self):
ujson.decode("{}\n\t a")

def test_decode_array_with_big_int(self):
with pytest.raises(ValueError):
ujson.loads("[18446098363113800555]")
# GH20599
result = ujson.loads("[18446098363113800555]")
expected = [18446098363113800555]
assert result == expected

@pytest.mark.parametrize(
"float_number",
Expand Down