pandas-dev · arw2019 · Jun 25, 2020 · Jun 25, 2020 · Jun 25, 2020 · Jun 25, 2020
diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h
@@ -304,6 +304,7 @@ typedef struct __JSONObjectDecoder {
   JSOBJ (*newInt)(void *prv, JSINT32 value);
   JSOBJ (*newLong)(void *prv, JSINT64 value);
   JSOBJ (*newDouble)(void *prv, double value);
+  JSOBJ (*newBigNum)(void *prv, char* cStr);
   void (*releaseObject)(void *prv, JSOBJ obj, void *decoder);
   JSPFN_MALLOC malloc;
   JSPFN_FREE free;

diff --git a/pandas/_libs/src/ujson/lib/ultrajsondec.c b/pandas/_libs/src/ujson/lib/ultrajsondec.c
@@ -43,6 +43,7 @@ Numeric decoder derived from from TCL library
 #include <limits.h>
 #include <locale.h>
 #include <math.h>
+#include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <wchar.h>
@@ -64,6 +65,7 @@ struct DecoderState {
     int escHeap;
     int lastType;
     JSUINT32 objDepth;
+    char *cStr; // storage for BigNum
     void *prv;
     JSONObjectDecoder *dec;
 };
@@ -118,13 +120,15 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
     int intNeg = 1;
     int mantSize = 0;
     JSUINT64 intValue;
+    JSLONG newDigit;
     int chr;
     int decimalCount = 0;
     double frcValue = 0.0;
     double expNeg;
     double expValue;
     char *offset = ds->start;
 
+
     JSUINT64 overflowLimit = LLONG_MAX;
 
     if (*(offset) == 'I') {
@@ -160,12 +164,35 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
                 // FIXME: Check for arithmetic overflow here
                 // PERF: Don't do 64-bit arithmetic here unless we know we have
                 // to
-                intValue = intValue * 10ULL + (JSLONG)(chr - 48);
+                newDigit = (JSLONG)(chr - 48);
+
+                // TO DO: need to fix overflow catching
+                if (intValue> (overflowLimit-newDigit)/10) {
+
+                    // convert current inValue into string
+                    int length = snprintf( NULL, 0, "%lu", intValue);
+                    char* intValue_asStr = malloc( length + 1 );
+                    snprintf(intValue_asStr, length + 1, "%lu", intValue);
+
+                    if (strlen(ds->cStr)== 0) { // first overflow
+                        ds->cStr = (char*)realloc(ds->cStr, strlen(intValue_asStr)+1);
+                        strcpy(ds->cStr, intValue_asStr);
+                    } else { // has overflown before 
+                        char* cStr_prev = malloc(strlen(ds->cStr));
+                        memcpy(cStr_prev, ds->cStr, strlen(ds->cStr));
+
+                        size_t new_size = strlen(ds->cStr) + strlen(intValue_asStr);
+                        ds->cStr = (char*)realloc(ds->cStr, new_size);
+
+                        strcpy(ds->cStr, cStr_prev);
+                        strcat(ds->cStr, intValue_asStr);
+                    }
 
-                if (intValue > overflowLimit) {
-                    return SetError(ds, -1, overflowLimit == LLONG_MAX
-                                                ? "Value is too big"
-                                                : "Value is too small");
+                    // then reset intValue
+                    intValue = (newDigit==0) ? 10 : newDigit;
+                }
+                else {
+                    intValue = intValue * 10ULL + newDigit;
                 }
 
                 offset++;
@@ -196,7 +223,25 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_numeric(struct DecoderState *ds) {
     ds->lastType = JT_INT;
     ds->start = offset;
 
-    if ((intValue >> 31)) {
+    // check if ds->cStr has been written to
+    if (strlen(ds->cStr)>0){
+
+        // covert intValue to cString
+        int length = snprintf( NULL, 0, "%lu", intValue);
+        char* intValue_asStr = malloc( length + 1 );
+        snprintf(intValue_asStr, length + 1, "%lu", intValue);
+
+        char* cStr_prev = malloc(strlen(ds->cStr));
+        memcpy(cStr_prev, ds->cStr, strlen(ds->cStr));
+
+        size_t new_size = strlen(ds->cStr) + strlen(intValue_asStr);
+        ds->cStr = (char*)realloc(ds->cStr, new_size);
+        strcpy(ds->cStr, cStr_prev);
+        strcat(ds->cStr, intValue_asStr);
+
+        return ds->dec->newBigNum(ds->prv, ds->cStr);
+    }
+    else if ((intValue >> 31)) {
         return ds->dec->newLong(ds->prv, (JSINT64)(intValue * (JSINT64)intNeg));
     } else {
         return ds->dec->newInt(ds->prv, (JSINT32)(intValue * intNeg));
@@ -1170,6 +1215,9 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
     ds.dec->errorStr = NULL;
     ds.dec->errorOffset = NULL;
     ds.objDepth = 0;
+
+    ds.cStr = malloc(sizeof("\0"));
+    strcpy(ds.cStr, "\0");
 
     ds.dec = dec;
 
@@ -1198,5 +1246,7 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer,
         return SetError(&ds, -1, "Trailing data");
     }
 
+    free(ds.cStr);
+
     return ret;
 }
diff --git a/pandas/_libs/src/ujson/python/JSONtoObj.c b/pandas/_libs/src/ujson/python/JSONtoObj.c
@@ -483,6 +483,10 @@ JSOBJ Object_newDouble(void *prv, double value) {
     return PyFloat_FromDouble(value);
 }
 
+JSOBJ Object_newBigNum(void* prv, char* cStr) {
+    return PyLong_FromString(cStr, NULL, 0);
+}
+
 static void Object_releaseObject(void *prv, JSOBJ obj, void *_decoder) {
     PyObjectDecoder *decoder = (PyObjectDecoder *)_decoder;
     if (obj != decoder->npyarr_addr) {
@@ -509,8 +513,8 @@ PyObject *JSONToObj(PyObject *self, PyObject *args, PyObject *kwargs) {
         Object_newPosInf, Object_newNegInf,     Object_newObject,
         Object_endObject,     Object_newArray,  Object_endArray,
         Object_newInteger,    Object_newLong,   Object_newDouble,
-        Object_releaseObject, PyObject_Malloc, PyObject_Free,
-        PyObject_Realloc};
+        Object_newBigNum, Object_releaseObject, PyObject_Malloc, 
+        PyObject_Free, PyObject_Realloc};
 
     dec.preciseFloat = 0;
     dec.prv = NULL;

diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
@@ -560,16 +560,23 @@ def test_encode_long_conversion(self):
         assert output == json.dumps(long_input)
         assert long_input == ujson.decode(output)
 
-    @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
+    @pytest.mark.parametrize(
+        "bigNum",
+        [
+            sys.maxsize + 1,
+            sys.maxsize * sys.maxsize + 100,
+            -(sys.maxsize + 2),
+            -(sys.maxsize * sys.maxsize + 100),
+        ],
+    )
     def test_dumps_ints_larger_than_maxsize(self, bigNum):
         # GH34395
         bigNum = sys.maxsize + 1
         encoding = ujson.encode(bigNum)
         assert str(bigNum) == encoding
 
         # GH20599
-        with pytest.raises(ValueError):
-            assert ujson.loads(encoding) == bigNum
+        assert ujson.decode(encoding) == bigNum
 
     @pytest.mark.parametrize(
         "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"]
@@ -1046,13 +1053,6 @@ def test_decode_array(self, arr):
     def test_decode_extreme_numbers(self, extreme_num):
         assert extreme_num == ujson.decode(str(extreme_num))
 
-    @pytest.mark.parametrize(
-        "too_extreme_num", ["9223372036854775808", "-90223372036854775809"]
-    )
-    def test_decode_too_extreme_numbers(self, too_extreme_num):
-        with pytest.raises(ValueError):
-            ujson.decode(too_extreme_num)
-
     def test_decode_with_trailing_whitespaces(self):
         assert {} == ujson.decode("{}\n\t ")
 
@@ -1061,8 +1061,10 @@ def test_decode_with_trailing_non_whitespaces(self):
             ujson.decode("{}\n\t a")
 
     def test_decode_array_with_big_int(self):
-        with pytest.raises(ValueError):
-            ujson.loads("[18446098363113800555]")
+        # GH20599
+        result = ujson.loads("[18446098363113800555]")
+        expected = [18446098363113800555]
+        assert result == expected
 
     @pytest.mark.parametrize(
         "float_number",