fix to_json for numbers larger than sys.maxsize (#34473)

arw2019 · WillAyd · web-flow · commit d85b93df71de · 2020-06-24T16:44:40.000-07:00
* BUG: overflow on to_json with numbers larger than sys.maxsize * TST: overflow on to_json with numbers larger than sys.maxsize (#34395) * DOC: update with issue #34395 * TST: removed unused import * ENH: added case JT_BIGNUM to encode * ENH: added JT_BIGNUM to JSTYPES * BUG: changed error for ints>sys.maxsize into JT_BIGNUM * ENH: removed debug statements * BUG: removed dumps wrapper * removed bigNum from TypeContext * TST: fixed bug in the test * added pointer to string rep converter for BigNum * TST: removed ujson.loads from the test * added getBigNumStringValue * added code to JT_BIGNUM handler by analogy with JT_UTF8 * TST: update pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd <william.ayd@icloud.com> * added Object_getBigNumStringValue to pyEncoder * added skeletal code for Object_GetBigNumStringValue * completed Object_getBigNumStringValue using PyObject_Repr * BUG: changed Object_getBigNumStringValue * improved Object_getBigNumStringValue some more * update getBigNumStringValue argument * corrected Object_getBigNumStringValue * more fixes to Object_getBigNumStringValue * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <william.ayd@icloud.com> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <william.ayd@icloud.com> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <william.ayd@icloud.com> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <william.ayd@icloud.com> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <william.ayd@icloud.com> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <william.ayd@icloud.com> * Update pandas/_libs/src/ujson/python/objToJSON.c * Update pandas/_libs/src/ujson/python/objToJSON.c * updated pyEncoder for JT_BIGNUM * updated pyEncoder * moved getBigNumStringValue to pyEncoder * fixed declaration of Object_getBigNumStringValue * fixed Object_getBigNumStringValue * catch overflow error with PyLong_AsLongLongAndOverflow * remove unnecessary error check * added shortcircuit for error check * simplify int overflow error catching Co-authored-by: William Ayd <william.ayd@icloud.com> * Update long int test in pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd <william.ayd@icloud.com> * removed tests expecting numeric overflow * remove underscore from overflow Co-authored-by: William Ayd <william.ayd@icloud.com> * removed underscores from _overflow everywhere * fixed small typo * fix type of exc * deleted numeric overflow tests * remove extraneous condition in if statement Co-authored-by: William Ayd <william.ayd@icloud.com> * remove extraneous condition in if statement Co-authored-by: William Ayd <william.ayd@icloud.com> * change _Bool into int Co-authored-by: William Ayd <william.ayd@icloud.com> * Update pandas/_libs/src/ujson/python/objToJSON.c Co-authored-by: William Ayd <william.ayd@icloud.com> * Update pandas/_libs/src/ujson/lib/ultrajsonenc.c Co-authored-by: William Ayd <william.ayd@icloud.com> * allocate an extra byte in Object_getBigNumStringValue Co-authored-by: William Ayd <william.ayd@icloud.com> * allocate an extra byte in Object_getBigNumStringValue Co-authored-by: William Ayd <william.ayd@icloud.com> * reinstate RESERVE_STRING(szlen) in JT_BIGNUM case * replaced (private) with (public) in whatnew * release bytes in Object_endTypeContext * in JT_BIGNUM change if+if into if+else if * added reallocation of bigNum_bytes * removed bigNum_bytes * added to_json test for ints>sys.maxsize * Use python malloc to match PyObject_Free in endTypeContext Co-authored-by: William Ayd <william.ayd@icloud.com> * TST: added manually constructed strs to compare encodings * fixed styling to minimize diff with master * fixed styling * fixed conflicts with master * fix styling to minimize diff * fix styling to minimize diff * fixed styling * added negative nigNum to test_to_json_large_numers * added negative nigNum to test_to_json_large_numers * Update pandas/tests/io/json/test_ujson.py Co-authored-by: William Ayd <william.ayd@icloud.com> * fixe test_to_json_for_large_nums for -ve * TST: added xfail for ujson.encode with long int input * TST: fixed variable names in test_to_json_large_numbers * TST: added xfail test for json.decode Series with long int * TST: added xfail test for json.decode DataFrame with long int * BENCH: added benchmarks for long ints Co-authored-by: William Ayd <william.ayd@icloud.com>
diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
@@ -1,3 +1,5 @@
+import sys
+
 import numpy as np
 
 from pandas import DataFrame, concat, date_range, read_json, timedelta_range
@@ -82,6 +84,7 @@ def setup(self, orient, frame):
         timedeltas = timedelta_range(start=1, periods=N, freq="s")
         datetimes = date_range(start=1, periods=N, freq="s")
         ints = np.random.randint(100000000, size=N)
+        longints = sys.maxsize * np.random.randint(100000000, size=N)
         floats = np.random.randn(N)
         strings = tm.makeStringIndex(N)
         self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
@@ -120,6 +123,18 @@ def setup(self, orient, frame):
             index=index,
         )
 
+        self.df_longint_float_str = DataFrame(
+            {
+                "longint_1": longints,
+                "longint_2": longints,
+                "float_1": floats,
+                "float_2": floats,
+                "str_1": strings,
+                "str_2": strings,
+            },
+            index=index,
+        )
+
     def time_to_json(self, orient, frame):
         getattr(self, frame).to_json(self.fname, orient=orient)
 
@@ -172,6 +187,7 @@ def setup(self):
         timedeltas = timedelta_range(start=1, periods=N, freq="s")
         datetimes = date_range(start=1, periods=N, freq="s")
         ints = np.random.randint(100000000, size=N)
+        longints = sys.maxsize * np.random.randint(100000000, size=N)
         floats = np.random.randn(N)
         strings = tm.makeStringIndex(N)
         self.df = DataFrame(np.random.randn(N, ncols), index=np.arange(N))
@@ -209,6 +225,17 @@ def setup(self):
             },
             index=index,
         )
+        self.df_longint_float_str = DataFrame(
+            {
+                "longint_1": longints,
+                "longint_2": longints,
+                "float_1": floats,
+                "float_2": floats,
+                "str_1": strings,
+                "str_2": strings,
+            },
+            index=index,
+        )
 
     def time_floats_with_int_idex_lines(self):
         self.df.to_json(self.fname, orient="records", lines=True)
@@ -225,6 +252,9 @@ def time_float_int_lines(self):
     def time_float_int_str_lines(self):
         self.df_int_float_str.to_json(self.fname, orient="records", lines=True)
 
+    def time_float_longint_str_lines(self):
+        self.df_longint_float_str.to_json(self.fname, orient="records", lines=True)
+
 
 class ToJSONMem:
     def setup_cache(self):
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
@@ -1026,6 +1026,7 @@ I/O
 - Bug in :meth:`~pandas.io.stata.StataReader` which resulted in categorical variables with difference dtypes when reading data using an iterator. (:issue:`31544`)
 - :meth:`HDFStore.keys` has now an optional `include` parameter that allows the retrieval of all native HDF5 table names (:issue:`29916`)
 - Bug in :meth:`read_excel` for ODS files removes 0.0 values (:issue:`27222`)
+- Bug in :meth:`ujson.encode` was raising an `OverflowError` with numbers larger than sys.maxsize (:issue: `34395`)
 
 Plotting
 ^^^^^^^^
diff --git a/pandas/_libs/src/ujson/lib/ultrajson.h b/pandas/_libs/src/ujson/lib/ultrajson.h
@@ -150,6 +150,7 @@ enum JSTYPES {
   JT_INT,      // (JSINT32 (signed 32-bit))
   JT_LONG,     // (JSINT64 (signed 64-bit))
   JT_DOUBLE,   // (double)
+  JT_BIGNUM,   // integer larger than sys.maxsize
   JT_UTF8,     // (char 8-bit)
   JT_ARRAY,    // Array structure
   JT_OBJECT,   // Key/Value structure
@@ -187,6 +188,8 @@ typedef struct __JSONObjectEncoder {
   JSINT64 (*getLongValue)(JSOBJ obj, JSONTypeContext *tc);
   JSINT32 (*getIntValue)(JSOBJ obj, JSONTypeContext *tc);
   double (*getDoubleValue)(JSOBJ obj, JSONTypeContext *tc);
+  const char *(*getBigNumStringValue)(JSOBJ obj, JSONTypeContext *tc,
+                                size_t *_outLen);
 
   /*
   Begin iteration of an iteratable object (JS_ARRAY or JS_OBJECT)
diff --git a/pandas/_libs/src/ujson/lib/ultrajsonenc.c b/pandas/_libs/src/ujson/lib/ultrajsonenc.c
@@ -1107,6 +1107,35 @@ void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name,
             Buffer_AppendCharUnchecked(enc, '\"');
             break;
         }
+
+        case JT_BIGNUM: {
+            value = enc->getBigNumStringValue(obj, &tc, &szlen);
+
+            Buffer_Reserve(enc, RESERVE_STRING(szlen));
+            if (enc->errorMsg) {
+                enc->endTypeContext(obj, &tc);
+                return;
+            }
+
+            if (enc->forceASCII) {
+                if (!Buffer_EscapeStringValidated(obj, enc, value,
+                                                  value + szlen)) {
+                    enc->endTypeContext(obj, &tc);
+                    enc->level--;
+                    return;
+                }
+            } else {
+                if (!Buffer_EscapeStringUnvalidated(enc, value,
+                                                    value + szlen)) {
+                    enc->endTypeContext(obj, &tc);
+                    enc->level--;
+                    return;
+                }
+            }
+
+            break;
+            
+        }
     }
 
     enc->endTypeContext(obj, &tc);
diff --git a/pandas/_libs/src/ujson/python/objToJSON.c b/pandas/_libs/src/ujson/python/objToJSON.c
@@ -1629,15 +1629,20 @@ void Object_beginTypeContext(JSOBJ _obj, JSONTypeContext *tc) {
     if (PyLong_Check(obj)) {
         PRINTMARK();
         tc->type = JT_LONG;
-        GET_TC(tc)->longValue = PyLong_AsLongLong(obj);
+        int overflow = 0;
+        GET_TC(tc)->longValue = PyLong_AsLongLongAndOverflow(obj, &overflow);
+        int err;
+        err = (GET_TC(tc)->longValue == -1) && PyErr_Occurred();
 
-        exc = PyErr_Occurred();
-
-        if (exc && PyErr_ExceptionMatches(PyExc_OverflowError)) {
+        if (overflow){
+            PRINTMARK();
+            tc->type = JT_BIGNUM;
+        }
+        else if (err) {
             PRINTMARK();
             goto INVALID;
         }
-
+        
         return;
     } else if (PyFloat_Check(obj)) {
         PRINTMARK();
@@ -2105,7 +2110,6 @@ void Object_endTypeContext(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
         NpyArr_freeLabels(GET_TC(tc)->columnLabels,
                           GET_TC(tc)->columnLabelsLen);
         GET_TC(tc)->columnLabels = NULL;
-
         PyObject_Free(GET_TC(tc)->cStr);
         GET_TC(tc)->cStr = NULL;
         PyObject_Free(tc->prv);
@@ -2126,6 +2130,19 @@ double Object_getDoubleValue(JSOBJ Py_UNUSED(obj), JSONTypeContext *tc) {
     return GET_TC(tc)->doubleValue;
 }
 
+const char *Object_getBigNumStringValue(JSOBJ obj, JSONTypeContext *tc, 
+                                    size_t *_outLen) {
+    PyObject* repr = PyObject_Str(obj);
+    const char *str = PyUnicode_AsUTF8AndSize(repr, (Py_ssize_t *) _outLen);
+    char* bytes = PyObject_Malloc(*_outLen + 1);
+    memcpy(bytes, str, *_outLen + 1);
+    GET_TC(tc)->cStr = bytes;
+
+    Py_DECREF(repr);
+    
+    return GET_TC(tc)->cStr;
+}
+
 static void Object_releaseObject(JSOBJ _obj) { Py_DECREF((PyObject *)_obj); }
 
 void Object_iterBegin(JSOBJ obj, JSONTypeContext *tc) {
@@ -2181,6 +2198,7 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
         Object_getLongValue,
         NULL, // getIntValue is unused
         Object_getDoubleValue,
+        Object_getBigNumStringValue,
         Object_iterBegin,
         Object_iterNext,
         Object_iterEnd,
@@ -2294,7 +2312,6 @@ PyObject *objToJSON(PyObject *Py_UNUSED(self), PyObject *args,
         if (ret != buffer) {
             encoder->free(ret);
         }
-
         PyErr_Format(PyExc_OverflowError, "%s", encoder->errorMsg);
         return NULL;
     }
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
@@ -4,6 +4,7 @@
 from io import StringIO
 import json
 import os
+import sys
 
 import numpy as np
 import pytest
@@ -1242,6 +1243,29 @@ def test_read_jsonl_unicode_chars(self):
         expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
+    def test_to_json_large_numbers(self, bigNum):
+        # GH34473
+        series = Series(bigNum, dtype=object, index=["articleId"])
+        json = series.to_json()
+        expected = '{"articleId":' + str(bigNum) + "}"
+        assert json == expected
+        # GH 20599
+        with pytest.raises(ValueError):
+            json = StringIO(json)
+            result = read_json(json)
+            tm.assert_series_equal(series, result)
+
+        df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0])
+        json = df.to_json()
+        expected = '{"0":{"articleId":' + str(bigNum) + "}}"
+        assert json == expected
+        # GH 20599
+        with pytest.raises(ValueError):
+            json = StringIO(json)
+            result = read_json(json)
+            tm.assert_frame_equal(df, result)
+
     def test_read_json_large_numbers(self):
         # GH18842
         json = '{"articleId": "1404366058080022500245"}'
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
@@ -5,6 +5,7 @@
 import locale
 import math
 import re
+import sys
 import time
 
 import dateutil
@@ -559,6 +560,17 @@ def test_encode_long_conversion(self):
         assert output == json.dumps(long_input)
         assert long_input == ujson.decode(output)
 
+    @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
+    def test_dumps_ints_larger_than_maxsize(self, bigNum):
+        # GH34395
+        bigNum = sys.maxsize + 1
+        encoding = ujson.encode(bigNum)
+        assert str(bigNum) == encoding
+
+        # GH20599
+        with pytest.raises(ValueError):
+            assert ujson.loads(encoding) == bigNum
+
     @pytest.mark.parametrize(
         "int_exp", ["1337E40", "1.337E40", "1337E+9", "1.337e+40", "1.337E-4"]
     )
@@ -570,18 +582,6 @@ def test_loads_non_str_bytes_raises(self):
         with pytest.raises(TypeError, match=msg):
             ujson.loads(None)
 
-    def test_encode_numeric_overflow(self):
-        with pytest.raises(OverflowError):
-            ujson.encode(12839128391289382193812939)
-
-    def test_encode_numeric_overflow_nested(self):
-        class Nested:
-            x = 12839128391289382193812939
-
-        for _ in range(0, 100):
-            with pytest.raises(OverflowError):
-                ujson.encode(Nested())
-
     @pytest.mark.parametrize("val", [3590016419, 2 ** 31, 2 ** 32, (2 ** 32) - 1])
     def test_decode_number_with_32bit_sign_bit(self, val):
         # Test that numbers that fit within 32 bits but would have the