OverflowError for string of large numbers (int64) while reading json #18842

MaxBer · 2017-12-19T14:17:28Z

import json
import pandas as pd
json_data = {"articleId":"1404366058080022500245"}
pd.read_json(json.dumps(json_data), typ='series')

Python int too large to convert to C long: OverflowError
...
File "/var/task/pandas/io/json/json.py", line 366, in read_json
return json_reader.read()
File "/var/task/pandas/io/json/json.py", line 467, in read
obj = self._get_object_parser(self.data)
File "/var/task/pandas/io/json/json.py", line 489, in _get_object_parser
obj = SeriesParser(json, **kwargs).parse()
File "/var/task/pandas/io/json/json.py", line 582, in parse
self._try_convert_types()
File "/var/task/pandas/io/json/json.py", line 752, in _try_convert_types
'data', self.obj, convert_dates=self.convert_dates)
File "/var/task/pandas/io/json/json.py", line 621, in _try_convert_data
new_data, result = self._try_convert_to_date(data)
File "/var/task/pandas/io/json/json.py", line 684, in _try_convert_to_date
new_data = data.astype('int64')
File "/var/task/pandas/util/_decorators.py", line 118, in wrapper
return func(*args, **kwargs)
File "/var/task/pandas/core/generic.py", line 4004, in astype
**kwargs)
File "/var/task/pandas/core/internals.py", line 3462, in astype
return self.apply('astype', dtype=dtype, **kwargs)
File "/var/task/pandas/core/internals.py", line 3329, in apply
applied = getattr(b, f)(**kwargs)
File "/var/task/pandas/core/internals.py", line 544, in astype
**kwargs)
File "/var/task/pandas/core/internals.py", line 625, in _astype
values = astype_nansafe(values.ravel(), dtype, copy=True)
File "/var/task/pandas/core/dtypes/cast.py", line 692, in astype_nansafe
return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
File "pandas/_libs/lib.pyx", line 854, in pandas._libs.lib.astype_intsafe
File "pandas/_libs/src/util.pxd", line 91, in util.set_value_at_unsafe
OverflowError: Python int too large to convert to C long

Reading JSON that contains stringified numbers larger than the int64 range raises an OverflowError in pandas 0.21.1. It appears to work fine in 0.21.0.

Max

WillAyd · 2017-12-19T16:48:28Z

It looks like previous versions would automatically convert that large number to a float, but you're getting an overflow error in 0.21.1 because it is attempting to fit that number into an int.

FWIW you can work around this by explicitly specifying the dtype:

pd.read_json(json.dumps(json_data), typ='series', dtype={"articleId": np.float64})

WillAyd · 2017-12-22T01:21:11Z

I believe the difference here was introduced in 4fd104a at the line below. Previously this code used a bare except that would catch everything, but now that the except clause is more specific, the OverflowError is no longer caught and instead propagates further up the stack.

I can take a go at this in a PR

pandas/pandas/io/json/json.py

Line 684 in 4fd104a

new_data = data.astype('int64')

tocunha · 2018-09-10T20:03:10Z

I am experiencing the same error when reading a huge compressed json file.

~/anaconda3/lib/python3.6/site-packages/pandas/io/json/json.py in next(self)
568 if lines:
569 lines_json = self._combine_lines(lines)
--> 570 obj = self._get_object_parser(lines_json)
571
572 # Make sure that the returned objects have the right index.

~/anaconda3/lib/python3.6/site-packages/pandas/io/json/json.py in _get_object_parser(self, json)
544 obj = None
545 if typ == 'frame':
--> 546 obj = FrameParser(json, **kwargs).parse()
547
548 if typ == 'series' or obj is None:

~/anaconda3/lib/python3.6/site-packages/pandas/io/json/json.py in parse(self)
642 if self.convert_axes:
643 self._convert_axes()
--> 644 self._try_convert_types()
645 return self.obj
646

~/anaconda3/lib/python3.6/site-packages/pandas/io/json/json.py in _try_convert_types(self)
897
898 self._process_converter(
--> 899 lambda col, c: self._try_convert_data(col, c, convert_dates=False))
900
901 def _try_convert_dates(self):

~/anaconda3/lib/python3.6/site-packages/pandas/io/json/json.py in _process_converter(self, f, filt)
877 for i, (col, c) in enumerate(self.obj.iteritems()):
878 if filt(col, c):
--> 879 new_data, result = f(col, c)
880 if result:
881 c = new_data

~/anaconda3/lib/python3.6/site-packages/pandas/io/json/json.py in <lambda>(col, c)
897
898 self._process_converter(
--> 899 lambda col, c: self._try_convert_data(col, c, convert_dates=False))
900
901 def _try_convert_dates(self):

~/anaconda3/lib/python3.6/site-packages/pandas/io/json/json.py in _try_convert_data(self, name, data, use_dtypes, convert_dates)
712 # coerce ints if we can
713 try:
--> 714 new_data = data.astype('int64')
715 if (new_data == data).all():
716 data = new_data

~/anaconda3/lib/python3.6/site-packages/pandas/util/_decorators.py in wrapper(*args, **kwargs)
176 else:
177 kwargs[new_arg_name] = new_arg_value
--> 178 return func(*args, **kwargs)
179 return wrapper
180 return _deprecate_kwarg

~/anaconda3/lib/python3.6/site-packages/pandas/core/generic.py in astype(self, dtype, copy, errors, **kwargs)
4999 # else, only a single dtype is given
5000 new_data = self._data.astype(dtype=dtype, copy=copy, errors=errors,
-> 5001 **kwargs)
5002 return self._constructor(new_data).finalize(self)
5003

~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, **kwargs)
3712
3713 def astype(self, dtype, **kwargs):
-> 3714 return self.apply('astype', dtype=dtype, **kwargs)
3715
3716 def convert(self, **kwargs):

~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in apply(self, f, axes, filter, do_integrity_check, consolidate, **kwargs)
3579
3580 kwargs['mgr'] = self
-> 3581 applied = getattr(b, f)(**kwargs)
3582 result_blocks = _extend_blocks(applied, result_blocks)
3583

~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in astype(self, dtype, copy, errors, values, **kwargs)
573 def astype(self, dtype, copy=False, errors='raise', values=None, **kwargs):
574 return self._astype(dtype, copy=copy, errors=errors, values=values,
--> 575 **kwargs)
576
577 def _astype(self, dtype, copy=False, errors='raise', values=None,

~/anaconda3/lib/python3.6/site-packages/pandas/core/internals.py in _astype(self, dtype, copy, errors, values, klass, mgr, **kwargs)
662
663 # _astype_nansafe works fine with 1-d only
--> 664 values = astype_nansafe(values.ravel(), dtype, copy=True)
665 values = values.reshape(self.shape)
666

~/anaconda3/lib/python3.6/site-packages/pandas/core/dtypes/cast.py in astype_nansafe(arr, dtype, copy)
707 # work around NumPy brokenness, #1987
708 if np.issubdtype(dtype.type, np.integer):
--> 709 return lib.astype_intsafe(arr.ravel(), dtype).reshape(arr.shape)
710
711 # if we have a datetime/timedelta array of objects

pandas/_libs/lib.pyx in pandas._libs.lib.astype_intsafe()

pandas/_libs/src/util.pxd in util.set_value_at_unsafe()

OverflowError: Python int too large to convert to C long

tocunha · 2018-09-10T20:04:55Z

I tried the workaround, but without success.

        for chunk in pd.read_json(dir_+filename, chunksize=10000, compression = 'bz2', lines=True, dtype={"created_utc": np.int64, 'retrieved_on':np.int64, 'created':np.int64}):

tocunha · 2018-09-14T18:54:07Z

I avoided this error by passing dtype=False.

Mojahid-Ahmad · 2020-02-16T06:57:01Z

I avoided this error passing dtype=False

Thanks, man, you saved me a lot of time.

WillAyd mentioned this issue Dec 22, 2017

Fixed read_json int overflow #18905

Merged

4 tasks

gfyoung added the labels "Dtype Conversions" (unexpected or buggy dtype conversions), "IO JSON" (read_json, to_json, json_normalize), and "Regression" (functionality that used to work in a prior pandas version) on Dec 22, 2017

jreback added this to the 0.23.0 milestone Dec 23, 2017

jreback closed this as completed in #18905 Dec 27, 2017

Udayraj123 mentioned this issue Apr 4, 2018

read_json reads large integers as strings incorrectly if dtype not explicitly mentioned #20608

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

OverflowError for string of large numbers (int64) while reading json #18842

OverflowError for string of large numbers (int64) while reading json #18842

MaxBer commented Dec 19, 2017

WillAyd commented Dec 19, 2017

WillAyd commented Dec 22, 2017 •

edited

Loading

tocunha commented Sep 10, 2018

tocunha commented Sep 10, 2018

tocunha commented Sep 14, 2018

Mojahid-Ahmad commented Feb 16, 2020

OverflowError for string of large numbers (int64) while reading json #18842

OverflowError for string of large numbers (int64) while reading json #18842

Comments

MaxBer commented Dec 19, 2017

WillAyd commented Dec 19, 2017

WillAyd commented Dec 22, 2017 • edited Loading

tocunha commented Sep 10, 2018

tocunha commented Sep 10, 2018

tocunha commented Sep 14, 2018

Mojahid-Ahmad commented Feb 16, 2020

WillAyd commented Dec 22, 2017 •

edited

Loading