Skip to content

Commit 8316a1e

Browse files
committed
BUG: raise OverflowError on integer values exceeding int64 precision in parsers. close #2247
1 parent 6b5be05 commit 8316a1e

File tree

3 files changed

+38
-3
lines changed

3 files changed

+38
-3
lines changed

RELEASE.rst

+1
Original file line number | Diff line number | Diff line change
@@ -79,6 +79,7 @@ pandas 0.10.0
7979
- Enable joins between MultiIndex and regular Index (#2024)
8080
- Fix time zone metadata issue when unioning non-overlapping DatetimeIndex
8181
objects (#2367)
82+
- Raise/handle int64 overflows in parsers (#2247)
8283

8384
pandas 0.9.1
8485
============

pandas/io/tests/test_parsers.py

+17
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,8 @@
3434
parse_date_time, parse_date_fields, parse_all_fields
3535
)
3636

37+
from pandas._parser import OverflowError
38+
3739
def _skip_if_no_xlrd():
3840
try:
3941
import xlrd
@@ -1898,6 +1900,21 @@ def test_disable_bool_parsing(self):
18981900
result = read_csv(StringIO(data), dtype=object, na_filter=False)
18991901
self.assertEquals(result['B'][2], '')
19001902

1903+
def test_int64_overflow(self):
1904+
data = """ID
1905+
00013007854817840016671868
1906+
00013007854817840016749251
1907+
00013007854817840016754630
1908+
00013007854817840016781876
1909+
00013007854817840017028824
1910+
00013007854817840017963235
1911+
00013007854817840018860166"""
1912+
1913+
result = read_csv(StringIO(data))
1914+
self.assertTrue(result['ID'].dtype == object)
1915+
1916+
self.assertRaises(OverflowError, read_csv, StringIO(data),
1917+
dtype='i8')
19011918

19021919
class TestParseSQL(unittest.TestCase):
19031920

pandas/src/parser.pyx

+20 -3
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,8 @@ cdef extern from "parser/tokenizer.h":
7676
EAT_COMMENT
7777
FINISHED
7878

79+
enum: ERROR_OVERFLOW
80+
7981
ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
8082
int *status)
8183
ctypedef int (*io_cleanup)(void *src)
@@ -840,9 +842,13 @@ cdef class TextReader:
840842
else:
841843
col_res = None
842844
for dt in dtype_cast_order:
843-
col_res, na_count = self._convert_with_dtype(dt, i, start,
844-
end, na_filter,
845-
na_hashset)
845+
try:
846+
col_res, na_count = self._convert_with_dtype(
847+
dt, i, start, end, na_filter, na_hashset)
848+
except OverflowError:
849+
col_res, na_count = self._convert_with_dtype(
850+
'|O8', i, start, end, na_filter, na_hashset)
851+
846852
if col_res is not None:
847853
break
848854

@@ -966,6 +972,11 @@ cdef class TextReader:
966972
class CParserError(Exception):
967973
pass
968974

975+
976+
class OverflowError(ValueError):
977+
pass
978+
979+
969980
def _ensure_encoded(list lst):
970981
cdef list result = []
971982
for x in lst:
@@ -1251,6 +1262,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
12511262

12521263
return result, na_count
12531264

1265+
12541266
cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
12551267
bint na_filter, kh_str_t *na_hashset):
12561268
cdef:
@@ -1283,13 +1295,18 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
12831295
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
12841296
&error, parser.thousands)
12851297
if error != 0:
1298+
if error == ERROR_OVERFLOW:
1299+
raise OverflowError(word)
1300+
12861301
return None, None
12871302
else:
12881303
for i in range(lines):
12891304
word = COLITER_NEXT(it)
12901305
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
12911306
&error, parser.thousands)
12921307
if error != 0:
1308+
if error == ERROR_OVERFLOW:
1309+
raise OverflowError(word)
12931310
return None, None
12941311

12951312
return result, na_count

0 commit comments

Comments
 (0)