BUG: raise OverflowError on integer values exceeding the int64 range in parsers. close #2247

wesm · wesm · commit 8316a1e52f36 · 2012-11-27T23:09:35.000-05:00
diff --git a/RELEASE.rst b/RELEASE.rst
@@ -79,6 +79,7 @@ pandas 0.10.0
   - Enable joins between MultiIndex and regular Index (#2024)
   - Fix time zone metadata issue when unioning non-overlapping DatetimeIndex
     objects (#2367)
+  - Raise on int64 overflow in parsers, or fall back to object dtype (#2247)
 
 pandas 0.9.1
 ============
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -34,6 +34,8 @@
     parse_date_time, parse_date_fields, parse_all_fields
 )
 
+from pandas._parser import OverflowError
+
 def _skip_if_no_xlrd():
     try:
         import xlrd
@@ -1898,6 +1900,21 @@ def test_disable_bool_parsing(self):
         result = read_csv(StringIO(data), dtype=object, na_filter=False)
         self.assertEquals(result['B'][2], '')
 
+    def test_int64_overflow(self):
+        data = """ID
+00013007854817840016671868
+00013007854817840016749251
+00013007854817840016754630
+00013007854817840016781876
+00013007854817840017028824
+00013007854817840017963235
+00013007854817840018860166"""
+
+        result = read_csv(StringIO(data))
+        self.assertTrue(result['ID'].dtype == object)
+
+        self.assertRaises(OverflowError, read_csv, StringIO(data),
+                          dtype='i8')
 
 class TestParseSQL(unittest.TestCase):
 
diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx
@@ -76,6 +76,8 @@ cdef extern from "parser/tokenizer.h":
         EAT_COMMENT
         FINISHED
 
+    enum: ERROR_OVERFLOW
+
     ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                                  int *status)
     ctypedef int (*io_cleanup)(void *src)
@@ -840,9 +842,13 @@ cdef class TextReader:
         else:
             col_res = None
             for dt in dtype_cast_order:
-                col_res, na_count = self._convert_with_dtype(dt, i, start,
-                                                             end, na_filter,
-                                                             na_hashset)
+                try:
+                    col_res, na_count = self._convert_with_dtype(
+                        dt, i, start, end, na_filter, na_hashset)
+                except OverflowError:
+                    col_res, na_count = self._convert_with_dtype(
+                        '|O8', i, start, end, na_filter, na_hashset)
+
                 if col_res is not None:
                     break
 
@@ -966,6 +972,11 @@ cdef class TextReader:
 class CParserError(Exception):
     pass
 
+
+class OverflowError(ValueError):
+    pass
+
+
 def _ensure_encoded(list lst):
     cdef list result = []
     for x in lst:
@@ -1251,6 +1262,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end,
 
     return result, na_count
 
+
 cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
                 bint na_filter, kh_str_t *na_hashset):
     cdef:
@@ -1283,13 +1295,18 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
             data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                    &error, parser.thousands)
             if error != 0:
+                if error == ERROR_OVERFLOW:
+                    raise OverflowError(word)
+
                 return None, None
     else:
         for i in range(lines):
             word = COLITER_NEXT(it)
             data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                    &error, parser.thousands)
             if error != 0:
+                if error == ERROR_OVERFLOW:
+                    raise OverflowError(word)
                 return None, None
 
     return result, na_count