BUG: Parse uint64 in read_csv

gfyoung · gfyoung · commit 2ef3225c1fd6 · 2016-12-30T21:02:01.000-08:00
Closes gh-14983 (pandas-dev/pandas).
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -286,6 +286,7 @@ Bug Fixes
 - Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
 - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
 - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
+- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being converted to the wrong data types (:issue:`14983`)
 - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`)
 - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
 - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -956,29 +956,39 @@ def test_int64_overflow(self):
             self.assertRaises(OverflowError, self.read_csv,
                               StringIO(data), converters={'ID': conv})
 
-        # These numbers fall right inside the int64 range,
+        # These numbers fall right inside the int64-uint64 range,
-        # so they should be parsed as string.
+        # so they should be parsed as integers.
+        ui_max = np.iinfo(np.uint64).max
         i_max = np.iinfo(np.int64).max
         i_min = np.iinfo(np.int64).min
 
-        for x in [i_max, i_min]:
+        for x in [i_max, i_min, ui_max]:
             result = self.read_csv(StringIO(str(x)), header=None)
             expected = DataFrame([x])
             tm.assert_frame_equal(result, expected)
 
-        # These numbers fall just outside the int64 range,
+        # These numbers fall just outside the int64-uint64 range,
         # so they should be parsed as string.
-        too_big = i_max + 1
+        too_big = ui_max + 1
         too_small = i_min - 1
 
         for x in [too_big, too_small]:
             result = self.read_csv(StringIO(str(x)), header=None)
-            if self.engine == 'python' and x == too_big:
-                expected = DataFrame([x])
-            else:
-                expected = DataFrame([str(x)])
+            expected = DataFrame([str(x)])
             tm.assert_frame_equal(result, expected)
 
+        # No numerical dtype can hold both negative and uint64 values,
+        # so they should be cast as string.
+        data = '-1\n' + str(2**63)
+        expected = DataFrame([str(-1), str(2**63)])
+        result = self.read_csv(StringIO(data), header=None)
+        tm.assert_frame_equal(result, expected)
+
+        data = str(2**63) + '\n-1'
+        expected = DataFrame([str(2**63), str(-1)])
+        result = self.read_csv(StringIO(data), header=None)
+        tm.assert_frame_equal(result, expected)
+
     def test_empty_with_nrows_chunksize(self):
         # see gh-9535
         expected = DataFrame([], columns=['foo', 'bar'])
diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py
@@ -275,3 +275,11 @@ def test_empty_dtype(self):
         result = self.read_csv(StringIO(data), header=0,
                                dtype={'a': np.int32, 1: np.float64})
         tm.assert_frame_equal(result, expected)
+
+    def test_numeric_dtype(self):
+        # An explicit integer or float dtype must be honoured by read_csv.
+        data = '0\n1'
+        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
+            expected = pd.DataFrame([0, 1], dtype=dt)
+            result = self.read_csv(StringIO(data), header=None, dtype=dt)
+            tm.assert_frame_equal(result, expected)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -107,6 +107,8 @@ cdef extern from "parser/tokenizer.h":
         FINISHED
 
     enum: ERROR_OVERFLOW
+    enum: ERROR_MINUS_SIGN
+    enum: ERROR_UINT64_NAN
 
     ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                                   int *status)
@@ -217,7 +219,8 @@ cdef extern from "parser/tokenizer.h":
 
     int64_t str_to_int64(char *p_item, int64_t int_min,
                          int64_t int_max, int *error, char tsep) nogil
-#    uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
+    uint64_t str_to_uint64(char *p_item, uint64_t uint_max,
+                           int *error, char tsep) nogil
 
     double xstrtod(const char *p, char **q, char decimal, char sci,
                    char tsep, int skip_trailing) nogil
@@ -1164,12 +1167,17 @@ cdef class TextReader:
                              kh_str_t *na_hashset,
                              object na_flist):
         if is_integer_dtype(dtype):
-            result, na_count = _try_int64(self.parser, i, start,
-                                          end, na_filter, na_hashset)
-            if user_dtype and na_count is not None:
-                if na_count > 0:
-                    raise ValueError("Integer column has NA values in "
-                                     "column {column}".format(column=i))
+            try:
+                result, na_count = _try_int64(self.parser, i, start,
+                                              end, na_filter, na_hashset)
+                if user_dtype and na_count is not None:
+                    if na_count > 0:
+                        raise ValueError("Integer column has NA values in "
+                                         "column {column}".format(column=i))
+            except OverflowError:
+                result = _try_uint64(self.parser, i, start, end,
+                                     na_filter, na_hashset)
+                na_count = 0
 
             if result is not None and dtype != 'int64':
                 result = result.astype(dtype)
@@ -1750,6 +1758,69 @@ cdef inline int _try_double_nogil(parser_t *parser, int col,
 
     return 0
 
+cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
+                 bint na_filter, kh_str_t *na_hashset):
+    # Parse rows [line_start, line_end) of column `col` into a uint64 ndarray.
+    # Raises OverflowError on overflow, '-' sign or NA token; else None on error.
+    cdef:
+        int error
+        size_t lines
+        coliter_t it
+        uint64_t *data
+        ndarray result
+
+    lines = line_end - line_start
+    result = np.empty(lines, dtype=np.uint64)
+    data = <uint64_t *> result.data
+    # NOTE(review): `it` is never read here; the nogil helper sets up its own.
+    coliter_setup(&it, parser, col, line_start)
+    with nogil:
+        error = _try_uint64_nogil(parser, col, line_start, line_end,
+                                  na_filter, na_hashset, data)
+    if error == 0:
+        return result
+    if error in (ERROR_OVERFLOW, ERROR_UINT64_NAN, ERROR_MINUS_SIGN):
+        # The offending token is not available here, so the message is generic.
+        raise OverflowError('Overflow')
+    return None
+
+cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
+                                  int line_end, bint na_filter,
+                                  const kh_str_t *na_hashset,
+                                  uint64_t *data) nogil:
+    # GIL-free worker for _try_uint64.  Converts each word of column `col`
+    # in rows [line_start, line_end) with str_to_uint64 and stores it in
+    # `data`.  Returns 0 on success, ERROR_UINT64_NAN when `na_filter` is
+    # set and a word is found in `na_hashset`, otherwise the first non-zero
+    # error code reported by str_to_uint64.
+    cdef:
+        int status
+        size_t row
+        size_t n_rows = line_end - line_start
+        coliter_t cursor
+        const char *token = NULL
+        khiter_t slot
+
+    coliter_setup(&cursor, parser, col, line_start)
+
+    for row in range(n_rows):
+        COLITER_NEXT(cursor, token)
+
+        if na_filter:
+            # A lookup that does not land on the n_buckets sentinel means the
+            # token is in the NA hash table, so the column is not pure uint64.
+            slot = kh_get_str(na_hashset, token)
+            if slot != na_hashset.n_buckets:
+                return ERROR_UINT64_NAN
+
+        # parser.thousands is forwarded as the thousands separator (tsep).
+        data[row] = str_to_uint64(token, UINT64_MAX,
+                                  &status, parser.thousands)
+        if status != 0:
+            return status
+
+    return 0
+
 cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
                 bint na_filter, kh_str_t *na_hashset):
     cdef:
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -1757,6 +1757,14 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
 // End of xstrtod code
 // ---------------------------------------------------------------------------
 
+// Put a uint_state tracker into its initial state by clearing all three
+// seen_* flags. Always returns 0.
+int uint_state_init(uint_state *self) {
+    self->seen_null = self->seen_uint = self->seen_sint = 0;
+
+    return 0;
+}
+
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
     const char *p = (const char *)p_item;
@@ -1876,3 +1884,83 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     *error = 0;
     return number;
 }
+
+// Parse p_item as an unsigned decimal integer no larger than uint_max,
+// skipping tsep (when not '\0') wherever it appears after the first digit.
+// Returns the value with *error = 0, or 0 with *error set to an ERROR_* code.
+uint64_t str_to_uint64(const char *p_item, uint64_t uint_max,
+                       int *error, char tsep) {
+    const char *p = (const char *)p_item;
+    uint64_t number = 0;
+    int d;
+
+    // Skip leading spaces.
+    while (isspace((unsigned char)*p)) {
+        ++p;
+    }
+
+    // Handle sign.
+    if (*p == '-') {
+        *error = ERROR_MINUS_SIGN;
+        return 0;
+    } else if (*p == '+') {
+        p++;
+    }
+
+    // Check that there is a first digit.
+    if (!isdigit((unsigned char)*p)) {
+        *error = ERROR_NO_DIGITS;
+        return 0;
+    }
+
+    // If number is less than pre_max, at least one more digit
+    // can be processed without overflowing.
+    uint64_t pre_max = uint_max / 10;
+    int dig_pre_max = uint_max % 10;
+
+    // Process the digits.
+    d = *p;
+    if (tsep != '\0') {
+        while (1) {
+            if (d == tsep) {
+                d = *++p;
+                continue;
+            } else if (!isdigit((unsigned char)d)) {
+                break;
+            }
+            if ((number < pre_max) ||
+                ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+                number = number * 10 + (d - '0');
+                d = *++p;
+            } else {
+                *error = ERROR_OVERFLOW;
+                return 0;
+            }
+        }
+    } else {
+        while (isdigit((unsigned char)d)) {
+            if ((number < pre_max) ||
+                ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+                number = number * 10 + (d - '0');
+                d = *++p;
+            } else {
+                *error = ERROR_OVERFLOW;
+                return 0;
+            }
+        }
+    }
+
+    // Skip trailing spaces.
+    while (isspace((unsigned char)*p)) {
+        ++p;
+    }
+
+    // Did we use up all the characters?
+    if (*p) {
+        *error = ERROR_INVALID_CHARS;
+        return 0;
+    }
+
+    *error = 0;
+    return number;
+}
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
@@ -26,6 +26,7 @@ See LICENSE for the license
 #define ERROR_OVERFLOW 2
 #define ERROR_INVALID_CHARS 3
 #define ERROR_MINUS_SIGN 4
+#define ERROR_UINT64_NAN 5
 
 #include "../headers/stdint.h"
 
@@ -250,6 +251,16 @@ int tokenize_all_rows(parser_t *self);
 // Have parsed / type-converted a chunk of data
 // and want to free memory from the token stream
 
+typedef struct uint_state {  // three int flags, all zeroed by uint_state_init()
+    int seen_sint;  // presumably "a negative integer was seen" -- TODO confirm
+    int seen_uint;  // presumably "a value needing uint64 was seen" -- TODO confirm
+    int seen_null;  // presumably "an NA/null token was seen" -- TODO confirm
+} uint_state;
+
+int uint_state_init(uint_state *self);
+
+uint64_t str_to_uint64(const char *p_item, uint64_t uint_max,
+                       int *error, char tsep);
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep);
 double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,