BUG: Parse uint64 in read_csv (pandas-dev#15020)

gfyoung · jreback · commit 74e20a0c4146 · 2017-01-02T14:50:01.000-05:00
Adds behavior to allow for parsing of uint64 data in read_csv. Also ensures that they are properly handled along with NaN and negative values. Closes pandas-devgh-14983.
diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py
@@ -128,6 +128,29 @@ def time_read_parse_dates_iso8601(self):
         read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'])
 
 
+class read_uint64_integers(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.na_values = [2**63 + 500]
+
+        self.arr1 = np.arange(10000).astype('uint64') + 2**63
+        self.data1 = '\n'.join(map(lambda x: str(x), self.arr1))
+
+        self.arr2 = self.arr1.copy().astype(object)
+        self.arr2[500] = -1
+        self.data2 = '\n'.join(map(lambda x: str(x), self.arr2))
+
+    def time_read_uint64(self):
+        read_csv(StringIO(self.data1), header=None)
+
+    def time_read_uint64_neg_values(self):
+        read_csv(StringIO(self.data2), header=None)
+
+    def time_read_uint64_na_values(self):
+        read_csv(StringIO(self.data1), header=None, na_values=self.na_values)
+
+
 class write_csv_standard(object):
     goal_time = 0.2
 
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -288,6 +288,7 @@ Bug Fixes
 - Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
 - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
 - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
+- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`)
 - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`)
 - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
 - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -921,29 +921,39 @@ def test_int64_overflow(self):
             self.assertRaises(OverflowError, self.read_csv,
                               StringIO(data), converters={'ID': conv})
 
-        # These numbers fall right inside the int64 range,
+        # These numbers fall right inside the int64-uint64 range,
         # so they should be parsed as string.
+        ui_max = np.iinfo(np.uint64).max
         i_max = np.iinfo(np.int64).max
         i_min = np.iinfo(np.int64).min
 
-        for x in [i_max, i_min]:
+        for x in [i_max, i_min, ui_max]:
             result = self.read_csv(StringIO(str(x)), header=None)
             expected = DataFrame([x])
             tm.assert_frame_equal(result, expected)
 
-        # These numbers fall just outside the int64 range,
+        # These numbers fall just outside the int64-uint64 range,
         # so they should be parsed as string.
-        too_big = i_max + 1
+        too_big = ui_max + 1
         too_small = i_min - 1
 
         for x in [too_big, too_small]:
             result = self.read_csv(StringIO(str(x)), header=None)
-            if self.engine == 'python' and x == too_big:
-                expected = DataFrame([x])
-            else:
-                expected = DataFrame([str(x)])
+            expected = DataFrame([str(x)])
             tm.assert_frame_equal(result, expected)
 
+        # No numerical dtype can hold both negative and uint64 values,
+        # so they should be cast as string.
+        data = '-1\n' + str(2**63)
+        expected = DataFrame([str(-1), str(2**63)])
+        result = self.read_csv(StringIO(data), header=None)
+        tm.assert_frame_equal(result, expected)
+
+        data = str(2**63) + '\n-1'
+        expected = DataFrame([str(2**63), str(-1)])
+        result = self.read_csv(StringIO(data), header=None)
+        tm.assert_frame_equal(result, expected)
+
     def test_empty_with_nrows_chunksize(self):
         # see gh-9535
         expected = DataFrame([], columns=['foo', 'bar'])
diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py
@@ -275,3 +275,11 @@ def test_empty_dtype(self):
         result = self.read_csv(StringIO(data), header=0,
                                dtype={'a': np.int32, 1: np.float64})
         tm.assert_frame_equal(result, expected)
+
+    def test_numeric_dtype(self):
+        data = '0\n1'
+
+        for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
+            expected = pd.DataFrame([0, 1], dtype=dt)
+            result = self.read_csv(StringIO(data), header=None, dtype=dt)
+            tm.assert_frame_equal(expected, result)
diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py
@@ -289,3 +289,17 @@ def test_na_values_dict_col_index(self):
         out = self.read_csv(StringIO(data), na_values=na_values)
         expected = DataFrame({'a': [np.nan, 1]})
         tm.assert_frame_equal(out, expected)
+
+    def test_na_values_uint64(self):
+        # see gh-14983
+
+        na_values = [2**63]
+        data = str(2**63) + '\n' + str(2**63 + 1)
+        expected = DataFrame([str(2**63), str(2**63 + 1)])
+        out = self.read_csv(StringIO(data), header=None, na_values=na_values)
+        tm.assert_frame_equal(out, expected)
+
+        data = str(2**63) + ',1' + '\n,2'
+        expected = DataFrame([[str(2**63), 1], ['', 2]])
+        out = self.read_csv(StringIO(data), header=None)
+        tm.assert_frame_equal(out, expected)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -193,6 +193,14 @@ cdef extern from "parser/tokenizer.h":
         int *line_start
         int col
 
+    ctypedef struct uint_state:
+        int seen_sint
+        int seen_uint
+        int seen_null
+
+    void uint_state_init(uint_state *self)
+    int uint64_conflict(uint_state *self)
+
     void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
     void COLITER_NEXT(coliter_t, const char *) nogil
 
@@ -217,7 +225,8 @@ cdef extern from "parser/tokenizer.h":
 
     int64_t str_to_int64(char *p_item, int64_t int_min,
                          int64_t int_max, int *error, char tsep) nogil
-#    uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
+    uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
+                           uint64_t uint_max, int *error, char tsep) nogil
 
     double xstrtod(const char *p, char **q, char decimal, char sci,
                    char tsep, int skip_trailing) nogil
@@ -1127,6 +1136,14 @@ cdef class TextReader:
                 try:
                     col_res, na_count = self._convert_with_dtype(
                         dt, i, start, end, na_filter, 0, na_hashset, na_flist)
+                except ValueError:
+                    # This error is raised from trying to convert to uint64,
+                    # and we discover that we cannot convert to any numerical
+                    # dtype successfully. As a result, we leave the data
+                    # column AS IS with object dtype.
+                    col_res, na_count = self._convert_with_dtype(
+                        np.dtype('object'), i, start, end, 0,
+                        0, na_hashset, na_flist)
                 except OverflowError:
                     col_res, na_count = self._convert_with_dtype(
                         np.dtype('object'), i, start, end, na_filter,
@@ -1164,12 +1181,17 @@ cdef class TextReader:
                              kh_str_t *na_hashset,
                              object na_flist):
         if is_integer_dtype(dtype):
-            result, na_count = _try_int64(self.parser, i, start,
-                                          end, na_filter, na_hashset)
-            if user_dtype and na_count is not None:
-                if na_count > 0:
-                    raise ValueError("Integer column has NA values in "
-                                     "column {column}".format(column=i))
+            try:
+                result, na_count = _try_int64(self.parser, i, start,
+                                              end, na_filter, na_hashset)
+                if user_dtype and na_count is not None:
+                    if na_count > 0:
+                        raise ValueError("Integer column has NA values in "
+                                         "column {column}".format(column=i))
+            except OverflowError:
+                result = _try_uint64(self.parser, i, start, end,
+                                     na_filter, na_hashset)
+                na_count = 0
 
             if result is not None and dtype != 'int64':
                 result = result.astype(dtype)
@@ -1750,6 +1772,78 @@ cdef inline int _try_double_nogil(parser_t *parser, int col,
 
     return 0
 
+cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
+                 bint na_filter, kh_str_t *na_hashset):
+    cdef:
+        int error
+        size_t i, lines
+        coliter_t it
+        uint64_t *data
+        ndarray result
+        khiter_t k
+        uint_state state
+
+    lines = line_end - line_start
+    result = np.empty(lines, dtype=np.uint64)
+    data = <uint64_t *> result.data
+
+    uint_state_init(&state)
+    coliter_setup(&it, parser, col, line_start)
+    with nogil:
+        error = _try_uint64_nogil(parser, col, line_start, line_end,
+                                  na_filter, na_hashset, data, &state)
+    if error != 0:
+        if error == ERROR_OVERFLOW:
+            # Can't get the word variable
+            raise OverflowError('Overflow')
+        return None
+
+    if uint64_conflict(&state):
+        raise ValueError('Cannot convert to numerical dtype')
+
+    if state.seen_sint:
+        raise OverflowError('Overflow')
+
+    return result
+
+cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
+                                  int line_end, bint na_filter,
+                                  const kh_str_t *na_hashset,
+                                  uint64_t *data, uint_state *state) nogil:
+    cdef:
+        int error
+        size_t i
+        size_t lines = line_end - line_start
+        coliter_t it
+        const char *word = NULL
+        khiter_t k
+
+    coliter_setup(&it, parser, col, line_start)
+
+    if na_filter:
+        for i in range(lines):
+            COLITER_NEXT(it, word)
+            k = kh_get_str(na_hashset, word)
+            # in the hash table
+            if k != na_hashset.n_buckets:
+                state.seen_null = 1
+                data[i] = 0
+                continue
+
+            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
+                                    &error, parser.thousands)
+            if error != 0:
+                return error
+    else:
+        for i in range(lines):
+            COLITER_NEXT(it, word)
+            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
+                                    &error, parser.thousands)
+            if error != 0:
+                return error
+
+    return 0
+
 cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
                 bint na_filter, kh_str_t *na_hashset):
     cdef:
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -1757,6 +1757,16 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
 // End of xstrtod code
 // ---------------------------------------------------------------------------
 
+void uint_state_init(uint_state *self) {
+    self->seen_sint = 0;
+    self->seen_uint = 0;
+    self->seen_null = 0;
+}
+
+int uint64_conflict(uint_state *self) {
+    return self->seen_uint && (self->seen_sint || self->seen_null);
+}
+
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep) {
     const char *p = (const char *)p_item;
@@ -1876,3 +1886,88 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
     *error = 0;
     return number;
 }
+
+uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
+                       uint64_t uint_max, int *error, char tsep) {
+    const char *p = (const char *)p_item;
+    uint64_t pre_max = uint_max / 10;
+    int dig_pre_max = uint_max % 10;
+    uint64_t number = 0;
+    int d;
+
+    // Skip leading spaces.
+    while (isspace(*p)) {
+        ++p;
+    }
+
+    // Handle sign.
+    if (*p == '-') {
+        state->seen_sint = 1;
+        *error = 0;
+        return 0;
+    } else if (*p == '+') {
+        p++;
+    }
+
+    // Check that there is a first digit.
+    if (!isdigit(*p)) {
+        // Error...
+        *error = ERROR_NO_DIGITS;
+        return 0;
+    }
+
+    // If number is less than pre_max, at least one more digit
+    // can be processed without overflowing.
+    //
+    // Process the digits.
+    d = *p;
+    if (tsep != '\0') {
+        while (1) {
+            if (d == tsep) {
+                d = *++p;
+                continue;
+            } else if (!isdigit(d)) {
+                break;
+            }
+            if ((number < pre_max) ||
+                ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+                number = number * 10 + (d - '0');
+                d = *++p;
+
+            } else {
+                *error = ERROR_OVERFLOW;
+                return 0;
+            }
+        }
+    } else {
+        while (isdigit(d)) {
+            if ((number < pre_max) ||
+                ((number == pre_max) && (d - '0' <= dig_pre_max))) {
+                number = number * 10 + (d - '0');
+                d = *++p;
+
+            } else {
+                *error = ERROR_OVERFLOW;
+                return 0;
+            }
+        }
+    }
+
+    // Skip trailing spaces.
+    while (isspace(*p)) {
+        ++p;
+    }
+
+    // Did we use up all the characters?
+    if (*p) {
+        *error = ERROR_INVALID_CHARS;
+        return 0;
+    }
+
+    if (number > int_max) {
+        state->seen_uint = 1;
+    }
+
+    *error = 0;
+    return number;
+}
diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h
@@ -25,7 +25,6 @@ See LICENSE for the license
 #define ERROR_NO_DIGITS 1
 #define ERROR_OVERFLOW 2
 #define ERROR_INVALID_CHARS 3
-#define ERROR_MINUS_SIGN 4
 
 #include "../headers/stdint.h"
 
@@ -250,6 +249,18 @@ int tokenize_all_rows(parser_t *self);
 // Have parsed / type-converted a chunk of data
 // and want to free memory from the token stream
 
+typedef struct uint_state {
+    int seen_sint;
+    int seen_uint;
+    int seen_null;
+} uint_state;
+
+void uint_state_init(uint_state *self);
+
+int uint64_conflict(uint_state *self);
+
+uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
+                       uint64_t uint_max, int *error, char tsep);
 int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
                      int *error, char tsep);
 double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,