diff --git a/asv_bench/benchmarks/io_bench.py b/asv_bench/benchmarks/io_bench.py index 2ce3c4726b783..52064d2cdb8a2 100644 --- a/asv_bench/benchmarks/io_bench.py +++ b/asv_bench/benchmarks/io_bench.py @@ -128,6 +128,29 @@ def time_read_parse_dates_iso8601(self): read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo']) +class read_uint64_integers(object): + goal_time = 0.2 + + def setup(self): + self.na_values = [2**63 + 500] + + self.arr1 = np.arange(10000).astype('uint64') + 2**63 + self.data1 = '\n'.join(map(lambda x: str(x), self.arr1)) + + self.arr2 = self.arr1.copy().astype(object) + self.arr2[500] = -1 + self.data2 = '\n'.join(map(lambda x: str(x), self.arr2)) + + def time_read_uint64(self): + read_csv(StringIO(self.data1), header=None) + + def time_read_uint64_neg_values(self): + read_csv(StringIO(self.data2), header=None) + + def time_read_uint64_na_values(self): + read_csv(StringIO(self.data1), header=None, na_values=self.na_values) + + class write_csv_standard(object): goal_time = 0.2 diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index a947b4f3ca0ac..364a85f4c5947 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -288,6 +288,7 @@ Bug Fixes - Bug in ``Index`` power operations with reversed operands (:issue:`14973`) - Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`) - Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`) +- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being improperly converted to the wrong data types (:issue:`14983`) - Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`) - Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`) - Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`) diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index e694e529212aa..6a05dada05e0d 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -921,29 +921,39 @@ def test_int64_overflow(self): self.assertRaises(OverflowError, self.read_csv, StringIO(data), converters={'ID': conv}) - # These numbers fall right inside the int64 range, + # These numbers fall right inside the int64-uint64 range, # so they should be parsed as string. + ui_max = np.iinfo(np.uint64).max i_max = np.iinfo(np.int64).max i_min = np.iinfo(np.int64).min - for x in [i_max, i_min]: + for x in [i_max, i_min, ui_max]: result = self.read_csv(StringIO(str(x)), header=None) expected = DataFrame([x]) tm.assert_frame_equal(result, expected) - # These numbers fall just outside the int64 range, + # These numbers fall just outside the int64-uint64 range, # so they should be parsed as string. - too_big = i_max + 1 + too_big = ui_max + 1 too_small = i_min - 1 for x in [too_big, too_small]: result = self.read_csv(StringIO(str(x)), header=None) - if self.engine == 'python' and x == too_big: - expected = DataFrame([x]) - else: - expected = DataFrame([str(x)]) + expected = DataFrame([str(x)]) tm.assert_frame_equal(result, expected) + # No numerical dtype can hold both negative and uint64 values, + # so they should be cast as string. + data = '-1\n' + str(2**63) + expected = DataFrame([str(-1), str(2**63)]) + result = self.read_csv(StringIO(data), header=None) + tm.assert_frame_equal(result, expected) + + data = str(2**63) + '\n-1' + expected = DataFrame([str(2**63), str(-1)]) + result = self.read_csv(StringIO(data), header=None) + tm.assert_frame_equal(result, expected) + def test_empty_with_nrows_chunksize(self): # see gh-9535 expected = DataFrame([], columns=['foo', 'bar']) diff --git a/pandas/io/tests/parser/dtypes.py b/pandas/io/tests/parser/dtypes.py index b9ab79c3b9d54..abcd14e9499cb 100644 --- a/pandas/io/tests/parser/dtypes.py +++ b/pandas/io/tests/parser/dtypes.py @@ -275,3 +275,11 @@ def test_empty_dtype(self): result = self.read_csv(StringIO(data), header=0, dtype={'a': np.int32, 1: np.float64}) tm.assert_frame_equal(result, expected) + + def test_numeric_dtype(self): + data = '0\n1' + + for dt in np.typecodes['AllInteger'] + np.typecodes['Float']: + expected = pd.DataFrame([0, 1], dtype=dt) + result = self.read_csv(StringIO(data), header=None, dtype=dt) + tm.assert_frame_equal(expected, result) diff --git a/pandas/io/tests/parser/na_values.py b/pandas/io/tests/parser/na_values.py index e245bc5589145..2cbd7cdedf2ab 100644 --- a/pandas/io/tests/parser/na_values.py +++ b/pandas/io/tests/parser/na_values.py @@ -289,3 +289,17 @@ def test_na_values_dict_col_index(self): out = self.read_csv(StringIO(data), na_values=na_values) expected = DataFrame({'a': [np.nan, 1]}) tm.assert_frame_equal(out, expected) + + def test_na_values_uint64(self): + # see gh-14983 + + na_values = [2**63] + data = str(2**63) + '\n' + str(2**63 + 1) + expected = DataFrame([str(2**63), str(2**63 + 1)]) + out = self.read_csv(StringIO(data), header=None, na_values=na_values) + tm.assert_frame_equal(out, expected) + + data = str(2**63) + ',1' + '\n,2' + expected = DataFrame([[str(2**63), 1], ['', 2]]) + out = self.read_csv(StringIO(data), header=None) + tm.assert_frame_equal(out, expected) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 2464ee15b36b7..c5082e999d19c 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -193,6 +193,14 @@ cdef extern from "parser/tokenizer.h": int *line_start int col + ctypedef struct uint_state: + int seen_sint + int seen_uint + int seen_null + + void uint_state_init(uint_state *self) + int uint64_conflict(uint_state *self) + void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil void COLITER_NEXT(coliter_t, const char *) nogil @@ -217,7 +225,8 @@ cdef extern from "parser/tokenizer.h": int64_t str_to_int64(char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) nogil -# uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error) + uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max, + uint64_t uint_max, int *error, char tsep) nogil double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing) nogil @@ -1127,6 +1136,14 @@ cdef class TextReader: try: col_res, na_count = self._convert_with_dtype( dt, i, start, end, na_filter, 0, na_hashset, na_flist) + except ValueError: + # This error is raised from trying to convert to uint64, + # and we discover that we cannot convert to any numerical + # dtype successfully. As a result, we leave the data + # column AS IS with object dtype. + col_res, na_count = self._convert_with_dtype( + np.dtype('object'), i, start, end, 0, + 0, na_hashset, na_flist) except OverflowError: col_res, na_count = self._convert_with_dtype( np.dtype('object'), i, start, end, na_filter, @@ -1164,12 +1181,17 @@ cdef class TextReader: kh_str_t *na_hashset, object na_flist): if is_integer_dtype(dtype): - result, na_count = _try_int64(self.parser, i, start, - end, na_filter, na_hashset) - if user_dtype and na_count is not None: - if na_count > 0: - raise ValueError("Integer column has NA values in " - "column {column}".format(column=i)) + try: + result, na_count = _try_int64(self.parser, i, start, + end, na_filter, na_hashset) + if user_dtype and na_count is not None: + if na_count > 0: + raise ValueError("Integer column has NA values in " + "column {column}".format(column=i)) + except OverflowError: + result = _try_uint64(self.parser, i, start, end, + na_filter, na_hashset) + na_count = 0 if result is not None and dtype != 'int64': result = result.astype(dtype) @@ -1750,6 +1772,78 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, return 0 +cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end, + bint na_filter, kh_str_t *na_hashset): + cdef: + int error + size_t i, lines + coliter_t it + uint64_t *data + ndarray result + khiter_t k + uint_state state + + lines = line_end - line_start + result = np.empty(lines, dtype=np.uint64) + data = result.data + + uint_state_init(&state) + coliter_setup(&it, parser, col, line_start) + with nogil: + error = _try_uint64_nogil(parser, col, line_start, line_end, + na_filter, na_hashset, data, &state) + if error != 0: + if error == ERROR_OVERFLOW: + # Can't get the word variable + raise OverflowError('Overflow') + return None + + if uint64_conflict(&state): + raise ValueError('Cannot convert to numerical dtype') + + if state.seen_sint: + raise OverflowError('Overflow') + + return result + +cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start, + int line_end, bint na_filter, + const kh_str_t *na_hashset, + uint64_t *data, uint_state *state) nogil: + cdef: + int error + size_t i + size_t lines = line_end - line_start + coliter_t it + const char *word = NULL + khiter_t k + + coliter_setup(&it, parser, col, line_start) + + if na_filter: + for i in range(lines): + COLITER_NEXT(it, word) + k = kh_get_str(na_hashset, word) + # in the hash table + if k != na_hashset.n_buckets: + state.seen_null = 1 + data[i] = 0 + continue + + data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, + &error, parser.thousands) + if error != 0: + return error + else: + for i in range(lines): + COLITER_NEXT(it, word) + data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, + &error, parser.thousands) + if error != 0: + return error + + return 0 + cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset): cdef: diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c index 1ea62d66345bd..bc729cd3e7453 100644 --- a/pandas/src/parser/tokenizer.c +++ b/pandas/src/parser/tokenizer.c @@ -1757,6 +1757,16 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, // End of xstrtod code // --------------------------------------------------------------------------- +void uint_state_init(uint_state *self) { + self->seen_sint = 0; + self->seen_uint = 0; + self->seen_null = 0; +} + +int uint64_conflict(uint_state *self) { + return self->seen_uint && (self->seen_sint || self->seen_null); +} + int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep) { const char *p = (const char *)p_item; @@ -1876,3 +1886,88 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, *error = 0; return number; } + +uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, + uint64_t uint_max, int *error, char tsep) { + const char *p = (const char *)p_item; + uint64_t pre_max = uint_max / 10; + int dig_pre_max = uint_max % 10; + uint64_t number = 0; + int d; + + // Skip leading spaces. + while (isspace(*p)) { + ++p; + } + + // Handle sign. + if (*p == '-') { + state->seen_sint = 1; + *error = 0; + return 0; + } else if (*p == '+') { + p++; + } + + // Check that there is a first digit. + if (!isdigit(*p)) { + // Error... + *error = ERROR_NO_DIGITS; + return 0; + } + + // If number is less than pre_max, at least one more digit + // can be processed without overflowing. + // + // Process the digits. + d = *p; + if (tsep != '\0') { + while (1) { + if (d == tsep) { + d = *++p; + continue; + } else if (!isdigit(d)) { + break; + } + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + + } else { + *error = ERROR_OVERFLOW; + return 0; + } + } + } else { + while (isdigit(d)) { + if ((number < pre_max) || + ((number == pre_max) && (d - '0' <= dig_pre_max))) { + number = number * 10 + (d - '0'); + d = *++p; + + } else { + *error = ERROR_OVERFLOW; + return 0; + } + } + } + + // Skip trailing spaces. + while (isspace(*p)) { + ++p; + } + + // Did we use up all the characters? + if (*p) { + *error = ERROR_INVALID_CHARS; + return 0; + } + + if (number > int_max) { + state->seen_uint = 1; + } + + *error = 0; + return number; +} diff --git a/pandas/src/parser/tokenizer.h b/pandas/src/parser/tokenizer.h index e01812f1c5520..d31bf4b688c58 100644 --- a/pandas/src/parser/tokenizer.h +++ b/pandas/src/parser/tokenizer.h @@ -25,7 +25,6 @@ See LICENSE for the license #define ERROR_NO_DIGITS 1 #define ERROR_OVERFLOW 2 #define ERROR_INVALID_CHARS 3 -#define ERROR_MINUS_SIGN 4 #include "../headers/stdint.h" @@ -250,6 +249,18 @@ int tokenize_all_rows(parser_t *self); // Have parsed / type-converted a chunk of data // and want to free memory from the token stream +typedef struct uint_state { + int seen_sint; + int seen_uint; + int seen_null; +} uint_state; + +void uint_state_init(uint_state *self); + +int uint64_conflict(uint_state *self); + +uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, + uint64_t uint_max, int *error, char tsep); int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,