Skip to content

Commit 2ef3225

Browse files
committed
BUG: Parse uint64 in read_csv
Closes gh-14983.
1 parent 0252385 commit 2ef3225

File tree

6 files changed

+204
-15
lines changed

6 files changed

+204
-15
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ Bug Fixes
286286
- Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
287287
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
288288
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
289+
- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being converted to the wrong data types (:issue:`14983`)
289290
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. ``astype()`` now raises an error for Series and DataFrames (:issue:`14265`)
290291
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
291292
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)

pandas/io/tests/parser/common.py

+18-8
Original file line numberDiff line numberDiff line change
@@ -956,29 +956,39 @@ def test_int64_overflow(self):
956956
self.assertRaises(OverflowError, self.read_csv,
957957
StringIO(data), converters={'ID': conv})
958958

959-
# These numbers fall right inside the int64 range,
959+
# These numbers fall right inside the int64-uint64 range,
960960
# so they should be parsed as integers.
961+
ui_max = np.iinfo(np.uint64).max
961962
i_max = np.iinfo(np.int64).max
962963
i_min = np.iinfo(np.int64).min
963964

964-
for x in [i_max, i_min]:
965+
for x in [i_max, i_min, ui_max]:
965966
result = self.read_csv(StringIO(str(x)), header=None)
966967
expected = DataFrame([x])
967968
tm.assert_frame_equal(result, expected)
968969

969-
# These numbers fall just outside the int64 range,
970+
# These numbers fall just outside the int64-uint64 range,
970971
# so they should be parsed as string.
971-
too_big = i_max + 1
972+
too_big = ui_max + 1
972973
too_small = i_min - 1
973974

974975
for x in [too_big, too_small]:
975976
result = self.read_csv(StringIO(str(x)), header=None)
976-
if self.engine == 'python' and x == too_big:
977-
expected = DataFrame([x])
978-
else:
979-
expected = DataFrame([str(x)])
977+
expected = DataFrame([str(x)])
980978
tm.assert_frame_equal(result, expected)
981979

980+
# No numerical dtype can hold both negative and uint64 values,
981+
# so they should be cast as string.
982+
data = '-1\n' + str(2**63)
983+
expected = DataFrame([str(-1), str(2**63)])
984+
result = self.read_csv(StringIO(data), header=None)
985+
tm.assert_frame_equal(result, expected)
986+
987+
data = str(2**63) + '\n-1'
988+
expected = DataFrame([str(2**63), str(-1)])
989+
result = self.read_csv(StringIO(data), header=None)
990+
tm.assert_frame_equal(result, expected)
991+
982992
def test_empty_with_nrows_chunksize(self):
983993
# see gh-9535
984994
expected = DataFrame([], columns=['foo', 'bar'])

pandas/io/tests/parser/dtypes.py

+8
Original file line numberDiff line numberDiff line change
@@ -275,3 +275,11 @@ def test_empty_dtype(self):
275275
result = self.read_csv(StringIO(data), header=0,
276276
dtype={'a': np.int32, 1: np.float64})
277277
tm.assert_frame_equal(result, expected)
278+
279+
def test_numeric_dtype(self):
    # Reading a purely numeric column with an explicit ``dtype`` must
    # yield a frame of exactly that dtype, for every integer and float
    # typecode that numpy advertises.
    data = '0\n1'
    typecodes = np.typecodes['AllInteger'] + np.typecodes['Float']

    for code in typecodes:
        expected = pd.DataFrame([0, 1], dtype=code)
        result = self.read_csv(StringIO(data), header=None, dtype=code)
        tm.assert_frame_equal(expected, result)

pandas/parser.pyx

+78-7
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,8 @@ cdef extern from "parser/tokenizer.h":
107107
FINISHED
108108

109109
enum: ERROR_OVERFLOW
110+
enum: ERROR_MINUS_SIGN
111+
enum: ERROR_UINT64_NAN
110112

111113
ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
112114
int *status)
@@ -217,7 +219,8 @@ cdef extern from "parser/tokenizer.h":
217219

218220
int64_t str_to_int64(char *p_item, int64_t int_min,
219221
int64_t int_max, int *error, char tsep) nogil
220-
# uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
222+
uint64_t str_to_uint64(char *p_item, uint64_t uint_max,
223+
int *error, char tsep) nogil
221224

222225
double xstrtod(const char *p, char **q, char decimal, char sci,
223226
char tsep, int skip_trailing) nogil
@@ -1164,12 +1167,17 @@ cdef class TextReader:
11641167
kh_str_t *na_hashset,
11651168
object na_flist):
11661169
if is_integer_dtype(dtype):
1167-
result, na_count = _try_int64(self.parser, i, start,
1168-
end, na_filter, na_hashset)
1169-
if user_dtype and na_count is not None:
1170-
if na_count > 0:
1171-
raise ValueError("Integer column has NA values in "
1172-
"column {column}".format(column=i))
1170+
try:
1171+
result, na_count = _try_int64(self.parser, i, start,
1172+
end, na_filter, na_hashset)
1173+
if user_dtype and na_count is not None:
1174+
if na_count > 0:
1175+
raise ValueError("Integer column has NA values in "
1176+
"column {column}".format(column=i))
1177+
except OverflowError:
1178+
result = _try_uint64(self.parser, i, start, end,
1179+
na_filter, na_hashset)
1180+
na_count = 0
11731181

11741182
if result is not None and dtype != 'int64':
11751183
result = result.astype(dtype)
@@ -1750,6 +1758,69 @@ cdef inline int _try_double_nogil(parser_t *parser, int col,
17501758

17511759
return 0
17521760

1761+
# Convert rows [line_start, line_end) of column `col` into a uint64 ndarray.
#
# The conversion itself runs without the GIL in `_try_uint64_nogil`, which
# sets up its own column iterator, so no iterator is needed here.
#
# Returns the filled ndarray on success, or None when the nogil helper
# reports an error code other than the three handled below.
#
# Raises OverflowError when the helper reports ERROR_OVERFLOW,
# ERROR_UINT64_NAN or ERROR_MINUS_SIGN.
cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
                 bint na_filter, kh_str_t *na_hashset):
    cdef:
        int error
        size_t lines
        uint64_t *data
        ndarray result

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.uint64)
    data = <uint64_t *> result.data
    with nogil:
        error = _try_uint64_nogil(parser, col, line_start, line_end,
                                  na_filter, na_hashset, data)
    if error != 0:
        if (error == ERROR_OVERFLOW or error == ERROR_UINT64_NAN or
                error == ERROR_MINUS_SIGN):
            # Can't get the word variable
            raise OverflowError('Overflow')
        return None

    return result
1786+
1787+
# GIL-free worker: parse rows [line_start, line_end) of column `col` as
# uint64 and store them in `data`.
#
# Returns 0 when every row converted, otherwise the first non-zero error
# code: ERROR_UINT64_NAN when `na_filter` is set and a token is found in
# `na_hashset`, or whatever code `str_to_uint64` reported.
cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
                                  int line_end, bint na_filter,
                                  const kh_str_t *na_hashset,
                                  uint64_t *data) nogil:
    cdef:
        int status
        size_t row
        size_t n_rows = line_end - line_start
        coliter_t it
        const char *token = NULL
        khiter_t slot

    coliter_setup(&it, parser, col, line_start)

    if not na_filter:
        # No NA lookup requested: convert every token directly.
        for row in range(n_rows):
            COLITER_NEXT(it, token)
            data[row] = str_to_uint64(token, UINT64_MAX,
                                      &status, parser.thousands)
            if status != 0:
                return status

        return 0

    for row in range(n_rows):
        COLITER_NEXT(it, token)
        slot = kh_get_str(na_hashset, token)
        # in the hash table
        if slot != na_hashset.n_buckets:
            status = ERROR_UINT64_NAN
            return status

        data[row] = str_to_uint64(token, UINT64_MAX,
                                  &status, parser.thousands)
        if status != 0:
            return status

    return 0
1823+
17531824
cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
17541825
bint na_filter, kh_str_t *na_hashset):
17551826
cdef:

pandas/src/parser/tokenizer.c

+88
Original file line numberDiff line numberDiff line change
@@ -1757,6 +1757,14 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
17571757
// End of xstrtod code
17581758
// ---------------------------------------------------------------------------
17591759

1760+
// Reset every flag of `self` to 0.  Always returns 0.
int uint_state_init(uint_state *self) {
    self->seen_sint = self->seen_uint = self->seen_null = 0;
    return 0;
}
1767+
17601768
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
17611769
int *error, char tsep) {
17621770
const char *p = (const char *)p_item;
@@ -1876,3 +1884,83 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18761884
*error = 0;
18771885
return number;
18781886
}
1887+
1888+
uint64_t str_to_uint64(const char *p_item, uint64_t uint_max,
1889+
int *error, char tsep) {
1890+
const char *p = (const char *)p_item;
1891+
uint64_t number = 0;
1892+
int d;
1893+
1894+
// Skip leading spaces.
1895+
while (isspace(*p)) {
1896+
++p;
1897+
}
1898+
1899+
// Handle sign.
1900+
if (*p == '-') {
1901+
*error = ERROR_MINUS_SIGN;
1902+
return 0;
1903+
} else if (*p == '+') {
1904+
p++;
1905+
}
1906+
1907+
// Check that there is a first digit.
1908+
if (!isdigit(*p)) {
1909+
// Error...
1910+
*error = ERROR_NO_DIGITS;
1911+
return 0;
1912+
}
1913+
1914+
// If number is less than pre_max, at least one more digit
1915+
// can be processed without overflowing.
1916+
int64_t pre_max = uint_max / 10;
1917+
int dig_pre_max = uint_max % 10;
1918+
1919+
// Process the digits.
1920+
d = *p;
1921+
if (tsep != '\0') {
1922+
while (1) {
1923+
if (d == tsep) {
1924+
d = *++p;
1925+
continue;
1926+
} else if (!isdigit(d)) {
1927+
break;
1928+
}
1929+
if ((number < pre_max) ||
1930+
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1931+
number = number * 10 + (d - '0');
1932+
d = *++p;
1933+
1934+
} else {
1935+
*error = ERROR_OVERFLOW;
1936+
return 0;
1937+
}
1938+
}
1939+
} else {
1940+
while (isdigit(d)) {
1941+
if ((number < pre_max) ||
1942+
((number == pre_max) && (d - '0' <= dig_pre_max))) {
1943+
number = number * 10 + (d - '0');
1944+
d = *++p;
1945+
1946+
} else {
1947+
*error = ERROR_OVERFLOW;
1948+
return 0;
1949+
}
1950+
}
1951+
}
1952+
1953+
// Skip trailing spaces.
1954+
while (isspace(*p)) {
1955+
++p;
1956+
}
1957+
1958+
// Did we use up all the characters?
1959+
if (*p) {
1960+
*error = ERROR_INVALID_CHARS;
1961+
return 0;
1962+
}
1963+
1964+
*error = 0;
1965+
return number;
1966+
}

pandas/src/parser/tokenizer.h

+11
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ See LICENSE for the license
2626
#define ERROR_OVERFLOW 2
2727
#define ERROR_INVALID_CHARS 3
2828
#define ERROR_MINUS_SIGN 4
29+
#define ERROR_UINT64_NAN 5
2930

3031
#include "../headers/stdint.h"
3132

@@ -250,6 +251,16 @@ int tokenize_all_rows(parser_t *self);
250251
// Have parsed / type-converted a chunk of data
251252
// and want to free memory from the token stream
252253

254+
// Three integer flags, all reset to 0 by uint_state_init().
// NOTE(review): the names suggest "seen a signed int / an unsigned int /
// a null value" -- confirm against the code that sets them.
struct uint_state {
    int seen_sint;
    int seen_uint;
    int seen_null;
};
typedef struct uint_state uint_state;
259+
260+
int uint_state_init(uint_state *self);
261+
262+
uint64_t str_to_uint64(const char *p_item, uint64_t uint_max,
263+
int *error, char tsep);
253264
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
254265
int *error, char tsep);
255266
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,

0 commit comments

Comments
 (0)