Skip to content

Commit 74e20a0

Browse files
gfyoung authored and jreback committed
BUG: Parse uint64 in read_csv (pandas-dev#15020)
Adds behavior to allow for parsing of uint64 data in read_csv. Also ensures that uint64 values are properly handled when they appear alongside NaN and negative values. Closes gh-14983.
1 parent ab54944 commit 74e20a0

File tree

8 files changed

+272
-16
lines changed

8 files changed

+272
-16
lines changed

asv_bench/benchmarks/io_bench.py

+23
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,29 @@ def time_read_parse_dates_iso8601(self):
128128
read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'])
129129

130130

131+
class read_uint64_integers(object):
    """Benchmark ``read_csv`` on single-column data above the int64 range.

    ``data1`` holds 10000 consecutive integers starting at ``2**63``;
    ``data2`` is the same column with one entry replaced by ``-1``.
    """

    goal_time = 0.2

    def setup(self):
        # 2**63 + 500 is one of the values present in ``arr1``, so the
        # NA-values benchmark actually hits an NA marker.
        self.na_values = [2**63 + 500]

        self.arr1 = np.arange(10000).astype('uint64') + 2**63
        self.data1 = '\n'.join(map(str, self.arr1))

        # Object dtype is needed so a negative value can sit next to
        # values that only fit in uint64.
        self.arr2 = self.arr1.copy().astype(object)
        self.arr2[500] = -1
        self.data2 = '\n'.join(map(str, self.arr2))

    def time_read_uint64(self):
        read_csv(StringIO(self.data1), header=None)

    def time_read_uint64_neg_values(self):
        read_csv(StringIO(self.data2), header=None)

    def time_read_uint64_na_values(self):
        read_csv(StringIO(self.data1), header=None, na_values=self.na_values)
152+
153+
131154
class write_csv_standard(object):
132155
goal_time = 0.2
133156

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ Bug Fixes
288288
- Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
289289
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
290290
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
291+
- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being converted to the wrong data types (:issue:`14983`)
291292
- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. ``astype()`` now raises an error in this case for Series and DataFrames (:issue:`14265`)
292293
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
293294
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)

pandas/io/tests/parser/common.py

+18-8
Original file line numberDiff line numberDiff line change
@@ -921,29 +921,39 @@ def test_int64_overflow(self):
921921
self.assertRaises(OverflowError, self.read_csv,
922922
StringIO(data), converters={'ID': conv})
923923

924-
# These numbers fall right inside the int64 range,
924+
# These numbers fall right inside the int64-uint64 range,
925925
# so they should be parsed as integers.
926+
ui_max = np.iinfo(np.uint64).max
926927
i_max = np.iinfo(np.int64).max
927928
i_min = np.iinfo(np.int64).min
928929

929-
for x in [i_max, i_min]:
930+
for x in [i_max, i_min, ui_max]:
930931
result = self.read_csv(StringIO(str(x)), header=None)
931932
expected = DataFrame([x])
932933
tm.assert_frame_equal(result, expected)
933934

934-
# These numbers fall just outside the int64 range,
935+
# These numbers fall just outside the int64-uint64 range,
935936
# so they should be parsed as string.
936-
too_big = i_max + 1
937+
too_big = ui_max + 1
937938
too_small = i_min - 1
938939

939940
for x in [too_big, too_small]:
940941
result = self.read_csv(StringIO(str(x)), header=None)
941-
if self.engine == 'python' and x == too_big:
942-
expected = DataFrame([x])
943-
else:
944-
expected = DataFrame([str(x)])
942+
expected = DataFrame([str(x)])
945943
tm.assert_frame_equal(result, expected)
946944

945+
# No numerical dtype can hold both negative and uint64 values,
946+
# so they should be cast as string.
947+
data = '-1\n' + str(2**63)
948+
expected = DataFrame([str(-1), str(2**63)])
949+
result = self.read_csv(StringIO(data), header=None)
950+
tm.assert_frame_equal(result, expected)
951+
952+
data = str(2**63) + '\n-1'
953+
expected = DataFrame([str(2**63), str(-1)])
954+
result = self.read_csv(StringIO(data), header=None)
955+
tm.assert_frame_equal(result, expected)
956+
947957
def test_empty_with_nrows_chunksize(self):
948958
# see gh-9535
949959
expected = DataFrame([], columns=['foo', 'bar'])

pandas/io/tests/parser/dtypes.py

+8
Original file line numberDiff line numberDiff line change
@@ -275,3 +275,11 @@ def test_empty_dtype(self):
275275
result = self.read_csv(StringIO(data), header=0,
276276
dtype={'a': np.int32, 1: np.float64})
277277
tm.assert_frame_equal(result, expected)
278+
279+
def test_numeric_dtype(self):
    """Check that every integer and float typecode is honoured as ``dtype``.

    The same two-row column is read once per typecode and compared with
    a frame built directly with that dtype.
    """
    data = '0\n1'

    for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
        expected = pd.DataFrame([0, 1], dtype=dt)
        result = self.read_csv(StringIO(data), header=None, dtype=dt)
        # (result, expected) matches the argument order used by the
        # other parser tests.
        tm.assert_frame_equal(result, expected)

pandas/io/tests/parser/na_values.py

+14
Original file line numberDiff line numberDiff line change
@@ -289,3 +289,17 @@ def test_na_values_dict_col_index(self):
289289
out = self.read_csv(StringIO(data), na_values=na_values)
290290
expected = DataFrame({'a': [np.nan, 1]})
291291
tm.assert_frame_equal(out, expected)
292+
293+
def test_na_values_uint64(self):
    """Columns mixing uint64-only values with NA entries come back as text."""
    # see gh-14983
    first = str(2**63)
    second = str(2**63 + 1)

    # An explicit NA marker that is itself a uint64-only value: the
    # column is expected back as the original strings.
    csv_text = first + '\n' + second
    out = self.read_csv(StringIO(csv_text), header=None, na_values=[2**63])
    tm.assert_frame_equal(out, DataFrame([first, second]))

    # An empty field in the uint64 column: the expected frame keeps the
    # empty string, while the second column is still parsed as integers.
    csv_text = first + ',1' + '\n,2'
    out = self.read_csv(StringIO(csv_text), header=None)
    tm.assert_frame_equal(out, DataFrame([[first, 1], ['', 2]]))

pandas/parser.pyx

+101-7
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,14 @@ cdef extern from "parser/tokenizer.h":
193193
int *line_start
194194
int col
195195

196+
ctypedef struct uint_state:
197+
int seen_sint
198+
int seen_uint
199+
int seen_null
200+
201+
void uint_state_init(uint_state *self)
202+
int uint64_conflict(uint_state *self)
203+
196204
void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
197205
void COLITER_NEXT(coliter_t, const char *) nogil
198206

@@ -217,7 +225,8 @@ cdef extern from "parser/tokenizer.h":
217225

218226
int64_t str_to_int64(char *p_item, int64_t int_min,
219227
int64_t int_max, int *error, char tsep) nogil
220-
# uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error)
228+
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
229+
uint64_t uint_max, int *error, char tsep) nogil
221230

222231
double xstrtod(const char *p, char **q, char decimal, char sci,
223232
char tsep, int skip_trailing) nogil
@@ -1127,6 +1136,14 @@ cdef class TextReader:
11271136
try:
11281137
col_res, na_count = self._convert_with_dtype(
11291138
dt, i, start, end, na_filter, 0, na_hashset, na_flist)
1139+
except ValueError:
1140+
# This error is raised from trying to convert to uint64,
1141+
# and we discover that we cannot convert to any numerical
1142+
# dtype successfully. As a result, we leave the data
1143+
# column AS IS with object dtype.
1144+
col_res, na_count = self._convert_with_dtype(
1145+
np.dtype('object'), i, start, end, 0,
1146+
0, na_hashset, na_flist)
11301147
except OverflowError:
11311148
col_res, na_count = self._convert_with_dtype(
11321149
np.dtype('object'), i, start, end, na_filter,
@@ -1164,12 +1181,17 @@ cdef class TextReader:
11641181
kh_str_t *na_hashset,
11651182
object na_flist):
11661183
if is_integer_dtype(dtype):
1167-
result, na_count = _try_int64(self.parser, i, start,
1168-
end, na_filter, na_hashset)
1169-
if user_dtype and na_count is not None:
1170-
if na_count > 0:
1171-
raise ValueError("Integer column has NA values in "
1172-
"column {column}".format(column=i))
1184+
try:
1185+
result, na_count = _try_int64(self.parser, i, start,
1186+
end, na_filter, na_hashset)
1187+
if user_dtype and na_count is not None:
1188+
if na_count > 0:
1189+
raise ValueError("Integer column has NA values in "
1190+
"column {column}".format(column=i))
1191+
except OverflowError:
1192+
result = _try_uint64(self.parser, i, start, end,
1193+
na_filter, na_hashset)
1194+
na_count = 0
11731195

11741196
if result is not None and dtype != 'int64':
11751197
result = result.astype(dtype)
@@ -1750,6 +1772,78 @@ cdef inline int _try_double_nogil(parser_t *parser, int col,
17501772

17511773
return 0
17521774

1775+
# Try to convert rows [line_start, line_end) of column `col` to uint64.
#
# Returns a uint64 ndarray on success, or None when a token is not a valid
# integer (any parse error other than overflow).
#
# Raises
#   OverflowError: a token exceeds UINT64_MAX, or a negative token was seen
#                  without any value above INT64_MAX.
#   ValueError:    a value above INT64_MAX shares the column with a negative
#                  token or an NA entry, so no numeric dtype fits.
cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
                 bint na_filter, kh_str_t *na_hashset):
    cdef:
        int error
        size_t lines
        uint64_t *data
        ndarray result
        uint_state state

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.uint64)
    data = <uint64_t *> result.data

    # The nogil helper sets up its own column iterator, so none is
    # needed at this level.
    uint_state_init(&state)
    with nogil:
        error = _try_uint64_nogil(parser, col, line_start, line_end,
                                  na_filter, na_hashset, data, &state)
    if error != 0:
        if error == ERROR_OVERFLOW:
            # Can't get the word variable
            raise OverflowError('Overflow')
        return None

    if uint64_conflict(&state):
        raise ValueError('Cannot convert to numerical dtype')

    # A negative token was recorded but nothing required uint64: the
    # column cannot be represented as unsigned.
    if state.seen_sint:
        raise OverflowError('Overflow')

    return result
1808+
1809+
# GIL-free worker for _try_uint64: parses each token of column `col` in rows
# [line_start, line_end) into `data`, recording in `state` what was seen.
# Returns 0 on success, or the first non-zero error code reported by
# str_to_uint64.
cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
                                  int line_end, bint na_filter,
                                  const kh_str_t *na_hashset,
                                  uint64_t *data, uint_state *state) nogil:
    cdef:
        int status
        size_t row
        size_t n_rows = line_end - line_start
        coliter_t it
        const char *token = NULL
        khiter_t slot

    coliter_setup(&it, parser, col, line_start)

    if not na_filter:
        # No NA filtering requested: every token must parse.
        for row in range(n_rows):
            COLITER_NEXT(it, token)
            data[row] = str_to_uint64(state, token, INT64_MAX, UINT64_MAX,
                                      &status, parser.thousands)
            if status != 0:
                return status
        return 0

    for row in range(n_rows):
        COLITER_NEXT(it, token)
        slot = kh_get_str(na_hashset, token)
        if slot == na_hashset.n_buckets:
            # Not in the NA hash table: parse it as a number.
            data[row] = str_to_uint64(state, token, INT64_MAX, UINT64_MAX,
                                      &status, parser.thousands)
            if status != 0:
                return status
        else:
            # NA marker: note it and store a placeholder value.
            state.seen_null = 1
            data[row] = 0

    return 0
1846+
17531847
cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
17541848
bint na_filter, kh_str_t *na_hashset):
17551849
cdef:

pandas/src/parser/tokenizer.c

+95
Original file line numberDiff line numberDiff line change
@@ -1757,6 +1757,16 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
17571757
// End of xstrtod code
17581758
// ---------------------------------------------------------------------------
17591759

1760+
// Reset every flag of a uint_state to "nothing seen yet".
void uint_state_init(uint_state *self) {
    self->seen_sint = self->seen_uint = self->seen_null = 0;
}
1765+
1766+
// Return 1 when a uint64-only value was seen together with a negative
// value or an NA entry, and 0 otherwise.
int uint64_conflict(uint_state *self) {
    if (!self->seen_uint) {
        return 0;
    }
    return self->seen_sint || self->seen_null;
}
1769+
17601770
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
17611771
int *error, char tsep) {
17621772
const char *p = (const char *)p_item;
@@ -1876,3 +1886,88 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
18761886
*error = 0;
18771887
return number;
18781888
}
1889+
1890+
// Parse the NUL-terminated token `p_item` as an unsigned 64-bit integer.
//
// state    - updated in place: seen_sint is set when the token starts with
//            '-', seen_uint when the parsed value is greater than int_max.
// int_max  - threshold above which a value is counted as uint64-only.
// uint_max - largest value accepted before ERROR_OVERFLOW is reported.
// error    - receives 0 on success, otherwise ERROR_NO_DIGITS,
//            ERROR_OVERFLOW or ERROR_INVALID_CHARS.
// tsep     - thousands separator skipped between digits ('\0' for none).
//
// Returns the parsed number, or 0 when *error is non-zero or the token is
// negative (a negative token is reported through state, not as an error).
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
                       uint64_t uint_max, int *error, char tsep) {
    const char *p = (const char *)p_item;
    uint64_t pre_max = uint_max / 10;
    int dig_pre_max = uint_max % 10;
    uint64_t number = 0;
    int d;

    // The <ctype.h> classifiers require an argument representable as
    // unsigned char (or EOF); a plain char may be negative for non-ASCII
    // bytes, hence the casts below.

    // Skip leading spaces.
    while (isspace((unsigned char)*p)) {
        ++p;
    }

    // Handle sign.
    if (*p == '-') {
        state->seen_sint = 1;
        *error = 0;
        return 0;
    } else if (*p == '+') {
        p++;
    }

    // Check that there is a first digit.
    if (!isdigit((unsigned char)*p)) {
        // Error...
        *error = ERROR_NO_DIGITS;
        return 0;
    }

    // If number is less than pre_max, at least one more digit
    // can be processed without overflowing.
    //
    // Process the digits.
    d = *p;
    if (tsep != '\0') {
        while (1) {
            if (d == tsep) {
                d = *++p;
                continue;
            } else if (!isdigit((unsigned char)d)) {
                break;
            }
            if ((number < pre_max) ||
                ((number == pre_max) && (d - '0' <= dig_pre_max))) {
                number = number * 10 + (d - '0');
                d = *++p;

            } else {
                *error = ERROR_OVERFLOW;
                return 0;
            }
        }
    } else {
        while (isdigit((unsigned char)d)) {
            if ((number < pre_max) ||
                ((number == pre_max) && (d - '0' <= dig_pre_max))) {
                number = number * 10 + (d - '0');
                d = *++p;

            } else {
                *error = ERROR_OVERFLOW;
                return 0;
            }
        }
    }

    // Skip trailing spaces.
    while (isspace((unsigned char)*p)) {
        ++p;
    }

    // Did we use up all the characters?
    if (*p) {
        *error = ERROR_INVALID_CHARS;
        return 0;
    }

    // The comparison is performed in uint64_t; the cast makes the
    // conversion explicit.
    if (number > (uint64_t)int_max) {
        state->seen_uint = 1;
    }

    *error = 0;
    return number;
}

pandas/src/parser/tokenizer.h

+12-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ See LICENSE for the license
2525
#define ERROR_NO_DIGITS 1
2626
#define ERROR_OVERFLOW 2
2727
#define ERROR_INVALID_CHARS 3
28-
#define ERROR_MINUS_SIGN 4
2928

3029
#include "../headers/stdint.h"
3130

@@ -250,6 +249,18 @@ int tokenize_all_rows(parser_t *self);
250249
// Have parsed / type-converted a chunk of data
251250
// and want to free memory from the token stream
252251

252+
// Flags accumulated over one column while trying to parse it as uint64.
typedef struct uint_state {
253+
int seen_sint;  // set when a token starting with '-' was encountered
254+
int seen_uint;  // set when a parsed value was greater than the int_max given
255+
int seen_null;  // set when a token matched the NA-values hash table
256+
} uint_state;
257+
258+
void uint_state_init(uint_state *self);
259+
260+
int uint64_conflict(uint_state *self);
261+
262+
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
263+
uint64_t uint_max, int *error, char tsep);
253264
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
254265
int *error, char tsep);
255266
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,

0 commit comments

Comments
 (0)