-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
BUG: Parse uint64 in read_csv #15020
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -193,6 +193,14 @@ cdef extern from "parser/tokenizer.h": | |
int *line_start | ||
int col | ||
|
||
ctypedef struct uint_state: | ||
int seen_sint | ||
int seen_uint | ||
int seen_null | ||
|
||
void uint_state_init(uint_state *self) | ||
int uint64_conflict(uint_state *self) | ||
|
||
void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil | ||
void COLITER_NEXT(coliter_t, const char *) nogil | ||
|
||
|
@@ -217,7 +225,8 @@ cdef extern from "parser/tokenizer.h": | |
|
||
int64_t str_to_int64(char *p_item, int64_t int_min, | ||
int64_t int_max, int *error, char tsep) nogil | ||
# uint64_t str_to_uint64(char *p_item, uint64_t uint_max, int *error) | ||
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max, | ||
uint64_t uint_max, int *error, char tsep) nogil | ||
|
||
double xstrtod(const char *p, char **q, char decimal, char sci, | ||
char tsep, int skip_trailing) nogil | ||
|
@@ -1127,6 +1136,14 @@ cdef class TextReader: | |
try: | ||
col_res, na_count = self._convert_with_dtype( | ||
dt, i, start, end, na_filter, 0, na_hashset, na_flist) | ||
except ValueError: | ||
# This error is raised from trying to convert to uint64, | ||
# and we discover that we cannot convert to any numerical | ||
# dtype successfully. As a result, we leave the data | ||
# column AS IS with object dtype. | ||
col_res, na_count = self._convert_with_dtype( | ||
np.dtype('object'), i, start, end, 0, | ||
0, na_hashset, na_flist) | ||
except OverflowError: | ||
col_res, na_count = self._convert_with_dtype( | ||
np.dtype('object'), i, start, end, na_filter, | ||
|
@@ -1164,12 +1181,17 @@ cdef class TextReader: | |
kh_str_t *na_hashset, | ||
object na_flist): | ||
if is_integer_dtype(dtype): | ||
result, na_count = _try_int64(self.parser, i, start, | ||
end, na_filter, na_hashset) | ||
if user_dtype and na_count is not None: | ||
if na_count > 0: | ||
raise ValueError("Integer column has NA values in " | ||
"column {column}".format(column=i)) | ||
try: | ||
result, na_count = _try_int64(self.parser, i, start, | ||
end, na_filter, na_hashset) | ||
if user_dtype and na_count is not None: | ||
if na_count > 0: | ||
raise ValueError("Integer column has NA values in " | ||
"column {column}".format(column=i)) | ||
except OverflowError: | ||
result = _try_uint64(self.parser, i, start, end, | ||
na_filter, na_hashset) | ||
na_count = 0 | ||
|
||
if result is not None and dtype != 'int64': | ||
result = result.astype(dtype) | ||
|
@@ -1750,6 +1772,78 @@ cdef inline int _try_double_nogil(parser_t *parser, int col, | |
|
||
return 0 | ||
|
||
cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end, | ||
bint na_filter, kh_str_t *na_hashset): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. prob for another PR, but we should generate these try_* routines with tempita (if possible) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's possible I imagine but not as nice since the implementations are definitely customized to the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ok, if it seems worth it |
||
cdef: | ||
int error | ||
size_t i, lines | ||
coliter_t it | ||
uint64_t *data | ||
ndarray result | ||
khiter_t k | ||
uint_state state | ||
|
||
lines = line_end - line_start | ||
result = np.empty(lines, dtype=np.uint64) | ||
data = <uint64_t *> result.data | ||
|
||
uint_state_init(&state) | ||
coliter_setup(&it, parser, col, line_start) | ||
with nogil: | ||
error = _try_uint64_nogil(parser, col, line_start, line_end, | ||
na_filter, na_hashset, data, &state) | ||
if error != 0: | ||
if error == ERROR_OVERFLOW: | ||
# Can't get the word variable | ||
raise OverflowError('Overflow') | ||
return None | ||
|
||
if uint64_conflict(&state): | ||
raise ValueError('Cannot convert to numerical dtype') | ||
|
||
if state.seen_sint: | ||
raise OverflowError('Overflow') | ||
|
||
return result | ||
|
||
cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start, | ||
int line_end, bint na_filter, | ||
const kh_str_t *na_hashset, | ||
uint64_t *data, uint_state *state) nogil: | ||
cdef: | ||
int error | ||
size_t i | ||
size_t lines = line_end - line_start | ||
coliter_t it | ||
const char *word = NULL | ||
khiter_t k | ||
|
||
coliter_setup(&it, parser, col, line_start) | ||
|
||
if na_filter: | ||
for i in range(lines): | ||
COLITER_NEXT(it, word) | ||
k = kh_get_str(na_hashset, word) | ||
# in the hash table | ||
if k != na_hashset.n_buckets: | ||
state.seen_null = 1 | ||
data[i] = 0 | ||
continue | ||
|
||
data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, | ||
&error, parser.thousands) | ||
if error != 0: | ||
return error | ||
else: | ||
for i in range(lines): | ||
COLITER_NEXT(it, word) | ||
data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX, | ||
&error, parser.thousands) | ||
if error != 0: | ||
return error | ||
|
||
return 0 | ||
|
||
cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, | ||
bint na_filter, kh_str_t *na_hashset): | ||
cdef: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1757,6 +1757,16 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep, | |
// End of xstrtod code | ||
// --------------------------------------------------------------------------- | ||
|
||
void uint_state_init(uint_state *self) { | ||
self->seen_sint = 0; | ||
self->seen_uint = 0; | ||
self->seen_null = 0; | ||
} | ||
|
||
int uint64_conflict(uint_state *self) { | ||
return self->seen_uint && (self->seen_sint || self->seen_null); | ||
} | ||
|
||
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, | ||
int *error, char tsep) { | ||
const char *p = (const char *)p_item; | ||
|
@@ -1876,3 +1886,88 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, | |
*error = 0; | ||
return number; | ||
} | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. maybe generate some code here too (for another PR) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, possible but not as nice because the implementations are definitely customized to the |
||
uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, | ||
uint64_t uint_max, int *error, char tsep) { | ||
const char *p = (const char *)p_item; | ||
uint64_t pre_max = uint_max / 10; | ||
int dig_pre_max = uint_max % 10; | ||
uint64_t number = 0; | ||
int d; | ||
|
||
// Skip leading spaces. | ||
while (isspace(*p)) { | ||
++p; | ||
} | ||
|
||
// Handle sign. | ||
if (*p == '-') { | ||
state->seen_sint = 1; | ||
*error = 0; | ||
return 0; | ||
} else if (*p == '+') { | ||
p++; | ||
} | ||
|
||
// Check that there is a first digit. | ||
if (!isdigit(*p)) { | ||
// Error... | ||
*error = ERROR_NO_DIGITS; | ||
return 0; | ||
} | ||
|
||
// If number is less than pre_max, at least one more digit | ||
// can be processed without overflowing. | ||
// | ||
// Process the digits. | ||
d = *p; | ||
if (tsep != '\0') { | ||
while (1) { | ||
if (d == tsep) { | ||
d = *++p; | ||
continue; | ||
} else if (!isdigit(d)) { | ||
break; | ||
} | ||
if ((number < pre_max) || | ||
((number == pre_max) && (d - '0' <= dig_pre_max))) { | ||
number = number * 10 + (d - '0'); | ||
d = *++p; | ||
|
||
} else { | ||
*error = ERROR_OVERFLOW; | ||
return 0; | ||
} | ||
} | ||
} else { | ||
while (isdigit(d)) { | ||
if ((number < pre_max) || | ||
((number == pre_max) && (d - '0' <= dig_pre_max))) { | ||
number = number * 10 + (d - '0'); | ||
d = *++p; | ||
|
||
} else { | ||
*error = ERROR_OVERFLOW; | ||
return 0; | ||
} | ||
} | ||
} | ||
|
||
// Skip trailing spaces. | ||
while (isspace(*p)) { | ||
++p; | ||
} | ||
|
||
// Did we use up all the characters? | ||
if (*p) { | ||
*error = ERROR_INVALID_CHARS; | ||
return 0; | ||
} | ||
|
||
if (number > int_max) { | ||
state->seen_uint = 1; | ||
} | ||
|
||
*error = 0; | ||
return number; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could also call this an enhancement that read_csv now supports reading uint64 values?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO this is a bug because
read_csv
should be able to handle all data types equally. In addition, both the issue and this PR have already been tagged as a bug.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It doesn't really matter whether it is a bug fix or not (or how we tag the PR), what I just wanted to say is that we can highlight it more by putting it in the enhancement section, if we think it is worth it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, fair enough. I just realized though: #14937 (see my changes to the
whatsnew
) might actually resolve this discussion?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yep, indeed, we can group all uint enhancement/bug fixes together
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Okay, so we can leave that for #14937 then. 😄