BUG: Parse uint64 in read_csv #15020


Merged 1 commit on Jan 2, 2017
23 changes: 23 additions & 0 deletions asv_bench/benchmarks/io_bench.py
@@ -128,6 +128,29 @@ def time_read_parse_dates_iso8601(self):
read_csv(StringIO(self.data), header=None, names=['foo'], parse_dates=['foo'])


class read_uint64_integers(object):
goal_time = 0.2

def setup(self):
self.na_values = [2**63 + 500]

self.arr1 = np.arange(10000).astype('uint64') + 2**63
self.data1 = '\n'.join(map(lambda x: str(x), self.arr1))

self.arr2 = self.arr1.copy().astype(object)
self.arr2[500] = -1
self.data2 = '\n'.join(map(lambda x: str(x), self.arr2))

def time_read_uint64(self):
read_csv(StringIO(self.data1), header=None)

def time_read_uint64_neg_values(self):
read_csv(StringIO(self.data2), header=None)

def time_read_uint64_na_values(self):
read_csv(StringIO(self.data1), header=None, na_values=self.na_values)


class write_csv_standard(object):
goal_time = 0.2

1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -288,6 +288,7 @@ Bug Fixes
- Bug in ``Index`` power operations with reversed operands (:issue:`14973`)
- Bug in ``TimedeltaIndex`` addition where overflow was being allowed without error (:issue:`14816`)
- Bug in ``DataFrame`` construction in which unsigned 64-bit integer elements were being converted to objects (:issue:`14881`)
- Bug in ``pd.read_csv()`` in which unsigned 64-bit integer elements were being converted to the wrong data types (:issue:`14983`)
Member:
We could also call this an enhancement, since read_csv now supports reading uint64 values?

Member Author:
IMO this is a bug because read_csv should be able to handle all data types equally. In addition, both the issue and this PR have already been tagged as a bug.

Member:
It doesn't really matter whether it is a bug fix or not (or how we tag the PR); what I wanted to say is that we can highlight it more by putting it in the enhancement section, if we think it is worth it.

Member Author:
Ah, fair enough. I just realized though: #14937 (see my changes to the whatsnew) might actually resolve this discussion?

Member:
yep, indeed, we can group all uint enhancement/bug fixes together

Member Author:
Okay, so we can leave that for #14937 then. 😄

- Bug in ``astype()`` where ``inf`` values were incorrectly converted to integers. Now raises error now with ``astype()`` for Series and DataFrames (:issue:`14265`)
- Bug in ``DataFrame(..).apply(to_numeric)`` when values are of type decimal.Decimal. (:issue:`14827`)
- Bug in ``describe()`` when passing a numpy array which does not contain the median to the ``percentiles`` keyword argument (:issue:`14908`)
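Illustration (not part of the diff): the whatsnew entry above can be demonstrated with a small round trip. This is a minimal sketch assuming a pandas build that includes this fix; before the fix, such values typically came back as object (strings).

from io import StringIO

import pandas as pd

# Values above the int64 maximum (2**63 - 1) but still within uint64 range.
data = '\n'.join(str(2**63 + i) for i in range(3))

df = pd.read_csv(StringIO(data), header=None)
print(df[0].dtype)  # expected: uint64
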
26 changes: 18 additions & 8 deletions pandas/io/tests/parser/common.py
@@ -921,29 +921,39 @@ def test_int64_overflow(self):
self.assertRaises(OverflowError, self.read_csv,
StringIO(data), converters={'ID': conv})

# These numbers fall right inside the int64-uint64 range,
# so they should be parsed as their respective numeric dtypes.
ui_max = np.iinfo(np.uint64).max
i_max = np.iinfo(np.int64).max
i_min = np.iinfo(np.int64).min

for x in [i_max, i_min, ui_max]:
result = self.read_csv(StringIO(str(x)), header=None)
expected = DataFrame([x])
tm.assert_frame_equal(result, expected)

# These numbers fall just outside the int64-uint64 range,
# so they should be parsed as string.
too_big = ui_max + 1
too_small = i_min - 1

for x in [too_big, too_small]:
result = self.read_csv(StringIO(str(x)), header=None)
expected = DataFrame([str(x)])
tm.assert_frame_equal(result, expected)

# No numerical dtype can hold both negative and uint64 values,
# so they should be cast as string.
data = '-1\n' + str(2**63)
expected = DataFrame([str(-1), str(2**63)])
result = self.read_csv(StringIO(data), header=None)
tm.assert_frame_equal(result, expected)

data = str(2**63) + '\n-1'
expected = DataFrame([str(2**63), str(-1)])
result = self.read_csv(StringIO(data), header=None)
tm.assert_frame_equal(result, expected)

def test_empty_with_nrows_chunksize(self):
# see gh-9535
expected = DataFrame([], columns=['foo', 'bar'])
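Illustration (not part of the diff): as the new test comments note, no numeric dtype can hold both negative values and values beyond the int64 maximum, so such a column falls back to strings. A minimal sketch, assuming the fix is applied:

from io import StringIO

import pandas as pd

# One negative value plus one value that only uint64 could hold.
data = '-1\n' + str(2**63)

df = pd.read_csv(StringIO(data), header=None)
# Neither int64 nor uint64 fits both rows, so the values stay as strings (object dtype).
print(df[0].tolist())  # expected: ['-1', '9223372036854775808']
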
8 changes: 8 additions & 0 deletions pandas/io/tests/parser/dtypes.py
@@ -275,3 +275,11 @@ def test_empty_dtype(self):
result = self.read_csv(StringIO(data), header=0,
dtype={'a': np.int32, 1: np.float64})
tm.assert_frame_equal(result, expected)

def test_numeric_dtype(self):
data = '0\n1'

for dt in np.typecodes['AllInteger'] + np.typecodes['Float']:
expected = pd.DataFrame([0, 1], dtype=dt)
result = self.read_csv(StringIO(data), header=None, dtype=dt)
tm.assert_frame_equal(expected, result)
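
Illustration (not part of the diff): test_numeric_dtype above loops over all integer and float typecodes; in particular, explicitly requesting an unsigned 64-bit dtype should now round-trip. A minimal sketch assuming the fix is applied:

from io import StringIO

import numpy as np
import pandas as pd

result = pd.read_csv(StringIO('0\n1'), header=None, dtype=np.uint64)
print(result[0].dtype)  # expected: uint64
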
14 changes: 14 additions & 0 deletions pandas/io/tests/parser/na_values.py
@@ -289,3 +289,17 @@ def test_na_values_dict_col_index(self):
out = self.read_csv(StringIO(data), na_values=na_values)
expected = DataFrame({'a': [np.nan, 1]})
tm.assert_frame_equal(out, expected)

def test_na_values_uint64(self):
# see gh-14983

na_values = [2**63]
data = str(2**63) + '\n' + str(2**63 + 1)
expected = DataFrame([str(2**63), str(2**63 + 1)])
out = self.read_csv(StringIO(data), header=None, na_values=na_values)
tm.assert_frame_equal(out, expected)

data = str(2**63) + ',1' + '\n,2'
expected = DataFrame([[str(2**63), 1], ['', 2]])
out = self.read_csv(StringIO(data), header=None)
tm.assert_frame_equal(out, expected)
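
Illustration (not part of the diff): test_na_values_uint64 above pins down the interaction of NA sentinels with uint64 data; once a potential null is flagged in a column that would otherwise need uint64, no numeric dtype can represent both, so the column is kept as strings. A sketch of that expectation:

from io import StringIO

import pandas as pd

data = str(2**63) + '\n' + str(2**63 + 1)
out = pd.read_csv(StringIO(data), header=None, na_values=[2**63])
# The column cannot be numeric (uint64 has no NaN slot), so values remain strings.
print(out[0].tolist())  # expected: ['9223372036854775808', '9223372036854775809']
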
108 changes: 101 additions & 7 deletions pandas/parser.pyx
@@ -193,6 +193,14 @@ cdef extern from "parser/tokenizer.h":
int *line_start
int col

ctypedef struct uint_state:
int seen_sint
int seen_uint
int seen_null

void uint_state_init(uint_state *self)
int uint64_conflict(uint_state *self)

void coliter_setup(coliter_t *it, parser_t *parser, int i, int start) nogil
void COLITER_NEXT(coliter_t, const char *) nogil

@@ -217,7 +225,8 @@ cdef extern from "parser/tokenizer.h":

int64_t str_to_int64(char *p_item, int64_t int_min,
int64_t int_max, int *error, char tsep) nogil
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) nogil

double xstrtod(const char *p, char **q, char decimal, char sci,
char tsep, int skip_trailing) nogil
@@ -1127,6 +1136,14 @@ cdef class TextReader:
try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_flist)
except ValueError:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype('object'), i, start, end, 0,
0, na_hashset, na_flist)
except OverflowError:
col_res, na_count = self._convert_with_dtype(
np.dtype('object'), i, start, end, na_filter,
@@ -1164,12 +1181,17 @@ cdef class TextReader:
kh_str_t *na_hashset,
object na_flist):
if is_integer_dtype(dtype):
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError("Integer column has NA values in "
"column {column}".format(column=i))
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
na_count = 0

if result is not None and dtype != 'int64':
result = result.astype(dtype)
@@ -1750,6 +1772,78 @@ cdef inline int _try_double_nogil(parser_t *parser, int col,

return 0

cdef _try_uint64(parser_t *parser, int col, int line_start, int line_end,
bint na_filter, kh_str_t *na_hashset):
Contributor:
Probably for another PR, but we should generate these try_* routines with tempita (if possible).

Member Author:
It's possible, I imagine, but not as nice, since the implementations are definitely customized to the dtype (e.g. note how I did not pass in any NA value for uint64, but the int64 implementation does).

Contributor:
ok, if it seems worth it

cdef:
int error
size_t i, lines
coliter_t it
uint64_t *data
ndarray result
khiter_t k
uint_state state

lines = line_end - line_start
result = np.empty(lines, dtype=np.uint64)
data = <uint64_t *> result.data

uint_state_init(&state)
coliter_setup(&it, parser, col, line_start)
with nogil:
error = _try_uint64_nogil(parser, col, line_start, line_end,
na_filter, na_hashset, data, &state)
if error != 0:
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError('Overflow')
return None

if uint64_conflict(&state):
raise ValueError('Cannot convert to numerical dtype')

if state.seen_sint:
raise OverflowError('Overflow')

return result

cdef inline int _try_uint64_nogil(parser_t *parser, int col, int line_start,
int line_end, bint na_filter,
const kh_str_t *na_hashset,
uint64_t *data, uint_state *state) nogil:
cdef:
int error
size_t i
size_t lines = line_end - line_start
coliter_t it
const char *word = NULL
khiter_t k

coliter_setup(&it, parser, col, line_start)

if na_filter:
for i in range(lines):
COLITER_NEXT(it, word)
k = kh_get_str(na_hashset, word)
# in the hash table
if k != na_hashset.n_buckets:
state.seen_null = 1
data[i] = 0
continue

data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
&error, parser.thousands)
if error != 0:
return error
else:
for i in range(lines):
COLITER_NEXT(it, word)
data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
&error, parser.thousands)
if error != 0:
return error

return 0

cdef _try_int64(parser_t *parser, int col, int line_start, int line_end,
bint na_filter, kh_str_t *na_hashset):
cdef:
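Illustration (not part of the diff): the changes to TextReader and _try_uint64 above amount to a three-step cascade when no dtype is specified: try int64, fall back to uint64 on overflow, and fall back to object (strings) when uint64 cannot represent the column either. A simplified pure-Python sketch of that decision logic; infer_integer_column is a hypothetical helper, not the Cython implementation, and it ignores NA handling and thousands separators.

import numpy as np

def infer_integer_column(tokens):
    # Mimic the int64 -> uint64 -> object cascade (simplified).
    values = [int(tok) for tok in tokens]

    i64 = np.iinfo(np.int64)
    u64 = np.iinfo(np.uint64)

    if all(i64.min <= v <= i64.max for v in values):
        return np.array(values, dtype=np.int64)

    # uint64 works only if nothing is negative and nothing overflows uint64.
    if all(0 <= v <= u64.max for v in values):
        return np.array(values, dtype=np.uint64)

    # Mixed negative/uint64 values, or values out of range: keep as strings.
    return np.array([str(v) for v in values], dtype=object)

print(infer_integer_column(['1', '2']).dtype)           # int64
print(infer_integer_column(['1', str(2**63)]).dtype)    # uint64
print(infer_integer_column(['-1', str(2**63)]).dtype)   # object
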
95 changes: 95 additions & 0 deletions pandas/src/parser/tokenizer.c
@@ -1757,6 +1757,16 @@ double round_trip(const char *p, char **q, char decimal, char sci, char tsep,
// End of xstrtod code
// ---------------------------------------------------------------------------

void uint_state_init(uint_state *self) {
self->seen_sint = 0;
self->seen_uint = 0;
self->seen_null = 0;
}

int uint64_conflict(uint_state *self) {
return self->seen_uint && (self->seen_sint || self->seen_null);
}

int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep) {
const char *p = (const char *)p_item;
@@ -1876,3 +1886,88 @@ int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
*error = 0;
return number;
}

Contributor:
maybe generate some code here too (for another PR)

Member Author:
Again, possible but not as nice because the implementations are definitely customized to the dtype (e.g. compare the difference in handling for "negative numbers" for int64 and uint64).

uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) {
const char *p = (const char *)p_item;
uint64_t pre_max = uint_max / 10;
int dig_pre_max = uint_max % 10;
uint64_t number = 0;
int d;

// Skip leading spaces.
while (isspace(*p)) {
++p;
}

// Handle sign.
if (*p == '-') {
state->seen_sint = 1;
*error = 0;
return 0;
} else if (*p == '+') {
p++;
}

// Check that there is a first digit.
if (!isdigit(*p)) {
// Error...
*error = ERROR_NO_DIGITS;
return 0;
}

// If number is less than pre_max, at least one more digit
// can be processed without overflowing.
//
// Process the digits.
d = *p;
if (tsep != '\0') {
while (1) {
if (d == tsep) {
d = *++p;
continue;
} else if (!isdigit(d)) {
break;
}
if ((number < pre_max) ||
((number == pre_max) && (d - '0' <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;

} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
} else {
while (isdigit(d)) {
if ((number < pre_max) ||
((number == pre_max) && (d - '0' <= dig_pre_max))) {
number = number * 10 + (d - '0');
d = *++p;

} else {
*error = ERROR_OVERFLOW;
return 0;
}
}
}

// Skip trailing spaces.
while (isspace(*p)) {
++p;
}

// Did we use up all the characters?
if (*p) {
*error = ERROR_INVALID_CHARS;
return 0;
}

if (number > int_max) {
state->seen_uint = 1;
}

*error = 0;
return number;
}
13 changes: 12 additions & 1 deletion pandas/src/parser/tokenizer.h
@@ -25,7 +25,6 @@ See LICENSE for the license
#define ERROR_NO_DIGITS 1
#define ERROR_OVERFLOW 2
#define ERROR_INVALID_CHARS 3

#include "../headers/stdint.h"

@@ -250,6 +249,18 @@ int tokenize_all_rows(parser_t *self);
// Have parsed / type-converted a chunk of data
// and want to free memory from the token stream

typedef struct uint_state {
int seen_sint;
int seen_uint;
int seen_null;
} uint_state;

void uint_state_init(uint_state *self);

int uint64_conflict(uint_state *self);

uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep);
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep);
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,