From 995da78f9cbc0afd7f4271d822d93e0d31b3bd68 Mon Sep 17 00:00:00 2001 From: Vasilij N Litvinov Date: Thu, 7 Feb 2019 15:58:23 +0300 Subject: [PATCH 01/11] Increase hash table size to reduce hash collisions on NA_VALUES lookup --- pandas/_libs/parsers.pyx | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 18959b2d37b7f..23cb4b7c00c67 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -39,9 +39,9 @@ import pandas._libs.lib as lib from pandas._libs.khash cimport ( khiter_t, kh_str_t, kh_init_str, kh_put_str, kh_exist_str, - kh_get_str, kh_destroy_str, + kh_get_str, kh_destroy_str, kh_resize_str, kh_float64_t, kh_get_float64, kh_destroy_float64, - kh_put_float64, kh_init_float64, + kh_put_float64, kh_init_float64, kh_resize_float64, kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox, kh_destroy_strbox) @@ -2106,6 +2106,13 @@ cdef kh_str_t* kset_from_list(list values) except NULL: k = kh_put_str(table, PyBytes_AsString(val), &ret) + if table.n_buckets <= 128: + # Resize the hash table to make it almost empty, this + # reduces amount of hash collisions on lookup thus + # "key not in table" case is faster. + # Note that this trades table memory footprint for lookup speed. + kh_resize_str(table, table.n_buckets * 8) + return table @@ -2126,6 +2133,9 @@ cdef kh_float64_t* kset_float64_from_list(values) except NULL: k = kh_put_float64(table, val, &ret) + if table.n_buckets <= 128: + # See reasoning in kset_from_list + kh_resize_float64(table, table.n_buckets * 8) return table From 18350511161c27bd4ee30fd47bc464183cd0fb05 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 25 Feb 2019 16:30:45 +0300 Subject: [PATCH 02/11] changed macro errno to _error variable --- pandas/_libs/parsers.pyx | 29 ++++++++++------------------- pandas/_libs/src/parser/tokenizer.c | 20 ++++++++++---------- pandas/_libs/src/parser/tokenizer.h | 4 ++-- 3 files changed, 22 insertions(+), 31 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 23cb4b7c00c67..ef2c030d9cc00 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -71,9 +71,6 @@ cdef: float64_t NEGINF = -INF -cdef extern from "errno.h": - int errno - cdef extern from "headers/portable.h": # I *think* this is here so that strcasecmp is defined on Windows # so we don't get @@ -186,7 +183,7 @@ cdef extern from "parser/tokenizer.h": int64_t skipfooter # pick one, depending on whether the converter requires GIL float64_t (*double_converter_nogil)(const char *, char **, - char, char, char, int) nogil + char, char, char, int, int *) nogil float64_t (*double_converter_withgil)(const char *, char **, char, char, char, int) @@ -237,9 +234,9 @@ cdef extern from "parser/tokenizer.h": uint64_t uint_max, int *error, char tsep) nogil float64_t xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing) nogil + char tsep, int skip_trailing, int *_error) nogil float64_t precise_xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing) nogil + char tsep, int skip_trailing, int *_error) nogil float64_t round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing) nogil @@ -1761,7 +1758,7 @@ cdef _try_double(parser_t *parser, int64_t col, assert parser.double_converter_withgil != NULL error = _try_double_nogil(parser, parser.double_converter_withgil, col, line_start, line_end, na_filter, na_hashset, use_na_flist, @@ -1775,7 +1772,7 @@ cdef _try_double(parser_t *parser, int64_t col, cdef inline int _try_double_nogil(parser_t *parser, float64_t (*double_converter)( const char *, char **, char, - char, char, int) nogil, + char, char, int, int *) nogil, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset, bint use_na_flist, @@ -1783,15 +1780,13 @@ cdef inline int _try_double_nogil(parser_t *parser, float64_t NA, float64_t *data, int *na_count) nogil: cdef: - int error, + int _error, Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL char *p_end khiter_t k, k64 - global errno - na_count[0] = 0 coliter_setup(&it, parser, col, line_start) @@ -1806,16 +1801,14 @@ cdef inline int _try_double_nogil(parser_t *parser, data[0] = NA else: data[0] = double_converter(word, &p_end, parser.decimal, - parser.sci, parser.thousands, 1) - if errno != 0 or p_end[0] or p_end == word: + parser.sci, parser.thousands, 1, &_error) + if _error != 0 or p_end == word or p_end[0]: if (strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF else: - # Just return a non-zero value since - # the errno is never consumed. return 1 if use_na_flist: k64 = kh_get_float64(na_flist, data[0]) @@ -1827,16 +1820,14 @@ cdef inline int _try_double_nogil(parser_t *parser, for i in range(lines): COLITER_NEXT(it, word) data[0] = double_converter(word, &p_end, parser.decimal, - parser.sci, parser.thousands, 1) - if errno != 0 or p_end[0] or p_end == word: + parser.sci, parser.thousands, 1, &_error) + if _error != 0 or p_end == word or p_end[0]: if (strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0): data[0] = INF elif strcasecmp(word, cneginf) == 0: data[0] = NEGINF else: - # Just return a non-zero value since - # the errno is never consumed. return 1 data += 1 diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 1117e75aa2583..8f44cfaede5ff 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1544,7 +1544,7 @@ int main(int argc, char *argv[]) { const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; double xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing) { + char tsep, int skip_trailing, int *_error) { double number; unsigned int i_number = 0; int exponent; @@ -1555,7 +1555,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, int num_digits; int num_decimals; - errno = 0; + *_error = 0; // Skip leading whitespace. while (isspace_ascii(*p)) p++; @@ -1609,7 +1609,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } if (num_digits == 0) { - errno = ERANGE; + *_error = ERANGE; return 0.0; } @@ -1646,7 +1646,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { - errno = ERANGE; + *_error = ERANGE; return HUGE_VAL; } @@ -1666,7 +1666,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } if (number == HUGE_VAL) { - errno = ERANGE; + *_error = ERANGE; } if (skip_trailing) { @@ -1680,7 +1680,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing) { + char tsep, int skip_trailing, int *_error) { double number; int exponent; int negative; @@ -1722,7 +1722,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; - errno = 0; + *_error = 0; // Skip leading whitespace. while (isspace_ascii(*p)) p++; @@ -1772,7 +1772,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } if (num_digits == 0) { - errno = ERANGE; + *_error = ERANGE; return 0.0; } @@ -1809,7 +1809,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } if (exponent > 308) { - errno = ERANGE; + *_error = ERANGE; return HUGE_VAL; } else if (exponent > 0) { number *= e[exponent]; @@ -1822,7 +1822,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, number /= e[-exponent]; } - if (number == HUGE_VAL || number == -HUGE_VAL) errno = ERANGE; + if (number == HUGE_VAL || number == -HUGE_VAL) *_error = ERANGE; if (skip_trailing) { // Skip trailing whitespace. diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 7a0c8b536d122..640b4f7bb8179 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -260,9 +260,9 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing); + int skip_trailing, int *_error); double precise_xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing); + char tsep, int skip_trailing, int *_error); double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); int to_boolean(const char *item, uint8_t *val); From 7e144b94fe6958133f64fe54bbf6add2585fd5b6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Wed, 27 Feb 2019 19:54:58 +0300 Subject: [PATCH 03/11] speed up kh_get_str by lookup table --- pandas/_libs/khash.pxd | 10 +++ pandas/_libs/parsers.pyx | 120 ++++++++++++--------------- pandas/_libs/src/klib/khash_python.h | 35 ++++++++ 3 files changed, 100 insertions(+), 65 deletions(-) diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 971a45e365586..c02e57f502721 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -56,6 +56,16 @@ cdef extern from "khash_python.h": bint kh_exist_str(kh_str_t*, khiter_t) nogil + ctypedef struct kh_str_starts_t: + kh_str_t *table + char starts[256] + + kh_str_starts_t* kh_init_str_starts() nogil + khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) nogil + khint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) nogil + void kh_destroy_str_starts(kh_str_starts_t*) nogil + void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil + ctypedef struct kh_int64_t: khint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index ef2c030d9cc00..6f491ceb66365 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -43,7 +43,8 @@ from pandas._libs.khash cimport ( kh_float64_t, kh_get_float64, kh_destroy_float64, kh_put_float64, kh_init_float64, kh_resize_float64, kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox, - kh_destroy_strbox) + kh_destroy_strbox, + kh_str_starts_t, kh_put_str_starts_item, kh_init_str_starts, kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts) import pandas.compat as compat from pandas.core.dtypes.common import ( @@ -282,8 +283,8 @@ cdef class TextReader: int64_t parser_start list clocks char *c_encoding - kh_str_t *false_set - kh_str_t *true_set + kh_str_starts_t *false_set + kh_str_starts_t *true_set cdef public: int64_t leading_cols, table_width, skipfooter, buffer_lines @@ -554,10 +555,10 @@ cdef class TextReader: def __dealloc__(self): parser_free(self.parser) if self.true_set: - kh_destroy_str(self.true_set) + kh_destroy_str_starts(self.true_set) self.true_set = NULL if self.false_set: - kh_destroy_str(self.false_set) + kh_destroy_str_starts(self.false_set) self.false_set = NULL parser_del(self.parser) @@ -572,10 +573,10 @@ cdef class TextReader: # also preemptively free all allocated memory parser_free(self.parser) if self.true_set: - kh_destroy_str(self.true_set) + kh_destroy_str_starts(self.true_set) self.true_set = NULL if self.false_set: - kh_destroy_str(self.false_set) + kh_destroy_str_starts(self.false_set) self.false_set = NULL def set_error_bad_lines(self, int status): @@ -1017,7 +1018,7 @@ cdef class TextReader: cdef: int64_t i int nused - kh_str_t *na_hashset = NULL + kh_str_starts_t *na_hashset = NULL int64_t start, end object name, na_flist, col_dtype = None bint na_filter = 0 @@ -1141,7 +1142,7 @@ cdef class TextReader: cdef inline _convert_tokens(self, Py_ssize_t i, int start, int end, object name, bint na_filter, - kh_str_t *na_hashset, + kh_str_starts_t *na_hashset, object na_flist, object col_dtype): if col_dtype is not None: @@ -1204,7 +1205,7 @@ cdef class TextReader: int64_t start, int64_t end, bint na_filter, bint user_dtype, - kh_str_t *na_hashset, + kh_str_starts_t *na_hashset, object na_flist): if is_categorical_dtype(dtype): # TODO: I suspect that _categorical_convert could be @@ -1301,7 +1302,7 @@ cdef class TextReader: "supported for parsing".format(dtype=dtype)) cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end, - bint na_filter, kh_str_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset): cdef StringPath path = _string_path(self.c_encoding) @@ -1360,8 +1361,8 @@ cdef class TextReader: return _ensure_encoded(self.na_values), self.na_fvalues - cdef _free_na_set(self, kh_str_t *table): - kh_destroy_str(table) + cdef _free_na_set(self, kh_str_starts_t *table): + kh_destroy_str_starts(table) cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused): cdef int64_t j @@ -1451,7 +1452,7 @@ cdef inline StringPath _string_path(char *encoding): cdef _string_box_factorize(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset): cdef: int error, na_count = 0 Py_ssize_t i, lines @@ -1476,9 +1477,8 @@ cdef _string_box_factorize(parser_t *parser, int64_t col, COLITER_NEXT(it, word) if na_filter: - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: + if kh_get_str_starts_item(na_hashset, word): + # in the hash table na_count += 1 result[i] = NA continue @@ -1505,7 +1505,7 @@ cdef _string_box_factorize(parser_t *parser, int64_t col, cdef _string_box_utf8(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset): cdef: int error, na_count = 0 Py_ssize_t i, lines @@ -1530,9 +1530,8 @@ cdef _string_box_utf8(parser_t *parser, int64_t col, COLITER_NEXT(it, word) if na_filter: - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: + if kh_get_str_starts_item(na_hashset, word): + # in the hash table na_count += 1 result[i] = NA continue @@ -1559,7 +1558,7 @@ cdef _string_box_utf8(parser_t *parser, int64_t col, cdef _string_box_decode(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_t *na_hashset, + bint na_filter, kh_str_starts_t *na_hashset, char *encoding): cdef: int error, na_count = 0 @@ -1587,9 +1586,8 @@ cdef _string_box_decode(parser_t *parser, int64_t col, COLITER_NEXT(it, word) if na_filter: - k = kh_get_str(na_hashset, word) + if kh_get_str_starts_item(na_hashset, word): # in the hash table - if k != na_hashset.n_buckets: na_count += 1 result[i] = NA continue @@ -1618,7 +1616,7 @@ cdef _string_box_decode(parser_t *parser, int64_t col, @cython.boundscheck(False) cdef _categorical_convert(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_t *na_hashset, + bint na_filter, kh_str_starts_t *na_hashset, char *encoding): "Convert column data into codes, categories" cdef: @@ -1651,9 +1649,8 @@ cdef _categorical_convert(parser_t *parser, int64_t col, COLITER_NEXT(it, word) if na_filter: - k = kh_get_str(na_hashset, word) + if kh_get_str_starts_item(na_hashset, word): # is in NA values - if k != na_hashset.n_buckets: na_count += 1 codes[i] = NA continue @@ -1730,7 +1727,7 @@ cdef: cdef _try_double(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_t *na_hashset, object na_flist): + bint na_filter, kh_str_starts_t *na_hashset, object na_flist): cdef: int error, na_count = 0 Py_ssize_t i, lines @@ -1774,7 +1771,7 @@ cdef inline int _try_double_nogil(parser_t *parser, const char *, char **, char, char, char, int, int *) nogil, int col, int line_start, int line_end, - bint na_filter, kh_str_t *na_hashset, + bint na_filter, kh_str_starts_t *na_hashset, bint use_na_flist, const kh_float64_t *na_flist, float64_t NA, float64_t *data, @@ -1794,9 +1791,8 @@ cdef inline int _try_double_nogil(parser_t *parser, for i in range(lines): COLITER_NEXT(it, word) - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: + if kh_get_str_starts_item(na_hashset, word): + # in the hash table na_count[0] += 1 data[0] = NA else: @@ -1836,7 +1832,7 @@ cdef inline int _try_double_nogil(parser_t *parser, cdef _try_uint64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset): cdef: int error Py_ssize_t i, lines @@ -1873,7 +1869,7 @@ cdef _try_uint64(parser_t *parser, int64_t col, cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, - const kh_str_t *na_hashset, + const kh_str_starts_t *na_hashset, uint64_t *data, uint_state *state) nogil: cdef: int error @@ -1887,9 +1883,8 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, if na_filter: for i in range(lines): COLITER_NEXT(it, word) - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: + if kh_get_str_starts_item(na_hashset, word): + # in the hash table state.seen_null = 1 data[i] = 0 continue @@ -1911,7 +1906,7 @@ cdef inline int _try_uint64_nogil(parser_t *parser, int64_t col, cdef _try_int64(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, kh_str_t *na_hashset): + bint na_filter, kh_str_starts_t *na_hashset): cdef: int error, na_count = 0 Py_ssize_t i, lines @@ -1941,7 +1936,7 @@ cdef _try_int64(parser_t *parser, int64_t col, cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, - const kh_str_t *na_hashset, int64_t NA, + const kh_str_starts_t *na_hashset, int64_t NA, int64_t *data, int *na_count) nogil: cdef: int error @@ -1956,9 +1951,8 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, if na_filter: for i in range(lines): COLITER_NEXT(it, word) - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: + if kh_get_str_starts_item(na_hashset, word): + # in the hash table na_count[0] += 1 data[i] = NA continue @@ -1980,9 +1974,9 @@ cdef inline int _try_int64_nogil(parser_t *parser, int64_t col, cdef _try_bool_flex(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, - bint na_filter, const kh_str_t *na_hashset, - const kh_str_t *true_hashset, - const kh_str_t *false_hashset): + bint na_filter, const kh_str_starts_t *na_hashset, + const kh_str_starts_t *true_hashset, + const kh_str_starts_t *false_hashset): cdef: int error, na_count = 0 Py_ssize_t i, lines @@ -2009,9 +2003,9 @@ cdef _try_bool_flex(parser_t *parser, int64_t col, cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, int64_t line_start, int64_t line_end, bint na_filter, - const kh_str_t *na_hashset, - const kh_str_t *true_hashset, - const kh_str_t *false_hashset, + const kh_str_starts_t *na_hashset, + const kh_str_starts_t *true_hashset, + const kh_str_starts_t *false_hashset, uint8_t NA, uint8_t *data, int *na_count) nogil: cdef: @@ -2028,21 +2022,18 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, for i in range(lines): COLITER_NEXT(it, word) - k = kh_get_str(na_hashset, word) - # in the hash table - if k != na_hashset.n_buckets: + if kh_get_str_starts_item(na_hashset, word): + # in the hash table na_count[0] += 1 data[0] = NA data += 1 continue - k = kh_get_str(true_hashset, word) - if k != true_hashset.n_buckets: + if kh_get_str_starts_item(true_hashset, word): data[0] = 1 data += 1 continue - k = kh_get_str(false_hashset, word) - if k != false_hashset.n_buckets: + if kh_get_str_starts_item(false_hashset, word): data[0] = 0 data += 1 continue @@ -2055,14 +2046,12 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, for i in range(lines): COLITER_NEXT(it, word) - k = kh_get_str(true_hashset, word) - if k != true_hashset.n_buckets: + if kh_get_str_starts_item(true_hashset, word): data[0] = 1 data += 1 continue - k = kh_get_str(false_hashset, word) - if k != false_hashset.n_buckets: + if kh_get_str_starts_item(false_hashset, word): data[0] = 0 data += 1 continue @@ -2075,17 +2064,18 @@ cdef inline int _try_bool_flex_nogil(parser_t *parser, int64_t col, return 0 -cdef kh_str_t* kset_from_list(list values) except NULL: +cdef kh_str_starts_t* kset_from_list(list values) except NULL: # caller takes responsibility for freeing the hash table cdef: Py_ssize_t i khiter_t k - kh_str_t *table + kh_str_starts_t *table int ret = 0 object val - table = kh_init_str() + table = kh_init_str_starts() + for i in range(len(values)): val = values[i] @@ -2095,14 +2085,14 @@ cdef kh_str_t* kset_from_list(list values) except NULL: kh_destroy_str(table) raise ValueError('Must be all encoded bytes') - k = kh_put_str(table, PyBytes_AsString(val), &ret) + kh_put_str_starts_item(table, PyBytes_AsString(val), &ret) - if table.n_buckets <= 128: + if table.table.n_buckets <= 128: # Resize the hash table to make it almost empty, this # reduces amount of hash collisions on lookup thus # "key not in table" case is faster. # Note that this trades table memory footprint for lookup speed. - kh_resize_str(table, table.n_buckets * 8) + kh_resize_str_starts(table, table.table.n_buckets * 8) return table diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 45a93051f78d3..71e457f3192ce 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -84,3 +84,38 @@ KHASH_SET_INIT_PYOBJECT(pyset) #define kh_exist_pyset(h, k) (kh_exist(h, k)) KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) + +typedef struct { + kh_str_t *table; + char starts[256]; +} kh_str_starts_t; + +inline static kh_str_starts_t* kh_init_str_starts(void) { + kh_str_starts_t *result = (kh_str_starts_t*)calloc(1, sizeof(kh_str_starts_t)); + result->table = kh_init_str(); + return result; +} + +inline static khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) { + khint_t result = kh_put_str(table->table, key, ret); + if (*ret != 0) { + table->starts[key[0]] = 1; + } + return result; +} + +inline static khint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) { + char ch = *key; + if (table->starts[ch]) { + if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1; + } + return 0; +} + +inline static void kh_destroy_str_starts(kh_str_starts_t* table) {//FIXME + kh_destroy_str(table->table); +} + +inline static void kh_resize_str_starts(kh_str_starts_t* table, khint_t val) { + kh_resize_str(table->table, val); +} \ No newline at end of file From 86031b73d5196e428802129eb1841408500e4a27 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Thu, 28 Feb 2019 14:35:10 +0300 Subject: [PATCH 04/11] fix memory leak, try to speed up xstrtod --- pandas/_libs/khash.pxd | 2 +- pandas/_libs/parsers.pyx | 16 +++++++++------- pandas/_libs/src/klib/khash_python.h | 7 ++++--- pandas/_libs/src/parser/tokenizer.c | 18 ++++++++---------- pandas/_libs/src/parser/tokenizer.h | 4 ++-- 5 files changed, 24 insertions(+), 23 deletions(-) diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index c02e57f502721..47cda6957421c 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -58,7 +58,7 @@ cdef extern from "khash_python.h": ctypedef struct kh_str_starts_t: kh_str_t *table - char starts[256] + int starts[256] kh_str_starts_t* kh_init_str_starts() nogil khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) nogil diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 6f491ceb66365..fb42921674fcb 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -235,9 +235,9 @@ cdef extern from "parser/tokenizer.h": uint64_t uint_max, int *error, char tsep) nogil float64_t xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing, int *_error) nogil + char tsep, int skip_trailing, int *error) nogil float64_t precise_xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing, int *_error) nogil + char tsep, int skip_trailing, int *error) nogil float64_t round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing) nogil @@ -1777,7 +1777,7 @@ cdef inline int _try_double_nogil(parser_t *parser, float64_t NA, float64_t *data, int *na_count) nogil: cdef: - int _error, + int error = 0, Py_ssize_t i, lines = line_end - line_start coliter_t it const char *word = NULL @@ -1797,8 +1797,9 @@ cdef inline int _try_double_nogil(parser_t *parser, data[0] = NA else: data[0] = double_converter(word, &p_end, parser.decimal, - parser.sci, parser.thousands, 1, &_error) - if _error != 0 or p_end == word or p_end[0]: + parser.sci, parser.thousands, 1, &error) + if error != 0 or p_end == word or p_end[0]: + error = 0 if (strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0): data[0] = INF @@ -1816,8 +1817,9 @@ cdef inline int _try_double_nogil(parser_t *parser, for i in range(lines): COLITER_NEXT(it, word) data[0] = double_converter(word, &p_end, parser.decimal, - parser.sci, parser.thousands, 1, &_error) - if _error != 0 or p_end == word or p_end[0]: + parser.sci, parser.thousands, 1, &error) + if error != 0 or p_end == word or p_end[0]: + error = 0 if (strcasecmp(word, cinf) == 0 or strcasecmp(word, cposinf) == 0): data[0] = INF diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 71e457f3192ce..37e8534e23a9d 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -87,7 +87,7 @@ KHASH_MAP_INIT_STR(strbox, kh_pyobject_t) typedef struct { kh_str_t *table; - char starts[256]; + int starts[256]; } kh_str_starts_t; inline static kh_str_starts_t* kh_init_str_starts(void) { @@ -105,15 +105,16 @@ inline static khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, } inline static khint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) { - char ch = *key; + int ch = *key; if (table->starts[ch]) { if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1; } return 0; } -inline static void kh_destroy_str_starts(kh_str_starts_t* table) {//FIXME +inline static void kh_destroy_str_starts(kh_str_starts_t* table) { kh_destroy_str(table->table); + free(table); } inline static void kh_resize_str_starts(kh_str_starts_t* table, khint_t val) { diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 8f44cfaede5ff..712e12829a937 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -1544,7 +1544,7 @@ int main(int argc, char *argv[]) { const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4; double xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing, int *_error) { + char tsep, int skip_trailing, int *error) { double number; unsigned int i_number = 0; int exponent; @@ -1555,7 +1555,6 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, int num_digits; int num_decimals; - *_error = 0; // Skip leading whitespace. while (isspace_ascii(*p)) p++; @@ -1609,7 +1608,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } if (num_digits == 0) { - *_error = ERANGE; + *error = ERANGE; return 0.0; } @@ -1646,7 +1645,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } if (exponent < DBL_MIN_EXP || exponent > DBL_MAX_EXP) { - *_error = ERANGE; + *error = ERANGE; return HUGE_VAL; } @@ -1666,7 +1665,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } if (number == HUGE_VAL) { - *_error = ERANGE; + *error = ERANGE; } if (skip_trailing) { @@ -1680,7 +1679,7 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci, } double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, - char tsep, int skip_trailing, int *_error) { + char tsep, int skip_trailing, int *error) { double number; int exponent; int negative; @@ -1722,7 +1721,6 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; - *_error = 0; // Skip leading whitespace. while (isspace_ascii(*p)) p++; @@ -1772,7 +1770,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } if (num_digits == 0) { - *_error = ERANGE; + *error = ERANGE; return 0.0; } @@ -1809,7 +1807,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, } if (exponent > 308) { - *_error = ERANGE; + *error = ERANGE; return HUGE_VAL; } else if (exponent > 0) { number *= e[exponent]; @@ -1822,7 +1820,7 @@ double precise_xstrtod(const char *str, char **endptr, char decimal, char sci, number /= e[-exponent]; } - if (number == HUGE_VAL || number == -HUGE_VAL) *_error = ERANGE; + if (number == HUGE_VAL || number == -HUGE_VAL) *error = ERANGE; if (skip_trailing) { // Skip trailing whitespace. diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index 640b4f7bb8179..70d7e4a3aff40 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -260,9 +260,9 @@ uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max, int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max, int *error, char tsep); double xstrtod(const char *p, char **q, char decimal, char sci, char tsep, - int skip_trailing, int *_error); + int skip_trailing, int *error); double precise_xstrtod(const char *p, char **q, char decimal, char sci, - char tsep, int skip_trailing, int *_error); + char tsep, int skip_trailing, int *error); double round_trip(const char *p, char **q, char decimal, char sci, char tsep, int skip_trailing); int to_boolean(const char *item, uint8_t *val); From 391d0a7f2d01ae834586f51a02cca67b474bf5b6 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 4 Mar 2019 13:07:42 +0300 Subject: [PATCH 05/11] more clean code in khash.pxd, parsers.pyx --- pandas/_libs/khash.pxd | 3 ++- pandas/_libs/parsers.pyx | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 47cda6957421c..7300a99141578 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -61,7 +61,8 @@ cdef extern from "khash_python.h": int starts[256] kh_str_starts_t* kh_init_str_starts() nogil - khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) nogil + khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, + int* ret) nogil khint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) nogil void kh_destroy_str_starts(kh_str_starts_t*) nogil void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index fb42921674fcb..91252c549a923 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -44,7 +44,8 @@ from pandas._libs.khash cimport ( kh_put_float64, kh_init_float64, kh_resize_float64, kh_strbox_t, kh_put_strbox, kh_get_strbox, kh_init_strbox, kh_destroy_strbox, - kh_str_starts_t, kh_put_str_starts_item, kh_init_str_starts, kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts) + kh_str_starts_t, kh_put_str_starts_item, kh_init_str_starts, + kh_get_str_starts_item, kh_destroy_str_starts, kh_resize_str_starts) import pandas.compat as compat from pandas.core.dtypes.common import ( From 143deb1344f6a933dbaa47360a2078c4f5adc042 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 5 Mar 2019 13:26:16 +0300 Subject: [PATCH 06/11] fix indents --- pandas/_libs/khash.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index 7300a99141578..c52dacd37f955 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -62,7 +62,7 @@ cdef extern from "khash_python.h": kh_str_starts_t* kh_init_str_starts() nogil khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, - int* ret) nogil + int* ret) nogil khint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) nogil void kh_destroy_str_starts(kh_str_starts_t*) nogil void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil From 1e25c6b3f3662ca362cb819fecc3f874e3f6c685 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Tue, 5 Mar 2019 15:22:58 +0300 Subject: [PATCH 07/11] Correct destroying of NA set table --- pandas/_libs/parsers.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 91252c549a923..8361958974232 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -2085,7 +2085,7 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL: # None creeps in sometimes, which isn't possible here if not isinstance(val, bytes): - kh_destroy_str(table) + kh_destroy_str_starts(table) raise ValueError('Must be all encoded bytes') kh_put_str_starts_item(table, PyBytes_AsString(val), &ret) From 16b5922572f5fb1b31adb7bafeebff8847cb7dc4 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Mon, 11 Mar 2019 17:05:55 +0300 Subject: [PATCH 08/11] cast first char symbol to unsigned char for functions for kh_str_starts_t khash struct --- pandas/_libs/src/klib/khash_python.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index 37e8534e23a9d..e322d0734afc6 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -99,13 +99,13 @@ inline static kh_str_starts_t* kh_init_str_starts(void) { inline static khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) { khint_t result = kh_put_str(table->table, key, ret); if (*ret != 0) { - table->starts[key[0]] = 1; + table->starts[(unsigned char)key[0]] = 1; } return result; } inline static khint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) { - int ch = *key; + unsigned char ch = *key; if (table->starts[ch]) { if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1; } From 0755c64f9539f108944bb8ccc56449dc6f024343 Mon Sep 17 00:00:00 2001 From: Anatoly Myachev Date: Fri, 15 Mar 2019 13:59:09 +0300 Subject: [PATCH 09/11] Use PANDAS_INLINE instead of plain "static inline" --- pandas/_libs/src/klib/khash_python.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index e322d0734afc6..a81f9785ebe64 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -90,13 +90,15 @@ typedef struct { int starts[256]; } kh_str_starts_t; -inline static kh_str_starts_t* kh_init_str_starts(void) { +typedef kh_str_starts_t* p_kh_str_starts_t; + +p_kh_str_starts_t PANDAS_INLINE kh_init_str_starts(void) { kh_str_starts_t *result = (kh_str_starts_t*)calloc(1, sizeof(kh_str_starts_t)); result->table = kh_init_str(); return result; } -inline static khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) { +khint_t PANDAS_INLINE kh_put_str_starts_item(kh_str_starts_t* table, char* key, int* ret) { khint_t result = kh_put_str(table->table, key, ret); if (*ret != 0) { table->starts[(unsigned char)key[0]] = 1; @@ -104,7 +106,7 @@ inline static khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key, return result; } -inline static khint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) { +khint_t PANDAS_INLINE kh_get_str_starts_item(kh_str_starts_t* table, char* key) { unsigned char ch = *key; if (table->starts[ch]) { if (ch == '\0' || kh_get_str(table->table, key) != table->table->n_buckets) return 1; @@ -112,11 +114,11 @@ inline static khint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) return 0; } -inline static void kh_destroy_str_starts(kh_str_starts_t* table) { +void PANDAS_INLINE kh_destroy_str_starts(kh_str_starts_t* table) { kh_destroy_str(table->table); free(table); } -inline static void kh_resize_str_starts(kh_str_starts_t* table, khint_t val) { +void PANDAS_INLINE kh_resize_str_starts(kh_str_starts_t* table, khint_t val) { kh_resize_str(table->table, val); } \ No newline at end of file From da9d4e64bffa234171979eddc82be5564902cdfa Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Wed, 20 Mar 2019 19:14:46 +0300 Subject: [PATCH 10/11] Fix linting errors --- pandas/_libs/parsers.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 8361958974232..88bb00cb54319 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1798,7 +1798,8 @@ cdef inline int _try_double_nogil(parser_t *parser, data[0] = NA else: data[0] = double_converter(word, &p_end, parser.decimal, - parser.sci, parser.thousands, 1, &error) + parser.sci, parser.thousands, + 1, &error) if error != 0 or p_end == word or p_end[0]: error = 0 if (strcasecmp(word, cinf) == 0 or @@ -2079,7 +2080,6 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL: table = kh_init_str_starts() - for i in range(len(values)): val = values[i] From 2ceb8501723bd0e96e2bb2fac543cae8c85553df Mon Sep 17 00:00:00 2001 From: Vasily Litvinov Date: Wed, 20 Mar 2019 23:14:50 +0300 Subject: [PATCH 11/11] Added whatsnew entry --- doc/source/whatsnew/v0.25.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 2ed2c21ba5584..0d68826034afa 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -176,6 +176,7 @@ Performance Improvements int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) - Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`) - Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`) +- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`) .. _whatsnew_0250.bug_fixes: