Skip to content

Commit cd057c7

Browse files
vnlitvinov authored and jreback committed
PERF: Improve performance of hash sets in read_csv (#25804)
1 parent 1f7ef05 commit cd057c7

File tree

6 files changed

+139
-97
lines changed

6 files changed

+139
-97
lines changed

doc/source/whatsnew/v0.25.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -176,6 +176,7 @@ Performance Improvements
176176
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
177177
- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
178178
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
179+
- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
179180

180181
.. _whatsnew_0250.bug_fixes:
181182

pandas/_libs/khash.pxd

+11
Original file line number | Diff line number | Diff line change
@@ -56,6 +56,17 @@ cdef extern from "khash_python.h":
5656

5757
bint kh_exist_str(kh_str_t*, khiter_t) nogil
5858

59+
ctypedef struct kh_str_starts_t:
60+
kh_str_t *table
61+
int starts[256]
62+
63+
kh_str_starts_t* kh_init_str_starts() nogil
64+
khint_t kh_put_str_starts_item(kh_str_starts_t* table, char* key,
65+
int* ret) nogil
66+
khint_t kh_get_str_starts_item(kh_str_starts_t* table, char* key) nogil
67+
void kh_destroy_str_starts(kh_str_starts_t*) nogil
68+
void kh_resize_str_starts(kh_str_starts_t*, khint_t) nogil
69+
5970
ctypedef struct kh_int64_t:
6071
khint_t n_buckets, size, n_occupied, upper_bound
6172
uint32_t *flags

0 commit comments

Comments
 (0)