Skip to content

Commit 4c21e5c

Browse files
vnlitvinov authored and jreback committed
Speed up tokenizing of a row in csv and xstrtod parsing (#25784)
1 parent 37d04a3 commit 4c21e5c

File tree

3 files changed

+75
-49
lines changed

3 files changed

+75
-49
lines changed

doc/source/whatsnew/v0.25.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,7 @@ Performance Improvements
175175
- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is
176176
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`)
177177
- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
178-
178+
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
179179

180180
.. _whatsnew_0250.bug_fixes:
181181

pandas/_libs/src/parser/tokenizer.c

+59-33
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) {
4545
self->line_start = parser->line_start + start;
4646
}
4747

48-
coliter_t *coliter_new(parser_t *self, int i) {
48+
coliter_t *coliter_new(register parser_t *self, int i) {
4949
// column i, starting at 0
5050
coliter_t *iter = (coliter_t *)malloc(sizeof(coliter_t));
5151

@@ -97,7 +97,7 @@ static void *grow_buffer(void *buffer, int64_t length, int64_t *capacity,
9797
return newbuffer;
9898
}
9999

100-
void parser_set_default_options(parser_t *self) {
100+
void parser_set_default_options(register parser_t *self) {
101101
self->decimal = '.';
102102
self->sci = 'E';
103103

@@ -131,11 +131,11 @@ void parser_set_default_options(parser_t *self) {
131131
self->skip_footer = 0;
132132
}
133133

134-
int get_parser_memory_footprint(parser_t *self) { return 0; }
134+
int get_parser_memory_footprint(register parser_t *self) { return 0; }
135135

136136
parser_t *parser_new() { return (parser_t *)calloc(1, sizeof(parser_t)); }
137137

138-
int parser_clear_data_buffers(parser_t *self) {
138+
int parser_clear_data_buffers(register parser_t *self) {
139139
free_if_not_null((void *)&self->stream);
140140
free_if_not_null((void *)&self->words);
141141
free_if_not_null((void *)&self->word_starts);
@@ -144,7 +144,7 @@ int parser_clear_data_buffers(parser_t *self) {
144144
return 0;
145145
}
146146

147-
int parser_cleanup(parser_t *self) {
147+
int parser_cleanup(register parser_t *self) {
148148
int status = 0;
149149

150150
// XXX where to put this
@@ -170,7 +170,7 @@ int parser_cleanup(parser_t *self) {
170170
return status;
171171
}
172172

173-
int parser_init(parser_t *self) {
173+
int parser_init(register parser_t *self) {
174174
int64_t sz;
175175

176176
/*
@@ -240,16 +240,16 @@ int parser_init(parser_t *self) {
240240
return 0;
241241
}
242242

243-
void parser_free(parser_t *self) {
243+
void parser_free(register parser_t *self) {
244244
// opposite of parser_init
245245
parser_cleanup(self);
246246
}
247247

248-
void parser_del(parser_t *self) {
248+
void parser_del(register parser_t *self) {
249249
free(self);
250250
}
251251

252-
static int make_stream_space(parser_t *self, size_t nbytes) {
252+
static int make_stream_space(register parser_t *self, size_t nbytes) {
253253
int64_t i, cap, length;
254254
int status;
255255
void *orig_ptr, *newptr;
@@ -363,7 +363,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
363363
return 0;
364364
}
365365

366-
static int push_char(parser_t *self, char c) {
366+
static int push_char(register parser_t *self, char c) {
367367
TRACE(("push_char: self->stream[%zu] = %x, stream_cap=%zu\n",
368368
self->stream_len + 1, c, self->stream_cap))
369369
if (self->stream_len >= self->stream_cap) {
@@ -381,7 +381,7 @@ static int push_char(parser_t *self, char c) {
381381
return 0;
382382
}
383383

384-
int PANDAS_INLINE end_field(parser_t *self) {
384+
int PANDAS_INLINE end_field(register parser_t *self) {
385385
// XXX cruft
386386
if (self->words_len >= self->words_cap) {
387387
TRACE(
@@ -419,7 +419,7 @@ int PANDAS_INLINE end_field(parser_t *self) {
419419
return 0;
420420
}
421421

422-
static void append_warning(parser_t *self, const char *msg) {
422+
static void append_warning(register parser_t *self, const char *msg) {
423423
int64_t ex_length;
424424
int64_t length = strlen(msg);
425425
void *newptr;
@@ -437,7 +437,7 @@ static void append_warning(parser_t *self, const char *msg) {
437437
}
438438
}
439439

440-
static int end_line(parser_t *self) {
440+
static int end_line(register parser_t *self) {
441441
char *msg;
442442
int64_t fields;
443443
int ex_fields = self->expected_fields;
@@ -556,7 +556,7 @@ static int end_line(parser_t *self) {
556556
return 0;
557557
}
558558

559-
int parser_add_skiprow(parser_t *self, int64_t row) {
559+
int parser_add_skiprow(register parser_t *self, int64_t row) {
560560
khiter_t k;
561561
kh_int64_t *set;
562562
int ret = 0;
@@ -573,7 +573,7 @@ int parser_add_skiprow(parser_t *self, int64_t row) {
573573
return 0;
574574
}
575575

576-
int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
576+
int parser_set_skipfirstnrows(register parser_t *self, int64_t nrows) {
577577
// self->file_lines is zero based so subtract 1 from nrows
578578
if (nrows > 0) {
579579
self->skip_first_N_rows = nrows - 1;
@@ -582,7 +582,7 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
582582
return 0;
583583
}
584584

585-
static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
585+
static int parser_buffer_bytes(register parser_t *self, size_t nbytes) {
586586
int status;
587587
size_t bytes_read;
588588

@@ -677,18 +677,16 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
677677
#define IS_WHITESPACE(c) ((c == ' ' || c == '\t'))
678678

679679
#define IS_TERMINATOR(c) \
680-
((self->lineterminator == '\0' && c == '\n') || \
681-
(self->lineterminator != '\0' && c == self->lineterminator))
680+
(c == line_terminator)
682681

683682
#define IS_QUOTE(c) ((c == self->quotechar && self->quoting != QUOTE_NONE))
684683

685684
// don't parse '\r' with a custom line terminator
686-
#define IS_CARRIAGE(c) ((self->lineterminator == '\0' && c == '\r'))
685+
#define IS_CARRIAGE(c) (c == carriage_symbol)
687686

688-
#define IS_COMMENT_CHAR(c) \
689-
((self->commentchar != '\0' && c == self->commentchar))
687+
#define IS_COMMENT_CHAR(c) (c == comment_symbol)
690688

691-
#define IS_ESCAPE_CHAR(c) ((self->escapechar != '\0' && c == self->escapechar))
689+
#define IS_ESCAPE_CHAR(c) (c == escape_symbol)
692690

693691
#define IS_SKIPPABLE_SPACE(c) \
694692
((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
@@ -710,7 +708,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
710708
self->datapos += 3; \
711709
}
712710

713-
int skip_this_line(parser_t *self, int64_t rownum) {
711+
int skip_this_line(register parser_t *self, int64_t rownum) {
714712
int should_skip;
715713
PyObject *result;
716714
PyGILState_STATE state;
@@ -739,13 +737,25 @@ int skip_this_line(parser_t *self, int64_t rownum) {
739737
}
740738
}
741739

742-
int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
740+
int tokenize_bytes(register parser_t *self,
741+
size_t line_limit, int64_t start_lines) {
743742
int64_t i, slen;
744743
int should_skip;
745744
char c;
746745
char *stream;
747746
char *buf = self->data + self->datapos;
748747

748+
const char line_terminator = (self->lineterminator == '\0') ?
749+
'\n' : self->lineterminator;
750+
751+
// 1000 is something that couldn't fit in "char"
752+
// thus comparing a char to it would always be "false"
753+
const int carriage_symbol = (self->lineterminator == '\0') ? '\r' : 1000;
754+
const int comment_symbol = (self->commentchar != '\0') ?
755+
self->commentchar : 1000;
756+
const int escape_symbol = (self->escapechar != '\0') ?
757+
self->escapechar : 1000;
758+
749759
if (make_stream_space(self, self->datalen - self->datapos) < 0) {
750760
int64_t bufsize = 100;
751761
self->error_msg = (char *)malloc(bufsize);
@@ -1149,7 +1159,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
11491159
return 0;
11501160
}
11511161

1152-
static int parser_handle_eof(parser_t *self) {
1162+
static int parser_handle_eof(register parser_t *self) {
11531163
int64_t bufsize = 100;
11541164

11551165
TRACE(
@@ -1194,7 +1204,7 @@ static int parser_handle_eof(parser_t *self) {
11941204
return 0;
11951205
}
11961206

1197-
int parser_consume_rows(parser_t *self, size_t nrows) {
1207+
int parser_consume_rows(register parser_t *self, size_t nrows) {
11981208
int64_t i, offset, word_deletions, char_count;
11991209

12001210
if (nrows > self->lines) {
@@ -1250,7 +1260,7 @@ static size_t _next_pow2(size_t sz) {
12501260
return result;
12511261
}
12521262

1253-
int parser_trim_buffers(parser_t *self) {
1263+
int parser_trim_buffers(register parser_t *self) {
12541264
/*
12551265
Free memory
12561266
*/
@@ -1353,7 +1363,7 @@ int parser_trim_buffers(parser_t *self) {
13531363
all : tokenize all the data vs. certain number of rows
13541364
*/
13551365

1356-
int _tokenize_helper(parser_t *self, size_t nrows, int all) {
1366+
int _tokenize_helper(register parser_t *self, size_t nrows, int all) {
13571367
int status = 0;
13581368
int64_t start_lines = self->lines;
13591369

@@ -1402,12 +1412,12 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
14021412
return status;
14031413
}
14041414

1405-
int tokenize_nrows(parser_t *self, size_t nrows) {
1415+
int tokenize_nrows(register parser_t *self, size_t nrows) {
14061416
int status = _tokenize_helper(self, nrows, 0);
14071417
return status;
14081418
}
14091419

1410-
int tokenize_all_rows(parser_t *self) {
1420+
int tokenize_all_rows(register parser_t *self) {
14111421
int status = _tokenize_helper(self, -1, 1);
14121422
return status;
14131423
}
@@ -1529,9 +1539,14 @@ int main(int argc, char *argv[]) {
15291539
// * Add tsep argument for thousands separator
15301540
//
15311541

1542+
// Pessimistic but cheap estimate of how many decimal digits an unsigned int
1543+
// can hold without overflowing, assuming each decimal digit needs 4 bits.
1544+
const int max_int_decimal_digits = (sizeof(unsigned int) * 8) / 4;
1545+
15321546
double xstrtod(const char *str, char **endptr, char decimal, char sci,
15331547
char tsep, int skip_trailing) {
15341548
double number;
1549+
unsigned int i_number = 0;
15351550
int exponent;
15361551
int negative;
15371552
char *p = (char *)str;
@@ -1554,19 +1569,30 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
15541569
p++;
15551570
}
15561571

1557-
number = 0.;
15581572
exponent = 0;
15591573
num_digits = 0;
15601574
num_decimals = 0;
15611575

15621576
// Process string of digits.
1563-
while (isdigit_ascii(*p)) {
1564-
number = number * 10. + (*p - '0');
1577+
while (isdigit_ascii(*p) && num_digits <= max_int_decimal_digits) {
1578+
i_number = i_number * 10 + (*p - '0');
15651579
p++;
15661580
num_digits++;
15671581

15681582
p += (tsep != '\0' && *p == tsep);
15691583
}
1584+
number = i_number;
1585+
1586+
if (num_digits > max_int_decimal_digits) {
1587+
// process what's left as double
1588+
while (isdigit_ascii(*p)) {
1589+
number = number * 10. + (*p - '0');
1590+
p++;
1591+
num_digits++;
1592+
1593+
p += (tsep != '\0' && *p == tsep);
1594+
}
1595+
}
15701596

15711597
// Process decimal part.
15721598
if (*p == decimal) {

pandas/_libs/src/parser/tokenizer.h

+15-15
Original file line numberDiff line numberDiff line change
@@ -212,35 +212,35 @@ typedef struct coliter_t {
212212
} coliter_t;
213213

214214
void coliter_setup(coliter_t *self, parser_t *parser, int i, int start);
215-
coliter_t *coliter_new(parser_t *self, int i);
215+
coliter_t *coliter_new(register parser_t *self, int i);
216216

217-
#define COLITER_NEXT(iter, word) \
218-
do { \
219-
const int64_t i = *iter.line_start++ + iter.col; \
220-
word = i < *iter.line_start ? iter.words[i] : ""; \
217+
#define COLITER_NEXT(iter, word) \
218+
do { \
219+
const int64_t i = *iter.line_start++ + iter.col; \
220+
word = i >= *iter.line_start ? "" : iter.words[i]; \
221221
} while (0)
222222

223223
parser_t *parser_new(void);
224224

225-
int parser_init(parser_t *self);
225+
int parser_init(register parser_t *self);
226226

227-
int parser_consume_rows(parser_t *self, size_t nrows);
227+
int parser_consume_rows(register parser_t *self, size_t nrows);
228228

229-
int parser_trim_buffers(parser_t *self);
229+
int parser_trim_buffers(register parser_t *self);
230230

231-
int parser_add_skiprow(parser_t *self, int64_t row);
231+
int parser_add_skiprow(register parser_t *self, int64_t row);
232232

233-
int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
233+
int parser_set_skipfirstnrows(register parser_t *self, int64_t nrows);
234234

235-
void parser_free(parser_t *self);
235+
void parser_free(register parser_t *self);
236236

237-
void parser_del(parser_t *self);
237+
void parser_del(register parser_t *self);
238238

239-
void parser_set_default_options(parser_t *self);
239+
void parser_set_default_options(register parser_t *self);
240240

241-
int tokenize_nrows(parser_t *self, size_t nrows);
241+
int tokenize_nrows(register parser_t *self, size_t nrows);
242242

243-
int tokenize_all_rows(parser_t *self);
243+
int tokenize_all_rows(register parser_t *self);
244244

245245
// Have parsed / type-converted a chunk of data
246246
// and want to free memory from the token stream

0 commit comments

Comments
 (0)