@@ -45,7 +45,7 @@ void coliter_setup(coliter_t *self, parser_t *parser, int i, int start) {
45
45
self -> line_start = parser -> line_start + start ;
46
46
}
47
47
48
- coliter_t * coliter_new (parser_t * self , int i ) {
48
+ coliter_t * coliter_new (register parser_t * self , int i ) {
49
49
// column i, starting at 0
50
50
coliter_t * iter = (coliter_t * )malloc (sizeof (coliter_t ));
51
51
@@ -97,7 +97,7 @@ static void *grow_buffer(void *buffer, int64_t length, int64_t *capacity,
97
97
return newbuffer ;
98
98
}
99
99
100
- void parser_set_default_options (parser_t * self ) {
100
+ void parser_set_default_options (register parser_t * self ) {
101
101
self -> decimal = '.' ;
102
102
self -> sci = 'E' ;
103
103
@@ -131,11 +131,11 @@ void parser_set_default_options(parser_t *self) {
131
131
self -> skip_footer = 0 ;
132
132
}
133
133
134
- int get_parser_memory_footprint (parser_t * self ) { return 0 ; }
134
+ int get_parser_memory_footprint (register parser_t * self ) { return 0 ; }
135
135
136
136
// Allocate a single parser_t on the heap with calloc, so every byte of the
// struct starts out as zero. Returns the new pointer, or NULL if calloc
// fails. No other setup is performed here.
parser_t * parser_new () { return (parser_t * )calloc (1 , sizeof (parser_t )); }
137
137
138
- int parser_clear_data_buffers (parser_t * self ) {
138
+ int parser_clear_data_buffers (register parser_t * self ) {
139
139
free_if_not_null ((void * )& self -> stream );
140
140
free_if_not_null ((void * )& self -> words );
141
141
free_if_not_null ((void * )& self -> word_starts );
@@ -144,7 +144,7 @@ int parser_clear_data_buffers(parser_t *self) {
144
144
return 0 ;
145
145
}
146
146
147
- int parser_cleanup (parser_t * self ) {
147
+ int parser_cleanup (register parser_t * self ) {
148
148
int status = 0 ;
149
149
150
150
// XXX where to put this
@@ -170,7 +170,7 @@ int parser_cleanup(parser_t *self) {
170
170
return status ;
171
171
}
172
172
173
- int parser_init (parser_t * self ) {
173
+ int parser_init (register parser_t * self ) {
174
174
int64_t sz ;
175
175
176
176
/*
@@ -240,16 +240,16 @@ int parser_init(parser_t *self) {
240
240
return 0 ;
241
241
}
242
242
243
- void parser_free (parser_t * self ) {
243
+ void parser_free (register parser_t * self ) {
244
244
// opposite of parser_init
245
245
parser_cleanup (self );
246
246
}
247
247
248
- void parser_del (parser_t * self ) {
248
+ void parser_del (register parser_t * self ) {
249
249
free (self );
250
250
}
251
251
252
- static int make_stream_space (parser_t * self , size_t nbytes ) {
252
+ static int make_stream_space (register parser_t * self , size_t nbytes ) {
253
253
int64_t i , cap , length ;
254
254
int status ;
255
255
void * orig_ptr , * newptr ;
@@ -363,7 +363,7 @@ static int make_stream_space(parser_t *self, size_t nbytes) {
363
363
return 0 ;
364
364
}
365
365
366
- static int push_char (parser_t * self , char c ) {
366
+ static int push_char (register parser_t * self , char c ) {
367
367
TRACE (("push_char: self->stream[%zu] = %x, stream_cap=%zu\n" ,
368
368
self -> stream_len + 1 , c , self -> stream_cap ))
369
369
if (self -> stream_len >= self -> stream_cap ) {
@@ -381,7 +381,7 @@ static int push_char(parser_t *self, char c) {
381
381
return 0 ;
382
382
}
383
383
384
- int PANDAS_INLINE end_field (parser_t * self ) {
384
+ int PANDAS_INLINE end_field (register parser_t * self ) {
385
385
// XXX cruft
386
386
if (self -> words_len >= self -> words_cap ) {
387
387
TRACE (
@@ -419,7 +419,7 @@ int PANDAS_INLINE end_field(parser_t *self) {
419
419
return 0 ;
420
420
}
421
421
422
- static void append_warning (parser_t * self , const char * msg ) {
422
+ static void append_warning (register parser_t * self , const char * msg ) {
423
423
int64_t ex_length ;
424
424
int64_t length = strlen (msg );
425
425
void * newptr ;
@@ -437,7 +437,7 @@ static void append_warning(parser_t *self, const char *msg) {
437
437
}
438
438
}
439
439
440
- static int end_line (parser_t * self ) {
440
+ static int end_line (register parser_t * self ) {
441
441
char * msg ;
442
442
int64_t fields ;
443
443
int ex_fields = self -> expected_fields ;
@@ -556,7 +556,7 @@ static int end_line(parser_t *self) {
556
556
return 0 ;
557
557
}
558
558
559
- int parser_add_skiprow (parser_t * self , int64_t row ) {
559
+ int parser_add_skiprow (register parser_t * self , int64_t row ) {
560
560
khiter_t k ;
561
561
kh_int64_t * set ;
562
562
int ret = 0 ;
@@ -573,7 +573,7 @@ int parser_add_skiprow(parser_t *self, int64_t row) {
573
573
return 0 ;
574
574
}
575
575
576
- int parser_set_skipfirstnrows (parser_t * self , int64_t nrows ) {
576
+ int parser_set_skipfirstnrows (register parser_t * self , int64_t nrows ) {
577
577
// self->file_lines is zero based so subtract 1 from nrows
578
578
if (nrows > 0 ) {
579
579
self -> skip_first_N_rows = nrows - 1 ;
@@ -582,7 +582,7 @@ int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
582
582
return 0 ;
583
583
}
584
584
585
- static int parser_buffer_bytes (parser_t * self , size_t nbytes ) {
585
+ static int parser_buffer_bytes (register parser_t * self , size_t nbytes ) {
586
586
int status ;
587
587
size_t bytes_read ;
588
588
@@ -677,18 +677,16 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
677
677
#define IS_WHITESPACE (c ) ((c == ' ' || c == '\t'))
678
678
679
679
#define IS_TERMINATOR (c ) \
680
- ((self->lineterminator == '\0' && c == '\n') || \
681
- (self->lineterminator != '\0' && c == self->lineterminator))
680
+ (c == line_terminator)
682
681
683
682
#define IS_QUOTE (c ) ((c == self->quotechar && self->quoting != QUOTE_NONE))
684
683
685
684
// don't parse '\r' with a custom line terminator
686
- #define IS_CARRIAGE (c ) ((self->lineterminator == '\0' && c == '\r') )
685
+ #define IS_CARRIAGE (c ) (c == carriage_symbol )
687
686
688
- #define IS_COMMENT_CHAR (c ) \
689
- ((self->commentchar != '\0' && c == self->commentchar))
687
+ #define IS_COMMENT_CHAR (c ) (c == comment_symbol)
690
688
691
- #define IS_ESCAPE_CHAR (c ) ((self->escapechar != '\0' && c == self->escapechar) )
689
+ #define IS_ESCAPE_CHAR (c ) (c == escape_symbol )
692
690
693
691
#define IS_SKIPPABLE_SPACE (c ) \
694
692
((!self->delim_whitespace && c == ' ' && self->skipinitialspace))
@@ -710,7 +708,7 @@ static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
710
708
self->datapos += 3; \
711
709
}
712
710
713
- int skip_this_line (parser_t * self , int64_t rownum ) {
711
+ int skip_this_line (register parser_t * self , int64_t rownum ) {
714
712
int should_skip ;
715
713
PyObject * result ;
716
714
PyGILState_STATE state ;
@@ -739,13 +737,25 @@ int skip_this_line(parser_t *self, int64_t rownum) {
739
737
}
740
738
}
741
739
742
- int tokenize_bytes (parser_t * self , size_t line_limit , int64_t start_lines ) {
740
+ int tokenize_bytes (register parser_t * self ,
741
+ size_t line_limit , int64_t start_lines ) {
743
742
int64_t i , slen ;
744
743
int should_skip ;
745
744
char c ;
746
745
char * stream ;
747
746
char * buf = self -> data + self -> datapos ;
748
747
748
+ const char line_terminator = (self -> lineterminator == '\0' ) ?
749
+ '\n' : self -> lineterminator ;
750
+
751
+ // 1000 is a sentinel that cannot fit in a "char", so comparing
752
+ // a char against it is always false (the check is effectively disabled)
753
+ const int carriage_symbol = (self -> lineterminator == '\0' ) ? '\r' : 1000 ;
754
+ const int comment_symbol = (self -> commentchar != '\0' ) ?
755
+ self -> commentchar : 1000 ;
756
+ const int escape_symbol = (self -> escapechar != '\0' ) ?
757
+ self -> escapechar : 1000 ;
758
+
749
759
if (make_stream_space (self , self -> datalen - self -> datapos ) < 0 ) {
750
760
int64_t bufsize = 100 ;
751
761
self -> error_msg = (char * )malloc (bufsize );
@@ -1149,7 +1159,7 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int64_t start_lines) {
1149
1159
return 0 ;
1150
1160
}
1151
1161
1152
- static int parser_handle_eof (parser_t * self ) {
1162
+ static int parser_handle_eof (register parser_t * self ) {
1153
1163
int64_t bufsize = 100 ;
1154
1164
1155
1165
TRACE (
@@ -1194,7 +1204,7 @@ static int parser_handle_eof(parser_t *self) {
1194
1204
return 0 ;
1195
1205
}
1196
1206
1197
- int parser_consume_rows (parser_t * self , size_t nrows ) {
1207
+ int parser_consume_rows (register parser_t * self , size_t nrows ) {
1198
1208
int64_t i , offset , word_deletions , char_count ;
1199
1209
1200
1210
if (nrows > self -> lines ) {
@@ -1250,7 +1260,7 @@ static size_t _next_pow2(size_t sz) {
1250
1260
return result ;
1251
1261
}
1252
1262
1253
- int parser_trim_buffers (parser_t * self ) {
1263
+ int parser_trim_buffers (register parser_t * self ) {
1254
1264
/*
1255
1265
Free memory
1256
1266
*/
@@ -1353,7 +1363,7 @@ int parser_trim_buffers(parser_t *self) {
1353
1363
all : tokenize all the data vs. certain number of rows
1354
1364
*/
1355
1365
1356
- int _tokenize_helper (parser_t * self , size_t nrows , int all ) {
1366
+ int _tokenize_helper (register parser_t * self , size_t nrows , int all ) {
1357
1367
int status = 0 ;
1358
1368
int64_t start_lines = self -> lines ;
1359
1369
@@ -1402,12 +1412,12 @@ int _tokenize_helper(parser_t *self, size_t nrows, int all) {
1402
1412
return status ;
1403
1413
}
1404
1414
1405
- int tokenize_nrows (parser_t * self , size_t nrows ) {
1415
+ int tokenize_nrows (register parser_t * self , size_t nrows ) {
1406
1416
int status = _tokenize_helper (self , nrows , 0 );
1407
1417
return status ;
1408
1418
}
1409
1419
1410
- int tokenize_all_rows (parser_t * self ) {
1420
+ int tokenize_all_rows (register parser_t * self ) {
1411
1421
int status = _tokenize_helper (self , -1 , 1 );
1412
1422
return status ;
1413
1423
}
@@ -1529,9 +1539,14 @@ int main(int argc, char *argv[]) {
1529
1539
// * Add tsep argument for thousands separator
1530
1540
//
1531
1541
1542
+ // pessimistic but quick estimate of how many decimal digits fit in
1543
+ // an unsigned int, assuming that each decimal digit requires 4 bits to store
1544
+ const int max_int_decimal_digits = (sizeof (unsigned int ) * 8 ) / 4 ;
1545
+
1532
1546
double xstrtod (const char * str , char * * endptr , char decimal , char sci ,
1533
1547
char tsep , int skip_trailing ) {
1534
1548
double number ;
1549
+ unsigned int i_number = 0 ;
1535
1550
int exponent ;
1536
1551
int negative ;
1537
1552
char * p = (char * )str ;
@@ -1554,19 +1569,30 @@ double xstrtod(const char *str, char **endptr, char decimal, char sci,
1554
1569
p ++ ;
1555
1570
}
1556
1571
1557
- number = 0. ;
1558
1572
exponent = 0 ;
1559
1573
num_digits = 0 ;
1560
1574
num_decimals = 0 ;
1561
1575
1562
1576
// Process string of digits.
1563
- while (isdigit_ascii (* p )) {
1564
- number = number * 10. + (* p - '0' );
1577
+ while (isdigit_ascii (* p ) && num_digits <= max_int_decimal_digits ) {
1578
+ i_number = i_number * 10 + (* p - '0' );
1565
1579
p ++ ;
1566
1580
num_digits ++ ;
1567
1581
1568
1582
p += (tsep != '\0' && * p == tsep );
1569
1583
}
1584
+ number = i_number ;
1585
+
1586
+ if (num_digits > max_int_decimal_digits ) {
1587
+ // process what's left as double
1588
+ while (isdigit_ascii (* p )) {
1589
+ number = number * 10. + (* p - '0' );
1590
+ p ++ ;
1591
+ num_digits ++ ;
1592
+
1593
+ p += (tsep != '\0' && * p == tsep );
1594
+ }
1595
+ }
1570
1596
1571
1597
// Process decimal part.
1572
1598
if (* p == decimal ) {
0 commit comments