Skip to content

Commit 238f522

Browse files
committed
ENH: handle comments in C tokenizer. add synthetic vbenchmark. close #1204
1 parent 808c30c commit 238f522

File tree

5 files changed

+58
-11
lines changed

5 files changed

+58
-11
lines changed

pandas/io/parsers.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -244,7 +244,7 @@ def _read(filepath_or_buffer, kwds):
244244
'widths': None
245245
}
246246

247-
_c_unsupported = set(['comment', 'skip_footer'])
247+
_c_unsupported = set(['skip_footer'])
248248
_python_unsupported = set(_c_parser_defaults.keys())
249249

250250

@@ -501,7 +501,7 @@ def _clean_options(self, options, engine):
501501

502502
# C engine not supported yet
503503
if engine == 'c':
504-
if (options['comment'] or options['skip_footer'] > 0):
504+
if options['skip_footer'] > 0:
505505
engine = 'python'
506506

507507
if engine == 'c':

pandas/src/parser.pyx

+6
Original file line number | Diff line number | Diff line change
@@ -73,6 +73,7 @@ cdef extern from "parser/tokenizer.h":
7373
QUOTE_IN_QUOTED_FIELD
7474
EAT_CRNL
7575
EAT_WHITESPACE
76+
EAT_COMMENT
7677
FINISHED
7778

7879
ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
@@ -332,6 +333,11 @@ cdef class TextReader:
332333
self.parser.quotechar = ord(quotechar)
333334
self.parser.quoting = quoting
334335

336+
if comment is not None:
337+
if len(comment) > 1:
338+
raise ValueError('Only length-1 comment characters supported')
339+
self.parser.commentchar = ord(comment)
340+
335341
# error handling of bad lines
336342
self.parser.error_bad_lines = int(error_bad_lines)
337343
self.parser.warn_bad_lines = int(warn_bad_lines)

pandas/src/parser/tokenizer.c

+34
Original file line number | Diff line number | Diff line change
@@ -255,6 +255,8 @@ int parser_init(parser_t *self) {
255255
self->error_msg = NULL;
256256
self->warn_msg = NULL;
257257

258+
self->commentchar = '\0';
259+
258260
return 0;
259261
}
260262

@@ -688,6 +690,10 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
688690
/* save empty field */
689691
END_FIELD();
690692
}
693+
else if (c == self->commentchar) {
694+
END_FIELD();
695+
self->state = EAT_COMMENT;
696+
}
691697
else {
692698
/* begin new unquoted field */
693699
if (self->quoting == QUOTE_NONNUMERIC)
@@ -726,6 +732,10 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
726732
END_FIELD();
727733
self->state = START_FIELD;
728734
}
735+
else if (c == self->commentchar) {
736+
END_FIELD();
737+
self->state = EAT_COMMENT;
738+
}
729739
else {
730740
/* normal character - save in field */
731741
PUSH_CHAR(c);
@@ -811,6 +821,14 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
811821
}
812822
break;
813823

824+
case EAT_COMMENT:
825+
if (c == '\n') {
826+
END_LINE();
827+
} else if (c == '\r') {
828+
self->state = EAT_CRNL;
829+
}
830+
break;
831+
814832
default:
815833
break;
816834

@@ -919,6 +937,10 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
919937
else if (IS_WHITESPACE(c)) {
920938
self->state = EAT_WHITESPACE;
921939
}
940+
else if (c == self->commentchar) {
941+
END_FIELD();
942+
self->state = EAT_COMMENT;
943+
}
922944
else {
923945
/* begin new unquoted field */
924946
if (self->quoting == QUOTE_NONNUMERIC)
@@ -957,6 +979,10 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
957979
END_FIELD();
958980
self->state = EAT_WHITESPACE;
959981
}
982+
else if (c == self->commentchar) {
983+
END_FIELD();
984+
self->state = EAT_COMMENT;
985+
}
960986
else {
961987
/* normal character - save in field */
962988
PUSH_CHAR(c);
@@ -1042,6 +1068,14 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
10421068
}
10431069
break;
10441070

1071+
case EAT_COMMENT:
1072+
if (c == '\n') {
1073+
END_LINE();
1074+
} else if (c == '\r') {
1075+
self->state = EAT_CRNL;
1076+
}
1077+
break;
1078+
10451079
default:
10461080
break;
10471081

pandas/src/parser/tokenizer.h

+1
Original file line number | Diff line number | Diff line change
@@ -119,6 +119,7 @@ typedef enum {
119119
QUOTE_IN_QUOTED_FIELD,
120120
EAT_CRNL,
121121
EAT_WHITESPACE,
122+
EAT_COMMENT,
122123
FINISHED
123124
} ParserState;
124125

vb_suite/parser.py

+15-9
Original file line number | Diff line number | Diff line change
@@ -2,10 +2,10 @@
22
from datetime import datetime
33

44
common_setup = """from pandas_vb_common import *
5+
from pandas import read_csv, read_table
56
"""
67

78
setup = common_setup + """
8-
from pandas import read_csv
99
import os
1010
N = 10000
1111
K = 8
@@ -19,7 +19,6 @@
1919

2020

2121
setup = common_setup + """
22-
from pandas import read_csv
2322
import os
2423
N = 10000
2524
K = 8
@@ -35,7 +34,6 @@
3534
start_date=datetime(2012, 5, 7))
3635

3736
setup = common_setup + """
38-
from pandas import read_csv
3937
import os
4038
N = 10000
4139
K = 8
@@ -46,13 +44,22 @@
4644
df.to_csv('test.csv', sep='|')
4745
"""
4846

49-
read_csv_comment_vb = Benchmark("read_csv('test.csv', sep='|', comment='#')",
50-
setup,
51-
cleanup="os.remove('test.csv')",
52-
start_date=datetime(2012, 5, 7))
47+
read_csv_comment = Benchmark("read_csv('test.csv', sep='|', comment='#')",
48+
setup,
49+
cleanup="os.remove('test.csv')",
50+
start_date=datetime(2012, 5, 7))
51+
52+
setup = common_setup + """
53+
data = ['A,B,C']
54+
data = data + ['1,2,3 # comment'] * 100000
55+
data = '\\n'.join(data)
56+
"""
57+
58+
stmt = "read_csv(StringIO(data), comment='#')"
59+
read_csv_comment2 = Benchmark(stmt, setup,
60+
start_date=datetime(2011, 11, 1))
5361

5462
setup = common_setup + """
55-
from pandas import read_table
5663
from cStringIO import StringIO
5764
import os
5865
N = 10000
@@ -72,7 +79,6 @@
7279
read_table_multiple_date = Benchmark(cmd, setup, start_date=sdate)
7380

7481
setup = common_setup + """
75-
from pandas import read_table
7682
from cStringIO import StringIO
7783
import os
7884
N = 10000

0 commit comments

Comments
 (0)