
Commit e5fe75e

Merge pull request pandas-dev#8752 from selasley/trailing_spaces_fix

Update tokenizer to fix pandas-dev#8679 pandas-dev#8661

2 parents ff0756f + 6bf83c5

6 files changed: +135 -23 lines changed

doc/source/whatsnew/v0.15.2.txt (+2)

@@ -74,6 +74,7 @@ Enhancements
 
 Performance
 ~~~~~~~~~~~
+- Reduce memory usage when skiprows is an integer in read_csv (:issue:`8681`)
 
 .. _whatsnew_0152.experimental:
 
@@ -155,3 +156,4 @@ Bug Fixes
   of the level names are numbers (:issue:`8584`).
 - Bug in ``MultiIndex`` where ``__contains__`` returns wrong result if index is
   not lexically sorted or unique (:issue:`7724`)
+- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`)
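
For context, the performance entry refers to the integer form of skiprows, which this commit routes through a single cutoff in the C parser instead of a materialized per-row set. A minimal usage sketch (the file name is hypothetical, not from the commit):

import pandas as pd

# Skip the first 10000 lines of the file. After this commit the C parser
# stores one skip_first_N_rows cutoff rather than hashing 10000 row numbers.
df = pd.read_csv('big_file.csv', skiprows=10000)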

pandas/io/tests/test_parsers.py (+23)

@@ -3048,6 +3048,29 @@ def test_comment_skiprows(self):
         df = self.read_csv(StringIO(data), comment='#', skiprows=4)
         tm.assert_almost_equal(df.values, expected)
 
+    def test_trailing_spaces(self):
+        data = """skip
+random line with trailing spaces
+skip
+1,2,3
+1,2.,4.
+random line with trailing tabs\t\t\t
+
+5.,NaN,10.0
+"""
+        expected = pd.DataFrame([[1., 2., 4.],
+                                 [5., np.nan, 10.]])
+        # this should ignore six lines including lines with trailing
+        # whitespace and blank lines.  issues 8661, 8679
+        df = self.read_csv(StringIO(data.replace(',', ' ')),
+                           header=None, delim_whitespace=True,
+                           skiprows=[0,1,2,3,5,6], skip_blank_lines=True)
+        tm.assert_frame_equal(df, expected)
+        df = self.read_table(StringIO(data.replace(',', ' ')),
+                             header=None, delim_whitespace=True,
+                             skiprows=[0,1,2,3,5,6], skip_blank_lines=True)
+        tm.assert_frame_equal(df, expected)
+
     def test_comment_header(self):
         data = """# empty
 # second empty line
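
The new test exercises both read_csv and read_table through the whitespace tokenizer. For readers who want to reproduce the original failure outside the test suite, here is a standalone sketch; the data mirrors the test above, but it is written against a modern pandas API (pd.testing in place of the era's pandas.util.testing), which is an assumption on my part:

from io import StringIO

import numpy as np
import pandas as pd

# Rows 0-3, 5 and 6 are skipped; rows 1, 5 and 6 end in trailing spaces,
# trailing tabs, or are blank -- the cases that broke skiprows before this fix.
data = (
    "skip\n"
    "random line with trailing spaces   \n"
    "skip\n"
    "1 2 3\n"
    "1 2. 4.\n"
    "random line with trailing tabs\t\t\t\n"
    "   \n"
    "5. NaN 10.0\n"
)

df = pd.read_csv(StringIO(data), header=None, delim_whitespace=True,
                 skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True)
expected = pd.DataFrame([[1.0, 2.0, 4.0], [5.0, np.nan, 10.0]])
pd.testing.assert_frame_equal(df, expected)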

pandas/parser.pyx (+8 -4)

@@ -86,6 +86,7 @@ cdef extern from "parser/tokenizer.h":
         EAT_COMMENT
         EAT_LINE_COMMENT
         WHITESPACE_LINE
+        SKIP_LINE
         FINISHED
 
     enum: ERROR_OVERFLOW
@@ -158,6 +159,7 @@ cdef extern from "parser/tokenizer.h":
         int header_end          # header row end
 
         void *skipset
+        int64_t skip_first_N_rows
         int skip_footer
         double (*converter)(const char *, char **, char, char, char, int)
 
@@ -181,6 +183,8 @@ cdef extern from "parser/tokenizer.h":
     void parser_free(parser_t *self) nogil
     int parser_add_skiprow(parser_t *self, int64_t row)
 
+    int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)
+
     void parser_set_default_options(parser_t *self)
 
     int parser_consume_rows(parser_t *self, size_t nrows)
@@ -524,10 +528,10 @@ cdef class TextReader:
 
     cdef _make_skiprow_set(self):
         if isinstance(self.skiprows, (int, np.integer)):
-            self.skiprows = range(self.skiprows)
-
-        for i in self.skiprows:
-            parser_add_skiprow(self.parser, i)
+            parser_set_skipfirstnrows(self.parser, self.skiprows)
+        else:
+            for i in self.skiprows:
+                parser_add_skiprow(self.parser, i)
 
     cdef _setup_parser_source(self, source):
        cdef:
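
The Cython-level change above is the dispatch: an integer skiprows no longer gets expanded via range() and inserted row by row into the C hash set. A rough Python analogue of the new logic (names mirror the diff; the real code calls into the C tokenizer, so this is a sketch only):

# Rough Python analogue of _make_skiprow_set after this commit (sketch only).
def make_skiprow_set(parser, skiprows):
    if isinstance(skiprows, int):
        # O(1) memory: record a single cutoff instead of materializing
        # range(skiprows) and hashing every row number.
        parser.skip_first_n_rows = skiprows - 1  # file_lines is zero-based
    else:
        # an explicit list of rows still goes into a set for O(1) lookup
        parser.skipset = set(skiprows)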

pandas/src/parser/tokenizer.c (+82 -19)

@@ -156,6 +156,7 @@ void parser_set_default_options(parser_t *self) {
     self->thousands = '\0';
 
     self->skipset = NULL;
+    self->skip_first_N_rows = -1;
     self->skip_footer = 0;
 }
 
@@ -444,21 +445,17 @@ static int end_line(parser_t *self) {
         }
     }
 
-    if (self->skipset != NULL) {
-        k = kh_get_int64((kh_int64_t*) self->skipset, self->file_lines);
-
-        if (k != ((kh_int64_t*)self->skipset)->n_buckets) {
-            TRACE(("Skipping row %d\n", self->file_lines));
-            // increment file line count
-            self->file_lines++;
-
-            // skip the tokens from this bad line
-            self->line_start[self->lines] += fields;
+    if (self->state == SKIP_LINE) {
+        TRACE(("Skipping row %d\n", self->file_lines));
+        // increment file line count
+        self->file_lines++;
+
+        // skip the tokens from this bad line
+        self->line_start[self->lines] += fields;
 
-            // reset field count
-            self->line_fields[self->lines] = 0;
-            return 0;
-        }
+        // reset field count
+        self->line_fields[self->lines] = 0;
+        return 0;
     }
 
     /* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */
@@ -556,6 +553,15 @@ int parser_add_skiprow(parser_t *self, int64_t row) {
     return 0;
 }
 
+int parser_set_skipfirstnrows(parser_t *self, int64_t nrows) {
+    // self->file_lines is zero based so subtract 1 from nrows
+    if (nrows > 0) {
+        self->skip_first_N_rows = nrows - 1;
+    }
+
+    return 0;
+}
+
 static int parser_buffer_bytes(parser_t *self, size_t nbytes) {
     int status;
     size_t bytes_read;
@@ -656,6 +662,15 @@ typedef int (*parser_op)(parser_t *self, size_t line_limit);
     TRACE(("datapos: %d, datalen: %d\n", self->datapos, self->datalen));
 
 
+int skip_this_line(parser_t *self, int64_t rownum) {
+    if (self->skipset != NULL) {
+        return ( kh_get_int64((kh_int64_t*) self->skipset, self->file_lines) !=
+                 ((kh_int64_t*)self->skipset)->n_buckets );
+    }
+    else {
+        return ( rownum <= self->skip_first_N_rows );
+    }
+}
 
 int tokenize_delimited(parser_t *self, size_t line_limit)
 {
@@ -688,10 +703,25 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
 
         switch(self->state) {
 
+        case SKIP_LINE:
+            // TRACE(("tokenize_delimited SKIP_LINE %c, state %d\n", c, self->state));
+            if (c == '\n') {
+                END_LINE();
+            }
+            break;
+
         case START_RECORD:
             // start of record
-
-            if (c == '\n') {
+            if (skip_this_line(self, self->file_lines)) {
+                if (c == '\n') {
+                    END_LINE()
+                }
+                else {
+                    self->state = SKIP_LINE;
+                }
+                break;
+            }
+            else if (c == '\n') {
                 // \n\r possible?
                 if (self->skip_empty_lines)
                 {
@@ -1006,9 +1036,26 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
                  self->state));
 
         switch(self->state) {
+
+        case SKIP_LINE:
+            // TRACE(("tokenize_delim_customterm SKIP_LINE %c, state %d\n", c, self->state));
+            if (c == self->lineterminator) {
+                END_LINE();
+            }
+            break;
+
         case START_RECORD:
             // start of record
-            if (c == self->lineterminator) {
+            if (skip_this_line(self, self->file_lines)) {
+                if (c == self->lineterminator) {
+                    END_LINE()
+                }
+                else {
+                    self->state = SKIP_LINE;
+                }
+                break;
+            }
+            else if (c == self->lineterminator) {
                 // \n\r possible?
                 if (self->skip_empty_lines)
                 {
@@ -1252,6 +1299,14 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
                  self->state));
 
         switch(self->state) {
+
+        case SKIP_LINE:
+            // TRACE(("tokenize_whitespace SKIP_LINE %c, state %d\n", c, self->state));
+            if (c == '\n') {
+                END_LINE();
+            }
+            break;
+
         case WHITESPACE_LINE:
             if (c == '\n') {
                 self->file_lines++;
@@ -1283,9 +1338,17 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
 
         case START_RECORD:
             // start of record
-            if (c == '\n') {
-                // \n\r possible?
+            if (skip_this_line(self, self->file_lines)) {
+                if (c == '\n') {
+                    END_LINE()
+                }
+                else {
+                    self->state = SKIP_LINE;
+                }
+                break;
+            } else if (c == '\n') {
                 if (self->skip_empty_lines)
+                // \n\r possible?
                 {
                     self->file_lines++;
                 }
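
The heart of the fix is the new SKIP_LINE state: once skip_this_line() matches at START_RECORD, the tokenizer discards every character up to the line terminator, so trailing spaces or tabs on a skipped row never reach field tokenization. A compact Python model of that state machine (illustrative only, not the real tokenizer):

# Illustrative Python model of the SKIP_LINE behavior added in this commit.
START_RECORD, SKIP_LINE, IN_FIELD = range(3)

def tokenize(text, should_skip, lineterm='\n'):
    state, row_num = START_RECORD, 0
    rows, row, field = [], [], ''
    for c in text:
        if state == SKIP_LINE:
            if c == lineterm:            # eat everything up to end of line,
                row_num += 1             # trailing whitespace included
                state = START_RECORD
            continue
        if state == START_RECORD:
            if should_skip(row_num):
                if c == lineterm:        # an empty skipped line ends at once
                    row_num += 1
                else:
                    state = SKIP_LINE    # discard the rest of this line
                continue
            state = IN_FIELD
        if c == lineterm:                # end of a kept record
            row.append(field)
            rows.append(row)
            row, field = [], ''
            row_num += 1
            state = START_RECORD
        elif c == ',':
            row.append(field)
            field = ''
        else:
            field += c
    return rows

# Lines 0 and 2 are skipped even though they end in trailing whitespace:
print(tokenize('skip me   \na,b\nskip too\t\nc,d\n', lambda n: n in (0, 2)))
# [['a', 'b'], ['c', 'd']]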

pandas/src/parser/tokenizer.h (+4)

@@ -127,6 +127,7 @@ typedef enum {
     EAT_COMMENT,
     EAT_LINE_COMMENT,
     WHITESPACE_LINE,
+    SKIP_LINE,
     FINISHED
 } ParserState;
 
@@ -203,6 +204,7 @@ typedef struct parser_t {
     int header_end;   // header row end
 
     void *skipset;
+    int64_t skip_first_N_rows;
    int skip_footer;
    double (*converter)(const char *, char **, char, char, char, int);
 
@@ -240,6 +242,8 @@ int parser_trim_buffers(parser_t *self);
 
 int parser_add_skiprow(parser_t *self, int64_t row);
 
+int parser_set_skipfirstnrows(parser_t *self, int64_t nrows);
+
 void parser_free(parser_t *self);
 
 void parser_set_default_options(parser_t *self);

vb_suite/io_bench.py (+16)

@@ -21,6 +21,22 @@
 read_csv_standard = Benchmark("read_csv('__test__.csv')", setup1,
                               start_date=datetime(2011, 9, 15))
 
+#----------------------------------
+# skiprows
+
+setup1 = common_setup + """
+index = tm.makeStringIndex(20000)
+df = DataFrame({'float1' : randn(20000),
+                'float2' : randn(20000),
+                'string1' : ['foo'] * 20000,
+                'bool1' : [True] * 20000,
+                'int1' : np.random.randint(0, 200000, size=20000)},
+               index=index)
+df.to_csv('__test__.csv')
+"""
+
+read_csv_skiprows = Benchmark("read_csv('__test__.csv', skiprows=10000)", setup1,
+                              start_date=datetime(2011, 9, 15))
 
 #----------------------------------------------------------------------
 # write_csv
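
The new read_csv_skiprows benchmark times the integer path end to end. A self-contained way to run the same call by hand, outside vbench (this mirrors the setup above, but simplifying away the string index is my assumption):

import numpy as np
import pandas as pd

n = 20000
df = pd.DataFrame({'float1': np.random.randn(n),
                   'float2': np.random.randn(n),
                   'string1': ['foo'] * n,
                   'bool1': [True] * n,
                   'int1': np.random.randint(0, 200000, size=n)})
df.to_csv('__test__.csv')

# The benchmarked call: integer skiprows takes the new
# skip_first_N_rows cutoff inside the C parser.
result = pd.read_csv('__test__.csv', skiprows=10000)
print(len(result))  # 10000 rows remain after skipping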
