Skip to content

Commit 900a552

Browse files
committed
ENH: handle ragged CSV files nicely when specifying explicit list of column names. close #2981
1 parent 2cab493 commit 900a552

File tree

5 files changed

+53
-11
lines changed

5 files changed

+53
-11
lines changed

RELEASE.rst

+4
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ pandas 0.11.0
119119
- Improved performance across several core functions by taking memory
120120
ordering of arrays into account. Courtesy of @stephenwlin (GH3130_)
121121
- Improved performance of groupby transform method (GH2121_)
122+
- Handle "ragged" CSV files missing trailing delimiters in rows with missing
123+
fields when also providing an explicit list of column names (so the parser
124+
knows how many columns to expect in the result) (GH2981_)
122125

123126
**API Changes**
124127

@@ -304,6 +307,7 @@ pandas 0.11.0
304307
.. _GH2932: https://github.com/pydata/pandas/issues/2932
305308
.. _GH2973: https://github.com/pydata/pandas/issues/2973
306309
.. _GH2967: https://github.com/pydata/pandas/issues/2967
310+
.. _GH2981: https://github.com/pydata/pandas/issues/2981
307311
.. _GH2982: https://github.com/pydata/pandas/issues/2982
308312
.. _GH2989: https://github.com/pydata/pandas/issues/2989
309313
.. _GH2993: https://github.com/pydata/pandas/issues/2993

pandas/io/tests/test_parsers.py

+20
Original file line numberDiff line numberDiff line change
@@ -1548,6 +1548,26 @@ def test_int64_min_issues(self):
15481548

15491549
tm.assert_frame_equal(result, expected)
15501550

1551+
def test_parse_ragged_csv(self):
1552+
data = """1,2,3
1553+
1,2,3,4
1554+
1,2,3,4,5
1555+
1,2
1556+
1,2,3,4"""
1557+
1558+
nice_data = """1,2,3,,
1559+
1,2,3,4,
1560+
1,2,3,4,5
1561+
1,2,,,
1562+
1,2,3,4,"""
1563+
result = self.read_csv(StringIO(data), header=None,
1564+
names=['a', 'b', 'c', 'd', 'e'])
1565+
1566+
expected = self.read_csv(StringIO(nice_data), header=None,
1567+
names=['a', 'b', 'c', 'd', 'e'])
1568+
1569+
tm.assert_frame_equal(result, expected)
1570+
15511571

15521572
class TestPythonParser(ParserTests, unittest.TestCase):
15531573

pandas/src/parser.pyx

+16-6
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ cdef extern from "parser/tokenizer.h":
131131
int allow_embedded_newline
132132
int strict # raise exception on bad CSV
133133

134+
int expected_fields
134135
int error_bad_lines
135136
int warn_bad_lines
136137

@@ -583,6 +584,10 @@ cdef class TextReader:
583584
header = self.names
584585

585586
elif self.names is not None:
587+
# Enforce the expected field count (len(names)) unless usecols is given
588+
if not self.has_usecols:
589+
self.parser.expected_fields = len(self.names)
590+
586591
# Names passed
587592
if self.parser.lines < 1:
588593
self._tokenize_rows(1)
@@ -605,14 +610,19 @@ cdef class TextReader:
605610
if self.parser.lines < data_line + 1:
606611
field_count = len(header)
607612
else: # not self.has_usecols:
613+
608614
field_count = self.parser.line_fields[data_line]
609615

616+
# #2981
617+
if self.names is not None:
618+
field_count = max(field_count, len(self.names))
619+
610620
passed_count = len(header)
611621

612-
if passed_count > field_count:
613-
raise CParserError('Column names have %d fields, '
614-
'data has %d fields'
615-
% (passed_count, field_count))
622+
# if passed_count > field_count:
623+
# raise CParserError('Column names have %d fields, '
624+
# 'data has %d fields'
625+
# % (passed_count, field_count))
616626

617627
if self.has_usecols:
618628
nuse = len(self.usecols)
@@ -623,8 +633,8 @@ cdef class TextReader:
623633
elif passed_count != field_count:
624634
raise ValueError('Passed header names '
625635
'mismatches usecols')
626-
# oh boy, #2442
627-
elif self.allow_leading_cols:
636+
# oh boy, #2442, #2981
637+
elif self.allow_leading_cols and passed_count < field_count:
628638
self.leading_cols = field_count - passed_count
629639

630640
return header, field_count

pandas/src/parser/tokenizer.c

+12-5
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ void parser_set_default_options(parser_t *self) {
148148
self->allow_embedded_newline = 1;
149149
self->strict = 0;
150150

151+
self->expected_fields = -1;
151152
self->error_bad_lines = 0;
152153
self->warn_bad_lines = 0;
153154

@@ -428,16 +429,19 @@ static void append_warning(parser_t *self, const char *msg) {
428429
static int end_line(parser_t *self) {
429430
int fields;
430431
khiter_t k; /* for hash set detection */
431-
int ex_fields = -1;
432+
int ex_fields = self->expected_fields;
432433
char *msg;
433434

434435
fields = self->line_fields[self->lines];
435436

436437
TRACE(("Line end, nfields: %d\n", fields));
437438

438-
439439
if (self->lines > 0) {
440-
ex_fields = self->line_fields[self->lines - 1];
440+
if (self->expected_fields >= 0) {
441+
ex_fields = self->expected_fields;
442+
} else {
443+
ex_fields = self->line_fields[self->lines - 1];
444+
}
441445
}
442446

443447
if (self->skipset != NULL) {
@@ -457,7 +461,10 @@ static int end_line(parser_t *self) {
457461
}
458462
}
459463

460-
if (!(self->lines <= self->header + 1) && fields > ex_fields) {
464+
/* printf("Line: %d, Fields: %d, Ex-fields: %d\n", self->lines, fields, ex_fields); */
465+
466+
if (!(self->lines <= self->header + 1)
467+
&& (self->expected_fields < 0 && fields > ex_fields)) {
461468
// increment file line count
462469
self->file_lines++;
463470

@@ -491,7 +498,7 @@ static int end_line(parser_t *self) {
491498
}
492499
else {
493500
/* missing trailing delimiters */
494-
if (self->lines >= self->header + 1 && self->lines > 0) {
501+
if (self->lines >= self->header + 1) {
495502
while (fields < ex_fields){
496503
end_field(self);
497504
/* printf("Prior word: %s\n", self->words[self->words_len - 2]); */

pandas/src/parser/tokenizer.h

+1
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,7 @@ typedef struct parser_t {
183183
int allow_embedded_newline;
184184
int strict; /* raise exception on bad CSV */
185185

186+
int expected_fields;
186187
int error_bad_lines;
187188
int warn_bad_lines;
188189

0 commit comments

Comments
 (0)