Skip to content

Commit 7859063

Browse files
committed
ENH: missing trailing fields become NA, with a test verifying the behavior. Closes #2430
1 parent b0a19c8 commit 7859063

File tree

3 files changed

+28
-7
lines changed

3 files changed

+28
-7
lines changed

pandas/io/tests/test_cparser.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -137,12 +137,13 @@ def test_integer_thousands(self):
137137
tm.assert_almost_equal(result[0], expected)
138138

139139
def test_skip_bad_lines(self):
140+
# too many fields on some lines, see #2430 for why
140141
data = ('a:b:c\n'
141142
'd:e:f\n'
142143
'g:h:i\n'
143-
'j:k\n'
144+
'j:k:l:m\n'
144145
'l:m:n\n'
145-
'o:p')
146+
'o:p:q:r')
146147

147148
reader = TextReader(StringIO(data), delimiter=':',
148149
header=None)

pandas/io/tests/test_parsers.py

+14-3
Original file line numberDiff line numberDiff line change
@@ -720,10 +720,11 @@ def test_read_table_unicode(self):
720720
self.assert_(isinstance(df1['X0'].values[0], unicode))
721721

722722
def test_read_table_wrong_num_columns(self):
723+
# too many fields on the second data row (too few no longer raises; see #2430)
723724
data = """A,B,C,D,E,F
724-
1,2,3,4,5
725-
6,7,8,9,10
726-
11,12,13,14,15
725+
1,2,3,4,5,6
726+
6,7,8,9,10,11,12
727+
11,12,13,14,15,16
727728
"""
728729
self.assertRaises(Exception, self.read_csv, StringIO(data))
729730

@@ -1302,6 +1303,15 @@ def test_nonexistent_path(self):
13021303
path = '%s.csv' % tm.rands(10)
13031304
self.assertRaises(Exception, self.read_csv, path)
13041305

1306+
def test_missing_trailing_delimiters(self):
1307+
data = """A,B,C,D
1308+
1,2,3,4
1309+
1,3,3,
1310+
1,4,5"""
1311+
result = self.read_csv(StringIO(data))
1312+
self.assertTrue(result['D'].isnull()[1:].all())
1313+
1314+
13051315
class TestPythonParser(ParserTests, unittest.TestCase):
13061316

13071317
def read_csv(self, *args, **kwds):
@@ -1645,6 +1655,7 @@ def test_utf16_example(self):
16451655
result = self.read_table(buf, encoding='utf-16')
16461656
self.assertEquals(len(result), 50)
16471657

1658+
16481659
class TestCParserHighMemory(ParserTests, unittest.TestCase):
16491660

16501661
def read_csv(self, *args, **kwds):

pandas/src/parser/tokenizer.c

+11-2
Original file line numberDiff line numberDiff line change
@@ -447,7 +447,7 @@ static int end_line(parser_t *self) {
447447
}
448448
}
449449

450-
if (!(self->lines <= self->header + 1) && fields != ex_fields) {
450+
if (!(self->lines <= self->header + 1) && fields > ex_fields) {
451451
// increment file line count
452452
self->file_lines++;
453453

@@ -478,7 +478,16 @@ static int end_line(parser_t *self) {
478478
free(msg);
479479
}
480480
}
481-
} else {
481+
}
482+
else {
483+
/* missing trailing delimiters */
484+
if (self->lines >= self->header + 1 && self->lines > 0) {
485+
while (fields < ex_fields){
486+
end_field(self);
487+
fields++;
488+
}
489+
}
490+
482491
// increment both line counts
483492
self->file_lines++;
484493

0 commit comments

Comments
 (0)