Skip to content

Commit 639ce04

Browse files
committed
BUG: read_csv skips lines with initial whitespace + one non-space character
1 parent 9e4e447 commit 639ce04

File tree

3 files changed

+63
-6
lines changed

3 files changed

+63
-6
lines changed

doc/source/whatsnew/v0.16.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -95,4 +95,7 @@ Bug Fixes
9595
- Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`)
9696

9797
- Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`)
98+
9899
- Bug in ``read_csv`` and ``read_table`` when using the ``skiprows`` parameter if blank lines are present. (:issue:`9832`)
100+
101+
- Bug in the CSV parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`)

pandas/io/tests/test_parsers.py

+52
Original file line numberDiff line numberDiff line change
@@ -2253,6 +2253,20 @@ def test_nrows_and_chunksize_raises_notimplemented(self):
22532253
self.assertRaises(NotImplementedError, self.read_csv, StringIO(data),
22542254
nrows=10, chunksize=5)
22552255

2256+
def test_single_char_leading_whitespace(self):
2257+
# GH 9710
2258+
data = """\
2259+
MyColumn
2260+
a
2261+
b
2262+
a
2263+
b\n"""
2264+
2265+
expected = DataFrame({'MyColumn' : list('abab')})
2266+
2267+
result = self.read_csv(StringIO(data), skipinitialspace=True)
2268+
tm.assert_frame_equal(result, expected)
2269+
22562270

22572271
class TestPythonParser(ParserTests, tm.TestCase):
22582272
def test_negative_skipfooter_raises(self):
@@ -3293,6 +3307,25 @@ def test_buffer_overflow(self):
32933307
except Exception as cperr:
32943308
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
32953309

3310+
def test_single_char_leading_whitespace(self):
3311+
# GH 9710
3312+
data = """\
3313+
MyColumn
3314+
a
3315+
b
3316+
a
3317+
b\n"""
3318+
3319+
expected = DataFrame({'MyColumn' : list('abab')})
3320+
3321+
result = self.read_csv(StringIO(data), delim_whitespace=True,
3322+
skipinitialspace=True)
3323+
tm.assert_frame_equal(result, expected)
3324+
3325+
result = self.read_csv(StringIO(data), lineterminator='\n',
3326+
skipinitialspace=True)
3327+
tm.assert_frame_equal(result, expected)
3328+
32963329
class TestCParserLowMemory(ParserTests, tm.TestCase):
32973330

32983331
def read_csv(self, *args, **kwds):
@@ -3714,6 +3747,25 @@ def test_buffer_overflow(self):
37143747
except Exception as cperr:
37153748
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
37163749

3750+
def test_single_char_leading_whitespace(self):
3751+
# GH 9710
3752+
data = """\
3753+
MyColumn
3754+
a
3755+
b
3756+
a
3757+
b\n"""
3758+
3759+
expected = DataFrame({'MyColumn' : list('abab')})
3760+
3761+
result = self.read_csv(StringIO(data), delim_whitespace=True,
3762+
skipinitialspace=True)
3763+
tm.assert_frame_equal(result, expected)
3764+
3765+
result = self.read_csv(StringIO(data), lineterminator='\n',
3766+
skipinitialspace=True)
3767+
tm.assert_frame_equal(result, expected)
3768+
37173769
class TestMiscellaneous(tm.TestCase):
37183770

37193771
# for tests that don't fit into any of the other classes, e.g. those that

pandas/src/parser/tokenizer.c

+8-6
Original file line numberDiff line numberDiff line change
@@ -851,10 +851,11 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
851851
;
852852
else { // backtrack
853853
/* We have to use i + 1 because buf has been incremented but not i */
854-
while (i + 1 > self->datapos && *buf != '\n') {
854+
do {
855855
--buf;
856856
--i;
857-
}
857+
} while (i + 1 > self->datapos && *buf != '\n');
858+
858859
if (i + 1 > self->datapos) // reached a newline rather than the beginning
859860
{
860861
++buf; // move pointer to first char after newline
@@ -1075,7 +1076,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
10751076
// Next character in file
10761077
c = *buf++;
10771078

1078-
TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
1079+
TRACE(("tokenize_delim_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n",
10791080
i, c, self->file_lines + 1, self->line_fields[self->lines],
10801081
self->state));
10811082

@@ -1168,10 +1169,11 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
11681169
;
11691170
else { // backtrack
11701171
/* We have to use i + 1 because buf has been incremented but not i */
1171-
while (i + 1 > self->datapos && *buf != self->lineterminator) {
1172+
do {
11721173
--buf;
11731174
--i;
1174-
}
1175+
} while (i + 1 > self->datapos && *buf != self->lineterminator);
1176+
11751177
if (i + 1 > self->datapos) // reached a newline rather than the beginning
11761178
{
11771179
++buf; // move pointer to first char after newline
@@ -1338,7 +1340,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
13381340
// Next character in file
13391341
c = *buf++;
13401342

1341-
TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
1343+
TRACE(("tokenize_whitespace - Iter: %d Char: %c Line %d field_count %d, state %d\n",
13421344
i, c, self->file_lines + 1, self->line_fields[self->lines],
13431345
self->state));
13441346

0 commit comments

Comments
 (0)