Skip to content

Commit 2997e70

Browse files
evanpw authored and jreback committed
BUG: read_csv skips lines with initial whitespace + one non-space character (GH9710)
1 parent cf62037 commit 2997e70

File tree

3 files changed

+62
-6
lines changed

3 files changed

+62
-6
lines changed

doc/source/whatsnew/v0.16.1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,7 @@ Bug Fixes
198198
- Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`)
199199
- Bug in ``FloatArrayFormatter`` where the decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given ``display.precision`` (:issue:`9764`)
200200
- Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`)
201+
201202
- Bug in ``read_csv`` and ``read_table`` when using the ``skiprows`` parameter if blank lines are present. (:issue:`9832`)
202203
- Bug in ``read_csv()`` where ``index_col=True`` was interpreted as ``1`` (:issue:`9798`)
203204
- Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9785`)
@@ -206,6 +207,7 @@ Bug Fixes
206207

207208
- Bug in ``GroupBy.size`` where the index name is not attached properly when grouped by ``TimeGrouper`` (:issue:`9925`)
208209
- Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`)
210+
- Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`)
209211

210212

211213

pandas/io/tests/test_parsers.py

+52
Original file line numberDiff line numberDiff line change
@@ -2273,6 +2273,20 @@ def test_nrows_and_chunksize_raises_notimplemented(self):
22732273
self.assertRaises(NotImplementedError, self.read_csv, StringIO(data),
22742274
nrows=10, chunksize=5)
22752275

2276+
def test_single_char_leading_whitespace(self):
2277+
# GH 9710
2278+
data = """\
2279+
MyColumn
2280+
a
2281+
b
2282+
a
2283+
b\n"""
2284+
2285+
expected = DataFrame({'MyColumn' : list('abab')})
2286+
2287+
result = self.read_csv(StringIO(data), skipinitialspace=True)
2288+
tm.assert_frame_equal(result, expected)
2289+
22762290

22772291
class TestPythonParser(ParserTests, tm.TestCase):
22782292
def test_negative_skipfooter_raises(self):
@@ -3313,6 +3327,25 @@ def test_buffer_overflow(self):
33133327
except Exception as cperr:
33143328
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
33153329

3330+
def test_single_char_leading_whitespace(self):
3331+
# GH 9710
3332+
data = """\
3333+
MyColumn
3334+
a
3335+
b
3336+
a
3337+
b\n"""
3338+
3339+
expected = DataFrame({'MyColumn' : list('abab')})
3340+
3341+
result = self.read_csv(StringIO(data), delim_whitespace=True,
3342+
skipinitialspace=True)
3343+
tm.assert_frame_equal(result, expected)
3344+
3345+
result = self.read_csv(StringIO(data), lineterminator='\n',
3346+
skipinitialspace=True)
3347+
tm.assert_frame_equal(result, expected)
3348+
33163349
class TestCParserLowMemory(ParserTests, tm.TestCase):
33173350

33183351
def read_csv(self, *args, **kwds):
@@ -3734,6 +3767,25 @@ def test_buffer_overflow(self):
37343767
except Exception as cperr:
37353768
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
37363769

3770+
def test_single_char_leading_whitespace(self):
3771+
# GH 9710
3772+
data = """\
3773+
MyColumn
3774+
a
3775+
b
3776+
a
3777+
b\n"""
3778+
3779+
expected = DataFrame({'MyColumn' : list('abab')})
3780+
3781+
result = self.read_csv(StringIO(data), delim_whitespace=True,
3782+
skipinitialspace=True)
3783+
tm.assert_frame_equal(result, expected)
3784+
3785+
result = self.read_csv(StringIO(data), lineterminator='\n',
3786+
skipinitialspace=True)
3787+
tm.assert_frame_equal(result, expected)
3788+
37373789
class TestMiscellaneous(tm.TestCase):
37383790

37393791
# for tests that don't fit into any of the other classes, e.g. those that

pandas/src/parser/tokenizer.c

+8 -6
Original file line numberDiff line numberDiff line change
@@ -849,10 +849,11 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
849849
;
850850
else { // backtrack
851851
/* We have to use i + 1 because buf has been incremented but not i */
852-
while (i + 1 > self->datapos && *buf != '\n') {
852+
do {
853853
--buf;
854854
--i;
855-
}
855+
} while (i + 1 > self->datapos && *buf != '\n');
856+
856857
if (i + 1 > self->datapos) // reached a newline rather than the beginning
857858
{
858859
++buf; // move pointer to first char after newline
@@ -1073,7 +1074,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
10731074
// Next character in file
10741075
c = *buf++;
10751076

1076-
TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
1077+
TRACE(("tokenize_delim_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n",
10771078
i, c, self->file_lines + 1, self->line_fields[self->lines],
10781079
self->state));
10791080

@@ -1166,10 +1167,11 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
11661167
;
11671168
else { // backtrack
11681169
/* We have to use i + 1 because buf has been incremented but not i */
1169-
while (i + 1 > self->datapos && *buf != self->lineterminator) {
1170+
do {
11701171
--buf;
11711172
--i;
1172-
}
1173+
} while (i + 1 > self->datapos && *buf != self->lineterminator);
1174+
11731175
if (i + 1 > self->datapos) // reached a newline rather than the beginning
11741176
{
11751177
++buf; // move pointer to first char after newline
@@ -1336,7 +1338,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
13361338
// Next character in file
13371339
c = *buf++;
13381340

1339-
TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
1341+
TRACE(("tokenize_whitespace - Iter: %d Char: %c Line %d field_count %d, state %d\n",
13401342
i, c, self->file_lines + 1, self->line_fields[self->lines],
13411343
self->state));
13421344

0 commit comments

Comments
 (0)