Skip to content

Commit fdecff7

Browse files
author
Evan Wright
committed
Merge commit '639ce04' into issue_9757
2 parents fd3912d + 639ce04 commit fdecff7

File tree

3 files changed

+92
-18
lines changed

3 files changed

+92
-18
lines changed

doc/source/whatsnew/v0.16.1.txt

+4
Original file line numberDiff line numberDiff line change
@@ -96,3 +96,7 @@ Bug Fixes
9696
- Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`)
9797

9898
- Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`)
99+
100+
- Bug in ``read_csv`` and ``read_table`` when using ``skiprows`` parameter if blank lines are present. (:issue:`9832`)
101+
102+
- Bug in csv parser causing lines with initial whitespace plus one non-space character to be skipped. (:issue:`9710`)

pandas/io/tests/test_parsers.py

+74
Original file line numberDiff line numberDiff line change
@@ -839,6 +839,28 @@ def test_deep_skiprows(self):
839839
condensed_data = self.read_csv(StringIO(condensed_text))
840840
tm.assert_frame_equal(data, condensed_data)
841841

842+
def test_skiprows_blank(self):
843+
# GH 9832
844+
text = """#foo,a,b,c
845+
#foo,a,b,c
846+
847+
#foo,a,b,c
848+
#foo,a,b,c
849+
850+
1/1/2000,1.,2.,3.
851+
1/2/2000,4,5,6
852+
1/3/2000,7,8,9
853+
"""
854+
data = self.read_csv(StringIO(text), skiprows=6, header=None,
855+
index_col=0, parse_dates=True)
856+
857+
expected = DataFrame(np.arange(1., 10.).reshape((3, 3)),
858+
columns=[1, 2, 3],
859+
index=[datetime(2000, 1, 1), datetime(2000, 1, 2),
860+
datetime(2000, 1, 3)])
861+
expected.index.name = 0
862+
tm.assert_frame_equal(data, expected)
863+
842864
def test_detect_string_na(self):
843865
data = """A,B
844866
foo,bar
@@ -2231,6 +2253,20 @@ def test_nrows_and_chunksize_raises_notimplemented(self):
22312253
self.assertRaises(NotImplementedError, self.read_csv, StringIO(data),
22322254
nrows=10, chunksize=5)
22332255

2256+
def test_single_char_leading_whitespace(self):
2257+
# GH 9710
2258+
data = """\
2259+
MyColumn
2260+
a
2261+
b
2262+
a
2263+
b\n"""
2264+
2265+
expected = DataFrame({'MyColumn' : list('abab')})
2266+
2267+
result = self.read_csv(StringIO(data), skipinitialspace=True)
2268+
tm.assert_frame_equal(result, expected)
2269+
22342270

22352271
class TestPythonParser(ParserTests, tm.TestCase):
22362272
def test_negative_skipfooter_raises(self):
@@ -3271,6 +3307,25 @@ def test_buffer_overflow(self):
32713307
except Exception as cperr:
32723308
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
32733309

3310+
def test_single_char_leading_whitespace(self):
3311+
# GH 9710
3312+
data = """\
3313+
MyColumn
3314+
a
3315+
b
3316+
a
3317+
b\n"""
3318+
3319+
expected = DataFrame({'MyColumn' : list('abab')})
3320+
3321+
result = self.read_csv(StringIO(data), delim_whitespace=True,
3322+
skipinitialspace=True)
3323+
tm.assert_frame_equal(result, expected)
3324+
3325+
result = self.read_csv(StringIO(data), lineterminator='\n',
3326+
skipinitialspace=True)
3327+
tm.assert_frame_equal(result, expected)
3328+
32743329
class TestCParserLowMemory(ParserTests, tm.TestCase):
32753330

32763331
def read_csv(self, *args, **kwds):
@@ -3692,6 +3747,25 @@ def test_buffer_overflow(self):
36923747
except Exception as cperr:
36933748
self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
36943749

3750+
def test_single_char_leading_whitespace(self):
3751+
# GH 9710
3752+
data = """\
3753+
MyColumn
3754+
a
3755+
b
3756+
a
3757+
b\n"""
3758+
3759+
expected = DataFrame({'MyColumn' : list('abab')})
3760+
3761+
result = self.read_csv(StringIO(data), delim_whitespace=True,
3762+
skipinitialspace=True)
3763+
tm.assert_frame_equal(result, expected)
3764+
3765+
result = self.read_csv(StringIO(data), lineterminator='\n',
3766+
skipinitialspace=True)
3767+
tm.assert_frame_equal(result, expected)
3768+
36953769
class TestMiscellaneous(tm.TestCase):
36963770

36973771
# for tests that don't fit into any of the other classes, e.g. those that

pandas/src/parser/tokenizer.c

+14-18
Original file line numberDiff line numberDiff line change
@@ -757,11 +757,9 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
757757
case START_RECORD:
758758
// start of record
759759
if (skip_this_line(self, self->file_lines)) {
760+
self->state = SKIP_LINE;
760761
if (c == '\n') {
761-
END_LINE()
762-
}
763-
else {
764-
self->state = SKIP_LINE;
762+
END_LINE();
765763
}
766764
break;
767765
}
@@ -853,10 +851,11 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
853851
;
854852
else { // backtrack
855853
/* We have to use i + 1 because buf has been incremented but not i */
856-
while (i + 1 > self->datapos && *buf != '\n') {
854+
do {
857855
--buf;
858856
--i;
859-
}
857+
} while (i + 1 > self->datapos && *buf != '\n');
858+
860859
if (i + 1 > self->datapos) // reached a newline rather than the beginning
861860
{
862861
++buf; // move pointer to first char after newline
@@ -1077,7 +1076,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
10771076
// Next character in file
10781077
c = *buf++;
10791078

1080-
TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
1079+
TRACE(("tokenize_delim_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n",
10811080
i, c, self->file_lines + 1, self->line_fields[self->lines],
10821081
self->state));
10831082

@@ -1093,11 +1092,9 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
10931092
case START_RECORD:
10941093
// start of record
10951094
if (skip_this_line(self, self->file_lines)) {
1095+
self->state = SKIP_LINE;
10961096
if (c == self->lineterminator) {
1097-
END_LINE()
1098-
}
1099-
else {
1100-
self->state = SKIP_LINE;
1097+
END_LINE();
11011098
}
11021099
break;
11031100
}
@@ -1172,10 +1169,11 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
11721169
;
11731170
else { // backtrack
11741171
/* We have to use i + 1 because buf has been incremented but not i */
1175-
while (i + 1 > self->datapos && *buf != self->lineterminator) {
1172+
do {
11761173
--buf;
11771174
--i;
1178-
}
1175+
} while (i + 1 > self->datapos && *buf != self->lineterminator);
1176+
11791177
if (i + 1 > self->datapos) // reached a newline rather than the beginning
11801178
{
11811179
++buf; // move pointer to first char after newline
@@ -1342,7 +1340,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
13421340
// Next character in file
13431341
c = *buf++;
13441342

1345-
TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
1343+
TRACE(("tokenize_whitespace - Iter: %d Char: %c Line %d field_count %d, state %d\n",
13461344
i, c, self->file_lines + 1, self->line_fields[self->lines],
13471345
self->state));
13481346

@@ -1391,11 +1389,9 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
13911389
case START_RECORD:
13921390
// start of record
13931391
if (skip_this_line(self, self->file_lines)) {
1392+
self->state = SKIP_LINE;
13941393
if (c == '\n') {
1395-
END_LINE()
1396-
}
1397-
else {
1398-
self->state = SKIP_LINE;
1394+
END_LINE();
13991395
}
14001396
break;
14011397
} else if (c == '\n') {

0 commit comments

Comments
 (0)