Skip to content

Commit f5a4dfa

Browse files
committed
Merge pull request #8984 from selasley/trailing_spaces_fix
BUG in read_csv skipping rows after a row with trailing spaces, #8983
2 parents: 2ef795e + 0756858; commit: f5a4dfa

File tree

3 files changed: +12 -4 lines changed

doc/source/whatsnew/v0.15.2.txt

+1 -1
Original file line number | Diff line number | Diff line change
@@ -165,7 +165,7 @@ Bug Fixes
165165
of the level names are numbers (:issue:`8584`).
166166
- Bug in ``MultiIndex`` where ``__contains__`` returns wrong result if index is
167167
not lexically sorted or unique (:issue:`7724`)
168-
- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`)
168+
- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`), (:issue:`8983`)
169169
- Regression in ``Timestamp`` does not parse 'Z' zone designator for UTC (:issue:`8771`)
170170

171171

pandas/io/tests/test_parsers.py

+10 -3
Original file line number | Diff line number | Diff line change
@@ -3049,17 +3049,17 @@ def test_comment_skiprows(self):
30493049
tm.assert_almost_equal(df.values, expected)
30503050

30513051
def test_trailing_spaces(self):
3052-
data = """skip
3052+
data = """A B C
30533053
random line with trailing spaces
30543054
skip
30553055
1,2,3
30563056
1,2.,4.
30573057
random line with trailing tabs\t\t\t
30583058
3059-
5.,NaN,10.0
3059+
5.1,NaN,10.0
30603060
"""
30613061
expected = pd.DataFrame([[1., 2., 4.],
3062-
[5., np.nan, 10.]])
3062+
[5.1, np.nan, 10.]])
30633063
# this should ignore six lines including lines with trailing
30643064
# whitespace and blank lines. issues 8661, 8679
30653065
df = self.read_csv(StringIO(data.replace(',', ' ')),
@@ -3070,6 +3070,13 @@ def test_trailing_spaces(self):
30703070
header=None, delim_whitespace=True,
30713071
skiprows=[0,1,2,3,5,6], skip_blank_lines=True)
30723072
tm.assert_frame_equal(df, expected)
3073+
# test skipping set of rows after a row with trailing spaces, issue #8983
3074+
expected = pd.DataFrame({"A":[1., 5.1], "B":[2., np.nan],
3075+
"C":[4., 10]})
3076+
df = self.read_table(StringIO(data.replace(',', ' ')),
3077+
delim_whitespace=True,
3078+
skiprows=[1,2,3,5,6], skip_blank_lines=True)
3079+
tm.assert_frame_equal(df, expected)
30733080

30743081
def test_comment_header(self):
30753082
data = """# empty

pandas/src/parser/tokenizer.c

+1
Original file line number | Diff line number | Diff line change
@@ -1324,6 +1324,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
13241324
if (c == '\n') {
13251325
END_LINE();
13261326
self->state = START_RECORD;
1327+
break;
13271328
} else if (c == '\r') {
13281329
self->state = EAT_CRNL;
13291330
break;

0 commit comments

Comments
 (0)