Skip to content

Commit 44f15ba

Browse files
pankajp authored and TomAugspurger committed
BUG: Fix warning with c engine when skipping lines with comment (#16455)
* Emit the correct warning with the c engine when skipping lines

  Fixed a bug where the c engine would not print warnings for lines it skipped when the skipped line had an inline comment. Also, its count of the number of fields in such lines was off by one.

* Use `tm.capture_stderr` to capture stderr

* Add bug fix note in `whatsnew/v0.20.3.txt`

* Move test to CParserTests

  The behavior is only applicable to the `c` engine.

* Update whatsnew bug entry as per review

(cherry picked from commit 97ad3fb)
1 parent 810cb2d commit 44f15ba

File tree

3 files changed

+33
-0
lines changed

3 files changed

+33
-0
lines changed

doc/source/whatsnew/v0.20.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ Indexing
5656
I/O
5757
^^^
5858

59+
- Bug in pd.read_csv() when comment is passed in space-delimited text files (:issue:`16472`)
5960
- Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
6061
- Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`)
6162

pandas/_libs/src/parser/tokenizer.c

+3
Original file line numberDiff line numberDiff line change
@@ -832,6 +832,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
832832
} else if (IS_CARRIAGE(c)) {
833833
self->state = EAT_CRNL;
834834
break;
835+
} else if (IS_COMMENT_CHAR(c)) {
836+
self->state = EAT_COMMENT;
837+
break;
835838
} else if (!IS_WHITESPACE(c)) {
836839
self->state = START_FIELD;
837840
// fall through to subsequent state

pandas/tests/io/parser/c_parser_only.py

+29
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
further arguments when parsing.
88
"""
99

10+
import sys
11+
1012
import pytest
1113
import numpy as np
1214

@@ -417,3 +419,30 @@ def test_data_after_quote(self):
417419
expected = DataFrame({'a': ['1', 'ba']})
418420

419421
tm.assert_frame_equal(result, expected)
422+
423+
@tm.capture_stderr
424+
def test_comment_whitespace_delimited(self):
425+
test_input = """\
426+
1 2
427+
2 2 3
428+
3 2 3 # 3 fields
429+
4 2 3# 3 fields
430+
5 2 # 2 fields
431+
6 2# 2 fields
432+
7 # 1 field, NaN
433+
8# 1 field, NaN
434+
9 2 3 # skipped line
435+
# comment"""
436+
df = self.read_csv(StringIO(test_input), comment='#', header=None,
437+
delimiter='\\s+', skiprows=0,
438+
error_bad_lines=False)
439+
error = sys.stderr.getvalue()
440+
# skipped lines 2, 3, 4, 9
441+
for line_num in (2, 3, 4, 9):
442+
assert 'Skipping line {}'.format(line_num) in error, error
443+
expected = DataFrame([[1, 2],
444+
[5, 2],
445+
[6, 2],
446+
[7, np.nan],
447+
[8, np.nan]])
448+
tm.assert_frame_equal(df, expected)

0 commit comments

Comments
 (0)