BUG: Fix warning with c engine when skipping lines with comment (#16455)

pankajp · TomAugspurger · commit 44f15ba99b6e · 2017-05-30T07:23:45.000-05:00
* Fix warning with c engine when skipping lines. Fixed a bug where the c engine would not print warnings for lines it skipped when the skipped line had an inline comment. Also, its count of the number of fields in such lines would be off by one. * Use `tm.capture_stderr` to capture stderr * Add bug fix note in `whatsnew/v0.20.3.txt` * Move test to CParserTests, since the behavior is only applicable to the `c` engine. * Update whatsnew bug entry as per review (cherry picked from commit 97ad3fb)
diff --git a/doc/source/whatsnew/v0.20.2.txt b/doc/source/whatsnew/v0.20.2.txt
@@ -56,6 +56,7 @@ Indexing
 I/O
 ^^^
 
+- Bug in pd.read_csv() when comment is passed in space-delimited text files (:issue:`16472`)
 - Bug that would force importing of the clipboard routines unnecessarily, potentially causing an import error on startup (:issue:`16288`)
 - Bug that raised IndexError HTML-rendering an empty DataFrame (:issue:`15953`)
 
diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c
@@ -832,6 +832,9 @@ int tokenize_bytes(parser_t *self, size_t line_limit, int start_lines) {
                 } else if (IS_CARRIAGE(c)) {
                     self->state = EAT_CRNL;
                     break;
+                } else if (IS_COMMENT_CHAR(c)) {
+                    self->state = EAT_COMMENT;
+                    break;
                 } else if (!IS_WHITESPACE(c)) {
                     self->state = START_FIELD;
                     // fall through to subsequent state
diff --git a/pandas/tests/io/parser/c_parser_only.py b/pandas/tests/io/parser/c_parser_only.py
@@ -7,6 +7,8 @@
 further arguments when parsing.
 """
 
+import sys
+
 import pytest
 import numpy as np
 
@@ -417,3 +419,30 @@ def test_data_after_quote(self):
         expected = DataFrame({'a': ['1', 'ba']})
 
         tm.assert_frame_equal(result, expected)
+
+    @tm.capture_stderr
+    def test_comment_whitespace_delimited(self):
+        test_input = """\
+1 2
+2 2 3
+3 2 3 # 3 fields
+4 2 3# 3 fields
+5 2 # 2 fields
+6 2# 2 fields
+7 # 1 field, NaN
+8# 1 field, NaN
+9 2 3 # skipped line
+# comment"""
+        df = self.read_csv(StringIO(test_input), comment='#', header=None,
+                           delimiter='\\s+', skiprows=0,
+                           error_bad_lines=False)
+        error = sys.stderr.getvalue()
+        # skipped lines 2, 3, 4, 9
+        for line_num in (2, 3, 4, 9):
+            assert 'Skipping line {}'.format(line_num) in error, error
+        expected = DataFrame([[1, 2],
+                              [5, 2],
+                              [6, 2],
+                              [7, np.nan],
+                              [8, np.nan]])
+        tm.assert_frame_equal(df, expected)