BUG: read_csv skips lines with initial whitespace + one non-space character (GH9710)

evanpw · jreback · commit 2997e70202ba · 2015-04-27T21:00:48.000-04:00
diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt
@@ -198,6 +198,7 @@ Bug Fixes
 - Bug in ``where`` causing incorrect results when upcasting was required (:issue:`9731`)
 - Bug in ``FloatArrayFormatter`` where decision boundary for displaying "small" floats in decimal format is off by one order of magnitude for a given display.precision (:issue:`9764`)
 - Fixed bug where ``DataFrame.plot()`` raised an error when both ``color`` and ``style`` keywords were passed and there was no color symbol in the style strings (:issue:`9671`)
+
 - Bug in ``read_csv`` and ``read_table`` when using ``skip_rows`` parameter if blank lines are present. (:issue:`9832`)
 - Bug in ``read_csv()`` interprets ``index_col=True`` as ``1`` (:issue:`9798`)
 - Bug in index equality comparisons using ``==`` failing on Index/MultiIndex type incompatibility (:issue:`9785`)
@@ -206,6 +207,7 @@ Bug Fixes
 
 - Bug ``GroupBy.size`` doesn't attach index name properly if grouped by ``TimeGrouper`` (:issue:`9925`)
 - Bug causing an exception in slice assignments because ``length_of_indexer`` returns wrong results (:issue:`9995`)
+- Bug in ``read_csv`` causing lines with initial whitespace plus one non-space character to be skipped (:issue:`9710`)
 
 
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2273,6 +2273,20 @@ def test_nrows_and_chunksize_raises_notimplemented(self):
         self.assertRaises(NotImplementedError, self.read_csv, StringIO(data),
                      nrows=10, chunksize=5)
 
+    def test_single_char_leading_whitespace(self):
+        # GH 9710
+        data = """\
+MyColumn
+   a
+   b
+   a
+   b\n"""
+
+        expected = DataFrame({'MyColumn' : list('abab')})
+
+        result = self.read_csv(StringIO(data), skipinitialspace=True)
+        tm.assert_frame_equal(result, expected)
+
 
 class TestPythonParser(ParserTests, tm.TestCase):
     def test_negative_skipfooter_raises(self):
@@ -3313,6 +3327,25 @@ def test_buffer_overflow(self):
             except Exception as cperr:
                 self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
 
+    def test_single_char_leading_whitespace(self):
+        # GH 9710
+        data = """\
+MyColumn
+   a
+   b
+   a
+   b\n"""
+
+        expected = DataFrame({'MyColumn' : list('abab')})
+
+        result = self.read_csv(StringIO(data), delim_whitespace=True,
+                               skipinitialspace=True)
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), lineterminator='\n',
+                               skipinitialspace=True)
+        tm.assert_frame_equal(result, expected)
+
 class TestCParserLowMemory(ParserTests, tm.TestCase):
 
     def read_csv(self, *args, **kwds):
@@ -3734,6 +3767,25 @@ def test_buffer_overflow(self):
             except Exception as cperr:
                 self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr))
 
+    def test_single_char_leading_whitespace(self):
+        # GH 9710
+        data = """\
+MyColumn
+   a
+   b
+   a
+   b\n"""
+
+        expected = DataFrame({'MyColumn' : list('abab')})
+
+        result = self.read_csv(StringIO(data), delim_whitespace=True,
+                               skipinitialspace=True)
+        tm.assert_frame_equal(result, expected)
+
+        result = self.read_csv(StringIO(data), lineterminator='\n',
+                               skipinitialspace=True)
+        tm.assert_frame_equal(result, expected)
+
 class TestMiscellaneous(tm.TestCase):
 
     # for tests that don't fit into any of the other classes, e.g. those that
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -849,10 +849,11 @@ int tokenize_delimited(parser_t *self, size_t line_limit)
                 ;
             else { // backtrack
                 /* We have to use i + 1 because buf has been incremented but not i */
-                while (i + 1 > self->datapos && *buf != '\n') {
+                do {
                     --buf;
                     --i;
-                }
+                } while (i + 1 > self->datapos && *buf != '\n');
+
                 if (i + 1 > self->datapos) // reached a newline rather than the beginning
                 {
                     ++buf; // move pointer to first char after newline
@@ -1073,7 +1074,7 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
         // Next character in file
         c = *buf++;
 
-        TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
+        TRACE(("tokenize_delim_customterm - Iter: %d Char: %c Line %d field_count %d, state %d\n",
                i, c, self->file_lines + 1, self->line_fields[self->lines],
                self->state));
 
@@ -1166,10 +1167,11 @@ int tokenize_delim_customterm(parser_t *self, size_t line_limit)
                 ;
             else { // backtrack
                 /* We have to use i + 1 because buf has been incremented but not i */
-                while (i + 1 > self->datapos && *buf != self->lineterminator) {
+                do {
                     --buf;
                     --i;
-                }
+                } while (i + 1 > self->datapos && *buf != self->lineterminator);
+
                 if (i + 1 > self->datapos) // reached a newline rather than the beginning
                 {
                     ++buf; // move pointer to first char after newline
@@ -1336,7 +1338,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
         // Next character in file
         c = *buf++;
 
-        TRACE(("Iter: %d Char: %c Line %d field_count %d, state %d\n",
+        TRACE(("tokenize_whitespace - Iter: %d Char: %c Line %d field_count %d, state %d\n",
                i, c, self->file_lines + 1, self->line_fields[self->lines],
                self->state));