Merge pull request #8984 from selasley/trailing_spaces_fix

jreback · jreback · commit f5a4dfa05e63 · 2014-12-03T16:56:45.000-05:00
BUG: read_csv does not skip rows correctly (skiprows) after a row with trailing spaces, #8983
diff --git a/doc/source/whatsnew/v0.15.2.txt b/doc/source/whatsnew/v0.15.2.txt
@@ -165,7 +165,7 @@ Bug Fixes
   of the level names are numbers (:issue:`8584`).
 - Bug in ``MultiIndex`` where ``__contains__`` returns wrong result if index is
   not lexically sorted or unique (:issue:`7724`)
-- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`)
+- BUG CSV: fix problem with trailing whitespace in skipped rows, (:issue:`8679`), (:issue:`8661`), (:issue:`8983`)
 - Regression in ``Timestamp`` does not parse 'Z' zone designator for UTC (:issue:`8771`)
 
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -3049,17 +3049,17 @@ def test_comment_skiprows(self):
         tm.assert_almost_equal(df.values, expected)
 
     def test_trailing_spaces(self):
-        data = """skip
+        data = """A B C  
 random line with trailing spaces    
 skip
 1,2,3
 1,2.,4.
 random line with trailing tabs\t\t\t
      
-5.,NaN,10.0
+5.1,NaN,10.0
 """
         expected = pd.DataFrame([[1., 2., 4.],
-                    [5., np.nan, 10.]])
+                    [5.1, np.nan, 10.]])
         # this should ignore six lines including lines with trailing 
         # whitespace and blank lines.  issues 8661, 8679
         df = self.read_csv(StringIO(data.replace(',', '  ')), 
@@ -3070,6 +3070,13 @@ def test_trailing_spaces(self):
                              header=None, delim_whitespace=True,
                              skiprows=[0,1,2,3,5,6], skip_blank_lines=True)
         tm.assert_frame_equal(df, expected)
+        # test skipping set of rows after a row with trailing spaces, issue #8983
+        expected = pd.DataFrame({"A":[1., 5.1], "B":[2., np.nan], 
+                                "C":[4., 10]})
+        df = self.read_table(StringIO(data.replace(',', '  ')), 
+                             delim_whitespace=True,
+                             skiprows=[1,2,3,5,6], skip_blank_lines=True)
+        tm.assert_frame_equal(df, expected)
 
     def test_comment_header(self):
         data = """# empty
diff --git a/pandas/src/parser/tokenizer.c b/pandas/src/parser/tokenizer.c
@@ -1324,6 +1324,7 @@ int tokenize_whitespace(parser_t *self, size_t line_limit)
             if (c == '\n') {
                 END_LINE();
                 self->state = START_RECORD;
+                break;
             } else if (c == '\r') {
                 self->state = EAT_CRNL;
                 break;