BUG: read_csv with empty df

Ben Kandel · Ben Kandel · commit b8200e4c35a9 · 2016-11-21T20:48:17.000-05:00
read_csv would fail when the number of header rows passed in equals the total
number of lines in the file (i.e. the file contains only header rows and no
data rows). This commit fixes that bug.
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
@@ -57,5 +57,6 @@ Bug Fixes
 - Bug in ``DataFrame.to_json`` where ``lines=True`` and a value contained a ``}`` character (:issue:`14391`)
 - Bug in ``df.groupby`` causing an ``AttributeError`` when grouping a single index frame by a column and the index level (:issue`14327`)
 - Bug in ``df.groupby`` where ``TypeError`` raised when ``pd.Grouper(key=...)`` is passed in a list (:issue:`14334`)
+- Bug in ``pd.read_csv`` where reading files fails if the number of headers is equal to the number of lines in the file (:issue:`14515`)
 - Bug in ``pd.pivot_table`` may raise ``TypeError`` or ``ValueError`` when ``index`` or ``columns``
   is not scalar and ``values`` is not specified (:issue:`14380`)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -606,6 +606,24 @@ def test_multi_index_no_level_names(self):
         expected = self.read_csv(StringIO(data), index_col=[1, 0])
         tm.assert_frame_equal(df, expected, check_names=False)
 
+    def test_multi_index_blank_df(self):
+        # GH 14515
+        data = """a,b
+"""
+        df = self.read_csv(StringIO(data), header=[0])
+        expected = DataFrame(columns=[('a'),('b')])
+        tm.assert_frame_equal(df, expected)
+        expected_csv = expected.to_csv()
+        round_trip = self.read_csv(StringIO(expected_csv))
+        tm.assert_frame_equal(expected, round_trip)
+
+        data_multiline = """a,b
+c,d
+"""
+        df2 = self.read_csv(StringIO(data_multiline), header=[0,1])
+        expected2 = DataFrame(columns=[('a', 'c'), ('b', 'd')])
+        tm.assert_frame_equal(df2, expected2)
+
     def test_no_unnamed_index(self):
         data = """ id c0 c1 c2
 0 1 0 a b
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -717,7 +717,9 @@ cdef class TextReader:
                     start = self.parser.line_start[0]
 
                 # e.g., if header=3 and file only has 2 lines
-                elif self.parser.lines < hr + 1:
+                if (self.parser.lines < hr + 1
+                    and not isinstance(self.orig_header, list)) or (
+                            self.parser.lines < hr):
                     msg = self.orig_header
                     if isinstance(msg, list):
                         msg = "[%s], len of %d," % (
@@ -940,7 +942,7 @@ cdef class TextReader:
                 raise_parser_error('Error tokenizing data', self.parser)
             footer = self.skipfooter
 
-        if self.parser_start == self.parser.lines:
+        if self.parser_start >= self.parser.lines:
             raise StopIteration
         self._end_clock('Tokenization')