Merge pull request #10827 from evanpw/empty_nrows_chunksize

jreback · jreback · commit 09c4f73d7f33 · 2015-08-15T18:22:36.000-04:00
Bug in read_csv when using nrows or chunksize on a file containing only a header
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -654,6 +654,8 @@ Bug Fixes
 - Bug in vectorised setting of timestamp columns with python ``datetime.date`` and numpy ``datetime64`` (:issue:`10408`, :issue:`10412`)
 - Bug in ``Index.take`` may add unnecessary ``freq`` attribute (:issue:`10791`)
 
+- Bug in ``read_csv`` when using the ``nrows`` or ``chunksize`` parameters if the file contains only a header line (:issue:`9535`)
+
 - Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`)
 - Bug in ``pd.unique`` for arrays with the ``datetime64`` or ``timedelta64`` dtype that meant an array with object dtype was returned instead the original dtype (:issue: `9431`)
 - Bug in ``DatetimeIndex.take`` and ``TimedeltaIndex.take`` may not raise ``IndexError`` against invalid index (:issue:`10295`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -802,6 +802,8 @@ def __init__(self, kwds):
 
         self._name_processed = False
 
+        self._first_chunk = True
+
     @property
     def _has_complex_date_col(self):
         return (isinstance(self.parse_dates, dict) or
@@ -1164,21 +1166,25 @@ def set_error_bad_lines(self, status):
         self._reader.set_error_bad_lines(int(status))
 
     def read(self, nrows=None):
-        if self.as_recarray:
-            # what to do if there are leading columns?
-            return self._reader.read(nrows)
-
         try:
             data = self._reader.read(nrows)
         except StopIteration:
-            if nrows is None:
+            if self._first_chunk:
+                self._first_chunk = False
                 return _get_empty_meta(self.orig_names,
                                        self.index_col,
                                        self.index_names,
                                        dtype=self.kwds.get('dtype'))
             else:
                 raise
 
+        # Done with first read, next time raise StopIteration
+        self._first_chunk = False
+
+        if self.as_recarray:
+            # what to do if there are leading columns?
+            return data
+
         names = self.names
 
         if self._reader.leading_cols:
@@ -1454,7 +1460,6 @@ def __init__(self, f, **kwds):
             self._name_processed = True
             if self.index_names is None:
                 self.index_names = index_names
-        self._first_chunk = True
 
         if self.parse_dates:
             self._no_thousands_columns = self._set_no_thousands_columns()
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -2415,6 +2415,25 @@ def test_int64_overflow(self):
             expected = pd.DataFrame([str(x)])
             tm.assert_frame_equal(result, expected)
 
+    def test_empty_with_nrows_chunksize(self):
+        # GH 9535
+        expected = pd.DataFrame([], columns=['foo', 'bar'])
+
+        result = self.read_csv(StringIO('foo,bar\n'), nrows=10)
+        tm.assert_frame_equal(result, expected)
+
+        result = next(iter(pd.read_csv(StringIO('foo,bar\n'), chunksize=10)))
+        tm.assert_frame_equal(result, expected)
+
+        result = pd.read_csv(StringIO('foo,bar\n'), nrows=10, as_recarray=True)
+        result = pd.DataFrame(result[2], columns=result[1], index=result[0])
+        tm.assert_frame_equal(pd.DataFrame.from_records(result), expected)
+
+        result = next(iter(pd.read_csv(StringIO('foo,bar\n'), chunksize=10, as_recarray=True)))
+        result = pd.DataFrame(result[2], columns=result[1], index=result[0])
+        tm.assert_frame_equal(pd.DataFrame.from_records(result), expected)
+
+
 
 class TestPythonParser(ParserTests, tm.TestCase):
     def test_negative_skipfooter_raises(self):