Skip to content

Commit 09c4f73

Browse files
committed
Merge pull request #10827 from evanpw/empty_nrows_chunksize
Bug in read_csv when using nrows or chunksize on a file containing only a header
2 parents 58db03e + 661b7d7 commit 09c4f73

File tree

3 files changed

+32
-6
lines changed

3 files changed

+32
-6
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -654,6 +654,8 @@ Bug Fixes
654654
- Bug in vectorised setting of timestamp columns with python ``datetime.date`` and numpy ``datetime64`` (:issue:`10408`, :issue:`10412`)
655655
- Bug in ``Index.take`` may add unnecessary ``freq`` attribute (:issue:`10791`)
656656

657+
- Bug in ``read_csv`` when using the ``nrows`` or ``chunksize`` parameters if the file contains only a header line (:issue:`9535`)
658+
657659
- Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`)
658660
- Bug in ``pd.unique`` for arrays with the ``datetime64`` or ``timedelta64`` dtype that meant an array with object dtype was returned instead of the original dtype (:issue:`9431`)
659661
- Bug in ``DatetimeIndex.take`` and ``TimedeltaIndex.take`` may not raise ``IndexError`` against invalid index (:issue:`10295`)

pandas/io/parsers.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -802,6 +802,8 @@ def __init__(self, kwds):
802802

803803
self._name_processed = False
804804

805+
self._first_chunk = True
806+
805807
@property
806808
def _has_complex_date_col(self):
807809
return (isinstance(self.parse_dates, dict) or
@@ -1164,21 +1166,25 @@ def set_error_bad_lines(self, status):
11641166
self._reader.set_error_bad_lines(int(status))
11651167

11661168
def read(self, nrows=None):
1167-
if self.as_recarray:
1168-
# what to do if there are leading columns?
1169-
return self._reader.read(nrows)
1170-
11711169
try:
11721170
data = self._reader.read(nrows)
11731171
except StopIteration:
1174-
if nrows is None:
1172+
if self._first_chunk:
1173+
self._first_chunk = False
11751174
return _get_empty_meta(self.orig_names,
11761175
self.index_col,
11771176
self.index_names,
11781177
dtype=self.kwds.get('dtype'))
11791178
else:
11801179
raise
11811180

1181+
# Done with first read, next time raise StopIteration
1182+
self._first_chunk = False
1183+
1184+
if self.as_recarray:
1185+
# what to do if there are leading columns?
1186+
return data
1187+
11821188
names = self.names
11831189

11841190
if self._reader.leading_cols:
@@ -1454,7 +1460,6 @@ def __init__(self, f, **kwds):
14541460
self._name_processed = True
14551461
if self.index_names is None:
14561462
self.index_names = index_names
1457-
self._first_chunk = True
14581463

14591464
if self.parse_dates:
14601465
self._no_thousands_columns = self._set_no_thousands_columns()

pandas/io/tests/test_parsers.py

+19
Original file line numberDiff line numberDiff line change
@@ -2415,6 +2415,25 @@ def test_int64_overflow(self):
24152415
expected = pd.DataFrame([str(x)])
24162416
tm.assert_frame_equal(result, expected)
24172417

2418+
def test_empty_with_nrows_chunksize(self):
2419+
# GH 9535
2420+
expected = pd.DataFrame([], columns=['foo', 'bar'])
2421+
2422+
result = self.read_csv(StringIO('foo,bar\n'), nrows=10)
2423+
tm.assert_frame_equal(result, expected)
2424+
2425+
result = next(iter(pd.read_csv(StringIO('foo,bar\n'), chunksize=10)))
2426+
tm.assert_frame_equal(result, expected)
2427+
2428+
result = pd.read_csv(StringIO('foo,bar\n'), nrows=10, as_recarray=True)
2429+
result = pd.DataFrame(result[2], columns=result[1], index=result[0])
2430+
tm.assert_frame_equal(pd.DataFrame.from_records(result), expected)
2431+
2432+
result = next(iter(pd.read_csv(StringIO('foo,bar\n'), chunksize=10, as_recarray=True)))
2433+
result = pd.DataFrame(result[2], columns=result[1], index=result[0])
2434+
tm.assert_frame_equal(pd.DataFrame.from_records(result), expected)
2435+
2436+
24182437

24192438
class TestPythonParser(ParserTests, tm.TestCase):
24202439
def test_negative_skipfooter_raises(self):

0 commit comments

Comments
 (0)