Skip to content

Commit 75287ae

Browse files
committed
ENH: support "nrows" and "chunksize" together
1 parent 92239f5 commit 75287ae

File tree

4 files changed

+17
-24
lines changed

4 files changed

+17
-24
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@ Other enhancements
291291
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
292292
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
293293
- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`)
294+
- The ``nrows`` and ``chunksize`` arguments in ``pd.read_csv`` are no longer incompatible and can now be used together (:issue:`15755`)
294295
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
295296
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
296297
- ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs <timedeltas.isoformat>` (:issue:`15136`)

pandas/io/parsers.py

+9-15
Original file line numberDiff line numberDiff line change
@@ -384,29 +384,18 @@ def _read(filepath_or_buffer, kwds):
384384
# Extract some of the arguments (pass chunksize on).
385385
iterator = kwds.get('iterator', False)
386386
chunksize = kwds.get('chunksize', None)
387-
nrows = _validate_nrows(kwds.pop('nrows', None))
387+
nrows = _validate_nrows(kwds.get('nrows', None))
388388

389389
# Create the parser.
390390
parser = TextFileReader(filepath_or_buffer, **kwds)
391391

392-
if (nrows is not None) and (chunksize is not None):
393-
raise NotImplementedError("'nrows' and 'chunksize' cannot be used"
394-
" together yet.")
395-
elif nrows is not None:
396-
try:
397-
data = parser.read(nrows)
398-
finally:
399-
parser.close()
400-
return data
401-
402-
elif chunksize or iterator:
392+
if chunksize or iterator:
403393
return parser
404394

405395
try:
406-
data = parser.read()
396+
data = parser.read(nrows)
407397
finally:
408398
parser.close()
409-
410399
return data
411400

412401

@@ -445,7 +434,7 @@ def _read(filepath_or_buffer, kwds):
445434

446435
'usecols': None,
447436

448-
# 'nrows': None,
437+
'nrows': None,
449438
# 'iterator': False,
450439
'chunksize': None,
451440
'verbose': False,
@@ -749,6 +738,7 @@ def __init__(self, f, engine=None, **kwds):
749738
options = self._get_options_with_defaults(engine)
750739

751740
self.chunksize = options.pop('chunksize', None)
741+
self.nrows = options.pop('nrows', None)
752742
self.squeeze = options.pop('squeeze', False)
753743

754744
# might mutate self.engine
@@ -1009,6 +999,10 @@ def _create_index(self, ret):
1009999
def get_chunk(self, size=None):
10101000
if size is None:
10111001
size = self.chunksize
1002+
if self.nrows is not None:
1003+
if self._currow >= self.nrows:
1004+
raise StopIteration
1005+
size = min(size, self.nrows - self._currow)
10121006
return self.read(nrows=size)
10131007

10141008

pandas/tests/io/parser/common.py

+7
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,13 @@ def test_read_chunksize(self):
402402
tm.assert_frame_equal(chunks[1], df[2:4])
403403
tm.assert_frame_equal(chunks[2], df[4:])
404404

405+
# With nrows
406+
reader = self.read_csv(StringIO(self.data1), index_col=0,
407+
chunksize=2, nrows=5)
408+
df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
409+
410+
tm.assert_frame_equal(pd.concat(reader), df)
411+
405412
def test_read_chunksize_named(self):
406413
reader = self.read_csv(
407414
StringIO(self.data1), index_col='index', chunksize=2)

pandas/tests/io/parser/test_unsupported.py

-9
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,6 @@ def test_mangle_dupe_cols_false(self):
2929
read_csv(StringIO(data), engine=engine,
3030
mangle_dupe_cols=False)
3131

32-
def test_nrows_and_chunksize(self):
33-
data = 'a b c'
34-
msg = "cannot be used together yet"
35-
36-
for engine in ('c', 'python'):
37-
with tm.assertRaisesRegexp(NotImplementedError, msg):
38-
read_csv(StringIO(data), engine=engine,
39-
nrows=10, chunksize=5)
40-
4132
def test_c_engine(self):
4233
# see gh-6607
4334
data = 'a b c\n1 2 3'

0 commit comments

Comments
 (0)