Skip to content

Commit 69c1533

Browse files
toobaz authored and mattip committed
ENH: support "nrows" and "chunksize" together
closes pandas-dev#15755

Author: Pietro Battiston <[email protected]>

Closes pandas-dev#15756 from toobaz/nrows_chunksize and squashes the following commits:

d0288e3 [Pietro Battiston] ENH: support "nrows" and "chunksize" together
1 parent a417981 commit 69c1533

File tree

4 files changed

+37
-24
lines changed

4 files changed

+37
-24
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -291,6 +291,7 @@ Other enhancements
291291
- ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
292292
- The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`)
293293
- The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`)
294+
- The ``nrows`` and ``chunksize`` arguments in ``pd.read_csv()`` can now be used together (:issue:`6774`, :issue:`15755`)
294295
- ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
295296
- ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
296297
- ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs <timedeltas.isoformat>` (:issue:`15136`)

pandas/io/parsers.py

+9-15
Original file line numberDiff line numberDiff line change
@@ -384,29 +384,18 @@ def _read(filepath_or_buffer, kwds):
384384
# Extract some of the arguments (pass chunksize on).
385385
iterator = kwds.get('iterator', False)
386386
chunksize = kwds.get('chunksize', None)
387-
nrows = _validate_nrows(kwds.pop('nrows', None))
387+
nrows = _validate_nrows(kwds.get('nrows', None))
388388

389389
# Create the parser.
390390
parser = TextFileReader(filepath_or_buffer, **kwds)
391391

392-
if (nrows is not None) and (chunksize is not None):
393-
raise NotImplementedError("'nrows' and 'chunksize' cannot be used"
394-
" together yet.")
395-
elif nrows is not None:
396-
try:
397-
data = parser.read(nrows)
398-
finally:
399-
parser.close()
400-
return data
401-
402-
elif chunksize or iterator:
392+
if chunksize or iterator:
403393
return parser
404394

405395
try:
406-
data = parser.read()
396+
data = parser.read(nrows)
407397
finally:
408398
parser.close()
409-
410399
return data
411400

412401

@@ -445,7 +434,7 @@ def _read(filepath_or_buffer, kwds):
445434

446435
'usecols': None,
447436

448-
# 'nrows': None,
437+
'nrows': None,
449438
# 'iterator': False,
450439
'chunksize': None,
451440
'verbose': False,
@@ -749,6 +738,7 @@ def __init__(self, f, engine=None, **kwds):
749738
options = self._get_options_with_defaults(engine)
750739

751740
self.chunksize = options.pop('chunksize', None)
741+
self.nrows = options.pop('nrows', None)
752742
self.squeeze = options.pop('squeeze', False)
753743

754744
# might mutate self.engine
@@ -1009,6 +999,10 @@ def _create_index(self, ret):
1009999
def get_chunk(self, size=None):
10101000
if size is None:
10111001
size = self.chunksize
1002+
if self.nrows is not None:
1003+
if self._currow >= self.nrows:
1004+
raise StopIteration
1005+
size = min(size, self.nrows - self._currow)
10121006
return self.read(nrows=size)
10131007

10141008

pandas/tests/io/parser/common.py

+27
Original file line numberDiff line numberDiff line change
@@ -402,6 +402,33 @@ def test_read_chunksize(self):
402402
tm.assert_frame_equal(chunks[1], df[2:4])
403403
tm.assert_frame_equal(chunks[2], df[4:])
404404

405+
def test_read_chunksize_and_nrows(self):
406+
407+
# gh-15755
408+
# With nrows
409+
reader = self.read_csv(StringIO(self.data1), index_col=0,
410+
chunksize=2, nrows=5)
411+
df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
412+
413+
tm.assert_frame_equal(pd.concat(reader), df)
414+
415+
# chunksize > nrows
416+
reader = self.read_csv(StringIO(self.data1), index_col=0,
417+
chunksize=8, nrows=5)
418+
df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
419+
420+
tm.assert_frame_equal(pd.concat(reader), df)
421+
422+
# with changing "size":
423+
reader = self.read_csv(StringIO(self.data1), index_col=0,
424+
chunksize=8, nrows=5)
425+
df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
426+
427+
tm.assert_frame_equal(reader.get_chunk(size=2), df.iloc[:2])
428+
tm.assert_frame_equal(reader.get_chunk(size=4), df.iloc[2:5])
429+
with tm.assertRaises(StopIteration):
430+
reader.get_chunk(size=3)
431+
405432
def test_read_chunksize_named(self):
406433
reader = self.read_csv(
407434
StringIO(self.data1), index_col='index', chunksize=2)

pandas/tests/io/parser/test_unsupported.py

-9
Original file line numberDiff line numberDiff line change
@@ -29,15 +29,6 @@ def test_mangle_dupe_cols_false(self):
2929
read_csv(StringIO(data), engine=engine,
3030
mangle_dupe_cols=False)
3131

32-
def test_nrows_and_chunksize(self):
33-
data = 'a b c'
34-
msg = "cannot be used together yet"
35-
36-
for engine in ('c', 'python'):
37-
with tm.assertRaisesRegexp(NotImplementedError, msg):
38-
read_csv(StringIO(data), engine=engine,
39-
nrows=10, chunksize=5)
40-
4132
def test_c_engine(self):
4233
# see gh-6607
4334
data = 'a b c\n1 2 3'

0 commit comments

Comments
 (0)