From d0288e39ace949d5456d6d16b7e6dbca98067773 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Wed, 8 Mar 2017 18:13:30 +0100 Subject: [PATCH] ENH: support "nrows" and "chunksize" together --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/io/parsers.py | 24 ++++++++-------------- pandas/tests/io/parser/common.py | 24 ++++++++++++++++++++++ pandas/tests/io/parser/test_unsupported.py | 9 -------- 4 files changed, 34 insertions(+), 24 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index d036049e3ffdb..2be809fbf548d 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -291,6 +291,7 @@ Other enhancements - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`) - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`14154`) - The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value (:issue:`10882`) +- The ``nrows`` and ``chunksize`` arguments in ``pd.read_csv`` are supported if both are passed (:issue:`15755`) - ``pd.DataFrame.plot`` now prints a title above each subplot if ``suplots=True`` and ``title`` is a list of strings (:issue:`14753`) - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`) - ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs ` (:issue:`15136`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9aedddc811830..18343670fb39e 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -384,29 +384,18 @@ def _read(filepath_or_buffer, kwds): # Extract some of the arguments (pass chunksize on). iterator = kwds.get('iterator', False) chunksize = kwds.get('chunksize', None) - nrows = _validate_nrows(kwds.pop('nrows', None)) + nrows = _validate_nrows(kwds.get('nrows', None)) # Create the parser. parser = TextFileReader(filepath_or_buffer, **kwds) - if (nrows is not None) and (chunksize is not None): - raise NotImplementedError("'nrows' and 'chunksize' cannot be used" - " together yet.") - elif nrows is not None: - try: - data = parser.read(nrows) - finally: - parser.close() - return data - - elif chunksize or iterator: + if chunksize or iterator: return parser try: - data = parser.read() + data = parser.read(nrows) finally: parser.close() - return data @@ -445,7 +434,7 @@ def _read(filepath_or_buffer, kwds): 'usecols': None, - # 'nrows': None, + 'nrows': None, # 'iterator': False, 'chunksize': None, 'verbose': False, @@ -749,6 +738,7 @@ def __init__(self, f, engine=None, **kwds): options = self._get_options_with_defaults(engine) self.chunksize = options.pop('chunksize', None) + self.nrows = options.pop('nrows', None) self.squeeze = options.pop('squeeze', False) # might mutate self.engine @@ -1009,6 +999,10 @@ def _create_index(self, ret): def get_chunk(self, size=None): if size is None: size = self.chunksize + if self.nrows is not None: + if self._currow >= self.nrows: + raise StopIteration + size = min(size, self.nrows - self._currow) return self.read(nrows=size) diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py index df75d14e9702d..120d2bb8f6759 100644 --- a/pandas/tests/io/parser/common.py +++ b/pandas/tests/io/parser/common.py @@ -402,6 +402,30 @@ def test_read_chunksize(self): tm.assert_frame_equal(chunks[1], df[2:4]) tm.assert_frame_equal(chunks[2], df[4:]) + # With nrows + reader = self.read_csv(StringIO(self.data1), index_col=0, + chunksize=2, nrows=5) + df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5) + + tm.assert_frame_equal(pd.concat(reader), df) + + # chunksize > nrows + reader = self.read_csv(StringIO(self.data1), index_col=0, + chunksize=8, nrows=5) + df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5) + + tm.assert_frame_equal(pd.concat(reader), df) + + # with changing "size": + reader = self.read_csv(StringIO(self.data1), index_col=0, + chunksize=8, nrows=5) + df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5) + + tm.assert_frame_equal(reader.get_chunk(size=2), df.iloc[:2]) + tm.assert_frame_equal(reader.get_chunk(size=4), df.iloc[2:5]) + with tm.assertRaises(StopIteration): + reader.get_chunk(size=3) + def test_read_chunksize_named(self): reader = self.read_csv( StringIO(self.data1), index_col='index', chunksize=2) diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py index 999db47cf2eaf..48dd5d4ba506b 100644 --- a/pandas/tests/io/parser/test_unsupported.py +++ b/pandas/tests/io/parser/test_unsupported.py @@ -29,15 +29,6 @@ def test_mangle_dupe_cols_false(self): read_csv(StringIO(data), engine=engine, mangle_dupe_cols=False) - def test_nrows_and_chunksize(self): - data = 'a b c' - msg = "cannot be used together yet" - - for engine in ('c', 'python'): - with tm.assertRaisesRegexp(NotImplementedError, msg): - read_csv(StringIO(data), engine=engine, - nrows=10, chunksize=5) - def test_c_engine(self): # see gh-6607 data = 'a b c\n1 2 3'