ENH: support "nrows" and "chunksize" together

toobaz · mattip · commit 69c1533ae415 · 2017-03-30T23:12:36.000+03:00
closes pandas-dev#15755 Author: Pietro Battiston <me@pietrobattiston.it> Closes pandas-dev#15756 from toobaz/nrows_chunksize and squashes the following commits: d0288e3 [Pietro Battiston] ENH: support "nrows" and "chunksize" together
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -291,6 +291,7 @@ Other enhancements
 - ``Series`` provides a ``to_excel`` method to output Excel files (:issue:`8825`)
 - The ``usecols`` argument in ``pd.read_csv`` now accepts a callable function as a value  (:issue:`14154`)
 - The ``skiprows`` argument in ``pd.read_csv`` now accepts a callable function as a value  (:issue:`10882`)
+- The ``nrows`` and ``chunksize`` arguments in ``pd.read_csv()`` can now be used together; iteration stops once ``nrows`` rows have been read (:issue:`6774`, :issue:`15755`)
 - ``pd.DataFrame.plot`` now prints a title above each subplot if ``subplots=True`` and ``title`` is a list of strings (:issue:`14753`)
 - ``pd.Series.interpolate`` now supports timedelta as an index type with ``method='time'`` (:issue:`6424`)
 - ``Timedelta.isoformat`` method added for formatting Timedeltas as an `ISO 8601 duration`_. See the :ref:`Timedelta docs <timedeltas.isoformat>` (:issue:`15136`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -384,29 +384,18 @@ def _read(filepath_or_buffer, kwds):
     # Extract some of the arguments (pass chunksize on).
     iterator = kwds.get('iterator', False)
     chunksize = kwds.get('chunksize', None)
-    nrows = _validate_nrows(kwds.pop('nrows', None))
+    nrows = _validate_nrows(kwds.get('nrows', None))
 
     # Create the parser.
     parser = TextFileReader(filepath_or_buffer, **kwds)
 
-    if (nrows is not None) and (chunksize is not None):
-        raise NotImplementedError("'nrows' and 'chunksize' cannot be used"
-                                  " together yet.")
-    elif nrows is not None:
-        try:
-            data = parser.read(nrows)
-        finally:
-            parser.close()
-        return data
-
-    elif chunksize or iterator:
+    if chunksize or iterator:
         return parser
 
     try:
-        data = parser.read()
+        data = parser.read(nrows)
     finally:
         parser.close()
-
     return data
 
 
@@ -445,7 +434,7 @@ def _read(filepath_or_buffer, kwds):
 
     'usecols': None,
 
-    # 'nrows': None,
+    'nrows': None,
     # 'iterator': False,
     'chunksize': None,
     'verbose': False,
@@ -749,6 +738,7 @@ def __init__(self, f, engine=None, **kwds):
         options = self._get_options_with_defaults(engine)
 
         self.chunksize = options.pop('chunksize', None)
+        self.nrows = options.pop('nrows', None)
         self.squeeze = options.pop('squeeze', False)
 
         # might mutate self.engine
@@ -1009,6 +999,10 @@ def _create_index(self, ret):
     def get_chunk(self, size=None):
         if size is None:
             size = self.chunksize
+        if self.nrows is not None:
+            if self._currow >= self.nrows:
+                raise StopIteration
+            size = min(size, self.nrows - self._currow)
         return self.read(nrows=size)
 
 
diff --git a/pandas/tests/io/parser/common.py b/pandas/tests/io/parser/common.py
@@ -402,6 +402,33 @@ def test_read_chunksize(self):
         tm.assert_frame_equal(chunks[1], df[2:4])
         tm.assert_frame_equal(chunks[2], df[4:])
 
+    def test_read_chunksize_and_nrows(self):
+
+        # gh-15755
+        # With nrows
+        reader = self.read_csv(StringIO(self.data1), index_col=0,
+                               chunksize=2, nrows=5)
+        df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
+
+        tm.assert_frame_equal(pd.concat(reader), df)
+
+        # chunksize > nrows
+        reader = self.read_csv(StringIO(self.data1), index_col=0,
+                               chunksize=8, nrows=5)
+        df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
+
+        tm.assert_frame_equal(pd.concat(reader), df)
+
+        # with changing "size":
+        reader = self.read_csv(StringIO(self.data1), index_col=0,
+                               chunksize=8, nrows=5)
+        df = self.read_csv(StringIO(self.data1), index_col=0, nrows=5)
+
+        tm.assert_frame_equal(reader.get_chunk(size=2), df.iloc[:2])
+        tm.assert_frame_equal(reader.get_chunk(size=4), df.iloc[2:5])
+        with tm.assertRaises(StopIteration):
+            reader.get_chunk(size=3)
+
     def test_read_chunksize_named(self):
         reader = self.read_csv(
             StringIO(self.data1), index_col='index', chunksize=2)
diff --git a/pandas/tests/io/parser/test_unsupported.py b/pandas/tests/io/parser/test_unsupported.py
@@ -29,15 +29,6 @@ def test_mangle_dupe_cols_false(self):
                 read_csv(StringIO(data), engine=engine,
                          mangle_dupe_cols=False)
 
-    def test_nrows_and_chunksize(self):
-        data = 'a b c'
-        msg = "cannot be used together yet"
-
-        for engine in ('c', 'python'):
-            with tm.assertRaisesRegexp(NotImplementedError, msg):
-                read_csv(StringIO(data), engine=engine,
-                         nrows=10, chunksize=5)
-
     def test_c_engine(self):
         # see gh-6607
         data = 'a b c\n1 2 3'