Add nrows parameter to pandas.read_excel() (#18507)

alysivji · jreback · commit 1a46dba2449c · 2017-12-09T10:12:34.000-05:00
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -135,7 +135,7 @@ Other Enhancements
 - Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
 - :func:`DataFrame.corrwith` now silently drops non-numeric columns when passed a Series. Before, an exception was raised (:issue:`18570`).
 - :class:`IntervalIndex` now supports time zone aware ``Interval`` objects (:issue:`18537`, :issue:`18538`)
-
+- :func:`read_excel()` has gained the ``nrows`` parameter (:issue:`16645`)
 
 .. _whatsnew_0220.api_breaking:
 
@@ -188,6 +188,7 @@ Other API Changes
 - :func:`pandas.DataFrame.merge` no longer casts a ``float`` column to ``object`` when merging on ``int`` and ``float`` columns (:issue:`16572`)
 - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`)
 - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`)
+- Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:issue:`16672`)
 
 .. _whatsnew_0220.deprecations:
 
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -70,31 +70,37 @@
     * None -> All sheets as a dictionary of DataFrames
 
 sheetname : string, int, mixed list of strings/ints, or None, default 0
+
     .. deprecated:: 0.21.0
        Use `sheet_name` instead
 
 header : int, list of ints, default 0
     Row (0-indexed) to use for the column labels of the parsed
     DataFrame. If a list of integers is passed those row positions will
     be combined into a ``MultiIndex``. Use None if there is no header.
-skiprows : list-like
-    Rows to skip at the beginning (0-indexed)
-skip_footer : int, default 0
-    Rows at the end to skip (0-indexed)
+names : array-like, default None
+    List of column names to use. If file contains no header row,
+    then you should explicitly pass header=None
 index_col : int, list of ints, default None
     Column (0-indexed) to use as the row labels of the DataFrame.
     Pass None if there is no such column.  If a list is passed,
     those columns will be combined into a ``MultiIndex``.  If a
     subset of data is selected with ``usecols``, index_col
     is based on the subset.
-names : array-like, default None
-    List of column names to use. If file contains no header row,
-    then you should explicitly pass header=None
-converters : dict, default None
-    Dict of functions for converting values in certain columns. Keys can
-    either be integers or column labels, values are functions that take one
-    input argument, the Excel cell content, and return the transformed
-    content.
+parse_cols : int or list, default None
+
+    .. deprecated:: 0.21.0
+       Pass in `usecols` instead.
+
+usecols : int or list, default None
+    * If None then parse all columns,
+    * If int then indicates last column to be parsed
+    * If list of ints then indicates list of column numbers to be parsed
+    * If string then indicates comma separated list of Excel column letters and
+      column ranges (e.g. "A:E" or "A,C,E:F").  Ranges are inclusive of
+      both sides.
+squeeze : boolean, default False
+    If the parsed data only contains one column then return a Series
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     Use `object` to preserve data as stored in Excel and not interpret dtype.
@@ -103,6 +109,14 @@
 
     .. versionadded:: 0.20.0
 
+engine : string, default None
+    If io is not a buffer or path, this must be set to identify io.
+    Acceptable values are None or xlrd
+converters : dict, default None
+    Dict of functions for converting values in certain columns. Keys can
+    either be integers or column labels, values are functions that take one
+    input argument, the Excel cell content, and return the transformed
+    content.
 true_values : list, default None
     Values to consider as True
 
@@ -113,36 +127,29 @@
 
     .. versionadded:: 0.19.0
 
-parse_cols : int or list, default None
-    .. deprecated:: 0.21.0
-       Pass in `usecols` instead.
+skiprows : list-like
+    Rows to skip at the beginning (0-indexed)
+nrows : int, default None
+    Number of rows to parse
+
+    .. versionadded:: 0.22.0
 
-usecols : int or list, default None
-    * If None then parse all columns,
-    * If int then indicates last column to be parsed
-    * If list of ints then indicates list of column numbers to be parsed
-    * If string then indicates comma separated list of Excel column letters and
-      column ranges (e.g. "A:E" or "A,C,E:F").  Ranges are inclusive of
-      both sides.
-squeeze : boolean, default False
-    If the parsed data only contains one column then return a Series
 na_values : scalar, str, list-like, or dict, default None
     Additional strings to recognize as NA/NaN. If dict passed, specific
     per-column NA values. By default the following values are interpreted
     as NaN: '""" + fill("', '".join(sorted(_NA_VALUES)), 70) + """'.
-thousands : str, default None
-    Thousands separator for parsing string columns to numeric.  Note that
-    this parameter is only necessary for columns stored as TEXT in Excel,
-    any numeric columns will automatically be parsed, regardless of display
-    format.
 keep_default_na : bool, default True
     If na_values are specified and keep_default_na is False the default NaN
     values are overridden, otherwise they're appended to.
 verbose : boolean, default False
     Indicate number of NA values placed in non-numeric columns
-engine: string, default None
-    If io is not a buffer or path, this must be set to identify io.
-    Acceptable values are None or xlrd
+thousands : str, default None
+    Thousands separator for parsing string columns to numeric.  Note that
+    this parameter is only necessary for columns stored as TEXT in Excel,
+    any numeric columns will automatically be parsed, regardless of display
+    format.
+skip_footer : int, default 0
+    Rows at the end to skip (0-indexed)
 convert_float : boolean, default True
     convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
     data will be read in as floats: Excel stores all numbers as floats
@@ -193,12 +200,27 @@ def get_writer(engine_name):
 
 @Appender(_read_excel_doc)
 @deprecate_kwarg("parse_cols", "usecols")
-def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
-               index_col=None, names=None, usecols=None, parse_dates=False,
-               date_parser=None, na_values=None, thousands=None,
-               convert_float=True, converters=None, dtype=None,
-               true_values=None, false_values=None, engine=None,
-               squeeze=False, **kwds):
+def read_excel(io,
+               sheet_name=0,
+               header=0,
+               names=None,
+               index_col=None,
+               usecols=None,
+               squeeze=False,
+               dtype=None,
+               engine=None,
+               converters=None,
+               true_values=None,
+               false_values=None,
+               skiprows=None,
+               nrows=None,
+               na_values=None,
+               parse_dates=False,
+               date_parser=None,
+               thousands=None,
+               skip_footer=0,
+               convert_float=True,
+               **kwds):
 
     # Can't use _deprecate_kwarg since sheetname=None has a special meaning
     if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
@@ -213,12 +235,25 @@ def read_excel(io, sheet_name=0, header=0, skiprows=None, skip_footer=0,
         io = ExcelFile(io, engine=engine)
 
     return io._parse_excel(
-        sheetname=sheet_name, header=header, skiprows=skiprows, names=names,
-        index_col=index_col, usecols=usecols, parse_dates=parse_dates,
-        date_parser=date_parser, na_values=na_values, thousands=thousands,
-        convert_float=convert_float, skip_footer=skip_footer,
-        converters=converters, dtype=dtype, true_values=true_values,
-        false_values=false_values, squeeze=squeeze, **kwds)
+        sheetname=sheet_name,
+        header=header,
+        names=names,
+        index_col=index_col,
+        usecols=usecols,
+        squeeze=squeeze,
+        dtype=dtype,
+        converters=converters,
+        true_values=true_values,
+        false_values=false_values,
+        skiprows=skiprows,
+        nrows=nrows,
+        na_values=na_values,
+        parse_dates=parse_dates,
+        date_parser=date_parser,
+        thousands=thousands,
+        skip_footer=skip_footer,
+        convert_float=convert_float,
+        **kwds)
 
 
 class ExcelFile(object):
@@ -282,31 +317,49 @@ def __init__(self, io, **kwds):
     def __fspath__(self):
         return self._io
 
-    def parse(self, sheet_name=0, header=0, skiprows=None, skip_footer=0,
-              names=None, index_col=None, usecols=None, parse_dates=False,
-              date_parser=None, na_values=None, thousands=None,
-              convert_float=True, converters=None, true_values=None,
-              false_values=None, squeeze=False, **kwds):
+    def parse(self,
+              sheet_name=0,
+              header=0,
+              names=None,
+              index_col=None,
+              usecols=None,
+              squeeze=False,
+              converters=None,
+              true_values=None,
+              false_values=None,
+              skiprows=None,
+              nrows=None,
+              na_values=None,
+              parse_dates=False,
+              date_parser=None,
+              thousands=None,
+              skip_footer=0,
+              convert_float=True,
+              **kwds):
         """
         Parse specified sheet(s) into a DataFrame
 
         Equivalent to read_excel(ExcelFile, ...)  See the read_excel
         docstring for more info on accepted parameters
         """
 
-        return self._parse_excel(sheetname=sheet_name, header=header,
-                                 skiprows=skiprows, names=names,
+        return self._parse_excel(sheetname=sheet_name,
+                                 header=header,
+                                 names=names,
                                  index_col=index_col,
                                  usecols=usecols,
+                                 squeeze=squeeze,
+                                 converters=converters,
+                                 true_values=true_values,
+                                 false_values=false_values,
+                                 skiprows=skiprows,
+                                 nrows=nrows,
+                                 na_values=na_values,
                                  parse_dates=parse_dates,
-                                 date_parser=date_parser, na_values=na_values,
+                                 date_parser=date_parser,
                                  thousands=thousands,
                                  skip_footer=skip_footer,
                                  convert_float=convert_float,
-                                 converters=converters,
-                                 true_values=true_values,
-                                 false_values=false_values,
-                                 squeeze=squeeze,
                                  **kwds)
 
     def _should_parse(self, i, usecols):
@@ -342,12 +395,26 @@ def _excel2num(x):
         else:
             return i in usecols
 
-    def _parse_excel(self, sheetname=0, header=0, skiprows=None, names=None,
-                     skip_footer=0, index_col=None, usecols=None,
-                     parse_dates=False, date_parser=None, na_values=None,
-                     thousands=None, convert_float=True, true_values=None,
-                     false_values=None, verbose=False, dtype=None,
-                     squeeze=False, **kwds):
+    def _parse_excel(self,
+                     sheetname=0,
+                     header=0,
+                     names=None,
+                     index_col=None,
+                     usecols=None,
+                     squeeze=False,
+                     dtype=None,
+                     true_values=None,
+                     false_values=None,
+                     skiprows=None,
+                     nrows=None,
+                     na_values=None,
+                     verbose=False,
+                     parse_dates=False,
+                     date_parser=None,
+                     thousands=None,
+                     skip_footer=0,
+                     convert_float=True,
+                     **kwds):
 
         skipfooter = kwds.pop('skipfooter', None)
         if skipfooter is not None:
@@ -509,21 +576,24 @@ def _parse_cell(cell_contents, cell_typ):
 
             # GH 12292 : error when read one empty column from excel file
             try:
-                parser = TextParser(data, header=header, index_col=index_col,
+                parser = TextParser(data,
+                                    header=header,
+                                    index_col=index_col,
                                     has_index_names=has_index_names,
-                                    na_values=na_values,
-                                    thousands=thousands,
-                                    parse_dates=parse_dates,
-                                    date_parser=date_parser,
+                                    squeeze=squeeze,
+                                    dtype=dtype,
                                     true_values=true_values,
                                     false_values=false_values,
                                     skiprows=skiprows,
+                                    nrows=nrows,
+                                    na_values=na_values,
+                                    parse_dates=parse_dates,
+                                    date_parser=date_parser,
+                                    thousands=thousands,
                                     skipfooter=skip_footer,
-                                    squeeze=squeeze,
-                                    dtype=dtype,
                                     **kwds)
 
-                output[asheetname] = parser.read()
+                output[asheetname] = parser.read(nrows=nrows)
                 if names is not None:
                     output[asheetname].columns = names
                 if not squeeze or isinstance(output[asheetname], DataFrame):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -440,7 +440,7 @@ def _read(filepath_or_buffer, kwds):
     # Extract some of the arguments (pass chunksize on).
     iterator = kwds.get('iterator', False)
     chunksize = _validate_integer('chunksize', kwds.get('chunksize', None), 1)
-    nrows = _validate_integer('nrows', kwds.get('nrows', None))
+    nrows = kwds.get('nrows', None)
 
     # Check for duplicates in names.
     _validate_names(kwds.get("names", None))
@@ -1062,6 +1062,8 @@ def _failover_to_python(self):
         raise AbstractMethodError(self)
 
     def read(self, nrows=None):
+        nrows = _validate_integer('nrows', nrows)
+
         if nrows is not None:
             if self.options.get('skipfooter'):
                 raise ValueError('skipfooter not supported for iteration')
diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py
@@ -1017,6 +1017,33 @@ def test_read_excel_skiprows_list(self):
                                'skiprows_list', skiprows=np.array([0, 2]))
         tm.assert_frame_equal(actual, expected)
 
+    def test_read_excel_nrows(self):
+        # GH 16645
+        num_rows_to_pull = 5
+        actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext),
+                               nrows=num_rows_to_pull)
+        expected = pd.read_excel(os.path.join(self.dirpath,
+                                              'test1' + self.ext))
+        expected = expected[:num_rows_to_pull]
+        tm.assert_frame_equal(actual, expected)
+
+    def test_read_excel_nrows_greater_than_nrows_in_file(self):
+        # GH 16645
+        expected = pd.read_excel(os.path.join(self.dirpath,
+                                              'test1' + self.ext))
+        num_records_in_file = len(expected)
+        num_rows_to_pull = num_records_in_file + 10
+        actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext),
+                               nrows=num_rows_to_pull)
+        tm.assert_frame_equal(actual, expected)
+
+    def test_read_excel_nrows_non_integer_parameter(self):
+        # GH 16645
+        msg = "'nrows' must be an integer >=0"
+        with tm.assert_raises_regex(ValueError, msg):
+            pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext),
+                          nrows='5')
+
     def test_read_excel_squeeze(self):
         # GH 12157
         f = os.path.join(self.dirpath, 'test_squeeze' + self.ext)