API: read_excel signature

chris-b1 · jreback · commit 0d39ca1109fd · 2015-09-30T17:10:31.000-04:00
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -1980,100 +1980,85 @@ Excel files
 -----------
 
 The :func:`~pandas.read_excel` method can read Excel 2003 (``.xls``) and
-Excel 2007 (``.xlsx``) files using the ``xlrd`` Python
-module and use the same parsing code as the above to convert tabular data into
-a DataFrame. See the :ref:`cookbook<cookbook.excel>` for some
+Excel 2007+ (``.xlsx``) files using the ``xlrd`` Python
+module.  The :meth:`~DataFrame.to_excel` instance method is used for
+saving a ``DataFrame`` to Excel.  Generally the semantics are
+similar to working with :ref:`csv<io.read_csv_table>` data.  See the :ref:`cookbook<cookbook.excel>` for some
 advanced strategies
 
 .. _io.excel_reader:
 
 Reading Excel Files
 '''''''''''''''''''
 
-.. versionadded:: 0.17
+In the most basic use-case, ``read_excel`` takes a path to an Excel
+file, and the ``sheetname`` indicating which sheet to parse.
 
-``read_excel`` can read a ``MultiIndex`` index, by passing a list of columns to ``index_col``
-and a ``MultiIndex`` column by passing a list of rows to ``header``.  If either the ``index``
-or ``columns`` have serialized level names those will be read in as well by specifying
-the rows/columns that make up the levels.
-
-.. ipython:: python
+.. code-block:: python
 
-   # MultiIndex index - no names
-   df = pd.DataFrame({'a':[1,2,3,4], 'b':[5,6,7,8]},
-                     index=pd.MultiIndex.from_product([['a','b'],['c','d']]))
-   df.to_excel('path_to_file.xlsx')
-   df = pd.read_excel('path_to_file.xlsx', index_col=[0,1])
-   df
+   # Returns a DataFrame
+   read_excel('path_to_file.xls', sheetname='Sheet1')
 
-   # MultiIndex index - with names
-   df.index = df.index.set_names(['lvl1', 'lvl2'])
-   df.to_excel('path_to_file.xlsx')
-   df = pd.read_excel('path_to_file.xlsx', index_col=[0,1])
-   df
 
-   # MultiIndex index and column - with names
-   df.columns = pd.MultiIndex.from_product([['a'],['b', 'd']], names=['c1', 'c2'])
-   df.to_excel('path_to_file.xlsx')
-   df = pd.read_excel('path_to_file.xlsx',
-                       index_col=[0,1], header=[0,1])
-   df
+.. _io.excel.excelfile_class:
 
-.. ipython:: python
-   :suppress:
+``ExcelFile`` class
++++++++++++++++++++
 
-   import os
-   os.remove('path_to_file.xlsx')
+To facilitate working with multiple sheets from the same file, the ``ExcelFile``
+class can be used to wrap the file and can be passed into ``read_excel``.
+There will be a performance benefit for reading multiple sheets as the file is
+read into memory only once.
 
-.. warning::
+.. code-block:: python
 
-   Excel files saved in version 0.16.2 or prior that had index names will still able to be read in,
-   but the ``has_index_names`` argument must specified to ``True``.
+   xlsx = pd.ExcelFile('path_to_file.xls')
+   df = pd.read_excel(xlsx, 'Sheet1')
 
-.. versionadded:: 0.16
+The ``ExcelFile`` class can also be used as a context manager.
 
-``read_excel`` can read more than one sheet, by setting ``sheetname`` to either
-a list of sheet names, a list of sheet positions, or ``None`` to read all sheets.
+.. code-block:: python
 
-.. versionadded:: 0.13
+   with pd.ExcelFile('path_to_file.xls') as xls:
+       df1 = pd.read_excel(xls, 'Sheet1')
+       df2 = pd.read_excel(xls, 'Sheet2')
 
-Sheets can be specified by sheet index or sheet name, using an integer or string,
-respectively.
+The ``sheet_names`` property will generate
+a list of the sheet names in the file.
 
-.. versionadded:: 0.12
+The primary use-case for an ``ExcelFile`` is parsing multiple sheets with
+different parameters:
 
-``ExcelFile`` has been moved to the top level namespace.
+.. code-block:: python
 
-There are two approaches to reading an excel file.  The ``read_excel`` function
-and the ``ExcelFile`` class.  ``read_excel`` is for reading one file
-with file-specific arguments (ie. identical data formats across sheets).
-``ExcelFile`` is for reading one file with sheet-specific arguments (ie. various data
-formats across sheets).  Choosing the approach is largely a question of
-code readability and execution speed.
+    data = {}
+    # For when Sheet1's format differs from Sheet2
+    with pd.ExcelFile('path_to_file.xls') as xls:
+        data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, na_values=['NA'])
+        data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=1)
 
-Equivalent class and function approaches to read a single sheet:
+Note that if the same parsing parameters are used for all sheets, a list
+of sheet names can simply be passed to ``read_excel`` with no loss in performance.
 
 .. code-block:: python
 
     # using the ExcelFile class
-    xls = pd.ExcelFile('path_to_file.xls')
-    data = xls.parse('Sheet1', index_col=None, na_values=['NA'])
+    data = {}
+    with pd.ExcelFile('path_to_file.xls') as xls:
+        data['Sheet1'] = read_excel(xls, 'Sheet1', index_col=None, na_values=['NA'])
+        data['Sheet2'] = read_excel(xls, 'Sheet2', index_col=None, na_values=['NA'])
 
-    # using the read_excel function
-    data = read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA'])
+    # equivalent using the read_excel function
+    data = read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], index_col=None, na_values=['NA'])
 
-Equivalent class and function approaches to read multiple sheets:
+.. versionadded:: 0.12
 
-.. code-block:: python
+``ExcelFile`` has been moved to the top level namespace.
 
-    data = {}
-    # For when Sheet1's format differs from Sheet2
-    xls = pd.ExcelFile('path_to_file.xls')
-    data['Sheet1'] = xls.parse('Sheet1', index_col=None, na_values=['NA'])
-    data['Sheet2'] = xls.parse('Sheet2', index_col=1)
+.. versionadded:: 0.17
+
+``read_excel`` can take an ``ExcelFile`` object as input.
 
-    # For when Sheet1's format is identical to Sheet2
-    data = read_excel('path_to_file.xls', ['Sheet1','Sheet2'], index_col=None, na_values=['NA'])
 
 .. _io.excel.specifying_sheets:
 
@@ -2125,6 +2110,72 @@ Using a list to get multiple sheets:
    # Returns the 1st and 4th sheet, as a dictionary of DataFrames.
    read_excel('path_to_file.xls',sheetname=['Sheet1',3])
 
+.. versionadded:: 0.16
+
+``read_excel`` can read more than one sheet, by setting ``sheetname`` to either
+a list of sheet names, a list of sheet positions, or ``None`` to read all sheets.
+
+.. versionadded:: 0.13
+
+Sheets can be specified by sheet index or sheet name, using an integer or string,
+respectively.
+
+.. _io.excel.reading_multiindex:
+
+Reading a ``MultiIndex``
+++++++++++++++++++++++++
+
+.. versionadded:: 0.17
+
+``read_excel`` can read a ``MultiIndex`` index, by passing a list of columns to ``index_col``
+and a ``MultiIndex`` column by passing a list of rows to ``header``.  If either the ``index``
+or ``columns`` have serialized level names those will be read in as well by specifying
+the rows/columns that make up the levels.
+
+For example, to read in a ``MultiIndex`` index without names:
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a':[1,2,3,4], 'b':[5,6,7,8]},
+                     index=pd.MultiIndex.from_product([['a','b'],['c','d']]))
+   df.to_excel('path_to_file.xlsx')
+   df = pd.read_excel('path_to_file.xlsx', index_col=[0,1])
+   df
+
+If the index has level names, they will be parsed as well, using the same
+parameters.
+
+.. ipython:: python
+
+   df.index = df.index.set_names(['lvl1', 'lvl2'])
+   df.to_excel('path_to_file.xlsx')
+   df = pd.read_excel('path_to_file.xlsx', index_col=[0,1])
+   df
+
+
+If the source file has both ``MultiIndex`` index and columns, lists specifying each
+should be passed to ``index_col`` and ``header``:
+
+.. ipython:: python
+
+   df.columns = pd.MultiIndex.from_product([['a'],['b', 'd']], names=['c1', 'c2'])
+   df.to_excel('path_to_file.xlsx')
+   df = pd.read_excel('path_to_file.xlsx',
+                       index_col=[0,1], header=[0,1])
+   df
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('path_to_file.xlsx')
+
+.. warning::
+
+   Excel files saved in version 0.16.2 or prior that had index names will still be able to be read in,
+   but the ``has_index_names`` argument must be specified as ``True``.
+
+
 Parsing Specific Columns
 ++++++++++++++++++++++++
 
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -938,6 +938,8 @@ Other API Changes
 - When constructing ``DataFrame`` with an array of ``complex64`` dtype previously meant the corresponding column
   was automatically promoted to the ``complex128`` dtype. Pandas will now preserve the itemsize of the input for complex data (:issue:`10952`)
 - some numeric reduction operators would return ``ValueError``, rather than ``TypeError`` on object types that includes strings and numbers (:issue:`11131`)
+- Passing currently unsupported ``chunksize`` argument to ``read_excel`` or ``ExcelFile.parse`` will now raise ``NotImplementedError`` (:issue:`8011`)
+- Allow an ``ExcelFile`` object to be passed into ``read_excel`` (:issue:`11198`)
 - ``DatetimeIndex.union`` does not infer ``freq`` if ``self`` and the input have ``None`` as ``freq`` (:issue:`11086`)
 - ``NaT``'s methods now either raise ``ValueError``, or return ``np.nan`` or ``NaT`` (:issue:`9513`)
 
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -70,12 +70,20 @@ def get_writer(engine_name):
     except KeyError:
         raise ValueError("No Excel writer '%s'" % engine_name)
 
-
-excel_doc_common = """
+def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0,
+               index_col=None, parse_cols=None, parse_dates=False,
+               date_parser=None, na_values=None, thousands=None,
+               convert_float=True, has_index_names=None, converters=None,
+               engine=None, **kwds):
+    """
     Read an Excel table into a pandas DataFrame
 
     Parameters
-    ----------%(io)s
+    ----------
+    io : string, file-like object, pandas ExcelFile, or xlrd workbook.
+        The string could be a URL. Valid URL schemes include http, ftp, s3,
+        and file. For file URLs, a host is expected. For instance, a local
+        file could be file://localhost/path/to/workbook.xlsx
     sheetname : string, int, mixed list of strings/ints, or None, default 0
 
         Strings are used for sheet names, Integers are used in zero-indexed sheet
@@ -122,18 +130,24 @@ def get_writer(engine_name):
     na_values : list-like, default None
         List of additional strings to recognize as NA/NaN
     thousands : str, default None
-        Thousands separator
+        Thousands separator for parsing string columns to numeric.  Note that
+        this parameter is only necessary for columns stored as TEXT in Excel,
+        any numeric columns will automatically be parsed, regardless of display
+        format.
     keep_default_na : bool, default True
         If na_values are specified and keep_default_na is False the default NaN
         values are overridden, otherwise they're appended to
     verbose : boolean, default False
-        Indicate number of NA values placed in non-numeric columns%(eng)s
+        Indicate number of NA values placed in non-numeric columns
+    engine: string, default None
+        If io is not a buffer or path, this must be set to identify io.
+        Acceptable values are None or xlrd
     convert_float : boolean, default True
         convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
         data will be read in as floats: Excel stores all numbers as floats
         internally
     has_index_names : boolean, default None
-        DEPCRECATED: for version 0.17+ index names will be automatically inferred
+        DEPRECATED: for version 0.17+ index names will be automatically inferred
         based on index_col.  To read Excel output from 0.16.2 and prior that
         had saved index names, use True.
 
@@ -144,28 +158,21 @@ def get_writer(engine_name):
         for more information on when a Dict of Dataframes is returned.
 
     """
-read_excel_kwargs = dict()
-read_excel_kwargs['io'] = """
-    io : string, file-like object, or xlrd workbook.
-        The string could be a URL. Valid URL schemes include http, ftp, s3,
-        and file. For file URLs, a host is expected. For instance, a local
-        file could be file://localhost/path/to/workbook.xlsx"""
-read_excel_kwargs['eng'] = """
-    engine: string, default None
-        If io is not a buffer or path, this must be set to identify io.
-        Acceptable values are None or xlrd"""
-
-@Appender(excel_doc_common % read_excel_kwargs)
-def read_excel(io, sheetname=0, **kwds):
-    engine = kwds.pop('engine', None)
 
-    return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
+    if not isinstance(io, ExcelFile):
+        io = ExcelFile(io, engine=engine)
 
+    return io._parse_excel(
+        sheetname=sheetname, header=header, skiprows=skiprows,
+        index_col=index_col, parse_cols=parse_cols, parse_dates=parse_dates,
+        date_parser=date_parser, na_values=na_values, thousands=thousands,
+        convert_float=convert_float, has_index_names=has_index_names,
+        skip_footer=skip_footer, converters=converters, **kwds)
 
 class ExcelFile(object):
     """
     Class for parsing tabular excel sheets into DataFrame objects.
-    Uses xlrd. See ExcelFile.parse for more documentation
+    Uses xlrd. See read_excel for more documentation
 
     Parameters
     ----------
@@ -207,23 +214,16 @@ def __init__(self, io, **kwds):
             raise ValueError('Must explicitly set engine if not passing in'
                              ' buffer or path for io.')
 
-    @Appender(excel_doc_common % dict(io='', eng=''))
     def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
               index_col=None, parse_cols=None, parse_dates=False,
-              date_parser=None, na_values=None, thousands=None, chunksize=None,
+              date_parser=None, na_values=None, thousands=None,
               convert_float=True, has_index_names=None, converters=None, **kwds):
+        """
+        Parse specified sheet(s) into a DataFrame
 
-        skipfooter = kwds.pop('skipfooter', None)
-        if skipfooter is not None:
-            skip_footer = skipfooter
-
-        _validate_header_arg(header)
-        if has_index_names is not None:
-            warn("\nThe has_index_names argument is deprecated; index names "
-                 "will be automatically inferred based on index_col.\n"
-                 "This argmument is still necessary if reading Excel output "
-                 "from 0.16.2 or prior with index names.", FutureWarning,
-                 stacklevel=3)
+        Equivalent to read_excel(ExcelFile, ...)  See the read_excel
+        docstring for more info on accepted parameters
+        """
 
         return self._parse_excel(sheetname=sheetname, header=header,
                                  skiprows=skiprows,
@@ -232,7 +232,7 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                                  parse_cols=parse_cols,
                                  parse_dates=parse_dates,
                                  date_parser=date_parser, na_values=na_values,
-                                 thousands=thousands, chunksize=chunksize,
+                                 thousands=thousands,
                                  skip_footer=skip_footer,
                                  convert_float=convert_float,
                                  converters=converters,
@@ -274,8 +274,25 @@ def _excel2num(x):
     def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                      index_col=None, has_index_names=None, parse_cols=None,
                      parse_dates=False, date_parser=None, na_values=None,
-                     thousands=None, chunksize=None, convert_float=True,
+                     thousands=None, convert_float=True,
                      verbose=False, **kwds):
+
+        skipfooter = kwds.pop('skipfooter', None)
+        if skipfooter is not None:
+            skip_footer = skipfooter
+
+        _validate_header_arg(header)
+        if has_index_names is not None:
+            warn("\nThe has_index_names argument is deprecated; index names "
+                 "will be automatically inferred based on index_col.\n"
+                 "This argmument is still necessary if reading Excel output "
+                 "from 0.16.2 or prior with index names.", FutureWarning,
+                 stacklevel=3)
+
+        if 'chunksize' in kwds:
+            raise NotImplementedError("Reading an Excel file in chunks "
+                                      "is not implemented")
+
         import xlrd
         from xlrd import (xldate, XL_CELL_DATE,
                           XL_CELL_ERROR, XL_CELL_BOOLEAN,
@@ -416,7 +433,6 @@ def _parse_cell(cell_contents,cell_typ):
                                 date_parser=date_parser,
                                 skiprows=skiprows,
                                 skip_footer=skip_footer,
-                                chunksize=chunksize,
                                 **kwds)
 
             output[asheetname] = parser.read()
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py