diff --git a/doc/source/io.rst b/doc/source/io.rst index c9a42f373ee6e..7dddc43b136cf 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1532,6 +1532,26 @@ advanced strategies read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) +.. versionadded:: 0.13 + +There are now two ways to read in sheets from an Excel file. You can provide +either the zero-based index of a sheet or its name. If the value provided is an integer, +then it is assumed that the integer refers to the index of a sheet; otherwise, +if a string is passed, then it is assumed that the string refers to the name of +a particular sheet in the file. + +Using the sheet name: + +.. code-block:: python + + read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + +Using the sheet index: + +.. code-block:: python + + read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + It is often the case that users will insert columns to do temporary computations in Excel and you may not want to read in those columns. `read_excel` takes a `parse_cols` keyword to allow you to specify a subset of columns to parse. diff --git a/doc/source/release.rst b/doc/source/release.rst index 54fa4d30bac0a..41a39d4592b8f 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -37,6 +37,8 @@ pandas 0.13 - ``read_html`` now raises a ``URLError`` instead of catching and raising a ``ValueError`` (:issue:`4303`, :issue:`4305`) + - ``read_excel`` now supports an integer in its ``sheetname`` argument giving + the index of the sheet to read in (:issue:`4301`). **API Changes** diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 52bd674cb7830..6ee3adeac1a6e 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -9,6 +9,9 @@ enhancements along with a large number of bug fixes. API changes ~~~~~~~~~~~ + - ``read_excel`` now supports an integer in its ``sheetname`` argument giving + the index of the sheet to read in (:issue:`4301`). 
+ Enhancements ~~~~~~~~~~~~ diff --git a/pandas/io/excel.py b/pandas/io/excel.py index a691075844f8f..b3b48382faae0 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -48,8 +48,9 @@ def read_excel(path_or_buf, sheetname, kind=None, **kwds): parsed : DataFrame DataFrame from the passed in Excel file """ - return ExcelFile(path_or_buf,kind=kind).parse(sheetname=sheetname, - kind=kind, **kwds) + return ExcelFile(path_or_buf, kind=kind).parse(sheetname=sheetname, + kind=kind, **kwds) + class ExcelFile(object): """ @@ -86,8 +87,8 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0, Parameters ---------- - sheetname : string - Name of Excel sheet + sheetname : string or integer + Name of Excel sheet or the zero-based index of the sheet header : int, default 0 Row to use for the column labels of the parsed DataFrame skiprows : list-like @@ -117,27 +118,20 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0, parsed : DataFrame DataFrame parsed from the Excel file """ - - # has_index_names: boolean, default False - # True if the cols defined in index_col have an index name and are - # not in the header has_index_names = False # removed as new argument of API function skipfooter = kwds.pop('skipfooter', None) if skipfooter is not None: skip_footer = skipfooter - return self._parse_excel(sheetname, header=header, - skiprows=skiprows, index_col=index_col, - has_index_names=has_index_names, - parse_cols=parse_cols, - parse_dates=parse_dates, - date_parser=date_parser, - na_values=na_values, - thousands=thousands, - chunksize=chunksize, - skip_footer=skip_footer, - **kwds) + return self._parse_excel(sheetname, header=header, skiprows=skiprows, + index_col=index_col, + has_index_names=has_index_names, + parse_cols=parse_cols, + parse_dates=parse_dates, + date_parser=date_parser, na_values=na_values, + thousands=thousands, chunksize=chunksize, + skip_footer=skip_footer, **kwds) def _should_parse(self, i, parse_cols): @@ -171,20 +165,22 @@ def 
_excel2num(x): else: return i in parse_cols - def _parse_excel(self, sheetname, header=0, skiprows=None, - skip_footer=0, index_col=None, has_index_names=None, - parse_cols=None, parse_dates=False, date_parser=None, - na_values=None, thousands=None, chunksize=None, - **kwds): + def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0, + index_col=None, has_index_names=None, parse_cols=None, + parse_dates=False, date_parser=None, na_values=None, + thousands=None, chunksize=None, **kwds): from xlrd import (xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR, XL_CELL_BOOLEAN) datemode = self.book.datemode - sheet = self.book.sheet_by_name(sheetname) + if isinstance(sheetname, basestring): + sheet = self.book.sheet_by_name(sheetname) + else: # assume an integer if not a string + sheet = self.book.sheet_by_index(sheetname) data = [] should_parse = {} - for i in range(sheet.nrows): + for i in xrange(sheet.nrows): row = [] for j, (value, typ) in enumerate(izip(sheet.row_values(i), sheet.row_types(i))): @@ -225,7 +221,7 @@ def _parse_excel(self, sheetname, header=0, skiprows=None, @property def sheet_names(self): - return self.book.sheet_names() + return self.book.sheet_names() def _trim_excel_header(row): diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index baf6966530772..ebbb7292cb3d7 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -82,6 +82,7 @@ def setUp(self): self.csv1 = os.path.join(self.dirpath, 'test1.csv') self.csv2 = os.path.join(self.dirpath, 'test2.csv') self.xls1 = os.path.join(self.dirpath, 'test.xls') + self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx') self.frame = _frame.copy() self.frame2 = _frame2.copy() self.tsframe = _tsframe.copy() @@ -198,6 +199,49 @@ def test_excel_passes_na(self): columns=['Test']) tm.assert_frame_equal(parsed, expected) + def check_excel_table_sheet_by_index(self, filename, csvfile): + import xlrd + + pth = os.path.join(self.dirpath, filename) + xls = 
ExcelFile(pth) + df = xls.parse(0, index_col=0, parse_dates=True) + df2 = self.read_csv(csvfile, index_col=0, parse_dates=True) + df3 = xls.parse(1, skiprows=[1], index_col=0, parse_dates=True) + tm.assert_frame_equal(df, df2, check_names=False) + tm.assert_frame_equal(df3, df2, check_names=False) + + df4 = xls.parse(0, index_col=0, parse_dates=True, skipfooter=1) + df5 = xls.parse(0, index_col=0, parse_dates=True, skip_footer=1) + tm.assert_frame_equal(df4, df.ix[:-1]) + tm.assert_frame_equal(df4, df5) + + self.assertRaises(xlrd.XLRDError, xls.parse, 'asdf') + + def test_excel_table_sheet_by_index(self): + _skip_if_no_xlrd() + for filename, csvfile in [(self.xls1, self.csv1), + (self.xlsx1, self.csv1)]: + self.check_excel_table_sheet_by_index(filename, csvfile) + + def check_excel_sheet_by_name_raise(self, ext): + import xlrd + pth = os.path.join(self.dirpath, 'testit.{0}'.format(ext)) + + with ensure_clean(pth) as pth: + gt = DataFrame(np.random.randn(10, 2)) + gt.to_excel(pth) + xl = ExcelFile(pth) + df = xl.parse(0) + tm.assert_frame_equal(gt, df) + + self.assertRaises(xlrd.XLRDError, xl.parse, '0') + + def test_excel_sheet_by_name_raise(self): + _skip_if_no_xlrd() + _skip_if_no_xlwt() + for ext in ('xls', 'xlsx'): + self.check_excel_sheet_by_name_raise(ext) + def test_excel_table(self): _skip_if_no_xlrd() @@ -438,7 +482,6 @@ def _check_extension_sheets(self, ext): np.testing.assert_equal('test1', reader.sheet_names[0]) np.testing.assert_equal('test2', reader.sheet_names[1]) - def test_excel_roundtrip_xls_colaliases(self): _skip_if_no_excelsuite() self._check_extension_colaliases('xls') @@ -892,6 +935,7 @@ def test_deprecated_from_parsers(self): from pandas.io.parsers import ExcelWriter as xw xw(path) + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False)