Skip to content

ENH: add integer sheetname support in read_excel #4308

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 25, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions doc/source/io.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1532,6 +1532,26 @@ advanced strategies

read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA'])

.. versionadded:: 0.13

A sheet in an Excel file can now be selected in two ways: by its name or by its
index. If an integer is passed, it is interpreted as the zero-based index of the
sheet; if a string is passed, it is interpreted as the name of a particular
sheet in the file.

Using the sheet name:

.. code-block:: python

read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA'])

Using the sheet index:

.. code-block:: python

read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA'])

Users often insert columns in Excel to do temporary computations, and you may
not want to read in those columns. ``read_excel`` takes a ``parse_cols`` keyword
that lets you specify a subset of columns to parse.
Expand Down
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ pandas 0.13

- ``read_html`` now raises a ``URLError`` instead of catching and raising a
``ValueError`` (:issue:`4303`, :issue:`4305`)
- ``read_excel`` now accepts an integer for its ``sheetname`` argument, giving
the zero-based index of the sheet to read in (:issue:`4301`).

**API Changes**

Expand Down
3 changes: 3 additions & 0 deletions doc/source/v0.13.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ enhancements along with a large number of bug fixes.
API changes
~~~~~~~~~~~

- ``read_excel`` now accepts an integer for its ``sheetname`` argument, giving
the zero-based index of the sheet to read in (:issue:`4301`).

Enhancements
~~~~~~~~~~~~

Expand Down
50 changes: 23 additions & 27 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ def read_excel(path_or_buf, sheetname, kind=None, **kwds):
parsed : DataFrame
DataFrame from the passed in Excel file
"""
return ExcelFile(path_or_buf,kind=kind).parse(sheetname=sheetname,
kind=kind, **kwds)
return ExcelFile(path_or_buf, kind=kind).parse(sheetname=sheetname,
kind=kind, **kwds)


class ExcelFile(object):
"""
Expand Down Expand Up @@ -86,8 +87,8 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,

Parameters
----------
sheetname : string
Name of Excel sheet
sheetname : string or integer
Name of the Excel sheet, or the zero-based index of the sheet in the workbook
header : int, default 0
Row to use for the column labels of the parsed DataFrame
skiprows : list-like
Expand Down Expand Up @@ -117,27 +118,20 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
parsed : DataFrame
DataFrame parsed from the Excel file
"""

# has_index_names: boolean, default False
# True if the cols defined in index_col have an index name and are
# not in the header
has_index_names = False # removed as new argument of API function

skipfooter = kwds.pop('skipfooter', None)
if skipfooter is not None:
skip_footer = skipfooter

return self._parse_excel(sheetname, header=header,
skiprows=skiprows, index_col=index_col,
has_index_names=has_index_names,
parse_cols=parse_cols,
parse_dates=parse_dates,
date_parser=date_parser,
na_values=na_values,
thousands=thousands,
chunksize=chunksize,
skip_footer=skip_footer,
**kwds)
return self._parse_excel(sheetname, header=header, skiprows=skiprows,
index_col=index_col,
has_index_names=has_index_names,
parse_cols=parse_cols,
parse_dates=parse_dates,
date_parser=date_parser, na_values=na_values,
thousands=thousands, chunksize=chunksize,
skip_footer=skip_footer, **kwds)

def _should_parse(self, i, parse_cols):

Expand Down Expand Up @@ -171,20 +165,22 @@ def _excel2num(x):
else:
return i in parse_cols

def _parse_excel(self, sheetname, header=0, skiprows=None,
skip_footer=0, index_col=None, has_index_names=None,
parse_cols=None, parse_dates=False, date_parser=None,
na_values=None, thousands=None, chunksize=None,
**kwds):
def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0,
index_col=None, has_index_names=None, parse_cols=None,
parse_dates=False, date_parser=None, na_values=None,
thousands=None, chunksize=None, **kwds):
from xlrd import (xldate_as_tuple, XL_CELL_DATE,
XL_CELL_ERROR, XL_CELL_BOOLEAN)

datemode = self.book.datemode
sheet = self.book.sheet_by_name(sheetname)
if isinstance(sheetname, basestring):
sheet = self.book.sheet_by_name(sheetname)
else: # assume an integer if not a string
sheet = self.book.sheet_by_index(sheetname)

data = []
should_parse = {}
for i in range(sheet.nrows):
for i in xrange(sheet.nrows):
row = []
for j, (value, typ) in enumerate(izip(sheet.row_values(i),
sheet.row_types(i))):
Expand Down Expand Up @@ -225,7 +221,7 @@ def _parse_excel(self, sheetname, header=0, skiprows=None,

@property
def sheet_names(self):
return self.book.sheet_names()
return self.book.sheet_names()


def _trim_excel_header(row):
Expand Down
46 changes: 45 additions & 1 deletion pandas/io/tests/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ def setUp(self):
self.csv1 = os.path.join(self.dirpath, 'test1.csv')
self.csv2 = os.path.join(self.dirpath, 'test2.csv')
self.xls1 = os.path.join(self.dirpath, 'test.xls')
self.xlsx1 = os.path.join(self.dirpath, 'test.xlsx')
self.frame = _frame.copy()
self.frame2 = _frame2.copy()
self.tsframe = _tsframe.copy()
Expand Down Expand Up @@ -198,6 +199,49 @@ def test_excel_passes_na(self):
columns=['Test'])
tm.assert_frame_equal(parsed, expected)

def check_excel_table_sheet_by_index(self, filename, csvfile):
    """Verify that sheets selected by integer index parse as expected.

    Parameters
    ----------
    filename : string
        Excel file to open; it is joined onto ``self.dirpath``.
    csvfile : string
        CSV file holding the reference data, read via ``self.read_csv``.

    The sheets at index 0 and index 1 (the latter with row 1 skipped) are
    both compared against the CSV data. The ``skipfooter`` and
    ``skip_footer`` spellings are checked to give the same result, and an
    unknown sheet *name* is expected to raise ``xlrd.XLRDError``.
    """
    from xlrd import XLRDError

    workbook = ExcelFile(os.path.join(self.dirpath, filename))

    by_first_index = workbook.parse(0, index_col=0, parse_dates=True)
    from_csv = self.read_csv(csvfile, index_col=0, parse_dates=True)
    by_second_index = workbook.parse(1, skiprows=[1], index_col=0,
                                     parse_dates=True)
    tm.assert_frame_equal(by_first_index, from_csv, check_names=False)
    tm.assert_frame_equal(by_second_index, from_csv, check_names=False)

    # Both keyword spellings must drop the final row of the sheet.
    via_skipfooter = workbook.parse(0, index_col=0, parse_dates=True,
                                    skipfooter=1)
    via_skip_footer = workbook.parse(0, index_col=0, parse_dates=True,
                                     skip_footer=1)
    tm.assert_frame_equal(via_skipfooter, by_first_index.ix[:-1])
    tm.assert_frame_equal(via_skipfooter, via_skip_footer)

    # A string is looked up as a sheet name, so a missing name must raise.
    self.assertRaises(XLRDError, workbook.parse, 'asdf')

def test_excel_table_sheet_by_index(self):
    """Run the sheet-by-index check on both the .xls and .xlsx fixtures."""
    _skip_if_no_xlrd()
    # Both workbooks are compared against the same reference CSV.
    for excel_path in (self.xls1, self.xlsx1):
        self.check_excel_table_sheet_by_index(excel_path, self.csv1)

def check_excel_sheet_by_name_raise(self, ext):
    """Round-trip a frame and check that the string '0' is not an index.

    Parameters
    ----------
    ext : string
        File extension (without the dot) of the temporary Excel file.

    A random 10x2 frame is written to ``testit.<ext>`` and read back with
    the integer sheet index 0. Parsing with the string ``'0'`` is then
    expected to raise ``xlrd.XLRDError``.
    """
    from xlrd import XLRDError

    target = os.path.join(self.dirpath, 'testit.{0}'.format(ext))

    with ensure_clean(target) as tmp_path:
        expected = DataFrame(np.random.randn(10, 2))
        expected.to_excel(tmp_path)

        reader = ExcelFile(tmp_path)
        round_tripped = reader.parse(0)
        tm.assert_frame_equal(expected, round_tripped)

        # '0' is a string, so it must be looked up as a sheet name.
        self.assertRaises(XLRDError, reader.parse, '0')

def test_excel_sheet_by_name_raise(self):
    """Run the sheet-name-vs-index check for the 'xls' and 'xlsx' formats."""
    _skip_if_no_xlrd()
    _skip_if_no_xlwt()
    # NOTE(review): the 'xlsx' case presumably needs an xlsx writer as well;
    # confirm whether a broader skip helper should guard this test.
    self.check_excel_sheet_by_name_raise('xls')
    self.check_excel_sheet_by_name_raise('xlsx')

def test_excel_table(self):
_skip_if_no_xlrd()

Expand Down Expand Up @@ -438,7 +482,6 @@ def _check_extension_sheets(self, ext):
np.testing.assert_equal('test1', reader.sheet_names[0])
np.testing.assert_equal('test2', reader.sheet_names[1])


def test_excel_roundtrip_xls_colaliases(self):
_skip_if_no_excelsuite()
self._check_extension_colaliases('xls')
Expand Down Expand Up @@ -892,6 +935,7 @@ def test_deprecated_from_parsers(self):
from pandas.io.parsers import ExcelWriter as xw
xw(path)


if __name__ == '__main__':
    # Run this module's tests through nose without exiting the interpreter.
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)