Skip to content

ENH Change ExcelFile to accept a workbook for the path_or_buf argument. #4962

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 27, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,10 @@ Improvements to existing features
(0.4.3 and 0.5.0) (:issue:`4981`).
- Better string representations of ``MultiIndex`` (including ability to roundtrip
via ``repr``). (:issue:`3347`, :issue:`4935`)

- Both ``ExcelFile`` and ``read_excel`` now accept an ``xlrd.Book`` for the ``io``
(formerly ``path_or_buf``) argument; this requires ``engine`` to be set.
(:issue:`4961`).

API Changes
~~~~~~~~~~~

Expand Down
51 changes: 35 additions & 16 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,13 @@ def get_writer(engine_name):
except KeyError:
raise ValueError("No Excel writer '%s'" % engine_name)

def read_excel(path_or_buf, sheetname, **kwds):
def read_excel(io, sheetname, **kwds):
"""Read an Excel table into a pandas DataFrame

Parameters
----------
io : string, file-like object or xlrd workbook
If a string, expected to be a path to xls or xlsx file
sheetname : string
Name of Excel sheet
header : int, default 0
Expand All @@ -74,7 +76,10 @@ def read_excel(path_or_buf, sheetname, **kwds):
values are overridden, otherwise they're appended to
verbose : boolean, default False
Indicate number of NA values placed in non-numeric columns

engine: string, default None
If io is not a buffer or path, this must be set to identify io.
Acceptable values are None or xlrd

Returns
-------
parsed : DataFrame
Expand All @@ -84,7 +89,10 @@ def read_excel(path_or_buf, sheetname, **kwds):
kwds.pop('kind')
warn("kind keyword is no longer supported in read_excel and may be "
"removed in a future version", FutureWarning)
return ExcelFile(path_or_buf).parse(sheetname=sheetname, **kwds)

engine = kwds.pop('engine', None)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add an if clause here:

if engine is not None and engine != 'xlrd':
    raise ValueError("Unknown engine: %s" % engine)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does it make sense to add that check in the __init__ for ExcelFile instead?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, that's fine.

return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)


class ExcelFile(object):
Expand All @@ -94,10 +102,13 @@ class ExcelFile(object):

Parameters
----------
path : string or file-like object
Path to xls or xlsx file
io : string, file-like object or xlrd workbook
If a string, expected to be a path to xls or xlsx file
engine: string, default None
If io is not a buffer or path, this must be set to identify io.
Acceptable values are None or xlrd
"""
def __init__(self, path_or_buf, **kwds):
def __init__(self, io, **kwds):

import xlrd # throw an ImportError if we need to

Expand All @@ -106,14 +117,22 @@ def __init__(self, path_or_buf, **kwds):
raise ImportError("pandas requires xlrd >= 0.9.0 for excel "
"support, current version " + xlrd.__VERSION__)

self.path_or_buf = path_or_buf
self.tmpfile = None

if isinstance(path_or_buf, compat.string_types):
self.book = xlrd.open_workbook(path_or_buf)
else:
data = path_or_buf.read()
self.io = io

engine = kwds.pop('engine', None)

if engine is not None and engine != 'xlrd':
raise ValueError("Unknown engine: %s" % engine)

if isinstance(io, compat.string_types):
self.book = xlrd.open_workbook(io)
elif engine == "xlrd" and isinstance(io, xlrd.Book):
self.book = io
elif hasattr(io, "read"):
data = io.read()
self.book = xlrd.open_workbook(file_contents=data)
else:
raise ValueError('Must explicitly set engine if not passing in buffer or path for io.')

def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
index_col=None, parse_cols=None, parse_dates=False,
Expand Down Expand Up @@ -261,9 +280,9 @@ def sheet_names(self):
return self.book.sheet_names()

def close(self):
"""close path_or_buf if necessary"""
if hasattr(self.path_or_buf, 'close'):
self.path_or_buf.close()
"""close io if necessary"""
if hasattr(self.io, 'close'):
self.io.close()

def __enter__(self):
return self
Expand Down
20 changes: 20 additions & 0 deletions pandas/io/tests/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,26 @@ def test_excel_read_buffer(self):
f = open(pth, 'rb')
xl = ExcelFile(f)
xl.parse('Sheet1', index_col=0, parse_dates=True)

def test_read_xlrd_Book(self):
    # Checks that an already-opened xlrd workbook can be handed to both
    # ExcelFile and read_excel (with engine="xlrd") and parses back to the
    # frame that was written.
    # NOTE(review): nesting below was reconstructed from a flattened diff;
    # confirm against the real file.
    _skip_if_no_xlrd()
    _skip_if_no_xlwt()

    import xlrd

    expected = self.frame

    with ensure_clean('__tmp_excel_read_worksheet__.xls') as tmp_path:
        # Write the fixture frame, then open it directly through xlrd.
        expected.to_excel(tmp_path, "SheetA")
        workbook = xlrd.open_workbook(tmp_path)

        # Class-based entry point.
        with ExcelFile(workbook, engine="xlrd") as reader:
            from_excel_file = reader.parse("SheetA")
            tm.assert_frame_equal(expected, from_excel_file)

        # Function entry point, reusing the same workbook object.
        from_read_excel = read_excel(workbook, sheetname="SheetA",
                                     engine="xlrd")
        tm.assert_frame_equal(expected, from_read_excel)

def test_xlsx_table(self):
_skip_if_no_xlrd()
Expand Down