From 9103322afa82fd27b7d960529f6e1bfd2111424b Mon Sep 17 00:00:00 2001 From: Alex Rothberg Date: Wed, 31 Jul 2013 09:04:07 -0400 Subject: [PATCH] ENH Change ExcelFile and read_excel to accept a workbook for the io (formerly path_or_buf) argument; this requires engine to be set. (GH4961) --- doc/source/release.rst | 5 +++- pandas/io/excel.py | 51 ++++++++++++++++++++++++----------- pandas/io/tests/test_excel.py | 20 ++++++++++++++ 3 files changed, 59 insertions(+), 17 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 34720c49b163b..659947cae1ea7 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -133,7 +133,10 @@ Improvements to existing features (0.4.3 and 0.5.0) (:issue:`4981`). - Better string representations of ``MultiIndex`` (including ability to roundtrip via ``repr``). (:issue:`3347`, :issue:`4935`) - + - Both ExcelFile and read_excel to accept an xlrd.Book for the io + (formerly path_or_buf) argument; this requires engine to be set. + (:issue:`4961`). + API Changes ~~~~~~~~~~~ diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 02dbc381a10be..6b83fada19001 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -45,11 +45,13 @@ def get_writer(engine_name): except KeyError: raise ValueError("No Excel writer '%s'" % engine_name) -def read_excel(path_or_buf, sheetname, **kwds): +def read_excel(io, sheetname, **kwds): """Read an Excel table into a pandas DataFrame Parameters ---------- + io : string, file-like object or xlrd workbook + If a string, expected to be a path to xls or xlsx file sheetname : string Name of Excel sheet header : int, default 0 @@ -74,7 +76,10 @@ def read_excel(path_or_buf, sheetname, **kwds): values are overridden, otherwise they're appended to verbose : boolean, default False Indicate number of NA values placed in non-numeric columns - + engine: string, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None or xlrd + Returns ------- parsed : DataFrame @@ -84,7 +89,10 @@ def read_excel(path_or_buf, sheetname, **kwds): kwds.pop('kind') warn("kind keyword is no longer supported in read_excel and may be " "removed in a future version", FutureWarning) - return ExcelFile(path_or_buf).parse(sheetname=sheetname, **kwds) + + engine = kwds.pop('engine', None) + + return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds) class ExcelFile(object): @@ -94,10 +102,13 @@ class ExcelFile(object): Parameters ---------- - path : string or file-like object - Path to xls or xlsx file + io : string, file-like object or xlrd workbook + If a string, expected to be a path to xls or xlsx file + engine: string, default None + If io is not a buffer or path, this must be set to identify io. + Acceptable values are None or xlrd """ - def __init__(self, path_or_buf, **kwds): + def __init__(self, io, **kwds): import xlrd # throw an ImportError if we need to @@ -106,14 +117,22 @@ def __init__(self, path_or_buf, **kwds): raise ImportError("pandas requires xlrd >= 0.9.0 for excel " "support, current version " + xlrd.__VERSION__) - self.path_or_buf = path_or_buf - self.tmpfile = None - - if isinstance(path_or_buf, compat.string_types): - self.book = xlrd.open_workbook(path_or_buf) - else: - data = path_or_buf.read() + self.io = io + + engine = kwds.pop('engine', None) + + if engine is not None and engine != 'xlrd': + raise ValueError("Unknown engine: %s" % engine) + + if isinstance(io, compat.string_types): + self.book = xlrd.open_workbook(io) + elif engine == "xlrd" and isinstance(io, xlrd.Book): + self.book = io + elif hasattr(io, "read"): + data = io.read() self.book = xlrd.open_workbook(file_contents=data) + else: + raise ValueError('Must explicitly set engine if not passing in buffer or path for io.') def parse(self, sheetname, header=0, skiprows=None, skip_footer=0, index_col=None, parse_cols=None, parse_dates=False, @@ -261,9 +280,9 @@ def sheet_names(self): return self.book.sheet_names() def close(self): - """close path_or_buf if necessary""" - if hasattr(self.path_or_buf, 'close'): - self.path_or_buf.close() + """close io if necessary""" + if hasattr(self.io, 'close'): + self.io.close() def __enter__(self): return self diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 2bcf4789412f6..cd101d325f21d 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -254,6 +254,26 @@ def test_excel_read_buffer(self): f = open(pth, 'rb') xl = ExcelFile(f) xl.parse('Sheet1', index_col=0, parse_dates=True) + + def test_read_xlrd_Book(self): + _skip_if_no_xlrd() + _skip_if_no_xlwt() + + import xlrd + + pth = '__tmp_excel_read_worksheet__.xls' + df = self.frame + + with ensure_clean(pth) as pth: + df.to_excel(pth, "SheetA") + book = xlrd.open_workbook(pth) + + with ExcelFile(book, engine="xlrd") as xl: + result = xl.parse("SheetA") + tm.assert_frame_equal(df, result) + + result = read_excel(book, sheetname="SheetA", engine="xlrd") + tm.assert_frame_equal(df, result) def test_xlsx_table(self): _skip_if_no_xlrd()