Merge pull request #4962 from cancan101/excel_take_workbook

jtratner · jtratner · commit 0fbd34798731 · 2013-09-26T21:39:57.000-07:00
ENH Change ExcelFile to accept a workbook for the path_or_buf argument.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -133,7 +133,10 @@ Improvements to existing features
     (0.4.3 and 0.5.0) (:issue:`4981`).
   - Better string representations of ``MultiIndex`` (including ability to roundtrip
     via ``repr``). (:issue:`3347`, :issue:`4935`)
-
+  - Both ``ExcelFile`` and ``read_excel`` now accept an ``xlrd.Book`` for the
+    ``io`` (formerly ``path_or_buf``) argument; this requires ``engine`` to be
+    set. (:issue:`4961`)
+    
 API Changes
 ~~~~~~~~~~~
 
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -45,11 +45,13 @@ def get_writer(engine_name):
     except KeyError:
         raise ValueError("No Excel writer '%s'" % engine_name)
 
-def read_excel(path_or_buf, sheetname, **kwds):
+def read_excel(io, sheetname, **kwds):
     """Read an Excel table into a pandas DataFrame
 
     Parameters
     ----------
+    io : string, file-like object or xlrd workbook
+        If a string, expected to be a path to xls or xlsx file
     sheetname : string
          Name of Excel sheet
     header : int, default 0
@@ -74,7 +76,10 @@ def read_excel(path_or_buf, sheetname, **kwds):
         values are overridden, otherwise they're appended to
     verbose : boolean, default False
         Indicate number of NA values placed in non-numeric columns
-
+    engine : string, default None
+        If io is not a buffer or path, this must be set to identify io.
+        Acceptable values are None or 'xlrd'.
+        
     Returns
     -------
     parsed : DataFrame
@@ -84,7 +89,10 @@ def read_excel(path_or_buf, sheetname, **kwds):
         kwds.pop('kind')
         warn("kind keyword is no longer supported in read_excel and may be "
              "removed in a future version", FutureWarning)
-    return ExcelFile(path_or_buf).parse(sheetname=sheetname, **kwds)
+    
+    engine = kwds.pop('engine', None)   
+        
+    return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
 
 
 class ExcelFile(object):
@@ -94,10 +102,13 @@ class ExcelFile(object):
 
     Parameters
     ----------
-    path : string or file-like object
-        Path to xls or xlsx file
+    io : string, file-like object or xlrd workbook
+        If a string, expected to be a path to xls or xlsx file
+    engine : string, default None
+        If io is not a buffer or path, this must be set to identify io.
+        Acceptable values are None or 'xlrd'.
     """
-    def __init__(self, path_or_buf, **kwds):
+    def __init__(self, io, **kwds):
 
         import xlrd  # throw an ImportError if we need to
 
@@ -106,14 +117,22 @@ def __init__(self, path_or_buf, **kwds):
             raise ImportError("pandas requires xlrd >= 0.9.0 for excel "
                               "support, current version " + xlrd.__VERSION__)
 
-        self.path_or_buf = path_or_buf
-        self.tmpfile = None
-
-        if isinstance(path_or_buf, compat.string_types):
-            self.book = xlrd.open_workbook(path_or_buf)
-        else:
-            data = path_or_buf.read()
+        self.io = io
+        
+        engine = kwds.pop('engine', None)
+        
+        if engine is not None and engine != 'xlrd':
+            raise ValueError("Unknown engine: %s" % engine)
+
+        if isinstance(io, compat.string_types):
+            self.book = xlrd.open_workbook(io)
+        elif engine == "xlrd" and isinstance(io, xlrd.Book):
+            self.book = io
+        elif hasattr(io, "read"):
+            data = io.read()
             self.book = xlrd.open_workbook(file_contents=data)
+        else:
+            raise ValueError('Must explicitly set engine if not passing in buffer or path for io.')            
 
     def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
               index_col=None, parse_cols=None, parse_dates=False,
@@ -261,9 +280,9 @@ def sheet_names(self):
         return self.book.sheet_names()
 
     def close(self):
-        """close path_or_buf if necessary"""
-        if hasattr(self.path_or_buf, 'close'):
-            self.path_or_buf.close()
+        """close io if necessary"""
+        if hasattr(self.io, 'close'):
+            self.io.close()
 
     def __enter__(self):
         return self
diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py
@@ -254,6 +254,26 @@ def test_excel_read_buffer(self):
         f = open(pth, 'rb')
         xl = ExcelFile(f)
         xl.parse('Sheet1', index_col=0, parse_dates=True)
+        
+    def test_read_xlrd_Book(self):
+        _skip_if_no_xlrd()
+        _skip_if_no_xlwt()
+        
+        import xlrd
+        
+        pth = '__tmp_excel_read_worksheet__.xls'
+        df = self.frame
+        
+        with ensure_clean(pth) as pth:
+            df.to_excel(pth, "SheetA")
+            book = xlrd.open_workbook(pth)
+            
+            with ExcelFile(book, engine="xlrd") as xl:
+                result = xl.parse("SheetA")
+                tm.assert_frame_equal(df, result)
+
+            result = read_excel(book, sheetname="SheetA", engine="xlrd")
+            tm.assert_frame_equal(df, result)
 
     def test_xlsx_table(self):
         _skip_if_no_xlrd()