read multiple sheets for ods, small PEP8 changes

davidovitch · davidovitch · commit 5148c1b3690b · 2015-03-15T15:33:24.000+01:00
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -73,9 +73,14 @@ class BaseFile(object):
     """ Class for identifying the type of reader
     """
 
-    def __init__(self, try_engine=False):
+    def __init__(self, engine, extensions, io_class, open_workbook,
+                 try_engine=False):
+        self.engine = engine
+        self.extensions = extensions
+        self.io_class = io_class
+        self.open_workbook = open_workbook
         if try_engine:
-            self.has_engine()
+            self.load_engine()
 
     def is_ext(self, path):
         """Verify if the path's extension is supported by the reader
@@ -94,26 +99,29 @@ def is_type(self, io):
         else:
             return False
 
-    def has_engine(self):
-        """Verify if the engine is installed
+    def load_engine(self):
+        """Load the engine if installed
         """
         try:
-            self.load_engine()
+            self._load_engine()
             _readers[self.engine] = True
         except ImportError:
             _readers[self.engine] = False
+        except AttributeError:
+            _readers[self.engine] = False
+            msg = 'Excel engine "%s" is not implemented' % self.engine
+            raise NotImplementedError(msg)
 
 
 class XLRDFile(BaseFile):
 
-    def __init__(self, **kwargs):
-        self.engine = 'xlrd'
-        self.extensions = ['xls', 'xlsx', 'xlsm']
-        self.io_class = type(None)
-        self.open_workbook = None
-        super(XLRDFile, self).__init__(**kwargs)
+    def __init__(self, try_engine=False):
+        # engine and extensions are defined here; io_class and open_workbook
+        # are only set once the engine has been imported
+        args = ('xlrd', ['xls', 'xlsx', 'xlsm'], type(None), None)
+        super(XLRDFile, self).__init__(*args, try_engine=try_engine)
 
-    def load_engine(self):
+    def _load_engine(self):
         import xlrd  # throw an ImportError if we need to
         ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2]))
         if ver < (0, 9):  # pragma: no cover
@@ -126,14 +134,13 @@ def load_engine(self):
 
 class EZODFFile(BaseFile):
 
-    def __init__(self, **kwargs):
-        self.engine = 'ezodf'
-        self.extensions = ['ods']
-        self.io_class = type(None)
-        self.open_workbook = None
-        super(EZODFFile, self).__init__(**kwargs)
+    def __init__(self, try_engine=False):
+        # engine and extensions are defined here; io_class and open_workbook
+        # are only set once the engine has been imported
+        args = ('ezodf', ['ods'], type(None), None)
+        super(EZODFFile, self).__init__(*args, try_engine=try_engine)
 
-    def load_engine(self):
+    def _load_engine(self):
         import ezodf
         self.open_workbook = ezodf.opendoc
         self.io_class = ezodf.document.PackagedDocument
@@ -150,17 +157,17 @@ def read_excel(io, sheetname=0, **kwds):
         and file. For file URLs, a host is expected. For instance, a local
         file could be file://localhost/path/to/workbook.xlsx
     sheetname : string, int, mixed list of strings/ints, or None, default 0
-        
-        Strings are used for sheet names, Integers are used in zero-indexed sheet 
-        positions. 
-        
+
+        Strings are used for sheet names, Integers are used in zero-indexed sheet
+        positions.
+
         Lists of strings/integers are used to request multiple sheets.
-        
+
         Specify None to get all sheets.
-        
+
         str|int -> DataFrame is returned.
         list|None -> Dict of DataFrames is returned, with keys representing sheets.
-               
+
         Available Cases
 
         * Defaults to 0 -> 1st sheet as a DataFrame
@@ -293,19 +300,19 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
         Parameters
         ----------
         sheetname : string, int, mixed list of strings/ints, or None, default 0
-            
-            Strings are used for sheet names, Integers are used in zero-indexed sheet 
-            positions. 
-            
+
+            Strings are used for sheet names, Integers are used in zero-indexed sheet
+            positions.
+
             Lists of strings/integers are used to request multiple sheets.
-            
+
             Specify None to get all sheets.
-            
+
             str|int -> DataFrame is returned.
             list|None -> Dict of DataFrames is returned, with keys representing sheets.
-                   
+
             Available Cases
-    
+
             * Defaults to 0 -> 1st sheet as a DataFrame
             * 1 -> 2nd sheet as a DataFrame
             * "Sheet1" -> 1st sheet as a DataFrame
@@ -426,10 +433,10 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
 
         epoch1904 = self.book.datemode
 
-        def _parse_cell(cell_contents,cell_typ):
+        def _parse_cell(cell_contents, cell_typ):
             """converts the contents of the cell into a pandas
                appropriate object"""
-               
+
             if cell_typ == XL_CELL_DATE:
                 if xlrd_0_9_3:
                     # Use the newer xlrd datetime handling.
@@ -472,10 +479,10 @@ def _parse_cell(cell_contents,cell_typ):
             xlrd_0_9_3 = True
         else:
             xlrd_0_9_3 = False
-        
+
         ret_dict = False
-        
-        #Keep sheetname to maintain backwards compatibility.
+
+        # Keep sheetname to maintain backwards compatibility.
         if isinstance(sheetname, list):
             sheets = sheetname
             ret_dict = True
@@ -484,38 +491,38 @@ def _parse_cell(cell_contents,cell_typ):
             ret_dict = True
         else:
             sheets = [sheetname]
-        
-        #handle same-type duplicates.
+
+        # handle same-type duplicates.
         sheets = list(set(sheets))
-        
+
         output = {}
-        
+
         for asheetname in sheets:
             if verbose:
                 print("Reading sheet %s" % asheetname)
-            
+
             if isinstance(asheetname, compat.string_types):
                 sheet = self.book.sheet_by_name(asheetname)
-            else:  # assume an integer if not a string    
-                sheet = self.book.sheet_by_index(asheetname)   
-            
+            else:  # assume an integer if not a string
+                sheet = self.book.sheet_by_index(asheetname)
+
             data = []
             should_parse = {}
-            
+
             for i in range(sheet.nrows):
                 row = []
                 for j, (value, typ) in enumerate(zip(sheet.row_values(i),
                                                      sheet.row_types(i))):
                     if parse_cols is not None and j not in should_parse:
                         should_parse[j] = self._should_parse(j, parse_cols)
-    
+
                     if parse_cols is None or should_parse[j]:
-                        row.append(_parse_cell(value,typ))
+                        row.append(_parse_cell(value, typ))
                 data.append(row)
-    
+
             if header is not None:
                 data[header] = _trim_excel_header(data[header])
-    
+
             parser = TextParser(data, header=header, index_col=index_col,
                                 has_index_names=has_index_names,
                                 na_values=na_values,
@@ -526,76 +533,103 @@ def _parse_cell(cell_contents,cell_typ):
                                 skip_footer=skip_footer,
                                 chunksize=chunksize,
                                 **kwds)
-            
+
             output[asheetname] = parser.read()
-            
+
         if ret_dict:
             return output
         else:
             return output[asheetname]
-        
 
     def _parse_ods(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                    index_col=None, has_index_names=None, parse_cols=None,
                    parse_dates=False, date_parser=None, na_values=None,
                    thousands=None, chunksize=None, convert_float=True,
-                   **kwds):
-
-        # sheetname can be index or string
-        sheet = self.book.sheets[sheetname]
-
-        data = []
-        should_parse = {}
-        for i in range(sheet.nrows()):
-            row = []
-            for j, cell in enumerate(sheet.row(i)):
-
-                if parse_cols is not None and j not in should_parse:
-                    should_parse[j] = self._should_parse(j, parse_cols)
-
-                if parse_cols is None or should_parse[j]:
-
-                    if isinstance(cell.value, float):
-                        value = cell.value
-                        if convert_float:
-                            # GH5394 - Excel and ODS 'numbers' are always floats
-                            # it's a minimal perf hit and less suprising
-                            # FIXME: this goes wrong when int(cell.value) returns
-                            # a long (>1e18)
-                            val = int(cell.value)
-                            if val == cell.value:
-                                value = val
-                    elif isinstance(cell.value, compat.string_types):
-                        typ = cell.value_type
-#                        if typ == 'string':
-#                            value = cell.value
-                        if typ == 'date' or typ == 'time':
-                            value = self._parse_datetime(cell)
-                        else:
-                            value = cell.value
-                    elif isinstance(cell.value, bool):
-                        value = cell.value
-#                    elif isinstance(cell.value, type(None)):
-#                        value = np.nan
-                    else:
-                        value = np.nan
+                   verbose=False, **kwds):
 
-                    row.append(value)
+        def _parse_cell(cell):
+            """converts the contents of the cell into a pandas
+               appropriate object"""
+            if isinstance(cell.value, float):
+                value = cell.value
+                if convert_float:
+                    # GH5394 - Excel and ODS 'numbers' are always floats
+                    # it's a minimal perf hit and less surprising
+                    # FIXME: this goes wrong when int(cell.value) returns
+                    # a long (>1e18)
+                    val = int(cell.value)
+                    if val == cell.value:
+                        value = val
+            elif isinstance(cell.value, compat.string_types):
+                typ = cell.value_type
+#                if typ == 'string':
+#                    value = cell.value
+                if typ == 'date' or typ == 'time':
+                    value = self._parse_datetime(cell)
+                else:
+                    value = cell.value
+            elif isinstance(cell.value, bool):
+                value = cell.value
+#            elif isinstance(cell.value, type(None)):
+#                value = np.nan
+            else:
+                value = np.nan
+            return value
 
-            data.append(row)
+        ret_dict = False
 
-        parser = TextParser(data, header=header, index_col=index_col,
-                            has_index_names=has_index_names,
-                            na_values=na_values,
-                            thousands=thousands,
-                            parse_dates=parse_dates,
-                            date_parser=date_parser,
-                            skiprows=skiprows,
-                            skip_footer=skip_footer,
-                            chunksize=chunksize,
-                            **kwds)
+        # Keep sheetname to maintain backwards compatibility.
+        if isinstance(sheetname, list):
+            sheets = sheetname
+            ret_dict = True
+        elif sheetname is None:
+            sheets = self.sheet_names
+            ret_dict = True
+        else:
+            sheets = [sheetname]
+
+        # handle same-type duplicates.
+        sheets = list(set(sheets))
+
+        output = {}
+
+        for asheetname in sheets:
+            if verbose:
+                print("Reading sheet %s" % asheetname)
+
+            # sheetname can be index or string
+            sheet = self.book.sheets[asheetname]
+
+            data = []
+            should_parse = {}
+            for i in range(sheet.nrows()):
+                row = []
+                for j, cell in enumerate(sheet.row(i)):
+
+                    if parse_cols is not None and j not in should_parse:
+                        should_parse[j] = self._should_parse(j, parse_cols)
+
+                    if parse_cols is None or should_parse[j]:
+                        row.append(_parse_cell(cell))
 
-        return parser.read()
+                data.append(row)
+
+            parser = TextParser(data, header=header, index_col=index_col,
+                                has_index_names=has_index_names,
+                                na_values=na_values,
+                                thousands=thousands,
+                                parse_dates=parse_dates,
+                                date_parser=date_parser,
+                                skiprows=skiprows,
+                                skip_footer=skip_footer,
+                                chunksize=chunksize,
+                                **kwds)
+            output[asheetname] = parser.read()
+
+        if ret_dict:
+            return output
+        else:
+            return output[asheetname]
 
     def _parse_datetime(self, cell):
         """Parse the date or time from on ods cell to a datetime object.
@@ -609,7 +643,7 @@ def _parse_datetime(self, cell):
         def _value2date(value):
             try:
                 return datetime.datetime.strptime(value, '%Y-%m-%d')
-            except ValueError:#, TypeError):
+            except ValueError:  # , TypeError):
                 return datetime.datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')
 
         # Technically it is not necessary to try to derive the date/time
@@ -643,7 +677,7 @@ def _value2date(value):
             value = _value2date(cell.value)
         elif cell.value_type == 'time':
             try:
-                # FIXME: what if the decimal separator is a comma in the locale?
+                # FIXME: what if the locale's decimal separator is a comma?
                 value = datetime.datetime.strptime(cell.value, 'PT%HH%MM%S.%fS')
             except ValueError:
                 value = datetime.datetime.strptime(cell.value, 'PT%HH%MM%SS')
@@ -657,9 +691,9 @@ def _print_ods_cellinfo(self, cell):
         Cell attributes are documented here:
         https://pythonhosted.org/ezodf/tableobjects.html#id2
         """
-        print('   plaintext:', cell.plaintext()) # no formatting
+        print('   plaintext:', cell.plaintext())  # no formatting
         # formatted, but what is difference with value?
-        print('display_form:', cell.display_form) # format, ?=plaintext
+        print('display_form:', cell.display_form)  # format, ?=plaintext
         print('       value:', cell.value)       # data handled
         print('  value_type:', cell.value_type)  # data type
         print('     formula:', cell.formula)