Excel Reader Refactor - Base Class Introduction (#24829)

WillAyd · jreback · commit 95415406d191 · 2019-01-26T10:34:46.000-05:00
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -375,60 +375,25 @@ def read_excel(io,
         **kwds)
 
 
-class _XlrdReader(object):
-
-    def __init__(self, filepath_or_buffer):
-        """Reader using xlrd engine.
-
-        Parameters
-        ----------
-        filepath_or_buffer : string, path object or Workbook
-            Object to be parsed.
-        """
-        err_msg = "Install xlrd >= 1.0.0 for Excel support"
-
-        try:
-            import xlrd
-        except ImportError:
-            raise ImportError(err_msg)
-        else:
-            if xlrd.__VERSION__ < LooseVersion("1.0.0"):
-                raise ImportError(err_msg +
-                                  ". Current version " + xlrd.__VERSION__)
+@add_metaclass(abc.ABCMeta)
+class _BaseExcelReader(object):
 
-        # If filepath_or_buffer is a url, want to keep the data as bytes so
-        # can't pass to get_filepath_or_buffer()
-        if _is_url(filepath_or_buffer):
-            filepath_or_buffer = _urlopen(filepath_or_buffer)
-        elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
-            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
-                filepath_or_buffer)
+    @property
+    @abc.abstractmethod
+    def sheet_names(self):
+        pass
 
-        if isinstance(filepath_or_buffer, xlrd.Book):
-            self.book = filepath_or_buffer
-        elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr(
-                filepath_or_buffer, "read"):
-            # N.B. xlrd.Book has a read attribute too
-            if hasattr(filepath_or_buffer, 'seek'):
-                try:
-                    # GH 19779
-                    filepath_or_buffer.seek(0)
-                except UnsupportedOperation:
-                    # HTTPResponse does not support seek()
-                    # GH 20434
-                    pass
+    @abc.abstractmethod
+    def get_sheet_by_name(self, name):
+        pass
 
-            data = filepath_or_buffer.read()
-            self.book = xlrd.open_workbook(file_contents=data)
-        elif isinstance(filepath_or_buffer, compat.string_types):
-            self.book = xlrd.open_workbook(filepath_or_buffer)
-        else:
-            raise ValueError('Must explicitly set engine if not passing in'
-                             ' buffer or path for io.')
+    @abc.abstractmethod
+    def get_sheet_by_index(self, index):
+        pass
 
-    @property
-    def sheet_names(self):
-        return self.book.sheet_names()
+    @abc.abstractmethod
+    def get_sheet_data(self, sheet, convert_float):
+        pass
 
     def parse(self,
               sheet_name=0,
@@ -455,56 +420,14 @@ def parse(self,
 
         _validate_header_arg(header)
 
-        from xlrd import (xldate, XL_CELL_DATE,
-                          XL_CELL_ERROR, XL_CELL_BOOLEAN,
-                          XL_CELL_NUMBER)
-
-        epoch1904 = self.book.datemode
-
-        def _parse_cell(cell_contents, cell_typ):
-            """converts the contents of the cell into a pandas
-               appropriate object"""
-
-            if cell_typ == XL_CELL_DATE:
-
-                # Use the newer xlrd datetime handling.
-                try:
-                    cell_contents = xldate.xldate_as_datetime(
-                        cell_contents, epoch1904)
-                except OverflowError:
-                    return cell_contents
-
-                # Excel doesn't distinguish between dates and time,
-                # so we treat dates on the epoch as times only.
-                # Also, Excel supports 1900 and 1904 epochs.
-                year = (cell_contents.timetuple())[0:3]
-                if ((not epoch1904 and year == (1899, 12, 31)) or
-                        (epoch1904 and year == (1904, 1, 1))):
-                    cell_contents = time(cell_contents.hour,
-                                         cell_contents.minute,
-                                         cell_contents.second,
-                                         cell_contents.microsecond)
-
-            elif cell_typ == XL_CELL_ERROR:
-                cell_contents = np.nan
-            elif cell_typ == XL_CELL_BOOLEAN:
-                cell_contents = bool(cell_contents)
-            elif convert_float and cell_typ == XL_CELL_NUMBER:
-                # GH5394 - Excel 'numbers' are always floats
-                # it's a minimal perf hit and less surprising
-                val = int(cell_contents)
-                if val == cell_contents:
-                    cell_contents = val
-            return cell_contents
-
         ret_dict = False
 
         # Keep sheetname to maintain backwards compatibility.
         if isinstance(sheet_name, list):
             sheets = sheet_name
             ret_dict = True
         elif sheet_name is None:
-            sheets = self.book.sheet_names()
+            sheets = self.sheet_names
             ret_dict = True
         else:
             sheets = [sheet_name]
@@ -519,19 +442,13 @@ def _parse_cell(cell_contents, cell_typ):
                 print("Reading sheet {sheet}".format(sheet=asheetname))
 
             if isinstance(asheetname, compat.string_types):
-                sheet = self.book.sheet_by_name(asheetname)
+                sheet = self.get_sheet_by_name(asheetname)
             else:  # assume an integer if not a string
-                sheet = self.book.sheet_by_index(asheetname)
+                sheet = self.get_sheet_by_index(asheetname)
 
-            data = []
+            data = self.get_sheet_data(sheet, convert_float)
             usecols = _maybe_convert_usecols(usecols)
 
-            for i in range(sheet.nrows):
-                row = [_parse_cell(value, typ)
-                       for value, typ in zip(sheet.row_values(i),
-                                             sheet.row_types(i))]
-                data.append(row)
-
             if sheet.nrows == 0:
                 output[asheetname] = DataFrame()
                 continue
@@ -620,6 +537,120 @@ def _parse_cell(cell_contents, cell_typ):
             return output[asheetname]
 
 
+class _XlrdReader(_BaseExcelReader):
+
+    def __init__(self, filepath_or_buffer):
+        """Reader using xlrd engine.
+
+        Parameters
+        ----------
+        filepath_or_buffer : string, path object or Workbook
+            Object to be parsed.
+        """
+        err_msg = "Install xlrd >= 1.0.0 for Excel support"
+
+        try:
+            import xlrd
+        except ImportError:
+            raise ImportError(err_msg)
+        else:
+            if xlrd.__VERSION__ < LooseVersion("1.0.0"):
+                raise ImportError(err_msg +
+                                  ". Current version " + xlrd.__VERSION__)
+
+        # If filepath_or_buffer is a url, want to keep the data as bytes so
+        # can't pass to get_filepath_or_buffer()
+        if _is_url(filepath_or_buffer):
+            filepath_or_buffer = _urlopen(filepath_or_buffer)
+        elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
+            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
+                filepath_or_buffer)
+
+        if isinstance(filepath_or_buffer, xlrd.Book):
+            self.book = filepath_or_buffer
+        elif hasattr(filepath_or_buffer, "read"):
+            # N.B. xlrd.Book has a read attribute too
+            if hasattr(filepath_or_buffer, 'seek'):
+                try:
+                    # GH 19779
+                    filepath_or_buffer.seek(0)
+                except UnsupportedOperation:
+                    # HTTPResponse does not support seek()
+                    # GH 20434
+                    pass
+
+            data = filepath_or_buffer.read()
+            self.book = xlrd.open_workbook(file_contents=data)
+        elif isinstance(filepath_or_buffer, compat.string_types):
+            self.book = xlrd.open_workbook(filepath_or_buffer)
+        else:
+            raise ValueError('Must explicitly set engine if not passing in'
+                             ' buffer or path for io.')
+
+    @property
+    def sheet_names(self):
+        return self.book.sheet_names()
+
+    def get_sheet_by_name(self, name):
+        return self.book.sheet_by_name(name)
+
+    def get_sheet_by_index(self, index):
+        return self.book.sheet_by_index(index)
+
+    def get_sheet_data(self, sheet, convert_float):
+        from xlrd import (xldate, XL_CELL_DATE,
+                          XL_CELL_ERROR, XL_CELL_BOOLEAN,
+                          XL_CELL_NUMBER)
+
+        epoch1904 = self.book.datemode
+
+        def _parse_cell(cell_contents, cell_typ):
+            """converts the contents of the cell into a pandas
+               appropriate object"""
+
+            if cell_typ == XL_CELL_DATE:
+
+                # Use the newer xlrd datetime handling.
+                try:
+                    cell_contents = xldate.xldate_as_datetime(
+                        cell_contents, epoch1904)
+                except OverflowError:
+                    return cell_contents
+
+                # Excel doesn't distinguish between dates and time,
+                # so we treat dates on the epoch as times only.
+                # Also, Excel supports 1900 and 1904 epochs.
+                year = (cell_contents.timetuple())[0:3]
+                if ((not epoch1904 and year == (1899, 12, 31)) or
+                        (epoch1904 and year == (1904, 1, 1))):
+                    cell_contents = time(cell_contents.hour,
+                                         cell_contents.minute,
+                                         cell_contents.second,
+                                         cell_contents.microsecond)
+
+            elif cell_typ == XL_CELL_ERROR:
+                cell_contents = np.nan
+            elif cell_typ == XL_CELL_BOOLEAN:
+                cell_contents = bool(cell_contents)
+            elif convert_float and cell_typ == XL_CELL_NUMBER:
+                # GH5394 - Excel 'numbers' are always floats
+                # it's a minimal perf hit and less surprising
+                val = int(cell_contents)
+                if val == cell_contents:
+                    cell_contents = val
+            return cell_contents
+
+        data = []
+
+        for i in range(sheet.nrows):
+            row = [_parse_cell(value, typ)
+                   for value, typ in zip(sheet.row_values(i),
+                                         sheet.row_types(i))]
+            data.append(row)
+
+        return data
+
+
 class ExcelFile(object):
     """
     Class for parsing tabular excel sheets into DataFrame objects.