From 98f65b19f918b825ae6cb1001cd8e91f45377cc7 Mon Sep 17 00:00:00 2001 From: y-p Date: Mon, 25 Mar 2013 04:28:53 +0200 Subject: [PATCH] ENH: Use xlrd >=0.9.0 for both xls/xlsx, sidesteps GH1629 PTF --- README.rst | 3 +- pandas/io/parsers.py | 103 ++++++---------------------------- pandas/io/tests/test_excel.py | 25 --------- 3 files changed, 19 insertions(+), 112 deletions(-) diff --git a/README.rst b/README.rst index 5145f801fc6eb..c9b70f07b0862 100644 --- a/README.rst +++ b/README.rst @@ -87,7 +87,8 @@ Optional dependencies * `statsmodels `__ * Needed for parts of :mod:`pandas.stats` * `openpyxl `__, `xlrd/xlwt `__ - * openpyxl version 1.6.1 or higher + * openpyxl version 1.6.1 or higher, for writing .xlsx files + * xlrd >= 0.9.0 * Needed for Excel I/O diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 60798bacbc144..fda96248c2a75 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1803,64 +1803,32 @@ def _make_reader(self, f): #---------------------------------------------------------------------- # ExcelFile class -_openpyxl_msg = ("\nFor parsing .xlsx files 'openpyxl' is required.\n" - "You can install it via 'easy_install openpyxl' or " - "'pip install openpyxl'.\nAlternatively, you could save" - " the .xlsx file as a .xls file.\n") - - class ExcelFile(object): """ Class for parsing tabular excel sheets into DataFrame objects. - Uses xlrd for parsing .xls files or openpyxl for .xlsx files. - See ExcelFile.parse for more documentation + Uses xlrd. See ExcelFile.parse for more documentation Parameters ---------- path : string or file-like object Path to xls or xlsx file - kind : {'xls', 'xlsx', None}, default None """ - def __init__(self, path_or_buf, kind=None): + def __init__(self, path_or_buf, kind=None, **kwds): self.kind = kind - self.use_xlsx = kind == 'xls' + + import xlrd # throw an ImportError if we need to + ver = tuple(map(int,xlrd.__VERSION__.split(".")[:2])) + if ver < (0, 9): + raise ImportError("pandas requires xlrd >= 0.9.0 for excel support") self.path_or_buf = path_or_buf self.tmpfile = None if isinstance(path_or_buf, basestring): - if kind == 'xls' or (kind is None and - path_or_buf.endswith('.xls')): - self.use_xlsx = False - import xlrd - self.book = xlrd.open_workbook(path_or_buf) - else: - self.use_xlsx = True - try: - from openpyxl.reader.excel import load_workbook - self.book = load_workbook(path_or_buf, use_iterators=True) - except ImportError: # pragma: no cover - raise ImportError(_openpyxl_msg) + self.book = xlrd.open_workbook(path_or_buf) else: data = path_or_buf.read() - - if self.kind == 'xls': - import xlrd - self.book = xlrd.open_workbook(file_contents=data) - elif self.kind == 'xlsx': - from openpyxl.reader.excel import load_workbook - buf = py3compat.BytesIO(data) - self.book = load_workbook(buf, use_iterators=True) - else: - try: - import xlrd - self.book = xlrd.open_workbook(file_contents=data) - self.use_xlsx = False - except Exception: - self.use_xlsx = True - from openpyxl.reader.excel import load_workbook - buf = py3compat.BytesIO(data) - self.book = load_workbook(buf, use_iterators=True) + self.book = xlrd.open_workbook(file_contents=data) def __repr__(self): return object.__repr__(self) @@ -1908,9 +1876,7 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0, if skipfooter is not None: skip_footer = skipfooter - choose = {True: self._parse_xlsx, - False: self._parse_xls} - return choose[self.use_xlsx](sheetname, header=header, + return self._parse_excel(sheetname, header=header, skiprows=skiprows, index_col=index_col, has_index_names=has_index_names, parse_cols=parse_cols, @@ -1953,47 +1919,12 @@ def _excel2num(x): else: return i in parse_cols - def _parse_xlsx(self, sheetname, header=0, skiprows=None, - skip_footer=0, index_col=None, has_index_names=False, - parse_cols=None, parse_dates=False, date_parser=None, - na_values=None, thousands=None, chunksize=None): - sheet = self.book.get_sheet_by_name(name=sheetname) - data = [] - - # it brings a new method: iter_rows() - should_parse = {} - - for row in sheet.iter_rows(): - row_data = [] - for j, cell in enumerate(row): - - if parse_cols is not None and j not in should_parse: - should_parse[j] = self._should_parse(j, parse_cols) - - if parse_cols is None or should_parse[j]: - row_data.append(cell.internal_value) - data.append(row_data) - - if header is not None: - data[header] = _trim_excel_header(data[header]) - - parser = TextParser(data, header=header, index_col=index_col, - has_index_names=has_index_names, - na_values=na_values, - thousands=thousands, - parse_dates=parse_dates, - date_parser=date_parser, - skiprows=skiprows, - skip_footer=skip_footer, - chunksize=chunksize) - - return parser.read() - - def _parse_xls(self, sheetname, header=0, skiprows=None, + def _parse_excel(self, sheetname, header=0, skiprows=None, skip_footer=0, index_col=None, has_index_names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, chunksize=None): - from xlrd import xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR + from xlrd import (xldate_as_tuple, XL_CELL_DATE, + XL_CELL_ERROR, XL_CELL_BOOLEAN) datemode = self.book.datemode sheet = self.book.sheet_by_name(sheetname) @@ -2015,9 +1946,12 @@ def _parse_xls(self, sheetname, header=0, skiprows=None, value = datetime.time(*dt[3:]) else: value = datetime.datetime(*dt) - if typ == XL_CELL_ERROR: + elif typ == XL_CELL_ERROR: value = np.nan + elif typ == XL_CELL_BOOLEAN: + value = bool(value) row.append(value) + data.append(row) if header is not None: @@ -2037,9 +1971,6 @@ def _parse_xls(self, sheetname, header=0, skiprows=None, @property def sheet_names(self): - if self.use_xlsx: - return self.book.get_sheet_names() - else: return self.book.sheet_names() diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index ccd9cbc56b2a5..ee2d265690221 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -245,18 +245,6 @@ def test_specify_kind_xls(self): # self.assertRaises(Exception, ExcelFile, open(xlsx_file, 'rb'), # kind='xls') - def test_specify_kind_xlsx(self): - _skip_if_no_openpyxl() - xlsx_file = os.path.join(self.dirpath, 'test.xlsx') - xls_file = os.path.join(self.dirpath, 'test.xls') - - self.assertRaises(Exception, ExcelFile, xls_file, kind='xlsx') - - ExcelFile(open(xlsx_file, 'rb'), kind='xlsx') - - self.assertRaises(Exception, ExcelFile, open(xls_file, 'rb'), - kind='xlsx') - def read_csv(self, *args, **kwds): kwds = kwds.copy() kwds['engine'] = 'python' @@ -545,19 +533,6 @@ def test_excel_roundtrip_datetime(self): recons = reader.parse('test1') tm.assert_frame_equal(self.tsframe, recons) - def test_excel_roundtrip_bool(self): - _skip_if_no_openpyxl() - - # Test roundtrip np.bool8, does not seem to work for xls - path = '__tmp_excel_roundtrip_bool__.xlsx' - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - with ensure_clean(path) as path: - - frame.to_excel(path, 'test1') - reader = ExcelFile(path) - recons = reader.parse('test1') - tm.assert_frame_equal(frame, recons) - def test_to_excel_periodindex(self): _skip_if_no_excelsuite()