Skip to content

ENH: Use xlrd >=0.9.0 for both xls/xlsx, sidesteps GH1629 #3164

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
1 commit merged on Apr 23, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,8 @@ Optional dependencies
* `statsmodels <http://statsmodels.sourceforge.net/>`__
* Needed for parts of :mod:`pandas.stats`
* `openpyxl <http://packages.python.org/openpyxl/>`__, `xlrd/xlwt <http://www.python-excel.org/>`__
* openpyxl version 1.6.1 or higher
* openpyxl version 1.6.1 or higher, for writing .xlsx files
* xlrd >= 0.9.0
* Needed for Excel I/O


Expand Down
103 changes: 17 additions & 86 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1803,64 +1803,32 @@ def _make_reader(self, f):
#----------------------------------------------------------------------
# ExcelFile class

_openpyxl_msg = ("\nFor parsing .xlsx files 'openpyxl' is required.\n"
"You can install it via 'easy_install openpyxl' or "
"'pip install openpyxl'.\nAlternatively, you could save"
" the .xlsx file as a .xls file.\n")


class ExcelFile(object):
"""
Class for parsing tabular excel sheets into DataFrame objects.
Uses xlrd for parsing .xls files or openpyxl for .xlsx files.
See ExcelFile.parse for more documentation
Uses xlrd. See ExcelFile.parse for more documentation

Parameters
----------
path : string or file-like object
Path to xls or xlsx file
kind : {'xls', 'xlsx', None}, default None
"""
def __init__(self, path_or_buf, kind=None):
def __init__(self, path_or_buf, kind=None, **kwds):
self.kind = kind
self.use_xlsx = kind == 'xls'

import xlrd # throw an ImportError if we need to
ver = tuple(map(int,xlrd.__VERSION__.split(".")[:2]))
if ver < (0, 9):
raise ImportError("pandas requires xlrd >= 0.9.0 for excel support")

self.path_or_buf = path_or_buf
self.tmpfile = None

if isinstance(path_or_buf, basestring):
if kind == 'xls' or (kind is None and
path_or_buf.endswith('.xls')):
self.use_xlsx = False
import xlrd
self.book = xlrd.open_workbook(path_or_buf)
else:
self.use_xlsx = True
try:
from openpyxl.reader.excel import load_workbook
self.book = load_workbook(path_or_buf, use_iterators=True)
except ImportError: # pragma: no cover
raise ImportError(_openpyxl_msg)
self.book = xlrd.open_workbook(path_or_buf)
else:
data = path_or_buf.read()

if self.kind == 'xls':
import xlrd
self.book = xlrd.open_workbook(file_contents=data)
elif self.kind == 'xlsx':
from openpyxl.reader.excel import load_workbook
buf = py3compat.BytesIO(data)
self.book = load_workbook(buf, use_iterators=True)
else:
try:
import xlrd
self.book = xlrd.open_workbook(file_contents=data)
self.use_xlsx = False
except Exception:
self.use_xlsx = True
from openpyxl.reader.excel import load_workbook
buf = py3compat.BytesIO(data)
self.book = load_workbook(buf, use_iterators=True)
self.book = xlrd.open_workbook(file_contents=data)

def __repr__(self):
return object.__repr__(self)
Expand Down Expand Up @@ -1908,9 +1876,7 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
if skipfooter is not None:
skip_footer = skipfooter

choose = {True: self._parse_xlsx,
False: self._parse_xls}
return choose[self.use_xlsx](sheetname, header=header,
return self._parse_excel(sheetname, header=header,
skiprows=skiprows, index_col=index_col,
has_index_names=has_index_names,
parse_cols=parse_cols,
Expand Down Expand Up @@ -1953,47 +1919,12 @@ def _excel2num(x):
else:
return i in parse_cols

def _parse_xlsx(self, sheetname, header=0, skiprows=None,
skip_footer=0, index_col=None, has_index_names=False,
parse_cols=None, parse_dates=False, date_parser=None,
na_values=None, thousands=None, chunksize=None):
sheet = self.book.get_sheet_by_name(name=sheetname)
data = []

# it brings a new method: iter_rows()
should_parse = {}

for row in sheet.iter_rows():
row_data = []
for j, cell in enumerate(row):

if parse_cols is not None and j not in should_parse:
should_parse[j] = self._should_parse(j, parse_cols)

if parse_cols is None or should_parse[j]:
row_data.append(cell.internal_value)
data.append(row_data)

if header is not None:
data[header] = _trim_excel_header(data[header])

parser = TextParser(data, header=header, index_col=index_col,
has_index_names=has_index_names,
na_values=na_values,
thousands=thousands,
parse_dates=parse_dates,
date_parser=date_parser,
skiprows=skiprows,
skip_footer=skip_footer,
chunksize=chunksize)

return parser.read()

def _parse_xls(self, sheetname, header=0, skiprows=None,
def _parse_excel(self, sheetname, header=0, skiprows=None,
skip_footer=0, index_col=None, has_index_names=None,
parse_cols=None, parse_dates=False, date_parser=None,
na_values=None, thousands=None, chunksize=None):
from xlrd import xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR
from xlrd import (xldate_as_tuple, XL_CELL_DATE,
XL_CELL_ERROR, XL_CELL_BOOLEAN)

datemode = self.book.datemode
sheet = self.book.sheet_by_name(sheetname)
Expand All @@ -2015,9 +1946,12 @@ def _parse_xls(self, sheetname, header=0, skiprows=None,
value = datetime.time(*dt[3:])
else:
value = datetime.datetime(*dt)
if typ == XL_CELL_ERROR:
elif typ == XL_CELL_ERROR:
value = np.nan
elif typ == XL_CELL_BOOLEAN:
value = bool(value)
row.append(value)

data.append(row)

if header is not None:
Expand All @@ -2037,9 +1971,6 @@ def _parse_xls(self, sheetname, header=0, skiprows=None,

@property
def sheet_names(self):
if self.use_xlsx:
return self.book.get_sheet_names()
else:
return self.book.sheet_names()


Expand Down
25 changes: 0 additions & 25 deletions pandas/io/tests/test_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,18 +245,6 @@ def test_specify_kind_xls(self):
# self.assertRaises(Exception, ExcelFile, open(xlsx_file, 'rb'),
# kind='xls')

def test_specify_kind_xlsx(self):
_skip_if_no_openpyxl()
xlsx_file = os.path.join(self.dirpath, 'test.xlsx')
xls_file = os.path.join(self.dirpath, 'test.xls')

self.assertRaises(Exception, ExcelFile, xls_file, kind='xlsx')

ExcelFile(open(xlsx_file, 'rb'), kind='xlsx')

self.assertRaises(Exception, ExcelFile, open(xls_file, 'rb'),
kind='xlsx')

def read_csv(self, *args, **kwds):
kwds = kwds.copy()
kwds['engine'] = 'python'
Expand Down Expand Up @@ -545,19 +533,6 @@ def test_excel_roundtrip_datetime(self):
recons = reader.parse('test1')
tm.assert_frame_equal(self.tsframe, recons)

def test_excel_roundtrip_bool(self):
_skip_if_no_openpyxl()

# Test roundtrip np.bool8, does not seem to work for xls
path = '__tmp_excel_roundtrip_bool__.xlsx'
frame = (DataFrame(np.random.randn(10, 2)) >= 0)
with ensure_clean(path) as path:

frame.to_excel(path, 'test1')
reader = ExcelFile(path)
recons = reader.parse('test1')
tm.assert_frame_equal(frame, recons)

def test_to_excel_periodindex(self):
_skip_if_no_excelsuite()

Expand Down