Skip to content

Commit 98f65b1

Browse files
author
y-p
committed
ENH: Use xlrd >=0.9.0 for both xls/xlsx, sidesteps GH1629
PTF
1 parent 48f2587 commit 98f65b1

File tree

3 files changed

+19
-112
lines changed

3 files changed

+19
-112
lines changed

README.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,8 @@ Optional dependencies
8787
* `statsmodels <http://statsmodels.sourceforge.net/>`__
8888
* Needed for parts of :mod:`pandas.stats`
8989
* `openpyxl <http://packages.python.org/openpyxl/>`__, `xlrd/xlwt <http://www.python-excel.org/>`__
90-
* openpyxl version 1.6.1 or higher
90+
* openpyxl version 1.6.1 or higher, for writing .xlsx files
91+
* xlrd >= 0.9.0, for reading .xls and .xlsx files
9192
* Needed for Excel I/O
9293

9394

pandas/io/parsers.py

+17-86
Original file line numberDiff line numberDiff line change
@@ -1803,64 +1803,32 @@ def _make_reader(self, f):
18031803
#----------------------------------------------------------------------
18041804
# ExcelFile class
18051805

1806-
_openpyxl_msg = ("\nFor parsing .xlsx files 'openpyxl' is required.\n"
1807-
"You can install it via 'easy_install openpyxl' or "
1808-
"'pip install openpyxl'.\nAlternatively, you could save"
1809-
" the .xlsx file as a .xls file.\n")
1810-
1811-
18121806
class ExcelFile(object):
18131807
"""
18141808
Class for parsing tabular excel sheets into DataFrame objects.
1815-
Uses xlrd for parsing .xls files or openpyxl for .xlsx files.
1816-
See ExcelFile.parse for more documentation
1809+
Uses xlrd for both .xls and .xlsx files. See ExcelFile.parse for more documentation.
18171810
18181811
Parameters
18191812
----------
18201813
path : string or file-like object
18211814
Path to xls or xlsx file
1822-
kind : {'xls', 'xlsx', None}, default None
18231815
"""
1824-
def __init__(self, path_or_buf, kind=None):
1816+
def __init__(self, path_or_buf, kind=None, **kwds):
18251817
self.kind = kind
1826-
self.use_xlsx = kind == 'xls'
1818+
1819+
import xlrd # throw an ImportError if we need to
1820+
ver = tuple(map(int,xlrd.__VERSION__.split(".")[:2]))
1821+
if ver < (0, 9):
1822+
raise ImportError("pandas requires xlrd >= 0.9.0 for excel support")
18271823

18281824
self.path_or_buf = path_or_buf
18291825
self.tmpfile = None
18301826

18311827
if isinstance(path_or_buf, basestring):
1832-
if kind == 'xls' or (kind is None and
1833-
path_or_buf.endswith('.xls')):
1834-
self.use_xlsx = False
1835-
import xlrd
1836-
self.book = xlrd.open_workbook(path_or_buf)
1837-
else:
1838-
self.use_xlsx = True
1839-
try:
1840-
from openpyxl.reader.excel import load_workbook
1841-
self.book = load_workbook(path_or_buf, use_iterators=True)
1842-
except ImportError: # pragma: no cover
1843-
raise ImportError(_openpyxl_msg)
1828+
self.book = xlrd.open_workbook(path_or_buf)
18441829
else:
18451830
data = path_or_buf.read()
1846-
1847-
if self.kind == 'xls':
1848-
import xlrd
1849-
self.book = xlrd.open_workbook(file_contents=data)
1850-
elif self.kind == 'xlsx':
1851-
from openpyxl.reader.excel import load_workbook
1852-
buf = py3compat.BytesIO(data)
1853-
self.book = load_workbook(buf, use_iterators=True)
1854-
else:
1855-
try:
1856-
import xlrd
1857-
self.book = xlrd.open_workbook(file_contents=data)
1858-
self.use_xlsx = False
1859-
except Exception:
1860-
self.use_xlsx = True
1861-
from openpyxl.reader.excel import load_workbook
1862-
buf = py3compat.BytesIO(data)
1863-
self.book = load_workbook(buf, use_iterators=True)
1831+
self.book = xlrd.open_workbook(file_contents=data)
18641832

18651833
def __repr__(self):
18661834
return object.__repr__(self)
@@ -1908,9 +1876,7 @@ def parse(self, sheetname, header=0, skiprows=None, skip_footer=0,
19081876
if skipfooter is not None:
19091877
skip_footer = skipfooter
19101878

1911-
choose = {True: self._parse_xlsx,
1912-
False: self._parse_xls}
1913-
return choose[self.use_xlsx](sheetname, header=header,
1879+
return self._parse_excel(sheetname, header=header,
19141880
skiprows=skiprows, index_col=index_col,
19151881
has_index_names=has_index_names,
19161882
parse_cols=parse_cols,
@@ -1953,47 +1919,12 @@ def _excel2num(x):
19531919
else:
19541920
return i in parse_cols
19551921

1956-
def _parse_xlsx(self, sheetname, header=0, skiprows=None,
1957-
skip_footer=0, index_col=None, has_index_names=False,
1958-
parse_cols=None, parse_dates=False, date_parser=None,
1959-
na_values=None, thousands=None, chunksize=None):
1960-
sheet = self.book.get_sheet_by_name(name=sheetname)
1961-
data = []
1962-
1963-
# it brings a new method: iter_rows()
1964-
should_parse = {}
1965-
1966-
for row in sheet.iter_rows():
1967-
row_data = []
1968-
for j, cell in enumerate(row):
1969-
1970-
if parse_cols is not None and j not in should_parse:
1971-
should_parse[j] = self._should_parse(j, parse_cols)
1972-
1973-
if parse_cols is None or should_parse[j]:
1974-
row_data.append(cell.internal_value)
1975-
data.append(row_data)
1976-
1977-
if header is not None:
1978-
data[header] = _trim_excel_header(data[header])
1979-
1980-
parser = TextParser(data, header=header, index_col=index_col,
1981-
has_index_names=has_index_names,
1982-
na_values=na_values,
1983-
thousands=thousands,
1984-
parse_dates=parse_dates,
1985-
date_parser=date_parser,
1986-
skiprows=skiprows,
1987-
skip_footer=skip_footer,
1988-
chunksize=chunksize)
1989-
1990-
return parser.read()
1991-
1992-
def _parse_xls(self, sheetname, header=0, skiprows=None,
1922+
def _parse_excel(self, sheetname, header=0, skiprows=None,
19931923
skip_footer=0, index_col=None, has_index_names=None,
19941924
parse_cols=None, parse_dates=False, date_parser=None,
19951925
na_values=None, thousands=None, chunksize=None):
1996-
from xlrd import xldate_as_tuple, XL_CELL_DATE, XL_CELL_ERROR
1926+
from xlrd import (xldate_as_tuple, XL_CELL_DATE,
1927+
XL_CELL_ERROR, XL_CELL_BOOLEAN)
19971928

19981929
datemode = self.book.datemode
19991930
sheet = self.book.sheet_by_name(sheetname)
@@ -2015,9 +1946,12 @@ def _parse_xls(self, sheetname, header=0, skiprows=None,
20151946
value = datetime.time(*dt[3:])
20161947
else:
20171948
value = datetime.datetime(*dt)
2018-
if typ == XL_CELL_ERROR:
1949+
elif typ == XL_CELL_ERROR:
20191950
value = np.nan
1951+
elif typ == XL_CELL_BOOLEAN:
1952+
value = bool(value)
20201953
row.append(value)
1954+
20211955
data.append(row)
20221956

20231957
if header is not None:
@@ -2037,9 +1971,6 @@ def _parse_xls(self, sheetname, header=0, skiprows=None,
20371971

20381972
@property
20391973
def sheet_names(self):
2040-
if self.use_xlsx:
2041-
return self.book.get_sheet_names()
2042-
else:
20431974
return self.book.sheet_names()
20441975

20451976

pandas/io/tests/test_excel.py

-25
Original file line numberDiff line numberDiff line change
@@ -245,18 +245,6 @@ def test_specify_kind_xls(self):
245245
# self.assertRaises(Exception, ExcelFile, open(xlsx_file, 'rb'),
246246
# kind='xls')
247247

248-
def test_specify_kind_xlsx(self):
249-
_skip_if_no_openpyxl()
250-
xlsx_file = os.path.join(self.dirpath, 'test.xlsx')
251-
xls_file = os.path.join(self.dirpath, 'test.xls')
252-
253-
self.assertRaises(Exception, ExcelFile, xls_file, kind='xlsx')
254-
255-
ExcelFile(open(xlsx_file, 'rb'), kind='xlsx')
256-
257-
self.assertRaises(Exception, ExcelFile, open(xls_file, 'rb'),
258-
kind='xlsx')
259-
260248
def read_csv(self, *args, **kwds):
261249
kwds = kwds.copy()
262250
kwds['engine'] = 'python'
@@ -545,19 +533,6 @@ def test_excel_roundtrip_datetime(self):
545533
recons = reader.parse('test1')
546534
tm.assert_frame_equal(self.tsframe, recons)
547535

548-
def test_excel_roundtrip_bool(self):
549-
_skip_if_no_openpyxl()
550-
551-
# Test roundtrip np.bool8, does not seem to work for xls
552-
path = '__tmp_excel_roundtrip_bool__.xlsx'
553-
frame = (DataFrame(np.random.randn(10, 2)) >= 0)
554-
with ensure_clean(path) as path:
555-
556-
frame.to_excel(path, 'test1')
557-
reader = ExcelFile(path)
558-
recons = reader.parse('test1')
559-
tm.assert_frame_equal(frame, recons)
560-
561536
def test_to_excel_periodindex(self):
562537
_skip_if_no_excelsuite()
563538

0 commit comments

Comments
 (0)