Skip to content

Decouple xlrd reading from ExcelFile class #24423

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Dec 28, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 134 additions & 108 deletions pandas/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,23 +358,16 @@ def read_excel(io,
**kwds)


class ExcelFile(object):
"""
Class for parsing tabular excel sheets into DataFrame objects.
Uses xlrd. See read_excel for more documentation

Parameters
----------
io : string, path object (pathlib.Path or py._path.local.LocalPath),
file-like object or xlrd workbook
If a string or path object, expected to be a path to xls or xlsx file
engine : string, default None
If io is not a buffer or path, this must be set to identify io.
Acceptable values are None or xlrd
"""
class _XlrdReader(object):

def __init__(self, io, **kwds):
def __init__(self, filepath_or_buffer):
"""Reader using xlrd engine.

Parameters
----------
filepath_or_buffer : string, path object or Workbook
Object to be parsed.
"""
err_msg = "Install xlrd >= 1.0.0 for Excel support"

try:
Expand All @@ -386,46 +379,39 @@ def __init__(self, io, **kwds):
raise ImportError(err_msg +
". Current version " + xlrd.__VERSION__)

# could be a str, ExcelFile, Book, etc.
self.io = io
# Always a string
self._io = _stringify_path(io)

engine = kwds.pop('engine', None)

if engine is not None and engine != 'xlrd':
raise ValueError("Unknown engine: {engine}".format(engine=engine))

# If io is a url, want to keep the data as bytes so can't pass
# to get_filepath_or_buffer()
if _is_url(self._io):
io = _urlopen(self._io)
elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
io, _, _, _ = get_filepath_or_buffer(self._io)

if engine == 'xlrd' and isinstance(io, xlrd.Book):
self.book = io
elif not isinstance(io, xlrd.Book) and hasattr(io, "read"):
# If filepath_or_buffer is a url, want to keep the data as bytes so
# can't pass to get_filepath_or_buffer()
if _is_url(filepath_or_buffer):
filepath_or_buffer = _urlopen(filepath_or_buffer)
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
filepath_or_buffer)

if isinstance(filepath_or_buffer, xlrd.Book):
self.book = filepath_or_buffer
elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr(
filepath_or_buffer, "read"):
# N.B. xlrd.Book has a read attribute too
if hasattr(io, 'seek'):
if hasattr(filepath_or_buffer, 'seek'):
try:
# GH 19779
io.seek(0)
filepath_or_buffer.seek(0)
except UnsupportedOperation:
# HTTPResponse does not support seek()
# GH 20434
pass

data = io.read()
data = filepath_or_buffer.read()
self.book = xlrd.open_workbook(file_contents=data)
elif isinstance(self._io, compat.string_types):
self.book = xlrd.open_workbook(self._io)
elif isinstance(filepath_or_buffer, compat.string_types):
self.book = xlrd.open_workbook(filepath_or_buffer)
else:
raise ValueError('Must explicitly set engine if not passing in'
' buffer or path for io.')

def __fspath__(self):
return self._io
@property
def sheet_names(self):
return self.book.sheet_names()

def parse(self,
sheet_name=0,
Expand All @@ -434,12 +420,13 @@ def parse(self,
index_col=None,
usecols=None,
squeeze=False,
converters=None,
dtype=None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

were these just missing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also somewhat of a misleading diff but the existing code base has different signatures for parse and _parse_excel. When I moved the latter to be part of the reader class and renamed to simply parse git picked it up the different signatures in the diff.

I didn't bother to align the signatures here but certainly can as well

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry if this is a duplicate (can't see my previous response?) but the diff here is somewhat misleading. Previously there was a parse and _parse_excel function. With the refactor, I moved _parse_excel to the private reader class but simply named it parse.

Git is mixing up the two parse functions, basically assuming that the existing one for the ExcelFile class is brand new (which it wasn't) and is comparing the reader's implementation to the existing ExcelFile class function. The signatures weren't aligned hence this small diff.

I just moved the code without any change but can look at aligning signatures if you'd like

true_values=None,
false_values=None,
skiprows=None,
nrows=None,
na_values=None,
verbose=False,
parse_dates=False,
date_parser=None,
thousands=None,
Expand All @@ -448,72 +435,9 @@ def parse(self,
convert_float=True,
mangle_dupe_cols=True,
**kwds):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we now renove the kwds?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will double check. There is one branch where these would get used and dispatched to the TextParser, though maybe that is dead code

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

**kwds is passed through TextParser to the Python parser in parsers.py.

This is definitely not dead code, so I am very wary of removing this. I think some more work can be done to better align the signature read_excel with that of read_csv (in the interest of creating a more unified data IO API)

IMO we should refrain from removing it (that would be an API IMO), especially as there is enough happening with the refactor.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea after taking another look agreed with @gfyoung here. I think it's worth aligning the signatures of the different parse calls within the module and potentially removing keyword args if possible (I'm not actually sure what keywords would be applicable here) but would prefer to do in a separate PR since it would be potentially API breaking

"""
Parse specified sheet(s) into a DataFrame

Equivalent to read_excel(ExcelFile, ...) See the read_excel
docstring for more info on accepted parameters
"""

# Can't use _deprecate_kwarg since sheetname=None has a special meaning
if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
warnings.warn("The `sheetname` keyword is deprecated, use "
"`sheet_name` instead", FutureWarning, stacklevel=2)
sheet_name = kwds.pop("sheetname")
elif 'sheetname' in kwds:
raise TypeError("Cannot specify both `sheet_name` "
"and `sheetname`. Use just `sheet_name`")

return self._parse_excel(sheet_name=sheet_name,
header=header,
names=names,
index_col=index_col,
usecols=usecols,
squeeze=squeeze,
converters=converters,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
**kwds)

def _parse_excel(self,
sheet_name=0,
header=0,
names=None,
index_col=None,
usecols=None,
squeeze=False,
dtype=None,
true_values=None,
false_values=None,
skiprows=None,
nrows=None,
na_values=None,
verbose=False,
parse_dates=False,
date_parser=None,
thousands=None,
comment=None,
skipfooter=0,
convert_float=True,
mangle_dupe_cols=True,
**kwds):

_validate_header_arg(header)

if 'chunksize' in kwds:
raise NotImplementedError("chunksize keyword of read_excel "
"is not implemented")

from xlrd import (xldate, XL_CELL_DATE,
XL_CELL_ERROR, XL_CELL_BOOLEAN,
XL_CELL_NUMBER)
Expand Down Expand Up @@ -563,7 +487,7 @@ def _parse_cell(cell_contents, cell_typ):
sheets = sheet_name
ret_dict = True
elif sheet_name is None:
sheets = self.sheet_names
sheets = self.book.sheet_names()
ret_dict = True
else:
sheets = [sheet_name]
Expand Down Expand Up @@ -678,9 +602,111 @@ def _parse_cell(cell_contents, cell_typ):
else:
return output[asheetname]


class ExcelFile(object):
"""
Class for parsing tabular excel sheets into DataFrame objects.
Uses xlrd. See read_excel for more documentation

Parameters
----------
io : string, path object (pathlib.Path or py._path.local.LocalPath),
file-like object or xlrd workbook
If a string or path object, expected to be a path to xls or xlsx file.
engine : string, default None
If io is not a buffer or path, this must be set to identify io.
Acceptable values are None or ``xlrd``.
"""

_engines = {
'xlrd': _XlrdReader,
}

def __init__(self, io, engine=None):
if engine is None:
engine = 'xlrd'
if engine not in self._engines:
raise ValueError("Unknown engine: {engine}".format(engine=engine))

# could be a str, ExcelFile, Book, etc.
self.io = io
# Always a string
self._io = _stringify_path(io)

self._reader = self._engines[engine](self._io)

def __fspath__(self):
return self._io

def parse(self,
sheet_name=0,
header=0,
names=None,
index_col=None,
usecols=None,
squeeze=False,
converters=None,
true_values=None,
false_values=None,
skiprows=None,
nrows=None,
na_values=None,
parse_dates=False,
date_parser=None,
thousands=None,
comment=None,
skipfooter=0,
convert_float=True,
mangle_dupe_cols=True,
**kwds):
"""
Parse specified sheet(s) into a DataFrame

Equivalent to read_excel(ExcelFile, ...) See the read_excel
docstring for more info on accepted parameters
"""

# Can't use _deprecate_kwarg since sheetname=None has a special meaning
if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
warnings.warn("The `sheetname` keyword is deprecated, use "
"`sheet_name` instead", FutureWarning, stacklevel=2)
sheet_name = kwds.pop("sheetname")
elif 'sheetname' in kwds:
raise TypeError("Cannot specify both `sheet_name` "
"and `sheetname`. Use just `sheet_name`")

if 'chunksize' in kwds:
raise NotImplementedError("chunksize keyword of read_excel "
"is not implemented")

return self._reader.parse(sheet_name=sheet_name,
header=header,
names=names,
index_col=index_col,
usecols=usecols,
squeeze=squeeze,
converters=converters,
true_values=true_values,
false_values=false_values,
skiprows=skiprows,
nrows=nrows,
na_values=na_values,
parse_dates=parse_dates,
date_parser=date_parser,
thousands=thousands,
comment=comment,
skipfooter=skipfooter,
convert_float=convert_float,
mangle_dupe_cols=mangle_dupe_cols,
**kwds)

@property
def book(self):
return self._reader.book

@property
def sheet_names(self):
return self.book.sheet_names()
return self._reader.sheet_names

def close(self):
"""close io if necessary"""
Expand Down
Loading