-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
Decouple xlrd reading from ExcelFile class #24423
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d015b12
351abae
2dddf18
d2e4174
89a1ebe
607dbe5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -358,23 +358,16 @@ def read_excel(io, | |
**kwds) | ||
|
||
|
||
class ExcelFile(object): | ||
""" | ||
Class for parsing tabular excel sheets into DataFrame objects. | ||
Uses xlrd. See read_excel for more documentation | ||
|
||
Parameters | ||
---------- | ||
io : string, path object (pathlib.Path or py._path.local.LocalPath), | ||
file-like object or xlrd workbook | ||
If a string or path object, expected to be a path to xls or xlsx file | ||
engine : string, default None | ||
If io is not a buffer or path, this must be set to identify io. | ||
Acceptable values are None or xlrd | ||
""" | ||
class _XlrdReader(object): | ||
|
||
def __init__(self, io, **kwds): | ||
def __init__(self, filepath_or_buffer): | ||
"""Reader using xlrd engine. | ||
|
||
Parameters | ||
---------- | ||
filepath_or_buffer : string, path object or Workbook | ||
Object to be parsed. | ||
""" | ||
err_msg = "Install xlrd >= 1.0.0 for Excel support" | ||
|
||
try: | ||
|
@@ -386,46 +379,39 @@ def __init__(self, io, **kwds): | |
raise ImportError(err_msg + | ||
". Current version " + xlrd.__VERSION__) | ||
|
||
# could be a str, ExcelFile, Book, etc. | ||
self.io = io | ||
# Always a string | ||
self._io = _stringify_path(io) | ||
|
||
engine = kwds.pop('engine', None) | ||
|
||
if engine is not None and engine != 'xlrd': | ||
raise ValueError("Unknown engine: {engine}".format(engine=engine)) | ||
|
||
# If io is a url, want to keep the data as bytes so can't pass | ||
# to get_filepath_or_buffer() | ||
if _is_url(self._io): | ||
io = _urlopen(self._io) | ||
elif not isinstance(self.io, (ExcelFile, xlrd.Book)): | ||
io, _, _, _ = get_filepath_or_buffer(self._io) | ||
|
||
if engine == 'xlrd' and isinstance(io, xlrd.Book): | ||
self.book = io | ||
elif not isinstance(io, xlrd.Book) and hasattr(io, "read"): | ||
# If filepath_or_buffer is a url, want to keep the data as bytes so | ||
# can't pass to get_filepath_or_buffer() | ||
if _is_url(filepath_or_buffer): | ||
filepath_or_buffer = _urlopen(filepath_or_buffer) | ||
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)): | ||
filepath_or_buffer, _, _, _ = get_filepath_or_buffer( | ||
filepath_or_buffer) | ||
|
||
if isinstance(filepath_or_buffer, xlrd.Book): | ||
self.book = filepath_or_buffer | ||
elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr( | ||
filepath_or_buffer, "read"): | ||
# N.B. xlrd.Book has a read attribute too | ||
if hasattr(io, 'seek'): | ||
if hasattr(filepath_or_buffer, 'seek'): | ||
try: | ||
# GH 19779 | ||
io.seek(0) | ||
filepath_or_buffer.seek(0) | ||
except UnsupportedOperation: | ||
# HTTPResponse does not support seek() | ||
# GH 20434 | ||
pass | ||
|
||
data = io.read() | ||
data = filepath_or_buffer.read() | ||
self.book = xlrd.open_workbook(file_contents=data) | ||
elif isinstance(self._io, compat.string_types): | ||
self.book = xlrd.open_workbook(self._io) | ||
elif isinstance(filepath_or_buffer, compat.string_types): | ||
self.book = xlrd.open_workbook(filepath_or_buffer) | ||
else: | ||
raise ValueError('Must explicitly set engine if not passing in' | ||
' buffer or path for io.') | ||
|
||
def __fspath__(self): | ||
return self._io | ||
@property | ||
def sheet_names(self): | ||
return self.book.sheet_names() | ||
|
||
def parse(self, | ||
sheet_name=0, | ||
|
@@ -434,12 +420,13 @@ def parse(self, | |
index_col=None, | ||
usecols=None, | ||
squeeze=False, | ||
converters=None, | ||
dtype=None, | ||
true_values=None, | ||
false_values=None, | ||
skiprows=None, | ||
nrows=None, | ||
na_values=None, | ||
verbose=False, | ||
parse_dates=False, | ||
date_parser=None, | ||
thousands=None, | ||
|
@@ -448,72 +435,9 @@ def parse(self, | |
convert_float=True, | ||
mangle_dupe_cols=True, | ||
**kwds): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can we now renove the kwds? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will double check. There is one branch where these would get used and dispatched to the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This is definitely not dead code, so I am very wary of removing this. I think some more work can be done to better align the signature IMO we should refrain from removing it (that would be an API IMO), especially as there is enough happening with the refactor. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea after taking another look agreed with @gfyoung here. I think it's worth aligning the signatures of the different parse calls within the module and potentially removing keyword args if possible (I'm not actually sure what keywords would be applicable here) but would prefer to do in a separate PR since it would be potentially API breaking |
||
""" | ||
Parse specified sheet(s) into a DataFrame | ||
|
||
Equivalent to read_excel(ExcelFile, ...) See the read_excel | ||
docstring for more info on accepted parameters | ||
""" | ||
|
||
# Can't use _deprecate_kwarg since sheetname=None has a special meaning | ||
if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: | ||
warnings.warn("The `sheetname` keyword is deprecated, use " | ||
"`sheet_name` instead", FutureWarning, stacklevel=2) | ||
sheet_name = kwds.pop("sheetname") | ||
elif 'sheetname' in kwds: | ||
raise TypeError("Cannot specify both `sheet_name` " | ||
"and `sheetname`. Use just `sheet_name`") | ||
|
||
return self._parse_excel(sheet_name=sheet_name, | ||
header=header, | ||
names=names, | ||
index_col=index_col, | ||
usecols=usecols, | ||
squeeze=squeeze, | ||
converters=converters, | ||
true_values=true_values, | ||
false_values=false_values, | ||
skiprows=skiprows, | ||
nrows=nrows, | ||
na_values=na_values, | ||
parse_dates=parse_dates, | ||
date_parser=date_parser, | ||
thousands=thousands, | ||
comment=comment, | ||
skipfooter=skipfooter, | ||
convert_float=convert_float, | ||
mangle_dupe_cols=mangle_dupe_cols, | ||
**kwds) | ||
|
||
def _parse_excel(self, | ||
sheet_name=0, | ||
header=0, | ||
names=None, | ||
index_col=None, | ||
usecols=None, | ||
squeeze=False, | ||
dtype=None, | ||
true_values=None, | ||
false_values=None, | ||
skiprows=None, | ||
nrows=None, | ||
na_values=None, | ||
verbose=False, | ||
parse_dates=False, | ||
date_parser=None, | ||
thousands=None, | ||
comment=None, | ||
skipfooter=0, | ||
convert_float=True, | ||
mangle_dupe_cols=True, | ||
**kwds): | ||
|
||
_validate_header_arg(header) | ||
|
||
if 'chunksize' in kwds: | ||
raise NotImplementedError("chunksize keyword of read_excel " | ||
"is not implemented") | ||
|
||
from xlrd import (xldate, XL_CELL_DATE, | ||
XL_CELL_ERROR, XL_CELL_BOOLEAN, | ||
XL_CELL_NUMBER) | ||
|
@@ -563,7 +487,7 @@ def _parse_cell(cell_contents, cell_typ): | |
sheets = sheet_name | ||
ret_dict = True | ||
elif sheet_name is None: | ||
sheets = self.sheet_names | ||
sheets = self.book.sheet_names() | ||
ret_dict = True | ||
else: | ||
sheets = [sheet_name] | ||
|
@@ -678,9 +602,111 @@ def _parse_cell(cell_contents, cell_typ): | |
else: | ||
return output[asheetname] | ||
|
||
|
||
class ExcelFile(object): | ||
""" | ||
Class for parsing tabular excel sheets into DataFrame objects. | ||
Uses xlrd. See read_excel for more documentation | ||
|
||
Parameters | ||
---------- | ||
io : string, path object (pathlib.Path or py._path.local.LocalPath), | ||
file-like object or xlrd workbook | ||
If a string or path object, expected to be a path to xls or xlsx file. | ||
engine : string, default None | ||
If io is not a buffer or path, this must be set to identify io. | ||
Acceptable values are None or ``xlrd``. | ||
""" | ||
|
||
_engines = { | ||
'xlrd': _XlrdReader, | ||
} | ||
|
||
def __init__(self, io, engine=None): | ||
if engine is None: | ||
engine = 'xlrd' | ||
if engine not in self._engines: | ||
raise ValueError("Unknown engine: {engine}".format(engine=engine)) | ||
|
||
# could be a str, ExcelFile, Book, etc. | ||
self.io = io | ||
# Always a string | ||
self._io = _stringify_path(io) | ||
|
||
self._reader = self._engines[engine](self._io) | ||
|
||
def __fspath__(self): | ||
return self._io | ||
|
||
def parse(self, | ||
sheet_name=0, | ||
header=0, | ||
names=None, | ||
index_col=None, | ||
usecols=None, | ||
squeeze=False, | ||
converters=None, | ||
true_values=None, | ||
false_values=None, | ||
skiprows=None, | ||
nrows=None, | ||
na_values=None, | ||
parse_dates=False, | ||
date_parser=None, | ||
thousands=None, | ||
comment=None, | ||
skipfooter=0, | ||
convert_float=True, | ||
mangle_dupe_cols=True, | ||
**kwds): | ||
""" | ||
Parse specified sheet(s) into a DataFrame | ||
|
||
Equivalent to read_excel(ExcelFile, ...) See the read_excel | ||
docstring for more info on accepted parameters | ||
""" | ||
|
||
# Can't use _deprecate_kwarg since sheetname=None has a special meaning | ||
if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds: | ||
warnings.warn("The `sheetname` keyword is deprecated, use " | ||
"`sheet_name` instead", FutureWarning, stacklevel=2) | ||
sheet_name = kwds.pop("sheetname") | ||
elif 'sheetname' in kwds: | ||
raise TypeError("Cannot specify both `sheet_name` " | ||
"and `sheetname`. Use just `sheet_name`") | ||
|
||
if 'chunksize' in kwds: | ||
raise NotImplementedError("chunksize keyword of read_excel " | ||
"is not implemented") | ||
|
||
return self._reader.parse(sheet_name=sheet_name, | ||
header=header, | ||
names=names, | ||
index_col=index_col, | ||
usecols=usecols, | ||
squeeze=squeeze, | ||
converters=converters, | ||
true_values=true_values, | ||
false_values=false_values, | ||
skiprows=skiprows, | ||
nrows=nrows, | ||
na_values=na_values, | ||
parse_dates=parse_dates, | ||
date_parser=date_parser, | ||
thousands=thousands, | ||
comment=comment, | ||
skipfooter=skipfooter, | ||
convert_float=convert_float, | ||
mangle_dupe_cols=mangle_dupe_cols, | ||
**kwds) | ||
|
||
@property | ||
def book(self): | ||
return self._reader.book | ||
|
||
@property | ||
def sheet_names(self): | ||
return self.book.sheet_names() | ||
return self._reader.sheet_names | ||
|
||
def close(self): | ||
"""close io if necessary""" | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
were these just missing?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also somewhat of a misleading diff but the existing code base has different signatures for
parse
and_parse_excel
. When I moved the latter to be part of the reader class and renamed to simplyparse
git picked it up the different signatures in the diff.I didn't bother to align the signatures here but certainly can as well
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry if this is a duplicate (can't see my previous response?) but the diff here is somewhat misleading. Previously there was a
parse
and_parse_excel
function. With the refactor, I moved_parse_excel
to the private reader class but simply named itparse
.Git is mixing up the two
parse
functions, basically assuming that the existing one for theExcelFile
class is brand new (which it wasn't) and is comparing the reader's implementation to the existingExcelFile
class function. The signatures weren't aligned hence this small diff.I just moved the code without any change but can look at aligning signatures if you'd like