Skip to content

Commit fa794e7

Browse files
WillAyd authored and Pingviinituutti committed
Decouple xlrd reading from ExcelFile class (pandas-dev#24423)
1 parent 5a9816f commit fa794e7

File tree

2 files changed

+175
-135
lines changed

2 files changed

+175
-135
lines changed

pandas/io/excel.py

+134 -108
Original file line number | Diff line number | Diff line change
@@ -358,23 +358,16 @@ def read_excel(io,
358358
**kwds)
359359

360360

361-
class ExcelFile(object):
362-
"""
363-
Class for parsing tabular excel sheets into DataFrame objects.
364-
Uses xlrd. See read_excel for more documentation
365-
366-
Parameters
367-
----------
368-
io : string, path object (pathlib.Path or py._path.local.LocalPath),
369-
file-like object or xlrd workbook
370-
If a string or path object, expected to be a path to xls or xlsx file
371-
engine : string, default None
372-
If io is not a buffer or path, this must be set to identify io.
373-
Acceptable values are None or xlrd
374-
"""
361+
class _XlrdReader(object):
375362

376-
def __init__(self, io, **kwds):
363+
def __init__(self, filepath_or_buffer):
364+
"""Reader using xlrd engine.
377365
366+
Parameters
367+
----------
368+
filepath_or_buffer : string, path object or Workbook
369+
Object to be parsed.
370+
"""
378371
err_msg = "Install xlrd >= 1.0.0 for Excel support"
379372

380373
try:
@@ -386,46 +379,39 @@ def __init__(self, io, **kwds):
386379
raise ImportError(err_msg +
387380
". Current version " + xlrd.__VERSION__)
388381

389-
# could be a str, ExcelFile, Book, etc.
390-
self.io = io
391-
# Always a string
392-
self._io = _stringify_path(io)
393-
394-
engine = kwds.pop('engine', None)
395-
396-
if engine is not None and engine != 'xlrd':
397-
raise ValueError("Unknown engine: {engine}".format(engine=engine))
398-
399-
# If io is a url, want to keep the data as bytes so can't pass
400-
# to get_filepath_or_buffer()
401-
if _is_url(self._io):
402-
io = _urlopen(self._io)
403-
elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
404-
io, _, _, _ = get_filepath_or_buffer(self._io)
405-
406-
if engine == 'xlrd' and isinstance(io, xlrd.Book):
407-
self.book = io
408-
elif not isinstance(io, xlrd.Book) and hasattr(io, "read"):
382+
# If filepath_or_buffer is a url, want to keep the data as bytes so
383+
# can't pass to get_filepath_or_buffer()
384+
if _is_url(filepath_or_buffer):
385+
filepath_or_buffer = _urlopen(filepath_or_buffer)
386+
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
387+
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
388+
filepath_or_buffer)
389+
390+
if isinstance(filepath_or_buffer, xlrd.Book):
391+
self.book = filepath_or_buffer
392+
elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr(
393+
filepath_or_buffer, "read"):
409394
# N.B. xlrd.Book has a read attribute too
410-
if hasattr(io, 'seek'):
395+
if hasattr(filepath_or_buffer, 'seek'):
411396
try:
412397
# GH 19779
413-
io.seek(0)
398+
filepath_or_buffer.seek(0)
414399
except UnsupportedOperation:
415400
# HTTPResponse does not support seek()
416401
# GH 20434
417402
pass
418403

419-
data = io.read()
404+
data = filepath_or_buffer.read()
420405
self.book = xlrd.open_workbook(file_contents=data)
421-
elif isinstance(self._io, compat.string_types):
422-
self.book = xlrd.open_workbook(self._io)
406+
elif isinstance(filepath_or_buffer, compat.string_types):
407+
self.book = xlrd.open_workbook(filepath_or_buffer)
423408
else:
424409
raise ValueError('Must explicitly set engine if not passing in'
425410
' buffer or path for io.')
426411

427-
def __fspath__(self):
428-
return self._io
412+
@property
def sheet_names(self):
    """Return the result of calling ``self.book.sheet_names()``."""
    workbook = self.book
    return workbook.sheet_names()
429415

430416
def parse(self,
431417
sheet_name=0,
@@ -434,12 +420,13 @@ def parse(self,
434420
index_col=None,
435421
usecols=None,
436422
squeeze=False,
437-
converters=None,
423+
dtype=None,
438424
true_values=None,
439425
false_values=None,
440426
skiprows=None,
441427
nrows=None,
442428
na_values=None,
429+
verbose=False,
443430
parse_dates=False,
444431
date_parser=None,
445432
thousands=None,
@@ -448,72 +435,9 @@ def parse(self,
448435
convert_float=True,
449436
mangle_dupe_cols=True,
450437
**kwds):
451-
"""
452-
Parse specified sheet(s) into a DataFrame
453-
454-
Equivalent to read_excel(ExcelFile, ...) See the read_excel
455-
docstring for more info on accepted parameters
456-
"""
457-
458-
# Can't use _deprecate_kwarg since sheetname=None has a special meaning
459-
if is_integer(sheet_name) and sheet_name == 0 and 'sheetname' in kwds:
460-
warnings.warn("The `sheetname` keyword is deprecated, use "
461-
"`sheet_name` instead", FutureWarning, stacklevel=2)
462-
sheet_name = kwds.pop("sheetname")
463-
elif 'sheetname' in kwds:
464-
raise TypeError("Cannot specify both `sheet_name` "
465-
"and `sheetname`. Use just `sheet_name`")
466-
467-
return self._parse_excel(sheet_name=sheet_name,
468-
header=header,
469-
names=names,
470-
index_col=index_col,
471-
usecols=usecols,
472-
squeeze=squeeze,
473-
converters=converters,
474-
true_values=true_values,
475-
false_values=false_values,
476-
skiprows=skiprows,
477-
nrows=nrows,
478-
na_values=na_values,
479-
parse_dates=parse_dates,
480-
date_parser=date_parser,
481-
thousands=thousands,
482-
comment=comment,
483-
skipfooter=skipfooter,
484-
convert_float=convert_float,
485-
mangle_dupe_cols=mangle_dupe_cols,
486-
**kwds)
487-
488-
def _parse_excel(self,
489-
sheet_name=0,
490-
header=0,
491-
names=None,
492-
index_col=None,
493-
usecols=None,
494-
squeeze=False,
495-
dtype=None,
496-
true_values=None,
497-
false_values=None,
498-
skiprows=None,
499-
nrows=None,
500-
na_values=None,
501-
verbose=False,
502-
parse_dates=False,
503-
date_parser=None,
504-
thousands=None,
505-
comment=None,
506-
skipfooter=0,
507-
convert_float=True,
508-
mangle_dupe_cols=True,
509-
**kwds):
510438

511439
_validate_header_arg(header)
512440

513-
if 'chunksize' in kwds:
514-
raise NotImplementedError("chunksize keyword of read_excel "
515-
"is not implemented")
516-
517441
from xlrd import (xldate, XL_CELL_DATE,
518442
XL_CELL_ERROR, XL_CELL_BOOLEAN,
519443
XL_CELL_NUMBER)
@@ -563,7 +487,7 @@ def _parse_cell(cell_contents, cell_typ):
563487
sheets = sheet_name
564488
ret_dict = True
565489
elif sheet_name is None:
566-
sheets = self.sheet_names
490+
sheets = self.book.sheet_names()
567491
ret_dict = True
568492
else:
569493
sheets = [sheet_name]
@@ -678,9 +602,111 @@ def _parse_cell(cell_contents, cell_typ):
678602
else:
679603
return output[asheetname]
680604

605+
606+
class ExcelFile(object):
607+
"""
608+
Class for parsing tabular excel sheets into DataFrame objects.
609+
Uses xlrd. See read_excel for more documentation
610+
611+
Parameters
612+
----------
613+
io : string, path object (pathlib.Path or py._path.local.LocalPath),
614+
file-like object or xlrd workbook
615+
If a string or path object, expected to be a path to xls or xlsx file.
616+
engine : string, default None
617+
If io is not a buffer or path, this must be set to identify io.
618+
Acceptable values are None or ``xlrd``.
619+
"""
620+
621+
_engines = {
622+
'xlrd': _XlrdReader,
623+
}
624+
625+
def __init__(self, io, engine=None):
    """
    Bind ``io`` to the reader class registered for ``engine``.

    Parameters
    ----------
    io : object
        Input to read from; stored unchanged on ``self.io``.
    engine : string, default None
        Key into ``_engines``. ``None`` selects ``'xlrd'``.

    Raises
    ------
    ValueError
        If ``engine`` is not a key of ``_engines``.
    """
    selected = 'xlrd' if engine is None else engine
    if selected not in self._engines:
        raise ValueError("Unknown engine: {engine}".format(engine=selected))

    # Keep the caller's object exactly as given (str, ExcelFile, Book, ...)
    self.io = io
    # Result of _stringify_path(io); this is the value handed to the reader
    self._io = _stringify_path(io)

    reader_cls = self._engines[selected]
    self._reader = reader_cls(self._io)
637+
638+
def __fspath__(self):
639+
return self._io
640+
641+
def parse(self,
          sheet_name=0,
          header=0,
          names=None,
          index_col=None,
          usecols=None,
          squeeze=False,
          converters=None,
          true_values=None,
          false_values=None,
          skiprows=None,
          nrows=None,
          na_values=None,
          parse_dates=False,
          date_parser=None,
          thousands=None,
          comment=None,
          skipfooter=0,
          convert_float=True,
          mangle_dupe_cols=True,
          **kwds):
    """
    Parse specified sheet(s) into a DataFrame

    Equivalent to read_excel(ExcelFile, ...) See the read_excel
    docstring for more info on accepted parameters

    All arguments are forwarded by keyword to ``self._reader.parse``.

    Raises
    ------
    TypeError
        If the deprecated ``sheetname`` keyword is passed while
        ``sheet_name`` is anything other than the integer ``0``.
    NotImplementedError
        If ``chunksize`` is passed.
    """
    # Can't use _deprecate_kwarg since sheetname=None has a special meaning.
    # The old spelling is honoured only while `sheet_name` still holds its
    # default of 0; any other combination is rejected as ambiguous.
    if 'sheetname' in kwds:
        if not (is_integer(sheet_name) and sheet_name == 0):
            raise TypeError("Cannot specify both `sheet_name` "
                            "and `sheetname`. Use just `sheet_name`")
        warnings.warn("The `sheetname` keyword is deprecated, use "
                      "`sheet_name` instead", FutureWarning, stacklevel=2)
        sheet_name = kwds.pop("sheetname")

    if 'chunksize' in kwds:
        raise NotImplementedError("chunksize keyword of read_excel "
                                  "is not implemented")

    # Named parameters can never also appear in **kwds, so merging the two
    # mappings cannot produce a duplicate keyword.
    options = dict(sheet_name=sheet_name,
                   header=header,
                   names=names,
                   index_col=index_col,
                   usecols=usecols,
                   squeeze=squeeze,
                   converters=converters,
                   true_values=true_values,
                   false_values=false_values,
                   skiprows=skiprows,
                   nrows=nrows,
                   na_values=na_values,
                   parse_dates=parse_dates,
                   date_parser=date_parser,
                   thousands=thousands,
                   comment=comment,
                   skipfooter=skipfooter,
                   convert_float=convert_float,
                   mangle_dupe_cols=mangle_dupe_cols)
    options.update(kwds)

    return self._reader.parse(**options)
702+
703+
@property
def book(self):
    """Return the ``book`` attribute of the underlying ``_reader``."""
    reader = self._reader
    return reader.book
706+
681707
@property
def sheet_names(self):
    """Return the ``sheet_names`` attribute of the underlying ``_reader``."""
    reader = self._reader
    return reader.sheet_names
684710

685711
def close(self):
686712
"""close io if necessary"""

0 commit comments

Comments
 (0)