Skip to content

Commit 9541540

Browse files
WillAyd authored and jreback committed
Excel Reader Refactor - Base Class Introduction (#24829)
1 parent 69a2c54 commit 9541540

File tree

1 file changed

+133
-102
lines changed

1 file changed

+133
-102
lines changed

pandas/io/excel.py

+133-102
Original file line numberDiff line numberDiff line change
@@ -375,60 +375,25 @@ def read_excel(io,
375375
**kwds)
376376

377377

378-
class _XlrdReader(object):
379-
380-
def __init__(self, filepath_or_buffer):
381-
"""Reader using xlrd engine.
382-
383-
Parameters
384-
----------
385-
filepath_or_buffer : string, path object or Workbook
386-
Object to be parsed.
387-
"""
388-
err_msg = "Install xlrd >= 1.0.0 for Excel support"
389-
390-
try:
391-
import xlrd
392-
except ImportError:
393-
raise ImportError(err_msg)
394-
else:
395-
if xlrd.__VERSION__ < LooseVersion("1.0.0"):
396-
raise ImportError(err_msg +
397-
". Current version " + xlrd.__VERSION__)
378+
@add_metaclass(abc.ABCMeta)
379+
class _BaseExcelReader(object):
398380

399-
# If filepath_or_buffer is a url, want to keep the data as bytes so
400-
# can't pass to get_filepath_or_buffer()
401-
if _is_url(filepath_or_buffer):
402-
filepath_or_buffer = _urlopen(filepath_or_buffer)
403-
elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
404-
filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
405-
filepath_or_buffer)
381+
@property
382+
@abc.abstractmethod
383+
def sheet_names(self):
384+
pass
406385

407-
if isinstance(filepath_or_buffer, xlrd.Book):
408-
self.book = filepath_or_buffer
409-
elif not isinstance(filepath_or_buffer, xlrd.Book) and hasattr(
410-
filepath_or_buffer, "read"):
411-
# N.B. xlrd.Book has a read attribute too
412-
if hasattr(filepath_or_buffer, 'seek'):
413-
try:
414-
# GH 19779
415-
filepath_or_buffer.seek(0)
416-
except UnsupportedOperation:
417-
# HTTPResponse does not support seek()
418-
# GH 20434
419-
pass
386+
@abc.abstractmethod
387+
def get_sheet_by_name(self, name):
388+
pass
420389

421-
data = filepath_or_buffer.read()
422-
self.book = xlrd.open_workbook(file_contents=data)
423-
elif isinstance(filepath_or_buffer, compat.string_types):
424-
self.book = xlrd.open_workbook(filepath_or_buffer)
425-
else:
426-
raise ValueError('Must explicitly set engine if not passing in'
427-
' buffer or path for io.')
390+
@abc.abstractmethod
391+
def get_sheet_by_index(self, index):
392+
pass
428393

429-
@property
430-
def sheet_names(self):
431-
return self.book.sheet_names()
394+
@abc.abstractmethod
395+
def get_sheet_data(self, sheet, convert_float):
396+
pass
432397

433398
def parse(self,
434399
sheet_name=0,
@@ -455,56 +420,14 @@ def parse(self,
455420

456421
_validate_header_arg(header)
457422

458-
from xlrd import (xldate, XL_CELL_DATE,
459-
XL_CELL_ERROR, XL_CELL_BOOLEAN,
460-
XL_CELL_NUMBER)
461-
462-
epoch1904 = self.book.datemode
463-
464-
def _parse_cell(cell_contents, cell_typ):
465-
"""converts the contents of the cell into a pandas
466-
appropriate object"""
467-
468-
if cell_typ == XL_CELL_DATE:
469-
470-
# Use the newer xlrd datetime handling.
471-
try:
472-
cell_contents = xldate.xldate_as_datetime(
473-
cell_contents, epoch1904)
474-
except OverflowError:
475-
return cell_contents
476-
477-
# Excel doesn't distinguish between dates and time,
478-
# so we treat dates on the epoch as times only.
479-
# Also, Excel supports 1900 and 1904 epochs.
480-
year = (cell_contents.timetuple())[0:3]
481-
if ((not epoch1904 and year == (1899, 12, 31)) or
482-
(epoch1904 and year == (1904, 1, 1))):
483-
cell_contents = time(cell_contents.hour,
484-
cell_contents.minute,
485-
cell_contents.second,
486-
cell_contents.microsecond)
487-
488-
elif cell_typ == XL_CELL_ERROR:
489-
cell_contents = np.nan
490-
elif cell_typ == XL_CELL_BOOLEAN:
491-
cell_contents = bool(cell_contents)
492-
elif convert_float and cell_typ == XL_CELL_NUMBER:
493-
# GH5394 - Excel 'numbers' are always floats
494-
# it's a minimal perf hit and less surprising
495-
val = int(cell_contents)
496-
if val == cell_contents:
497-
cell_contents = val
498-
return cell_contents
499-
500423
ret_dict = False
501424

502425
# Keep sheetname to maintain backwards compatibility.
503426
if isinstance(sheet_name, list):
504427
sheets = sheet_name
505428
ret_dict = True
506429
elif sheet_name is None:
507-
sheets = self.book.sheet_names()
430+
sheets = self.sheet_names
508431
ret_dict = True
509432
else:
510433
sheets = [sheet_name]
@@ -519,19 +442,13 @@ def _parse_cell(cell_contents, cell_typ):
519442
print("Reading sheet {sheet}".format(sheet=asheetname))
520443

521444
if isinstance(asheetname, compat.string_types):
522-
sheet = self.book.sheet_by_name(asheetname)
445+
sheet = self.get_sheet_by_name(asheetname)
523446
else: # assume an integer if not a string
524-
sheet = self.book.sheet_by_index(asheetname)
447+
sheet = self.get_sheet_by_index(asheetname)
525448

526-
data = []
449+
data = self.get_sheet_data(sheet, convert_float)
527450
usecols = _maybe_convert_usecols(usecols)
528451

529-
for i in range(sheet.nrows):
530-
row = [_parse_cell(value, typ)
531-
for value, typ in zip(sheet.row_values(i),
532-
sheet.row_types(i))]
533-
data.append(row)
534-
535452
if sheet.nrows == 0:
536453
output[asheetname] = DataFrame()
537454
continue
@@ -620,6 +537,120 @@ def _parse_cell(cell_contents, cell_typ):
620537
return output[asheetname]
621538

622539

540+
class _XlrdReader(_BaseExcelReader):
    """Excel reader implementation that delegates to the xlrd engine."""

    def __init__(self, filepath_or_buffer):
        """Reader using xlrd engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.

        Raises
        ------
        ImportError
            If xlrd cannot be imported or its version compares below 1.0.0.
        ValueError
            If the input is not a workbook, a readable buffer or a string.
        """
        err_msg = "Install xlrd >= 1.0.0 for Excel support"

        try:
            import xlrd
        except ImportError:
            raise ImportError(err_msg)

        if xlrd.__VERSION__ < LooseVersion("1.0.0"):
            raise ImportError(err_msg +
                              ". Current version " + xlrd.__VERSION__)

        # A URL is opened directly so the data stays as bytes; it therefore
        # cannot be routed through get_filepath_or_buffer().
        if _is_url(filepath_or_buffer):
            filepath_or_buffer = _urlopen(filepath_or_buffer)
        elif not isinstance(filepath_or_buffer, (ExcelFile, xlrd.Book)):
            resolved, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)
            filepath_or_buffer = resolved

        # An already-open workbook is used as-is. This check must come
        # before the "read" check because xlrd.Book has a read attribute too.
        if isinstance(filepath_or_buffer, xlrd.Book):
            self.book = filepath_or_buffer
            return

        if hasattr(filepath_or_buffer, "read"):
            if hasattr(filepath_or_buffer, 'seek'):
                try:
                    # GH 19779
                    filepath_or_buffer.seek(0)
                except UnsupportedOperation:
                    # HTTPResponse does not support seek()
                    # GH 20434
                    pass

            contents = filepath_or_buffer.read()
            self.book = xlrd.open_workbook(file_contents=contents)
            return

        if isinstance(filepath_or_buffer, compat.string_types):
            self.book = xlrd.open_workbook(filepath_or_buffer)
            return

        raise ValueError('Must explicitly set engine if not passing in'
                         ' buffer or path for io.')

    @property
    def sheet_names(self):
        """Return the result of the workbook's ``sheet_names()`` call."""
        return self.book.sheet_names()

    def get_sheet_by_name(self, name):
        """Return the workbook sheet looked up by ``name``."""
        return self.book.sheet_by_name(name)

    def get_sheet_by_index(self, index):
        """Return the workbook sheet looked up by ``index``."""
        return self.book.sheet_by_index(index)

    def get_sheet_data(self, sheet, convert_float):
        """Convert every cell of ``sheet`` and return the rows as lists.

        Parameters
        ----------
        sheet : xlrd sheet
            Sheet exposing ``nrows``, ``row_values`` and ``row_types``.
        convert_float : bool
            When true, numeric cells whose value equals its integer
            truncation are returned as ``int``.

        Returns
        -------
        list of list
            One inner list of converted cell values per sheet row.
        """
        from xlrd import (xldate, XL_CELL_DATE,
                          XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        datemode = self.book.datemode

        # Excel doesn't distinguish between dates and time, so a value that
        # falls on the epoch day is treated as a time only. Excel supports
        # both the 1900 and the 1904 epochs.
        if datemode:
            epoch_day = (1904, 1, 1)
        else:
            epoch_day = (1899, 12, 31)

        def _convert(value, kind):
            """Map one raw cell value to a pandas appropriate object."""
            if kind == XL_CELL_DATE:
                try:
                    stamp = xldate.xldate_as_datetime(value, datemode)
                except OverflowError:
                    # Leave the raw value untouched when it cannot be
                    # represented as a datetime.
                    return value

                if stamp.timetuple()[0:3] == epoch_day:
                    return time(stamp.hour,
                                stamp.minute,
                                stamp.second,
                                stamp.microsecond)
                return stamp

            if kind == XL_CELL_ERROR:
                return np.nan

            if kind == XL_CELL_BOOLEAN:
                return bool(value)

            if convert_float and kind == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                truncated = int(value)
                if truncated == value:
                    return truncated

            return value

        return [
            [_convert(value, kind)
             for value, kind in zip(sheet.row_values(row_idx),
                                    sheet.row_types(row_idx))]
            for row_idx in range(sheet.nrows)
        ]
652+
653+
623654
class ExcelFile(object):
624655
"""
625656
Class for parsing tabular excel sheets into DataFrame objects.

0 commit comments

Comments
 (0)