Skip to content

Commit 5148c1b

Browse files
davidovitch
authored and committed
read multiple sheets for ods, small PEP8 changes
1 parent 564ef22 commit 5148c1b

File tree

1 file changed

+146
-112
lines changed

1 file changed

+146
-112
lines changed

pandas/io/excel.py

+146-112
Original file line number | Diff line number | Diff line change
@@ -73,9 +73,14 @@ class BaseFile(object):
7373
""" Class for identifying the type of reader
7474
"""
7575

76-
def __init__(self, try_engine=False):
76+
def __init__(self, engine, extensions, io_class, open_workbook,
77+
try_engine=False):
78+
self.engine = engine
79+
self.extensions = extensions
80+
self.io_class = io_class
81+
self.open_workbook = open_workbook
7782
if try_engine:
78-
self.has_engine()
83+
self.load_engine()
7984

8085
def is_ext(self, path):
8186
"""Verify if the path's extension is supported by the reader
@@ -94,26 +99,29 @@ def is_type(self, io):
9499
else:
95100
return False
96101

97-
def has_engine(self):
98-
"""Verify if the engine is installed
102+
def load_engine(self):
103+
"""Load the engine if installed
99104
"""
100105
try:
101-
self.load_engine()
106+
self._load_engine()
102107
_readers[self.engine] = True
103108
except ImportError:
104109
_readers[self.engine] = False
110+
except AttributeError:
111+
_readers[self.engine] = False
112+
msg = 'Excel engine "%s" is not implemented' % self.engine
113+
raise NotImplementedError(msg)
105114

106115

107116
class XLRDFile(BaseFile):
108117

109-
def __init__(self, **kwargs):
110-
self.engine = 'xlrd'
111-
self.extensions = ['xls', 'xlsx', 'xlsm']
112-
self.io_class = type(None)
113-
self.open_workbook = None
114-
super(XLRDFile, self).__init__(**kwargs)
118+
def __init__(self, try_engine=False):
119+
# engine and extensions are defined here, but io_class and open_workbook
120+
# are only defined when importing the engine
121+
args = ('xlrd', ['xls', 'xlsx', 'xlsm'], type(None), None)
122+
super(XLRDFile, self).__init__(*args, try_engine=try_engine)
115123

116-
def load_engine(self):
124+
def _load_engine(self):
117125
import xlrd # throw an ImportError if we need to
118126
ver = tuple(map(int, xlrd.__VERSION__.split(".")[:2]))
119127
if ver < (0, 9): # pragma: no cover
@@ -126,14 +134,13 @@ def load_engine(self):
126134

127135
class EZODFFile(BaseFile):
128136

129-
def __init__(self, **kwargs):
130-
self.engine = 'ezodf'
131-
self.extensions = ['ods']
132-
self.io_class = type(None)
133-
self.open_workbook = None
134-
super(EZODFFile, self).__init__(**kwargs)
137+
def __init__(self, try_engine=False):
138+
# engine and extensions are defined here, but io_class and open_workbook
139+
# are only defined when importing the engine
140+
args = ('ezodf', ['ods'], type(None), None)
141+
super(EZODFFile, self).__init__(*args, try_engine=try_engine)
135142

136-
def load_engine(self):
143+
def _load_engine(self):
137144
import ezodf
138145
self.open_workbook = ezodf.opendoc
139146
self.io_class = ezodf.document.PackagedDocument
@@ -150,17 +157,17 @@ def read_excel(io, sheetname=0, **kwds):
150157
and file. For file URLs, a host is expected. For instance, a local
151158
file could be file://localhost/path/to/workbook.xlsx
152159
sheetname : string, int, mixed list of strings/ints, or None, default 0
153-
154-
Strings are used for sheet names, Integers are used in zero-indexed sheet
155-
positions.
156-
160+
161+
Strings are used for sheet names, Integers are used in zero-indexed sheet
162+
positions.
163+
157164
Lists of strings/integers are used to request multiple sheets.
158-
165+
159166
Specify None to get all sheets.
160-
167+
161168
str|int -> DataFrame is returned.
162169
list|None -> Dict of DataFrames is returned, with keys representing sheets.
163-
170+
164171
Available Cases
165172
166173
* Defaults to 0 -> 1st sheet as a DataFrame
@@ -293,19 +300,19 @@ def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
293300
Parameters
294301
----------
295302
sheetname : string, int, mixed list of strings/ints, or None, default 0
296-
297-
Strings are used for sheet names, Integers are used in zero-indexed sheet
298-
positions.
299-
303+
304+
Strings are used for sheet names, Integers are used in zero-indexed sheet
305+
positions.
306+
300307
Lists of strings/integers are used to request multiple sheets.
301-
308+
302309
Specify None to get all sheets.
303-
310+
304311
str|int -> DataFrame is returned.
305312
list|None -> Dict of DataFrames is returned, with keys representing sheets.
306-
313+
307314
Available Cases
308-
315+
309316
* Defaults to 0 -> 1st sheet as a DataFrame
310317
* 1 -> 2nd sheet as a DataFrame
311318
* "Sheet1" -> 1st sheet as a DataFrame
@@ -426,10 +433,10 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
426433

427434
epoch1904 = self.book.datemode
428435

429-
def _parse_cell(cell_contents,cell_typ):
436+
def _parse_cell(cell_contents, cell_typ):
430437
"""converts the contents of the cell into a pandas
431438
appropriate object"""
432-
439+
433440
if cell_typ == XL_CELL_DATE:
434441
if xlrd_0_9_3:
435442
# Use the newer xlrd datetime handling.
@@ -472,10 +479,10 @@ def _parse_cell(cell_contents,cell_typ):
472479
xlrd_0_9_3 = True
473480
else:
474481
xlrd_0_9_3 = False
475-
482+
476483
ret_dict = False
477-
478-
#Keep sheetname to maintain backwards compatibility.
484+
485+
# Keep sheetname to maintain backwards compatibility.
479486
if isinstance(sheetname, list):
480487
sheets = sheetname
481488
ret_dict = True
@@ -484,38 +491,38 @@ def _parse_cell(cell_contents,cell_typ):
484491
ret_dict = True
485492
else:
486493
sheets = [sheetname]
487-
488-
#handle same-type duplicates.
494+
495+
# handle same-type duplicates.
489496
sheets = list(set(sheets))
490-
497+
491498
output = {}
492-
499+
493500
for asheetname in sheets:
494501
if verbose:
495502
print("Reading sheet %s" % asheetname)
496-
503+
497504
if isinstance(asheetname, compat.string_types):
498505
sheet = self.book.sheet_by_name(asheetname)
499-
else: # assume an integer if not a string
500-
sheet = self.book.sheet_by_index(asheetname)
501-
506+
else: # assume an integer if not a string
507+
sheet = self.book.sheet_by_index(asheetname)
508+
502509
data = []
503510
should_parse = {}
504-
511+
505512
for i in range(sheet.nrows):
506513
row = []
507514
for j, (value, typ) in enumerate(zip(sheet.row_values(i),
508515
sheet.row_types(i))):
509516
if parse_cols is not None and j not in should_parse:
510517
should_parse[j] = self._should_parse(j, parse_cols)
511-
518+
512519
if parse_cols is None or should_parse[j]:
513-
row.append(_parse_cell(value,typ))
520+
row.append(_parse_cell(value, typ))
514521
data.append(row)
515-
522+
516523
if header is not None:
517524
data[header] = _trim_excel_header(data[header])
518-
525+
519526
parser = TextParser(data, header=header, index_col=index_col,
520527
has_index_names=has_index_names,
521528
na_values=na_values,
@@ -526,76 +533,103 @@ def _parse_cell(cell_contents,cell_typ):
526533
skip_footer=skip_footer,
527534
chunksize=chunksize,
528535
**kwds)
529-
536+
530537
output[asheetname] = parser.read()
531-
538+
532539
if ret_dict:
533540
return output
534541
else:
535542
return output[asheetname]
536-
537543

538544
def _parse_ods(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
539545
index_col=None, has_index_names=None, parse_cols=None,
540546
parse_dates=False, date_parser=None, na_values=None,
541547
thousands=None, chunksize=None, convert_float=True,
542-
**kwds):
543-
544-
# sheetname can be index or string
545-
sheet = self.book.sheets[sheetname]
546-
547-
data = []
548-
should_parse = {}
549-
for i in range(sheet.nrows()):
550-
row = []
551-
for j, cell in enumerate(sheet.row(i)):
552-
553-
if parse_cols is not None and j not in should_parse:
554-
should_parse[j] = self._should_parse(j, parse_cols)
555-
556-
if parse_cols is None or should_parse[j]:
557-
558-
if isinstance(cell.value, float):
559-
value = cell.value
560-
if convert_float:
561-
# GH5394 - Excel and ODS 'numbers' are always floats
562-
# it's a minimal perf hit and less surprising
563-
# FIXME: this goes wrong when int(cell.value) returns
564-
# a long (>1e18)
565-
val = int(cell.value)
566-
if val == cell.value:
567-
value = val
568-
elif isinstance(cell.value, compat.string_types):
569-
typ = cell.value_type
570-
# if typ == 'string':
571-
# value = cell.value
572-
if typ == 'date' or typ == 'time':
573-
value = self._parse_datetime(cell)
574-
else:
575-
value = cell.value
576-
elif isinstance(cell.value, bool):
577-
value = cell.value
578-
# elif isinstance(cell.value, type(None)):
579-
# value = np.nan
580-
else:
581-
value = np.nan
548+
verbose=False, **kwds):
582549

583-
row.append(value)
550+
def _parse_cell(cell):
551+
"""converts the contents of the cell into a pandas
552+
appropriate object"""
553+
if isinstance(cell.value, float):
554+
value = cell.value
555+
if convert_float:
556+
# GH5394 - Excel and ODS 'numbers' are always floats
557+
# it's a minimal perf hit and less surprising
558+
# FIXME: this goes wrong when int(cell.value) returns
559+
# a long (>1e18)
560+
val = int(cell.value)
561+
if val == cell.value:
562+
value = val
563+
elif isinstance(cell.value, compat.string_types):
564+
typ = cell.value_type
565+
# if typ == 'string':
566+
# value = cell.value
567+
if typ == 'date' or typ == 'time':
568+
value = self._parse_datetime(cell)
569+
else:
570+
value = cell.value
571+
elif isinstance(cell.value, bool):
572+
value = cell.value
573+
# elif isinstance(cell.value, type(None)):
574+
# value = np.nan
575+
else:
576+
value = np.nan
577+
return value
584578

585-
data.append(row)
579+
ret_dict = False
586580

587-
parser = TextParser(data, header=header, index_col=index_col,
588-
has_index_names=has_index_names,
589-
na_values=na_values,
590-
thousands=thousands,
591-
parse_dates=parse_dates,
592-
date_parser=date_parser,
593-
skiprows=skiprows,
594-
skip_footer=skip_footer,
595-
chunksize=chunksize,
596-
**kwds)
581+
# Keep sheetname to maintain backwards compatibility.
582+
if isinstance(sheetname, list):
583+
sheets = sheetname
584+
ret_dict = True
585+
elif sheetname is None:
586+
sheets = self.sheet_names
587+
ret_dict = True
588+
else:
589+
sheets = [sheetname]
590+
591+
# handle same-type duplicates.
592+
sheets = list(set(sheets))
593+
594+
output = {}
595+
596+
for asheetname in sheets:
597+
if verbose:
598+
print("Reading sheet %s" % asheetname)
599+
600+
# sheetname can be index or string
601+
sheet = self.book.sheets[asheetname]
602+
603+
data = []
604+
should_parse = {}
605+
for i in range(sheet.nrows()):
606+
row = []
607+
for j, cell in enumerate(sheet.row(i)):
608+
609+
if parse_cols is not None and j not in should_parse:
610+
should_parse[j] = self._should_parse(j, parse_cols)
611+
612+
if parse_cols is None or should_parse[j]:
613+
row.append(_parse_cell(cell))
597614

598-
return parser.read()
615+
data.append(row)
616+
617+
parser = TextParser(data, header=header, index_col=index_col,
618+
has_index_names=has_index_names,
619+
na_values=na_values,
620+
thousands=thousands,
621+
parse_dates=parse_dates,
622+
date_parser=date_parser,
623+
skiprows=skiprows,
624+
skip_footer=skip_footer,
625+
chunksize=chunksize,
626+
**kwds)
627+
output[asheetname] = parser.read()
628+
629+
if ret_dict:
630+
return output
631+
else:
632+
return output[asheetname]
599633

600634
def _parse_datetime(self, cell):
601635
"""Parse the date or time from an ods cell to a datetime object.
@@ -609,7 +643,7 @@ def _parse_datetime(self, cell):
609643
def _value2date(value):
610644
try:
611645
return datetime.datetime.strptime(value, '%Y-%m-%d')
612-
except ValueError:#, TypeError):
646+
except ValueError: # , TypeError):
613647
return datetime.datetime.strptime(value, '%Y-%m-%dT%H:%M:%S')
614648

615649
# Technically it is not necessary to try to derive the date/time
@@ -643,7 +677,7 @@ def _value2date(value):
643677
value = _value2date(cell.value)
644678
elif cell.value_type == 'time':
645679
try:
646-
# FIXME: what if the decimal separator is a comma in the locale?
680+
# FIXME: what if the decimal separator is a comma in locale?
647681
value = datetime.datetime.strptime(cell.value, 'PT%HH%MM%S.%fS')
648682
except ValueError:
649683
value = datetime.datetime.strptime(cell.value, 'PT%HH%MM%SS')
@@ -657,9 +691,9 @@ def _print_ods_cellinfo(self, cell):
657691
Cell attributes are documented here:
658692
https://pythonhosted.org/ezodf/tableobjects.html#id2
659693
"""
660-
print(' plaintext:', cell.plaintext()) # no formatting
694+
print(' plaintext:', cell.plaintext()) # no formatting
661695
# formatted, but what is the difference from value?
662-
print('display_form:', cell.display_form) # format, ?=plaintext
696+
print('display_form:', cell.display_form) # format, ?=plaintext
663697
print(' value:', cell.value) # data handled
664698
print(' value_type:', cell.value_type) # data type
665699
print(' formula:', cell.formula)

0 commit comments

Comments
 (0)