@@ -375,60 +375,25 @@ def read_excel(io,
375
375
** kwds )
376
376
377
377
378
- class _XlrdReader (object ):
379
-
380
- def __init__ (self , filepath_or_buffer ):
381
- """Reader using xlrd engine.
382
-
383
- Parameters
384
- ----------
385
- filepath_or_buffer : string, path object or Workbook
386
- Object to be parsed.
387
- """
388
- err_msg = "Install xlrd >= 1.0.0 for Excel support"
389
-
390
- try :
391
- import xlrd
392
- except ImportError :
393
- raise ImportError (err_msg )
394
- else :
395
- if xlrd .__VERSION__ < LooseVersion ("1.0.0" ):
396
- raise ImportError (err_msg +
397
- ". Current version " + xlrd .__VERSION__ )
378
+ @add_metaclass (abc .ABCMeta )
379
+ class _BaseExcelReader (object ):
398
380
399
- # If filepath_or_buffer is a url, want to keep the data as bytes so
400
- # can't pass to get_filepath_or_buffer()
401
- if _is_url (filepath_or_buffer ):
402
- filepath_or_buffer = _urlopen (filepath_or_buffer )
403
- elif not isinstance (filepath_or_buffer , (ExcelFile , xlrd .Book )):
404
- filepath_or_buffer , _ , _ , _ = get_filepath_or_buffer (
405
- filepath_or_buffer )
381
@property
@abc.abstractmethod
def sheet_names(self):
    """Names of the sheets in the workbook.

    Abstract: each concrete reader must override this property.
    ``parse`` reads it to enumerate every sheet when ``sheet_name`` is
    None.
    """
406
385
407
- if isinstance (filepath_or_buffer , xlrd .Book ):
408
- self .book = filepath_or_buffer
409
- elif not isinstance (filepath_or_buffer , xlrd .Book ) and hasattr (
410
- filepath_or_buffer , "read" ):
411
- # N.B. xlrd.Book has a read attribute too
412
- if hasattr (filepath_or_buffer , 'seek' ):
413
- try :
414
- # GH 19779
415
- filepath_or_buffer .seek (0 )
416
- except UnsupportedOperation :
417
- # HTTPResponse does not support seek()
418
- # GH 20434
419
- pass
386
@abc.abstractmethod
def get_sheet_by_name(self, name):
    """Look up a sheet by its name.

    Abstract: each concrete reader must override this method.  ``parse``
    calls it when the requested sheet identifier is a string.
    """
420
389
421
- data = filepath_or_buffer .read ()
422
- self .book = xlrd .open_workbook (file_contents = data )
423
- elif isinstance (filepath_or_buffer , compat .string_types ):
424
- self .book = xlrd .open_workbook (filepath_or_buffer )
425
- else :
426
- raise ValueError ('Must explicitly set engine if not passing in'
427
- ' buffer or path for io.' )
390
@abc.abstractmethod
def get_sheet_by_index(self, index):
    """Look up a sheet by its position.

    Abstract: each concrete reader must override this method.  ``parse``
    calls it when the requested sheet identifier is not a string and is
    therefore assumed to be an integer.
    """
428
393
429
- @property
430
- def sheet_names (self ):
431
- return self . book . sheet_names ()
394
@abc.abstractmethod
def get_sheet_data(self, sheet, convert_float):
    """Extract the contents of ``sheet`` for ``parse``.

    Abstract: each concrete reader must override this method.  ``parse``
    passes the sheet object obtained from ``get_sheet_by_name`` or
    ``get_sheet_by_index`` together with its own ``convert_float``
    argument, and uses the return value as the sheet's data.
    """
432
397
433
398
def parse (self ,
434
399
sheet_name = 0 ,
@@ -455,56 +420,14 @@ def parse(self,
455
420
456
421
_validate_header_arg (header )
457
422
458
- from xlrd import (xldate , XL_CELL_DATE ,
459
- XL_CELL_ERROR , XL_CELL_BOOLEAN ,
460
- XL_CELL_NUMBER )
461
-
462
- epoch1904 = self .book .datemode
463
-
464
- def _parse_cell (cell_contents , cell_typ ):
465
- """converts the contents of the cell into a pandas
466
- appropriate object"""
467
-
468
- if cell_typ == XL_CELL_DATE :
469
-
470
- # Use the newer xlrd datetime handling.
471
- try :
472
- cell_contents = xldate .xldate_as_datetime (
473
- cell_contents , epoch1904 )
474
- except OverflowError :
475
- return cell_contents
476
-
477
- # Excel doesn't distinguish between dates and time,
478
- # so we treat dates on the epoch as times only.
479
- # Also, Excel supports 1900 and 1904 epochs.
480
- year = (cell_contents .timetuple ())[0 :3 ]
481
- if ((not epoch1904 and year == (1899 , 12 , 31 )) or
482
- (epoch1904 and year == (1904 , 1 , 1 ))):
483
- cell_contents = time (cell_contents .hour ,
484
- cell_contents .minute ,
485
- cell_contents .second ,
486
- cell_contents .microsecond )
487
-
488
- elif cell_typ == XL_CELL_ERROR :
489
- cell_contents = np .nan
490
- elif cell_typ == XL_CELL_BOOLEAN :
491
- cell_contents = bool (cell_contents )
492
- elif convert_float and cell_typ == XL_CELL_NUMBER :
493
- # GH5394 - Excel 'numbers' are always floats
494
- # it's a minimal perf hit and less surprising
495
- val = int (cell_contents )
496
- if val == cell_contents :
497
- cell_contents = val
498
- return cell_contents
499
-
500
423
ret_dict = False
501
424
502
425
# Keep sheetname to maintain backwards compatibility.
503
426
if isinstance (sheet_name , list ):
504
427
sheets = sheet_name
505
428
ret_dict = True
506
429
elif sheet_name is None :
507
- sheets = self .book . sheet_names ()
430
+ sheets = self .sheet_names
508
431
ret_dict = True
509
432
else :
510
433
sheets = [sheet_name ]
@@ -519,19 +442,13 @@ def _parse_cell(cell_contents, cell_typ):
519
442
print ("Reading sheet {sheet}" .format (sheet = asheetname ))
520
443
521
444
if isinstance (asheetname , compat .string_types ):
522
- sheet = self .book . sheet_by_name (asheetname )
445
+ sheet = self .get_sheet_by_name (asheetname )
523
446
else : # assume an integer if not a string
524
- sheet = self .book . sheet_by_index (asheetname )
447
+ sheet = self .get_sheet_by_index (asheetname )
525
448
526
- data = []
449
+ data = self . get_sheet_data ( sheet , convert_float )
527
450
usecols = _maybe_convert_usecols (usecols )
528
451
529
- for i in range (sheet .nrows ):
530
- row = [_parse_cell (value , typ )
531
- for value , typ in zip (sheet .row_values (i ),
532
- sheet .row_types (i ))]
533
- data .append (row )
534
-
535
452
if sheet .nrows == 0 :
536
453
output [asheetname ] = DataFrame ()
537
454
continue
@@ -620,6 +537,120 @@ def _parse_cell(cell_contents, cell_typ):
620
537
return output [asheetname ]
621
538
622
539
540
class _XlrdReader(_BaseExcelReader):
    """Excel reader implemented on top of the xlrd package."""

    def __init__(self, filepath_or_buffer):
        """Reader using xlrd engine.

        Parameters
        ----------
        filepath_or_buffer : string, path object or Workbook
            Object to be parsed.

        Raises
        ------
        ImportError
            If xlrd is missing or older than 1.0.0.
        ValueError
            If the input is neither a workbook, a readable buffer nor a
            string path.
        """
        install_hint = "Install xlrd >= 1.0.0 for Excel support"

        try:
            import xlrd
        except ImportError:
            raise ImportError(install_hint)

        if xlrd.__VERSION__ < LooseVersion("1.0.0"):
            raise ImportError(install_hint +
                              ". Current version " + xlrd.__VERSION__)

        source = filepath_or_buffer

        # A URL is opened directly so that its payload stays as bytes;
        # that is why it must not go through get_filepath_or_buffer().
        if _is_url(source):
            source = _urlopen(source)
        elif not isinstance(source, (ExcelFile, xlrd.Book)):
            source, _, _, _ = get_filepath_or_buffer(source)

        # The workbook check has to come before the "read" check because
        # xlrd.Book exposes a read attribute as well.
        if isinstance(source, xlrd.Book):
            self.book = source
            return

        if hasattr(source, "read"):
            if hasattr(source, 'seek'):
                try:
                    # GH 19779
                    source.seek(0)
                except UnsupportedOperation:
                    # HTTPResponse does not support seek()
                    # GH 20434
                    pass

            contents = source.read()
            self.book = xlrd.open_workbook(file_contents=contents)
            return

        if isinstance(source, compat.string_types):
            self.book = xlrd.open_workbook(source)
            return

        raise ValueError('Must explicitly set engine if not passing in'
                         ' buffer or path for io.')

    @property
    def sheet_names(self):
        """Sheet names as reported by the underlying xlrd workbook."""
        workbook = self.book
        return workbook.sheet_names()

    def get_sheet_by_name(self, name):
        """Return the xlrd sheet called ``name``."""
        workbook = self.book
        return workbook.sheet_by_name(name)

    def get_sheet_by_index(self, index):
        """Return the xlrd sheet at position ``index``."""
        workbook = self.book
        return workbook.sheet_by_index(index)

    def get_sheet_data(self, sheet, convert_float):
        """Convert every cell of ``sheet`` and return the rows as lists.

        Parameters
        ----------
        sheet : xlrd sheet
            Sheet whose ``nrows``, ``row_values`` and ``row_types`` are read.
        convert_float : bool
            When true, numeric cells holding an integral value are returned
            as ``int``.

        Returns
        -------
        list of list
            One inner list per row, in sheet order.
        """
        from xlrd import (xldate, XL_CELL_DATE,
                          XL_CELL_ERROR, XL_CELL_BOOLEAN,
                          XL_CELL_NUMBER)

        epoch1904 = self.book.datemode

        def _parse_cell(cell_contents, cell_typ):
            """converts the contents of the cell into a pandas
            appropriate object"""
            if cell_typ == XL_CELL_DATE:
                # Use the newer xlrd datetime handling.
                try:
                    stamp = xldate.xldate_as_datetime(
                        cell_contents, epoch1904)
                except OverflowError:
                    return cell_contents

                # Excel doesn't distinguish between dates and time,
                # so a value falling on the epoch day is treated as a
                # time only.  The epoch day depends on whether the
                # workbook uses the 1900 or the 1904 date system.
                if epoch1904:
                    epoch_day = (1904, 1, 1)
                else:
                    epoch_day = (1899, 12, 31)

                if stamp.timetuple()[0:3] == epoch_day:
                    return time(stamp.hour,
                                stamp.minute,
                                stamp.second,
                                stamp.microsecond)
                return stamp

            if cell_typ == XL_CELL_ERROR:
                return np.nan

            if cell_typ == XL_CELL_BOOLEAN:
                return bool(cell_contents)

            if convert_float and cell_typ == XL_CELL_NUMBER:
                # GH5394 - Excel 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                as_int = int(cell_contents)
                if as_int == cell_contents:
                    return as_int

            return cell_contents

        return [
            [_parse_cell(value, typ)
             for value, typ in zip(sheet.row_values(row_idx),
                                   sheet.row_types(row_idx))]
            for row_idx in range(sheet.nrows)
        ]
652
+
653
+
623
654
class ExcelFile (object ):
624
655
"""
625
656
Class for parsing tabular excel sheets into DataFrame objects.
0 commit comments