1
1
import abc
2
2
import datetime
3
+ from distutils .version import LooseVersion
3
4
import inspect
4
5
from io import BufferedIOBase , BytesIO , RawIOBase
5
6
import os
6
7
from textwrap import fill
7
- from typing import Any , Dict , Mapping , Union , cast
8
+ from typing import IO , Any , Dict , Mapping , Optional , Union , cast
8
9
import warnings
10
+ import zipfile
9
11
10
12
from pandas ._config import config
11
13
12
14
from pandas ._libs .parsers import STR_NA_VALUES
13
15
from pandas ._typing import Buffer , FilePathOrBuffer , StorageOptions
14
16
from pandas .compat ._optional import import_optional_dependency
15
17
from pandas .errors import EmptyDataError
16
- from pandas .util ._decorators import Appender , deprecate_nonkeyword_arguments
18
+ from pandas .util ._decorators import Appender , deprecate_nonkeyword_arguments , doc
17
19
18
20
from pandas .core .dtypes .common import is_bool , is_float , is_integer , is_list_like
19
21
20
22
from pandas .core .frame import DataFrame
23
+ from pandas .core .shared_docs import _shared_docs
21
24
22
25
from pandas .io .common import IOHandles , get_handle , stringify_path , validate_header_arg
23
26
from pandas .io .excel ._util import (
116
119
When ``engine=None``, the following logic will be
117
120
used to determine the engine:
118
121
119
- - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
120
- then `odf <https://pypi.org/project/odfpy/>`_ will be used.
121
- - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
122
- extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd`` will
123
- be used.
124
- - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
125
- then ``openpyxl`` will be used.
126
- - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
127
-
128
- Specifying ``engine="xlrd"`` will continue to be allowed for the
129
- indefinite future.
122
+ - If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
123
+ then `odf <https://pypi.org/project/odfpy/>`_ will be used.
124
+ - Otherwise if ``path_or_buffer`` is an xls format,
125
+ ``xlrd`` will be used.
126
+ - Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
127
+ then ``openpyxl`` will be used.
128
+ - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
129
+ - Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised. This
130
+ case will raise a ``ValueError`` in a future version of pandas.
130
131
131
132
converters : dict, default None
132
133
Dict of functions for converting values in certain columns. Keys can
@@ -888,39 +889,92 @@ def close(self):
888
889
return content
889
890
890
891
891
# Magic bytes of an OLE2 compound document (legacy ``.xls`` workbooks).
XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
# Local-file-header magic of a ZIP archive (xlsx, xlsb and ods are ZIP based).
ZIP_SIGNATURE = b"PK\x03\x04"
# Number of leading bytes needed to test for either signature.
PEEK_SIZE = max(len(XLS_SIGNATURE), len(ZIP_SIGNATURE))


@doc(storage_options=_shared_docs["storage_options"])
def inspect_excel_format(
    path: Optional[str] = None,
    content: Union[None, BufferedIOBase, RawIOBase, bytes] = None,
    storage_options: StorageOptions = None,
) -> str:
    """
    Inspect the path or content of an excel file and get its format.

    At least one of path or content must be not None. If both are not None,
    content will take precedence.

    Adopted from xlrd: https://github.com/python-excel/xlrd.

    Parameters
    ----------
    path : str, optional
        Path to file to inspect. May be a URL.
    content : file-like object or bytes, optional
        Content of file to inspect.
    {storage_options}

    Returns
    -------
    str
        Format of file: ``"xls"``, ``"xlsx"``, ``"xlsb"``, ``"ods"``, or
        ``"zip"`` for a ZIP archive with none of the known workbook members.

    Raises
    ------
    ValueError
        If neither path nor content is given, if the resulting stream is
        empty, or if it starts with neither the XLS nor the ZIP signature.
    BadZipFile
        If the stream starts with the ZIP signature but is not a valid zipfile.
    """
    content_or_path: Union[None, str, BufferedIOBase, RawIOBase, IO[bytes]]
    if isinstance(content, bytes):
        content_or_path = BytesIO(content)
    elif content is not None:
        content_or_path = content
    else:
        content_or_path = path
    if content_or_path is None:
        # Explicit raise rather than ``assert`` so the check survives ``-O``.
        raise ValueError("At least one of path or content must be provided")

    with get_handle(
        content_or_path, "rb", storage_options=storage_options, is_text=False
    ) as handle:
        stream = handle.handle
        stream.seek(0)
        peek = stream.read(PEEK_SIZE)
        # ``read`` yields b"" at EOF (and may yield None on a raw stream with
        # no data available), so test for falsiness rather than ``is None``.
        if not peek:
            raise ValueError("stream is empty")
        assert isinstance(peek, bytes)
        stream.seek(0)

        if peek.startswith(XLS_SIGNATURE):
            return "xls"
        if not peek.startswith(ZIP_SIGNATURE):
            raise ValueError("File is not a recognized excel file")

        try:
            # ZipFile typing is overly-strict
            # https://github.com/python/typeshed/issues/4212
            with zipfile.ZipFile(stream) as zf:  # type: ignore[arg-type]
                # Workaround for some third party files that use backslashes
                # as separators and inconsistent letter case in member names.
                component_names = {
                    name.replace("\\", "/").lower() for name in zf.namelist()
                }
        finally:
            # Reading the archive directory moves the file position; rewind so
            # a caller-supplied stream is handed back at its start.
            stream.seek(0)

    if "xl/workbook.xml" in component_names:
        return "xlsx"
    if "xl/workbook.bin" in component_names:
        return "xlsb"
    if "content.xml" in component_names:
        return "ods"
    return "zip"
917
971
918
972
919
973
class ExcelFile :
920
974
"""
921
975
Class for parsing tabular excel sheets into DataFrame objects.
922
976
923
- See read_excel for more documentation
977
+ See read_excel for more documentation.
924
978
925
979
Parameters
926
980
----------
@@ -947,12 +1001,13 @@ class ExcelFile:
947
1001
948
1002
- If ``path_or_buffer`` is an OpenDocument format (.odf, .ods, .odt),
949
1003
then `odf <https://pypi.org/project/odfpy/>`_ will be used.
950
- - Otherwise if ``path_or_buffer`` is a bytes stream, the file has the
951
- extension ``.xls``, or is an ``xlrd`` Book instance, then ``xlrd``
952
- will be used.
1004
+ - Otherwise if ``path_or_buffer`` is an xls format,
1005
+ ``xlrd`` will be used.
953
1006
- Otherwise if `openpyxl <https://pypi.org/project/openpyxl/>`_ is installed,
954
1007
then ``openpyxl`` will be used.
1008
+ - Otherwise if ``xlrd >= 2.0`` is installed, a ``ValueError`` will be raised.
955
1009
- Otherwise ``xlrd`` will be used and a ``FutureWarning`` will be raised.
1010
+ This case will raise a ``ValueError`` in a future version of pandas.
956
1011
957
1012
.. warning::
958
1013
@@ -975,71 +1030,87 @@ class ExcelFile:
975
1030
def __init__(
    self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
    """
    Select a reader engine for ``path_or_buffer`` and instantiate it.

    The file format is sniffed with ``inspect_excel_format`` (or taken to be
    xls for an ``xlrd.Book``). When ``engine`` is None, ods maps to ``odf``,
    xls maps to ``xlrd``, and anything else maps to ``openpyxl`` if it is
    installed, falling back to ``xlrd`` otherwise.

    Raises
    ------
    ValueError
        If ``engine`` is not a registered engine, or if ``xlrd`` >= 2.0 would
        be used on a file that is not in the xls format.
    """
    # Reject unknown engine names before touching the input at all.
    if engine is not None and engine not in self._engines:
        raise ValueError(f"Unknown engine: {engine}")

    # Raw input as given by the caller (str, ExcelFile, Book, ...).
    self.io = path_or_buffer
    # Stringified form of the input, handed to the reader.
    self._io = stringify_path(path_or_buffer)

    # Look up the installed xlrd version; None when xlrd is not installed.
    xlrd_module = import_optional_dependency(
        "xlrd", raise_on_missing=False, on_version="ignore"
    )
    if xlrd_module is None:
        xlrd_version = None
    else:
        import xlrd

        xlrd_version = LooseVersion(xlrd.__version__)

    # Work out the file format from the content, the Book type, or the path.
    if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase, bytes)):
        ext = inspect_excel_format(
            content=path_or_buffer, storage_options=storage_options
        )
    elif xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
        ext = "xls"
    else:
        # Path-like input: sniff through the stringified path.
        ext = inspect_excel_format(
            path=str(self._io), storage_options=storage_options
        )

    if engine is None:
        if ext == "ods":
            engine = "odf"
        elif ext == "xls":
            engine = "xlrd"
        elif (
            # GH 35029 - Prefer openpyxl except for xls files
            import_optional_dependency(
                "openpyxl", raise_on_missing=False, on_version="ignore"
            )
            is not None
        ):
            engine = "openpyxl"
        else:
            engine = "xlrd"

    xlrd_on_non_xls = engine == "xlrd" and ext != "xls"
    if xlrd_on_non_xls and xlrd_version is not None:
        if xlrd_version >= "2":
            raise ValueError(
                f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
                f"only the xls format is supported. Install openpyxl instead."
            )

        # Point the warning at user code: one level deeper when we were
        # reached through read_excel in this module.
        calling_frame = inspect.stack()[1]
        via_read_excel = (
            calling_frame.filename.endswith(
                os.path.join("pandas", "io", "excel", "_base.py")
            )
            and calling_frame.function == "read_excel"
        )
        stacklevel = 4 if via_read_excel else 2
        warnings.warn(
            f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
            f"only the xls format is supported. As a result, the "
            f"openpyxl engine will be used if it is installed and the "
            f"engine argument is not specified. Install "
            f"openpyxl instead.",
            FutureWarning,
            stacklevel=stacklevel,
        )
    assert engine in self._engines, f"Engine {engine} not recognized"

    self.engine = engine
    self.storage_options = storage_options

    reader_cls = self._engines[engine]
    self._reader = reader_cls(self._io, storage_options=storage_options)
1044
1115
1045
1116
def __fspath__ (self ):
0 commit comments