24
24
from pandas .core .arrays import ArrowExtensionArray
25
25
from pandas .core .frame import DataFrame
26
26
27
- from pandas .io .common import get_handle
27
+ from pandas .io .common import (
28
+ get_handle ,
29
+ is_fsspec_url ,
30
+ )
28
31
29
32
30
33
def read_orc (
31
34
path : FilePath | ReadBuffer [bytes ],
32
35
columns : list [str ] | None = None ,
33
36
use_nullable_dtypes : bool | lib .NoDefault = lib .no_default ,
37
+ filesystem = None ,
34
38
** kwargs ,
35
39
) -> DataFrame :
36
40
"""
@@ -64,6 +68,11 @@ def read_orc(
64
68
65
69
.. versionadded:: 2.0
66
70
71
+ filesystem : fsspec or pyarrow filesystem, default None
72
+ Filesystem object to use when reading the ORC file.
73
+
74
+ .. versionadded:: 2.1.0
75
+
67
76
**kwargs
68
77
Any additional kwargs are passed to pyarrow.
69
78
@@ -75,6 +84,11 @@ def read_orc(
75
84
-----
76
85
Before using this function you should read the :ref:`user guide about ORC <io.orc>`
77
86
and :ref:`install optional dependencies <install.warn_orc>`.
87
+
88
+ If ``path`` is a URI pointing to a local or remote file (e.g. "s3://"),
89
+ pandas first attempts to read the file with a ``pyarrow.fs`` filesystem. You can also
90
+ pass a pyarrow or fsspec filesystem object to the ``filesystem`` keyword to override
91
+ this behavior.
78
92
"""
79
93
# we require a newer version of pyarrow than we support for parquet
80
94
@@ -87,8 +101,18 @@ def read_orc(
87
101
)
88
102
89
103
with get_handle (path , "rb" , is_text = False ) as handles :
90
- orc_file = orc .ORCFile (handles .handle )
91
- pa_table = orc_file .read (columns = columns , ** kwargs )
104
+ source = handles .handle
105
+ if is_fsspec_url (path ) and filesystem is None :
106
+ pa = import_optional_dependency ("pyarrow" )
107
+ pa_fs = import_optional_dependency ("pyarrow.fs" )
108
+ try :
109
+ filesystem , source = pa_fs .FileSystem .from_uri (path )
110
+ except (TypeError , pa .ArrowInvalid ):
111
+ pass
112
+
113
+ pa_table = orc .read_table (
114
+ source = source , columns = columns , filesystem = filesystem , ** kwargs
115
+ )
92
116
if use_nullable_dtypes :
93
117
dtype_backend = get_option ("mode.dtype_backend" )
94
118
if dtype_backend == "pyarrow" :
0 commit comments