Skip to content

Commit 89f6a12

Browse files
authored
ENH: use native filesystem (if available) for read_orc (#51623)
1 parent 4700a61 commit 89f6a12

File tree

3 files changed

+38
-4
lines changed

3 files changed

+38
-4
lines changed

doc/source/whatsnew/v2.1.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ Performance improvements
104104
- Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
105105
- Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
106106
- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
107-
-
107+
- Performance improvement in :func:`read_orc` when reading a remote URI file path (:issue:`51609`)
108108

109109
.. ---------------------------------------------------------------------------
110110
.. _whatsnew_210.bug_fixes:

pandas/io/orc.py

+27-3
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,17 @@
2424
from pandas.core.arrays import ArrowExtensionArray
2525
from pandas.core.frame import DataFrame
2626

27-
from pandas.io.common import get_handle
27+
from pandas.io.common import (
28+
get_handle,
29+
is_fsspec_url,
30+
)
2831

2932

3033
def read_orc(
3134
path: FilePath | ReadBuffer[bytes],
3235
columns: list[str] | None = None,
3336
use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
37+
filesystem=None,
3438
**kwargs,
3539
) -> DataFrame:
3640
"""
@@ -64,6 +68,11 @@ def read_orc(
6468
6569
.. versionadded:: 2.0
6670
71+
filesystem : fsspec or pyarrow filesystem, default None
72+
Filesystem object to use when reading the ORC file.
73+
74+
.. versionadded:: 2.1.0
75+
6776
**kwargs
6877
Any additional kwargs are passed to pyarrow.
6978
@@ -75,6 +84,11 @@ def read_orc(
7584
-----
7685
Before using this function you should read the :ref:`user guide about ORC <io.orc>`
7786
and :ref:`install optional dependencies <install.warn_orc>`.
87+
88+
If ``path`` is a URI pointing to a local or remote file (e.g. "s3://"),
89+
pandas first tries to read it with a ``pyarrow.fs`` filesystem. You can also pass a
90+
pyarrow or fsspec filesystem object via the ``filesystem`` keyword to override this
91+
behavior.
7892
"""
7993
# we require a newer version of pyarrow than we support for parquet
8094

@@ -87,8 +101,18 @@ def read_orc(
87101
)
88102

89103
with get_handle(path, "rb", is_text=False) as handles:
90-
orc_file = orc.ORCFile(handles.handle)
91-
pa_table = orc_file.read(columns=columns, **kwargs)
104+
source = handles.handle
105+
if is_fsspec_url(path) and filesystem is None:
106+
pa = import_optional_dependency("pyarrow")
107+
pa_fs = import_optional_dependency("pyarrow.fs")
108+
try:
109+
filesystem, source = pa_fs.FileSystem.from_uri(path)
110+
except (TypeError, pa.ArrowInvalid):
111+
pass
112+
113+
pa_table = orc.read_table(
114+
source=source, columns=columns, filesystem=filesystem, **kwargs
115+
)
92116
if use_nullable_dtypes:
93117
dtype_backend = get_option("mode.dtype_backend")
94118
if dtype_backend == "pyarrow":

pandas/tests/io/test_orc.py

+10
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from decimal import Decimal
44
from io import BytesIO
55
import os
6+
import pathlib
67

78
import numpy as np
89
import pytest
@@ -396,3 +397,12 @@ def test_orc_use_nullable_dtypes_option():
396397

397398
expected = pd.DataFrame({"int": pd.Series([1, 2, 3], dtype="Int64")})
398399
tm.assert_frame_equal(result, expected)
400+
401+
402+
def test_orc_uri_path():
403+
expected = pd.DataFrame({"int": list(range(1, 4))})
404+
with tm.ensure_clean("tmp.orc") as path:
405+
expected.to_orc(path)
406+
uri = pathlib.Path(path).as_uri()
407+
result = read_orc(uri)
408+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)