ENH: use native filesystem (if available) for read_orc (#51623)

mroeschke · web-flow · commit 89f6a1296531 · 2023-02-25T22:49:50.000+01:00
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -104,7 +104,7 @@ Performance improvements
 - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
 - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
 - Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
--
+- Performance improvement in :func:`read_orc` when reading a remote URI file path (:issue:`51609`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.bug_fixes:
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
@@ -24,13 +24,17 @@
 from pandas.core.arrays import ArrowExtensionArray
 from pandas.core.frame import DataFrame
 
-from pandas.io.common import get_handle
+from pandas.io.common import (
+    get_handle,
+    is_fsspec_url,
+)
 
 
 def read_orc(
     path: FilePath | ReadBuffer[bytes],
     columns: list[str] | None = None,
     use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
+    filesystem=None,
     **kwargs,
 ) -> DataFrame:
     """
@@ -64,6 +68,11 @@ def read_orc(
 
         .. versionadded:: 2.0
 
+    filesystem : fsspec or pyarrow filesystem, default None
+        Filesystem object to use when reading the ORC file.
+
+        .. versionadded:: 2.1.0
+
     **kwargs
         Any additional kwargs are passed to pyarrow.
 
@@ -75,6 +84,11 @@ def read_orc(
     -----
     Before using this function you should read the :ref:`user guide about ORC <io.orc>`
     and :ref:`install optional dependencies <install.warn_orc>`.
+
+    If ``path`` is a URI pointing to a local or remote file (e.g. "s3://"),
+    pandas will first attempt to read the file with a ``pyarrow.fs`` filesystem.
+    You can also pass a pyarrow or fsspec filesystem object to the ``filesystem``
+    keyword to override this behavior.
     """
     # we require a newer version of pyarrow than we support for parquet
 
@@ -87,8 +101,18 @@ def read_orc(
     )
 
     with get_handle(path, "rb", is_text=False) as handles:
-        orc_file = orc.ORCFile(handles.handle)
-        pa_table = orc_file.read(columns=columns, **kwargs)
+        source = handles.handle
+        if is_fsspec_url(path) and filesystem is None:
+            pa = import_optional_dependency("pyarrow")
+            pa_fs = import_optional_dependency("pyarrow.fs")
+            try:
+                filesystem, source = pa_fs.FileSystem.from_uri(path)
+            except (TypeError, pa.ArrowInvalid):
+                pass
+
+        pa_table = orc.read_table(
+            source=source, columns=columns, filesystem=filesystem, **kwargs
+        )
     if use_nullable_dtypes:
         dtype_backend = get_option("mode.dtype_backend")
         if dtype_backend == "pyarrow":
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
@@ -3,6 +3,7 @@
 from decimal import Decimal
 from io import BytesIO
 import os
+import pathlib
 
 import numpy as np
 import pytest
@@ -396,3 +397,12 @@ def test_orc_use_nullable_dtypes_option():
 
     expected = pd.DataFrame({"int": pd.Series([1, 2, 3], dtype="Int64")})
     tm.assert_frame_equal(result, expected)
+
+
+def test_orc_uri_path():
+    expected = pd.DataFrame({"int": list(range(1, 4))})
+    with tm.ensure_clean("tmp.orc") as path:
+        expected.to_orc(path)
+        uri = pathlib.Path(path).as_uri()
+        result = read_orc(uri)
+    tm.assert_frame_equal(result, expected)