diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b6351ac2232ff..89b21d1984ad3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -362,6 +362,21 @@ pyarrow 0.15.0 Parquet, ORC, and feather reading / pyreadstat SPSS files (.sav) reading ========================= ================== ============================================================= +.. _install.warn_orc: + +.. warning:: + + * If you want to use :func:`~pandas.read_orc`, it is highly recommended to install pyarrow using conda. + The following is a summary of the environments in which :func:`~pandas.read_orc` works. + + ========================= ================== ============================================================= + System Conda PyPI + ========================= ================== ============================================================= + Linux Successful Failed(pyarrow==3.0 Successful) + macOS Successful Failed + Windows Failed Failed + ========================= ================== ============================================================= + Access data in the cloud ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 3b7a6037a9715..5148bb87b0eb0 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5443,6 +5443,11 @@ Similar to the :ref:`parquet ` format, the `ORC Format `__ library. +.. warning:: + + * It is *highly recommended* to install pyarrow using conda due to some issues caused by pyarrow. + * :func:`~pandas.read_orc` is not supported on Windows yet; you can find valid environments in :ref:`install optional dependencies <install.warn_orc>`. + .. 
_io.sql: SQL queries diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 85d9acff353be..58f46206b6d57 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -783,6 +783,7 @@ I/O - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) - Bug in :meth:`DataFrame.to_string` misplacing the truncation column when ``index=False`` (:issue:`40907`) +- Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) Period ^^^^^^ diff --git a/pandas/io/orc.py b/pandas/io/orc.py index db14a07e4b91b..6bdb4df806b5c 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,10 +1,10 @@ """ orc compat """ from __future__ import annotations -import distutils from typing import TYPE_CHECKING from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency from pandas.io.common import get_handle @@ -42,13 +42,16 @@ def read_orc( Returns ------- DataFrame + + Notes + ----- + Before using this function you should read the :ref:`user guide about ORC ` + and :ref:`install optional dependencies <install.warn_orc>`. 
""" # we require a newer version of pyarrow than we support for parquet - import pyarrow - if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0": - raise ImportError("pyarrow must be >= 0.13.0 for read_orc") + orc = import_optional_dependency("pyarrow.orc") with get_handle(path, "rb", is_text=False) as handles: - orc_file = pyarrow.orc.ORCFile(handles.handle) + orc_file = orc.ORCFile(handles.handle) return orc_file.read(columns=columns, **kwargs).to_pandas() diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a1f9c6f6af51a..f34e9b940317d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -9,7 +9,6 @@ from pandas import read_orc import pandas._testing as tm -pytest.importorskip("pyarrow", minversion="0.13.0") pytest.importorskip("pyarrow.orc") pytestmark = pytest.mark.filterwarnings(