From 67a6e5b960c91976ea2e290938265235effc1be0 Mon Sep 17 00:00:00 2001 From: yangshudong Date: Fri, 16 Apr 2021 13:48:33 +0800 Subject: [PATCH 1/7] BUG: Fix pd.read_orc raising AttributeError --- pandas/io/orc.py | 2 ++ pandas/tests/io/test_orc.py | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index db14a07e4b91b..0600c5d5628c6 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -49,6 +49,8 @@ def read_orc( if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0": raise ImportError("pyarrow must be >= 0.13.0 for read_orc") + import pyarrow.orc + with get_handle(path, "rb", is_text=False) as handles: orc_file = pyarrow.orc.ORCFile(handles.handle) return orc_file.read(columns=columns, **kwargs).to_pandas() diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a1f9c6f6af51a..942afc5e56680 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -10,7 +10,6 @@ import pandas._testing as tm pytest.importorskip("pyarrow", minversion="0.13.0") -pytest.importorskip("pyarrow.orc") pytestmark = pytest.mark.filterwarnings( "ignore:RangeIndex.* is deprecated:DeprecationWarning" From 06230370c7e2d5cc7c89972c8be32556ed948540 Mon Sep 17 00:00:00 2001 From: amznero Date: Mon, 19 Apr 2021 19:19:50 +0800 Subject: [PATCH 2/7] PR comments --- pandas/io/orc.py | 10 ++++------ pandas/tests/io/test_orc.py | 3 +-- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 0600c5d5628c6..87056c3dfa969 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -8,6 +8,8 @@ from pandas.io.common import get_handle +from pandas.compat._optional import import_optional_dependency + if TYPE_CHECKING: from pandas import DataFrame @@ -44,13 +46,9 @@ def read_orc( DataFrame """ # we require a newer version of pyarrow than we support for parquet - import pyarrow - - if distutils.version.LooseVersion(pyarrow.__version__) < "0.13.0": - raise 
ImportError("pyarrow must be >= 0.13.0 for read_orc") - import pyarrow.orc + orc = import_optional_dependency("pyarrow.orc") with get_handle(path, "rb", is_text=False) as handles: - orc_file = pyarrow.orc.ORCFile(handles.handle) + orc_file = orc.ORCFile(handles.handle) return orc_file.read(columns=columns, **kwargs).to_pandas() diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 942afc5e56680..4d47af9dddf43 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -9,8 +9,7 @@ from pandas import read_orc import pandas._testing as tm -pytest.importorskip("pyarrow", minversion="0.13.0") - +pytest.importorskip("pyarrow.orc") pytestmark = pytest.mark.filterwarnings( "ignore:RangeIndex.* is deprecated:DeprecationWarning" ) From e61629ac72388cf515f92bbc288687c1b6f816dc Mon Sep 17 00:00:00 2001 From: amznero Date: Mon, 19 Apr 2021 23:34:41 +0800 Subject: [PATCH 3/7] pre-commit --- pandas/io/orc.py | 4 +--- pandas/tests/io/test_orc.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 87056c3dfa969..65275006986a8 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -1,15 +1,13 @@ """ orc compat """ from __future__ import annotations -import distutils from typing import TYPE_CHECKING from pandas._typing import FilePathOrBuffer +from pandas.compat._optional import import_optional_dependency from pandas.io.common import get_handle -from pandas.compat._optional import import_optional_dependency - if TYPE_CHECKING: from pandas import DataFrame diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index 4d47af9dddf43..f34e9b940317d 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -10,6 +10,7 @@ import pandas._testing as tm pytest.importorskip("pyarrow.orc") + pytestmark = pytest.mark.filterwarnings( "ignore:RangeIndex.* is deprecated:DeprecationWarning" ) From 728d26da514e0b8d865cb596196936164f9dd743 Mon Sep 17 00:00:00 2001 
From: amznero Date: Tue, 20 Apr 2021 00:15:35 +0800 Subject: [PATCH 4/7] add whatsnew --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 0ec9758477eba..932e3734dcd2a 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -740,6 +740,7 @@ I/O - Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) - Bug in :func:`read_sas` raising ``ValueError`` when ``datetimes`` were null (:issue:`39725`) - Bug in :func:`read_excel` dropping empty values from single-column spreadsheets (:issue:`39808`) +- Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) Period ^^^^^^ From 8728848f03814dd5d73bca91bc05970470c3070a Mon Sep 17 00:00:00 2001 From: amznero Date: Tue, 20 Apr 2021 15:41:00 +0800 Subject: [PATCH 5/7] add doc --- doc/source/getting_started/install.rst | 13 +++++++++++++ doc/source/user_guide/io.rst | 5 +++++ pandas/io/orc.py | 4 ++++ 3 files changed, 22 insertions(+) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index b6351ac2232ff..e6c9fa875f666 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -362,6 +362,19 @@ pyarrow 0.15.0 Parquet, ORC, and feather reading / pyreadstat SPSS files (.sav) reading ========================= ================== ============================================================= +.. warning:: + + * If you want to use :func:`~pandas.read_orc`, it is highly recommended to install pyarrow using conda. + The following is a summary of the environment in which :func:`~pandas.read_orc` can work. 
+ + ========================= ================== ============================================================= + System Conda PyPI + ========================= ================== ============================================================= + Linux Successful Failed(pyarrow==3.0 excepted) + macOS Successful Failed + Windows Failed Failed + ========================= ================== ============================================================= + Access data in the cloud ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 3b7a6037a9715..b8a0ae8596cfc 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5443,6 +5443,11 @@ Similar to the :ref:`parquet ` format, the `ORC Format `__ library. +Several caveats. + +* It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow. +* :func:`~pandas.read_orc` is not supported on Windows yet. + .. _io.sql: SQL queries diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 65275006986a8..fc25a11d0299f 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -42,6 +42,10 @@ def read_orc( Returns ------- DataFrame + + Notes + ------- + Before using this function you should read the :ref:`user guide about ORC `. """ # we require a newer version of pyarrow than we support for parquet From d0c34dbdbd56048330504b89b46a94c42b35cbd8 Mon Sep 17 00:00:00 2001 From: amznero Date: Tue, 20 Apr 2021 16:37:35 +0800 Subject: [PATCH 6/7] fix ref --- pandas/io/orc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/orc.py b/pandas/io/orc.py index fc25a11d0299f..747175a431380 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -45,7 +45,7 @@ def read_orc( Notes ------- - Before using this function you should read the :ref:`user guide about ORC `. + Before using this function you should read the :ref:`user guide about ORC `. 
""" # we require a newer version of pyarrow than we support for parquet From 2d665cda4ae7c62672a664c502ead946d6c11632 Mon Sep 17 00:00:00 2001 From: amznero Date: Wed, 21 Apr 2021 17:09:47 +0800 Subject: [PATCH 7/7] PR comments --- doc/source/getting_started/install.rst | 4 +++- doc/source/user_guide/io.rst | 6 +++--- pandas/io/orc.py | 3 ++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index e6c9fa875f666..89b21d1984ad3 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -362,6 +362,8 @@ pyarrow 0.15.0 Parquet, ORC, and feather reading / pyreadstat SPSS files (.sav) reading ========================= ================== ============================================================= +.. _install.warn_orc: + .. warning:: * If you want to use :func:`~pandas.read_orc`, it is highly recommended to install pyarrow using conda. @@ -370,7 +372,7 @@ pyreadstat SPSS files (.sav) reading ========================= ================== ============================================================= System Conda PyPI ========================= ================== ============================================================= - Linux Successful Failed(pyarrow==3.0 excepted) + Linux Successful Failed(pyarrow==3.0 Successful) macOS Successful Failed Windows Failed Failed ========================= ================== ============================================================= diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index b8a0ae8596cfc..5148bb87b0eb0 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5443,10 +5443,10 @@ Similar to the :ref:`parquet ` format, the `ORC Format `__ library. -Several caveats. +.. warning:: -* It is *highly recommended* to install pyarrow using conda due to some issues occurred by pyarrow. -* :func:`~pandas.read_orc` is not supported on Windows yet. 
+ * It is *highly recommended* to install pyarrow using conda due to some issues caused by pyarrow. + * :func:`~pandas.read_orc` is not supported on Windows yet; you can find valid environments on :ref:`install optional dependencies <install.warn_orc>`. .. _io.sql: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 747175a431380..6bdb4df806b5c 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -45,7 +45,8 @@ def read_orc( Notes ------- - Before using this function you should read the :ref:`user guide about ORC <io.orc>`. + Before using this function you should read the :ref:`user guide about ORC <io.orc>` + and :ref:`install optional dependencies <install.warn_orc>`. """ # we require a newer version of pyarrow than we support for parquet