Skip to content

ENH: Add use_nullable_dtypes and nullable_backend global option to read_orc #49827

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Nov 23, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (
Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet` and :func:`read_csv` (with ``engine="pyarrow"``)
A new global configuration, ``io.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet`, :func:`read_orc` and :func:`read_csv` (with ``engine="pyarrow"``)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Off-topic, but it seems read_excel supports use_nullable_dtypes but not io.nullable_backend. We should fix this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point. I'll add this in a follow up PR.

to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).

.. ipython:: python
Expand All @@ -45,7 +45,7 @@ to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
""")
with pd.option_context("io.nullable_backend", "pyarrow"):
df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
df
df.dtypes

.. _whatsnew_200.enhancements.other:

Expand Down
48 changes: 42 additions & 6 deletions pandas/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@
import io
from types import ModuleType
from typing import (
TYPE_CHECKING,
Any,
Literal,
)

from pandas._config import get_option

from pandas._typing import (
FilePath,
ReadBuffer,
Expand All @@ -23,14 +24,17 @@
is_unsigned_integer_dtype,
)

from pandas.io.common import get_handle
from pandas.core.arrays import ArrowExtensionArray
from pandas.core.frame import DataFrame

if TYPE_CHECKING:
from pandas import DataFrame
from pandas.io.common import get_handle


def read_orc(
path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs
path: FilePath | ReadBuffer[bytes],
columns: list[str] | None = None,
use_nullable_dtypes: bool = False,
**kwargs,
) -> DataFrame:
"""
Load an ORC object from the file path, returning a DataFrame.
Expand All @@ -50,6 +54,21 @@ def read_orc(
Output always follows the ordering of the file and not the columns list.
This mirrors the original behaviour of
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
use_nullable_dtypes : bool, default False
If True, use dtypes that use ``pd.NA`` as missing value indicator
for the resulting DataFrame.

The nullable dtype implementation can be configured by setting the global
``io.nullable_backend`` configuration option to ``"pandas"`` to use
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
nullable dtypes (using ``pd.ArrowDtype``).

.. versionadded:: 2.0.0

.. note::

Currently only ``io.nullable_backend`` set to ``"pyarrow"`` is supported.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you intend to implement the flag for pandas as well?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would want to do this in a follow up PR (unless you're interested :) )

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No is fine, just wanted to understand if this is intended at all.

I want to tackle JSON and SQL next.


**kwargs
Any additional kwargs are passed to pyarrow.

Expand All @@ -68,7 +87,24 @@ def read_orc(

with get_handle(path, "rb", is_text=False) as handles:
orc_file = orc.ORCFile(handles.handle)
return orc_file.read(columns=columns, **kwargs).to_pandas()
pa_table = orc_file.read(columns=columns, **kwargs)
if use_nullable_dtypes:
nullable_backend = get_option("io.nullable_backend")
if nullable_backend != "pyarrow":
raise NotImplementedError(
f"io.nullable_backend set to {nullable_backend} is not implemented."
)
df = DataFrame(
{
col_name: ArrowExtensionArray(pa_col)
for col_name, pa_col in zip(
pa_table.column_names, pa_table.itercolumns()
)
}
)
return df
else:
return pa_table.to_pandas()


def to_orc(
Expand Down
5 changes: 5 additions & 0 deletions pandas/io/parsers/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,11 @@
set to True, nullable dtypes are used for all dtypes that have a nullable
implementation, even if no nulls are present.

The nullable dtype implementation can be configured by setting the global
``io.nullable_backend`` configuration option to ``"pandas"`` to use
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
nullable dtypes (using ``pd.ArrowDtype``).

.. versionadded:: 2.0

Returns
Expand Down
45 changes: 45 additions & 0 deletions pandas/tests/io/test_orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

pytest.importorskip("pyarrow.orc")

import pyarrow as pa


@pytest.fixture
def dirpath(datapath):
Expand Down Expand Up @@ -301,3 +303,46 @@ def test_orc_writer_dtypes_not_supported(df_not_supported):
msg = "The dtype of one or more columns is not supported yet."
with pytest.raises(NotImplementedError, match=msg):
df_not_supported.to_orc()


def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath):
    """Check that read_orc rejects the "pandas" nullable backend.

    With ``io.nullable_backend`` set to ``"pandas"``, calling ``read_orc``
    with ``use_nullable_dtypes=True`` is expected to raise
    ``NotImplementedError`` carrying the message matched below.
    """
    orc_path = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
    expected_msg = "io.nullable_backend set to pandas is not implemented."
    # Keep the original nesting: the option context is entered inside the
    # raises-context, so the option is restored before the exception is checked.
    with pytest.raises(NotImplementedError, match=expected_msg):
        with pd.option_context("io.nullable_backend", "pandas"):
            read_orc(orc_path, use_nullable_dtypes=True)


@td.skip_if_no("pyarrow", min_version="7.0.0")
def test_orc_use_nullable_dtypes_pyarrow_backend():
df = pd.DataFrame(
{
"string": list("abc"),
"string_with_nan": ["a", np.nan, "c"],
"string_with_none": ["a", None, "c"],
"bytes": [b"foo", b"bar", None],
"int": list(range(1, 4)),
"float": np.arange(4.0, 7.0, dtype="float64"),
"float_with_nan": [2.0, np.nan, 3.0],
"bool": [True, False, True],
"bool_with_na": [True, False, None],
"datetime": pd.date_range("20130101", periods=3),
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add bool without na?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, added.

"datetime_with_nat": [
pd.Timestamp("20130101"),
pd.NaT,
pd.Timestamp("20130103"),
],
}
)
bytes_data = df.copy().to_orc()
with pd.option_context("io.nullable_backend", "pyarrow"):
result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True)
expected = pd.DataFrame(
{
col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
for col in df.columns
}
)
tm.assert_frame_equal(result, expected)