-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: Add use_nullable_dtypes and nullable_backend global option to read_orc #49827
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d8164b2
7859083
bfdb535
843db9c
a81e81c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,11 +4,12 @@ | |
import io | ||
from types import ModuleType | ||
from typing import ( | ||
TYPE_CHECKING, | ||
Any, | ||
Literal, | ||
) | ||
|
||
from pandas._config import get_option | ||
|
||
from pandas._typing import ( | ||
FilePath, | ||
ReadBuffer, | ||
|
@@ -23,14 +24,17 @@ | |
is_unsigned_integer_dtype, | ||
) | ||
|
||
from pandas.io.common import get_handle | ||
from pandas.core.arrays import ArrowExtensionArray | ||
from pandas.core.frame import DataFrame | ||
|
||
if TYPE_CHECKING: | ||
from pandas import DataFrame | ||
from pandas.io.common import get_handle | ||
|
||
|
||
def read_orc( | ||
path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs | ||
path: FilePath | ReadBuffer[bytes], | ||
columns: list[str] | None = None, | ||
use_nullable_dtypes: bool = False, | ||
**kwargs, | ||
) -> DataFrame: | ||
""" | ||
Load an ORC object from the file path, returning a DataFrame. | ||
|
@@ -50,6 +54,21 @@ def read_orc( | |
Output always follows the ordering of the file and not the columns list. | ||
This mirrors the original behaviour of | ||
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`. | ||
use_nullable_dtypes : bool, default False | ||
If True, use dtypes that use ``pd.NA`` as missing value indicator | ||
for the resulting DataFrame. | ||
|
||
The nullable dtype implementation can be configured by setting the global | ||
``io.nullable_backend`` configuration option to ``"pandas"`` to use | ||
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed | ||
nullable dtypes (using ``pd.ArrowDtype``). | ||
|
||
.. versionadded:: 2.0.0 | ||
|
||
.. note:: | ||
|
||
Currently only ``io.nullable_backend`` set to ``"pyarrow"`` is supported. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you intend to implement the flag for pandas as well? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would want to do this in a follow-up PR (unless you're interested :) ) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. No, that's fine; I just wanted to understand if this is intended at all. I want to tackle json and sql next |
||
|
||
**kwargs | ||
Any additional kwargs are passed to pyarrow. | ||
|
||
|
@@ -68,7 +87,24 @@ def read_orc( | |
|
||
with get_handle(path, "rb", is_text=False) as handles: | ||
orc_file = orc.ORCFile(handles.handle) | ||
return orc_file.read(columns=columns, **kwargs).to_pandas() | ||
pa_table = orc_file.read(columns=columns, **kwargs) | ||
if use_nullable_dtypes: | ||
nullable_backend = get_option("io.nullable_backend") | ||
if nullable_backend != "pyarrow": | ||
raise NotImplementedError( | ||
f"io.nullable_backend set to {nullable_backend} is not implemented." | ||
) | ||
df = DataFrame( | ||
{ | ||
col_name: ArrowExtensionArray(pa_col) | ||
for col_name, pa_col in zip( | ||
pa_table.column_names, pa_table.itercolumns() | ||
) | ||
} | ||
) | ||
return df | ||
else: | ||
return pa_table.to_pandas() | ||
|
||
|
||
def to_orc( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,8 @@ | |
|
||
pytest.importorskip("pyarrow.orc") | ||
|
||
import pyarrow as pa | ||
|
||
|
||
@pytest.fixture | ||
def dirpath(datapath): | ||
|
@@ -301,3 +303,46 @@ def test_orc_writer_dtypes_not_supported(df_not_supported): | |
msg = "The dtype of one or more columns is not supported yet." | ||
with pytest.raises(NotImplementedError, match=msg): | ||
df_not_supported.to_orc() | ||
|
||
|
||
def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath): | ||
input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") | ||
with pytest.raises( | ||
NotImplementedError, | ||
match="io.nullable_backend set to pandas is not implemented.", | ||
): | ||
with pd.option_context("io.nullable_backend", "pandas"): | ||
read_orc(input_file, use_nullable_dtypes=True) | ||
|
||
|
||
@td.skip_if_no("pyarrow", min_version="7.0.0") | ||
def test_orc_use_nullable_dtypes_pyarrow_backend(): | ||
df = pd.DataFrame( | ||
{ | ||
"string": list("abc"), | ||
"string_with_nan": ["a", np.nan, "c"], | ||
"string_with_none": ["a", None, "c"], | ||
"bytes": [b"foo", b"bar", None], | ||
"int": list(range(1, 4)), | ||
"float": np.arange(4.0, 7.0, dtype="float64"), | ||
"float_with_nan": [2.0, np.nan, 3.0], | ||
"bool": [True, False, True], | ||
"bool_with_na": [True, False, None], | ||
"datetime": pd.date_range("20130101", periods=3), | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add bool without na? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure, added. |
||
"datetime_with_nat": [ | ||
pd.Timestamp("20130101"), | ||
pd.NaT, | ||
pd.Timestamp("20130103"), | ||
], | ||
} | ||
) | ||
bytes_data = df.copy().to_orc() | ||
with pd.option_context("io.nullable_backend", "pyarrow"): | ||
result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) | ||
expected = pd.DataFrame( | ||
{ | ||
col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True)) | ||
for col in df.columns | ||
} | ||
) | ||
tm.assert_frame_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Off-topic, but it seems read_excel supports `use_nullable_dtypes` but not `io.nullable_backend`. We should fix this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point. I'll add this in a follow up PR.