Skip to content

Commit 997392b

Browse files
mroeschke authored and mliu08 committed
ENH: Add use_nullable_dtypes and nullable_backend global option to read_orc (pandas-dev#49827)
* ENH: Add use_nullable_dtypes and nullable_backend to read_orc
* Skip if not required pa version
* Address review
1 parent 6b8a0fb commit 997392b

File tree

4 files changed

+94
-8
lines changed

4 files changed

+94
-8
lines changed

doc/source/whatsnew/v2.0.0.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (
3333
Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
3434
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3535

36-
A new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet` and :func:`read_csv` (with ``engine="pyarrow"``)
36+
A new global configuration, ``io.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet`, :func:`read_orc` and :func:`read_csv` (with ``engine="pyarrow"``)
3737
to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
3838

3939
.. ipython:: python
@@ -45,7 +45,7 @@ to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
4545
""")
4646
with pd.option_context("io.nullable_backend", "pyarrow"):
4747
df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
48-
df
48+
df.dtypes
4949
5050
.. _whatsnew_200.enhancements.other:
5151

pandas/io/orc.py

+42-6
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,12 @@
44
import io
55
from types import ModuleType
66
from typing import (
7-
TYPE_CHECKING,
87
Any,
98
Literal,
109
)
1110

11+
from pandas._config import get_option
12+
1213
from pandas._typing import (
1314
FilePath,
1415
ReadBuffer,
@@ -23,14 +24,17 @@
2324
is_unsigned_integer_dtype,
2425
)
2526

26-
from pandas.io.common import get_handle
27+
from pandas.core.arrays import ArrowExtensionArray
28+
from pandas.core.frame import DataFrame
2729

28-
if TYPE_CHECKING:
29-
from pandas import DataFrame
30+
from pandas.io.common import get_handle
3031

3132

3233
def read_orc(
33-
path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs
34+
path: FilePath | ReadBuffer[bytes],
35+
columns: list[str] | None = None,
36+
use_nullable_dtypes: bool = False,
37+
**kwargs,
3438
) -> DataFrame:
3539
"""
3640
Load an ORC object from the file path, returning a DataFrame.
@@ -50,6 +54,21 @@ def read_orc(
5054
Output always follows the ordering of the file and not the columns list.
5155
This mirrors the original behaviour of
5256
:external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
57+
use_nullable_dtypes : bool, default False
58+
If True, use dtypes that use ``pd.NA`` as missing value indicator
59+
for the resulting DataFrame.
60+
61+
The nullable dtype implementation can be configured by setting the global
62+
``io.nullable_backend`` configuration option to ``"pandas"`` to use
63+
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
64+
nullable dtypes (using ``pd.ArrowDtype``).
65+
66+
.. versionadded:: 2.0.0
67+
68+
.. note::
69+
70+
Currently only ``io.nullable_backend`` set to ``"pyarrow"`` is supported.
71+
5372
**kwargs
5473
Any additional kwargs are passed to pyarrow.
5574
@@ -68,7 +87,24 @@ def read_orc(
6887

6988
with get_handle(path, "rb", is_text=False) as handles:
7089
orc_file = orc.ORCFile(handles.handle)
71-
return orc_file.read(columns=columns, **kwargs).to_pandas()
90+
pa_table = orc_file.read(columns=columns, **kwargs)
91+
if use_nullable_dtypes:
92+
nullable_backend = get_option("io.nullable_backend")
93+
if nullable_backend != "pyarrow":
94+
raise NotImplementedError(
95+
f"io.nullable_backend set to {nullable_backend} is not implemented."
96+
)
97+
df = DataFrame(
98+
{
99+
col_name: ArrowExtensionArray(pa_col)
100+
for col_name, pa_col in zip(
101+
pa_table.column_names, pa_table.itercolumns()
102+
)
103+
}
104+
)
105+
return df
106+
else:
107+
return pa_table.to_pandas()
72108

73109

74110
def to_orc(

pandas/io/parsers/readers.py

+5
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,11 @@
406406
set to True, nullable dtypes are used for all dtypes that have a nullable
407407
implementation, even if no nulls are present.
408408
409+
The nullable dtype implementation can be configured by setting the global
410+
``io.nullable_backend`` configuration option to ``"pandas"`` to use
411+
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
412+
nullable dtypes (using ``pd.ArrowDtype``).
413+
409414
.. versionadded:: 2.0
410415
411416
Returns

pandas/tests/io/test_orc.py

+45
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414

1515
pytest.importorskip("pyarrow.orc")
1616

17+
import pyarrow as pa
18+
1719

1820
@pytest.fixture
1921
def dirpath(datapath):
@@ -301,3 +303,46 @@ def test_orc_writer_dtypes_not_supported(df_not_supported):
301303
msg = "The dtype of one or more columns is not supported yet."
302304
with pytest.raises(NotImplementedError, match=msg):
303305
df_not_supported.to_orc()
306+
307+
308+
def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath):
309+
input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
310+
with pytest.raises(
311+
NotImplementedError,
312+
match="io.nullable_backend set to pandas is not implemented.",
313+
):
314+
with pd.option_context("io.nullable_backend", "pandas"):
315+
read_orc(input_file, use_nullable_dtypes=True)
316+
317+
318+
@td.skip_if_no("pyarrow", min_version="7.0.0")
319+
def test_orc_use_nullable_dtypes_pyarrow_backend():
320+
df = pd.DataFrame(
321+
{
322+
"string": list("abc"),
323+
"string_with_nan": ["a", np.nan, "c"],
324+
"string_with_none": ["a", None, "c"],
325+
"bytes": [b"foo", b"bar", None],
326+
"int": list(range(1, 4)),
327+
"float": np.arange(4.0, 7.0, dtype="float64"),
328+
"float_with_nan": [2.0, np.nan, 3.0],
329+
"bool": [True, False, True],
330+
"bool_with_na": [True, False, None],
331+
"datetime": pd.date_range("20130101", periods=3),
332+
"datetime_with_nat": [
333+
pd.Timestamp("20130101"),
334+
pd.NaT,
335+
pd.Timestamp("20130103"),
336+
],
337+
}
338+
)
339+
bytes_data = df.copy().to_orc()
340+
with pd.option_context("io.nullable_backend", "pyarrow"):
341+
result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True)
342+
expected = pd.DataFrame(
343+
{
344+
col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
345+
for col in df.columns
346+
}
347+
)
348+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)