diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 6a63058dfab94..6e9dfdc1263b3 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -33,7 +33,7 @@ sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` ( Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -A new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet` and :func:`read_csv` (with ``engine="pyarrow"``) +A new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet`, :func:`read_orc` and :func:`read_csv` (with ``engine="pyarrow"``) to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`). .. ipython:: python @@ -45,7 +45,7 @@ to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`). """) with pd.option_context("io.nullable_backend", "pyarrow"): df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow") - df + df.dtypes .. 
_whatsnew_200.enhancements.other: diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 2670390746d80..42a75701cd9c4 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -4,11 +4,12 @@ import io from types import ModuleType from typing import ( - TYPE_CHECKING, Any, Literal, ) +from pandas._config import get_option + from pandas._typing import ( FilePath, ReadBuffer, @@ -23,14 +24,17 @@ is_unsigned_integer_dtype, ) -from pandas.io.common import get_handle +from pandas.core.arrays import ArrowExtensionArray +from pandas.core.frame import DataFrame -if TYPE_CHECKING: - from pandas import DataFrame +from pandas.io.common import get_handle def read_orc( - path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs + path: FilePath | ReadBuffer[bytes], + columns: list[str] | None = None, + use_nullable_dtypes: bool = False, + **kwargs, ) -> DataFrame: """ Load an ORC object from the file path, returning a DataFrame. @@ -50,6 +54,21 @@ def read_orc( Output always follows the ordering of the file and not the columns list. This mirrors the original behaviour of :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`. + use_nullable_dtypes : bool, default False + If True, use dtypes that use ``pd.NA`` as missing value indicator + for the resulting DataFrame. + + The nullable dtype implementation can be configured by setting the global + ``io.nullable_backend`` configuration option to ``"pandas"`` to use + numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed + nullable dtypes (using ``pd.ArrowDtype``). + + .. versionadded:: 2.0.0 + + .. note:: + + Currently only ``io.nullable_backend`` set to ``"pyarrow"`` is supported. + **kwargs Any additional kwargs are passed to pyarrow. 
@@ -68,7 +87,24 @@ def read_orc( with get_handle(path, "rb", is_text=False) as handles: orc_file = orc.ORCFile(handles.handle) - return orc_file.read(columns=columns, **kwargs).to_pandas() + pa_table = orc_file.read(columns=columns, **kwargs) + if use_nullable_dtypes: + nullable_backend = get_option("io.nullable_backend") + if nullable_backend != "pyarrow": + raise NotImplementedError( + f"io.nullable_backend set to {nullable_backend} is not implemented." + ) + df = DataFrame( + { + col_name: ArrowExtensionArray(pa_col) + for col_name, pa_col in zip( + pa_table.column_names, pa_table.itercolumns() + ) + } + ) + return df + else: + return pa_table.to_pandas() def to_orc( diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 700a2b6ba964c..575390e9b97a4 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -406,6 +406,11 @@ set to True, nullable dtypes are used for all dtypes that have a nullable implementation, even if no nulls are present. + The nullable dtype implementation can be configured by setting the global + ``io.nullable_backend`` configuration option to ``"pandas"`` to use + numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed + nullable dtypes (using ``pd.ArrowDtype``). + .. versionadded:: 2.0 Returns diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py index a0acf160854ac..e747c03568603 100644 --- a/pandas/tests/io/test_orc.py +++ b/pandas/tests/io/test_orc.py @@ -14,6 +14,8 @@ pytest.importorskip("pyarrow.orc") +import pyarrow as pa + @pytest.fixture def dirpath(datapath): @@ -301,3 +303,46 @@ def test_orc_writer_dtypes_not_supported(df_not_supported): msg = "The dtype of one or more columns is not supported yet." 
with pytest.raises(NotImplementedError, match=msg): df_not_supported.to_orc() + + +def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath): + input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc") + with pytest.raises( + NotImplementedError, + match="io.nullable_backend set to pandas is not implemented.", + ): + with pd.option_context("io.nullable_backend", "pandas"): + read_orc(input_file, use_nullable_dtypes=True) + + +@td.skip_if_no("pyarrow", min_version="7.0.0") +def test_orc_use_nullable_dtypes_pyarrow_backend(): + df = pd.DataFrame( + { + "string": list("abc"), + "string_with_nan": ["a", np.nan, "c"], + "string_with_none": ["a", None, "c"], + "bytes": [b"foo", b"bar", None], + "int": list(range(1, 4)), + "float": np.arange(4.0, 7.0, dtype="float64"), + "float_with_nan": [2.0, np.nan, 3.0], + "bool": [True, False, True], + "bool_with_na": [True, False, None], + "datetime": pd.date_range("20130101", periods=3), + "datetime_with_nat": [ + pd.Timestamp("20130101"), + pd.NaT, + pd.Timestamp("20130103"), + ], + } + ) + bytes_data = df.copy().to_orc() + with pd.option_context("io.nullable_backend", "pyarrow"): + result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True) + expected = pd.DataFrame( + { + col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True)) + for col in df.columns + } + ) + tm.assert_frame_equal(result, expected)