ENH: Add use_nullable_dtypes and nullable_backend global option to read_orc (pandas-dev#49827)

mroeschke · mliu08 · commit 997392bd7e58 · 2022-11-26T22:09:10.000-08:00
* ENH: Add use_nullable_dtypes and nullable_backend to read_orc

* Skip test if the required pyarrow version is not installed

* Address review
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -33,7 +33,7 @@ sql-other, html, xml, plot, output_formatting, clipboard, compression, test]`` (
 Configuration option, ``io.nullable_backend``, to return pyarrow-backed dtypes from IO functions
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-A new global configuration, ``io.nullable_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet` and :func:`read_csv` (with ``engine="pyarrow"``)
+A new global configuration, ``io.nullable_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in :func:`read_parquet`, :func:`read_orc` and :func:`read_csv` (with ``engine="pyarrow"``)
 to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
 
 .. ipython:: python
@@ -45,7 +45,7 @@ to return pyarrow-backed dtypes when set to ``"pyarrow"`` (:issue:`48957`).
     """)
     with pd.option_context("io.nullable_backend", "pyarrow"):
         df = pd.read_csv(data, use_nullable_dtypes=True, engine="pyarrow")
-    df
+    df.dtypes
 
 .. _whatsnew_200.enhancements.other:
 
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
@@ -4,11 +4,12 @@
 import io
 from types import ModuleType
 from typing import (
-    TYPE_CHECKING,
     Any,
     Literal,
 )
 
+from pandas._config import get_option
+
 from pandas._typing import (
     FilePath,
     ReadBuffer,
@@ -23,14 +24,17 @@
     is_unsigned_integer_dtype,
 )
 
-from pandas.io.common import get_handle
+from pandas.core.arrays import ArrowExtensionArray
+from pandas.core.frame import DataFrame
 
-if TYPE_CHECKING:
-    from pandas import DataFrame
+from pandas.io.common import get_handle
 
 
 def read_orc(
-    path: FilePath | ReadBuffer[bytes], columns: list[str] | None = None, **kwargs
+    path: FilePath | ReadBuffer[bytes],
+    columns: list[str] | None = None,
+    use_nullable_dtypes: bool = False,
+    **kwargs,
 ) -> DataFrame:
     """
     Load an ORC object from the file path, returning a DataFrame.
@@ -50,6 +54,21 @@ def read_orc(
         Output always follows the ordering of the file and not the columns list.
         This mirrors the original behaviour of
         :external+pyarrow:py:meth:`pyarrow.orc.ORCFile.read`.
+    use_nullable_dtypes : bool, default False
+        If True, use dtypes that use ``pd.NA`` as missing value indicator
+        for the resulting DataFrame.
+
+        The nullable dtype implementation can be configured by setting the global
+        ``io.nullable_backend`` configuration option to ``"pandas"`` to use
+        numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
+        nullable dtypes (using ``pd.ArrowDtype``).
+
+        .. versionadded:: 2.0.0
+
+        .. note::
+
+            Currently only ``io.nullable_backend`` set to ``"pyarrow"`` is supported.
+
     **kwargs
         Any additional kwargs are passed to pyarrow.
 
@@ -68,7 +87,24 @@ def read_orc(
 
     with get_handle(path, "rb", is_text=False) as handles:
         orc_file = orc.ORCFile(handles.handle)
-        return orc_file.read(columns=columns, **kwargs).to_pandas()
+        pa_table = orc_file.read(columns=columns, **kwargs)
+    if use_nullable_dtypes:
+        nullable_backend = get_option("io.nullable_backend")
+        if nullable_backend != "pyarrow":
+            raise NotImplementedError(
+                f"io.nullable_backend set to {nullable_backend} is not implemented."
+            )
+        df = DataFrame(
+            {
+                col_name: ArrowExtensionArray(pa_col)
+                for col_name, pa_col in zip(
+                    pa_table.column_names, pa_table.itercolumns()
+                )
+            }
+        )
+        return df
+    else:
+        return pa_table.to_pandas()
 
 
 def to_orc(
diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py
@@ -406,6 +406,11 @@
     set to True, nullable dtypes are used for all dtypes that have a nullable
     implementation, even if no nulls are present.
 
+    The nullable dtype implementation can be configured by setting the global
+    ``io.nullable_backend`` configuration option to ``"pandas"`` to use
+    numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
+    nullable dtypes (using ``pd.ArrowDtype``).
+
     .. versionadded:: 2.0
 
 Returns
diff --git a/pandas/tests/io/test_orc.py b/pandas/tests/io/test_orc.py
@@ -14,6 +14,8 @@
 
 pytest.importorskip("pyarrow.orc")
 
+import pyarrow as pa
+
 
 @pytest.fixture
 def dirpath(datapath):
@@ -301,3 +303,46 @@ def test_orc_writer_dtypes_not_supported(df_not_supported):
     msg = "The dtype of one or more columns is not supported yet."
     with pytest.raises(NotImplementedError, match=msg):
         df_not_supported.to_orc()
+
+
+def test_orc_use_nullable_dtypes_pandas_backend_not_supported(dirpath):
+    input_file = os.path.join(dirpath, "TestOrcFile.emptyFile.orc")
+    with pytest.raises(
+        NotImplementedError,
+        match="io.nullable_backend set to pandas is not implemented.",
+    ):
+        with pd.option_context("io.nullable_backend", "pandas"):
+            read_orc(input_file, use_nullable_dtypes=True)
+
+
+@td.skip_if_no("pyarrow", min_version="7.0.0")
+def test_orc_use_nullable_dtypes_pyarrow_backend():
+    df = pd.DataFrame(
+        {
+            "string": list("abc"),
+            "string_with_nan": ["a", np.nan, "c"],
+            "string_with_none": ["a", None, "c"],
+            "bytes": [b"foo", b"bar", None],
+            "int": list(range(1, 4)),
+            "float": np.arange(4.0, 7.0, dtype="float64"),
+            "float_with_nan": [2.0, np.nan, 3.0],
+            "bool": [True, False, True],
+            "bool_with_na": [True, False, None],
+            "datetime": pd.date_range("20130101", periods=3),
+            "datetime_with_nat": [
+                pd.Timestamp("20130101"),
+                pd.NaT,
+                pd.Timestamp("20130103"),
+            ],
+        }
+    )
+    bytes_data = df.copy().to_orc()
+    with pd.option_context("io.nullable_backend", "pyarrow"):
+        result = read_orc(BytesIO(bytes_data), use_nullable_dtypes=True)
+    expected = pd.DataFrame(
+        {
+            col: pd.arrays.ArrowExtensionArray(pa.array(df[col], from_pandas=True))
+            for col in df.columns
+        }
+    )
+    tm.assert_frame_equal(result, expected)