ENH: Add use_nullable_dtypes to read_feather (#50765)

phofl · web-flow · commit 06d074fba8b4 · 2023-01-17T09:55:01.000-08:00
* ENH: Add use_nullable_dtypes to read_feather

* Add gh ref

* Refactor
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -45,6 +45,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
 * :func:`read_sql_query`
 * :func:`read_sql_table`
 * :func:`read_orc`
+* :func:`read_feather`
 * :func:`to_numeric`
 
 Additionally a new global configuration, ``mode.dtype_backend`` can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
@@ -57,6 +58,7 @@ to select the nullable dtypes implementation.
 * :func:`read_xml`
 * :func:`read_parquet`
 * :func:`read_orc`
+* :func:`read_feather`
 
 
 And the following methods will also utilize the ``mode.dtype_backend`` option.
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
@@ -15,6 +15,10 @@
 from pandas.compat._optional import import_optional_dependency
 from pandas.util._decorators import doc
 
+from pandas import (
+    arrays,
+    get_option,
+)
 from pandas.core.api import (
     DataFrame,
     NumericIndex,
@@ -99,6 +103,7 @@ def read_feather(
     columns: Sequence[Hashable] | None = None,
     use_threads: bool = True,
     storage_options: StorageOptions = None,
+    use_nullable_dtypes: bool = False,
 ):
     """
     Load a feather-format object from the file path.
@@ -118,6 +123,19 @@ def read_feather(
 
         .. versionadded:: 1.2.0
 
+    use_nullable_dtypes : bool = False
+        Whether or not to use nullable dtypes as default when reading data. If
+        set to True, nullable dtypes are used for all dtypes that have a nullable
+        implementation, even if no nulls are present.
+
+        The nullable dtype implementation can be configured by calling
+        ``pd.set_option("mode.dtype_backend", "pandas")`` to use
+        numpy-backed nullable dtypes or
+        ``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
+        pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
+
+        .. versionadded:: 2.0
+
     Returns
     -------
     type of object stored in file
@@ -128,7 +146,28 @@ def read_feather(
     with get_handle(
         path, "rb", storage_options=storage_options, is_text=False
     ) as handles:
+        if not use_nullable_dtypes:
+            return feather.read_feather(
+                handles.handle, columns=columns, use_threads=bool(use_threads)
+            )
 
-        return feather.read_feather(
+        dtype_backend = get_option("mode.dtype_backend")
+
+        pa_table = feather.read_table(
             handles.handle, columns=columns, use_threads=bool(use_threads)
         )
+
+        if dtype_backend == "pandas":
+            from pandas.io._util import _arrow_dtype_mapping
+
+            return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
+
+        elif dtype_backend == "pyarrow":
+            return DataFrame(
+                {
+                    col_name: arrays.ArrowExtensionArray(pa_col)
+                    for col_name, pa_col in zip(
+                        pa_table.column_names, pa_table.itercolumns()
+                    )
+                }
+            )
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
@@ -4,6 +4,10 @@
 
 import pandas as pd
 import pandas._testing as tm
+from pandas.core.arrays import (
+    ArrowStringArray,
+    StringArray,
+)
 
 from pandas.io.feather_format import read_feather, to_feather  # isort:skip
 
@@ -194,3 +198,60 @@ def test_http_path(self, feather_file):
         expected = read_feather(feather_file)
         res = read_feather(url)
         tm.assert_frame_equal(expected, res)
+
+    @pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
+    def test_read_json_nullable(self, string_storage, dtype_backend):
+        # GH#50765. NOTE(review): name says "json" but this exercises read_feather.
+        pa = pytest.importorskip("pyarrow")  # skips the test without pyarrow
+        df = pd.DataFrame(
+            {
+                "a": pd.Series([1, np.nan, 3], dtype="Int64"),
+                "b": pd.Series([1, 2, 3], dtype="Int64"),
+                "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"),
+                "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
+                "e": [True, False, None],
+                "f": [True, False, True],
+                "g": ["a", "b", "c"],
+                "h": ["a", "b", None],
+            }
+        )
+        # Expected string columns are built to match the string_storage argument.
+        if string_storage == "python":
+            string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
+            string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
+
+        else:
+            string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
+            string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
+        # to_feather runs outside the option contexts; only read_feather is affected.
+        with tm.ensure_clean() as path:
+            to_feather(df, path)
+            with pd.option_context("mode.string_storage", string_storage):
+                with pd.option_context("mode.dtype_backend", dtype_backend):
+                    result = read_feather(path, use_nullable_dtypes=True)
+
+        expected = pd.DataFrame(
+            {
+                "a": pd.Series([1, np.nan, 3], dtype="Int64"),
+                "b": pd.Series([1, 2, 3], dtype="Int64"),
+                "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"),
+                "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
+                "e": pd.Series([True, False, pd.NA], dtype="boolean"),
+                "f": pd.Series([True, False, True], dtype="boolean"),
+                "g": string_array,
+                "h": string_array_na,
+            }
+        )
+
+        if dtype_backend == "pyarrow":
+            # Under the pyarrow backend each expected column is rebuilt from pa.array.
+            from pandas.arrays import ArrowExtensionArray
+
+            expected = pd.DataFrame(
+                {
+                    col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
+                    for col in expected.columns
+                }
+            )
+
+        tm.assert_frame_equal(result, expected)