Skip to content

Commit 06d074f

Browse files
authored
ENH: Add use_nullable_dtypes to read_feather (#50765)
* ENH: Add use_nullable_dtypes to read_feather * Add gh ref * Refactor
1 parent f63e7b8 commit 06d074f

File tree

3 files changed

+103
-1
lines changed

3 files changed

+103
-1
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ The ``use_nullable_dtypes`` keyword argument has been expanded to the following
4545
* :func:`read_sql_query`
4646
* :func:`read_sql_table`
4747
* :func:`read_orc`
48+
* :func:`read_feather`
4849
* :func:`to_numeric`
4950

5051
Additionally, a new global configuration option, ``mode.dtype_backend``, can now be used in conjunction with the parameter ``use_nullable_dtypes=True`` in the following functions
@@ -57,6 +58,7 @@ to select the nullable dtypes implementation.
5758
* :func:`read_xml`
5859
* :func:`read_parquet`
5960
* :func:`read_orc`
61+
* :func:`read_feather`
6062

6163

6264
And the following methods will also utilize the ``mode.dtype_backend`` option.

pandas/io/feather_format.py

+40-1
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@
1515
from pandas.compat._optional import import_optional_dependency
1616
from pandas.util._decorators import doc
1717

18+
from pandas import (
19+
arrays,
20+
get_option,
21+
)
1822
from pandas.core.api import (
1923
DataFrame,
2024
NumericIndex,
@@ -99,6 +103,7 @@ def read_feather(
99103
columns: Sequence[Hashable] | None = None,
100104
use_threads: bool = True,
101105
storage_options: StorageOptions = None,
106+
use_nullable_dtypes: bool = False,
102107
):
103108
"""
104109
Load a feather-format object from the file path.
@@ -118,6 +123,19 @@ def read_feather(
118123
119124
.. versionadded:: 1.2.0
120125
126+
use_nullable_dtypes : bool = False
127+
Whether or not to use nullable dtypes as default when reading data. If
128+
set to True, nullable dtypes are used for all dtypes that have a nullable
129+
implementation, even if no nulls are present.
130+
131+
The nullable dtype implementation can be configured by calling
132+
``pd.set_option("mode.dtype_backend", "pandas")`` to use
133+
numpy-backed nullable dtypes or
134+
``pd.set_option("mode.dtype_backend", "pyarrow")`` to use
135+
pyarrow-backed nullable dtypes (using ``pd.ArrowDtype``).
136+
137+
.. versionadded:: 2.0
138+
121139
Returns
122140
-------
123141
type of object stored in file
@@ -128,7 +146,28 @@ def read_feather(
128146
with get_handle(
129147
path, "rb", storage_options=storage_options, is_text=False
130148
) as handles:
149+
if not use_nullable_dtypes:
150+
return feather.read_feather(
151+
handles.handle, columns=columns, use_threads=bool(use_threads)
152+
)
131153

132-
return feather.read_feather(
154+
dtype_backend = get_option("mode.dtype_backend")
155+
156+
pa_table = feather.read_table(
133157
handles.handle, columns=columns, use_threads=bool(use_threads)
134158
)
159+
160+
if dtype_backend == "pandas":
161+
from pandas.io._util import _arrow_dtype_mapping
162+
163+
return pa_table.to_pandas(types_mapper=_arrow_dtype_mapping().get)
164+
165+
elif dtype_backend == "pyarrow":
166+
return DataFrame(
167+
{
168+
col_name: arrays.ArrowExtensionArray(pa_col)
169+
for col_name, pa_col in zip(
170+
pa_table.column_names, pa_table.itercolumns()
171+
)
172+
}
173+
)

pandas/tests/io/test_feather.py

+61
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@
44

55
import pandas as pd
66
import pandas._testing as tm
7+
from pandas.core.arrays import (
8+
ArrowStringArray,
9+
StringArray,
10+
)
711

812
from pandas.io.feather_format import read_feather, to_feather # isort:skip
913

@@ -194,3 +198,60 @@ def test_http_path(self, feather_file):
194198
expected = read_feather(feather_file)
195199
res = read_feather(url)
196200
tm.assert_frame_equal(expected, res)
201+
202+
@pytest.mark.parametrize("dtype_backend", ["pandas", "pyarrow"])
def test_read_json_nullable(self, string_storage, dtype_backend):
    """
    Round-trip a frame through feather and read it back with
    ``use_nullable_dtypes=True``.

    The frame is written with ``to_feather`` to a temporary path and read
    with ``read_feather`` while ``mode.string_storage`` and
    ``mode.dtype_backend`` are set to the given values. The result is
    compared against a frame built from ``Int64``/``Float64``/``boolean``
    Series and string extension arrays; for the ``"pyarrow"`` backend every
    expected column is re-wrapped in an ``ArrowExtensionArray``.

    Parameters
    ----------
    string_storage : str
        Value applied to the ``mode.string_storage`` option; ``"python"``
        selects ``StringArray`` for the expected string columns, anything
        else selects ``ArrowStringArray``. Presumably supplied by a fixture
        defined outside this view -- confirm.
    dtype_backend : str
        Value applied to the ``mode.dtype_backend`` option (parametrized
        over ``"pandas"`` and ``"pyarrow"``).
    """
    # GH#50765
    # NOTE(review): the name says "json" but the body exercises read_feather.
    pa = pytest.importorskip("pyarrow")

    def masked_numeric_columns():
        # Built fresh on every call so the written frame and the expected
        # frame never share column objects.
        return {
            "a": pd.Series([1, np.nan, 3], dtype="Int64"),
            "b": pd.Series([1, 2, 3], dtype="Int64"),
            "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"),
            "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
        }

    df = pd.DataFrame(
        {
            **masked_numeric_columns(),
            "e": [True, False, None],
            "f": [True, False, True],
            "g": ["a", "b", "c"],
            "h": ["a", "b", None],
        }
    )

    # Expected string columns depend on the configured string storage.
    if string_storage == "python":
        strings = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
        strings_with_na = StringArray(
            np.array(["a", "b", pd.NA], dtype=np.object_)
        )
    else:
        strings = ArrowStringArray(pa.array(["a", "b", "c"]))
        strings_with_na = ArrowStringArray(pa.array(["a", "b", None]))

    with tm.ensure_clean() as path:
        to_feather(df, path)
        # Same nesting order as two stacked ``with`` statements:
        # string storage first, dtype backend innermost.
        with pd.option_context(
            "mode.string_storage", string_storage
        ), pd.option_context("mode.dtype_backend", dtype_backend):
            result = read_feather(path, use_nullable_dtypes=True)

    expected = pd.DataFrame(
        {
            **masked_numeric_columns(),
            "e": pd.Series([True, False, pd.NA], dtype="boolean"),
            "f": pd.Series([True, False, True], dtype="boolean"),
            "g": strings,
            "h": strings_with_na,
        }
    )

    if dtype_backend == "pyarrow":
        from pandas.arrays import ArrowExtensionArray

        # Re-wrap each expected column as a pyarrow-backed extension array.
        arrow_columns = {}
        for col in expected.columns:
            arrow_columns[col] = ArrowExtensionArray(
                pa.array(expected[col], from_pandas=True)
            )
        expected = pd.DataFrame(arrow_columns)

    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)