Skip to content

Commit 0dce285

Browse files
authored
ENH: Implement io.nullable_backend config for read_parquet (#49039)
1 parent ba5031e commit 0dce285

File tree

4 files changed

+90
-17
lines changed

4 files changed

+90
-17
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Other enhancements
4242
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
4343
- :func:`assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
4444
- Added new argument ``use_nullable_dtypes`` to :func:`read_csv` to enable automatic conversion to nullable dtypes (:issue:`36712`)
45+
- Added new global configuration ``io.nullable_backend`` to allow ``use_nullable_dtypes=True`` to return pyarrow-backed dtypes when set to ``"pyarrow"`` in :func:`read_parquet` (:issue:`48957`)
4546
- Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
4647
- Added metadata propagation for binary operators on :class:`DataFrame` (:issue:`28283`)
4748
- :class:`.CategoricalConversionWarning`, :class:`.InvalidComparison`, :class:`.InvalidVersion`, :class:`.LossySetitemError`, and :class:`.NoBufferPresent` are now exposed in ``pandas.errors`` (:issue:`27656`)

pandas/core/config_init.py

+14
Original file line numberDiff line numberDiff line change
@@ -730,6 +730,20 @@ def use_inf_as_na_cb(key) -> None:
730730
validator=is_one_of_factory(["auto", "sqlalchemy"]),
731731
)
732732

733+
io_nullable_backend_doc = """
734+
: string
735+
The nullable dtype implementation to return when ``use_nullable_dtypes=True``.
736+
Available options: 'pandas' and 'pyarrow'; the default is 'pandas'.
737+
"""
738+
739+
with cf.config_prefix("io.nullable_backend"):
740+
cf.register_option(
741+
"io_nullable_backend",
742+
"pandas",
743+
io_nullable_backend_doc,
744+
validator=is_one_of_factory(["pandas", "pyarrow"]),
745+
)
746+
733747
# --------
734748
# Plotting
735749
# ---------

pandas/io/parquet.py

+38-17
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from pandas import (
2323
DataFrame,
2424
MultiIndex,
25+
arrays,
2526
get_option,
2627
)
2728
from pandas.core.shared_docs import _shared_docs
@@ -221,25 +222,27 @@ def read(
221222
) -> DataFrame:
222223
kwargs["use_pandas_metadata"] = True
223224

225+
nullable_backend = get_option("io.nullable_backend")
224226
to_pandas_kwargs = {}
225227
if use_nullable_dtypes:
226228
import pandas as pd
227229

228-
mapping = {
229-
self.api.int8(): pd.Int8Dtype(),
230-
self.api.int16(): pd.Int16Dtype(),
231-
self.api.int32(): pd.Int32Dtype(),
232-
self.api.int64(): pd.Int64Dtype(),
233-
self.api.uint8(): pd.UInt8Dtype(),
234-
self.api.uint16(): pd.UInt16Dtype(),
235-
self.api.uint32(): pd.UInt32Dtype(),
236-
self.api.uint64(): pd.UInt64Dtype(),
237-
self.api.bool_(): pd.BooleanDtype(),
238-
self.api.string(): pd.StringDtype(),
239-
self.api.float32(): pd.Float32Dtype(),
240-
self.api.float64(): pd.Float64Dtype(),
241-
}
242-
to_pandas_kwargs["types_mapper"] = mapping.get
230+
if nullable_backend == "pandas":
231+
mapping = {
232+
self.api.int8(): pd.Int8Dtype(),
233+
self.api.int16(): pd.Int16Dtype(),
234+
self.api.int32(): pd.Int32Dtype(),
235+
self.api.int64(): pd.Int64Dtype(),
236+
self.api.uint8(): pd.UInt8Dtype(),
237+
self.api.uint16(): pd.UInt16Dtype(),
238+
self.api.uint32(): pd.UInt32Dtype(),
239+
self.api.uint64(): pd.UInt64Dtype(),
240+
self.api.bool_(): pd.BooleanDtype(),
241+
self.api.string(): pd.StringDtype(),
242+
self.api.float32(): pd.Float32Dtype(),
243+
self.api.float64(): pd.Float64Dtype(),
244+
}
245+
to_pandas_kwargs["types_mapper"] = mapping.get
243246
manager = get_option("mode.data_manager")
244247
if manager == "array":
245248
to_pandas_kwargs["split_blocks"] = True # type: ignore[assignment]
@@ -251,9 +254,20 @@ def read(
251254
mode="rb",
252255
)
253256
try:
254-
result = self.api.parquet.read_table(
257+
pa_table = self.api.parquet.read_table(
255258
path_or_handle, columns=columns, **kwargs
256-
).to_pandas(**to_pandas_kwargs)
259+
)
260+
if nullable_backend == "pandas":
261+
result = pa_table.to_pandas(**to_pandas_kwargs)
262+
elif nullable_backend == "pyarrow":
263+
result = DataFrame(
264+
{
265+
col_name: arrays.ArrowExtensionArray(pa_col)
266+
for col_name, pa_col in zip(
267+
pa_table.column_names, pa_table.itercolumns()
268+
)
269+
}
270+
)
257271
if manager == "array":
258272
result = result._as_manager("array", copy=False)
259273
return result
@@ -494,6 +508,13 @@ def read_parquet(
494508
495509
.. versionadded:: 1.2.0
496510
511+
The nullable dtype implementation can be configured by setting the global
512+
``io.nullable_backend`` configuration option to ``"pandas"`` to use
513+
numpy-backed nullable dtypes or ``"pyarrow"`` to use pyarrow-backed
514+
nullable dtypes (using ``pd.ArrowDtype``).
515+
516+
.. versionadded:: 2.0.0
517+
497518
**kwargs
498519
Any additional kwargs are passed to the engine.
499520

pandas/tests/io/test_parquet.py

+37
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,43 @@ def test_read_parquet_manager(self, pa, using_array_manager):
10141014
else:
10151015
assert isinstance(result._mgr, pd.core.internals.BlockManager)
10161016

1017+
def test_read_use_nullable_types_pyarrow_config(self, pa, df_full):
1018+
import pyarrow
1019+
1020+
df = df_full
1021+
1022+
# additional supported types for pyarrow
1023+
dti = pd.date_range("20130101", periods=3, tz="Europe/Brussels")
1024+
dti = dti._with_freq(None) # freq doesn't round-trip
1025+
df["datetime_tz"] = dti
1026+
df["bool_with_none"] = [True, None, True]
1027+
1028+
pa_table = pyarrow.Table.from_pandas(df)
1029+
expected = pd.DataFrame(
1030+
{
1031+
col_name: pd.arrays.ArrowExtensionArray(pa_column)
1032+
for col_name, pa_column in zip(
1033+
pa_table.column_names, pa_table.itercolumns()
1034+
)
1035+
}
1036+
)
1037+
# pyarrow infers datetimes as us instead of ns
1038+
expected["datetime"] = expected["datetime"].astype("timestamp[us][pyarrow]")
1039+
expected["datetime_with_nat"] = expected["datetime_with_nat"].astype(
1040+
"timestamp[us][pyarrow]"
1041+
)
1042+
expected["datetime_tz"] = expected["datetime_tz"].astype(
1043+
pd.ArrowDtype(pyarrow.timestamp(unit="us", tz="Europe/Brussels"))
1044+
)
1045+
1046+
with pd.option_context("io.nullable_backend", "pyarrow"):
1047+
check_round_trip(
1048+
df,
1049+
engine=pa,
1050+
read_kwargs={"use_nullable_dtypes": True},
1051+
expected=expected,
1052+
)
1053+
10171054

10181055
class TestParquetFastParquet(Base):
10191056
def test_basic(self, fp, df_full):

0 commit comments

Comments
 (0)