Skip to content

Commit 1c986d6

Browse files
kleinhenz, yuanx749, and jorisvandenbossche
authored
ENH: expose to_pandas_kwargs in read_parquet with pyarrow backend (#59654)
Co-authored-by: Joseph Kleinhenz <[email protected]>
Co-authored-by: Xiao Yuan <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 72ab3fd commit 1c986d6

File tree

4 files changed

+39
-3
lines changed

4 files changed

+39
-3
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,7 @@ Other enhancements
5454
- :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
5555
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
5656
- :meth:`DataFrame.plot.scatter` argument ``c`` now accepts a column of strings, where rows with the same string are colored identically (:issue:`16827` and :issue:`16485`)
57+
- :func:`read_parquet` accepts ``to_pandas_kwargs`` which are forwarded to :meth:`pyarrow.Table.to_pandas` which enables passing additional keywords to customize the conversion to pandas, such as ``maps_as_pydicts`` to read the Parquet map data type as python dictionaries (:issue:`56842`)
5758
- :meth:`DataFrameGroupBy.transform`, :meth:`SeriesGroupBy.transform`, :meth:`DataFrameGroupBy.agg`, :meth:`SeriesGroupBy.agg`, :meth:`RollingGroupby.apply`, :meth:`ExpandingGroupby.apply`, :meth:`Rolling.apply`, :meth:`Expanding.apply`, :meth:`DataFrame.apply` with ``engine="numba"`` now supports positional arguments passed as kwargs (:issue:`58995`)
5859
- :meth:`Series.map` can now accept kwargs to pass on to func (:issue:`59814`)
5960
- :meth:`pandas.concat` will raise a ``ValueError`` when ``ignore_index=True`` and ``keys`` is not ``None`` (:issue:`59274`)

pandas/io/_util.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,12 @@ def arrow_table_to_pandas(
6060
table: pyarrow.Table,
6161
dtype_backend: DtypeBackend | Literal["numpy"] | lib.NoDefault = lib.no_default,
6262
null_to_int64: bool = False,
63+
to_pandas_kwargs: dict | None = None,
6364
) -> pd.DataFrame:
6465
pa = import_optional_dependency("pyarrow")
6566

67+
to_pandas_kwargs = {} if to_pandas_kwargs is None else to_pandas_kwargs
68+
6669
types_mapper: type[pd.ArrowDtype] | None | Callable
6770
if dtype_backend == "numpy_nullable":
6871
mapping = _arrow_dtype_mapping()
@@ -80,5 +83,5 @@ def arrow_table_to_pandas(
8083
else:
8184
raise NotImplementedError
8285

83-
df = table.to_pandas(types_mapper=types_mapper)
86+
df = table.to_pandas(types_mapper=types_mapper, **to_pandas_kwargs)
8487
return df

pandas/io/parquet.py

+20-2
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ def read(
242242
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
243243
storage_options: StorageOptions | None = None,
244244
filesystem=None,
245+
to_pandas_kwargs: dict[str, Any] | None = None,
245246
**kwargs,
246247
) -> DataFrame:
247248
kwargs["use_pandas_metadata"] = True
@@ -266,7 +267,11 @@ def read(
266267
"make_block is deprecated",
267268
DeprecationWarning,
268269
)
269-
result = arrow_table_to_pandas(pa_table, dtype_backend=dtype_backend)
270+
result = arrow_table_to_pandas(
271+
pa_table,
272+
dtype_backend=dtype_backend,
273+
to_pandas_kwargs=to_pandas_kwargs,
274+
)
270275

271276
if pa_table.schema.metadata:
272277
if b"PANDAS_ATTRS" in pa_table.schema.metadata:
@@ -347,6 +352,7 @@ def read(
347352
filters=None,
348353
storage_options: StorageOptions | None = None,
349354
filesystem=None,
355+
to_pandas_kwargs: dict | None = None,
350356
**kwargs,
351357
) -> DataFrame:
352358
parquet_kwargs: dict[str, Any] = {}
@@ -362,6 +368,10 @@ def read(
362368
raise NotImplementedError(
363369
"filesystem is not implemented for the fastparquet engine."
364370
)
371+
if to_pandas_kwargs is not None:
372+
raise NotImplementedError(
373+
"to_pandas_kwargs is not implemented for the fastparquet engine."
374+
)
365375
path = stringify_path(path)
366376
handles = None
367377
if is_fsspec_url(path):
@@ -452,7 +462,7 @@ def to_parquet(
452462
.. versionadded:: 2.1.0
453463
454464
kwargs
455-
Additional keyword arguments passed to the engine
465+
Additional keyword arguments passed to the engine.
456466
457467
Returns
458468
-------
@@ -491,6 +501,7 @@ def read_parquet(
491501
dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
492502
filesystem: Any = None,
493503
filters: list[tuple] | list[list[tuple]] | None = None,
504+
to_pandas_kwargs: dict | None = None,
494505
**kwargs,
495506
) -> DataFrame:
496507
"""
@@ -564,6 +575,12 @@ def read_parquet(
564575
565576
.. versionadded:: 2.1.0
566577
578+
to_pandas_kwargs : dict | None, default None
579+
Keyword arguments to pass through to :func:`pyarrow.Table.to_pandas`
580+
when ``engine="pyarrow"``.
581+
582+
.. versionadded:: 3.0.0
583+
567584
**kwargs
568585
Any additional kwargs are passed to the engine.
569586
@@ -636,5 +653,6 @@ def read_parquet(
636653
storage_options=storage_options,
637654
dtype_backend=dtype_backend,
638655
filesystem=filesystem,
656+
to_pandas_kwargs=to_pandas_kwargs,
639657
**kwargs,
640658
)

pandas/tests/io/test_parquet.py

+14
Original file line numberDiff line numberDiff line change
@@ -1172,6 +1172,20 @@ def test_non_nanosecond_timestamps(self, temp_file):
11721172
)
11731173
tm.assert_frame_equal(result, expected)
11741174

1175+
def test_maps_as_pydicts(self, pa):
1176+
pyarrow = pytest.importorskip("pyarrow", "13.0.0")
1177+
1178+
schema = pyarrow.schema(
1179+
[("foo", pyarrow.map_(pyarrow.string(), pyarrow.int64()))]
1180+
)
1181+
df = pd.DataFrame([{"foo": {"A": 1}}, {"foo": {"B": 2}}])
1182+
check_round_trip(
1183+
df,
1184+
pa,
1185+
write_kwargs={"schema": schema},
1186+
read_kwargs={"to_pandas_kwargs": {"maps_as_pydicts": "strict"}},
1187+
)
1188+
11751189

11761190
class TestParquetFastParquet(Base):
11771191
def test_basic(self, fp, df_full, request):

0 commit comments

Comments (0)