Skip to content

Commit ac7ca23

Browse files
authored
ENH: DataFrame.to_parquet() returns bytes if path_or_buf not provided (pandas-dev#37129)
1 parent a349462 commit ac7ca23

File tree

4 files changed

+45
-13
lines changed

4 files changed

+45
-13
lines changed

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ Other enhancements
221221
- :meth:`Rolling.var()` and :meth:`Rolling.std()` use Kahan summation and Welford's Method to avoid numerical issues (:issue:`37051`)
222222
- :meth:`DataFrame.plot` now recognizes ``xlabel`` and ``ylabel`` arguments for plots of type ``scatter`` and ``hexbin`` (:issue:`37001`)
223223
- :class:`DataFrame` now supports ``divmod`` operation (:issue:`37165`)
224+
- :meth:`DataFrame.to_parquet` now returns a ``bytes`` object when no ``path`` argument is passed (:issue:`37105`)
224225

225226
.. _whatsnew_120.api_breaking.python:
226227

pandas/core/frame.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -2289,14 +2289,14 @@ def to_markdown(
22892289
@deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
22902290
def to_parquet(
22912291
self,
2292-
path: FilePathOrBuffer[AnyStr],
2292+
path: Optional[FilePathOrBuffer] = None,
22932293
engine: str = "auto",
22942294
compression: Optional[str] = "snappy",
22952295
index: Optional[bool] = None,
22962296
partition_cols: Optional[List[str]] = None,
22972297
storage_options: StorageOptions = None,
22982298
**kwargs,
2299-
) -> None:
2299+
) -> Optional[bytes]:
23002300
"""
23012301
Write a DataFrame to the binary parquet format.
23022302
@@ -2307,14 +2307,15 @@ def to_parquet(
23072307
23082308
Parameters
23092309
----------
2310-
path : str or file-like object
2310+
path : str or file-like object, default None
23112311
If a string, it will be used as Root Directory path
23122312
when writing a partitioned dataset. By file-like object,
23132313
we refer to objects with a write() method, such as a file handle
23142314
(e.g. via builtin open function) or io.BytesIO. The engine
2315-
fastparquet does not accept file-like objects.
2315+
fastparquet does not accept file-like objects. If path is None,
2316+
a bytes object is returned.
23162317
2317-
.. versionchanged:: 1.0.0
2318+
.. versionchanged:: 1.2.0
23182319
23192320
Previously this was "fname"
23202321
@@ -2357,6 +2358,10 @@ def to_parquet(
23572358
Additional arguments passed to the parquet library. See
23582359
:ref:`pandas io <io.parquet>` for more details.
23592360
2361+
Returns
2362+
-------
2363+
bytes if no ``path`` argument is provided, else None
2364+
23602365
See Also
23612366
--------
23622367
read_parquet : Read a parquet file.
@@ -2392,7 +2397,7 @@ def to_parquet(
23922397
"""
23932398
from pandas.io.parquet import to_parquet
23942399

2395-
to_parquet(
2400+
return to_parquet(
23962401
self,
23972402
path,
23982403
engine,

pandas/io/parquet.py

+22-7
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
""" parquet compat """
22

3+
import io
34
from typing import Any, AnyStr, Dict, List, Optional
45
from warnings import catch_warnings
56

@@ -238,28 +239,29 @@ def read(
238239

239240
def to_parquet(
240241
df: DataFrame,
241-
path: FilePathOrBuffer[AnyStr],
242+
path: Optional[FilePathOrBuffer] = None,
242243
engine: str = "auto",
243244
compression: Optional[str] = "snappy",
244245
index: Optional[bool] = None,
245246
storage_options: StorageOptions = None,
246247
partition_cols: Optional[List[str]] = None,
247248
**kwargs,
248-
):
249+
) -> Optional[bytes]:
249250
"""
250251
Write a DataFrame to the parquet format.
251252
252253
Parameters
253254
----------
254255
df : DataFrame
255-
path : str or file-like object
256+
path : str or file-like object, default None
256257
If a string, it will be used as Root Directory path
257258
when writing a partitioned dataset. By file-like object,
258259
we refer to objects with a write() method, such as a file handle
259260
(e.g. via builtin open function) or io.BytesIO. The engine
260-
fastparquet does not accept file-like objects.
261+
fastparquet does not accept file-like objects. If path is None,
262+
a bytes object is returned.
261263
262-
.. versionchanged:: 0.24.0
264+
.. versionchanged:: 1.2.0
263265
264266
engine : {'auto', 'pyarrow', 'fastparquet'}, default 'auto'
265267
Parquet library to use. If 'auto', then the option
@@ -298,20 +300,33 @@ def to_parquet(
298300
299301
kwargs
300302
Additional keyword arguments passed to the engine
303+
304+
Returns
305+
-------
306+
bytes if no ``path`` argument is provided, else None
301307
"""
302308
if isinstance(partition_cols, str):
303309
partition_cols = [partition_cols]
304310
impl = get_engine(engine)
305-
return impl.write(
311+
312+
path_or_buf: FilePathOrBuffer = io.BytesIO() if path is None else path
313+
314+
impl.write(
306315
df,
307-
path,
316+
path_or_buf,
308317
compression=compression,
309318
index=index,
310319
partition_cols=partition_cols,
311320
storage_options=storage_options,
312321
**kwargs,
313322
)
314323

324+
if path is None:
325+
assert isinstance(path_or_buf, io.BytesIO)
326+
return path_or_buf.getvalue()
327+
else:
328+
return None
329+
315330

316331
def read_parquet(path, engine: str = "auto", columns=None, **kwargs):
317332
"""

pandas/tests/io/test_parquet.py

+11
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,17 @@ def test_basic_subset_columns(self, pa, df_full):
512512
read_kwargs={"columns": ["string", "int"]},
513513
)
514514

515+
def test_to_bytes_without_path_or_buf_provided(self, pa, df_full):
516+
# GH 37105
517+
518+
buf_bytes = df_full.to_parquet(engine=pa)
519+
assert isinstance(buf_bytes, bytes)
520+
521+
buf_stream = BytesIO(buf_bytes)
522+
res = pd.read_parquet(buf_stream)
523+
524+
tm.assert_frame_equal(df_full, res)
525+
515526
def test_duplicate_columns(self, pa):
516527
# not currently able to handle duplicate columns
517528
df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()

0 commit comments

Comments
 (0)